Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 6
-rw-r--r--  fs/9p/acl.h | 4
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_file.c | 36
-rw-r--r--  fs/9p/vfs_inode.c | 139
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 92
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/Kconfig | 15
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs4/autofs_i.h | 26
-rw-r--r--  fs/autofs4/waitq.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/block_dev.c | 28
-rw-r--r--  fs/btrfs/Makefile | 4
-rw-r--r--  fs/btrfs/acl.c | 27
-rw-r--r--  fs/btrfs/btrfs_inode.h | 22
-rw-r--r--  fs/btrfs/compression.c | 14
-rw-r--r--  fs/btrfs/ctree.c | 457
-rw-r--r--  fs/btrfs/ctree.h | 54
-rw-r--r--  fs/btrfs/delayed-inode.c | 2
-rw-r--r--  fs/btrfs/delayed-inode.h | 2
-rw-r--r--  fs/btrfs/dir-item.c | 39
-rw-r--r--  fs/btrfs/disk-io.c | 116
-rw-r--r--  fs/btrfs/disk-io.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 401
-rw-r--r--  fs/btrfs/extent_io.c | 309
-rw-r--r--  fs/btrfs/extent_io.h | 55
-rw-r--r--  fs/btrfs/extent_map.c | 155
-rw-r--r--  fs/btrfs/file-item.c | 50
-rw-r--r--  fs/btrfs/file.c | 76
-rw-r--r--  fs/btrfs/free-space-cache.c | 193
-rw-r--r--  fs/btrfs/inode.c | 259
-rw-r--r--  fs/btrfs/ioctl.c | 34
-rw-r--r--  fs/btrfs/locking.c | 280
-rw-r--r--  fs/btrfs/locking.h | 36
-rw-r--r--  fs/btrfs/ref-cache.c | 68
-rw-r--r--  fs/btrfs/ref-cache.h | 52
-rw-r--r--  fs/btrfs/relocation.c | 3
-rw-r--r--  fs/btrfs/root-tree.c | 5
-rw-r--r--  fs/btrfs/struct-funcs.c | 100
-rw-r--r--  fs/btrfs/transaction.c | 116
-rw-r--r--  fs/btrfs/tree-log.c | 46
-rw-r--r--  fs/btrfs/volumes.c | 65
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/btrfs/xattr.c | 73
-rw-r--r--  fs/ceph/debugfs.c | 2
-rw-r--r--  fs/ceph/dir.c | 116
-rw-r--r--  fs/ceph/export.c | 24
-rw-r--r--  fs/ceph/file.c | 61
-rw-r--r--  fs/ceph/inode.c | 48
-rw-r--r--  fs/ceph/ioctl.c | 15
-rw-r--r--  fs/ceph/ioctl.h | 1
-rw-r--r--  fs/ceph/mds_client.c | 58
-rw-r--r--  fs/ceph/mds_client.h | 3
-rw-r--r--  fs/ceph/snap.c | 25
-rw-r--r--  fs/ceph/super.c | 11
-rw-r--r--  fs/ceph/super.h | 20
-rw-r--r--  fs/ceph/xattr.c | 8
-rw-r--r--  fs/cifs/cifs_debug.c | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 5
-rw-r--r--  fs/cifs/cifsacl.c | 28
-rw-r--r--  fs/cifs/cifsencrypt.c | 126
-rw-r--r--  fs/cifs/cifsfs.c | 22
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 60
-rw-r--r--  fs/cifs/cifssmb.c | 6
-rw-r--r--  fs/cifs/connect.c | 662
-rw-r--r--  fs/cifs/dir.c | 9
-rw-r--r--  fs/cifs/dns_resolve.c | 4
-rw-r--r--  fs/cifs/file.c | 27
-rw-r--r--  fs/cifs/inode.c | 14
-rw-r--r--  fs/cifs/link.c | 8
-rw-r--r--  fs/cifs/misc.c | 11
-rw-r--r--  fs/cifs/readdir.c | 427
-rw-r--r--  fs/cifs/smbencrypt.c | 8
-rw-r--r--  fs/cifs/transport.c | 53
-rw-r--r--  fs/compat.c | 5
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 83
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/ecryptfs/Kconfig | 2
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 150
-rw-r--r--  fs/ecryptfs/inode.c | 1
-rw-r--r--  fs/ecryptfs/keystore.c | 62
-rw-r--r--  fs/ecryptfs/main.c | 23
-rw-r--r--  fs/ecryptfs/read_write.c | 18
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 77
-rw-r--r--  fs/exofs/Kbuild | 5
-rw-r--r--  fs/exofs/Kconfig | 4
-rw-r--r--  fs/exofs/exofs.h | 159
-rw-r--r--  fs/exofs/inode.c | 152
-rw-r--r--  fs/exofs/ore.c (renamed from fs/exofs/ios.c) | 370
-rw-r--r--  fs/exofs/pnfs.h | 45
-rw-r--r--  fs/exofs/super.c | 251
-rw-r--r--  fs/ext2/acl.c | 8
-rw-r--r--  fs/ext2/acl.h | 1
-rw-r--r--  fs/ext2/xattr.c | 10
-rw-r--r--  fs/ext3/acl.c | 9
-rw-r--r--  fs/ext3/balloc.c | 38
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/fsync.c | 11
-rw-r--r--  fs/ext3/ialloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 193
-rw-r--r--  fs/ext3/ioctl.c | 4
-rw-r--r--  fs/ext3/namei.c | 13
-rw-r--r--  fs/ext3/super.c | 13
-rw-r--r--  fs/ext3/xattr.c | 12
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/acl.c | 9
-rw-r--r--  fs/ext4/balloc.c | 48
-rw-r--r--  fs/ext4/block_validity.c | 21
-rw-r--r--  fs/ext4/ext4.h | 56
-rw-r--r--  fs/ext4/ext4_jbd2.h | 4
-rw-r--r--  fs/ext4/extents.c | 129
-rw-r--r--  fs/ext4/fsync.c | 26
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/indirect.c | 1487
-rw-r--r--  fs/ext4/inode.c | 1623
-rw-r--r--  fs/ext4/ioctl.c | 12
-rw-r--r--  fs/ext4/mballoc.c | 230
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/namei.c | 27
-rw-r--r--  fs/ext4/page-io.c | 30
-rw-r--r--  fs/ext4/resize.c | 199
-rw-r--r--  fs/ext4/super.c | 89
-rw-r--r--  fs/ext4/truncate.h | 43
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/inode.c | 7
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fs-writeback.c | 378
-rw-r--r--  fs/fuse/dev.c | 16
-rw-r--r--  fs/fuse/file.c | 84
-rw-r--r--  fs/fuse/fuse_i.h | 8
-rw-r--r--  fs/fuse/inode.c | 13
-rw-r--r--  fs/generic_acl.c | 13
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/hppfs/hppfs.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/inode.c | 82
-rw-r--r--  fs/jbd/checkpoint.c | 37
-rw-r--r--  fs/jbd/commit.c | 57
-rw-r--r--  fs/jbd/journal.c | 99
-rw-r--r--  fs/jbd/transaction.c | 83
-rw-r--r--  fs/jbd2/checkpoint.c | 5
-rw-r--r--  fs/jbd2/journal.c | 67
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/fs.c | 4
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jfs/acl.c | 4
-rw-r--r--  fs/jfs/jfs_dmap.c | 5
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 6
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/namei.c | 3
-rw-r--r--  fs/jfs/xattr.c | 4
-rw-r--r--  fs/lockd/clntproc.c | 9
-rw-r--r--  fs/namei.c | 118
-rw-r--r--  fs/nfs/Kconfig | 15
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/blocklayout/Makefile | 5
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1020
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 207
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 410
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 111
-rw-r--r--  fs/nfs/blocklayout/extents.c | 935
-rw-r--r--  fs/nfs/cache_lib.h | 2
-rw-r--r--  fs/nfs/callback.h | 2
-rw-r--r--  fs/nfs/callback_proc.c | 82
-rw-r--r--  fs/nfs/callback_xdr.c | 24
-rw-r--r--  fs/nfs/client.c | 18
-rw-r--r--  fs/nfs/delegation.c | 16
-rw-r--r--  fs/nfs/dir.c | 57
-rw-r--r--  fs/nfs/direct.c | 2
-rw-r--r--  fs/nfs/internal.h | 13
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs3acl.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/nfs/nfs4_fs.h | 7
-rw-r--r--  fs/nfs/nfs4filelayout.c | 82
-rw-r--r--  fs/nfs/nfs4filelayout.h | 17
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 452
-rw-r--r--  fs/nfs/nfs4proc.c | 277
-rw-r--r--  fs/nfs/nfs4state.c | 9
-rw-r--r--  fs/nfs/nfs4xdr.c | 480
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 48
-rw-r--r--  fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3
-rw-r--r--  fs/nfs/pagelist.c | 69
-rw-r--r--  fs/nfs/pnfs.c | 307
-rw-r--r--  fs/nfs/pnfs.h | 102
-rw-r--r--  fs/nfs/pnfs_dev.c | 64
-rw-r--r--  fs/nfs/read.c | 166
-rw-r--r--  fs/nfs/unlink.c | 37
-rw-r--r--  fs/nfs/write.c | 159
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inode_mark.c | 2
-rw-r--r--  fs/notify/mark.c | 2
-rw-r--r--  fs/notify/notification.c | 2
-rw-r--r--  fs/notify/vfsmount_mark.c | 2
-rw-r--r--  fs/ntfs/inode.h | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/omfs/dir.c | 2
-rw-r--r--  fs/open.c | 78
-rw-r--r--  fs/pipe.c | 2
-rw-r--r--  fs/posix_acl.c | 18
-rw-r--r--  fs/proc/base.c | 28
-rw-r--r--  fs/proc/generic.c | 3
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/proc_net.c | 4
-rw-r--r--  fs/proc/root.c | 2
-rw-r--r--  fs/pstore/inode.c | 12
-rw-r--r--  fs/pstore/internal.h | 2
-rw-r--r--  fs/pstore/platform.c | 30
-rw-r--r--  fs/read_write.c | 12
-rw-r--r--  fs/reiserfs/xattr_acl.c | 10
-rw-r--r--  fs/stack.c | 5
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/ubifs/debug.h | 6
-rw-r--r--  fs/xfs/Makefile | 119
-rw-r--r--  fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c) | 0
-rw-r--r--  fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h) | 0
-rw-r--r--  fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h) | 0
-rw-r--r--  fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h) | 0
-rw-r--r--  fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c) | 0
-rw-r--r--  fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h) | 0
-rw-r--r--  fs/xfs/xfs.h | 3
-rw-r--r--  fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c) | 6
-rw-r--r--  fs/xfs/xfs_acl.h | 5
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 7
-rw-r--r--  fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c) | 3
-rw-r--r--  fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h) | 0
-rw-r--r--  fs/xfs/xfs_attr.c | 3
-rw-r--r--  fs/xfs/xfs_bmap.c | 10
-rw-r--r--  fs/xfs/xfs_btree.c | 17
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c (renamed from fs/xfs/linux-2.6/xfs_buf.c) | 18
-rw-r--r--  fs/xfs/xfs_buf.h (renamed from fs/xfs/linux-2.6/xfs_buf.h) | 32
-rw-r--r--  fs/xfs/xfs_buf_item.c | 24
-rw-r--r--  fs/xfs/xfs_da_btree.c | 44
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2.c | 16
-rw-r--r--  fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c) | 0
-rw-r--r--  fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot.c (renamed from fs/xfs/quota/xfs_dquot.c) | 16
-rw-r--r--  fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h) | 0
-rw-r--r--  fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c) | 0
-rw-r--r--  fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h) | 0
-rw-r--r--  fs/xfs/xfs_file.c (renamed from fs/xfs/linux-2.6/xfs_file.c) | 2
-rw-r--r--  fs/xfs/xfs_filestream.c | 14
-rw-r--r--  fs/xfs/xfs_fs_subr.c (renamed from fs/xfs/linux-2.6/xfs_fs_subr.c) | 0
-rw-r--r--  fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c) | 0
-rw-r--r--  fs/xfs/xfs_ialloc.c | 5
-rw-r--r--  fs/xfs/xfs_inode.c | 20
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c) | 6
-rw-r--r--  fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h) | 0
-rw-r--r--  fs/xfs/xfs_iops.c (renamed from fs/xfs/linux-2.6/xfs_iops.c) | 23
-rw-r--r--  fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h) | 0
-rw-r--r--  fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h) | 27
-rw-r--r--  fs/xfs/xfs_log.c | 14
-rw-r--r--  fs/xfs/xfs_log_recover.c | 42
-rw-r--r--  fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c) | 0
-rw-r--r--  fs/xfs/xfs_message.h (renamed from fs/xfs/linux-2.6/xfs_message.h) | 0
-rw-r--r--  fs/xfs/xfs_mount.c | 6
-rw-r--r--  fs/xfs/xfs_qm.c (renamed from fs/xfs/quota/xfs_qm.c) | 2
-rw-r--r--  fs/xfs/xfs_qm.h (renamed from fs/xfs/quota/xfs_qm.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.c (renamed from fs/xfs/quota/xfs_qm_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.h (renamed from fs/xfs/quota/xfs_qm_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c) | 0
-rw-r--r--  fs/xfs/xfs_quota_priv.h (renamed from fs/xfs/quota/xfs_quota_priv.h) | 0
-rw-r--r--  fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c) | 2
-rw-r--r--  fs/xfs/xfs_rename.c | 4
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 32
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 2
-rw-r--r--  fs/xfs/xfs_rw.c | 8
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c) | 36
-rw-r--r--  fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h) | 0
-rw-r--r--  fs/xfs/xfs_sync.c (renamed from fs/xfs/linux-2.6/xfs_sync.c) | 2
-rw-r--r--  fs/xfs/xfs_sync.h (renamed from fs/xfs/linux-2.6/xfs_sync.h) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h) | 0
-rw-r--r--  fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c) | 4
-rw-r--r--  fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h) | 0
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 67
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 28
-rw-r--r--  fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c) | 0
-rw-r--r--  fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h) | 0
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 22
-rw-r--r--  fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c) | 0
301 files changed, 11478 insertions, 6949 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index e9cb57f07546..9a1d42630751 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry,
 	return 0;
 }
 
-int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 		  struct posix_acl **dpacl, struct posix_acl **pacl)
 {
 	int retval = 0;
-	mode_t mode = *modep;
+	umode_t mode = *modep;
 	struct posix_acl *acl = NULL;
 
 	if (!S_ISLNK(mode)) {
@@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
+			umode_t mode = inode->i_mode;
 			retval = posix_acl_equiv_mode(acl, &mode);
 			if (retval < 0)
 				goto err_out;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ddb7ae19d971..559556411965 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl **, struct posix_acl **);
-extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 			 struct posix_acl **dpacl, struct posix_acl **pacl);
 #else
 #define v9fs_iop_get_acl NULL
@@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry,
 {
 	return 0;
 }
-static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 				struct posix_acl **dpacl,
 				struct posix_acl **pacl)
 {
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 46ce357ca1ab..410ffd6ceb5f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode);
+		    struct inode *inode, int mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
 	return;
 }
+
+int v9fs_open_to_dotl_flags(int flags);
 #endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3c173fcc2c5a..62857a810a79 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
-		omode = file->f_flags;
+		omode = v9fs_open_to_dotl_flags(file->f_flags);
 	else
 		omode = v9fs_uflags2omode(file->f_flags,
 					v9fs_proto_dotu(v9ses));
@@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
-	flock.type = fl->fl_type;
+	/* map the lock type */
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		flock.type = P9_LOCK_TYPE_RDLCK;
+		break;
+	case F_WRLCK:
+		flock.type = P9_LOCK_TYPE_WRLCK;
+		break;
+	case F_UNLCK:
+		flock.type = P9_LOCK_TYPE_UNLCK;
+		break;
+	}
 	flock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		flock.length = 0;
@@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
 	/* convert posix lock to p9 tgetlock args */
 	memset(&glock, 0, sizeof(glock));
-	glock.type = fl->fl_type;
+	glock.type = P9_LOCK_TYPE_UNLCK;
 	glock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		glock.length = 0;
@@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	res = p9_client_getlock_dotl(fid, &glock);
 	if (res < 0)
 		return res;
-	if (glock.type != F_UNLCK) {
-		fl->fl_type = glock.type;
+	/* map 9p lock type to os lock type */
+	switch (glock.type) {
+	case P9_LOCK_TYPE_RDLCK:
+		fl->fl_type = F_RDLCK;
+		break;
+	case P9_LOCK_TYPE_WRLCK:
+		fl->fl_type = F_WRLCK;
+		break;
+	case P9_LOCK_TYPE_UNLCK:
+		fl->fl_type = F_UNLCK;
+		break;
+	}
+	if (glock.type != P9_LOCK_TYPE_UNLCK) {
 		fl->fl_start = glock.start;
 		if (glock.length == 0)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
 		fl->fl_pid = glock.proc_id;
-	} else
-		fl->fl_type = F_UNLCK;
-
+	}
 	return res;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8bb5507e822f..e3c03db3c788 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 /**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
- * @mode: mode to convert
+ * @stat: p9_wstat from which mode need to be derived
+ * @rdev: major number, minor number in case of device files.
  *
  */
-
-static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static int p9mode2unixmode(struct v9fs_session_info *v9ses,
+			   struct p9_wstat *stat, dev_t *rdev)
 {
 	int res;
+	int mode = stat->mode;
 
-	res = mode & 0777;
+	res = mode & S_IALLUGO;
+	*rdev = 0;
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
 	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
-		 && (v9ses->nodev == 0))
-		res |= S_IFBLK;
-	else
+		 && (v9ses->nodev == 0)) {
+		char type = 0, ext[32];
+		int major = -1, minor = -1;
+
+		strncpy(ext, stat->extension, sizeof(ext));
+		sscanf(ext, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			res |= S_IFCHR;
+			break;
+		case 'b':
+			res |= S_IFBLK;
+			break;
+		default:
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "Unknown special type %c %s\n", type,
+				   stat->extension);
+		};
+		*rdev = MKDEV(major, minor);
+	} else
 		res |= S_IFREG;
 
 	if (v9fs_proto_dotu(v9ses)) {
@@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
 			res |= S_ISVTX;
 	}
-
 	return res;
 }
 
@@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode)
+		    struct inode *inode, int mode, dev_t rdev)
 {
 	int err = 0;
 
 	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
-	inode->i_rdev = 0;
+	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->a_ops = &v9fs_addr_operations;
 
@@ -335,7 +354,7 @@ error:
  *
  */
 
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev)
 {
 	int err;
 	struct inode *inode;
@@ -348,7 +367,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	err = v9fs_init_inode(v9ses, inode, mode);
+	err = v9fs_init_inode(v9ses, inode, mode, rdev);
 	if (err) {
 		iput(inode);
 		return ERR_PTR(err);
@@ -435,11 +454,12 @@ void v9fs_evict_inode(struct inode *inode)
 static int v9fs_test_inode(struct inode *inode, void *data)
 {
 	int umode;
+	dev_t rdev;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_wstat *st = (struct p9_wstat *)data;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
-	umode = p9mode2unixmode(v9ses, st->mode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		return 0;
@@ -473,6 +493,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 					   struct p9_wstat *st,
 					   int new)
 {
+	dev_t rdev;
 	int retval, umode;
 	unsigned long i_ino;
 	struct inode *inode;
@@ -496,8 +517,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	umode = p9mode2unixmode(v9ses, st->mode);
-	retval = v9fs_init_inode(v9ses, inode, umode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
 	if (retval)
 		goto error;
 
@@ -532,6 +553,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 }
 
 /**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+	if (flags & AT_REMOVEDIR)
+		rflags |= P9_DOTL_AT_REMOVEDIR;
+	return rflags;
+}
+
+/**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
  * @dentry: dentry that is being deleted
@@ -558,7 +592,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
-		retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags);
+		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+					    v9fs_at_to_dotl_flags(flags));
 	if (retval == -EOPNOTSUPP) {
 		/* Try the one based on path */
 		v9fid = v9fs_fid_clone(dentry);
@@ -645,13 +680,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
-
+	d_instantiate(dentry, inode);
 	return ofid;
-
 error:
 	if (ofid)
 		p9_client_clunk(ofid);
@@ -792,6 +825,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
+	struct dentry *res;
 	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
@@ -823,22 +857,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 		return ERR_PTR(result);
 	}
-
-	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	/*
+	 * Make sure we don't use a wrong inode due to parallel
+	 * unlink. For cached mode create calls request for new
+	 * inode. But with cache disabled, lookup should do this.
+	 */
+	if (v9ses->cache)
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	else
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		result = PTR_ERR(inode);
 		inode = NULL;
 		goto error;
 	}
-
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
 		goto error_iput;
-
 inst_out:
-	d_add(dentry, inode);
-	return NULL;
-
+	/*
+	 * If we had a rename on the server and a parallel lookup
+	 * for the new name, then make sure we instantiate with
+	 * the new name. ie look up for a/b, while on server somebody
+	 * moved b under k and client parallely did a lookup for
+	 * k/b.
+	 */
+	res = d_materialise_unique(dentry, inode);
+	if (!IS_ERR(res))
+		return res;
+	result = PTR_ERR(res);
 error_iput:
 	iput(inode);
 error:
@@ -1002,7 +1049,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
-		generic_fillattr(dentry->d_inode, stat);
+	generic_fillattr(dentry->d_inode, stat);
 
 	p9stat_free(st);
 	kfree(st);
@@ -1086,6 +1133,7 @@ void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
+	mode_t mode;
 	char ext[32];
 	char tag_name[14];
 	unsigned int i_nlink;
@@ -1121,31 +1169,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			inode->i_nlink = i_nlink;
 		}
 	}
-	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
-	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
-		char type = 0;
-		int major = -1;
-		int minor = -1;
-
-		strncpy(ext, stat->extension, sizeof(ext));
-		sscanf(ext, "%c %u %u", &type, &major, &minor);
-		switch (type) {
-		case 'c':
-			inode->i_mode &= ~S_IFBLK;
-			inode->i_mode |= S_IFCHR;
-			break;
-		case 'b':
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
-		};
-		inode->i_rdev = MKDEV(major, minor);
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-	} else
-		inode->i_rdev = 0;
-
+	mode = stat->mode & S_IALLUGO;
+	mode |= inode->i_mode & ~S_IALLUGO;
+	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
@@ -1411,6 +1437,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 {
+	int umode;
+	dev_t rdev;
 	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
@@ -1419,6 +1447,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -1430,6 +1464,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	p9stat_free(st);
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 9a26dce5a99f..aded79fcd5cf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+	retval = v9fs_init_inode(v9ses, inode,
+				 st->st_mode, new_decode_dev(st->st_rdev));
 	if (retval)
 		goto error;
 
@@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	return inode;
 }
 
+struct dotl_openflag_map {
+	int open_flag;
+	int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+	int i;
+	int rflags = 0;
+	struct dotl_openflag_map dotl_oflag_map[] = {
+		{ O_CREAT,	P9_DOTL_CREATE },
+		{ O_EXCL,	P9_DOTL_EXCL },
+		{ O_NOCTTY,	P9_DOTL_NOCTTY },
+		{ O_TRUNC,	P9_DOTL_TRUNC },
+		{ O_APPEND,	P9_DOTL_APPEND },
+		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
+		{ O_DSYNC,	P9_DOTL_DSYNC },
+		{ FASYNC,	P9_DOTL_FASYNC },
+		{ O_DIRECT,	P9_DOTL_DIRECT },
+		{ O_LARGEFILE,	P9_DOTL_LARGEFILE },
+		{ O_DIRECTORY,	P9_DOTL_DIRECTORY },
+		{ O_NOFOLLOW,	P9_DOTL_NOFOLLOW },
+		{ O_NOATIME,	P9_DOTL_NOATIME },
+		{ O_CLOEXEC,	P9_DOTL_CLOEXEC },
+		{ O_SYNC,	P9_DOTL_SYNC},
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+		if (flags & dotl_oflag_map[i].open_flag)
+			rflags |= dotl_oflag_map[i].dotl_flag;
+	}
+	return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+
+	/*
+	 * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+	 * and P9_DOTL_NOACCESS
+	 */
+	rflags |= flags & O_ACCMODE;
+	rflags |= v9fs_mapped_dotl_flags(flags);
+
+	return rflags;
+}
+
 /**
  * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
  * @dir: directory inode that is being created
@@ -206,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err = 0;
 	gid_t gid;
 	int flags;
-	mode_t mode;
+	umode_t mode;
 	char *name = NULL;
 	struct file *filp;
 	struct p9_qid qid;
@@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   "Failed to get acl values in creat %d\n", err);
 		goto error;
 	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+				    mode, gid, &qid);
 	if (err < 0) {
 		P9_DPRINTK(P9_DEBUG_VFS,
 			   "p9_client_open_dotl failed in creat %d\n",
@@ -281,10 +335,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
+	d_instantiate(dentry, inode);
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, &dacl, &pacl);
@@ -348,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
@@ -403,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
@@ -414,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 		 * inode with stat. We need to get an inode
 		 * so that we can set the acl with dentry
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -540,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+	mode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -552,11 +607,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		inode->i_uid = stat->st_uid;
 		inode->i_gid = stat->st_gid;
 		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
 
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		mode = stat->st_mode & S_IALLUGO;
+		mode |= inode->i_mode & ~S_IALLUGO;
+		inode->i_mode = mode;
 
 		i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
@@ -657,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -751,7 +805,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
@@ -810,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 				 err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
 		 * Not in cached mode. No need to populate inode with stat.
 		 * socket syscall returns a fd, so we need instantiate
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -886,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -897,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	kfree(st);
 	return 0;
 }
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index feef6cdc1fd2..c70251d47ed1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
 
-	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
diff --git a/fs/Kconfig b/fs/Kconfig
index 19891aab9c6e..9fe0b349f4cd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL
 	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for tmpfs
+	  filesystems.
+
+	  If you've selected TMPFS, it's possible that you'll also need
+	  this option as there are a number of Linux distros that require
+	  POSIX ACL support under /dev for certain features to work properly.
+	  For example, some distros need this feature for ALSA-related /dev
+	  files for sound to work properly. In short, if you're not sure,
+	  say Y.
 
 	  To learn more about Access Control Lists, visit the POSIX ACLs for
 	  Linux website <http://acl.bestbits.at/>.
 
-	  If you don't know what Access Control Lists are, say N.
-
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 4d433d34736f..f11e43ed907d 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
  */
 static struct inode *anon_inode_mkinode(void)
 {
-	struct inode *inode = new_inode(anon_inode_mnt->mnt_sb);
+	struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb);
 
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 475f9c597cb7..326dc08d3e3f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -39,27 +39,17 @@
 
 /* #define DEBUG */
 
-#ifdef DEBUG
-#define DPRINTK(fmt, args...) \
-do { \
-	printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
-#else
-#define DPRINTK(fmt, args...) do {} while (0)
-#endif
-
-#define AUTOFS_WARN(fmt, args...) \
-do { \
+#define DPRINTK(fmt, ...) \
+	pr_debug("pid %d: %s: " fmt "\n", \
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...) \
 	printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
-#define AUTOFS_ERROR(fmt, args...) \
-do { \
+#define AUTOFS_ERROR(fmt, ...) \
 	printk(KERN_ERR "pid %d: %s: " fmt "\n", \
-		current->pid, __func__, ##args); \
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 25435987d6ae..e1fbdeef85db 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
 
 	memset(&pkt,0,sizeof pkt); /* For security reasons */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 54b8c28bebc8..720d885e8dca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		befs_data_stream *data = &befs_ino->i_data.ds;
 		befs_off_t len = data->size;
 
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
+		if (len == 0) {
+			befs_error(sb, "Long symlink with illegal length");
 			link = ERR_PTR(-EIO);
 		} else {
-			link[len - 1] = '\0';
+			befs_debug(sb, "Follow long symlink");
+
+			link = kmalloc(len, GFP_NOFS);
+			if (!link) {
+				link = ERR_PTR(-ENOMEM);
+			} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+				kfree(link);
+				befs_error(sb, "Failed to read entire long symlink");
+				link = ERR_PTR(-EIO);
+			} else {
+				link[len - 1] = '\0';
+			}
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c62fb84944d5..95f786ec7f08 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
 {
 	return &BDEV_I(inode)->bdev;
 }
-
 EXPORT_SYMBOL(I_BDEV);
 
 /*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+	if (unlikely(dst == old))		/* deadlock avoidance */
+		return;
+	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&old->wb.list_lock);
+	spin_unlock(&dst->wb.list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
@@ -383,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
+
+	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+	if (error)
+		return error;
 
 	/*
 	 * There is no need to serialise calls to blkdev_issue_flush with
@@ -548,6 +556,7 @@ struct block_device *bdget(dev_t dev)
 
 	if (inode->i_state & I_NEW) {
 		bdev->bd_contains = NULL;
+		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
@@ -1420,6 +1429,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
+		/* ->release can cause the old bdi to disappear,
+		 * so must switch it out first
+		 */
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
@@ -1433,8 +1447,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf1cd25..40e6ac08c21f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 65a735d8f6e4..eb159aaa5a11 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,8 +28,6 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
@@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	int ret, size = 0;
 	const char *name;
 	char *value = NULL;
-	mode_t mode;
 
 	if (acl) {
 		ret = posix_acl_valid(acl);
@@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		mode = inode->i_mode;
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &mode);
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
-			inode->i_mode = mode;
 		}
 		ret = 0;
 		break;
@@ -222,19 +217,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
-		mode_t mode = inode->i_mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			ret = btrfs_set_acl(trans, inode, acl,
 					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+		ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (ret < 0)
 			return ret;
 
-		inode->i_mode = mode;
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
@@ -282,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get = btrfs_xattr_acl_get,
 	.set = btrfs_xattr_acl_set,
 };
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
-		   struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca8c7bf..d9f99a16edd6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
+	/* Lock for counters */
+	spinlock_t lock;
+
 	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
 
@@ -134,8 +137,8 @@ struct btrfs_inode {
 	 * items we think we'll end up using, and reserved_extents is the number
 	 * of extent items we've reserved metadata for.
 	 */
-	atomic_t outstanding_extents;
-	atomic_t reserved_extents;
+	unsigned outstanding_extents;
+	unsigned reserved_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -173,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
-	if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+	/*
+	 * !ino: btree_inode
+	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+	 */
+	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
@@ -184,4 +191,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
+static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
+					     struct inode *inode)
+{
+	if (root == root->fs_info->tree_root ||
+	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+		return true;
+	return false;
+}
+
 #endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b03eaf9..8ec5d86f1734 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
 	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
392 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 393 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
393 BUG_ON(ret); 394 BUG_ON(ret);
394 395
395 ret = btrfs_csum_one_bio(root, inode, bio, start, 1); 396 if (!skip_sum) {
396 BUG_ON(ret); 397 ret = btrfs_csum_one_bio(root, inode, bio,
398 start, 1);
399 BUG_ON(ret);
400 }
397 401
398 ret = btrfs_map_bio(root, WRITE, bio, 0, 1); 402 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
399 BUG_ON(ret); 403 BUG_ON(ret);
@@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
418 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 422 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
419 BUG_ON(ret); 423 BUG_ON(ret);
420 424
421 ret = btrfs_csum_one_bio(root, inode, bio, start, 1); 425 if (!skip_sum) {
422 BUG_ON(ret); 426 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
427 BUG_ON(ret);
428 }
423 429
424 ret = btrfs_map_bio(root, WRITE, bio, 0, 1); 430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
425 BUG_ON(ret); 431 BUG_ON(ret);
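Both submission sites above now consult a cached NODATASUM test instead of unconditionally checksumming. The shape of the change, reduced to a sketch (csum_bio and map_bio are illustrative stand-ins for the btrfs helpers):

static int submit_one(struct inode *inode, struct bio *bio)
{
	/* read the per-inode flag once; it cannot change mid-submission */
	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
	int ret;

	if (!skip_sum) {
		ret = csum_bio(inode, bio);	/* btrfs_csum_one_bio in the patch */
		if (ret)
			return ret;
	}
	return map_bio(bio);			/* btrfs_map_bio in the patch */
}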
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e667868e0d2..011cab3aca8d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54{ 54{
55 int i; 55 int i;
56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57 if (p->nodes[i] && p->locks[i]) 57 if (!p->nodes[i] || !p->locks[i])
58 btrfs_set_lock_blocking(p->nodes[i]); 58 continue;
59 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
60 if (p->locks[i] == BTRFS_READ_LOCK)
61 p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
62 else if (p->locks[i] == BTRFS_WRITE_LOCK)
63 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
59 } 64 }
60} 65}
61 66
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
68 * for held 73 * for held
69 */ 74 */
70noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 75noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71 struct extent_buffer *held) 76 struct extent_buffer *held, int held_rw)
72{ 77{
73 int i; 78 int i;
74 79
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 * really sure by forcing the path to blocking before we clear 84 * really sure by forcing the path to blocking before we clear
80 * the path blocking. 85 * the path blocking.
81 */ 86 */
82 if (held) 87 if (held) {
83 btrfs_set_lock_blocking(held); 88 btrfs_set_lock_blocking_rw(held, held_rw);
89 if (held_rw == BTRFS_WRITE_LOCK)
90 held_rw = BTRFS_WRITE_LOCK_BLOCKING;
91 else if (held_rw == BTRFS_READ_LOCK)
92 held_rw = BTRFS_READ_LOCK_BLOCKING;
93 }
84 btrfs_set_path_blocking(p); 94 btrfs_set_path_blocking(p);
85#endif 95#endif
86 96
87 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 97 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88 if (p->nodes[i] && p->locks[i]) 98 if (p->nodes[i] && p->locks[i]) {
89 btrfs_clear_lock_blocking(p->nodes[i]); 99 btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
100 if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
101 p->locks[i] = BTRFS_WRITE_LOCK;
102 else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
103 p->locks[i] = BTRFS_READ_LOCK;
104 }
90 } 105 }
91 106
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 107#ifdef CONFIG_DEBUG_LOCK_ALLOC
93 if (held) 108 if (held)
94 btrfs_clear_lock_blocking(held); 109 btrfs_clear_lock_blocking_rw(held, held_rw);
95#endif 110#endif
96} 111}
97 112
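The boolean locks[] array has become a small per-level state machine: each level is unlocked, read- or write-locked, and either spinning or blocking, and the two functions above promote or demote every held lock in step. The transitions in isolation (constants are those defined by fs/btrfs/locking.h in this series):

/* spinning -> blocking, preserving the read/write flavor */
static int to_blocking(int lock)
{
	if (lock == BTRFS_READ_LOCK)
		return BTRFS_READ_LOCK_BLOCKING;
	if (lock == BTRFS_WRITE_LOCK)
		return BTRFS_WRITE_LOCK_BLOCKING;
	return lock;	/* 0 (unlocked) or already blocking */
}

/* blocking -> spinning is the exact inverse */
static int to_spinning(int lock)
{
	if (lock == BTRFS_READ_LOCK_BLOCKING)
		return BTRFS_READ_LOCK;
	if (lock == BTRFS_WRITE_LOCK_BLOCKING)
		return BTRFS_WRITE_LOCK;
	return lock;
}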
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
119 if (!p->nodes[i]) 134 if (!p->nodes[i])
120 continue; 135 continue;
121 if (p->locks[i]) { 136 if (p->locks[i]) {
122 btrfs_tree_unlock(p->nodes[i]); 137 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
123 p->locks[i] = 0; 138 p->locks[i] = 0;
124 } 139 }
125 free_extent_buffer(p->nodes[i]); 140 free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
167 return eb; 182 return eb;
168} 183}
169 184
185/* loop around taking references on and locking the root node of the
186 * tree until you end up with a lock on the root. A locked buffer
187 * is returned, with a reference held.
188 */
189struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
190{
191 struct extent_buffer *eb;
192
193 while (1) {
194 eb = btrfs_root_node(root);
195 btrfs_tree_read_lock(eb);
196 if (eb == root->node)
197 break;
198 btrfs_tree_read_unlock(eb);
199 free_extent_buffer(eb);
200 }
201 return eb;
202}
203
170/* cowonly root (everything not a reference counted cow subvolume), just get 204/* cowonly root (everything not a reference counted cow subvolume), just get
171 * put onto a simple dirty list. transaction.c walks this to make sure they 205 * put onto a simple dirty list. transaction.c walks this to make sure they
172 * get properly updated on disk. 206 * get properly updated on disk.
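btrfs_read_lock_root_node is the read-mode twin of btrfs_lock_root_node just above it, and both are instances of lock-then-revalidate: the root pointer may be swapped (by COW) between reading it and locking it, so after taking the lock the buffer must be re-checked against root->node. The idiom in isolation:

for (;;) {
	eb = btrfs_root_node(root);	/* snapshot the pointer, takes a ref */
	btrfs_tree_read_lock(eb);
	if (eb == root->node)		/* still the root: we locked the right eb */
		break;
	/* lost the race: someone replaced the root; unlock, drop, retry */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);
}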
@@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
626 for (i = start_slot; i < end_slot; i++) { 660 for (i = start_slot; i < end_slot; i++) {
627 int close = 1; 661 int close = 1;
628 662
629 if (!parent->map_token) {
630 map_extent_buffer(parent,
631 btrfs_node_key_ptr_offset(i),
632 sizeof(struct btrfs_key_ptr),
633 &parent->map_token, &parent->kaddr,
634 &parent->map_start, &parent->map_len,
635 KM_USER1);
636 }
637 btrfs_node_key(parent, &disk_key, i); 663 btrfs_node_key(parent, &disk_key, i);
638 if (!progress_passed && comp_keys(&disk_key, progress) < 0) 664 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
639 continue; 665 continue;
@@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
656 last_block = blocknr; 682 last_block = blocknr;
657 continue; 683 continue;
658 } 684 }
659 if (parent->map_token) {
660 unmap_extent_buffer(parent, parent->map_token,
661 KM_USER1);
662 parent->map_token = NULL;
663 }
664 685
665 cur = btrfs_find_tree_block(root, blocknr, blocksize); 686 cur = btrfs_find_tree_block(root, blocknr, blocksize);
666 if (cur) 687 if (cur)
@@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
701 btrfs_tree_unlock(cur); 722 btrfs_tree_unlock(cur);
702 free_extent_buffer(cur); 723 free_extent_buffer(cur);
703 } 724 }
704 if (parent->map_token) {
705 unmap_extent_buffer(parent, parent->map_token,
706 KM_USER1);
707 parent->map_token = NULL;
708 }
709 return err; 725 return err;
710} 726}
711 727
@@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
746 struct btrfs_disk_key *tmp = NULL; 762 struct btrfs_disk_key *tmp = NULL;
747 struct btrfs_disk_key unaligned; 763 struct btrfs_disk_key unaligned;
748 unsigned long offset; 764 unsigned long offset;
749 char *map_token = NULL;
750 char *kaddr = NULL; 765 char *kaddr = NULL;
751 unsigned long map_start = 0; 766 unsigned long map_start = 0;
752 unsigned long map_len = 0; 767 unsigned long map_len = 0;
@@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
756 mid = (low + high) / 2; 771 mid = (low + high) / 2;
757 offset = p + mid * item_size; 772 offset = p + mid * item_size;
758 773
759 if (!map_token || offset < map_start || 774 if (!kaddr || offset < map_start ||
760 (offset + sizeof(struct btrfs_disk_key)) > 775 (offset + sizeof(struct btrfs_disk_key)) >
761 map_start + map_len) { 776 map_start + map_len) {
762 if (map_token) {
763 unmap_extent_buffer(eb, map_token, KM_USER0);
764 map_token = NULL;
765 }
766 777
767 err = map_private_extent_buffer(eb, offset, 778 err = map_private_extent_buffer(eb, offset,
768 sizeof(struct btrfs_disk_key), 779 sizeof(struct btrfs_disk_key),
769 &map_token, &kaddr, 780 &kaddr, &map_start, &map_len);
770 &map_start, &map_len, KM_USER0);
771 781
772 if (!err) { 782 if (!err) {
773 tmp = (struct btrfs_disk_key *)(kaddr + offset - 783 tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
790 high = mid; 800 high = mid;
791 else { 801 else {
792 *slot = mid; 802 *slot = mid;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 0; 803 return 0;
796 } 804 }
797 } 805 }
798 *slot = low; 806 *slot = low;
799 if (map_token)
800 unmap_extent_buffer(eb, map_token, KM_USER0);
801 return 1; 807 return 1;
802} 808}
803 809
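With map_token gone, generic_bin_search tracks only the currently mapped window (kaddr, map_start, map_len) and re-maps lazily whenever the probe key falls outside it. The core loop, condensed to a sketch (map() stands in for map_private_extent_buffer):

while (low < high) {
	mid = (low + high) / 2;
	offset = base + mid * item_size;

	/* remap only when the key lies outside the mapped window */
	if (!kaddr || offset < map_start ||
	    offset + sizeof(struct btrfs_disk_key) > map_start + map_len)
		kaddr = map(eb, offset, &map_start, &map_len);

	cmp = comp_keys((struct btrfs_disk_key *)(kaddr + offset - map_start), key);
	if (cmp < 0)
		low = mid + 1;
	else if (cmp > 0)
		high = mid;
	else
		return mid;	/* exact match */
}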
@@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
890 896
891 mid = path->nodes[level]; 897 mid = path->nodes[level];
892 898
893 WARN_ON(!path->locks[level]); 899 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
900 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
894 WARN_ON(btrfs_header_generation(mid) != trans->transid); 901 WARN_ON(btrfs_header_generation(mid) != trans->transid);
895 902
896 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root,
1228 u32 nr; 1235 u32 nr;
1229 u32 blocksize; 1236 u32 blocksize;
1230 u32 nscan = 0; 1237 u32 nscan = 0;
1231 bool map = true;
1232 1238
1233 if (level != 1) 1239 if (level != 1)
1234 return; 1240 return;
@@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root,
1250 1256
1251 nritems = btrfs_header_nritems(node); 1257 nritems = btrfs_header_nritems(node);
1252 nr = slot; 1258 nr = slot;
1253 if (node->map_token || path->skip_locking)
1254 map = false;
1255 1259
1256 while (1) { 1260 while (1) {
1257 if (map && !node->map_token) {
1258 unsigned long offset = btrfs_node_key_ptr_offset(nr);
1259 map_private_extent_buffer(node, offset,
1260 sizeof(struct btrfs_key_ptr),
1261 &node->map_token,
1262 &node->kaddr,
1263 &node->map_start,
1264 &node->map_len, KM_USER1);
1265 }
1266 if (direction < 0) { 1261 if (direction < 0) {
1267 if (nr == 0) 1262 if (nr == 0)
1268 break; 1263 break;
@@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root,
1281 if ((search <= target && target - search <= 65536) || 1276 if ((search <= target && target - search <= 65536) ||
1282 (search > target && search - target <= 65536)) { 1277 (search > target && search - target <= 65536)) {
1283 gen = btrfs_node_ptr_generation(node, nr); 1278 gen = btrfs_node_ptr_generation(node, nr);
1284 if (map && node->map_token) {
1285 unmap_extent_buffer(node, node->map_token,
1286 KM_USER1);
1287 node->map_token = NULL;
1288 }
1289 readahead_tree_block(root, search, blocksize, gen); 1279 readahead_tree_block(root, search, blocksize, gen);
1290 nread += blocksize; 1280 nread += blocksize;
1291 } 1281 }
@@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root,
1293 if ((nread > 65536 || nscan > 32)) 1283 if ((nread > 65536 || nscan > 32))
1294 break; 1284 break;
1295 } 1285 }
1296 if (map && node->map_token) {
1297 unmap_extent_buffer(node, node->map_token, KM_USER1);
1298 node->map_token = NULL;
1299 }
1300} 1286}
1301 1287
1302/* 1288/*
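Stripped of the mapping bookkeeping, reada_for_search reduces to its heuristic: fan out from the current slot and issue readahead only for blocks within 64KiB of the target, giving up after 64KiB issued or 32 slots examined. Its decision core:

if ((search <= target && target - search <= 65536) ||
    (search > target && search - target <= 65536)) {
	gen = btrfs_node_ptr_generation(node, nr);
	readahead_tree_block(root, search, blocksize, gen);
	nread += blocksize;
}
nscan++;
if (nread > 65536 || nscan > 32)	/* bound the work per invocation */
	break;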
@@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1409 1395
1410 t = path->nodes[i]; 1396 t = path->nodes[i];
1411 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 1397 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1412 btrfs_tree_unlock(t); 1398 btrfs_tree_unlock_rw(t, path->locks[i]);
1413 path->locks[i] = 0; 1399 path->locks[i] = 0;
1414 } 1400 }
1415 } 1401 }
@@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1436 continue; 1422 continue;
1437 if (!path->locks[i]) 1423 if (!path->locks[i])
1438 continue; 1424 continue;
1439 btrfs_tree_unlock(path->nodes[i]); 1425 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1440 path->locks[i] = 0; 1426 path->locks[i] = 0;
1441 } 1427 }
1442} 1428}
@@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1485 * we can trust our generation number 1471 * we can trust our generation number
1486 */ 1472 */
1487 free_extent_buffer(tmp); 1473 free_extent_buffer(tmp);
1474 btrfs_set_path_blocking(p);
1475
1488 tmp = read_tree_block(root, blocknr, blocksize, gen); 1476 tmp = read_tree_block(root, blocknr, blocksize, gen);
1489 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1477 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1490 *eb_ret = tmp; 1478 *eb_ret = tmp;
@@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1540static int 1528static int
1541setup_nodes_for_search(struct btrfs_trans_handle *trans, 1529setup_nodes_for_search(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root, struct btrfs_path *p, 1530 struct btrfs_root *root, struct btrfs_path *p,
1543 struct extent_buffer *b, int level, int ins_len) 1531 struct extent_buffer *b, int level, int ins_len,
1532 int *write_lock_level)
1544{ 1533{
1545 int ret; 1534 int ret;
1546 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1535 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1547 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1536 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1548 int sret; 1537 int sret;
1549 1538
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
1542 goto again;
1543 }
1544
1550 sret = reada_for_balance(root, p, level); 1545 sret = reada_for_balance(root, p, level);
1551 if (sret) 1546 if (sret)
1552 goto again; 1547 goto again;
1553 1548
1554 btrfs_set_path_blocking(p); 1549 btrfs_set_path_blocking(p);
1555 sret = split_node(trans, root, p, level); 1550 sret = split_node(trans, root, p, level);
1556 btrfs_clear_path_blocking(p, NULL); 1551 btrfs_clear_path_blocking(p, NULL, 0);
1557 1552
1558 BUG_ON(sret > 0); 1553 BUG_ON(sret > 0);
1559 if (sret) { 1554 if (sret) {
@@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1565 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 1560 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1566 int sret; 1561 int sret;
1567 1562
1563 if (*write_lock_level < level + 1) {
1564 *write_lock_level = level + 1;
1565 btrfs_release_path(p);
1566 goto again;
1567 }
1568
1568 sret = reada_for_balance(root, p, level); 1569 sret = reada_for_balance(root, p, level);
1569 if (sret) 1570 if (sret)
1570 goto again; 1571 goto again;
1571 1572
1572 btrfs_set_path_blocking(p); 1573 btrfs_set_path_blocking(p);
1573 sret = balance_level(trans, root, p, level); 1574 sret = balance_level(trans, root, p, level);
1574 btrfs_clear_path_blocking(p, NULL); 1575 btrfs_clear_path_blocking(p, NULL, 0);
1575 1576
1576 if (sret) { 1577 if (sret) {
1577 ret = sret; 1578 ret = sret;
@@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1615 int err; 1616 int err;
1616 int level; 1617 int level;
1617 int lowest_unlock = 1; 1618 int lowest_unlock = 1;
1619 int root_lock;
1620 /* everything at write_lock_level or lower must be write locked */
1621 int write_lock_level = 0;
1618 u8 lowest_level = 0; 1622 u8 lowest_level = 0;
1619 1623
1620 lowest_level = p->lowest_level; 1624 lowest_level = p->lowest_level;
1621 WARN_ON(lowest_level && ins_len > 0); 1625 WARN_ON(lowest_level && ins_len > 0);
1622 WARN_ON(p->nodes[0] != NULL); 1626 WARN_ON(p->nodes[0] != NULL);
1623 1627
1624 if (ins_len < 0) 1628 if (ins_len < 0) {
1625 lowest_unlock = 2; 1629 lowest_unlock = 2;
1626 1630
 1631 /* when we are removing items, we might have to go up to level
 1632 * two as we update tree pointers. Make sure we keep write
 1633 * locks on those levels as well
 1634 */
1635 write_lock_level = 2;
1636 } else if (ins_len > 0) {
1637 /*
1638 * for inserting items, make sure we have a write lock on
1639 * level 1 so we can update keys
1640 */
1641 write_lock_level = 1;
1642 }
1643
1644 if (!cow)
1645 write_lock_level = -1;
1646
1647 if (cow && (p->keep_locks || p->lowest_level))
1648 write_lock_level = BTRFS_MAX_LEVEL;
1649
1627again: 1650again:
1651 /*
1652 * we try very hard to do read locks on the root
1653 */
1654 root_lock = BTRFS_READ_LOCK;
1655 level = 0;
1628 if (p->search_commit_root) { 1656 if (p->search_commit_root) {
1657 /*
1658 * the commit roots are read only
1659 * so we always do read locks
1660 */
1629 b = root->commit_root; 1661 b = root->commit_root;
1630 extent_buffer_get(b); 1662 extent_buffer_get(b);
1663 level = btrfs_header_level(b);
1631 if (!p->skip_locking) 1664 if (!p->skip_locking)
1632 btrfs_tree_lock(b); 1665 btrfs_tree_read_lock(b);
1633 } else { 1666 } else {
1634 if (p->skip_locking) 1667 if (p->skip_locking) {
1635 b = btrfs_root_node(root); 1668 b = btrfs_root_node(root);
1636 else 1669 level = btrfs_header_level(b);
1637 b = btrfs_lock_root_node(root); 1670 } else {
1671 /* we don't know the level of the root node
1672 * until we actually have it read locked
1673 */
1674 b = btrfs_read_lock_root_node(root);
1675 level = btrfs_header_level(b);
1676 if (level <= write_lock_level) {
1677 /* whoops, must trade for write lock */
1678 btrfs_tree_read_unlock(b);
1679 free_extent_buffer(b);
1680 b = btrfs_lock_root_node(root);
1681 root_lock = BTRFS_WRITE_LOCK;
1682
1683 /* the level might have changed, check again */
1684 level = btrfs_header_level(b);
1685 }
1686 }
1638 } 1687 }
1688 p->nodes[level] = b;
1689 if (!p->skip_locking)
1690 p->locks[level] = root_lock;
1639 1691
1640 while (b) { 1692 while (b) {
1641 level = btrfs_header_level(b); 1693 level = btrfs_header_level(b);
@@ -1644,10 +1696,6 @@ again:
1644 * setup the path here so we can release it under lock 1696 * setup the path here so we can release it under lock
1645 * contention with the cow code 1697 * contention with the cow code
1646 */ 1698 */
1647 p->nodes[level] = b;
1648 if (!p->skip_locking)
1649 p->locks[level] = 1;
1650
1651 if (cow) { 1699 if (cow) {
1652 /* 1700 /*
1653 * if we don't really need to cow this block 1701 * if we don't really need to cow this block
@@ -1659,6 +1707,16 @@ again:
1659 1707
1660 btrfs_set_path_blocking(p); 1708 btrfs_set_path_blocking(p);
1661 1709
1710 /*
1711 * must have write locks on this node and the
1712 * parent
1713 */
1714 if (level + 1 > write_lock_level) {
1715 write_lock_level = level + 1;
1716 btrfs_release_path(p);
1717 goto again;
1718 }
1719
1662 err = btrfs_cow_block(trans, root, b, 1720 err = btrfs_cow_block(trans, root, b,
1663 p->nodes[level + 1], 1721 p->nodes[level + 1],
1664 p->slots[level + 1], &b); 1722 p->slots[level + 1], &b);
@@ -1671,10 +1729,7 @@ cow_done:
1671 BUG_ON(!cow && ins_len); 1729 BUG_ON(!cow && ins_len);
1672 1730
1673 p->nodes[level] = b; 1731 p->nodes[level] = b;
1674 if (!p->skip_locking) 1732 btrfs_clear_path_blocking(p, NULL, 0);
1675 p->locks[level] = 1;
1676
1677 btrfs_clear_path_blocking(p, NULL);
1678 1733
1679 /* 1734 /*
1680 * we have a lock on b and as long as we aren't changing 1735 * we have a lock on b and as long as we aren't changing
@@ -1700,7 +1755,7 @@ cow_done:
1700 } 1755 }
1701 p->slots[level] = slot; 1756 p->slots[level] = slot;
1702 err = setup_nodes_for_search(trans, root, p, b, level, 1757 err = setup_nodes_for_search(trans, root, p, b, level,
1703 ins_len); 1758 ins_len, &write_lock_level);
1704 if (err == -EAGAIN) 1759 if (err == -EAGAIN)
1705 goto again; 1760 goto again;
1706 if (err) { 1761 if (err) {
@@ -1710,6 +1765,19 @@ cow_done:
1710 b = p->nodes[level]; 1765 b = p->nodes[level];
1711 slot = p->slots[level]; 1766 slot = p->slots[level];
1712 1767
1768 /*
 1769 * slot 0 is special: if we change the key
1770 * we have to update the parent pointer
1771 * which means we must have a write lock
1772 * on the parent
1773 */
1774 if (slot == 0 && cow &&
1775 write_lock_level < level + 1) {
1776 write_lock_level = level + 1;
1777 btrfs_release_path(p);
1778 goto again;
1779 }
1780
1713 unlock_up(p, level, lowest_unlock); 1781 unlock_up(p, level, lowest_unlock);
1714 1782
1715 if (level == lowest_level) { 1783 if (level == lowest_level) {
@@ -1728,23 +1796,42 @@ cow_done:
1728 } 1796 }
1729 1797
1730 if (!p->skip_locking) { 1798 if (!p->skip_locking) {
1731 btrfs_clear_path_blocking(p, NULL); 1799 level = btrfs_header_level(b);
1732 err = btrfs_try_spin_lock(b); 1800 if (level <= write_lock_level) {
1733 1801 err = btrfs_try_tree_write_lock(b);
1734 if (!err) { 1802 if (!err) {
1735 btrfs_set_path_blocking(p); 1803 btrfs_set_path_blocking(p);
1736 btrfs_tree_lock(b); 1804 btrfs_tree_lock(b);
1737 btrfs_clear_path_blocking(p, b); 1805 btrfs_clear_path_blocking(p, b,
1806 BTRFS_WRITE_LOCK);
1807 }
1808 p->locks[level] = BTRFS_WRITE_LOCK;
1809 } else {
1810 err = btrfs_try_tree_read_lock(b);
1811 if (!err) {
1812 btrfs_set_path_blocking(p);
1813 btrfs_tree_read_lock(b);
1814 btrfs_clear_path_blocking(p, b,
1815 BTRFS_READ_LOCK);
1816 }
1817 p->locks[level] = BTRFS_READ_LOCK;
1738 } 1818 }
1819 p->nodes[level] = b;
1739 } 1820 }
1740 } else { 1821 } else {
1741 p->slots[level] = slot; 1822 p->slots[level] = slot;
1742 if (ins_len > 0 && 1823 if (ins_len > 0 &&
1743 btrfs_leaf_free_space(root, b) < ins_len) { 1824 btrfs_leaf_free_space(root, b) < ins_len) {
1825 if (write_lock_level < 1) {
1826 write_lock_level = 1;
1827 btrfs_release_path(p);
1828 goto again;
1829 }
1830
1744 btrfs_set_path_blocking(p); 1831 btrfs_set_path_blocking(p);
1745 err = split_leaf(trans, root, key, 1832 err = split_leaf(trans, root, key,
1746 p, ins_len, ret == 0); 1833 p, ins_len, ret == 0);
1747 btrfs_clear_path_blocking(p, NULL); 1834 btrfs_clear_path_blocking(p, NULL, 0);
1748 1835
1749 BUG_ON(err > 0); 1836 BUG_ON(err > 0);
1750 if (err) { 1837 if (err) {
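The thread running through the whole btrfs_search_slot rework is promotion by restart: the search optimistically read-locks, and the moment some level turns out to need modification (COW, split, balance, or a slot-0 key update), write_lock_level is raised, every lock is dropped, and the descent restarts from the root. Locks are never upgraded in place, which could deadlock against other readers. The skeleton, heavily condensed (must_modify and descend are illustrative):

again:
	b = read_lock_root(root);
	while (b) {
		level = header_level(b);
		if (must_modify(b, level) && write_lock_level < level + 1) {
			/* remember how deep write locks are needed, then
			 * drop everything and redo with stronger locking */
			write_lock_level = level + 1;
			release_path(p);
			goto again;
		}
		b = descend(p, b, level, write_lock_level);
	}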
@@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2025 add_root_to_dirty_list(root); 2112 add_root_to_dirty_list(root);
2026 extent_buffer_get(c); 2113 extent_buffer_get(c);
2027 path->nodes[level] = c; 2114 path->nodes[level] = c;
2028 path->locks[level] = 1; 2115 path->locks[level] = BTRFS_WRITE_LOCK;
2029 path->slots[level] = 0; 2116 path->slots[level] = 0;
2030 return 0; 2117 return 0;
2031} 2118}
@@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2253 if (path->slots[0] == i) 2340 if (path->slots[0] == i)
2254 push_space += data_size; 2341 push_space += data_size;
2255 2342
2256 if (!left->map_token) {
2257 map_extent_buffer(left, (unsigned long)item,
2258 sizeof(struct btrfs_item),
2259 &left->map_token, &left->kaddr,
2260 &left->map_start, &left->map_len,
2261 KM_USER1);
2262 }
2263
2264 this_item_size = btrfs_item_size(left, item); 2343 this_item_size = btrfs_item_size(left, item);
2265 if (this_item_size + sizeof(*item) + push_space > free_space) 2344 if (this_item_size + sizeof(*item) + push_space > free_space)
2266 break; 2345 break;
@@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2271 break; 2350 break;
2272 i--; 2351 i--;
2273 } 2352 }
2274 if (left->map_token) {
2275 unmap_extent_buffer(left, left->map_token, KM_USER1);
2276 left->map_token = NULL;
2277 }
2278 2353
2279 if (push_items == 0) 2354 if (push_items == 0)
2280 goto out_unlock; 2355 goto out_unlock;
@@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2316 push_space = BTRFS_LEAF_DATA_SIZE(root); 2391 push_space = BTRFS_LEAF_DATA_SIZE(root);
2317 for (i = 0; i < right_nritems; i++) { 2392 for (i = 0; i < right_nritems; i++) {
2318 item = btrfs_item_nr(right, i); 2393 item = btrfs_item_nr(right, i);
2319 if (!right->map_token) {
2320 map_extent_buffer(right, (unsigned long)item,
2321 sizeof(struct btrfs_item),
2322 &right->map_token, &right->kaddr,
2323 &right->map_start, &right->map_len,
2324 KM_USER1);
2325 }
2326 push_space -= btrfs_item_size(right, item); 2394 push_space -= btrfs_item_size(right, item);
2327 btrfs_set_item_offset(right, item, push_space); 2395 btrfs_set_item_offset(right, item, push_space);
2328 } 2396 }
2329 2397
2330 if (right->map_token) {
2331 unmap_extent_buffer(right, right->map_token, KM_USER1);
2332 right->map_token = NULL;
2333 }
2334 left_nritems -= push_items; 2398 left_nritems -= push_items;
2335 btrfs_set_header_nritems(left, left_nritems); 2399 btrfs_set_header_nritems(left, left_nritems);
2336 2400
@@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2467 2531
2468 for (i = 0; i < nr; i++) { 2532 for (i = 0; i < nr; i++) {
2469 item = btrfs_item_nr(right, i); 2533 item = btrfs_item_nr(right, i);
2470 if (!right->map_token) {
2471 map_extent_buffer(right, (unsigned long)item,
2472 sizeof(struct btrfs_item),
2473 &right->map_token, &right->kaddr,
2474 &right->map_start, &right->map_len,
2475 KM_USER1);
2476 }
2477 2534
2478 if (!empty && push_items > 0) { 2535 if (!empty && push_items > 0) {
2479 if (path->slots[0] < i) 2536 if (path->slots[0] < i)
@@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2496 push_space += this_item_size + sizeof(*item); 2553 push_space += this_item_size + sizeof(*item);
2497 } 2554 }
2498 2555
2499 if (right->map_token) {
2500 unmap_extent_buffer(right, right->map_token, KM_USER1);
2501 right->map_token = NULL;
2502 }
2503
2504 if (push_items == 0) { 2556 if (push_items == 0) {
2505 ret = 1; 2557 ret = 1;
2506 goto out; 2558 goto out;
@@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2530 u32 ioff; 2582 u32 ioff;
2531 2583
2532 item = btrfs_item_nr(left, i); 2584 item = btrfs_item_nr(left, i);
2533 if (!left->map_token) {
2534 map_extent_buffer(left, (unsigned long)item,
2535 sizeof(struct btrfs_item),
2536 &left->map_token, &left->kaddr,
2537 &left->map_start, &left->map_len,
2538 KM_USER1);
2539 }
2540 2585
2541 ioff = btrfs_item_offset(left, item); 2586 ioff = btrfs_item_offset(left, item);
2542 btrfs_set_item_offset(left, item, 2587 btrfs_set_item_offset(left, item,
2543 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); 2588 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2544 } 2589 }
2545 btrfs_set_header_nritems(left, old_left_nritems + push_items); 2590 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2546 if (left->map_token) {
2547 unmap_extent_buffer(left, left->map_token, KM_USER1);
2548 left->map_token = NULL;
2549 }
2550 2591
2551 /* fixup right node */ 2592 /* fixup right node */
2552 if (push_items > right_nritems) { 2593 if (push_items > right_nritems) {
@@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2574 for (i = 0; i < right_nritems; i++) { 2615 for (i = 0; i < right_nritems; i++) {
2575 item = btrfs_item_nr(right, i); 2616 item = btrfs_item_nr(right, i);
2576 2617
2577 if (!right->map_token) {
2578 map_extent_buffer(right, (unsigned long)item,
2579 sizeof(struct btrfs_item),
2580 &right->map_token, &right->kaddr,
2581 &right->map_start, &right->map_len,
2582 KM_USER1);
2583 }
2584
2585 push_space = push_space - btrfs_item_size(right, item); 2618 push_space = push_space - btrfs_item_size(right, item);
2586 btrfs_set_item_offset(right, item, push_space); 2619 btrfs_set_item_offset(right, item, push_space);
2587 } 2620 }
2588 if (right->map_token) {
2589 unmap_extent_buffer(right, right->map_token, KM_USER1);
2590 right->map_token = NULL;
2591 }
2592 2621
2593 btrfs_mark_buffer_dirty(left); 2622 btrfs_mark_buffer_dirty(left);
2594 if (right_nritems) 2623 if (right_nritems)
@@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2729 struct btrfs_item *item = btrfs_item_nr(right, i); 2758 struct btrfs_item *item = btrfs_item_nr(right, i);
2730 u32 ioff; 2759 u32 ioff;
2731 2760
2732 if (!right->map_token) {
2733 map_extent_buffer(right, (unsigned long)item,
2734 sizeof(struct btrfs_item),
2735 &right->map_token, &right->kaddr,
2736 &right->map_start, &right->map_len,
2737 KM_USER1);
2738 }
2739
2740 ioff = btrfs_item_offset(right, item); 2761 ioff = btrfs_item_offset(right, item);
2741 btrfs_set_item_offset(right, item, ioff + rt_data_off); 2762 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2742 } 2763 }
2743 2764
2744 if (right->map_token) {
2745 unmap_extent_buffer(right, right->map_token, KM_USER1);
2746 right->map_token = NULL;
2747 }
2748
2749 btrfs_set_header_nritems(l, mid); 2765 btrfs_set_header_nritems(l, mid);
2750 ret = 0; 2766 ret = 0;
2751 btrfs_item_key(right, &disk_key, 0); 2767 btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3264 u32 ioff; 3280 u32 ioff;
3265 item = btrfs_item_nr(leaf, i); 3281 item = btrfs_item_nr(leaf, i);
3266 3282
3267 if (!leaf->map_token) {
3268 map_extent_buffer(leaf, (unsigned long)item,
3269 sizeof(struct btrfs_item),
3270 &leaf->map_token, &leaf->kaddr,
3271 &leaf->map_start, &leaf->map_len,
3272 KM_USER1);
3273 }
3274
3275 ioff = btrfs_item_offset(leaf, item); 3283 ioff = btrfs_item_offset(leaf, item);
3276 btrfs_set_item_offset(leaf, item, ioff + size_diff); 3284 btrfs_set_item_offset(leaf, item, ioff + size_diff);
3277 } 3285 }
3278 3286
3279 if (leaf->map_token) {
3280 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3281 leaf->map_token = NULL;
3282 }
3283
3284 /* shift the data */ 3287 /* shift the data */
3285 if (from_end) { 3288 if (from_end) {
3286 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3289 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3377 u32 ioff; 3380 u32 ioff;
3378 item = btrfs_item_nr(leaf, i); 3381 item = btrfs_item_nr(leaf, i);
3379 3382
3380 if (!leaf->map_token) {
3381 map_extent_buffer(leaf, (unsigned long)item,
3382 sizeof(struct btrfs_item),
3383 &leaf->map_token, &leaf->kaddr,
3384 &leaf->map_start, &leaf->map_len,
3385 KM_USER1);
3386 }
3387 ioff = btrfs_item_offset(leaf, item); 3383 ioff = btrfs_item_offset(leaf, item);
3388 btrfs_set_item_offset(leaf, item, ioff - data_size); 3384 btrfs_set_item_offset(leaf, item, ioff - data_size);
3389 } 3385 }
3390 3386
3391 if (leaf->map_token) {
3392 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3393 leaf->map_token = NULL;
3394 }
3395
3396 /* shift the data */ 3387 /* shift the data */
3397 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3388 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3398 data_end - data_size, btrfs_leaf_data(leaf) + 3389 data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3494 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3485 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3495 */ 3486 */
3496 /* first correct the data pointers */ 3487 /* first correct the data pointers */
3497 WARN_ON(leaf->map_token);
3498 for (i = slot; i < nritems; i++) { 3488 for (i = slot; i < nritems; i++) {
3499 u32 ioff; 3489 u32 ioff;
3500 3490
3501 item = btrfs_item_nr(leaf, i); 3491 item = btrfs_item_nr(leaf, i);
3502 if (!leaf->map_token) {
3503 map_extent_buffer(leaf, (unsigned long)item,
3504 sizeof(struct btrfs_item),
3505 &leaf->map_token, &leaf->kaddr,
3506 &leaf->map_start, &leaf->map_len,
3507 KM_USER1);
3508 }
3509
3510 ioff = btrfs_item_offset(leaf, item); 3492 ioff = btrfs_item_offset(leaf, item);
3511 btrfs_set_item_offset(leaf, item, ioff - total_data); 3493 btrfs_set_item_offset(leaf, item, ioff - total_data);
3512 } 3494 }
3513 if (leaf->map_token) {
3514 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3515 leaf->map_token = NULL;
3516 }
3517
3518 /* shift the items */ 3495 /* shift the items */
3519 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3496 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3520 btrfs_item_nr_offset(slot), 3497 btrfs_item_nr_offset(slot),
@@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
3608 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3585 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3609 */ 3586 */
3610 /* first correct the data pointers */ 3587 /* first correct the data pointers */
3611 WARN_ON(leaf->map_token);
3612 for (i = slot; i < nritems; i++) { 3588 for (i = slot; i < nritems; i++) {
3613 u32 ioff; 3589 u32 ioff;
3614 3590
3615 item = btrfs_item_nr(leaf, i); 3591 item = btrfs_item_nr(leaf, i);
3616 if (!leaf->map_token) {
3617 map_extent_buffer(leaf, (unsigned long)item,
3618 sizeof(struct btrfs_item),
3619 &leaf->map_token, &leaf->kaddr,
3620 &leaf->map_start, &leaf->map_len,
3621 KM_USER1);
3622 }
3623
3624 ioff = btrfs_item_offset(leaf, item); 3592 ioff = btrfs_item_offset(leaf, item);
3625 btrfs_set_item_offset(leaf, item, ioff - total_data); 3593 btrfs_set_item_offset(leaf, item, ioff - total_data);
3626 } 3594 }
3627 if (leaf->map_token) {
3628 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3629 leaf->map_token = NULL;
3630 }
3631
3632 /* shift the items */ 3595 /* shift the items */
3633 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3596 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3634 btrfs_item_nr_offset(slot), 3597 btrfs_item_nr_offset(slot),
@@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3840 u32 ioff; 3803 u32 ioff;
3841 3804
3842 item = btrfs_item_nr(leaf, i); 3805 item = btrfs_item_nr(leaf, i);
3843 if (!leaf->map_token) {
3844 map_extent_buffer(leaf, (unsigned long)item,
3845 sizeof(struct btrfs_item),
3846 &leaf->map_token, &leaf->kaddr,
3847 &leaf->map_start, &leaf->map_len,
3848 KM_USER1);
3849 }
3850 ioff = btrfs_item_offset(leaf, item); 3806 ioff = btrfs_item_offset(leaf, item);
3851 btrfs_set_item_offset(leaf, item, ioff + dsize); 3807 btrfs_set_item_offset(leaf, item, ioff + dsize);
3852 } 3808 }
3853 3809
3854 if (leaf->map_token) {
3855 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3856 leaf->map_token = NULL;
3857 }
3858
3859 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 3810 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3860 btrfs_item_nr_offset(slot + nr), 3811 btrfs_item_nr_offset(slot + nr),
3861 sizeof(struct btrfs_item) * 3812 sizeof(struct btrfs_item) *
@@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4004 3955
4005 WARN_ON(!path->keep_locks); 3956 WARN_ON(!path->keep_locks);
4006again: 3957again:
4007 cur = btrfs_lock_root_node(root); 3958 cur = btrfs_read_lock_root_node(root);
4008 level = btrfs_header_level(cur); 3959 level = btrfs_header_level(cur);
4009 WARN_ON(path->nodes[level]); 3960 WARN_ON(path->nodes[level]);
4010 path->nodes[level] = cur; 3961 path->nodes[level] = cur;
4011 path->locks[level] = 1; 3962 path->locks[level] = BTRFS_READ_LOCK;
4012 3963
4013 if (btrfs_header_generation(cur) < min_trans) { 3964 if (btrfs_header_generation(cur) < min_trans) {
4014 ret = 1; 3965 ret = 1;
@@ -4098,12 +4049,12 @@ find_next_key:
4098 cur = read_node_slot(root, cur, slot); 4049 cur = read_node_slot(root, cur, slot);
4099 BUG_ON(!cur); 4050 BUG_ON(!cur);
4100 4051
4101 btrfs_tree_lock(cur); 4052 btrfs_tree_read_lock(cur);
4102 4053
4103 path->locks[level - 1] = 1; 4054 path->locks[level - 1] = BTRFS_READ_LOCK;
4104 path->nodes[level - 1] = cur; 4055 path->nodes[level - 1] = cur;
4105 unlock_up(path, level, 1); 4056 unlock_up(path, level, 1);
4106 btrfs_clear_path_blocking(path, NULL); 4057 btrfs_clear_path_blocking(path, NULL, 0);
4107 } 4058 }
4108out: 4059out:
4109 if (ret == 0) 4060 if (ret == 0)
@@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4218 u32 nritems; 4169 u32 nritems;
4219 int ret; 4170 int ret;
4220 int old_spinning = path->leave_spinning; 4171 int old_spinning = path->leave_spinning;
4221 int force_blocking = 0; 4172 int next_rw_lock = 0;
4222 4173
4223 nritems = btrfs_header_nritems(path->nodes[0]); 4174 nritems = btrfs_header_nritems(path->nodes[0]);
4224 if (nritems == 0) 4175 if (nritems == 0)
4225 return 1; 4176 return 1;
4226 4177
4227 /*
4228 * we take the blocks in an order that upsets lockdep. Using
4229 * blocking mode is the only way around it.
4230 */
4231#ifdef CONFIG_DEBUG_LOCK_ALLOC
4232 force_blocking = 1;
4233#endif
4234
4235 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4178 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4236again: 4179again:
4237 level = 1; 4180 level = 1;
4238 next = NULL; 4181 next = NULL;
4182 next_rw_lock = 0;
4239 btrfs_release_path(path); 4183 btrfs_release_path(path);
4240 4184
4241 path->keep_locks = 1; 4185 path->keep_locks = 1;
4242 4186 path->leave_spinning = 1;
4243 if (!force_blocking)
4244 path->leave_spinning = 1;
4245 4187
4246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4247 path->keep_locks = 0; 4189 path->keep_locks = 0;
@@ -4281,11 +4223,12 @@ again:
4281 } 4223 }
4282 4224
4283 if (next) { 4225 if (next) {
4284 btrfs_tree_unlock(next); 4226 btrfs_tree_unlock_rw(next, next_rw_lock);
4285 free_extent_buffer(next); 4227 free_extent_buffer(next);
4286 } 4228 }
4287 4229
4288 next = c; 4230 next = c;
4231 next_rw_lock = path->locks[level];
4289 ret = read_block_for_search(NULL, root, path, &next, level, 4232 ret = read_block_for_search(NULL, root, path, &next, level,
4290 slot, &key); 4233 slot, &key);
4291 if (ret == -EAGAIN) 4234 if (ret == -EAGAIN)
@@ -4297,15 +4240,14 @@ again:
4297 } 4240 }
4298 4241
4299 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4300 ret = btrfs_try_spin_lock(next); 4243 ret = btrfs_try_tree_read_lock(next);
4301 if (!ret) { 4244 if (!ret) {
4302 btrfs_set_path_blocking(path); 4245 btrfs_set_path_blocking(path);
4303 btrfs_tree_lock(next); 4246 btrfs_tree_read_lock(next);
4304 if (!force_blocking) 4247 btrfs_clear_path_blocking(path, next,
4305 btrfs_clear_path_blocking(path, next); 4248 BTRFS_READ_LOCK);
4306 } 4249 }
4307 if (force_blocking) 4250 next_rw_lock = BTRFS_READ_LOCK;
4308 btrfs_set_lock_blocking(next);
4309 } 4251 }
4310 break; 4252 break;
4311 } 4253 }
@@ -4314,14 +4256,13 @@ again:
4314 level--; 4256 level--;
4315 c = path->nodes[level]; 4257 c = path->nodes[level];
4316 if (path->locks[level]) 4258 if (path->locks[level])
4317 btrfs_tree_unlock(c); 4259 btrfs_tree_unlock_rw(c, path->locks[level]);
4318 4260
4319 free_extent_buffer(c); 4261 free_extent_buffer(c);
4320 path->nodes[level] = next; 4262 path->nodes[level] = next;
4321 path->slots[level] = 0; 4263 path->slots[level] = 0;
4322 if (!path->skip_locking) 4264 if (!path->skip_locking)
4323 path->locks[level] = 1; 4265 path->locks[level] = next_rw_lock;
4324
4325 if (!level) 4266 if (!level)
4326 break; 4267 break;
4327 4268
@@ -4336,16 +4277,14 @@ again:
4336 } 4277 }
4337 4278
4338 if (!path->skip_locking) { 4279 if (!path->skip_locking) {
4339 btrfs_assert_tree_locked(path->nodes[level]); 4280 ret = btrfs_try_tree_read_lock(next);
4340 ret = btrfs_try_spin_lock(next);
4341 if (!ret) { 4281 if (!ret) {
4342 btrfs_set_path_blocking(path); 4282 btrfs_set_path_blocking(path);
4343 btrfs_tree_lock(next); 4283 btrfs_tree_read_lock(next);
4344 if (!force_blocking) 4284 btrfs_clear_path_blocking(path, next,
4345 btrfs_clear_path_blocking(path, next); 4285 BTRFS_READ_LOCK);
4346 } 4286 }
4347 if (force_blocking) 4287 next_rw_lock = BTRFS_READ_LOCK;
4348 btrfs_set_lock_blocking(next);
4349 } 4288 }
4350 } 4289 }
4351 ret = 0; 4290 ret = 0;
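Both descent sites in btrfs_next_leaf use the same acquisition fallback, which also appears in btrfs_search_slot above: try the spinning read lock first and, only on contention, flip the already-held path locks to blocking before sleeping on the lock. In isolation:

if (!btrfs_try_tree_read_lock(next)) {
	btrfs_set_path_blocking(path);	/* waiters on our locks may sleep */
	btrfs_tree_read_lock(next);	/* blocking acquisition */
	btrfs_clear_path_blocking(path, next, BTRFS_READ_LOCK);
}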
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe9287b06496..03912c5c6f49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -755,6 +755,8 @@ struct btrfs_space_info {
755 chunks for this space */ 755 chunks for this space */
756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ 756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
757 757
758 unsigned int flush:1; /* set if we are trying to make space */
759
758 unsigned int force_alloc; /* set if we need to force a chunk 760 unsigned int force_alloc; /* set if we need to force a chunk
759 alloc for this space */ 761 alloc for this space */
760 762
@@ -764,7 +766,7 @@ struct btrfs_space_info {
764 struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 766 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
765 spinlock_t lock; 767 spinlock_t lock;
766 struct rw_semaphore groups_sem; 768 struct rw_semaphore groups_sem;
767 atomic_t caching_threads; 769 wait_queue_head_t wait;
768}; 770};
769 771
770struct btrfs_block_rsv { 772struct btrfs_block_rsv {
@@ -824,6 +826,7 @@ struct btrfs_caching_control {
824 struct list_head list; 826 struct list_head list;
825 struct mutex mutex; 827 struct mutex mutex;
826 wait_queue_head_t wait; 828 wait_queue_head_t wait;
829 struct btrfs_work work;
827 struct btrfs_block_group_cache *block_group; 830 struct btrfs_block_group_cache *block_group;
828 u64 progress; 831 u64 progress;
829 atomic_t count; 832 atomic_t count;
@@ -1032,6 +1035,8 @@ struct btrfs_fs_info {
1032 struct btrfs_workers endio_write_workers; 1035 struct btrfs_workers endio_write_workers;
1033 struct btrfs_workers endio_freespace_worker; 1036 struct btrfs_workers endio_freespace_worker;
1034 struct btrfs_workers submit_workers; 1037 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers;
1039
1035 /* 1040 /*
1036 * fixup workers take dirty pages that didn't properly go through 1041 * fixup workers take dirty pages that didn't properly go through
1037 * the cow mechanism and make them safe to write. It happens 1042 * the cow mechanism and make them safe to write. It happens
@@ -1410,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1410#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1415#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1411static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1416static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1412{ \ 1417{ \
1413 type *p = kmap_atomic(eb->first_page, KM_USER0); \ 1418 type *p = page_address(eb->first_page); \
1414 u##bits res = le##bits##_to_cpu(p->member); \ 1419 u##bits res = le##bits##_to_cpu(p->member); \
1415 kunmap_atomic(p, KM_USER0); \
1416 return res; \ 1420 return res; \
1417} \ 1421} \
1418static inline void btrfs_set_##name(struct extent_buffer *eb, \ 1422static inline void btrfs_set_##name(struct extent_buffer *eb, \
1419 u##bits val) \ 1423 u##bits val) \
1420{ \ 1424{ \
1421 type *p = kmap_atomic(eb->first_page, KM_USER0); \ 1425 type *p = page_address(eb->first_page); \
1422 p->member = cpu_to_le##bits(val); \ 1426 p->member = cpu_to_le##bits(val); \
1423 kunmap_atomic(p, KM_USER0); \
1424} 1427}
1425 1428
1426#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ 1429#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
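Replacing kmap_atomic with page_address here assumes btrfs metadata pages are never allocated from highmem, so the page always has a permanent kernel mapping and no temporary KM_USER0 window (with its preemption restrictions) is needed. The accessor after the change, in outline (struct header stands in for the macro-generated type):

static inline u32 read_member(struct extent_buffer *eb)
{
	/* valid only for lowmem pages: no kmap/kunmap bracket required */
	struct header *p = page_address(eb->first_page);

	return le32_to_cpu(p->member);
}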
@@ -2128,7 +2131,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2128 2131
2129/* extent-tree.c */ 2132/* extent-tree.c */
2130static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2131 int num_items) 2134 unsigned num_items)
2132{ 2135{
2133 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2136 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2134 3 * num_items; 2137 3 * num_items;
@@ -2222,9 +2225,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2222void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2225void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2223int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2226int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2224void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2227void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2225int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2226 struct btrfs_root *root,
2227 int num_items);
2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2229 struct btrfs_root *root); 2229 struct btrfs_root *root);
2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2330,7 +2330,7 @@ struct btrfs_path *btrfs_alloc_path(void);
2330void btrfs_free_path(struct btrfs_path *p); 2330void btrfs_free_path(struct btrfs_path *p);
2331void btrfs_set_path_blocking(struct btrfs_path *p); 2331void btrfs_set_path_blocking(struct btrfs_path *p);
2332void btrfs_clear_path_blocking(struct btrfs_path *p, 2332void btrfs_clear_path_blocking(struct btrfs_path *p,
2333 struct extent_buffer *held); 2333 struct extent_buffer *held, int held_rw);
2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2335 2335
2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2365,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2365int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2365int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2366int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2366int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2367int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2367int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2368int btrfs_drop_snapshot(struct btrfs_root *root, 2368void btrfs_drop_snapshot(struct btrfs_root *root,
2369 struct btrfs_block_rsv *block_rsv, int update_ref); 2369 struct btrfs_block_rsv *block_rsv, int update_ref);
2370int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2370int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root, 2371 struct btrfs_root *root,
2372 struct extent_buffer *node, 2372 struct extent_buffer *node,
@@ -2404,8 +2404,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2404 btrfs_root_item *item, struct btrfs_key *key); 2404 btrfs_root_item *item, struct btrfs_key *key);
2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2407int btrfs_set_root_node(struct btrfs_root_item *item, 2407void btrfs_set_root_node(struct btrfs_root_item *item,
2408 struct extent_buffer *node); 2408 struct extent_buffer *node);
2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item); 2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2410 2410
2411/* dir-item.c */ 2411/* dir-item.c */
@@ -2521,6 +2521,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
2521#define PageChecked PageFsMisc 2521#define PageChecked PageFsMisc
2522#endif 2522#endif
2523 2523
2524/* This forces readahead on a given range of bytes in an inode */
2525static inline void btrfs_force_ra(struct address_space *mapping,
2526 struct file_ra_state *ra, struct file *file,
2527 pgoff_t offset, unsigned long req_size)
2528{
2529 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
2530}
2531
2524struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); 2532struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2525int btrfs_set_inode_index(struct inode *dir, u64 *index); 2533int btrfs_set_inode_index(struct inode *dir, u64 *index);
2526int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2534int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2549,9 +2557,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2549int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2557int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2550 size_t size, struct bio *bio, unsigned long bio_flags); 2558 size_t size, struct bio *bio, unsigned long bio_flags);
2551 2559
2552unsigned long btrfs_force_ra(struct address_space *mapping,
2553 struct file_ra_state *ra, struct file *file,
2554 pgoff_t offset, pgoff_t last_index);
2555int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2560int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2556int btrfs_readpage(struct file *file, struct page *page); 2561int btrfs_readpage(struct file *file, struct page *page);
2557void btrfs_evict_inode(struct inode *inode); 2562void btrfs_evict_inode(struct inode *inode);
@@ -2646,12 +2651,21 @@ do { \
2646/* acl.c */ 2651/* acl.c */
2647#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2652#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2648struct posix_acl *btrfs_get_acl(struct inode *inode, int type); 2653struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
2649#else
2650#define btrfs_get_acl NULL
2651#endif
2652int btrfs_init_acl(struct btrfs_trans_handle *trans, 2654int btrfs_init_acl(struct btrfs_trans_handle *trans,
2653 struct inode *inode, struct inode *dir); 2655 struct inode *inode, struct inode *dir);
2654int btrfs_acl_chmod(struct inode *inode); 2656int btrfs_acl_chmod(struct inode *inode);
2657#else
2658#define btrfs_get_acl NULL
2659static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
2660 struct inode *inode, struct inode *dir)
2661{
2662 return 0;
2663}
2664static inline int btrfs_acl_chmod(struct inode *inode)
2665{
2666 return 0;
2667}
2668#endif
2655 2669
2656/* relocation.c */ 2670/* relocation.c */
2657int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 2671int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9b..b52c672f4c18 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
735 } 735 }
736 736
 737 /* reset all the locked nodes in the path to spinning locks. */ 737 /* reset all the locked nodes in the path to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL); 738 btrfs_clear_path_blocking(path, NULL, 0);
739 739
740 /* insert the keys of the items */ 740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size, 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 8d27af4bd8b9..7083d08b2a21 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -25,7 +25,7 @@
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/wait.h> 27#include <linux/wait.h>
28#include <asm/atomic.h> 28#include <linux/atomic.h>
29 29
30#include "ctree.h" 30#include "ctree.h"
31 31
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f2593c4f0..31d84e78129b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
89 data_size = sizeof(*dir_item) + name_len + data_len; 89 data_size = sizeof(*dir_item) + name_len + data_len;
90 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 90 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
91 name, name_len); 91 name, name_len);
92 /* 92 if (IS_ERR(dir_item))
93 * FIXME: at some point we should handle xattr's that are larger than 93 return PTR_ERR(dir_item);
94 * what we can fit in our leaf. We set location to NULL b/c we arent
95 * pointing at anything else, that will change if we store the xattr
96 * data in a separate inode.
97 */
98 BUG_ON(IS_ERR(dir_item));
99 memset(&location, 0, sizeof(location)); 94 memset(&location, 0, sizeof(location));
100 95
101 leaf = path->nodes[0]; 96 leaf = path->nodes[0];
@@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
203 struct btrfs_key key; 198 struct btrfs_key key;
204 int ins_len = mod < 0 ? -1 : 0; 199 int ins_len = mod < 0 ? -1 : 0;
205 int cow = mod != 0; 200 int cow = mod != 0;
206 struct btrfs_key found_key;
207 struct extent_buffer *leaf;
208 201
209 key.objectid = dir; 202 key.objectid = dir;
210 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 203 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
214 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 207 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
215 if (ret < 0) 208 if (ret < 0)
216 return ERR_PTR(ret); 209 return ERR_PTR(ret);
217 if (ret > 0) { 210 if (ret > 0)
218 if (path->slots[0] == 0)
219 return NULL;
220 path->slots[0]--;
221 }
222
223 leaf = path->nodes[0];
224 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
225
226 if (found_key.objectid != dir ||
227 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
228 found_key.offset != key.offset)
229 return NULL; 211 return NULL;
230 212
231 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
320 struct btrfs_key key; 302 struct btrfs_key key;
321 int ins_len = mod < 0 ? -1 : 0; 303 int ins_len = mod < 0 ? -1 : 0;
322 int cow = mod != 0; 304 int cow = mod != 0;
323 struct btrfs_key found_key;
324 struct extent_buffer *leaf;
325 305
326 key.objectid = dir; 306 key.objectid = dir;
327 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 307 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
329 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 309 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
330 if (ret < 0) 310 if (ret < 0)
331 return ERR_PTR(ret); 311 return ERR_PTR(ret);
332 if (ret > 0) { 312 if (ret > 0)
333 if (path->slots[0] == 0)
334 return NULL;
335 path->slots[0]--;
336 }
337
338 leaf = path->nodes[0];
339 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
340
341 if (found_key.objectid != dir ||
342 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
343 found_key.offset != key.offset)
344 return NULL; 313 return NULL;
345 314
346 return btrfs_match_dir_item_name(root, path, name, name_len); 315 return btrfs_match_dir_item_name(root, path, name, name_len);
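Both lookups shed the back-up-one-slot dance because dir items and xattr items are searched with a fully specified key (objectid, BTRFS_DIR_ITEM_KEY or BTRFS_XATTR_ITEM_KEY, offset = name hash): if btrfs_search_slot returns a positive value the exact key is simply absent, so re-reading and comparing the previous slot's key could never succeed. The surviving shape:

ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
	return ERR_PTR(ret);	/* real error */
if (ret > 0)
	return NULL;		/* exact key not present */
return btrfs_match_dir_item_name(root, path, name, name_len);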
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b231ae13b269..07b3ac662e19 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
100 struct btrfs_work work; 100 struct btrfs_work work;
101}; 101};
102 102
103/* These are used to set the lockdep class on the extent buffer locks. 103/*
104 * The class is set by the readpage_end_io_hook after the buffer has 104 * Lockdep class keys for extent_buffer->lock's in this root. For a given
105 * passed csum validation but before the pages are unlocked. 105 * eb, the lockdep key is determined by the btrfs_root it belongs to and
106 * the level the eb occupies in the tree.
107 *
108 * Different roots are used for different purposes and may nest inside each
109 * other and they require separate keysets. As lockdep keys should be
110 * static, assign keysets according to the purpose of the root as indicated
111 * by btrfs_root->objectid. This ensures that all special purpose roots
112 * have separate keysets.
106 * 113 *
107 * The lockdep class is also set by btrfs_init_new_buffer on freshly 114 * Lock-nesting across peer nodes is always done with the immediate parent
108 * allocated blocks. 115 * node locked thus preventing deadlock. As lockdep doesn't know this, use
116 * subclass to avoid triggering lockdep warning in such cases.
109 * 117 *
110 * The class is based on the level in the tree block, which allows lockdep 118 * The key is set by the readpage_end_io_hook after the buffer has passed
111 * to know that lower nodes nest inside the locks of higher nodes. 119 * csum validation but before the pages are unlocked. It is also set by
120 * btrfs_init_new_buffer on freshly allocated blocks.
112 * 121 *
113 * We also add a check to make sure the highest level of the tree is 122 * We also add a check to make sure the highest level of the tree is the
114 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this 123 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
115 * code needs update as well. 124 * needs update as well.
116 */ 125 */
117#ifdef CONFIG_DEBUG_LOCK_ALLOC 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
118# if BTRFS_MAX_LEVEL != 8 127# if BTRFS_MAX_LEVEL != 8
119# error 128# error
120# endif 129# endif
121static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; 130
122static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { 131static struct btrfs_lockdep_keyset {
123 /* leaf */ 132 u64 id; /* root objectid */
124 "btrfs-extent-00", 133 const char *name_stem; /* lock name stem */
125 "btrfs-extent-01", 134 char names[BTRFS_MAX_LEVEL + 1][20];
126 "btrfs-extent-02", 135 struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
127 "btrfs-extent-03", 136} btrfs_lockdep_keysets[] = {
128 "btrfs-extent-04", 137 { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
129 "btrfs-extent-05", 138 { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
130 "btrfs-extent-06", 139 { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
131 "btrfs-extent-07", 140 { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
132 /* highest possible level */ 141 { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
133 "btrfs-extent-08", 142 { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
143 { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
144 { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
145 { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
146 { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
147 { .id = 0, .name_stem = "tree" },
134}; 148};
149
150void __init btrfs_init_lockdep(void)
151{
152 int i, j;
153
154 /* initialize lockdep class names */
155 for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
156 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
157
158 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
159 snprintf(ks->names[j], sizeof(ks->names[j]),
160 "btrfs-%s-%02d", ks->name_stem, j);
161 }
162}
163
164void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
165 int level)
166{
167 struct btrfs_lockdep_keyset *ks;
168
169 BUG_ON(level >= ARRAY_SIZE(ks->keys));
170
171 /* find the matching keyset, id 0 is the default entry */
172 for (ks = btrfs_lockdep_keysets; ks->id; ks++)
173 if (ks->id == objectid)
174 break;
175
176 lockdep_set_class_and_name(&eb->lock,
177 &ks->keys[level], ks->names[level]);
178}
179
135#endif 180#endif
136 181
137/* 182/*
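
The table above trades the old flat per-level key array for one keyset per
special-purpose root, with the lock names generated once at init time. A
quick user-space check of what btrfs_init_lockdep() produces (a sketch; the
stems are copied from the keyset table, so e.g. "btrfs-extent-03" is the
extent tree at level 3):

    #include <stdio.h>

    int main(void)
    {
            const char *stems[] = { "root", "extent", "chunk", "dev", "fs",
                                    "csum", "orphan", "log", "treloc",
                                    "dreloc", "tree" };
            char name[20];
            int i, j;

            for (i = 0; i < 11; i++)
                    for (j = 0; j <= 8; j++) {  /* BTRFS_MAX_LEVEL == 8 */
                            snprintf(name, sizeof(name), "btrfs-%s-%02d",
                                     stems[i], j);
                            puts(name);
                    }
            return 0;
    }
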
@@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
217 unsigned long len; 262 unsigned long len;
218 unsigned long cur_len; 263 unsigned long cur_len;
219 unsigned long offset = BTRFS_CSUM_SIZE; 264 unsigned long offset = BTRFS_CSUM_SIZE;
220 char *map_token = NULL;
221 char *kaddr; 265 char *kaddr;
222 unsigned long map_start; 266 unsigned long map_start;
223 unsigned long map_len; 267 unsigned long map_len;
@@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
228 len = buf->len - offset; 272 len = buf->len - offset;
229 while (len > 0) { 273 while (len > 0) {
230 err = map_private_extent_buffer(buf, offset, 32, 274 err = map_private_extent_buffer(buf, offset, 32,
231 &map_token, &kaddr, 275 &kaddr, &map_start, &map_len);
232 &map_start, &map_len, KM_USER0);
233 if (err) 276 if (err)
234 return 1; 277 return 1;
235 cur_len = min(len, map_len - (offset - map_start)); 278 cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
237 crc, cur_len); 280 crc, cur_len);
238 len -= cur_len; 281 len -= cur_len;
239 offset += cur_len; 282 offset += cur_len;
240 unmap_extent_buffer(buf, map_token, KM_USER0);
241 } 283 }
242 if (csum_size > sizeof(inline_result)) { 284 if (csum_size > sizeof(inline_result)) {
243 result = kzalloc(csum_size * sizeof(char), GFP_NOFS); 285 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
494 return 0; 536 return 0;
495} 537}
496 538
497#ifdef CONFIG_DEBUG_LOCK_ALLOC
498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
499{
500 lockdep_set_class_and_name(&eb->lock,
501 &btrfs_eb_class[level],
502 btrfs_eb_name[level]);
503}
504#endif
505
506static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 539static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
507 struct extent_state *state) 540 struct extent_state *state)
508{ 541{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
553 } 586 }
554 found_level = btrfs_header_level(eb); 587 found_level = btrfs_header_level(eb);
555 588
556 btrfs_set_buffer_lockdep_class(eb, found_level); 589 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
590 eb, found_level);
557 591
558 ret = csum_tree_block(root, eb, 1); 592 ret = csum_tree_block(root, eb, 1);
559 if (ret) { 593 if (ret) {
@@ -1598,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1598 goto fail_bdi; 1632 goto fail_bdi;
1599 } 1633 }
1600 1634
1601 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; 1635 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
1602 1636
1603 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1637 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1604 INIT_LIST_HEAD(&fs_info->trans_list); 1638 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1802,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1802 fs_info->thread_pool_size), 1836 fs_info->thread_pool_size),
1803 &fs_info->generic_worker); 1837 &fs_info->generic_worker);
1804 1838
1839 btrfs_init_workers(&fs_info->caching_workers, "cache",
1840 2, &fs_info->generic_worker);
1841
1805 /* a higher idle thresh on the submit workers makes it much more 1842 /* a higher idle thresh on the submit workers makes it much more
1806 * likely that bios will be sent down in a sane order to the 1843
1807 * devices 1844 * devices
@@ -1855,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1855 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1892 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1856 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1857 btrfs_start_workers(&fs_info->delayed_workers, 1); 1894 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1);
1858 1896
1859 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1860 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2112,6 +2150,7 @@ fail_sb_buffer:
2112 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2150 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2113 btrfs_stop_workers(&fs_info->submit_workers); 2151 btrfs_stop_workers(&fs_info->submit_workers);
2114 btrfs_stop_workers(&fs_info->delayed_workers); 2152 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers);
2115fail_alloc: 2154fail_alloc:
2116 kfree(fs_info->delayed_root); 2155 kfree(fs_info->delayed_root);
2117fail_iput: 2156fail_iput:
@@ -2577,6 +2616,7 @@ int close_ctree(struct btrfs_root *root)
2577 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2616 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2578 btrfs_stop_workers(&fs_info->submit_workers); 2617 btrfs_stop_workers(&fs_info->submit_workers);
2579 btrfs_stop_workers(&fs_info->delayed_workers); 2618 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers);
2580 2620
2581 btrfs_close_devices(fs_info->fs_devices); 2621 btrfs_close_devices(fs_info->fs_devices);
2582 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2622 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a67aae..bec3ea4bd67f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page);
87 87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); 90void btrfs_init_lockdep(void);
91void btrfs_set_buffer_lockdep_class(u64 objectid,
92 struct extent_buffer *eb, int level);
91#else 93#else
92static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, 94static inline void btrfs_init_lockdep(void)
93 int level) 95{ }
96static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
97 struct extent_buffer *eb, int level)
94{ 98{
95} 99}
96#endif 100#endif
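
With the header change both helpers gain a root-objectid parameter and an
init hook, and the !CONFIG_DEBUG_LOCK_ALLOC stubs keep callers
unconditional. The two call sites as they appear elsewhere in this diff
(read side in btree_readpage_end_io_hook, allocation side in
btrfs_init_new_buffer):

    btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level);
    btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
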
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456fdb60..f5be06a2462f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
320 return total_added; 320 return total_added;
321} 321}
322 322
323static int caching_kthread(void *data) 323static noinline void caching_thread(struct btrfs_work *work)
324{ 324{
325 struct btrfs_block_group_cache *block_group = data; 325 struct btrfs_block_group_cache *block_group;
326 struct btrfs_fs_info *fs_info = block_group->fs_info; 326 struct btrfs_fs_info *fs_info;
327 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 327 struct btrfs_caching_control *caching_ctl;
328 struct btrfs_root *extent_root = fs_info->extent_root; 328 struct btrfs_root *extent_root;
329 struct btrfs_path *path; 329 struct btrfs_path *path;
330 struct extent_buffer *leaf; 330 struct extent_buffer *leaf;
331 struct btrfs_key key; 331 struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
334 u32 nritems; 334 u32 nritems;
335 int ret = 0; 335 int ret = 0;
336 336
337 caching_ctl = container_of(work, struct btrfs_caching_control, work);
338 block_group = caching_ctl->block_group;
339 fs_info = block_group->fs_info;
340 extent_root = fs_info->extent_root;
341
337 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
338 if (!path) 343 if (!path)
339 return -ENOMEM; 344 goto out;
340 345
341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 346 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
342 347
@@ -433,13 +438,11 @@ err:
433 free_excluded_extents(extent_root, block_group); 438 free_excluded_extents(extent_root, block_group);
434 439
435 mutex_unlock(&caching_ctl->mutex); 440 mutex_unlock(&caching_ctl->mutex);
441out:
436 wake_up(&caching_ctl->wait); 442 wake_up(&caching_ctl->wait);
437 443
438 put_caching_control(caching_ctl); 444 put_caching_control(caching_ctl);
439 atomic_dec(&block_group->space_info->caching_threads);
440 btrfs_put_block_group(block_group); 445 btrfs_put_block_group(block_group);
441
442 return 0;
443} 446}
444 447
445static int cache_block_group(struct btrfs_block_group_cache *cache, 448static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
449{ 452{
450 struct btrfs_fs_info *fs_info = cache->fs_info; 453 struct btrfs_fs_info *fs_info = cache->fs_info;
451 struct btrfs_caching_control *caching_ctl; 454 struct btrfs_caching_control *caching_ctl;
452 struct task_struct *tsk;
453 int ret = 0; 455 int ret = 0;
454 456
455 smp_mb(); 457 smp_mb();
@@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
501 caching_ctl->progress = cache->key.objectid; 503 caching_ctl->progress = cache->key.objectid;
502 /* one for caching kthread, one for caching block group list */ 504 /* one for caching kthread, one for caching block group list */
503 atomic_set(&caching_ctl->count, 2); 505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
504 507
505 spin_lock(&cache->lock); 508 spin_lock(&cache->lock);
506 if (cache->cached != BTRFS_CACHE_NO) { 509 if (cache->cached != BTRFS_CACHE_NO) {
@@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
516 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
517 up_write(&fs_info->extent_commit_sem); 520 up_write(&fs_info->extent_commit_sem);
518 521
519 atomic_inc(&cache->space_info->caching_threads);
520 btrfs_get_block_group(cache); 522 btrfs_get_block_group(cache);
521 523
522 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 524 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
523 cache->key.objectid);
524 if (IS_ERR(tsk)) {
525 ret = PTR_ERR(tsk);
526 printk(KERN_ERR "error running thread %d\n", ret);
527 BUG();
528 }
529 525
530 return ret; 526 return ret;
531} 527}
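
The kthread-per-block-group model is replaced with the standard btrfs
work-queue pattern: the work item is embedded in the caching control, so the
handler recovers its context with container_of() and no longer needs a data
pointer or a return value. The pattern, reduced to its two halves (a sketch
of the code in this hunk):

    /* consumer: recover the context from the embedded work item */
    static noinline void caching_thread(struct btrfs_work *work)
    {
            struct btrfs_caching_control *ctl =
                    container_of(work, struct btrfs_caching_control, work);
            /* ... scan the extent tree for ctl->block_group ... */
    }

    /* producer, from cache_block_group() */
    caching_ctl->work.func = caching_thread;
    btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);

This also removes the kthread_run() failure path, since queueing a work
item cannot fail the way spawning a thread can.
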
@@ -667,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
667 struct btrfs_path *path; 663 struct btrfs_path *path;
668 664
669 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
670 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
671 key.objectid = start; 669 key.objectid = start;
672 key.offset = len; 670 key.offset = len;
673 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 671 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1784,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1784 1782
1785 1783
1786 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1784 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard)
1786 continue;
1787
1787 ret = btrfs_issue_discard(stripe->dev->bdev, 1788 ret = btrfs_issue_discard(stripe->dev->bdev,
1788 stripe->physical, 1789 stripe->physical,
1789 stripe->length); 1790 stripe->length);
@@ -1791,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1791 discarded_bytes += stripe->length; 1792 discarded_bytes += stripe->length;
1792 else if (ret != -EOPNOTSUPP) 1793 else if (ret != -EOPNOTSUPP)
1793 break; 1794 break;
1795
1796 /*
1797 * If we get back EOPNOTSUPP for some reason, ignore the
1798 * return value so we don't break the callers of
1799 * discard_extent.
1800 */
1801 ret = 0;
1794 } 1802 }
1795 kfree(multi); 1803 kfree(multi);
1796 } 1804 }
1797 if (discarded_bytes && ret == -EOPNOTSUPP)
1798 ret = 0;
1799 1805
1800 if (actual_bytes) 1806 if (actual_bytes)
1801 *actual_bytes = discarded_bytes; 1807 *actual_bytes = discarded_bytes;
@@ -2932,9 +2938,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2932 found->full = 0; 2938 found->full = 0;
2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 2939 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0; 2940 found->chunk_alloc = 0;
2941 found->flush = 0;
2942 init_waitqueue_head(&found->wait);
2935 *space_info = found; 2943 *space_info = found;
2936 list_add_rcu(&found->list, &info->space_info); 2944 list_add_rcu(&found->list, &info->space_info);
2937 atomic_set(&found->caching_threads, 0);
2938 return 0; 2945 return 0;
2939} 2946}
2940 2947
@@ -3275,6 +3282,9 @@ again:
3275 } 3282 }
3276 3283
3277 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3284 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3285 if (ret < 0 && ret != -ENOSPC)
3286 goto out;
3287
3278 spin_lock(&space_info->lock); 3288 spin_lock(&space_info->lock);
3279 if (ret) 3289 if (ret)
3280 space_info->full = 1; 3290 space_info->full = 1;
@@ -3284,6 +3294,7 @@ again:
3284 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3294 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3285 space_info->chunk_alloc = 0; 3295 space_info->chunk_alloc = 0;
3286 spin_unlock(&space_info->lock); 3296 spin_unlock(&space_info->lock);
3297out:
3287 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3298 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3288 return ret; 3299 return ret;
3289} 3300}
@@ -3314,6 +3325,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3314 if (reserved == 0) 3325 if (reserved == 0)
3315 return 0; 3326 return 0;
3316 3327
3328 smp_mb();
3329 if (root->fs_info->delalloc_bytes == 0) {
3330 if (trans)
3331 return 0;
3332 btrfs_wait_ordered_extents(root, 0, 0);
3333 return 0;
3334 }
3335
3317 max_reclaim = min(reserved, to_reclaim); 3336 max_reclaim = min(reserved, to_reclaim);
3318 3337
3319 while (loops < 1024) { 3338 while (loops < 1024) {
@@ -3356,6 +3375,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 } 3375 }
3357 3376
3358 } 3377 }
3378 if (reclaimed >= to_reclaim && !trans)
3379 btrfs_wait_ordered_extents(root, 0, 0);
3359 return reclaimed >= to_reclaim; 3380 return reclaimed >= to_reclaim;
3360} 3381}
3361 3382
@@ -3380,15 +3401,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3380 u64 num_bytes = orig_bytes; 3401 u64 num_bytes = orig_bytes;
3381 int retries = 0; 3402 int retries = 0;
3382 int ret = 0; 3403 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false; 3404 bool committed = false;
3405 bool flushing = false;
3385 3406
3386again: 3407again:
3387 ret = -ENOSPC; 3408 ret = 0;
3388 if (reserved)
3389 num_bytes = 0;
3390
3391 spin_lock(&space_info->lock); 3409 spin_lock(&space_info->lock);
3410 /*
3411 * We only want to wait if somebody other than us is flushing and we are
3412 * actually allowed to flush.
3413 */
3414 while (flush && !flushing && space_info->flush) {
3415 spin_unlock(&space_info->lock);
3416 /*
3417 * If we have a trans handle we can't wait because the flusher
3418 * may have to commit the transaction, which would mean we would
3419 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open.
3421 */
3422 if (trans)
3423 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush);
3426 /* Must have been interrupted, return */
3427 if (ret)
3428 return -EINTR;
3429
3430 spin_lock(&space_info->lock);
3431 }
3432
3433 ret = -ENOSPC;
3392 unused = space_info->bytes_used + space_info->bytes_reserved + 3434 unused = space_info->bytes_used + space_info->bytes_reserved +
3393 space_info->bytes_pinned + space_info->bytes_readonly + 3435 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use; 3436 space_info->bytes_may_use;
@@ -3403,8 +3445,7 @@ again:
3403 if (unused <= space_info->total_bytes) { 3445 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused; 3446 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) { 3447 if (unused >= num_bytes) {
3406 if (!reserved) 3448 space_info->bytes_reserved += orig_bytes;
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0; 3449 ret = 0;
3409 } else { 3450 } else {
3410 /* 3451 /*
@@ -3429,17 +3470,14 @@ again:
3429 * to reclaim space we can actually use it instead of somebody else 3470 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us. 3471 * stealing it from us.
3431 */ 3472 */
3432 if (ret && !reserved) { 3473 if (ret && flush) {
3433 space_info->bytes_reserved += orig_bytes; 3474 flushing = true;
3434 reserved = true; 3475 space_info->flush = 1;
3435 } 3476 }
3436 3477
3437 spin_unlock(&space_info->lock); 3478 spin_unlock(&space_info->lock);
3438 3479
3439 if (!ret) 3480 if (!ret || !flush)
3440 return 0;
3441
3442 if (!flush)
3443 goto out; 3481 goto out;
3444 3482
3445 /* 3483 /*
@@ -3447,11 +3485,11 @@ again:
3447 * metadata until after the IO is completed. 3485 * metadata until after the IO is completed.
3448 */ 3486 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1); 3487 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0) 3488 if (ret < 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out; 3489 goto out;
3454 3490
3491 ret = 0;
3492
3455 /* 3493 /*
3456 * So if we were overcommitted it's possible that somebody else flushed 3494 * So if we were overcommitted it's possible that somebody else flushed
3457 * out enough space and we simply didn't have enough space to reclaim, 3495 * out enough space and we simply didn't have enough space to reclaim,
@@ -3462,11 +3500,11 @@ again:
3462 goto again; 3500 goto again;
3463 } 3501 }
3464 3502
3465 spin_lock(&space_info->lock);
3466 /* 3503 /*
3467 * Not enough space to be reclaimed, don't bother committing the 3504 * Not enough space to be reclaimed, don't bother committing the
3468 * transaction. 3505 * transaction.
3469 */ 3506 */
3507 spin_lock(&space_info->lock);
3470 if (space_info->bytes_pinned < orig_bytes) 3508 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC; 3509 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock); 3510 spin_unlock(&space_info->lock);
@@ -3474,10 +3512,13 @@ again:
3474 goto out; 3512 goto out;
3475 3513
3476 ret = -EAGAIN; 3514 ret = -EAGAIN;
3477 if (trans || committed) 3515 if (trans)
3478 goto out; 3516 goto out;
3479 3517
3480 ret = -ENOSPC; 3518 ret = -ENOSPC;
3519 if (committed)
3520 goto out;
3521
3481 trans = btrfs_join_transaction(root); 3522 trans = btrfs_join_transaction(root);
3482 if (IS_ERR(trans)) 3523 if (IS_ERR(trans))
3483 goto out; 3524 goto out;
@@ -3489,12 +3530,12 @@ again:
3489 } 3530 }
3490 3531
3491out: 3532out:
3492 if (reserved) { 3533 if (flushing) {
3493 spin_lock(&space_info->lock); 3534 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes; 3535 space_info->flush = 0;
3536 wake_up_all(&space_info->wait);
3495 spin_unlock(&space_info->lock); 3537 spin_unlock(&space_info->lock);
3496 } 3538 }
3497
3498 return ret; 3539 return ret;
3499} 3540}
3500 3541
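
The new flush/wait dance serializes flushers per space_info: the first task
to fail its reservation sets space_info->flush under the lock, everyone else
sleeps on space_info->wait, and anyone holding a transaction handle bails
out with -EAGAIN instead of sleeping, so it cannot deadlock against a
flusher that needs to commit. In user-space terms it is a plain
mutex/condvar handshake; a minimal model (an analogy only, not the kernel
code):

    #include <pthread.h>
    #include <stdbool.h>

    struct space_info {
            pthread_mutex_t lock;
            pthread_cond_t wait;
            bool flush;
    };

    static struct space_info si = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .wait = PTHREAD_COND_INITIALIZER,
    };

    static void begin_flush(struct space_info *s)
    {
            pthread_mutex_lock(&s->lock);
            while (s->flush)                /* somebody else is flushing */
                    pthread_cond_wait(&s->wait, &s->lock);
            s->flush = true;                /* we are the flusher now */
            pthread_mutex_unlock(&s->lock);
    }

    static void end_flush(struct space_info *s)
    {
            pthread_mutex_lock(&s->lock);
            s->flush = false;
            pthread_cond_broadcast(&s->wait);  /* wake_up_all() */
            pthread_mutex_unlock(&s->lock);
    }
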
@@ -3704,7 +3745,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3704 if (commit_trans) { 3745 if (commit_trans) {
3705 if (trans) 3746 if (trans)
3706 return -EAGAIN; 3747 return -EAGAIN;
3707
3708 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3709 BUG_ON(IS_ERR(trans)); 3749 BUG_ON(IS_ERR(trans));
3710 ret = btrfs_commit_transaction(trans, root); 3750 ret = btrfs_commit_transaction(trans, root);
@@ -3874,26 +3914,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3874 return 0; 3914 return 0;
3875} 3915}
3876 3916
3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3878 struct btrfs_root *root,
3879 int num_items)
3880{
3881 u64 num_bytes;
3882 int ret;
3883
3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3885 return 0;
3886
3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3889 num_bytes);
3890 if (!ret) {
3891 trans->bytes_reserved += num_bytes;
3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
3893 }
3894 return ret;
3895}
3896
3897void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root) 3918 struct btrfs_root *root)
3899{ 3919{
@@ -3944,6 +3964,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3945} 3965}
3946 3966
3967static unsigned drop_outstanding_extent(struct inode *inode)
3968{
3969 unsigned dropped_extents = 0;
3970
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--;
3974
3975 /*
3976 * If we have at least as many outstanding extents as we have
3977 * reserved, leave the reserved extent count alone.
3978 */
3979 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents)
3981 goto out;
3982
3983 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989}
3990
3947static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3948{ 3992{
3949 return num_bytes >>= 3; 3993 return num_bytes >>= 3;
@@ -3953,9 +3997,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3953{ 3997{
3954 struct btrfs_root *root = BTRFS_I(inode)->root; 3998 struct btrfs_root *root = BTRFS_I(inode)->root;
3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3956 u64 to_reserve; 4000 u64 to_reserve = 0;
3957 int nr_extents; 4001 unsigned nr_extents = 0;
3958 int reserved_extents;
3959 int ret; 4002 int ret;
3960 4003
3961 if (btrfs_transaction_in_commit(root->fs_info)) 4004 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3963,66 +4006,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3963 4006
3964 num_bytes = ALIGN(num_bytes, root->sectorsize); 4007 num_bytes = ALIGN(num_bytes, root->sectorsize);
3965 4008
3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4009 spin_lock(&BTRFS_I(inode)->lock);
3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 4010 BTRFS_I(inode)->outstanding_extents++;
4011
4012 if (BTRFS_I(inode)->outstanding_extents >
4013 BTRFS_I(inode)->reserved_extents) {
4014 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents;
3968 4017
3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3972 } else {
3973 nr_extents = 0;
3974 to_reserve = 0;
3975 } 4019 }
4020 spin_unlock(&BTRFS_I(inode)->lock);
3976 4021
3977 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4022 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3979 if (ret) 4024 if (ret) {
4025 unsigned dropped;
4026 /*
4027 * We don't need the return value since our reservation failed,
4028 * we just need to clean up our counter.
4029 */
4030 dropped = drop_outstanding_extent(inode);
4031 WARN_ON(dropped > 1);
3980 return ret; 4032 return ret;
3981 4033 }
3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3984 4034
3985 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4035 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3986 4036
3987 if (block_rsv->size > 512 * 1024 * 1024)
3988 shrink_delalloc(NULL, root, to_reserve, 0);
3989
3990 return 0; 4037 return 0;
3991} 4038}
3992 4039
3993void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3994{ 4041{
3995 struct btrfs_root *root = BTRFS_I(inode)->root; 4042 struct btrfs_root *root = BTRFS_I(inode)->root;
3996 u64 to_free; 4043 u64 to_free = 0;
3997 int nr_extents; 4044 unsigned dropped;
3998 int reserved_extents;
3999 4045
4000 num_bytes = ALIGN(num_bytes, root->sectorsize); 4046 num_bytes = ALIGN(num_bytes, root->sectorsize);
4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4047 dropped = drop_outstanding_extent(inode);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4003
4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4005 do {
4006 int old, new;
4007
4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4009 if (nr_extents >= reserved_extents) {
4010 nr_extents = 0;
4011 break;
4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
4022 4048
4023 to_free = calc_csum_metadata_size(inode, num_bytes); 4049 to_free = calc_csum_metadata_size(inode, num_bytes);
4024 if (nr_extents > 0) 4050 if (dropped > 0)
4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents); 4051 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4026 4052
4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4053 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4028 to_free); 4054 to_free);
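
The cmpxchg loop is gone: both counters now sit behind
BTRFS_I(inode)->lock, and the reserve and release paths reduce to a few
lines of arithmetic. A user-space model of the two paths (a sketch; the
real code takes the spinlock around each function body):

    struct acct {
            unsigned outstanding;   /* extents currently in flight */
            unsigned reserved;      /* extents with metadata reserved */
    };

    /* reserve side: how many new extents need metadata reserved */
    static unsigned reserve_one(struct acct *a)
    {
            unsigned nr = 0;

            a->outstanding++;
            if (a->outstanding > a->reserved) {
                    nr = a->outstanding - a->reserved;
                    a->reserved += nr;
            }
            return nr;
    }

    /* release side: how many reservations can now be dropped */
    static unsigned drop_one(struct acct *a)
    {
            unsigned dropped = 0;

            a->outstanding--;
            if (a->outstanding < a->reserved) {
                    dropped = a->reserved - a->outstanding;
                    a->reserved -= dropped;
            }
            return dropped;
    }

Since the reserve side bumps outstanding by exactly one, a failed
reservation can only ever have added one extent, which is what the
WARN_ON(dropped > 1) above checks.
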
@@ -4444,7 +4470,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4444 printk(KERN_ERR "umm, got %d back from search" 4470 printk(KERN_ERR "umm, got %d back from search"
4445 ", was looking for %llu\n", ret, 4471 ", was looking for %llu\n", ret,
4446 (unsigned long long)bytenr); 4472 (unsigned long long)bytenr);
4447 btrfs_print_leaf(extent_root, path->nodes[0]); 4473 if (ret > 0)
4474 btrfs_print_leaf(extent_root,
4475 path->nodes[0]);
4448 } 4476 }
4449 BUG_ON(ret); 4477 BUG_ON(ret);
4450 extent_slot = path->slots[0]; 4478 extent_slot = path->slots[0];
@@ -4990,14 +5018,10 @@ have_block_group:
4990 } 5018 }
4991 5019
4992 /* 5020 /*
4993 * We only want to start kthread caching if we are at 5021 * The caching workers are limited to 2 threads, so we
4994 * the point where we will wait for caching to make 5022 * can queue as much work as we care to.
4995 * progress, or if our ideal search is over and we've
4996 * found somebody to start caching.
4997 */ 5023 */
4998 if (loop > LOOP_CACHING_NOWAIT || 5024 if (loop > LOOP_FIND_IDEAL) {
4999 (loop > LOOP_FIND_IDEAL &&
5000 atomic_read(&space_info->caching_threads) < 2)) {
5001 ret = cache_block_group(block_group, trans, 5025 ret = cache_block_group(block_group, trans,
5002 orig_root, 0); 5026 orig_root, 0);
5003 BUG_ON(ret); 5027 BUG_ON(ret);
@@ -5065,7 +5089,9 @@ have_block_group:
5065 * group it does point to and try again 5089 * group it does point to and try again
5066 */ 5090 */
5067 if (!last_ptr_loop && last_ptr->block_group && 5091 if (!last_ptr_loop && last_ptr->block_group &&
5068 last_ptr->block_group != block_group) { 5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5069 5095
5070 btrfs_put_block_group(block_group); 5096 btrfs_put_block_group(block_group);
5071 block_group = last_ptr->block_group; 5097 block_group = last_ptr->block_group;
@@ -5219,8 +5245,7 @@ loop:
5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5245 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5220 found_uncached_bg = false; 5246 found_uncached_bg = false;
5221 loop++; 5247 loop++;
5222 if (!ideal_cache_percent && 5248 if (!ideal_cache_percent)
5223 atomic_read(&space_info->caching_threads))
5224 goto search; 5249 goto search;
5225 5250
5226 /* 5251 /*
@@ -5494,7 +5519,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5494 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 5519 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5495 5520
5496 path = btrfs_alloc_path(); 5521 path = btrfs_alloc_path();
5497 BUG_ON(!path); 5522 if (!path)
5523 return -ENOMEM;
5498 5524
5499 path->leave_spinning = 1; 5525 path->leave_spinning = 1;
5500 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5526 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5623,7 +5649,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5623 if (!buf) 5649 if (!buf)
5624 return ERR_PTR(-ENOMEM); 5650 return ERR_PTR(-ENOMEM);
5625 btrfs_set_header_generation(buf, trans->transid); 5651 btrfs_set_header_generation(buf, trans->transid);
5626 btrfs_set_buffer_lockdep_class(buf, level); 5652 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5627 btrfs_tree_lock(buf); 5653 btrfs_tree_lock(buf);
5628 clean_tree_block(trans, root, buf); 5654 clean_tree_block(trans, root, buf);
5629 5655
@@ -5910,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5910 return 1; 5936 return 1;
5911 5937
5912 if (path->locks[level] && !wc->keep_locks) { 5938 if (path->locks[level] && !wc->keep_locks) {
5913 btrfs_tree_unlock(eb); 5939 btrfs_tree_unlock_rw(eb, path->locks[level]);
5914 path->locks[level] = 0; 5940 path->locks[level] = 0;
5915 } 5941 }
5916 return 0; 5942 return 0;
@@ -5934,7 +5960,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5934 * keep the tree lock 5960 * keep the tree lock
5935 */ 5961 */
5936 if (path->locks[level] && level > 0) { 5962 if (path->locks[level] && level > 0) {
5937 btrfs_tree_unlock(eb); 5963 btrfs_tree_unlock_rw(eb, path->locks[level]);
5938 path->locks[level] = 0; 5964 path->locks[level] = 0;
5939 } 5965 }
5940 return 0; 5966 return 0;
@@ -6047,7 +6073,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6047 BUG_ON(level != btrfs_header_level(next)); 6073 BUG_ON(level != btrfs_header_level(next));
6048 path->nodes[level] = next; 6074 path->nodes[level] = next;
6049 path->slots[level] = 0; 6075 path->slots[level] = 0;
6050 path->locks[level] = 1; 6076 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6051 wc->level = level; 6077 wc->level = level;
6052 if (wc->level == 1) 6078 if (wc->level == 1)
6053 wc->reada_slot = 0; 6079 wc->reada_slot = 0;
@@ -6118,7 +6144,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6118 BUG_ON(level == 0); 6144 BUG_ON(level == 0);
6119 btrfs_tree_lock(eb); 6145 btrfs_tree_lock(eb);
6120 btrfs_set_lock_blocking(eb); 6146 btrfs_set_lock_blocking(eb);
6121 path->locks[level] = 1; 6147 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6122 6148
6123 ret = btrfs_lookup_extent_info(trans, root, 6149 ret = btrfs_lookup_extent_info(trans, root,
6124 eb->start, eb->len, 6150 eb->start, eb->len,
@@ -6127,8 +6153,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6127 BUG_ON(ret); 6153 BUG_ON(ret);
6128 BUG_ON(wc->refs[level] == 0); 6154 BUG_ON(wc->refs[level] == 0);
6129 if (wc->refs[level] == 1) { 6155 if (wc->refs[level] == 1) {
6130 btrfs_tree_unlock(eb); 6156 btrfs_tree_unlock_rw(eb, path->locks[level]);
6131 path->locks[level] = 0;
6132 return 1; 6157 return 1;
6133 } 6158 }
6134 } 6159 }
@@ -6150,7 +6175,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6150 btrfs_header_generation(eb) == trans->transid) { 6175 btrfs_header_generation(eb) == trans->transid) {
6151 btrfs_tree_lock(eb); 6176 btrfs_tree_lock(eb);
6152 btrfs_set_lock_blocking(eb); 6177 btrfs_set_lock_blocking(eb);
6153 path->locks[level] = 1; 6178 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6154 } 6179 }
6155 clean_tree_block(trans, root, eb); 6180 clean_tree_block(trans, root, eb);
6156 } 6181 }
@@ -6229,7 +6254,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6229 return 0; 6254 return 0;
6230 6255
6231 if (path->locks[level]) { 6256 if (path->locks[level]) {
6232 btrfs_tree_unlock(path->nodes[level]); 6257 btrfs_tree_unlock_rw(path->nodes[level],
6258 path->locks[level]);
6233 path->locks[level] = 0; 6259 path->locks[level] = 0;
6234 } 6260 }
6235 free_extent_buffer(path->nodes[level]); 6261 free_extent_buffer(path->nodes[level]);
@@ -6251,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6251 * also make sure backrefs for the shared block and all lower level 6277 * also make sure backrefs for the shared block and all lower level
6252 * blocks are properly updated. 6278 * blocks are properly updated.
6253 */ 6279 */
6254int btrfs_drop_snapshot(struct btrfs_root *root, 6280void btrfs_drop_snapshot(struct btrfs_root *root,
6255 struct btrfs_block_rsv *block_rsv, int update_ref) 6281 struct btrfs_block_rsv *block_rsv, int update_ref)
6256{ 6282{
6257 struct btrfs_path *path; 6283 struct btrfs_path *path;
6258 struct btrfs_trans_handle *trans; 6284 struct btrfs_trans_handle *trans;
@@ -6265,10 +6291,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6265 int level; 6291 int level;
6266 6292
6267 path = btrfs_alloc_path(); 6293 path = btrfs_alloc_path();
6268 BUG_ON(!path); 6294 if (!path) {
6295 err = -ENOMEM;
6296 goto out;
6297 }
6269 6298
6270 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6299 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6271 BUG_ON(!wc); 6300 if (!wc) {
6301 btrfs_free_path(path);
6302 err = -ENOMEM;
6303 goto out;
6304 }
6272 6305
6273 trans = btrfs_start_transaction(tree_root, 0); 6306 trans = btrfs_start_transaction(tree_root, 0);
6274 BUG_ON(IS_ERR(trans)); 6307 BUG_ON(IS_ERR(trans));
@@ -6281,7 +6314,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6281 path->nodes[level] = btrfs_lock_root_node(root); 6314 path->nodes[level] = btrfs_lock_root_node(root);
6282 btrfs_set_lock_blocking(path->nodes[level]); 6315 btrfs_set_lock_blocking(path->nodes[level]);
6283 path->slots[level] = 0; 6316 path->slots[level] = 0;
6284 path->locks[level] = 1; 6317 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6285 memset(&wc->update_progress, 0, 6318 memset(&wc->update_progress, 0,
6286 sizeof(wc->update_progress)); 6319 sizeof(wc->update_progress));
6287 } else { 6320 } else {
@@ -6296,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6296 path->lowest_level = 0; 6329 path->lowest_level = 0;
6297 if (ret < 0) { 6330 if (ret < 0) {
6298 err = ret; 6331 err = ret;
6299 goto out; 6332 goto out_free;
6300 } 6333 }
6301 WARN_ON(ret > 0); 6334 WARN_ON(ret > 0);
6302 6335
@@ -6403,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6403 free_extent_buffer(root->commit_root); 6436 free_extent_buffer(root->commit_root);
6404 kfree(root); 6437 kfree(root);
6405 } 6438 }
6406out: 6439out_free:
6407 btrfs_end_transaction_throttle(trans, tree_root); 6440 btrfs_end_transaction_throttle(trans, tree_root);
6408 kfree(wc); 6441 kfree(wc);
6409 btrfs_free_path(path); 6442 btrfs_free_path(path);
6410 return err; 6443out:
6444 if (err)
6445 btrfs_std_error(root->fs_info, err);
6446 return;
6411} 6447}
6412 6448
6413/* 6449/*
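
btrfs_drop_snapshot() now returns void: allocation failures funnel to the
new out: label and are reported through btrfs_std_error() instead of
bubbling up a return code. Caller-side this looks like (a sketch; the
NULL/0 arguments are one plausible call, not quoted from this diff):

    btrfs_drop_snapshot(root, NULL, 0);  /* was: ret = btrfs_drop_snapshot(...) */
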
@@ -6449,7 +6485,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6449 level = btrfs_header_level(node); 6485 level = btrfs_header_level(node);
6450 path->nodes[level] = node; 6486 path->nodes[level] = node;
6451 path->slots[level] = 0; 6487 path->slots[level] = 0;
6452 path->locks[level] = 1; 6488 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6453 6489
6454 wc->refs[parent_level] = 1; 6490 wc->refs[parent_level] = 1;
6455 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6491 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6524,30 +6560,48 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6524 return flags; 6560 return flags;
6525} 6561}
6526 6562
6527static int set_block_group_ro(struct btrfs_block_group_cache *cache) 6563static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6528{ 6564{
6529 struct btrfs_space_info *sinfo = cache->space_info; 6565 struct btrfs_space_info *sinfo = cache->space_info;
6530 u64 num_bytes; 6566 u64 num_bytes;
6567 u64 min_allocable_bytes;
6531 int ret = -ENOSPC; 6568 int ret = -ENOSPC;
6532 6569
6533 if (cache->ro) 6570
6534 return 0; 6571 /*
6572 * We need some metadata space and system metadata space for
6573 * allocating chunks in some corner cases, unless we are forced to set
6574 * the block group read-only.
6575 */
6576 if ((sinfo->flags &
6577 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6578 !force)
6579 min_allocable_bytes = 1 * 1024 * 1024;
6580 else
6581 min_allocable_bytes = 0;
6535 6582
6536 spin_lock(&sinfo->lock); 6583 spin_lock(&sinfo->lock);
6537 spin_lock(&cache->lock); 6584 spin_lock(&cache->lock);
6585
6586 if (cache->ro) {
6587 ret = 0;
6588 goto out;
6589 }
6590
6538 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 6591 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6539 cache->bytes_super - btrfs_block_group_used(&cache->item); 6592 cache->bytes_super - btrfs_block_group_used(&cache->item);
6540 6593
6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6542 sinfo->bytes_may_use + sinfo->bytes_readonly + 6595 sinfo->bytes_may_use + sinfo->bytes_readonly +
6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { 6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <=
6597 sinfo->total_bytes) {
6544 sinfo->bytes_readonly += num_bytes; 6598 sinfo->bytes_readonly += num_bytes;
6545 sinfo->bytes_reserved += cache->reserved_pinned; 6599 sinfo->bytes_reserved += cache->reserved_pinned;
6546 cache->reserved_pinned = 0; 6600 cache->reserved_pinned = 0;
6547 cache->ro = 1; 6601 cache->ro = 1;
6548 ret = 0; 6602 ret = 0;
6549 } 6603 }
6550 6604out:
6551 spin_unlock(&cache->lock); 6605 spin_unlock(&cache->lock);
6552 spin_unlock(&sinfo->lock); 6606 spin_unlock(&sinfo->lock);
6553 return ret; 6607 return ret;
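
The force parameter exists because metadata and system space need a little
headroom left writable for chunk allocation; only a forced transition (as
in the btrfs_read_block_groups() call sites later in this diff) may consume
it. The admission check boils down to (a user-space model with simplified
names):

    static int can_set_ro(unsigned long long total,
                          unsigned long long accounted,
                          unsigned long long num_bytes,
                          int meta_or_sys, int force)
    {
            unsigned long long min_allocable =
                    (meta_or_sys && !force) ? 1024 * 1024 : 0;  /* 1 MiB */

            return accounted + num_bytes + min_allocable <= total;
    }
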
@@ -6571,7 +6625,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 6625 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE); 6626 CHUNK_ALLOC_FORCE);
6573 6627
6574 ret = set_block_group_ro(cache); 6628 ret = set_block_group_ro(cache, 0);
6575 if (!ret) 6629 if (!ret)
6576 goto out; 6630 goto out;
6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6631 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6579,7 +6633,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6579 CHUNK_ALLOC_FORCE); 6633 CHUNK_ALLOC_FORCE);
6580 if (ret < 0) 6634 if (ret < 0)
6581 goto out; 6635 goto out;
6582 ret = set_block_group_ro(cache); 6636 ret = set_block_group_ro(cache, 0);
6583out: 6637out:
6584 btrfs_end_transaction(trans, root); 6638 btrfs_end_transaction(trans, root);
6585 return ret; 6639 return ret;
@@ -6680,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6680 struct btrfs_space_info *space_info; 6734 struct btrfs_space_info *space_info;
6681 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6735 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6682 struct btrfs_device *device; 6736 struct btrfs_device *device;
6737 u64 min_free;
6738 u64 dev_min = 1;
6739 u64 dev_nr = 0;
6740 int index;
6683 int full = 0; 6741 int full = 0;
6684 int ret = 0; 6742 int ret = 0;
6685 6743
@@ -6689,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6689 if (!block_group) 6747 if (!block_group)
6690 return -1; 6748 return -1;
6691 6749
6750 min_free = btrfs_block_group_used(&block_group->item);
6751
6692 /* no bytes used, we're good */ 6752 /* no bytes used, we're good */
6693 if (!btrfs_block_group_used(&block_group->item)) 6753 if (!min_free)
6694 goto out; 6754 goto out;
6695 6755
6696 space_info = block_group->space_info; 6756 space_info = block_group->space_info;
@@ -6706,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6706 * all of the extents from this block group. If we can, we're good 6766 * all of the extents from this block group. If we can, we're good
6707 */ 6767 */
6708 if ((space_info->total_bytes != block_group->key.offset) && 6768 if ((space_info->total_bytes != block_group->key.offset) &&
6709 (space_info->bytes_used + space_info->bytes_reserved + 6769 (space_info->bytes_used + space_info->bytes_reserved +
6710 space_info->bytes_pinned + space_info->bytes_readonly + 6770 space_info->bytes_pinned + space_info->bytes_readonly +
6711 btrfs_block_group_used(&block_group->item) < 6771 min_free < space_info->total_bytes)) {
6712 space_info->total_bytes)) {
6713 spin_unlock(&space_info->lock); 6772 spin_unlock(&space_info->lock);
6714 goto out; 6773 goto out;
6715 } 6774 }
@@ -6726,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6726 if (full) 6785 if (full)
6727 goto out; 6786 goto out;
6728 6787
6788 /*
6789 * index:
6790 * 0: raid10
6791 * 1: raid1
6792 * 2: dup
6793 * 3: raid0
6794 * 4: single
6795 */
6796 index = get_block_group_index(block_group);
6797 if (index == 0) {
6798 dev_min = 4;
6799 /* Divide by 2 */
6800 min_free >>= 1;
6801 } else if (index == 1) {
6802 dev_min = 2;
6803 } else if (index == 2) {
6804 /* Multiply by 2 */
6805 min_free <<= 1;
6806 } else if (index == 3) {
6807 dev_min = fs_devices->rw_devices;
6808 do_div(min_free, dev_min);
6809 }
6810
6729 mutex_lock(&root->fs_info->chunk_mutex); 6811 mutex_lock(&root->fs_info->chunk_mutex);
6730 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6812 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
6731 u64 min_free = btrfs_block_group_used(&block_group->item);
6732 u64 dev_offset; 6813 u64 dev_offset;
6733 6814
6734 /* 6815 /*
@@ -6739,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6739 ret = find_free_dev_extent(NULL, device, min_free, 6820 ret = find_free_dev_extent(NULL, device, min_free,
6740 &dev_offset, NULL); 6821 &dev_offset, NULL);
6741 if (!ret) 6822 if (!ret)
6823 dev_nr++;
6824
6825 if (dev_nr >= dev_min)
6742 break; 6826 break;
6827
6743 ret = -1; 6828 ret = -1;
6744 } 6829 }
6745 } 6830 }
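
The relocation check now scales with the block group's profile instead of
demanding the full used size on a single device. Plugging numbers into the
index mapping above, a worked example (a runnable sketch; the 1 GiB figure
and the device count are assumptions):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long min_free = 1024ULL * 1024 * 1024; /* 1 GiB used */
            unsigned long long dev_min = 1;
            int index = 0;          /* 0: raid10 */
            int rw_devices = 6;     /* assumed rw device count */

            if (index == 0) {               /* raid10: 4 devs, half each */
                    dev_min = 4;
                    min_free >>= 1;
            } else if (index == 1) {        /* raid1: two full copies */
                    dev_min = 2;
            } else if (index == 2) {        /* dup: one dev, twice the space */
                    min_free <<= 1;
            } else if (index == 3) {        /* raid0: spread over all devs */
                    dev_min = rw_devices;
                    min_free /= dev_min;
            }                               /* index 4 (single): defaults */

            printf("need %llu device(s) with %llu bytes free each\n",
                   dev_min, min_free);
            return 0;
    }

For the raid10 case this prints 4 devices with 512 MiB each.
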
@@ -7016,7 +7101,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7016 7101
7017 set_avail_alloc_bits(root->fs_info, cache->flags); 7102 set_avail_alloc_bits(root->fs_info, cache->flags);
7018 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7103 if (btrfs_chunk_readonly(root, cache->key.objectid))
7019 set_block_group_ro(cache); 7104 set_block_group_ro(cache, 1);
7020 } 7105 }
7021 7106
7022 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 7107 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -7030,9 +7115,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7030 * mirrored block groups. 7115 * mirrored block groups.
7031 */ 7116 */
7032 list_for_each_entry(cache, &space_info->block_groups[3], list) 7117 list_for_each_entry(cache, &space_info->block_groups[3], list)
7033 set_block_group_ro(cache); 7118 set_block_group_ro(cache, 1);
7034 list_for_each_entry(cache, &space_info->block_groups[4], list) 7119 list_for_each_entry(cache, &space_info->block_groups[4], list)
7035 set_block_group_ro(cache); 7120 set_block_group_ro(cache, 1);
7036 } 7121 }
7037 7122
7038 init_global_block_rsv(info); 7123 init_global_block_rsv(info);
@@ -7162,11 +7247,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7162 spin_unlock(&cluster->refill_lock); 7247 spin_unlock(&cluster->refill_lock);
7163 7248
7164 path = btrfs_alloc_path(); 7249 path = btrfs_alloc_path();
7165 BUG_ON(!path); 7250 if (!path) {
7251 ret = -ENOMEM;
7252 goto out;
7253 }
7166 7254
7167 inode = lookup_free_space_inode(root, block_group, path); 7255 inode = lookup_free_space_inode(root, block_group, path);
7168 if (!IS_ERR(inode)) { 7256 if (!IS_ERR(inode)) {
7169 btrfs_orphan_add(trans, inode); 7257 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret);
7170 clear_nlink(inode); 7259 clear_nlink(inode);
7171 /* One for the block groups ref */ 7260 /* One for the block groups ref */
7172 spin_lock(&block_group->lock); 7261 spin_lock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1efd..d418164a35f1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
254 * 254 *
255 * This should be called with the tree lock held. 255 * This should be called with the tree lock held.
256 */ 256 */
257static int merge_state(struct extent_io_tree *tree, 257static void merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 258 struct extent_state *state)
259{ 259{
260 struct extent_state *other; 260 struct extent_state *other;
261 struct rb_node *other_node; 261 struct rb_node *other_node;
262 262
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 264 return;
265 265
266 other_node = rb_prev(&state->rb_node); 266 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 267 if (other_node) {
@@ -281,26 +281,19 @@ static int merge_state(struct extent_io_tree *tree,
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 state->end = other->end;
285 state->tree = NULL; 285 other->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&other->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(other);
288 state = NULL;
289 } 288 }
290 } 289 }
291
292 return 0;
293} 290}
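
Note the direction change in the forward merge: the right-hand neighbour is
now absorbed into 'state' and freed, instead of 'state' being absorbed into
the neighbour. That keeps the pointer the caller (and any cached_state
user) holds valid. Reduced to the interval arithmetic (a sketch):

    struct range { unsigned long long start, end; };

    /* assumes other->start == state->end + 1 and identical flags */
    static void merge_forward(struct range *state, struct range *other)
    {
            state->end = other->end;  /* 'state' grows to the right */
            /* 'other' is then unlinked and freed, never 'state' */
    }
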
294 291
295static int set_state_cb(struct extent_io_tree *tree, 292static void set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 293 struct extent_state *state, int *bits)
297{ 294{
298 if (tree->ops && tree->ops->set_bit_hook) { 295 if (tree->ops && tree->ops->set_bit_hook)
299 return tree->ops->set_bit_hook(tree->mapping->host, 296 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
300 state, bits);
301 }
302
303 return 0;
304} 297}
305 298
306static void clear_state_cb(struct extent_io_tree *tree, 299static void clear_state_cb(struct extent_io_tree *tree,
@@ -310,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 303 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311} 304}
312 305
306static void set_state_bits(struct extent_io_tree *tree,
307 struct extent_state *state, int *bits);
308
313/* 309/*
314 * insert an extent_state struct into the tree. 'bits' are set on the 310 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 311 * struct before it is inserted.
@@ -325,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree,
325 int *bits) 321 int *bits)
326{ 322{
327 struct rb_node *node; 323 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret;
330 324
331 if (end < start) { 325 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 326 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree,
336 } 330 }
337 state->start = start; 331 state->start = start;
338 state->end = end; 332 state->end = end;
339 ret = set_state_cb(tree, state, bits);
340 if (ret)
341 return ret;
342 333
343 if (bits_to_set & EXTENT_DIRTY) 334 set_state_bits(tree, state, bits);
344 tree->dirty_bytes += end - start + 1; 335
345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 336 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 337 if (node) {
348 struct extent_state *found; 338 struct extent_state *found;
@@ -351,7 +341,6 @@ static int insert_state(struct extent_io_tree *tree,
351 "%llu %llu\n", (unsigned long long)found->start, 341 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 342 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 343 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state);
355 return -EEXIST; 344 return -EEXIST;
356 } 345 }
357 state->tree = tree; 346 state->tree = tree;
@@ -359,13 +348,11 @@ static int insert_state(struct extent_io_tree *tree,
359 return 0; 348 return 0;
360} 349}
361 350
362static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 351static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 352 u64 split)
364{ 353{
365 if (tree->ops && tree->ops->split_extent_hook) 354 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 355 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
367 orig, split);
368 return 0;
369} 356}
370 357
371/* 358/*
@@ -500,7 +487,8 @@ again:
500 cached_state = NULL; 487 cached_state = NULL;
501 } 488 }
502 489
503 if (cached && cached->tree && cached->start == start) { 490 if (cached && cached->tree && cached->start <= start &&
491 cached->end > start) {
504 if (clear) 492 if (clear)
505 atomic_dec(&cached->refs); 493 atomic_dec(&cached->refs);
506 state = cached; 494 state = cached;
@@ -660,34 +648,25 @@ again:
660 if (start > end) 648 if (start > end)
661 break; 649 break;
662 650
663 if (need_resched()) { 651 cond_resched_lock(&tree->lock);
664 spin_unlock(&tree->lock);
665 cond_resched();
666 spin_lock(&tree->lock);
667 }
668 } 652 }
669out: 653out:
670 spin_unlock(&tree->lock); 654 spin_unlock(&tree->lock);
671 return 0; 655 return 0;
672} 656}
673 657
674static int set_state_bits(struct extent_io_tree *tree, 658static void set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 659 struct extent_state *state,
676 int *bits) 660 int *bits)
677{ 661{
678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 662 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 663
681 ret = set_state_cb(tree, state, bits); 664 set_state_cb(tree, state, bits);
682 if (ret)
683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 665 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 666 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 667 tree->dirty_bytes += range;
687 } 668 }
688 state->state |= bits_to_set; 669 state->state |= bits_to_set;
689
690 return 0;
691} 670}
692 671
693static void cache_state(struct extent_state *state, 672static void cache_state(struct extent_state *state,
@@ -742,7 +721,8 @@ again:
742 spin_lock(&tree->lock); 721 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 722 if (cached_state && *cached_state) {
744 state = *cached_state; 723 state = *cached_state;
745 if (state->start == start && state->tree) { 724 if (state->start <= start && state->end > start &&
725 state->tree) {
746 node = &state->rb_node; 726 node = &state->rb_node;
747 goto hit_next; 727 goto hit_next;
748 } 728 }
@@ -779,17 +759,15 @@ hit_next:
779 goto out; 759 goto out;
780 } 760 }
781 761
782 err = set_state_bits(tree, state, &bits); 762 set_state_bits(tree, state, &bits);
783 if (err)
784 goto out;
785 763
786 next_node = rb_next(node);
787 cache_state(state, cached_state); 764 cache_state(state, cached_state);
788 merge_state(tree, state); 765 merge_state(tree, state);
789 if (last_end == (u64)-1) 766 if (last_end == (u64)-1)
790 goto out; 767 goto out;
791 768
792 start = last_end + 1; 769 start = last_end + 1;
770 next_node = rb_next(&state->rb_node);
793 if (next_node && start < end && prealloc && !need_resched()) { 771 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 772 state = rb_entry(next_node, struct extent_state,
795 rb_node); 773 rb_node);
@@ -830,9 +808,7 @@ hit_next:
830 if (err) 808 if (err)
831 goto out; 809 goto out;
832 if (state->end <= end) { 810 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 811 set_state_bits(tree, state, &bits);
834 if (err)
835 goto out;
836 cache_state(state, cached_state); 812 cache_state(state, cached_state);
837 merge_state(tree, state); 813 merge_state(tree, state);
838 if (last_end == (u64)-1) 814 if (last_end == (u64)-1)
@@ -862,7 +838,6 @@ hit_next:
862 * Avoid to free 'prealloc' if it can be merged with 838 * Avoid to free 'prealloc' if it can be merged with
863 * the later extent. 839 * the later extent.
864 */ 840 */
865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 841 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 842 &bits);
868 BUG_ON(err == -EEXIST); 843 BUG_ON(err == -EEXIST);
@@ -872,7 +847,6 @@ hit_next:
872 goto out; 847 goto out;
873 } 848 }
874 cache_state(prealloc, cached_state); 849 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
876 prealloc = NULL; 850 prealloc = NULL;
877 start = this_end + 1; 851 start = this_end + 1;
878 goto search_again; 852 goto search_again;
@@ -895,11 +869,7 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 869 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 870 BUG_ON(err == -EEXIST);
897 871
898 err = set_state_bits(tree, prealloc, &bits); 872 set_state_bits(tree, prealloc, &bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
903 cache_state(prealloc, cached_state); 873 cache_state(prealloc, cached_state);
904 merge_state(tree, prealloc); 874 merge_state(tree, prealloc);
905 prealloc = NULL; 875 prealloc = NULL;
@@ -1061,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	return 0;
 }
 
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	spin_lock(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(tree, start);
-	if (!node)
-		goto out;
-
-	while (1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	spin_unlock(&tree->lock);
-	return ret;
-}
-
 /* find the first state struct with 'bits' set after 'start', and
  * return it. tree->lock must be held. NULL will returned if
  * nothing was found after 'start'
@@ -1133,6 +1063,30 @@ out:
 }
 
 /*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct extent_state *state;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	state = find_first_extent_bit_state(tree, start, bits);
+	if (state) {
+		*start_ret = state->start;
+		*end_ret = state->end;
+		ret = 0;
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'. start and end are used to return the range,
 *
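
For reference, callers of the relocated find_first_extent_bit() are
unchanged; only the body now reuses find_first_extent_bit_state().
Illustrative usage (the variable names are made up):

        u64 found_start, found_end;
        int ret;

        /* returns 0 and fills the range when a state with the bit exists */
        ret = find_first_extent_bit(tree, 0, &found_start, &found_end,
                                    EXTENT_DIRTY);
        if (!ret)
            printk(KERN_DEBUG "first dirty range %llu-%llu\n",
                   (unsigned long long)found_start,
                   (unsigned long long)found_end);
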
@@ -1564,7 +1518,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;
 
 	spin_lock(&tree->lock);
-	if (cached && cached->tree && cached->start == start)
+	if (cached && cached->tree && cached->start <= start &&
+	    cached->end > start)
 		node = &cached->rb_node;
 	else
 		node = tree_search(tree, start);
@@ -2432,6 +2387,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
@@ -2442,11 +2398,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		scanned = 1;
 	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_DIRTY, min(end - index,
-			(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
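
The tagging above mirrors the write_cache_pages() pattern in
mm/page-writeback.c: for WB_SYNC_ALL, pages dirty at the start of the sweep
are first re-tagged TOWRITE, so pages dirtied while writeback is in progress
cannot make the loop run forever. A condensed sketch of the pattern (not the
full btrfs loop):

        int tag = (wbc->sync_mode == WB_SYNC_ALL) ?
                  PAGECACHE_TAG_TOWRITE : PAGECACHE_TAG_DIRTY;

        if (wbc->sync_mode == WB_SYNC_ALL)
            tag_pages_for_writeback(mapping, index, end); /* snapshot */
        while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                                              PAGEVEC_SIZE))) {
            /* write out each page found under 'tag' ... */
        }
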
@@ -2541,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	int ret;
-	struct address_space *mapping = page->mapping;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -2549,18 +2509,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
-	struct writeback_control wbc_writepages = {
-		.sync_mode	= wbc->sync_mode,
-		.older_than_this = NULL,
-		.nr_to_write	= 64,
-		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end	= (loff_t)-1,
-	};
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	extent_write_cache_pages(tree, mapping, &wbc_writepages,
-				 __extent_writepage, &epd, flush_write_bio);
 	flush_epd_write_bio(&epd);
 	return ret;
 }
@@ -2584,7 +2535,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
-		.older_than_this = NULL,
 		.nr_to_write	= nr_pages * 2,
 		.range_start	= start,
 		.range_end	= end + 1,
@@ -3022,8 +2972,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
-	spin_lock_init(&eb->lock);
-	init_waitqueue_head(&eb->lock_wq);
+	rwlock_init(&eb->lock);
+	atomic_set(&eb->write_locks, 0);
+	atomic_set(&eb->read_locks, 0);
+	atomic_set(&eb->blocking_readers, 0);
+	atomic_set(&eb->blocking_writers, 0);
+	atomic_set(&eb->spinning_readers, 0);
+	atomic_set(&eb->spinning_writers, 0);
+	init_waitqueue_head(&eb->write_lock_wq);
+	init_waitqueue_head(&eb->read_lock_wq);
 
 #if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3076,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		i = 0;
 	}
 	for (; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
+		p = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!p) {
 			WARN_ON(1);
 			goto free_eb;
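
Dropping __GFP_HIGHMEM here is what makes the kmap_atomic() removals later
in this file safe: page_address() only returns a usable kernel virtual
address for lowmem pages, so extent buffer pages must never come from
highmem once the code stops mapping them. Roughly:

        /* highmem page: must be mapped before use */
        kaddr = kmap_atomic(page, KM_USER0);
        /* ... */
        kunmap_atomic(kaddr, KM_USER0);

        /* lowmem page: permanently mapped, so this is always valid */
        kaddr = page_address(page);
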
@@ -3266,6 +3223,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 	return was_dirty;
 }
 
+static int __eb_straddles_pages(u64 start, u64 len)
+{
+	if (len < PAGE_CACHE_SIZE)
+		return 1;
+	if (start & (PAGE_CACHE_SIZE - 1))
+		return 1;
+	if ((start + len) & (PAGE_CACHE_SIZE - 1))
+		return 1;
+	return 0;
+}
+
+static int eb_straddles_pages(struct extent_buffer *eb)
+{
+	return __eb_straddles_pages(eb->start, eb->len);
+}
+
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 				 struct extent_buffer *eb,
 				 struct extent_state **cached_state)
@@ -3277,8 +3250,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
-	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      cached_state, GFP_NOFS);
+	if (eb_straddles_pages(eb)) {
+		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+				      cached_state, GFP_NOFS);
+	}
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
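
__eb_straddles_pages() reports whether an extent buffer shares a page with
another buffer: any buffer smaller than a page, or not starting and ending
on page boundaries, straddles. Only straddling buffers still need the
per-range EXTENT_UPTODATE bits; page-aligned buffers can rely on
PageUptodate alone. For example, with 4K pages:

        __eb_straddles_pages(4096, 4096);   /* 0: exactly one page   */
        __eb_straddles_pages(8192, 16384);  /* 0: four whole pages   */
        __eb_straddles_pages(6144, 4096);   /* 1: crosses a boundary */
        __eb_straddles_pages(4096, 2048);   /* 1: sub-page buffer    */
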
@@ -3296,8 +3271,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 
-	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			    NULL, GFP_NOFS);
+	if (eb_straddles_pages(eb)) {
+		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+				    NULL, GFP_NOFS);
+	}
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3297,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	int uptodate;
 	unsigned long index;
 
-	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
-	if (ret)
-		return 1;
+	if (__eb_straddles_pages(start, end - start + 1)) {
+		ret = test_range_bit(tree, start, end,
+				     EXTENT_UPTODATE, 1, NULL);
+		if (ret)
+			return 1;
+	}
 	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
@@ -3350,10 +3330,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
-	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, cached_state);
-	if (ret)
-		return ret;
+	if (eb_straddles_pages(eb)) {
+		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+				     EXTENT_UPTODATE, 1, cached_state);
+		if (ret)
+			return ret;
+	}
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
@@ -3386,9 +3368,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1, NULL)) {
-		return 0;
+	if (eb_straddles_pages(eb)) {
+		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+				   EXTENT_UPTODATE, 1, NULL)) {
+			return 0;
+		}
 	}
 
 	if (start) {
@@ -3492,9 +3476,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = page_address(page);
 		memcpy(dst, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER1);
 
 		dst += cur;
 		len -= cur;
@@ -3504,9 +3487,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 }
 
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-			      unsigned long min_len, char **token, char **map,
+			      unsigned long min_len, char **map,
 			      unsigned long *map_start,
-			      unsigned long *map_len, int km)
+			      unsigned long *map_len)
 {
 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
 	char *kaddr;
@@ -3536,42 +3519,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	}
 
 	p = extent_buffer_page(eb, i);
-	kaddr = kmap_atomic(p, km);
-	*token = kaddr;
+	kaddr = page_address(p);
 	*map = kaddr + offset;
 	*map_len = PAGE_CACHE_SIZE - offset;
 	return 0;
 }
 
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-		      unsigned long min_len,
-		      char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km)
-{
-	int err;
-	int save = 0;
-	if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, km);
-		eb->map_token = NULL;
-		save = 1;
-	}
-	err = map_private_extent_buffer(eb, start, min_len, token, map,
-					map_start, map_len, km);
-	if (!err && save) {
-		eb->map_token = *token;
-		eb->kaddr = *map;
-		eb->map_start = *map_start;
-		eb->map_len = *map_len;
-	}
-	return err;
-}
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
-	kunmap_atomic(token, km);
-}
-
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start,
 			 unsigned long len)
@@ -3595,9 +3548,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		ret = memcmp(ptr, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 		if (ret)
 			break;
 
@@ -3630,9 +3582,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = page_address(page);
 		memcpy(kaddr + offset, src, cur);
-		kunmap_atomic(kaddr, KM_USER1);
 
 		src += cur;
 		len -= cur;
@@ -3661,9 +3612,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		memset(kaddr + offset, c, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 
 		len -= cur;
 		offset = 0;
@@ -3694,9 +3644,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
 
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
 
 		src_offset += cur;
 		len -= cur;
@@ -3709,20 +3658,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
 			unsigned long dst_off, unsigned long src_off,
 			unsigned long len)
 {
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *dst_kaddr = page_address(dst_page);
 	if (dst_page == src_page) {
 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
 	} else {
-		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+		char *src_kaddr = page_address(src_page);
 		char *p = dst_kaddr + dst_off + len;
 		char *s = src_kaddr + src_off + len;
 
 		while (len--)
 			*--p = *--s;
-
-		kunmap_atomic(src_kaddr, KM_USER1);
 	}
-	kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +3681,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
 		       unsigned long len)
 {
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *dst_kaddr = page_address(dst_page);
 	char *src_kaddr;
 
 	if (dst_page != src_page) {
-		src_kaddr = kmap_atomic(src_page, KM_USER1);
+		src_kaddr = page_address(src_page);
 	} else {
 		src_kaddr = dst_kaddr;
 		BUG_ON(areas_overlap(src_off, dst_off, len));
 	}
 
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-	kunmap_atomic(dst_kaddr, KM_USER0);
-	if (dst_page != src_page)
-		kunmap_atomic(src_kaddr, KM_USER1);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92ee2d30..7b2f0c3e7929 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -76,15 +76,15 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
 			    int *bits);
-	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
 			      int *bits);
-	int (*merge_extent_hook)(struct inode *inode,
+	void (*merge_extent_hook)(struct inode *inode,
 				 struct extent_state *new,
 				 struct extent_state *other);
-	int (*split_extent_hook)(struct inode *inode,
+	void (*split_extent_hook)(struct inode *inode,
 				 struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
@@ -108,8 +108,6 @@ struct extent_state {
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
-	u64 split_start;
-	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
@@ -120,8 +118,6 @@ struct extent_state {
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
-	char *map_token;
-	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
@@ -130,14 +126,26 @@ struct extent_buffer {
 	struct rcu_head rcu_head;
 	atomic_t refs;
 
-	/* the spinlock is used to protect most operations */
-	spinlock_t lock;
+	/* count of read lock holders on the extent buffer */
+	atomic_t write_locks;
+	atomic_t read_locks;
+	atomic_t blocking_writers;
+	atomic_t blocking_readers;
+	atomic_t spinning_readers;
+	atomic_t spinning_writers;
+
+	/* protects write locks */
+	rwlock_t lock;
 
-	/*
-	 * when we keep the lock held while blocking, waiters go onto
-	 * the wq
+	/* readers use lock_wq while they wait for the write
+	 * lock holders to unlock
 	 */
-	wait_queue_head_t lock_wq;
+	wait_queue_head_t write_lock_wq;
+
+	/* writers use read_lock_wq while they wait for readers
+	 * to unlock
+	 */
+	wait_queue_head_t read_lock_wq;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
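
The new counters split the old single spinlock into a blocking/spinning
reader-writer scheme (implemented in fs/btrfs/locking.c, per the diffstat).
A simplified sketch of how a reader might take the lock against these
fields, assuming the btrfs_tree_read_lock() semantics from that file; this
is an outline, not the real function:

        read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers)) {
            /* a writer went blocking: drop the spinlock and sleep */
            read_unlock(&eb->lock);
            wait_event(eb->write_lock_wq,
                       atomic_read(&eb->blocking_writers) == 0);
            /* then retry from the top */
        }
        atomic_inc(&eb->read_locks);
        atomic_inc(&eb->spinning_readers);
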
@@ -279,15 +287,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb,
 			   struct extent_state *cached_state);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
+		      unsigned long min_len, char **map,
 		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+		      unsigned long *map_len);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d0410344ea3..7c97b3301459 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret = 0;
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
-	struct extent_map *em;
-
-	write_lock(&tree->lock);
-	em = lookup_extent_mapping(tree, start, len);
-
-	WARN_ON(!em || em->start != start);
-
-	if (!em)
-		goto out;
-
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 		merge->in_tree = 0;
 		free_extent_map(merge);
 	}
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+	int ret = 0;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(!em || em->start != start);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	try_merge_map(tree, em);
 
 	free_extent_map(em);
 out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 	struct extent_map *exist;
 
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	atomic_inc(&em->refs);
-	if (em->start != 0) {
-		rb = rb_prev(&em->rb_node);
-		if (rb)
-			merge = rb_entry(rb, struct extent_map, rb_node);
-		if (rb && mergable_maps(merge, em)) {
-			em->start = merge->start;
-			em->len += merge->len;
-			em->block_len += merge->block_len;
-			em->block_start = merge->block_start;
-			merge->in_tree = 0;
-			rb_erase(&merge->rb_node, &tree->map);
-			free_extent_map(merge);
-		}
-	}
-	rb = rb_next(&em->rb_node);
-	if (rb)
-		merge = rb_entry(rb, struct extent_map, rb_node);
-	if (rb && mergable_maps(em, merge)) {
-		em->len += merge->len;
-		em->block_len += merge->len;
-		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
-		free_extent_map(merge);
-	}
+
+	try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-/**
- * lookup_extent_mapping - lookup extent_map
- * @tree:	tree to lookup in
- * @start:	byte offset to start the search
- * @len:	length of the lookup range
- *
- * Find and return the first extent_map struct in @tree that intersects the
- * [start, len] range.  There may be additional objects in the tree that
- * intersect, so check the object returned carefully to make sure that no
- * additional lookups are needed.
- */
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 len)
+struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
+					   u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	u64 end = range_end(start, len);
 
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
 	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
+		if (prev)
+			rb_node = prev;
+		else if (next)
+			rb_node = next;
+		else
+			return NULL;
 	}
+
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (end > em->start && start < extent_map_end(em))
-		goto found;
 
-	em = NULL;
-	goto out;
+	if (strict && !(end > em->start && start < extent_map_end(em)))
+		return NULL;
 
-found:
 	atomic_inc(&em->refs);
-out:
 	return em;
 }
 
 /**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
  * search_extent_mapping - find a nearby extent map
  * @tree:	tree to lookup in
  * @start:	byte offset to start the search
@@ -365,38 +341,7 @@ out:
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len)
 {
-	struct extent_map *em;
-	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
-
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
-	}
-	em = rb_entry(rb_node, struct extent_map, rb_node);
-	goto found;
-
-	em = NULL;
-	goto out;
-
-found:
-	atomic_inc(&em->refs);
-out:
-	return em;
+	return __lookup_extent_mapping(tree, start, len, 0);
 }
 
 /**
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90d4ee52cd45..a1cb7821becd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -177,6 +177,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
+	/*
+	 * the free space stuff is only read when it hasn't been
+	 * updated in the current transaction. So, we can safely
+	 * read from the commit root and sidestep a nasty deadlock
+	 * between reading the free space cache and updating the csum tree.
+	 */
+	if (btrfs_is_free_space_inode(root, inode)) {
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+	}
+
 	disk_bytenr = (u64)bio->bi_sector << 9;
 	if (dio)
 		offset = logical_offset;
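
search_commit_root makes the path walk the last committed version of the
tree, and skip_locking is safe there because the commit root is immutable
until the next transaction commits. The same two-flag pattern appears in
the free-space cache loader; sketched:

        struct btrfs_path *path = btrfs_alloc_path();
        if (!path)
            return -ENOMEM;
        /* read-only view of the csum tree as of the last commit */
        path->search_commit_root = 1;
        path->skip_locking = 1;
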
@@ -282,7 +293,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	if (search_commit) {
 		path->skip_locking = 1;
@@ -664,15 +676,13 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	char *eb_map;
-	char *eb_token;
-	unsigned long map_len;
-	unsigned long map_start;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
@@ -814,30 +824,9 @@ found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
-	eb_token = NULL;
 next_sector:
 
-	if (!eb_token ||
-	    (unsigned long)item + csum_size >= map_start + map_len) {
-		int err;
-
-		if (eb_token)
-			unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-		err = map_private_extent_buffer(leaf, (unsigned long)item,
-						csum_size,
-						&eb_token, &eb_map,
-						&map_start, &map_len, KM_USER1);
-		if (err)
-			eb_token = NULL;
-	}
-	if (eb_token) {
-		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &sector_sum->sum, csum_size);
-	} else {
-		write_extent_buffer(leaf, &sector_sum->sum,
-				    (unsigned long)item, csum_size);
-	}
+	write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
 
 	total_bytes += root->sectorsize;
 	sector_sum++;
@@ -850,10 +839,7 @@ next_sector:
 			goto next_sector;
 		}
 	}
-	if (eb_token) {
-		unmap_extent_buffer(leaf, eb_token, KM_USER1);
-		eb_token = NULL;
-	}
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
 		btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 59cbdb120ad0..3c3abff731a7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static int __btrfs_add_inode_defrag(struct inode *inode,
+static void __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
 	BTRFS_I(inode)->in_defrag = 1;
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return 0;
+	return;
 
 exists:
 	kfree(defrag);
-	return 0;
+	return;
 
 }
 
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
-	int ret = 0;
 	u64 transid;
 
 	if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
 	if (!BTRFS_I(inode)->in_defrag)
-		ret = __btrfs_add_inode_defrag(inode, defrag);
+		__btrfs_add_inode_defrag(inode, defrag);
+	else
+		kfree(defrag);
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
-	return ret;
+	return 0;
 }
 
 /*
@@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 again:
 	recow = 0;
 	split = start;
@@ -1059,7 +1061,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
-			 unsigned long last_index, size_t write_bytes)
+			 size_t write_bytes)
 {
 	struct extent_state *cached_state = NULL;
 	int i;
@@ -1073,15 +1075,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	start_pos = pos & ~((u64)root->sectorsize - 1);
 	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
-	if (start_pos > inode->i_size) {
-		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
-		if (err)
-			return err;
-	}
-
 again:
 	for (i = 0; i < num_pages; i++) {
-		pages[i] = grab_cache_page(inode->i_mapping, index + i);
+		pages[i] = find_or_create_page(inode->i_mapping, index + i,
+					       GFP_NOFS);
 		if (!pages[i]) {
 			faili = i - 1;
 			err = -ENOMEM;
@@ -1158,7 +1155,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
 	unsigned long first_index;
-	unsigned long last_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1171,7 +1167,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		return -ENOMEM;
 
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
 
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1205,8 +1200,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * contents of pages from loop to loop
 		 */
 		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes);
+				    pos, first_index, write_bytes);
 		if (ret) {
 			btrfs_delalloc_release_space(inode,
 					num_pages << PAGE_CACHE_SHIFT);
@@ -1238,9 +1232,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * managed to copy.
 		 */
 		if (num_pages > dirty_pages) {
-			if (copied > 0)
-				atomic_inc(
-					&BTRFS_I(inode)->outstanding_extents);
+			if (copied > 0) {
+				spin_lock(&BTRFS_I(inode)->lock);
+				BTRFS_I(inode)->outstanding_extents++;
+				spin_unlock(&BTRFS_I(inode)->lock);
+			}
 			btrfs_delalloc_release_space(inode,
 					(num_pages - dirty_pages) <<
 					PAGE_CACHE_SHIFT);
@@ -1336,6 +1332,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	loff_t *ppos = &iocb->ki_pos;
+	u64 start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
@@ -1384,6 +1381,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	file_update_time(file);
 	BTRFS_I(inode)->sequence++;
 
+	start_pos = round_down(pos, root->sectorsize);
+	if (start_pos > i_size_read(inode)) {
+		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
 						   pos, ppos, count, ocount);
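
Moving the expansion here (out of prepare_pages(), removed above) means the
i_size check happens once per write, with i_mutex held. round_down() clamps
pos to a sector boundary before comparing; for example, with a 4096-byte
sectorsize and a write at pos 10000:

        start_pos = round_down(10000, 4096);    /* = 8192 */
        /* expand only if 8192 lands beyond the current i_size */
        if (start_pos > i_size_read(inode))
            err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
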
@@ -1638,11 +1644,15 @@ static long btrfs_fallocate(struct file *file, int mode,
 
 	cur_offset = alloc_start;
 	while (1) {
+		u64 actual_end;
+
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		BUG_ON(IS_ERR_OR_NULL(em));
 		last_byte = min(extent_map_end(em), alloc_end);
+		actual_end = min_t(u64, extent_map_end(em), offset + len);
 		last_byte = (last_byte + mask) & ~mask;
+
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1655,6 +1665,16 @@ static long btrfs_fallocate(struct file *file, int mode,
 				free_extent_map(em);
 				break;
 			}
+		} else if (actual_end > inode->i_size &&
+			   !(mode & FALLOC_FL_KEEP_SIZE)) {
+			/*
+			 * We didn't need to allocate any more space, but we
+			 * still extended the size of the file so we need to
+			 * update i_size.
+			 */
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
 		}
 		free_extent_map(em);
 
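
This branch covers a fallocate() without FALLOC_FL_KEEP_SIZE that lands
entirely inside already-preallocated extents: no new allocation happens, yet
the file must still grow. actual_end clamps to the requested range; for
example, for offset=0 and len=10000 over a 16K preallocated extent:

        actual_end = min_t(u64, extent_map_end(em), offset + len);
        /* = min(16384, 10000) = 10000: i_size becomes exactly the
         * requested end, not the end of the preallocated extent */
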
@@ -1804,10 +1824,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
 		}
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (offset > inode->i_sb->s_maxbytes)
-		return -EINVAL;
+	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (offset > inode->i_sb->s_maxbytes) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	/* Special lock needed here? */
 	if (offset != file->f_pos) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bf0d61567f3d..41ac927401d0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		return inode;
 
 	spin_lock(&block_group->lock);
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
+		printk(KERN_INFO "Old style space inode found, converting.\n");
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
+		block_group->disk_cache_state = BTRFS_DC_CLEAR;
+	}
+
 	if (!btrfs_fs_closing(root->fs_info)) {
 		block_group->inode = igrab(inode);
 		block_group->iref = 1;
@@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,
 	btrfs_set_inode_gid(leaf, inode_item, 0);
 	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
-			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
+			      BTRFS_INODE_PREALLOC);
 	btrfs_set_inode_nlink(leaf, inode_item, 1);
 	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 	btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -184,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				  struct btrfs_path *path,
 				  struct inode *inode)
 {
+	struct btrfs_block_rsv *rsv;
 	loff_t oldsize;
 	int ret = 0;
 
+	rsv = trans->block_rsv;
 	trans->block_rsv = root->orphan_block_rsv;
 	ret = btrfs_block_rsv_check(trans, root,
 				    root->orphan_block_rsv,
@@ -204,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
+
+	trans->block_rsv = rsv;
 	if (ret) {
 		WARN_ON(1);
 		return ret;
@@ -239,17 +249,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
 	struct page *page;
-	u32 *checksums = NULL, *crc;
-	char *disk_crcs = NULL;
 	struct btrfs_key key;
 	struct list_head bitmaps;
 	u64 num_entries;
 	u64 num_bitmaps;
 	u64 generation;
-	u32 cur_crc = ~(u32)0;
 	pgoff_t index = 0;
-	unsigned long first_page_offset;
-	int num_checksums;
 	int ret = 0;
 
 	INIT_LIST_HEAD(&bitmaps);
@@ -292,16 +297,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	if (!num_entries)
 		goto out;
 
-	/* Setup everything for doing checksumming */
-	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
-	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
-	if (!checksums)
-		goto out;
-	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
-	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
-	if (!disk_crcs)
-		goto out;
-
 	ret = readahead_cache(inode);
 	if (ret)
 		goto out;
@@ -311,18 +306,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		struct btrfs_free_space *e;
 		void *addr;
 		unsigned long offset = 0;
-		unsigned long start_offset = 0;
 		int need_loop = 0;
 
 		if (!num_entries && !num_bitmaps)
 			break;
 
-		if (index == 0) {
-			start_offset = first_page_offset;
-			offset = start_offset;
-		}
-
-		page = grab_cache_page(inode->i_mapping, index);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page)
 			goto free_cache;
 
@@ -342,8 +331,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		if (index == 0) {
 			u64 *gen;
 
-			memcpy(disk_crcs, addr, first_page_offset);
-			gen = addr + (sizeof(u32) * num_checksums);
+			/*
+			 * We put a bogus crc in the front of the first page in
+			 * case old kernels try to mount a fs with the new
+			 * format to make sure they discard the cache.
+			 */
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+
+			gen = addr;
 			if (*gen != BTRFS_I(inode)->generation) {
 				printk(KERN_ERR "btrfs: space cache generation"
 				       " (%llu) does not match inode (%llu)\n",
@@ -355,24 +351,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 				page_cache_release(page);
 				goto free_cache;
 			}
-			crc = (u32 *)disk_crcs;
+			addr += sizeof(u64);
+			offset += sizeof(u64);
 		}
-		entry = addr + start_offset;
-
-		/* First lets check our crc before we do anything fun */
-		cur_crc = ~(u32)0;
-		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
-					  PAGE_CACHE_SIZE - start_offset);
-		btrfs_csum_final(cur_crc, (char *)&cur_crc);
-		if (cur_crc != *crc) {
-			printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
-			       index);
-			kunmap(page);
-			unlock_page(page);
-			page_cache_release(page);
-			goto free_cache;
-		}
-		crc++;
+		entry = addr;
 
 		while (1) {
 			if (!num_entries)
@@ -470,8 +452,6 @@ next:
 
 	ret = 1;
 out:
-	kfree(checksums);
-	kfree(disk_crcs);
 	return ret;
 free_cache:
 	__btrfs_remove_free_space_cache(ctl);
@@ -569,8 +549,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	struct btrfs_key key;
 	u64 start, end, len;
 	u64 bytes = 0;
-	u32 *crc, *checksums;
-	unsigned long first_page_offset;
+	u32 crc = ~(u32)0;
 	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
@@ -590,34 +569,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
 		PAGE_CACHE_SHIFT;
 
-	/* Since the first page has all of our checksums and our generation we
-	 * need to calculate the offset into the page that we can start writing
-	 * our entries.
-	 */
-	first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
-
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
 				 ~(root->sectorsize - 1), (u64)-1);
 
-	/* make sure we don't overflow that first page */
-	if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
-		/* this is really the same as running out of space, where we also return 0 */
-		printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
-		ret = 0;
-		goto out_update;
-	}
-
-	/* We need a checksum per page. */
-	crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
-	if (!crc)
-		return -1;
-
 	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
-	if (!pages) {
-		kfree(crc);
+	if (!pages)
 		return -1;
-	}
 
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
@@ -640,7 +598,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * know and don't freak out.
 	 */
 	while (index < num_pages) {
-		page = grab_cache_page(inode->i_mapping, index);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
 			int i;
 
@@ -648,7 +606,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			goto out_free;
+			goto out;
 		}
 		pages[index] = page;
 		index++;
@@ -668,17 +626,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	/* Write out the extent entries */
 	do {
 		struct btrfs_free_space_entry *entry;
-		void *addr;
+		void *addr, *orig;
 		unsigned long offset = 0;
-		unsigned long start_offset = 0;
 
 		next_page = false;
 
-		if (index == 0) {
-			start_offset = first_page_offset;
-			offset = start_offset;
-		}
-
 		if (index >= num_pages) {
 			out_of_space = true;
 			break;
@@ -686,10 +638,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 		page = pages[index];
 
-		addr = kmap(page);
-		entry = addr + start_offset;
+		orig = addr = kmap(page);
+		if (index == 0) {
+			u64 *gen;
 
-		memset(addr, 0, PAGE_CACHE_SIZE);
+			/*
+			 * We're going to put in a bogus crc for this page to
+			 * make sure that old kernels who aren't aware of this
+			 * format will be sure to discard the cache.
+			 */
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+
+			gen = addr;
+			*gen = trans->transid;
+			addr += sizeof(u64);
+			offset += sizeof(u64);
+		}
+		entry = addr;
+
+		memset(addr, 0, PAGE_CACHE_SIZE - offset);
 		while (node && !next_page) {
 			struct btrfs_free_space *e;
 
@@ -752,13 +720,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				next_page = true;
 				entry++;
 			}
-			*crc = ~(u32)0;
-			*crc = btrfs_csum_data(root, addr + start_offset, *crc,
-					       PAGE_CACHE_SIZE - start_offset);
-			kunmap(page);
 
-			btrfs_csum_final(*crc, (char *)crc);
-			crc++;
+		/* Generate bogus crc value */
+		if (index == 0) {
+			u32 *tmp;
+			crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
+					      PAGE_CACHE_SIZE - sizeof(u64));
+			btrfs_csum_final(crc, (char *)&crc);
+			crc++;
+			tmp = orig;
+			*tmp = crc;
+		}
+
+		kunmap(page);
 
 		bytes += PAGE_CACHE_SIZE;
 
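
With the per-page checksum array gone, the first cache page now carries
only two 8-byte headers before the entries. The crc slot is deliberately
written with a value old kernels will fail to verify, forcing them to
discard the cache rather than misparse the new layout. Sketch of the first
page (illustrative, not a real on-disk struct):

        struct first_page_hdr {
            u64 crc_slot;   /* offset 0: bogus crc old kernels reject */
            u64 generation; /* offset 8: must match the inode's generation */
        };
        /* offset 16 onward: btrfs_free_space_entry records */
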
@@ -779,11 +753,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 		addr = kmap(page);
 		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
-		*crc = ~(u32)0;
-		*crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
 		kunmap(page);
-		btrfs_csum_final(*crc, (char *)crc);
-		crc++;
 		bytes += PAGE_CACHE_SIZE;
 
 		list_del_init(&entry->list);
@@ -796,7 +766,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				 i_size_read(inode) - 1, &cached_state,
 				 GFP_NOFS);
 		ret = 0;
-		goto out_free;
+		goto out;
 	}
 
 	/* Zero out the rest of the pages just to make sure */
@@ -811,20 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
811 index++; 781 index++;
812 } 782 }
813 783
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state); 785 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages); 786 btrfs_drop_pages(pages, num_pages);
@@ -833,7 +789,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 789
834 if (ret) { 790 if (ret) {
835 ret = 0; 791 ret = 0;
836 goto out_free; 792 goto out;
837 } 793 }
838 794
839 BTRFS_I(inode)->generation = trans->transid; 795 BTRFS_I(inode)->generation = trans->transid;
@@ -850,7 +806,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC | 807 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free; 809 goto out;
854 } 810 }
855 leaf = path->nodes[0]; 811 leaf = path->nodes[0];
856 if (ret > 0) { 812 if (ret > 0) {
@@ -866,7 +822,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 822 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS); 823 GFP_NOFS);
868 btrfs_release_path(path); 824 btrfs_release_path(path);
869 goto out_free; 825 goto out;
870 } 826 }
871 } 827 }
872 header = btrfs_item_ptr(leaf, path->slots[0], 828 header = btrfs_item_ptr(leaf, path->slots[0],
@@ -879,11 +835,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
879 835
880 ret = 1; 836 ret = 1;
881 837
882out_free: 838out:
883 kfree(checksums);
884 kfree(pages); 839 kfree(pages);
885
886out_update:
887 if (ret != 1) { 840 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 841 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0; 842 BTRFS_I(inode)->generation = 0;
@@ -1219,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1219 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1172 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1220} 1173}
1221 1174
1222static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1175static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1223 struct btrfs_free_space *info, u64 offset, 1176 struct btrfs_free_space *info,
1224 u64 bytes) 1177 u64 offset, u64 bytes)
1225{ 1178{
1226 unsigned long start, count; 1179 unsigned long start, count;
1227 1180
@@ -1232,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1232 bitmap_clear(info->bitmap, start, count); 1185 bitmap_clear(info->bitmap, start, count);
1233 1186
1234 info->bytes -= bytes; 1187 info->bytes -= bytes;
1188}
1189
1190static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1191 struct btrfs_free_space *info, u64 offset,
1192 u64 bytes)
1193{
1194 __bitmap_clear_bits(ctl, info, offset, bytes);
1235 ctl->free_space -= bytes; 1195 ctl->free_space -= bytes;
1236} 1196}
1237 1197
@@ -2035,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2035 return 0; 1995 return 0;
2036 1996
2037 ret = search_start; 1997 ret = search_start;
2038 bitmap_clear_bits(ctl, entry, ret, bytes); 1998 __bitmap_clear_bits(ctl, entry, ret, bytes);
2039 1999
2040 return ret; 2000 return ret;
2041} 2001}
@@ -2090,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2090 continue; 2050 continue;
2091 } 2051 }
2092 } else { 2052 } else {
2093
2094 ret = entry->offset; 2053 ret = entry->offset;
2095 2054
2096 entry->offset += bytes; 2055 entry->offset += bytes;
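
A note on the free-space-cache.c rewrite above: the separate checksum array (the old checksums allocation, the per-page btrfs_csum_data() calls, and the final copy onto page 0) is gone. Instead the first page carries an inline header: a u64 slot whose low u32 holds a deliberately wrong checksum, then the u64 transaction generation. Old kernels parse that slot as the start of their crc array, see a mismatch, and throw the cache away. Below is a minimal user-space sketch of the layout as this hunk defines it; csum32 is only a stand-in for btrfs_csum_data()/btrfs_csum_final() (crc32c in the kernel), and PAGE_SIZE is assumed to be 4096.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096          /* assumption; PAGE_CACHE_SIZE in the kernel */

/* placeholder checksum, NOT the kernel's crc32c */
static uint32_t csum32(const uint8_t *data, size_t len)
{
        uint32_t sum = ~0u;

        while (len--)
                sum = (sum << 5) + sum + *data++;
        return ~sum;
}

/* first-page layout from the hunk: [u64 crc slot][u64 generation][entries] */
static void write_first_page(uint8_t *page, uint64_t transid)
{
        uint32_t crc;

        memset(page, 0, PAGE_SIZE);
        memcpy(page + sizeof(uint64_t), &transid, sizeof(transid));

        /* ... free-space entries would be appended after the generation ... */

        /* checksum everything past the reserved slot, then bump it by one
         * (the crc++ in the hunk) so pre-format kernels reject the cache */
        crc = csum32(page + sizeof(uint64_t), PAGE_SIZE - sizeof(uint64_t));
        crc++;
        memcpy(page, &crc, sizeof(crc));
}
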
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e91b097e7252..4d14de6d121b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
750 return alloc_hint; 750 return alloc_hint;
751} 751}
752 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
762/* 753/*
763 * when extent_io.c finds a delayed allocation range in the file, 754 * when extent_io.c finds a delayed allocation range in the file,
764 * the call backs end up in this code. The basic idea is to 755 * the call backs end up in this code. The basic idea is to
@@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode,
791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 782 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
792 int ret = 0; 783 int ret = 0;
793 784
794 BUG_ON(is_free_space_inode(root, inode)); 785 BUG_ON(btrfs_is_free_space_inode(root, inode));
795 trans = btrfs_join_transaction(root); 786 trans = btrfs_join_transaction(root);
796 BUG_ON(IS_ERR(trans)); 787 BUG_ON(IS_ERR(trans));
797 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 788 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1070,9 +1061,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1070 u64 ino = btrfs_ino(inode); 1061 u64 ino = btrfs_ino(inode);
1071 1062
1072 path = btrfs_alloc_path(); 1063 path = btrfs_alloc_path();
1073 BUG_ON(!path); 1064 if (!path)
1065 return -ENOMEM;
1074 1066
1075 nolock = is_free_space_inode(root, inode); 1067 nolock = btrfs_is_free_space_inode(root, inode);
1076 1068
1077 if (nolock) 1069 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root); 1070 trans = btrfs_join_transaction_nolock(root);
@@ -1291,15 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1291 return ret; 1283 return ret;
1292} 1284}
1293 1285
1294static int btrfs_split_extent_hook(struct inode *inode, 1286static void btrfs_split_extent_hook(struct inode *inode,
1295 struct extent_state *orig, u64 split) 1287 struct extent_state *orig, u64 split)
1296{ 1288{
1297 /* not delalloc, ignore it */ 1289 /* not delalloc, ignore it */
1298 if (!(orig->state & EXTENT_DELALLOC)) 1290 if (!(orig->state & EXTENT_DELALLOC))
1299 return 0; 1291 return;
1300 1292
1301 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1293 spin_lock(&BTRFS_I(inode)->lock);
1302 return 0; 1294 BTRFS_I(inode)->outstanding_extents++;
1295 spin_unlock(&BTRFS_I(inode)->lock);
1303} 1296}
1304 1297
1305/* 1298/*
@@ -1308,16 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
1308 * extents, such as when we are doing sequential writes, so we can properly 1301 * extents, such as when we are doing sequential writes, so we can properly
1309 * account for the metadata space we'll need. 1302 * account for the metadata space we'll need.
1310 */ 1303 */
1311static int btrfs_merge_extent_hook(struct inode *inode, 1304static void btrfs_merge_extent_hook(struct inode *inode,
1312 struct extent_state *new, 1305 struct extent_state *new,
1313 struct extent_state *other) 1306 struct extent_state *other)
1314{ 1307{
1315 /* not delalloc, ignore it */ 1308 /* not delalloc, ignore it */
1316 if (!(other->state & EXTENT_DELALLOC)) 1309 if (!(other->state & EXTENT_DELALLOC))
1317 return 0; 1310 return;
1318 1311
1319 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1312 spin_lock(&BTRFS_I(inode)->lock);
1320 return 0; 1313 BTRFS_I(inode)->outstanding_extents--;
1314 spin_unlock(&BTRFS_I(inode)->lock);
1321} 1315}
1322 1316
1323/* 1317/*
@@ -1325,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1325 * bytes in this file, and to maintain the list of inodes that 1319 * bytes in this file, and to maintain the list of inodes that
1326 * have pending delalloc work to be done. 1320 * have pending delalloc work to be done.
1327 */ 1321 */
1328static int btrfs_set_bit_hook(struct inode *inode, 1322static void btrfs_set_bit_hook(struct inode *inode,
1329 struct extent_state *state, int *bits) 1323 struct extent_state *state, int *bits)
1330{ 1324{
1331 1325
1332 /* 1326 /*
@@ -1337,12 +1331,15 @@ static int btrfs_set_bit_hook(struct inode *inode,
1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1331 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1338 struct btrfs_root *root = BTRFS_I(inode)->root; 1332 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 u64 len = state->end + 1 - state->start; 1333 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode); 1334 bool do_list = !btrfs_is_free_space_inode(root, inode);
1341 1335
1342 if (*bits & EXTENT_FIRST_DELALLOC) 1336 if (*bits & EXTENT_FIRST_DELALLOC) {
1343 *bits &= ~EXTENT_FIRST_DELALLOC; 1337 *bits &= ~EXTENT_FIRST_DELALLOC;
1344 else 1338 } else {
1345 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1339 spin_lock(&BTRFS_I(inode)->lock);
1340 BTRFS_I(inode)->outstanding_extents++;
1341 spin_unlock(&BTRFS_I(inode)->lock);
1342 }
1346 1343
1347 spin_lock(&root->fs_info->delalloc_lock); 1344 spin_lock(&root->fs_info->delalloc_lock);
1348 BTRFS_I(inode)->delalloc_bytes += len; 1345 BTRFS_I(inode)->delalloc_bytes += len;
@@ -1353,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
1353 } 1350 }
1354 spin_unlock(&root->fs_info->delalloc_lock); 1351 spin_unlock(&root->fs_info->delalloc_lock);
1355 } 1352 }
1356 return 0;
1357} 1353}
1358 1354
1359/* 1355/*
1360 * extent_io.c clear_bit_hook, see set_bit_hook for why 1356 * extent_io.c clear_bit_hook, see set_bit_hook for why
1361 */ 1357 */
1362static int btrfs_clear_bit_hook(struct inode *inode, 1358static void btrfs_clear_bit_hook(struct inode *inode,
1363 struct extent_state *state, int *bits) 1359 struct extent_state *state, int *bits)
1364{ 1360{
1365 /* 1361 /*
1366 * set_bit and clear bit hooks normally require _irqsave/restore 1362 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1370,12 +1366,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1366 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1371 struct btrfs_root *root = BTRFS_I(inode)->root; 1367 struct btrfs_root *root = BTRFS_I(inode)->root;
1372 u64 len = state->end + 1 - state->start; 1368 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode); 1369 bool do_list = !btrfs_is_free_space_inode(root, inode);
1374 1370
1375 if (*bits & EXTENT_FIRST_DELALLOC) 1371 if (*bits & EXTENT_FIRST_DELALLOC) {
1376 *bits &= ~EXTENT_FIRST_DELALLOC; 1372 *bits &= ~EXTENT_FIRST_DELALLOC;
1377 else if (!(*bits & EXTENT_DO_ACCOUNTING)) 1373 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1378 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1374 spin_lock(&BTRFS_I(inode)->lock);
1375 BTRFS_I(inode)->outstanding_extents--;
1376 spin_unlock(&BTRFS_I(inode)->lock);
1377 }
1379 1378
1380 if (*bits & EXTENT_DO_ACCOUNTING) 1379 if (*bits & EXTENT_DO_ACCOUNTING)
1381 btrfs_delalloc_release_metadata(inode, len); 1380 btrfs_delalloc_release_metadata(inode, len);
@@ -1394,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1394 } 1393 }
1395 spin_unlock(&root->fs_info->delalloc_lock); 1394 spin_unlock(&root->fs_info->delalloc_lock);
1396 } 1395 }
1397 return 0;
1398} 1396}
1399 1397
1400/* 1398/*
@@ -1477,7 +1475,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1477 1475
1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1476 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1479 1477
1480 if (is_free_space_inode(root, inode)) 1478 if (btrfs_is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); 1479 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else 1480 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1644,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1644 int ret; 1642 int ret;
1645 1643
1646 path = btrfs_alloc_path(); 1644 path = btrfs_alloc_path();
1647 BUG_ON(!path); 1645 if (!path)
1646 return -ENOMEM;
1648 1647
1649 path->leave_spinning = 1; 1648 path->leave_spinning = 1;
1650 1649
@@ -1726,7 +1725,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 return 0; 1725 return 0;
1727 BUG_ON(!ordered_extent); 1726 BUG_ON(!ordered_extent);
1728 1727
1729 nolock = is_free_space_inode(root, inode); 1728 nolock = btrfs_is_free_space_inode(root, inode);
1730 1729
1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1730 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1732 BUG_ON(!list_empty(&ordered_extent->list)); 1731 BUG_ON(!list_empty(&ordered_extent->list));
@@ -1787,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 &ordered_extent->list); 1786 &ordered_extent->list);
1788 1787
1789 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1790 if (!ret) { 1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1791 ret = btrfs_update_inode(trans, root, inode); 1790 ret = btrfs_update_inode(trans, root, inode);
1792 BUG_ON(ret); 1791 BUG_ON(ret);
1793 } 1792 }
@@ -2214,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2214 2213
2215 if (!root->orphan_block_rsv) { 2214 if (!root->orphan_block_rsv) {
2216 block_rsv = btrfs_alloc_block_rsv(root); 2215 block_rsv = btrfs_alloc_block_rsv(root);
2217 BUG_ON(!block_rsv); 2216 if (!block_rsv)
2217 return -ENOMEM;
2218 } 2218 }
2219 2219
2220 spin_lock(&root->orphan_lock); 2220 spin_lock(&root->orphan_lock);
@@ -2516,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
2516 filled = true; 2516 filled = true;
2517 2517
2518 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2519 BUG_ON(!path); 2519 if (!path)
2520 goto make_bad;
2521
2520 path->leave_spinning = 1; 2522 path->leave_spinning = 1;
2521 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2523 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2522 2524
@@ -2531,13 +2533,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
2531 2533
2532 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2534 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2533 struct btrfs_inode_item); 2535 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2540
2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
2543 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
@@ -2575,11 +2570,6 @@ cache_acl:
2575 if (!maybe_acls) 2570 if (!maybe_acls)
2576 cache_no_acl(inode); 2571 cache_no_acl(inode);
2577 2572
2578 if (leaf->map_token) {
2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2583 btrfs_free_path(path); 2573 btrfs_free_path(path);
2584 2574
2585 switch (inode->i_mode & S_IFMT) { 2575 switch (inode->i_mode & S_IFMT) {
@@ -2624,13 +2614,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2624 struct btrfs_inode_item *item, 2614 struct btrfs_inode_item *item,
2625 struct inode *inode) 2615 struct inode *inode)
2626{ 2616{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2634 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2617 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2635 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2618 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2619 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2659,11 +2642,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2642 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2643 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2661 btrfs_set_inode_block_group(leaf, item, 0); 2644 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2667} 2645}
2668 2646
2669/* 2647/*
@@ -2684,7 +2662,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2684 * The data relocation inode should also be directly updated 2662 * The data relocation inode should also be directly updated
2685 * without delay 2663 * without delay
2686 */ 2664 */
2687 if (!is_free_space_inode(root, inode) 2665 if (!btrfs_is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode); 2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret) 2668 if (!ret)
@@ -3021,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3021 2999
3022 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3000 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3023 dentry->d_name.name, dentry->d_name.len); 3001 dentry->d_name.name, dentry->d_name.len);
3024 BUG_ON(ret); 3002 if (ret)
3003 goto out;
3025 3004
3026 if (inode->i_nlink == 0) { 3005 if (inode->i_nlink == 0) {
3027 ret = btrfs_orphan_add(trans, inode); 3006 ret = btrfs_orphan_add(trans, inode);
3028 BUG_ON(ret); 3007 if (ret)
3008 goto out;
3029 } 3009 }
3030 3010
3011out:
3031 nr = trans->blocks_used; 3012 nr = trans->blocks_used;
3032 __unlink_end_trans(trans, root); 3013 __unlink_end_trans(trans, root);
3033 btrfs_btree_balance_dirty(root, nr); 3014 btrfs_btree_balance_dirty(root, nr);
@@ -3170,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3170 3151
3171 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3152 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3172 3153
3154 path = btrfs_alloc_path();
3155 if (!path)
3156 return -ENOMEM;
3157 path->reada = -1;
3158
3173 if (root->ref_cows || root == root->fs_info->tree_root) 3159 if (root->ref_cows || root == root->fs_info->tree_root)
3174 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3160 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3175 3161
@@ -3182,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3182 if (min_type == 0 && root == BTRFS_I(inode)->root) 3168 if (min_type == 0 && root == BTRFS_I(inode)->root)
3183 btrfs_kill_delayed_inode_items(inode); 3169 btrfs_kill_delayed_inode_items(inode);
3184 3170
3185 path = btrfs_alloc_path();
3186 BUG_ON(!path);
3187 path->reada = -1;
3188
3189 key.objectid = ino; 3171 key.objectid = ino;
3190 key.offset = (u64)-1; 3172 key.offset = (u64)-1;
3191 key.type = (u8)-1; 3173 key.type = (u8)-1;
@@ -3398,7 +3380,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3398 3380
3399 ret = -ENOMEM; 3381 ret = -ENOMEM;
3400again: 3382again:
3401 page = grab_cache_page(mapping, index); 3383 page = find_or_create_page(mapping, index, GFP_NOFS);
3402 if (!page) { 3384 if (!page) {
3403 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3404 goto out; 3386 goto out;
@@ -3528,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3528 err = btrfs_drop_extents(trans, inode, cur_offset, 3510 err = btrfs_drop_extents(trans, inode, cur_offset,
3529 cur_offset + hole_size, 3511 cur_offset + hole_size,
3530 &hint_byte, 1); 3512 &hint_byte, 1);
3531 if (err) 3513 if (err) {
3514 btrfs_end_transaction(trans, root);
3532 break; 3515 break;
3516 }
3533 3517
3534 err = btrfs_insert_file_extent(trans, root, 3518 err = btrfs_insert_file_extent(trans, root,
3535 btrfs_ino(inode), cur_offset, 0, 3519 btrfs_ino(inode), cur_offset, 0,
3536 0, hole_size, 0, hole_size, 3520 0, hole_size, 0, hole_size,
3537 0, 0, 0); 3521 0, 0, 0);
3538 if (err) 3522 if (err) {
3523 btrfs_end_transaction(trans, root);
3539 break; 3524 break;
3525 }
3540 3526
3541 btrfs_drop_extent_cache(inode, hole_start, 3527 btrfs_drop_extent_cache(inode, hole_start,
3542 last_byte - 1, 0); 3528 last_byte - 1, 0);
@@ -3634,7 +3620,7 @@ void btrfs_evict_inode(struct inode *inode)
3634 3620
3635 truncate_inode_pages(&inode->i_data, 0); 3621 truncate_inode_pages(&inode->i_data, 0);
3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3622 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode))) 3623 btrfs_is_free_space_inode(root, inode)))
3638 goto no_delete; 3624 goto no_delete;
3639 3625
3640 if (is_bad_inode(inode)) { 3626 if (is_bad_inode(inode)) {
@@ -3713,7 +3699,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3713 int ret = 0; 3699 int ret = 0;
3714 3700
3715 path = btrfs_alloc_path(); 3701 path = btrfs_alloc_path();
3716 BUG_ON(!path); 3702 if (!path)
3703 return -ENOMEM;
3717 3704
3718 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3705 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3719 namelen, 0); 3706 namelen, 0);
@@ -3978,10 +3965,16 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3978 BTRFS_I(inode)->root = root; 3965 BTRFS_I(inode)->root = root;
3979 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3966 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3980 btrfs_read_locked_inode(inode); 3967 btrfs_read_locked_inode(inode);
3981 inode_tree_add(inode); 3968 if (!is_bad_inode(inode)) {
3982 unlock_new_inode(inode); 3969 inode_tree_add(inode);
3983 if (new) 3970 unlock_new_inode(inode);
3984 *new = 1; 3971 if (new)
3972 *new = 1;
3973 } else {
3974 unlock_new_inode(inode);
3975 iput(inode);
3976 inode = ERR_PTR(-ESTALE);
3977 }
3985 } 3978 }
3986 3979
3987 return inode; 3980 return inode;
@@ -4016,12 +4009,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4016 struct btrfs_root *sub_root = root; 4009 struct btrfs_root *sub_root = root;
4017 struct btrfs_key location; 4010 struct btrfs_key location;
4018 int index; 4011 int index;
4019 int ret; 4012 int ret = 0;
4020 4013
4021 if (dentry->d_name.len > BTRFS_NAME_LEN) 4014 if (dentry->d_name.len > BTRFS_NAME_LEN)
4022 return ERR_PTR(-ENAMETOOLONG); 4015 return ERR_PTR(-ENAMETOOLONG);
4023 4016
4024 ret = btrfs_inode_by_name(dir, dentry, &location); 4017 if (unlikely(d_need_lookup(dentry))) {
4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4019 kfree(dentry->d_fsdata);
4020 dentry->d_fsdata = NULL;
4021 d_clear_need_lookup(dentry);
4022 } else {
4023 ret = btrfs_inode_by_name(dir, dentry, &location);
4024 }
4025 4025
4026 if (ret < 0) 4026 if (ret < 0)
4027 return ERR_PTR(ret); 4027 return ERR_PTR(ret);
@@ -4076,6 +4076,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
4076 return 0; 4076 return 0;
4077} 4077}
4078 4078
4079static void btrfs_dentry_release(struct dentry *dentry)
4080{
4081 if (dentry->d_fsdata)
4082 kfree(dentry->d_fsdata);
4083}
4084
4079static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4085static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4080 struct nameidata *nd) 4086 struct nameidata *nd)
4081{ 4087{
@@ -4098,6 +4104,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4098 struct btrfs_path *path; 4104 struct btrfs_path *path;
4099 struct list_head ins_list; 4105 struct list_head ins_list;
4100 struct list_head del_list; 4106 struct list_head del_list;
4107 struct qstr q;
4101 int ret; 4108 int ret;
4102 struct extent_buffer *leaf; 4109 struct extent_buffer *leaf;
4103 int slot; 4110 int slot;
@@ -4187,6 +4194,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4187 4194
4188 while (di_cur < di_total) { 4195 while (di_cur < di_total) {
4189 struct btrfs_key location; 4196 struct btrfs_key location;
4197 struct dentry *tmp;
4190 4198
4191 if (verify_dir_item(root, leaf, di)) 4199 if (verify_dir_item(root, leaf, di))
4192 break; 4200 break;
@@ -4207,6 +4215,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4207 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4215 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4208 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4216 btrfs_dir_item_key_to_cpu(leaf, di, &location);
4209 4217
4218 q.name = name_ptr;
4219 q.len = name_len;
4220 q.hash = full_name_hash(q.name, q.len);
4221 tmp = d_lookup(filp->f_dentry, &q);
4222 if (!tmp) {
4223 struct btrfs_key *newkey;
4224
4225 newkey = kzalloc(sizeof(struct btrfs_key),
4226 GFP_NOFS);
4227 if (!newkey)
4228 goto no_dentry;
4229 tmp = d_alloc(filp->f_dentry, &q);
4230 if (!tmp) {
4231 kfree(newkey);
4232 dput(tmp);
4233 goto no_dentry;
4234 }
4235 memcpy(newkey, &location,
4236 sizeof(struct btrfs_key));
4237 tmp->d_fsdata = newkey;
4238 tmp->d_flags |= DCACHE_NEED_LOOKUP;
4239 d_rehash(tmp);
4240 dput(tmp);
4241 } else {
4242 dput(tmp);
4243 }
4244no_dentry:
4210 /* is this a reference to our own snapshot? If so 4245 /* is this a reference to our own snapshot? If so
4211 * skip it 4246 * skip it
4212 */ 4247 */
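
Taken together with the btrfs_lookup_dentry() change earlier in this file, the readdir hunk above pre-seeds the dcache: each entry emitted gets a hashed dentry if one doesn't already exist, the child's btrfs_key is stashed in d_fsdata, and DCACHE_NEED_LOOKUP is set so a later lookup can consume the cached key instead of re-reading the dir item. A rough user-space model of that handoff, with hypothetical stand-in types:

#include <stdlib.h>
#include <string.h>

/* hypothetical stand-ins for struct btrfs_key and a dentry */
struct demo_key {
        unsigned long long objectid;
        unsigned char type;
        unsigned long long offset;
};

struct demo_dentry {
        void *d_fsdata;         /* stashed demo_key, as in tmp->d_fsdata */
        int need_lookup;        /* models DCACHE_NEED_LOOKUP */
};

/* readdir side: remember where the child inode lives */
static int seed_dentry(struct demo_dentry *d, const struct demo_key *loc)
{
        struct demo_key *k = malloc(sizeof(*k));

        if (!k)
                return -1;      /* fall back to a normal lookup later */
        memcpy(k, loc, sizeof(*k));
        d->d_fsdata = k;
        d->need_lookup = 1;
        return 0;
}

/* lookup side: consume the cached key instead of a dir-item search */
static int consume_dentry(struct demo_dentry *d, struct demo_key *loc)
{
        if (!d->need_lookup)
                return 0;       /* caller does btrfs_inode_by_name() */
        memcpy(loc, d->d_fsdata, sizeof(*loc));
        free(d->d_fsdata);
        d->d_fsdata = NULL;
        d->need_lookup = 0;
        return 1;
}
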
@@ -4271,7 +4306,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4271 if (BTRFS_I(inode)->dummy_inode) 4306 if (BTRFS_I(inode)->dummy_inode)
4272 return 0; 4307 return 0;
4273 4308
4274 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) 4309 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
4275 nolock = true; 4310 nolock = true;
4276 4311
4277 if (wbc->sync_mode == WB_SYNC_ALL) { 4312 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4432,7 +4467,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4432 int owner; 4467 int owner;
4433 4468
4434 path = btrfs_alloc_path(); 4469 path = btrfs_alloc_path();
4435 BUG_ON(!path); 4470 if (!path)
4471 return ERR_PTR(-ENOMEM);
4436 4472
4437 inode = new_inode(root->fs_info->sb); 4473 inode = new_inode(root->fs_info->sb);
4438 if (!inode) { 4474 if (!inode) {
@@ -4467,7 +4503,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4467 inode->i_generation = BTRFS_I(inode)->generation; 4503 inode->i_generation = BTRFS_I(inode)->generation;
4468 btrfs_set_inode_space_info(root, inode); 4504 btrfs_set_inode_space_info(root, inode);
4469 4505
4470 if (mode & S_IFDIR) 4506 if (S_ISDIR(mode))
4471 owner = 0; 4507 owner = 0;
4472 else 4508 else
4473 owner = 1; 4509 owner = 1;
@@ -4512,7 +4548,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4512 4548
4513 btrfs_inherit_iflags(inode, dir); 4549 btrfs_inherit_iflags(inode, dir);
4514 4550
4515 if ((mode & S_IFREG)) { 4551 if (S_ISREG(mode)) {
4516 if (btrfs_test_opt(root, NODATASUM)) 4552 if (btrfs_test_opt(root, NODATASUM))
4517 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4553 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4518 if (btrfs_test_opt(root, NODATACOW) || 4554 if (btrfs_test_opt(root, NODATACOW) ||
@@ -5787,7 +5823,7 @@ again:
5787 5823
5788 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5824 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5789 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5825 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5790 if (!ret) 5826 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5791 btrfs_update_inode(trans, root, inode); 5827 btrfs_update_inode(trans, root, inode);
5792 ret = 0; 5828 ret = 0;
5793out_unlock: 5829out_unlock:
@@ -6692,19 +6728,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6692 return 0; 6728 return 0;
6693} 6729}
6694 6730
6695/* helper function for file defrag and space balancing. This
6696 * forces readahead on a given range of bytes in an inode
6697 */
6698unsigned long btrfs_force_ra(struct address_space *mapping,
6699 struct file_ra_state *ra, struct file *file,
6700 pgoff_t offset, pgoff_t last_index)
6701{
6702 pgoff_t req_size = last_index - offset + 1;
6703
6704 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
6705 return offset + req_size;
6706}
6707
6708struct inode *btrfs_alloc_inode(struct super_block *sb) 6731struct inode *btrfs_alloc_inode(struct super_block *sb)
6709{ 6732{
6710 struct btrfs_inode *ei; 6733 struct btrfs_inode *ei;
@@ -6728,8 +6751,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6728 ei->index_cnt = (u64)-1; 6751 ei->index_cnt = (u64)-1;
6729 ei->last_unlink_trans = 0; 6752 ei->last_unlink_trans = 0;
6730 6753
6731 atomic_set(&ei->outstanding_extents, 0); 6754 spin_lock_init(&ei->lock);
6732 atomic_set(&ei->reserved_extents, 0); 6755 ei->outstanding_extents = 0;
6756 ei->reserved_extents = 0;
6733 6757
6734 ei->ordered_data_close = 0; 6758 ei->ordered_data_close = 0;
6735 ei->orphan_meta_reserved = 0; 6759 ei->orphan_meta_reserved = 0;
@@ -6767,8 +6791,8 @@ void btrfs_destroy_inode(struct inode *inode)
6767 6791
6768 WARN_ON(!list_empty(&inode->i_dentry)); 6792 WARN_ON(!list_empty(&inode->i_dentry));
6769 WARN_ON(inode->i_data.nrpages); 6793 WARN_ON(inode->i_data.nrpages);
6770 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6794 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6771 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); 6795 WARN_ON(BTRFS_I(inode)->reserved_extents);
6772 6796
6773 /* 6797 /*
6774 * This can happen where we create an inode, but somebody else also 6798 * This can happen where we create an inode, but somebody else also
@@ -6823,7 +6847,7 @@ int btrfs_drop_inode(struct inode *inode)
6823 struct btrfs_root *root = BTRFS_I(inode)->root; 6847 struct btrfs_root *root = BTRFS_I(inode)->root;
6824 6848
6825 if (btrfs_root_refs(&root->root_item) == 0 && 6849 if (btrfs_root_refs(&root->root_item) == 0 &&
6826 !is_free_space_inode(root, inode)) 6850 !btrfs_is_free_space_inode(root, inode))
6827 return 1; 6851 return 1;
6828 else 6852 else
6829 return generic_drop_inode(inode); 6853 return generic_drop_inode(inode);
@@ -7186,7 +7210,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7186 goto out_unlock; 7210 goto out_unlock;
7187 7211
7188 path = btrfs_alloc_path(); 7212 path = btrfs_alloc_path();
7189 BUG_ON(!path); 7213 if (!path) {
7214 err = -ENOMEM;
7215 drop_inode = 1;
7216 goto out_unlock;
7217 }
7190 key.objectid = btrfs_ino(inode); 7218 key.objectid = btrfs_ino(inode);
7191 key.offset = 0; 7219 key.offset = 0;
7192 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7220 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
@@ -7326,11 +7354,15 @@ static int btrfs_set_page_dirty(struct page *page)
7326static int btrfs_permission(struct inode *inode, int mask) 7354static int btrfs_permission(struct inode *inode, int mask)
7327{ 7355{
7328 struct btrfs_root *root = BTRFS_I(inode)->root; 7356 struct btrfs_root *root = BTRFS_I(inode)->root;
7357 umode_t mode = inode->i_mode;
7329 7358
7330 if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) 7359 if (mask & MAY_WRITE &&
7331 return -EROFS; 7360 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7332 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7361 if (btrfs_root_readonly(root))
7333 return -EACCES; 7362 return -EROFS;
7363 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7364 return -EACCES;
7365 }
7334 return generic_permission(inode, mask); 7366 return generic_permission(inode, mask);
7335} 7367}
7336 7368
@@ -7452,4 +7484,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7452 7484
7453const struct dentry_operations btrfs_dentry_operations = { 7485const struct dentry_operations btrfs_dentry_operations = {
7454 .d_delete = btrfs_dentry_delete, 7486 .d_delete = btrfs_dentry_delete,
7487 .d_release = btrfs_dentry_release,
7455}; 7488};
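
A recurring change in this file (and in the ioctl.c defrag hunk below): outstanding_extents and reserved_extents stop being atomic_t and become plain counters guarded by the new per-inode spinlock initialized in btrfs_alloc_inode(). The likely motivation, not spelled out in the diff itself, is that several reservation fields can then be read and updated together in one critical section instead of via independent atomics. A small user-space model of the pattern, with a pthread spinlock standing in for BTRFS_I(inode)->lock:

#include <pthread.h>
#include <stdint.h>

/* models struct btrfs_inode after this change */
struct model_inode {
        pthread_spinlock_t lock;        /* stands in for BTRFS_I(inode)->lock */
        uint64_t outstanding_extents;
        uint64_t reserved_extents;
};

static void model_inode_init(struct model_inode *mi)
{
        pthread_spin_init(&mi->lock, PTHREAD_PROCESS_PRIVATE);
        mi->outstanding_extents = 0;
        mi->reserved_extents = 0;
}

/* what the split/merge extent hooks now do */
static void on_split(struct model_inode *mi)
{
        pthread_spin_lock(&mi->lock);
        mi->outstanding_extents++;
        pthread_spin_unlock(&mi->lock);
}

static void on_merge(struct model_inode *mi)
{
        pthread_spin_lock(&mi->lock);
        mi->outstanding_extents--;
        pthread_spin_unlock(&mi->lock);
}
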
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 622543309eb2..3351b1b24574 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -859,8 +859,8 @@ again:
859 /* step one, lock all the pages */ 859 /* step one, lock all the pages */
860 for (i = 0; i < num_pages; i++) { 860 for (i = 0; i < num_pages; i++) {
861 struct page *page; 861 struct page *page;
862 page = grab_cache_page(inode->i_mapping, 862 page = find_or_create_page(inode->i_mapping,
863 start_index + i); 863 start_index + i, GFP_NOFS);
864 if (!page) 864 if (!page)
865 break; 865 break;
866 866
@@ -930,7 +930,9 @@ again:
930 GFP_NOFS); 930 GFP_NOFS);
931 931
932 if (i_done != num_pages) { 932 if (i_done != num_pages) {
933 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 933 spin_lock(&BTRFS_I(inode)->lock);
934 BTRFS_I(inode)->outstanding_extents++;
935 spin_unlock(&BTRFS_I(inode)->lock);
934 btrfs_delalloc_release_space(inode, 936 btrfs_delalloc_release_space(inode,
935 (num_pages - i_done) << PAGE_CACHE_SHIFT); 937 (num_pages - i_done) << PAGE_CACHE_SHIFT);
936 } 938 }
@@ -1747,11 +1749,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1747 key.objectid = key.offset; 1749 key.objectid = key.offset;
1748 key.offset = (u64)-1; 1750 key.offset = (u64)-1;
1749 dirid = key.objectid; 1751 dirid = key.objectid;
1750
1751 } 1752 }
1752 if (ptr < name) 1753 if (ptr < name)
1753 goto out; 1754 goto out;
1754 memcpy(name, ptr, total_len); 1755 memmove(name, ptr, total_len);
1755 name[total_len]='\0'; 1756 name[total_len]='\0';
1756 ret = 0; 1757 ret = 0;
1757out: 1758out:
@@ -2219,6 +2220,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2219 !IS_ALIGNED(destoff, bs)) 2220 !IS_ALIGNED(destoff, bs))
2220 goto out_unlock; 2221 goto out_unlock;
2221 2222
2223 if (destoff > inode->i_size) {
2224 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2225 if (ret)
2226 goto out_unlock;
2227 }
2228
2222 /* do any pending delalloc/csum calc on src, one way or 2229 /* do any pending delalloc/csum calc on src, one way or
2223 another, and lock file content */ 2230 another, and lock file content */
2224 while (1) { 2231 while (1) {
@@ -2235,6 +2242,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2235 btrfs_wait_ordered_range(src, off, len); 2242 btrfs_wait_ordered_range(src, off, len);
2236 } 2243 }
2237 2244
2245 /* truncate page cache pages from target inode range */
2246 truncate_inode_pages_range(&inode->i_data, off,
2247 ALIGN(off + len, PAGE_CACHE_SIZE) - 1);
2248
2238 /* clone data */ 2249 /* clone data */
2239 key.objectid = btrfs_ino(src); 2250 key.objectid = btrfs_ino(src);
2240 key.type = BTRFS_EXTENT_DATA_KEY; 2251 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -2320,14 +2331,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2320 2331
2321 if (type == BTRFS_FILE_EXTENT_REG || 2332 if (type == BTRFS_FILE_EXTENT_REG ||
2322 type == BTRFS_FILE_EXTENT_PREALLOC) { 2333 type == BTRFS_FILE_EXTENT_PREALLOC) {
2334 /*
2335 * a | --- range to clone ---| b
2336 * | ------------- extent ------------- |
2337 */
2338
2339 /* subtract range b */
2340 if (key.offset + datal > off + len)
2341 datal = off + len - key.offset;
2342
2343 /* subtract range a */
2323 if (off > key.offset) { 2344 if (off > key.offset) {
2324 datao += off - key.offset; 2345 datao += off - key.offset;
2325 datal -= off - key.offset; 2346 datal -= off - key.offset;
2326 } 2347 }
2327 2348
2328 if (key.offset + datal > off + len)
2329 datal = off + len - key.offset;
2330
2331 ret = btrfs_drop_extents(trans, inode, 2349 ret = btrfs_drop_extents(trans, inode,
2332 new_key.offset, 2350 new_key.offset,
2333 new_key.offset + datal, 2351 new_key.offset + datal,
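
In the clone hunk just above, the tail clamp ("range b") moves ahead of the head clamp ("range a"), and the order matters: both clamps are written relative to the extent's original key.offset, so once the old code had shrunk datal for the head, the key.offset + datal tail test no longer described the extent. A self-contained sketch of the corrected arithmetic:

#include <stdint.h>

/* clamp an extent [key_offset, key_offset + *datal) to the clone
 * window [off, off + len), tail first as in the hunk */
static void clamp_clone_range(uint64_t off, uint64_t len,
                              uint64_t key_offset,
                              uint64_t *datao, uint64_t *datal)
{
        /* subtract range b: extent runs past the window */
        if (key_offset + *datal > off + len)
                *datal = off + len - key_offset;

        /* subtract range a: window starts inside the extent */
        if (off > key_offset) {
                *datao += off - key_offset;
                *datal -= off - key_offset;
        }
}

For an extent [0, 30) cloned into the window [10, 20), the old order left datal at 20 (the tail test saw 0 + 20 <= 20 and did nothing); clamping the tail first yields datal = 10 as intended.
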
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0f..d77b67c4b275 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
24#include "extent_io.h" 24#include "extent_io.h"
25#include "locking.h" 25#include "locking.h"
26 26
27static inline void spin_nested(struct extent_buffer *eb) 27void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
28{
29 spin_lock(&eb->lock);
30}
31 28
32/* 29/*
33 * Setting a lock to blocking will drop the spinlock and set the 30 * if we currently have a spinning reader or writer lock
34 * flag that forces other procs who want the lock to wait. After 31 * (indicated by the rw flag) this will bump the count
35 * this you can safely schedule with the lock held. 32 * of blocking holders and drop the spinlock.
36 */ 33 */
37void btrfs_set_lock_blocking(struct extent_buffer *eb) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
38{ 35{
39 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 36 if (rw == BTRFS_WRITE_LOCK) {
40 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 37 if (atomic_read(&eb->blocking_writers) == 0) {
41 spin_unlock(&eb->lock); 38 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
39 atomic_dec(&eb->spinning_writers);
40 btrfs_assert_tree_locked(eb);
41 atomic_inc(&eb->blocking_writers);
42 write_unlock(&eb->lock);
43 }
44 } else if (rw == BTRFS_READ_LOCK) {
45 btrfs_assert_tree_read_locked(eb);
46 atomic_inc(&eb->blocking_readers);
47 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
48 atomic_dec(&eb->spinning_readers);
49 read_unlock(&eb->lock);
42 } 50 }
43 /* exit with the spin lock released and the bit set */ 51 return;
44} 52}
45 53
46/* 54/*
47 * clearing the blocking flag will take the spinlock again. 55 * if we currently have a blocking lock, take the spinlock
48 * After this you can't safely schedule 56 * and drop our blocking count
49 */ 57 */
50void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
51{ 59{
52 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
53 spin_nested(eb); 61 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
54 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 62 write_lock(&eb->lock);
55 smp_mb__after_clear_bit(); 63 WARN_ON(atomic_read(&eb->spinning_writers));
64 atomic_inc(&eb->spinning_writers);
65 if (atomic_dec_and_test(&eb->blocking_writers))
66 wake_up(&eb->write_lock_wq);
67 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
68 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
69 read_lock(&eb->lock);
70 atomic_inc(&eb->spinning_readers);
71 if (atomic_dec_and_test(&eb->blocking_readers))
72 wake_up(&eb->read_lock_wq);
56 } 73 }
57 /* exit with the spin lock held */ 74 return;
58} 75}
59 76
60/* 77/*
61 * unfortunately, many of the places that currently set a lock to blocking 78 * take a spinning read lock. This will wait for any blocking
62 * don't end up blocking for very long, and often they don't block 79 * writers
63 * at all. For a dbench 50 run, if we don't spin on the blocking bit
64 * at all, the context switch rate can jump up to 400,000/sec or more.
65 *
66 * So, we're still stuck with this crummy spin on the blocking bit,
67 * at least until the most common causes of the short blocks
68 * can be dealt with.
69 */ 80 */
70static int btrfs_spin_on_block(struct extent_buffer *eb) 81void btrfs_tree_read_lock(struct extent_buffer *eb)
71{ 82{
72 int i; 83again:
73 84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
74 for (i = 0; i < 512; i++) { 85 read_lock(&eb->lock);
75 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 86 if (atomic_read(&eb->blocking_writers)) {
76 return 1; 87 read_unlock(&eb->lock);
77 if (need_resched()) 88 wait_event(eb->write_lock_wq,
78 break; 89 atomic_read(&eb->blocking_writers) == 0);
79 cpu_relax(); 90 goto again;
80 } 91 }
81 return 0; 92 atomic_inc(&eb->read_locks);
93 atomic_inc(&eb->spinning_readers);
82} 94}
83 95
84/* 96/*
85 * This is somewhat different from trylock. It will take the 97 * returns 1 if we get the read lock and 0 if we don't
86 * spinlock but if it finds the lock is set to blocking, it will 98 * this won't wait for blocking writers
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */ 99 */
94int btrfs_try_spin_lock(struct extent_buffer *eb) 100int btrfs_try_tree_read_lock(struct extent_buffer *eb)
95{ 101{
96 int i; 102 if (atomic_read(&eb->blocking_writers))
103 return 0;
97 104
98 if (btrfs_spin_on_block(eb)) { 105 read_lock(&eb->lock);
99 spin_nested(eb); 106 if (atomic_read(&eb->blocking_writers)) {
100 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 107 read_unlock(&eb->lock);
101 return 1; 108 return 0;
102 spin_unlock(&eb->lock);
103 } 109 }
104 /* spin for a bit on the BLOCKING flag */ 110 atomic_inc(&eb->read_locks);
105 for (i = 0; i < 2; i++) { 111 atomic_inc(&eb->spinning_readers);
106 cpu_relax(); 112 return 1;
107 if (!btrfs_spin_on_block(eb))
108 break;
109
110 spin_nested(eb);
111 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 return 1;
113 spin_unlock(&eb->lock);
114 }
115 return 0;
116} 113}
117 114
118/* 115/*
119 * the autoremove wake function will return 0 if it tried to wake up 116 * returns 1 if we get the read lock and 0 if we don't
120 * a process that was already awake, which means that process won't 117 * this won't wait for blocking writers or readers
121 * count as an exclusive wakeup. The waitq code will continue waking
122 * procs until it finds one that was actually sleeping.
123 *
124 * For btrfs, this isn't quite what we want. We want a single proc
125 * to be notified that the lock is ready for taking. If that proc
126 * already happens to be awake, great, it will loop around and try for
127 * the lock.
128 *
129 * So, btrfs_wake_function always returns 1, even when the proc that we
130 * tried to wake up was already awake.
131 */ 118 */
132static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 119int btrfs_try_tree_write_lock(struct extent_buffer *eb)
133 int sync, void *key)
134{ 120{
135 autoremove_wake_function(wait, mode, sync, key); 121 if (atomic_read(&eb->blocking_writers) ||
122 atomic_read(&eb->blocking_readers))
123 return 0;
124 write_lock(&eb->lock);
125 if (atomic_read(&eb->blocking_writers) ||
126 atomic_read(&eb->blocking_readers)) {
127 write_unlock(&eb->lock);
128 return 0;
129 }
130 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers);
136 return 1; 132 return 1;
137} 133}
138 134
139/* 135/*
140 * returns with the extent buffer spinlocked. 136 * drop a spinning read lock
141 * 137 */
142 * This will spin and/or wait as required to take the lock, and then 138void btrfs_tree_read_unlock(struct extent_buffer *eb)
143 * return with the spinlock held. 139{
144 * 140 btrfs_assert_tree_read_locked(eb);
145 * After this call, scheduling is not safe without first calling 141 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
146 * btrfs_set_lock_blocking() 142 atomic_dec(&eb->spinning_readers);
143 atomic_dec(&eb->read_locks);
144 read_unlock(&eb->lock);
145}
146
147/*
148 * drop a blocking read lock
149 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{
152 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers))
155 wake_up(&eb->read_lock_wq);
156 atomic_dec(&eb->read_locks);
157}
158
159/*
160 * take a spinning write lock. This will wait for both
161 * blocking readers or writers
147 */ 162 */
148int btrfs_tree_lock(struct extent_buffer *eb) 163int btrfs_tree_lock(struct extent_buffer *eb)
149{ 164{
150 DEFINE_WAIT(wait); 165again:
151 wait.func = btrfs_wake_function; 166 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
152 167 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
153 if (!btrfs_spin_on_block(eb)) 168 write_lock(&eb->lock);
154 goto sleep; 169 if (atomic_read(&eb->blocking_readers)) {
155 170 write_unlock(&eb->lock);
156 while(1) { 171 wait_event(eb->read_lock_wq,
157 spin_nested(eb); 172 atomic_read(&eb->blocking_readers) == 0);
158 173 goto again;
159 /* nobody is blocking, exit with the spinlock held */
160 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161 return 0;
162
163 /*
164 * we have the spinlock, but the real owner is blocking.
165 * wait for them
166 */
167 spin_unlock(&eb->lock);
168
169 /*
170 * spin for a bit, and if the blocking flag goes away,
171 * loop around
172 */
173 cpu_relax();
174 if (btrfs_spin_on_block(eb))
175 continue;
176sleep:
177 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178 TASK_UNINTERRUPTIBLE);
179
180 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181 schedule();
182
183 finish_wait(&eb->lock_wq, &wait);
184 } 174 }
175 if (atomic_read(&eb->blocking_writers)) {
176 write_unlock(&eb->lock);
177 wait_event(eb->write_lock_wq,
178 atomic_read(&eb->blocking_writers) == 0);
179 goto again;
180 }
181 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks);
185 return 0; 184 return 0;
186} 185}
187 186
187/*
188 * drop a spinning or a blocking write lock.
189 */
188int btrfs_tree_unlock(struct extent_buffer *eb) 190int btrfs_tree_unlock(struct extent_buffer *eb)
189{ 191{
190 /* 192 int blockers = atomic_read(&eb->blocking_writers);
191 * if we were a blocking owner, we don't have the spinlock held 193
192 * just clear the bit and look for waiters 194 BUG_ON(blockers > 1);
193 */ 195
194 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 196 btrfs_assert_tree_locked(eb);
195 smp_mb__after_clear_bit(); 197 atomic_dec(&eb->write_locks);
196 else 198
197 spin_unlock(&eb->lock); 199 if (blockers) {
198 200 WARN_ON(atomic_read(&eb->spinning_writers));
199 if (waitqueue_active(&eb->lock_wq)) 201 atomic_dec(&eb->blocking_writers);
200 wake_up(&eb->lock_wq); 202 smp_wmb();
203 wake_up(&eb->write_lock_wq);
204 } else {
205 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
206 atomic_dec(&eb->spinning_writers);
207 write_unlock(&eb->lock);
208 }
201 return 0; 209 return 0;
202} 210}
203 211
204void btrfs_assert_tree_locked(struct extent_buffer *eb) 212void btrfs_assert_tree_locked(struct extent_buffer *eb)
205{ 213{
206 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 214 BUG_ON(!atomic_read(&eb->write_locks));
207 assert_spin_locked(&eb->lock); 215}
216
217void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
218{
219 BUG_ON(!atomic_read(&eb->read_locks));
208} 220}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f1..17247ddb81a0 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
19#ifndef __BTRFS_LOCKING_ 19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_ 20#define __BTRFS_LOCKING_
21 21
22#define BTRFS_WRITE_LOCK 1
23#define BTRFS_READ_LOCK 2
24#define BTRFS_WRITE_LOCK_BLOCKING 3
25#define BTRFS_READ_LOCK_BLOCKING 4
26
22int btrfs_tree_lock(struct extent_buffer *eb); 27int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 28int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_try_spin_lock(struct extent_buffer *eb); 29int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30
26void btrfs_set_lock_blocking(struct extent_buffer *eb); 31void btrfs_tree_read_lock(struct extent_buffer *eb);
27void btrfs_clear_lock_blocking(struct extent_buffer *eb); 32void btrfs_tree_read_unlock(struct extent_buffer *eb);
33void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28void btrfs_assert_tree_locked(struct extent_buffer *eb); 36void btrfs_assert_tree_locked(struct extent_buffer *eb);
37int btrfs_try_tree_read_lock(struct extent_buffer *eb);
38int btrfs_try_tree_write_lock(struct extent_buffer *eb);
39
40static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
41{
42 if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
43 btrfs_tree_unlock(eb);
44 else if (rw == BTRFS_READ_LOCK_BLOCKING)
45 btrfs_tree_read_unlock_blocking(eb);
46 else if (rw == BTRFS_READ_LOCK)
47 btrfs_tree_read_unlock(eb);
48 else
49 BUG();
50}
51
52static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
53{
54 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
55}
56
57static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
58{
59 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
60}
29#endif 61#endif
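
btrfs_tree_unlock_rw() exists because a caller may end up holding any of the four lock flavours and only knows which one at runtime; it records the flavour when locking and hands the same value back when unlocking. A hypothetical illustration:

/* hypothetical: prefer a write lock, fall back to read, and let the
 * recorded flavour drive the unlock */
static void touch_eb(struct extent_buffer *eb)
{
        int rw;

        if (btrfs_try_tree_write_lock(eb)) {
                rw = BTRFS_WRITE_LOCK;
        } else {
                btrfs_tree_read_lock(eb);
                rw = BTRFS_READ_LOCK;
        }

        /* ... examine or modify eb according to rw ... */

        btrfs_tree_unlock_rw(eb, rw);
}
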
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index 82d569cb6267..000000000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/sort.h>
22#include "ctree.h"
23#include "ref-cache.h"
24#include "transaction.h"
25
26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
27 struct rb_node *node)
28{
29 struct rb_node **p = &root->rb_node;
30 struct rb_node *parent = NULL;
31 struct btrfs_leaf_ref *entry;
32
33 while (*p) {
34 parent = *p;
35 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
36
37 if (bytenr < entry->bytenr)
38 p = &(*p)->rb_left;
39 else if (bytenr > entry->bytenr)
40 p = &(*p)->rb_right;
41 else
42 return parent;
43 }
44
45 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
46 rb_link_node(node, parent, p);
47 rb_insert_color(node, root);
48 return NULL;
49}
50
51static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
52{
53 struct rb_node *n = root->rb_node;
54 struct btrfs_leaf_ref *entry;
55
56 while (n) {
57 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
58 WARN_ON(!entry->in_tree);
59
60 if (bytenr < entry->bytenr)
61 n = n->rb_left;
62 else if (bytenr > entry->bytenr)
63 n = n->rb_right;
64 else
65 return n;
66 }
67 return NULL;
68}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index 24f7001f6387..000000000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5e0a3dc79a45..59bb1764273d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
2955 page_cache_sync_readahead(inode->i_mapping, 2955 page_cache_sync_readahead(inode->i_mapping,
2956 ra, NULL, index, 2956 ra, NULL, index,
2957 last_index + 1 - index); 2957 last_index + 1 - index);
2958 page = grab_cache_page(inode->i_mapping, index); 2958 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS);
2959 if (!page) { 2960 if (!page) {
2960 btrfs_delalloc_release_metadata(inode, 2961 btrfs_delalloc_release_metadata(inode,
2961 PAGE_CACHE_SIZE); 2962 PAGE_CACHE_SIZE);
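
Same substitution as in inode.c and ioctl.c above: grab_cache_page() is just find_or_create_page(mapping, index, mapping_gfp_mask(mapping)), and a mapping's gfp mask can include __GFP_FS, letting reclaim re-enter the filesystem from a path that already holds reservations. Spelling the call out pins the mask to GFP_NOFS. The shape of the change, as a hypothetical wrapper (kernel context only):

static struct page *reloc_grab_page(struct inode *inode, pgoff_t index)
{
        /* was: grab_cache_page(inode->i_mapping, index), whose gfp mask
         * comes from the mapping and may allow fs re-entry */
        return find_or_create_page(inode->i_mapping, index, GFP_NOFS);
}
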
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ebe45443de06..f4099904565a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,13 +71,12 @@ out:
71 return ret; 71 return ret;
72} 72}
73 73
74int btrfs_set_root_node(struct btrfs_root_item *item, 74void btrfs_set_root_node(struct btrfs_root_item *item,
75 struct extent_buffer *node) 75 struct extent_buffer *node)
76{ 76{
77 btrfs_set_root_bytenr(item, node->start); 77 btrfs_set_root_bytenr(item, node->start);
78 btrfs_set_root_level(item, btrfs_header_level(node)); 78 btrfs_set_root_level(item, btrfs_header_level(node));
79 btrfs_set_root_generation(item, btrfs_header_generation(node)); 79 btrfs_set_root_generation(item, btrfs_header_generation(node));
80 return 0;
81} 80}
82 81
83/* 82/*
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7ecaf1e79..bc1f6ad18442 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
50 unsigned long part_offset = (unsigned long)s; \ 50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \ 51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \ 52 type *p; \
53 /* ugly, but we want the fast path here */ \ 53 int err; \
54 if (eb->map_token && offset >= eb->map_start && \ 54 char *kaddr; \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 55 unsigned long map_start; \
56 eb->map_len) { \ 56 unsigned long map_len; \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 57 u##bits res; \
58 return le##bits##_to_cpu(p->member); \ 58 err = map_private_extent_buffer(eb, offset, \
59 } \ 59 sizeof(((type *)0)->member), \
60 { \ 60 &kaddr, &map_start, &map_len); \
61 int err; \ 61 if (err) { \
62 char *map_token; \ 62 __le##bits leres; \
63 char *kaddr; \ 63 read_eb_member(eb, s, type, member, &leres); \
64 int unmap_on_exit = (eb->map_token == NULL); \ 64 return le##bits##_to_cpu(leres); \
65 unsigned long map_start; \ 65 } \
66 unsigned long map_len; \ 66 p = (type *)(kaddr + part_offset - map_start); \
67 u##bits res; \ 67 res = le##bits##_to_cpu(p->member); \
68 err = map_extent_buffer(eb, offset, \ 68 return res; \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \ 69} \
84void btrfs_set_##name(struct extent_buffer *eb, \ 70void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \ 71 type *s, u##bits val) \
@@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \
 	unsigned long part_offset = (unsigned long)s;			\
 	unsigned long offset = part_offset + offsetof(type, member);	\
 	type *p;							\
-	/* ugly, but we want the fast path here */			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
-		p->member = cpu_to_le##bits(val);			\
-		return;							\
-	}								\
-	{								\
-		int err;						\
-		char *map_token;					\
-		char *kaddr;						\
-		int unmap_on_exit = (eb->map_token == NULL);		\
-		unsigned long map_start;				\
-		unsigned long map_len;					\
-		err = map_extent_buffer(eb, offset,			\
-				sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-		if (err) {						\
-			__le##bits val2;				\
-			val2 = cpu_to_le##bits(val);			\
-			write_eb_member(eb, s, type, member, &val2);	\
-			return;						\
-		}							\
-		p = (type *)(kaddr + part_offset - map_start);		\
-		p->member = cpu_to_le##bits(val);			\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-	}								\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	err = map_private_extent_buffer(eb, offset,			\
+			sizeof(((type *)0)->member),			\
+			&kaddr, &map_start, &map_len);			\
+	if (err) {							\
+		__le##bits val2;					\
+		val2 = cpu_to_le##bits(val);				\
+		write_eb_member(eb, s, type, member, &val2);		\
+		return;							\
+	}								\
+	p = (type *)(kaddr + part_offset - map_start);			\
+	p->member = cpu_to_le##bits(val);				\
 }
 
 #include "ctree.h"
@@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,
 		    struct btrfs_disk_key *disk_key, int nr)
 {
 	unsigned long ptr = btrfs_node_key_ptr_offset(nr);
-	if (eb->map_token && ptr >= eb->map_start &&
-	    ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
-		memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
-		       sizeof(*disk_key));
-		return;
-	} else if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, KM_USER1);
-		eb->map_token = NULL;
-	}
 	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
 		       struct btrfs_key_ptr, key, disk_key);
 }
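
To see what the simplified accessor macro now generates, here is a
hand-expanded getter for a hypothetical u64 member; btrfs_foo_bar and
struct btrfs_foo are stand-ins, not real ctree.h names -- a sketch only:

	u64 btrfs_foo_bar(struct extent_buffer *eb, struct btrfs_foo *s)
	{
		unsigned long part_offset = (unsigned long)s;
		unsigned long offset = part_offset +
				       offsetof(struct btrfs_foo, bar);
		struct btrfs_foo *p;
		char *kaddr;
		unsigned long map_start;
		unsigned long map_len;
		int err;

		/* one mapping attempt; no fast-path token or KM_USER1 unmap */
		err = map_private_extent_buffer(eb, offset, sizeof(p->bar),
						&kaddr, &map_start, &map_len);
		if (err) {
			/* member straddles a page: fall back to a byte copy */
			__le64 leres;

			read_eb_member(eb, s, struct btrfs_foo, bar, &leres);
			return le64_to_cpu(leres);
		}
		p = (struct btrfs_foo *)(kaddr + part_offset - map_start);
		return le64_to_cpu(p->bar);
	}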
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec86757f..e24b7964a155 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root)
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans = root->fs_info->running_transaction;
 	if (cur_trans && cur_trans->blocked) {
-		DEFINE_WAIT(wait);
 		atomic_inc(&cur_trans->use_count);
 		spin_unlock(&root->fs_info->trans_lock);
-		while (1) {
-			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			if (!cur_trans->blocked)
-				break;
-			schedule();
-		}
-		finish_wait(&root->fs_info->transaction_wait, &wait);
+
+		wait_event(root->fs_info->transaction_wait,
+			   !cur_trans->blocked);
 		put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
@@ -260,7 +254,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
+	u64 num_bytes = 0;
 	int ret;
 
 	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +268,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		h->block_rsv = NULL;
 		goto got_it;
 	}
+
+	/*
+	 * Do the reservation before we join the transaction so we can do all
+	 * the appropriate flushing if need be.
+	 */
+	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		ret = btrfs_block_rsv_add(NULL, root,
+					  &root->fs_info->trans_block_rsv,
+					  num_bytes);
+		if (ret)
+			return ERR_PTR(ret);
+	}
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	if (!h)
@@ -310,24 +317,9 @@ again:
 		goto again;
 	}
 
-	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items);
-		if (ret == -EAGAIN && !retries) {
-			retries++;
-			btrfs_commit_transaction(h, root);
-			goto again;
-		} else if (ret == -EAGAIN) {
-			/*
-			 * We have already retried and got EAGAIN, so really we
-			 * don't have space, so set ret to -ENOSPC.
-			 */
-			ret = -ENOSPC;
-		}
-
-		if (ret < 0) {
-			btrfs_end_transaction(h, root);
-			return ERR_PTR(ret);
-		}
+	if (num_bytes) {
+		h->block_rsv = &root->fs_info->trans_block_rsv;
+		h->bytes_reserved = num_bytes;
 	}
 
 got_it:
@@ -359,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 }
 
 /* wait for a transaction commit to be fully complete */
-static noinline int wait_for_commit(struct btrfs_root *root,
+static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
-	DEFINE_WAIT(wait);
-	while (!commit->commit_done) {
-		prepare_to_wait(&commit->commit_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (commit->commit_done)
-			break;
-		schedule();
-	}
-	finish_wait(&commit->commit_wait, &wait);
-	return 0;
+	wait_event(commit->commit_wait, commit->commit_done);
 }
 
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -499,10 +482,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
-		if (throttle)
+		if (throttle) {
+			/*
+			 * We may race with somebody else here so end up having
+			 * to call end_transaction on ourselves again, so inc
+			 * our use_count.
+			 */
+			trans->use_count++;
 			return btrfs_commit_transaction(trans, root);
-		else
+		} else {
 			wake_up_process(info->transaction_kthread);
+		}
 	}
 
 	WARN_ON(cur_trans != info->running_transaction);
@@ -894,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
 	struct btrfs_root *parent_root;
+	struct btrfs_block_rsv *rsv;
 	struct inode *parent_inode;
 	struct dentry *parent;
 	struct dentry *dentry;
@@ -905,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 objectid;
 	u64 root_flags;
 
+	rsv = trans->block_rsv;
+
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		pending->error = -ENOMEM;
@@ -1012,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_orphan_post_snapshot(trans, pending);
 fail:
 	kfree(new_root_item);
+	trans->block_rsv = rsv;
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return 0;
 }
@@ -1080,22 +1074,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 static void wait_current_trans_commit_start(struct btrfs_root *root,
 					    struct btrfs_transaction *trans)
 {
-	DEFINE_WAIT(wait);
-
-	if (trans->in_commit)
-		return;
-
-	while (1) {
-		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (trans->in_commit) {
-			finish_wait(&root->fs_info->transaction_blocked_wait,
-				    &wait);
-			break;
-		}
-		schedule();
-		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
-	}
+	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
 }
 
 /*
@@ -1105,24 +1084,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
 static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 					 struct btrfs_transaction *trans)
 {
-	DEFINE_WAIT(wait);
-
-	if (trans->commit_done || (trans->in_commit && !trans->blocked))
-		return;
-
-	while (1) {
-		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (trans->commit_done ||
-		    (trans->in_commit && !trans->blocked)) {
-			finish_wait(&root->fs_info->transaction_wait,
-				    &wait);
-			break;
-		}
-		schedule();
-		finish_wait(&root->fs_info->transaction_wait,
-			    &wait);
-	}
+	wait_event(root->fs_info->transaction_wait,
+		   trans->commit_done || (trans->in_commit && !trans->blocked));
 }
 
 /*
@@ -1229,8 +1192,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		atomic_inc(&cur_trans->use_count);
 		btrfs_end_transaction(trans, root);
 
-		ret = wait_for_commit(root, cur_trans);
-		BUG_ON(ret);
+		wait_for_commit(root, cur_trans);
 
 		put_transaction(cur_trans);
 
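
All four conversions in this file follow one shape: an open-coded
DEFINE_WAIT()/prepare_to_wait()/schedule()/finish_wait() loop becomes a
wait_event() call, which wraps exactly that sequence. In generic form
(wq and condition are placeholders; a sketch):

	/* before: hand-rolled wait loop */
	DEFINE_WAIT(wait);
	while (1) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (condition)
			break;
		schedule();
	}
	finish_wait(&wq, &wait);

	/* after: same semantics in one line */
	wait_event(wq, condition);

wait_event() also tests the condition before sleeping, so the separate
"if (trans->in_commit) return;" style pre-checks could be dropped too.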
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1e..786639fca067 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *eb, int slot,
 			      struct btrfs_key *key)
 {
-	struct inode *dir;
-	int ret;
 	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
+	struct inode *dir;
 	struct inode *inode;
-	char *name;
-	int namelen;
 	unsigned long ref_ptr;
 	unsigned long ref_end;
+	char *name;
+	int namelen;
+	int ret;
 	int search_done = 0;
 
 	/*
@@ -909,6 +910,25 @@ again:
 	}
 	btrfs_release_path(path);
 
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicting name */
+	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
 insert:
 	/* insert our name */
 	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
@@ -1617,7 +1637,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 		return 0;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	nritems = btrfs_header_nritems(eb);
 	for (i = 0; i < nritems; i++) {
@@ -1723,15 +1744,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 			return -ENOMEM;
 
 		if (*level == 1) {
-			wc->process_func(root, next, wc, ptr_gen);
+			ret = wc->process_func(root, next, wc, ptr_gen);
+			if (ret)
+				return ret;
 
 			path->slots[*level]++;
 			if (wc->free) {
 				btrfs_read_buffer(next, ptr_gen);
 
 				btrfs_tree_lock(next);
-				clean_tree_block(trans, root, next);
 				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, root, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1788,16 +1811,19 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 		parent = path->nodes[*level + 1];
 
 		root_owner = btrfs_header_owner(parent);
-		wc->process_func(root, path->nodes[*level], wc,
+		ret = wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
+		if (ret)
+			return ret;
+
 		if (wc->free) {
 			struct extent_buffer *next;
 
 			next = path->nodes[*level];
 
 			btrfs_tree_lock(next);
-			clean_tree_block(trans, root, next);
 			btrfs_set_lock_blocking(next);
+			clean_tree_block(trans, root, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1864,8 +1890,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	next = path->nodes[orig_level];
 
 	btrfs_tree_lock(next);
-	clean_tree_block(trans, log, next);
 	btrfs_set_lock_blocking(next);
+	clean_tree_block(trans, log, next);
 	btrfs_wait_tree_block_writeback(next);
 	btrfs_tree_unlock(next);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc53632..f2a4cc79da61 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	unsigned long limit;
 	unsigned long last_waited = 0;
 	int force_reg = 0;
+	int sync_pending = 0;
 	struct blk_plug plug;
 
 	/*
@@ -229,6 +230,22 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
+		/*
+		 * if we're doing the sync list, record that our
+		 * plug has some sync requests on it
+		 *
+		 * If we're doing the regular list and there are
+		 * sync requests sitting around, unplug before
+		 * we add more
+		 */
+		if (pending_bios == &device->pending_sync_bios) {
+			sync_pending = 1;
+		} else if (sync_pending) {
+			blk_finish_plug(&plug);
+			blk_start_plug(&plug);
+			sync_pending = 0;
+		}
+
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			fs_devices->rw_devices--;
 		}
 
+		if (device->can_discard)
+			fs_devices->num_can_discard--;
+
 		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
 		BUG_ON(!new_device);
 		memcpy(new_device, device, sizeof(*new_device));
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device->bdev = NULL;
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
+		new_device->can_discard = 0;
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
 		call_rcu(&device->rcu, free_device);
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
547static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 568static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
548 fmode_t flags, void *holder) 569 fmode_t flags, void *holder)
549{ 570{
571 struct request_queue *q;
550 struct block_device *bdev; 572 struct block_device *bdev;
551 struct list_head *head = &fs_devices->devices; 573 struct list_head *head = &fs_devices->devices;
552 struct btrfs_device *device; 574 struct btrfs_device *device;
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			seeding = 0;
 		}
 
+		q = bdev_get_queue(bdev);
+		if (blk_queue_discard(q)) {
+			device->can_discard = 1;
+			fs_devices->num_can_discard++;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
 		device->mode = flags;
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
 	max_hole_start = search_start;
 	max_hole_size = 0;
+	hole_size = 0;
 
 	if (search_start >= search_end) {
 		ret = -ENOSPC;
@@ -917,7 +946,14 @@ next:
 		cond_resched();
 	}
 
-	hole_size = search_end- search_start;
+	/*
+	 * At this point, search_start should be the end of
+	 * allocated dev extents, and when shrinking the device,
+	 * search_end may be smaller than search_start.
+	 */
+	if (search_end > search_start)
+		hole_size = search_end - search_start;
+
 	if (hole_size > max_hole_size) {
 		max_hole_start = search_start;
 		max_hole_size = hole_size;
@@ -1037,7 +1073,8 @@ static noinline int find_next_chunk(struct btrfs_root *root,
 	struct btrfs_key found_key;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = objectid;
 	key.offset = (u64)-1;
@@ -1542,6 +1579,7 @@ error:
 
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
+	struct request_queue *q;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
 	struct block_device *bdev;
@@ -1611,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	lock_chunks(root);
 
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -1646,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 	root->fs_info->fs_devices->rw_devices++;
+	if (device->can_discard)
+		root->fs_info->fs_devices->num_can_discard++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
@@ -2061,8 +2104,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 
 	/* step two, relocate all the chunks */
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
+	if (!path) {
+		ret = -ENOMEM;
+		goto error;
+	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -2410,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			total_avail = device->total_bytes - device->bytes_used;
 		else
 			total_avail = 0;
-		/* avail is off by max(alloc_start, 1MB), but that is the same
-		 * for all devices, so it doesn't hurt the sorting later on
-		 */
+
+		/* If there is no space on this device, skip it. */
+		if (total_avail == 0)
+			continue;
 
 		ret = find_free_dev_extent(trans, device,
 					   max_stripe_size * dev_stripes,
@@ -2661,7 +2707,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	ret = find_next_chunk(fs_info->chunk_root,
 			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
 			(fs_info->metadata_alloc_profile &
@@ -3595,7 +3642,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
-	btrfs_set_buffer_lockdep_class(sb, 0);
+	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
 
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61ae7ae..6d866db4e177 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -48,6 +48,7 @@ struct btrfs_device {
 	int writeable;
 	int in_fs_metadata;
 	int missing;
+	int can_discard;
 
 	spinlock_t io_lock;
 
@@ -104,6 +105,7 @@ struct btrfs_fs_devices {
 	u64 rw_devices;
 	u64 missing_devices;
 	u64 total_rw_bytes;
+	u64 num_can_discard;
 	struct block_device *latest_bdev;
 
 	/* all of the devices in the FS, protected by a mutex
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe452ab0..69565e5fc6a0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,48 +102,71 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	/* first lets see if we already have this xattr */
-	di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-				strlen(name), -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
-
-	/* ok we already have this xattr, lets remove it */
-	if (di) {
-		/* if we want create only exit */
-		if (flags & XATTR_CREATE) {
-			ret = -EEXIST;
-			goto out;
-		}
-
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		BUG_ON(ret);
-		btrfs_release_path(path);
-
-		/* if we don't have a value then we are removing the xattr */
-		if (!value)
-			goto out;
-	} else {
-		btrfs_release_path(path);
-
-		if (flags & XATTR_REPLACE) {
-			/* we couldn't find the attr to replace */
-			ret = -ENODATA;
-			goto out;
-		}
-	}
-
-	/* ok we have to create a completely new xattr */
-	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-				      name, name_len, value, size);
-	BUG_ON(ret);
+	if (flags & XATTR_REPLACE) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
+					name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			ret = -ENODATA;
+			goto out;
+		}
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+
+		/*
+		 * remove the attribute
+		 */
+		if (!value)
+			goto out;
+	}
+
+again:
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EEXIST) {
+		if (flags & XATTR_CREATE)
+			goto out;
+		/*
+		 * We can't use the path we already have since we won't have the
+		 * proper locking for a delete, so release the path and
+		 * re-lookup to delete the thing.
+		 */
+		btrfs_release_path(path);
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		} else if (!di) {
+			/* Shouldn't happen but just in case... */
+			btrfs_release_path(path);
+			goto again;
+		}
+
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+
+		/*
+		 * We have a value to set, so go back and try to insert it now.
+		 */
+		if (value) {
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+/*
+ * @value: "" makes the attribute to empty, NULL removes it
+ */
 int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 		     struct inode *inode, const char *name,
 		     const void *value, size_t size, int flags)
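
The rewritten do_setxattr() makes insertion the primary path and resolves
collisions on -EEXIST, which is what yields the standard setxattr(2) flag
errors. A userspace sketch of those semantics (the file path is a
placeholder):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/btrfs/file";	/* placeholder */
		const char *val = "bar";

		/* XATTR_CREATE: fails with EEXIST if "user.foo" exists */
		if (setxattr(path, "user.foo", val, strlen(val),
			     XATTR_CREATE) < 0)
			perror("XATTR_CREATE");

		/* XATTR_REPLACE: fails with ENODATA if it is missing */
		if (setxattr(path, "user.foo", val, strlen(val),
			     XATTR_REPLACE) < 0)
			perror("XATTR_REPLACE");

		/* flags == 0: create or replace, whichever applies */
		if (setxattr(path, "user.foo", val, strlen(val), 0) < 0)
			perror("setxattr");
		return 0;
	}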
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 0dba6915712b..fb962efdacee 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			path = NULL;
 		spin_lock(&req->r_old_dentry->d_lock);
 		seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
+			   ceph_ino(req->r_old_dentry_dir),
 			   req->r_old_dentry->d_name.len,
 			   req->r_old_dentry->d_name.name,
 			   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1065ac779840..382abc9a6a54 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	if (dentry->d_fsdata)
 		return 0;
 
-	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
-	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
-		d_set_d_op(dentry, &ceph_dentry_ops);
-	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
-		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
-	else
-		d_set_d_op(dentry, &ceph_snap_dentry_ops);
-
 	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
@@ -58,16 +50,42 @@ int ceph_init_dentry(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		goto out_unlock;
 	}
+
+	if (dentry->d_parent == NULL ||   /* nfs fh_to_dentry */
+	    ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		d_set_d_op(dentry, &ceph_dentry_ops);
+	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+		d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
+	else
+		d_set_d_op(dentry, &ceph_snap_dentry_ops);
+
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
+	/* avoid reordering d_fsdata setup so that the check above is safe */
+	smp_mb();
+	dentry->d_fsdata = di;
 	ceph_dentry_lru_add(dentry);
 out_unlock:
 	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
+struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
+{
+	struct inode *inode = NULL;
+
+	if (!dentry)
+		return NULL;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_parent) {
+		inode = dentry->d_parent->d_inode;
+		ihold(inode);
+	}
+	spin_unlock(&dentry->d_lock);
+	return inode;
+}
 
 
 /*
@@ -133,7 +151,7 @@ more:
 	     d_unhashed(dentry) ? "!hashed" : "hashed",
 	     parent->d_subdirs.prev, parent->d_subdirs.next);
 	if (p == &parent->d_subdirs) {
-		fi->at_end = 1;
+		fi->flags |= CEPH_F_ATEND;
 		goto out_unlock;
 	}
 	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@@ -234,7 +252,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
-	if (fi->at_end)
+	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
@@ -403,7 +421,7 @@ more:
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
-	fi->at_end = 1;
+	fi->flags |= CEPH_F_ATEND;
 
 	/*
 	 * if dir_release_count still matches the dir, no dentries
@@ -435,7 +453,7 @@ static void reset_readdir(struct ceph_file_info *fi)
 		dput(fi->dentry);
 		fi->dentry = NULL;
 	}
-	fi->at_end = 0;
+	fi->flags &= ~CEPH_F_ATEND;
 }
 
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
@@ -463,7 +481,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 	if (offset != file->f_pos) {
 		file->f_pos = offset;
 		file->f_version = 0;
-		fi->at_end = 0;
+		fi->flags &= ~CEPH_F_ATEND;
 	}
 	retval = offset;
 
@@ -488,21 +506,13 @@ out:
 }
 
 /*
- * Process result of a lookup/open request.
- *
- * Mainly, make sure we return the final req->r_dentry (if it already
- * existed) in place of the original VFS-provided dentry when they
- * differ.
- *
- * Gracefully handle the case where the MDS replies with -ENOENT and
- * no trace (which it may do, at its discretion, e.g., if it doesn't
- * care to issue a lease on the negative dentry).
+ * Handle lookups for the hidden .snap directory.
  */
-struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
-				  struct dentry *dentry, int err)
+int ceph_handle_snapdir(struct ceph_mds_request *req,
+			struct dentry *dentry, int err)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
-	struct inode *parent = dentry->d_parent->d_inode;
+	struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
@@ -516,7 +526,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		d_add(dentry, inode);
 		err = 0;
 	}
+	return err;
+}
 
+/*
+ * Figure out final result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
 	if (err == -ENOENT) {
 		/* no trace? */
 		err = 0;
@@ -610,6 +636,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_locked_dir = dir;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	err = ceph_handle_snapdir(req, dentry, err);
 	dentry = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);  /* will dput(dentry) */
 	dout("lookup result=%p\n", dentry);
@@ -789,6 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	req->r_dentry = dget(dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = dir;
 	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -887,6 +915,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 	req->r_dentry = dget(new_dentry);
 	req->r_num_caps = 2;
 	req->r_old_dentry = dget(old_dentry);
+	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
 	req->r_locked_dir = new_dir;
 	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
 	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -1002,36 +1031,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
  */
 static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
+	int valid = 0;
 	struct inode *dir;
 
 	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	dir = dentry->d_parent->d_inode;
-
 	dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
 	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
 	     ceph_dentry(dentry)->offset);
 
+	dir = ceph_get_dentry_parent_inode(dentry);
+
 	/* always trust cached snapped dentries, snapdir dentry */
 	if (ceph_snap(dir) != CEPH_NOSNAP) {
 		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-		goto out_touch;
+		valid = 1;
+	} else if (dentry->d_inode &&
+		   ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+		valid = 1;
+	} else if (dentry_lease_is_valid(dentry) ||
+		   dir_lease_is_valid(dir, dentry)) {
+		valid = 1;
 	}
-	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
-		goto out_touch;
-
-	if (dentry_lease_is_valid(dentry) ||
-	    dir_lease_is_valid(dir, dentry))
-		goto out_touch;
 
-	dout("d_revalidate %p invalid\n", dentry);
-	d_drop(dentry);
-	return 0;
-out_touch:
-	ceph_dentry_lru_touch(dentry);
-	return 1;
+	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+	if (valid)
+		ceph_dentry_lru_touch(dentry);
+	else
+		d_drop(dentry);
+	iput(dir);
+	return valid;
 }
 
 /*
@@ -1228,9 +1259,8 @@ void ceph_dentry_lru_del(struct dentry *dn)
  * Return name hash for a given dentry.  This is dependent on
  * the parent directory's hash function.
  */
-unsigned ceph_dentry_hash(struct dentry *dn)
+unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 {
-	struct inode *dir = dn->d_parent->d_inode;
 	struct ceph_inode_info *dci = ceph_inode(dir);
 
 	switch (dci->i_dir_layout.dl_dir_hash) {
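
The new ceph_get_dentry_parent_inode() helper returns the parent inode with
an extra reference taken under d_lock (via ihold()), or NULL. Callers in
this series therefore pair it with iput(), which is a no-op on NULL. The
recurring caller pattern, as a sketch:

	struct inode *parent_inode;

	parent_inode = ceph_get_dentry_parent_inode(dentry); /* may be NULL */
	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	iput(parent_inode);	/* drop the reference the helper took */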
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index f67b687550de..9fbcdecaaccd 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	int type;
 	struct ceph_nfs_fh *fh = (void *)rawfh;
 	struct ceph_nfs_confh *cfh = (void *)rawfh;
-	struct dentry *parent = dentry->d_parent;
+	struct dentry *parent;
 	struct inode *inode = dentry->d_inode;
 	int connected_handle_length = sizeof(*cfh)/4;
 	int handle_length = sizeof(*fh)/4;
@@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EINVAL;
 
+	spin_lock(&dentry->d_lock);
+	parent = dget(dentry->d_parent);
+	spin_unlock(&dentry->d_lock);
+
 	if (*max_len >= connected_handle_length) {
 		dout("encode_fh %p connectable\n", dentry);
 		cfh->ino = ceph_ino(dentry->d_inode);
 		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent);
+		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
+							 dentry);
 		*max_len = connected_handle_length;
 		type = 2;
 	} else if (*max_len >= handle_length) {
 		if (connectable) {
 			*max_len = connected_handle_length;
-			return 255;
+			type = 255;
+		} else {
+			dout("encode_fh %p\n", dentry);
+			fh->ino = ceph_ino(dentry->d_inode);
+			*max_len = handle_length;
+			type = 1;
 		}
-		dout("encode_fh %p\n", dentry);
-		fh->ino = ceph_ino(dentry->d_inode);
-		*max_len = handle_length;
-		type = 1;
 	} else {
 		*max_len = handle_length;
-		return 255;
+		type = 255;
 	}
+	dput(parent);
 	return type;
 }
 
@@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 		return dentry;
 	}
 	err = ceph_init_dentry(dentry);
-
 	if (err < 0) {
 		iput(inode);
 		return ERR_PTR(err);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0d0eae05598f..ce549d31eeb7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file)
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode = NULL;
 	int err;
 	int flags, fmode, wanted;
 
@@ -194,7 +194,10 @@ int ceph_open(struct inode *inode, struct file *file)
 	req->r_inode = inode;
 	ihold(inode);
 	req->r_num_caps = 1;
+	if (flags & (O_CREAT|O_TRUNC))
+		parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	iput(parent_inode);
 	if (!err)
 		err = ceph_init_file(inode, file, req->r_fmode);
 	ceph_mdsc_put_request(req);
@@ -222,9 +225,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	struct file *file = nd->intent.open.file;
-	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+	struct file *file;
 	struct ceph_mds_request *req;
+	struct dentry *ret;
 	int err;
 	int flags = nd->intent.open.flags;
 
@@ -242,16 +245,24 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
 		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 	}
 	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
-	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-	dentry = ceph_finish_lookup(req, dentry, err);
-	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+	err = ceph_mdsc_do_request(mdsc,
+				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
+				   req);
+	err = ceph_handle_snapdir(req, dentry, err);
+	if (err)
+		goto out;
+	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
-	if (!err)
-		err = ceph_init_file(req->r_dentry->d_inode, file,
-				     req->r_fmode);
+	if (err)
+		goto out;
+	file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open);
+	if (IS_ERR(file))
+		err = PTR_ERR(file);
+out:
+	ret = ceph_finish_lookup(req, dentry, err);
 	ceph_mdsc_put_request(req);
-	dout("ceph_lookup_open result=%p\n", dentry);
-	return dentry;
+	dout("ceph_lookup_open result=%p\n", ret);
+	return ret;
 }
 
 int ceph_release(struct inode *inode, struct file *file)
@@ -643,7 +654,8 @@ again:
 
 	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC))
 		/* hmm, this isn't really async... */
 		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
 	else
@@ -712,7 +724,7 @@ retry_snap:
 		want = CEPH_CAP_FILE_BUFFER;
 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
 	if (ret < 0)
-		goto out;
+		goto out_put;
 
 	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
@@ -720,12 +732,23 @@ retry_snap:
 
 	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
-	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC)) {
 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
 			&iocb->ki_pos);
 	} else {
-		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+		/*
+		 * buffered write; drop Fw early to avoid slow
+		 * revocation if we get stuck on balance_dirty_pages
+		 */
+		int dirty;
 
+		spin_lock(&inode->i_lock);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&inode->i_lock);
+		ceph_put_cap_refs(ci, got);
+
+		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
@@ -733,7 +756,12 @@ retry_snap:
 			if (err < 0)
 				ret = err;
 		}
+
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+		goto out;
 	}
+
 	if (ret >= 0) {
 		int dirty;
 		spin_lock(&inode->i_lock);
@@ -743,12 +771,13 @@ retry_snap:
 		__mark_inode_dirty(inode, dirty);
 	}
 
-out:
+out_put:
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
 	     ceph_cap_string(got));
 	ceph_put_cap_refs(ci, got);
 
+out:
 	if (ret == -EOLDSNAPC) {
 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
 		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index dfb2831d8d85..095799ba9dd1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode,
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int i;
-	int issued, implemented;
+	int issued = 0, implemented;
+	int updating_inode = 0;
 	struct timespec mtime, atime, ctime;
 	u32 nsplits;
 	struct ceph_buffer *xattr_blob = NULL;
@@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode,
 	if (le64_to_cpu(info->version) > 0 &&
 	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
 		goto no_change;
 
+	updating_inode = 1;
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
 
@@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode,
 		ci->i_rfiles = le64_to_cpu(info->rfiles);
 		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
 		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
-
-		/* set dir completion flag? */
-		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
-		    ceph_snap(inode) == CEPH_NOSNAP &&
-		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
-		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
-		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
-			dout(" marking %p complete (empty)\n", inode);
-			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
-			ci->i_max_offset = 2;
-		}
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -774,6 +765,19 @@ no_change:
 		__ceph_get_fmode(ci, cap_fmode);
 	}
 
+	/* set dir completion flag? */
+	if (S_ISDIR(inode->i_mode) &&
+	    updating_inode &&                 /* didn't jump to no_change */
+	    ci->i_files == 0 && ci->i_subdirs == 0 &&
+	    ceph_snap(inode) == CEPH_NOSNAP &&
+	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+	    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+		dout(" marking %p complete (empty)\n", inode);
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+		ci->i_max_offset = 2;
+	}
+
 	/* update delegation info? */
 	if (dirinfo)
 		ceph_fill_dirfrag(inode, dirinfo);
@@ -805,14 +809,14 @@ static void update_dentry_lease(struct dentry *dentry,
 		return;
 
 	spin_lock(&dentry->d_lock);
-	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
-	     dentry, le16_to_cpu(lease->mask), duration, ttl);
+	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
+	     dentry, duration, ttl);
 
 	/* make lease_rdcache_gen match directory */
 	dir = dentry->d_parent->d_inode;
 	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
 
-	if (lease->mask == 0)
+	if (duration == 0)
 		goto out_unlock;
 
 	if (di->lease_gen == session->s_cap_gen &&
@@ -839,11 +843,13 @@ out_unlock:
 /*
  * Set dentry's directory position based on the current dir's max, and
  * order it in d_subdirs, so that dcache_readdir behaves.
+ *
+ * Always called under directory's i_mutex.
  */
 static void ceph_set_dentry_offset(struct dentry *dn)
 {
 	struct dentry *dir = dn->d_parent;
-	struct inode *inode = dn->d_parent->d_inode;
+	struct inode *inode = dir->d_inode;
 	struct ceph_dentry_info *di;
 
 	BUG_ON(!inode);
@@ -1022,9 +1028,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 
 	/* do we have a dn lease? */
 	have_lease = have_dir_cap ||
-		(le16_to_cpu(rinfo->dlease->mask) &
-		 CEPH_LOCK_DN);
-
+		le32_to_cpu(rinfo->dlease->duration_ms);
 	if (!have_lease)
 		dout("fill_trace no dentry lease or dir cap\n");
 
@@ -1560,7 +1564,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1743,7 +1747,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		req->r_inode_drop = release;
 		req->r_args.setattr.mask = cpu_to_le32(mask);
 		req->r_num_caps = 1;
+		parent_inode = ceph_get_dentry_parent_inode(dentry);
 		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+		iput(parent_inode);
 	}
 	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
 	     ceph_cap_string(dirtied), mask);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index ef0b5f48e13a..3b256b50f7d8 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct inode *parent_inode;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
@@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
87 req->r_args.setlayout.layout.fl_pg_preferred = 87 req->r_args.setlayout.layout.fl_pg_preferred =
88 cpu_to_le32(l.preferred_osd); 88 cpu_to_le32(l.preferred_osd);
89 89
90 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
90 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 91 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
92 iput(parent_inode);
91 ceph_mdsc_put_request(req); 93 ceph_mdsc_put_request(req);
92 return err; 94 return err;
93} 95}
@@ -231,6 +233,14 @@ static long ceph_ioctl_lazyio(struct file *file)
231 return 0; 233 return 0;
232} 234}
233 235
236static long ceph_ioctl_syncio(struct file *file)
237{
238 struct ceph_file_info *fi = file->private_data;
239
240 fi->flags |= CEPH_F_SYNC;
241 return 0;
242}
243
234long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 244long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
235{ 245{
236 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 246 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -249,6 +259,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
249 259
250 case CEPH_IOC_LAZYIO: 260 case CEPH_IOC_LAZYIO:
251 return ceph_ioctl_lazyio(file); 261 return ceph_ioctl_lazyio(file);
262
263 case CEPH_IOC_SYNCIO:
264 return ceph_ioctl_syncio(file);
252 } 265 }
253 266
254 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 52e8fd74d450..0c5167e43180 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc {
40 struct ceph_ioctl_dataloc) 40 struct ceph_ioctl_dataloc)
41 41
42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) 42#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
43#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)
43 44
44#endif 45#endif
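For completeness, a minimal userspace sketch of exercising the new ioctl. The CEPH_IOCTL_MAGIC value is an assumption taken from the (unshown) top of ioctl.h:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define CEPH_IOCTL_MAGIC 0x97	/* assumed; see fs/ceph/ioctl.h */
#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5)

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);
	if (fd < 0)
		return 1;
	/* subsequent I/O on fd is done synchronously (CEPH_F_SYNC) */
	if (ioctl(fd, CEPH_IOC_SYNCIO) < 0)
		perror("CEPH_IOC_SYNCIO");
	close(fd);
	return 0;
}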
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0c1d91756528..86c59e16ba74 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref)
483 destroy_reply_info(&req->r_reply_info); 483 destroy_reply_info(&req->r_reply_info);
484 } 484 }
485 if (req->r_inode) { 485 if (req->r_inode) {
486 ceph_put_cap_refs(ceph_inode(req->r_inode), 486 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
487 CEPH_CAP_PIN);
488 iput(req->r_inode); 487 iput(req->r_inode);
489 } 488 }
490 if (req->r_locked_dir) 489 if (req->r_locked_dir)
491 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), 490 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
492 CEPH_CAP_PIN);
493 if (req->r_target_inode) 491 if (req->r_target_inode)
494 iput(req->r_target_inode); 492 iput(req->r_target_inode);
495 if (req->r_dentry) 493 if (req->r_dentry)
496 dput(req->r_dentry); 494 dput(req->r_dentry);
497 if (req->r_old_dentry) { 495 if (req->r_old_dentry) {
498 ceph_put_cap_refs( 496 /*
499 ceph_inode(req->r_old_dentry->d_parent->d_inode), 497 * track (and drop pins for) r_old_dentry_dir
500 CEPH_CAP_PIN); 498 * separately, since r_old_dentry's d_parent may have
499 * changed between the dir mutex being dropped and
500 * this request being freed.
501 */
502 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
503 CEPH_CAP_PIN);
501 dput(req->r_old_dentry); 504 dput(req->r_old_dentry);
505 iput(req->r_old_dentry_dir);
502 } 506 }
503 kfree(req->r_path1); 507 kfree(req->r_path1);
504 kfree(req->r_path2); 508 kfree(req->r_path2);
@@ -617,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
617 */ 621 */
618struct dentry *get_nonsnap_parent(struct dentry *dentry) 622struct dentry *get_nonsnap_parent(struct dentry *dentry)
619{ 623{
624 /*
625 * we don't need to worry about protecting the d_parent access
 626	 * here because we never rename inside the snapped namespace
627 * except to resplice to another snapdir, and either the old or new
628 * result is a valid result.
629 */
620 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) 630 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
621 dentry = dentry->d_parent; 631 dentry = dentry->d_parent;
622 return dentry; 632 return dentry;
@@ -652,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
652 if (req->r_inode) { 662 if (req->r_inode) {
653 inode = req->r_inode; 663 inode = req->r_inode;
654 } else if (req->r_dentry) { 664 } else if (req->r_dentry) {
655 struct inode *dir = req->r_dentry->d_parent->d_inode; 665 /* ignore race with rename; old or new d_parent is okay */
666 struct dentry *parent = req->r_dentry->d_parent;
667 struct inode *dir = parent->d_inode;
656 668
657 if (dir->i_sb != mdsc->fsc->sb) { 669 if (dir->i_sb != mdsc->fsc->sb) {
658 /* not this fs! */ 670 /* not this fs! */
@@ -660,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
660 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 672 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
661 /* direct snapped/virtual snapdir requests 673 /* direct snapped/virtual snapdir requests
662 * based on parent dir inode */ 674 * based on parent dir inode */
663 struct dentry *dn = 675 struct dentry *dn = get_nonsnap_parent(parent);
664 get_nonsnap_parent(req->r_dentry->d_parent);
665 inode = dn->d_inode; 676 inode = dn->d_inode;
666 dout("__choose_mds using nonsnap parent %p\n", inode); 677 dout("__choose_mds using nonsnap parent %p\n", inode);
667 } else if (req->r_dentry->d_inode) { 678 } else if (req->r_dentry->d_inode) {
@@ -670,7 +681,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
670 } else { 681 } else {
671 /* dir + name */ 682 /* dir + name */
672 inode = dir; 683 inode = dir;
673 hash = ceph_dentry_hash(req->r_dentry); 684 hash = ceph_dentry_hash(dir, req->r_dentry);
674 is_hash = true; 685 is_hash = true;
675 } 686 }
676 } 687 }
@@ -1584,7 +1595,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1584 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); 1595 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1585 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, 1596 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1586 *ppath); 1597 *ppath);
1587 } else if (rpath) { 1598 } else if (rpath || rino) {
1588 *ino = rino; 1599 *ino = rino;
1589 *ppath = rpath; 1600 *ppath = rpath;
1590 *pathlen = strlen(rpath); 1601 *pathlen = strlen(rpath);
@@ -1931,9 +1942,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1931 if (req->r_locked_dir) 1942 if (req->r_locked_dir)
1932 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 1943 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1933 if (req->r_old_dentry) 1944 if (req->r_old_dentry)
1934 ceph_get_cap_refs( 1945 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
1935 ceph_inode(req->r_old_dentry->d_parent->d_inode), 1946 CEPH_CAP_PIN);
1936 CEPH_CAP_PIN);
1937 1947
1938 /* issue */ 1948 /* issue */
1939 mutex_lock(&mdsc->mutex); 1949 mutex_lock(&mdsc->mutex);
@@ -2714,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2714 struct ceph_mds_lease *h = msg->front.iov_base; 2724 struct ceph_mds_lease *h = msg->front.iov_base;
2715 u32 seq; 2725 u32 seq;
2716 struct ceph_vino vino; 2726 struct ceph_vino vino;
2717 int mask;
2718 struct qstr dname; 2727 struct qstr dname;
2719 int release = 0; 2728 int release = 0;
2720 2729
@@ -2725,7 +2734,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2725 goto bad; 2734 goto bad;
2726 vino.ino = le64_to_cpu(h->ino); 2735 vino.ino = le64_to_cpu(h->ino);
2727 vino.snap = CEPH_NOSNAP; 2736 vino.snap = CEPH_NOSNAP;
2728 mask = le16_to_cpu(h->mask);
2729 seq = le32_to_cpu(h->seq); 2737 seq = le32_to_cpu(h->seq);
2730 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2738 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2731 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2739 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
@@ -2737,8 +2745,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2737 2745
2738 /* lookup inode */ 2746 /* lookup inode */
2739 inode = ceph_find_inode(sb, vino); 2747 inode = ceph_find_inode(sb, vino);
2740 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", 2748 dout("handle_lease %s, ino %llx %p %.*s\n",
2741 ceph_lease_op_name(h->action), mask, vino.ino, inode, 2749 ceph_lease_op_name(h->action), vino.ino, inode,
2742 dname.len, dname.name); 2750 dname.len, dname.name);
2743 if (inode == NULL) { 2751 if (inode == NULL) {
2744 dout("handle_lease no inode %llx\n", vino.ino); 2752 dout("handle_lease no inode %llx\n", vino.ino);
@@ -2828,7 +2836,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2828 return; 2836 return;
2829 lease = msg->front.iov_base; 2837 lease = msg->front.iov_base;
2830 lease->action = action; 2838 lease->action = action;
2831 lease->mask = cpu_to_le16(1);
2832 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2839 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2833 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2840 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2834 lease->seq = cpu_to_le32(seq); 2841 lease->seq = cpu_to_le32(seq);
@@ -2850,7 +2857,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2850 * Pass @inode always, @dentry is optional. 2857 * Pass @inode always, @dentry is optional.
2851 */ 2858 */
2852void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, 2859void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2853 struct dentry *dentry, int mask) 2860 struct dentry *dentry)
2854{ 2861{
2855 struct ceph_dentry_info *di; 2862 struct ceph_dentry_info *di;
2856 struct ceph_mds_session *session; 2863 struct ceph_mds_session *session;
@@ -2858,7 +2865,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2858 2865
2859 BUG_ON(inode == NULL); 2866 BUG_ON(inode == NULL);
2860 BUG_ON(dentry == NULL); 2867 BUG_ON(dentry == NULL);
2861 BUG_ON(mask == 0);
2862 2868
2863 /* is dentry lease valid? */ 2869 /* is dentry lease valid? */
2864 spin_lock(&dentry->d_lock); 2870 spin_lock(&dentry->d_lock);
@@ -2868,8 +2874,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2868 di->lease_gen != di->lease_session->s_cap_gen || 2874 di->lease_gen != di->lease_session->s_cap_gen ||
2869 !time_before(jiffies, dentry->d_time)) { 2875 !time_before(jiffies, dentry->d_time)) {
2870 dout("lease_release inode %p dentry %p -- " 2876 dout("lease_release inode %p dentry %p -- "
2871 "no lease on %d\n", 2877 "no lease\n",
2872 inode, dentry, mask); 2878 inode, dentry);
2873 spin_unlock(&dentry->d_lock); 2879 spin_unlock(&dentry->d_lock);
2874 return; 2880 return;
2875 } 2881 }
@@ -2880,8 +2886,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2880 __ceph_mdsc_drop_dentry_lease(dentry); 2886 __ceph_mdsc_drop_dentry_lease(dentry);
2881 spin_unlock(&dentry->d_lock); 2887 spin_unlock(&dentry->d_lock);
2882 2888
2883 dout("lease_release inode %p dentry %p mask %d to mds%d\n", 2889 dout("lease_release inode %p dentry %p to mds%d\n",
2884 inode, dentry, mask, session->s_mds); 2890 inode, dentry, session->s_mds);
2885 ceph_mdsc_lease_send_msg(session, inode, dentry, 2891 ceph_mdsc_lease_send_msg(session, inode, dentry,
2886 CEPH_MDS_LEASE_RELEASE, seq); 2892 CEPH_MDS_LEASE_RELEASE, seq);
2887 ceph_put_mds_session(session); 2893 ceph_put_mds_session(session);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7d8a0d662d56..4bb239921dbd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -171,6 +171,7 @@ struct ceph_mds_request {
171 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
172 struct dentry *r_dentry; /* arg1 */ 172 struct dentry *r_dentry; /* arg1 */
173 struct dentry *r_old_dentry; /* arg2: rename from or link from */ 173 struct dentry *r_old_dentry; /* arg2: rename from or link from */
174 struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */
174 char *r_path1, *r_path2; 175 char *r_path1, *r_path2;
175 struct ceph_vino r_ino1, r_ino2; 176 struct ceph_vino r_ino1, r_ino2;
176 177
@@ -333,7 +334,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
333 334
334extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, 335extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
335 struct inode *inode, 336 struct inode *inode,
336 struct dentry *dn, int mask); 337 struct dentry *dn);
337 338
338extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 339extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
339 340
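The new r_old_dentry_dir field only helps if request setup pins the old parent explicitly. A sketch of what a rename/link call site is expected to do (the actual call sites live in fs/ceph/dir.c, outside this diff; names are illustrative):

	/* at request-build time, under the directory i_mutex */
	req->r_old_dentry = dget(old_dentry);
	req->r_old_dentry_dir = igrab(old_dir);	/* pin the parent inode */

	/* at release time (ceph_mdsc_release_request above), the cap pin
	 * and the inode reference are dropped against the saved dir, so
	 * a rename that changes d_parent in between cannot misdirect
	 * the put */
	ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN);
	dput(req->r_old_dentry);
	iput(req->r_old_dentry_dir);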
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 54b14de2e729..e26437191333 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
449 spin_lock(&inode->i_lock); 449 spin_lock(&inode->i_lock);
450 used = __ceph_caps_used(ci); 450 used = __ceph_caps_used(ci);
451 dirty = __ceph_caps_dirty(ci); 451 dirty = __ceph_caps_dirty(ci);
452
453 /*
454 * If there is a write in progress, treat that as a dirty Fw,
455 * even though it hasn't completed yet; by the time we finish
456 * up this capsnap it will be.
457 */
458 if (used & CEPH_CAP_FILE_WR)
459 dirty |= CEPH_CAP_FILE_WR;
460
452 if (__ceph_have_pending_cap_snap(ci)) { 461 if (__ceph_have_pending_cap_snap(ci)) {
453 /* there is no point in queuing multiple "pending" cap_snaps, 462 /* there is no point in queuing multiple "pending" cap_snaps,
454 as no new writes are allowed to start when pending, so any 463 as no new writes are allowed to start when pending, so any
@@ -456,13 +465,19 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
456 cap_snap. lucky us. */ 465 cap_snap. lucky us. */
457 dout("queue_cap_snap %p already pending\n", inode); 466 dout("queue_cap_snap %p already pending\n", inode);
458 kfree(capsnap); 467 kfree(capsnap);
459 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || 468 } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
460 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| 469 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
461 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
462 struct ceph_snap_context *snapc = ci->i_head_snapc; 470 struct ceph_snap_context *snapc = ci->i_head_snapc;
463 471
464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, 472 /*
465 capsnap, snapc); 473 * if we are a sync write, we may need to go to the snaprealm
474 * to get the current snapc.
475 */
476 if (!snapc)
477 snapc = ci->i_snap_realm->cached_context;
478
479 dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
480 inode, capsnap, snapc, ceph_cap_string(dirty));
466 ihold(inode); 481 ihold(inode);
467 482
468 atomic_set(&capsnap->nref, 1); 483 atomic_set(&capsnap->nref, 1);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f2f77fd3c14c..88bacaf385d9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
73 */ 73 */
74 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 74 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
75 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 75 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
76 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> 76 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
77 (CEPH_BLOCK_SHIFT-10);
78 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 77 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
79 78
80 buf->f_files = le64_to_cpu(st.num_objects); 79 buf->f_files = le64_to_cpu(st.num_objects);
@@ -780,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb,
780 fsc->backing_dev_info.ra_pages = 779 fsc->backing_dev_info.ra_pages =
781 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 780 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
782 >> PAGE_SHIFT; 781 >> PAGE_SHIFT;
782 else
783 fsc->backing_dev_info.ra_pages =
784 default_backing_dev_info.ra_pages;
785
783 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", 786 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
784 atomic_long_inc_return(&bdi_seq)); 787 atomic_long_inc_return(&bdi_seq));
785 if (!err) 788 if (!err)
@@ -810,8 +813,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
810 fsc = create_fs_client(fsopt, opt); 813 fsc = create_fs_client(fsopt, opt);
811 if (IS_ERR(fsc)) { 814 if (IS_ERR(fsc)) {
812 res = ERR_CAST(fsc); 815 res = ERR_CAST(fsc);
813 kfree(fsopt); 816 destroy_mount_options(fsopt);
814 kfree(opt); 817 ceph_destroy_options(opt);
815 goto out_final; 818 goto out_final;
816 } 819 }
817 820
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 30446b144e3d..a23eed526f05 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -543,13 +543,16 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
543/* 543/*
544 * we keep buffered readdir results attached to file->private_data 544 * we keep buffered readdir results attached to file->private_data
545 */ 545 */
546#define CEPH_F_SYNC 1
547#define CEPH_F_ATEND 2
548
546struct ceph_file_info { 549struct ceph_file_info {
547 int fmode; /* initialized on open */ 550 short fmode; /* initialized on open */
551 short flags; /* CEPH_F_* */
548 552
549 /* readdir: position within the dir */ 553 /* readdir: position within the dir */
550 u32 frag; 554 u32 frag;
551 struct ceph_mds_request *last_readdir; 555 struct ceph_mds_request *last_readdir;
552 int at_end;
553 556
554 /* readdir: position within a frag */ 557 /* readdir: position within a frag */
555 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 558 unsigned offset; /* offset of last chunk, adjusted for . and .. */
@@ -789,6 +792,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
789 ceph_snapdir_dentry_ops; 792 ceph_snapdir_dentry_ops;
790 793
791extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 794extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
795extern int ceph_handle_snapdir(struct ceph_mds_request *req,
796 struct dentry *dentry, int err);
792extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 797extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
793 struct dentry *dentry, int err); 798 struct dentry *dentry, int err);
794 799
@@ -796,7 +801,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
796extern void ceph_dentry_lru_touch(struct dentry *dn); 801extern void ceph_dentry_lru_touch(struct dentry *dn);
797extern void ceph_dentry_lru_del(struct dentry *dn); 802extern void ceph_dentry_lru_del(struct dentry *dn);
798extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 803extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
799extern unsigned ceph_dentry_hash(struct dentry *dn); 804extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
805extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
800 806
801/* 807/*
802 * our d_ops vary depending on whether the inode is live, 808 * our d_ops vary depending on whether the inode is live,
@@ -819,14 +825,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
819 int p_locks, int f_locks); 825 int p_locks, int f_locks);
820extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); 826extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
821 827
822static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
823{
824 if (dentry && dentry->d_parent)
825 return dentry->d_parent->d_inode;
826
827 return NULL;
828}
829
830/* debugfs.c */ 828/* debugfs.c */
831extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); 829extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
832extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); 830extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
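With fmode and flags packed into two shorts, and at_end folded into the CEPH_F_ATEND bit, consumers now test bits instead of separate fields. A hedged sketch of the expected usage (the real readdir/read paths are in fs/ceph/dir.c and fs/ceph/file.c, not shown here; do_sync_io is an illustrative name):

	struct ceph_file_info *fi = file->private_data;

	if (fi->flags & CEPH_F_ATEND)
		return 0;		/* readdir already hit the end */

	if (fi->flags & CEPH_F_SYNC)	/* set by CEPH_IOC_SYNCIO */
		return do_sync_io(file, buf, len, off);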
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f42d730f1b66..96c6739a0280 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 629 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
630 struct inode *inode = dentry->d_inode; 630 struct inode *inode = dentry->d_inode;
631 struct ceph_inode_info *ci = ceph_inode(inode); 631 struct ceph_inode_info *ci = ceph_inode(inode);
632 struct inode *parent_inode = dentry->d_parent->d_inode; 632 struct inode *parent_inode;
633 struct ceph_mds_request *req; 633 struct ceph_mds_request *req;
634 struct ceph_mds_client *mdsc = fsc->mdsc; 634 struct ceph_mds_client *mdsc = fsc->mdsc;
635 int err; 635 int err;
@@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
677 req->r_data_len = size; 677 req->r_data_len = size;
678 678
679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 679 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
680 parent_inode = ceph_get_dentry_parent_inode(dentry);
680 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 681 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
682 iput(parent_inode);
681 ceph_mdsc_put_request(req); 683 ceph_mdsc_put_request(req);
682 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 684 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
683 685
@@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
788 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 790 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
789 struct ceph_mds_client *mdsc = fsc->mdsc; 791 struct ceph_mds_client *mdsc = fsc->mdsc;
790 struct inode *inode = dentry->d_inode; 792 struct inode *inode = dentry->d_inode;
791 struct inode *parent_inode = dentry->d_parent->d_inode; 793 struct inode *parent_inode;
792 struct ceph_mds_request *req; 794 struct ceph_mds_request *req;
793 int err; 795 int err;
794 796
@@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
802 req->r_num_caps = 1; 804 req->r_num_caps = 1;
803 req->r_path2 = kstrdup(name, GFP_NOFS); 805 req->r_path2 = kstrdup(name, GFP_NOFS);
804 806
807 parent_inode = ceph_get_dentry_parent_inode(dentry);
805 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 808 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
809 iput(parent_inode);
806 ceph_mdsc_put_request(req); 810 ceph_mdsc_put_request(req);
807 return err; 811 return err;
808} 812}
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 2fe3cf13b2e9..6d40656e1e29 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
176 176
177#ifdef CONFIG_CIFS_STATS2 177#ifdef CONFIG_CIFS_STATS2
178 seq_printf(m, " In Send: %d In MaxReq Wait: %d", 178 seq_printf(m, " In Send: %d In MaxReq Wait: %d",
179 atomic_read(&server->inSend), 179 atomic_read(&server->in_send),
180 atomic_read(&server->num_waiters)); 180 atomic_read(&server->num_waiters));
181#endif 181#endif
182 182
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 8d8f28c94c0f..6873bb634a97 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 141
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc < 0) { 143 if (rc < 0) {
144 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", 144 cFYI(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc); 145 __func__, *devname, rc);
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
149 * assuming that we have 'unc=' and 'ip=' in 150 * assuming that we have 'unc=' and 'ip=' in
150 * the original sb_mountdata 151 * the original sb_mountdata
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 21de1d6d5849..d0f59faefb78 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -991,24 +991,6 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
991 return pntsd; 991 return pntsd;
992} 992}
993 993
994static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
995 struct cifs_ntsd *pnntsd, u32 acllen)
996{
997 int xid, rc;
998 struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
999
1000 if (IS_ERR(tlink))
1001 return PTR_ERR(tlink);
1002
1003 xid = GetXid();
1004 rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
1005 FreeXid(xid);
1006 cifs_put_tlink(tlink);
1007
1008 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
1009 return rc;
1010}
1011
1012static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, 994static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
1013 struct cifs_ntsd *pnntsd, u32 acllen) 995 struct cifs_ntsd *pnntsd, u32 acllen)
1014{ 996{
@@ -1047,18 +1029,10 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
1047 struct inode *inode, const char *path) 1029 struct inode *inode, const char *path)
1048{ 1030{
1049 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1031 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1050 struct cifsFileInfo *open_file;
1051 int rc;
1052 1032
1053 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); 1033 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
1054 1034
1055 open_file = find_readable_file(CIFS_I(inode), true); 1035 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
1056 if (!open_file)
1057 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
1058
1059 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
1060 cifsFileInfo_put(open_file);
1061 return rc;
1062} 1036}
1063 1037
1064/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ 1038
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5a0ee7f2af06..e76bfeb68267 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -52,19 +52,29 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
52 52
53 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 53 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
54 if (rc) { 54 if (rc) {
55 cERROR(1, "%s: Oould not init md5\n", __func__); 55 cERROR(1, "%s: Could not init md5\n", __func__);
56 return rc; 56 return rc;
57 } 57 }
58 58
59 crypto_shash_update(&server->secmech.sdescmd5->shash, 59 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
60 server->session_key.response, server->session_key.len); 60 server->session_key.response, server->session_key.len);
61 if (rc) {
62 cERROR(1, "%s: Could not update with response\n", __func__);
63 return rc;
64 }
61 65
62 crypto_shash_update(&server->secmech.sdescmd5->shash, 66 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
63 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); 67 cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
68 if (rc) {
69 cERROR(1, "%s: Could not update with payload\n", __func__);
70 return rc;
71 }
64 72
65 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 73 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
74 if (rc)
75 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
66 76
67 return 0; 77 return rc;
68} 78}
69 79
70/* must be called with server->srv_mutex held */ 80/* must be called with server->srv_mutex held */
@@ -77,9 +87,15 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
77 if ((cifs_pdu == NULL) || (server == NULL)) 87 if ((cifs_pdu == NULL) || (server == NULL))
78 return -EINVAL; 88 return -EINVAL;
79 89
80 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 90 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
91 server->tcpStatus == CifsNeedNegotiate)
81 return rc; 92 return rc;
82 93
94 if (!server->session_estab) {
95 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
96 return rc;
97 }
98
83 cifs_pdu->Signature.Sequence.SequenceNumber = 99 cifs_pdu->Signature.Sequence.SequenceNumber =
84 cpu_to_le32(server->sequence_number); 100 cpu_to_le32(server->sequence_number);
85 cifs_pdu->Signature.Sequence.Reserved = 0; 101 cifs_pdu->Signature.Sequence.Reserved = 0;
@@ -112,12 +128,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
112 128
113 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 129 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
114 if (rc) { 130 if (rc) {
115 cERROR(1, "%s: Oould not init md5\n", __func__); 131 cERROR(1, "%s: Could not init md5\n", __func__);
116 return rc; 132 return rc;
117 } 133 }
118 134
119 crypto_shash_update(&server->secmech.sdescmd5->shash, 135 rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
120 server->session_key.response, server->session_key.len); 136 server->session_key.response, server->session_key.len);
137 if (rc) {
138 cERROR(1, "%s: Could not update with response\n", __func__);
139 return rc;
140 }
121 141
122 for (i = 0; i < n_vec; i++) { 142 for (i = 0; i < n_vec; i++) {
123 if (iov[i].iov_len == 0) 143 if (iov[i].iov_len == 0)
@@ -131,14 +151,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
131 if (i == 0) { 151 if (i == 0) {
132 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 152 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
133 break; /* nothing to sign or corrupt header */ 153 break; /* nothing to sign or corrupt header */
154 rc =
134 crypto_shash_update(&server->secmech.sdescmd5->shash, 155 crypto_shash_update(&server->secmech.sdescmd5->shash,
135 iov[i].iov_base + 4, iov[i].iov_len - 4); 156 iov[i].iov_base + 4, iov[i].iov_len - 4);
136 } else 157 } else {
158 rc =
137 crypto_shash_update(&server->secmech.sdescmd5->shash, 159 crypto_shash_update(&server->secmech.sdescmd5->shash,
138 iov[i].iov_base, iov[i].iov_len); 160 iov[i].iov_base, iov[i].iov_len);
161 }
162 if (rc) {
163 cERROR(1, "%s: Could not update with payload\n",
164 __func__);
165 return rc;
166 }
139 } 167 }
140 168
141 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); 169 rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
170 if (rc)
171 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
142 172
143 return rc; 173 return rc;
144} 174}
@@ -154,8 +184,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
154 if ((cifs_pdu == NULL) || (server == NULL)) 184 if ((cifs_pdu == NULL) || (server == NULL))
155 return -EINVAL; 185 return -EINVAL;
156 186
157 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 187 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
188 server->tcpStatus == CifsNeedNegotiate)
189 return rc;
190
191 if (!server->session_estab) {
192 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
158 return rc; 193 return rc;
194 }
159 195
160 cifs_pdu->Signature.Sequence.SequenceNumber = 196 cifs_pdu->Signature.Sequence.SequenceNumber =
161 cpu_to_le32(server->sequence_number); 197 cpu_to_le32(server->sequence_number);
@@ -463,8 +499,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
463 /* calculate md4 hash of password */ 499 /* calculate md4 hash of password */
464 E_md4hash(ses->password, nt_hash); 500 E_md4hash(ses->password, nt_hash);
465 501
466 crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, 502 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
467 CIFS_NTHASH_SIZE); 503 CIFS_NTHASH_SIZE);
504 if (rc) {
505 cERROR(1, "%s: Could not set NT Hash as a key", __func__);
506 return rc;
507 }
468 508
469 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 509 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
470 if (rc) { 510 if (rc) {
@@ -478,13 +518,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
478 if (user == NULL) { 518 if (user == NULL) {
479 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); 519 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
480 rc = -ENOMEM; 520 rc = -ENOMEM;
481 goto calc_exit_2; 521 return rc;
482 } 522 }
483 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); 523 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
484 UniStrupr(user); 524 UniStrupr(user);
485 525
486 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 526 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
487 (char *)user, 2 * len); 527 (char *)user, 2 * len);
528 kfree(user);
529 if (rc) {
530 cERROR(1, "%s: Could not update with user\n", __func__);
531 return rc;
532 }
488 533
489 /* convert ses->domainName to unicode and uppercase */ 534 /* convert ses->domainName to unicode and uppercase */
490 if (ses->domainName) { 535 if (ses->domainName) {
@@ -494,13 +539,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
494 if (domain == NULL) { 539 if (domain == NULL) {
495 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); 540 cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
496 rc = -ENOMEM; 541 rc = -ENOMEM;
497 goto calc_exit_1; 542 return rc;
498 } 543 }
499 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, 544 len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
500 nls_cp); 545 nls_cp);
546 rc =
501 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 547 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
502 (char *)domain, 2 * len); 548 (char *)domain, 2 * len);
503 kfree(domain); 549 kfree(domain);
550 if (rc) {
551 cERROR(1, "%s: Could not update with domain\n",
552 __func__);
553 return rc;
554 }
504 } else if (ses->serverName) { 555 } else if (ses->serverName) {
505 len = strlen(ses->serverName); 556 len = strlen(ses->serverName);
506 557
@@ -508,21 +559,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
508 if (server == NULL) { 559 if (server == NULL) {
509 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); 560 cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
510 rc = -ENOMEM; 561 rc = -ENOMEM;
511 goto calc_exit_1; 562 return rc;
512 } 563 }
513 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, 564 len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
514 nls_cp); 565 nls_cp);
566 rc =
515 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 567 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
516 (char *)server, 2 * len); 568 (char *)server, 2 * len);
517 kfree(server); 569 kfree(server);
570 if (rc) {
571 cERROR(1, "%s: Could not update with server\n",
572 __func__);
573 return rc;
574 }
518 } 575 }
519 576
520 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 577 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
521 ntlmv2_hash); 578 ntlmv2_hash);
579 if (rc)
580 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
522 581
523calc_exit_1:
524 kfree(user);
525calc_exit_2:
526 return rc; 582 return rc;
527} 583}
528 584
@@ -537,8 +593,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
537 return -1; 593 return -1;
538 } 594 }
539 595
540 crypto_shash_setkey(ses->server->secmech.hmacmd5, 596 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
541 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); 597 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
598 if (rc) {
599 cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
600 return rc;
601 }
542 602
543 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 603 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
544 if (rc) { 604 if (rc) {
@@ -552,11 +612,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
552 else 612 else
553 memcpy(ses->auth_key.response + offset, 613 memcpy(ses->auth_key.response + offset,
554 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); 614 ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
555 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 615 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
556 ses->auth_key.response + offset, ses->auth_key.len - offset); 616 ses->auth_key.response + offset, ses->auth_key.len - offset);
617 if (rc) {
618 cERROR(1, "%s: Could not update with response\n", __func__);
619 return rc;
620 }
557 621
558 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 622 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
559 ses->auth_key.response + CIFS_SESS_KEY_SIZE); 623 ses->auth_key.response + CIFS_SESS_KEY_SIZE);
624 if (rc)
625 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
560 626
561 return rc; 627 return rc;
562} 628}
@@ -626,8 +692,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
626 } 692 }
627 693
628 /* now calculate the session key for NTLMv2 */ 694 /* now calculate the session key for NTLMv2 */
629 crypto_shash_setkey(ses->server->secmech.hmacmd5, 695 rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
630 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); 696 ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
697 if (rc) {
698 cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
699 goto setup_ntlmv2_rsp_ret;
700 }
631 701
632 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); 702 rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
633 if (rc) { 703 if (rc) {
@@ -635,12 +705,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
635 goto setup_ntlmv2_rsp_ret; 705 goto setup_ntlmv2_rsp_ret;
636 } 706 }
637 707
638 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 708 rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
639 ses->auth_key.response + CIFS_SESS_KEY_SIZE, 709 ses->auth_key.response + CIFS_SESS_KEY_SIZE,
640 CIFS_HMAC_MD5_HASH_SIZE); 710 CIFS_HMAC_MD5_HASH_SIZE);
711 if (rc) {
712 cERROR(1, "%s: Could not update with response\n", __func__);
713 goto setup_ntlmv2_rsp_ret;
714 }
641 715
642 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, 716 rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
643 ses->auth_key.response); 717 ses->auth_key.response);
718 if (rc)
719 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
644 720
645setup_ntlmv2_rsp_ret: 721setup_ntlmv2_rsp_ret:
646 kfree(tiblob); 722 kfree(tiblob);
@@ -668,8 +744,12 @@ calc_seckey(struct cifs_ses *ses)
668 744
669 desc.tfm = tfm_arc4; 745 desc.tfm = tfm_arc4;
670 746
671 crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, 747 rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
672 CIFS_SESS_KEY_SIZE); 748 CIFS_SESS_KEY_SIZE);
749 if (rc) {
750 cERROR(1, "%s: Could not set response as a key", __func__);
751 return rc;
752 }
673 753
674 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); 754 sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
675 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); 755 sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
@@ -688,7 +768,7 @@ calc_seckey(struct cifs_ses *ses)
688 768
689 crypto_free_blkcipher(tfm_arc4); 769 crypto_free_blkcipher(tfm_arc4);
690 770
691 return 0; 771 return rc;
692} 772}
693 773
694void 774void
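The common theme of the cifsencrypt.c hunks is that no crypto_shash_setkey/init/update/final return code is ignored any more. Distilled into one sketch (buffer and field names mirror the surrounding code; this is not a function from the patch):

static int example_md5(struct TCP_Server_Info *server,
		       const char *data, unsigned int len, char *signature)
{
	int rc;

	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
	if (rc) {
		cERROR(1, "%s: Could not init md5", __func__);
		return rc;
	}
	rc = crypto_shash_update(&server->secmech.sdescmd5->shash, data, len);
	if (rc) {
		cERROR(1, "%s: Could not update md5", __func__);
		return rc;
	}
	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
	if (rc)
		cERROR(1, "%s: Could not generate md5 hash", __func__);
	/* propagate rc instead of returning a hardcoded 0 */
	return rc;
}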
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 865517470967..f93eb948d071 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -86,24 +86,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
88 88
89void
90cifs_sb_active(struct super_block *sb)
91{
92 struct cifs_sb_info *server = CIFS_SB(sb);
93
94 if (atomic_inc_return(&server->active) == 1)
95 atomic_inc(&sb->s_active);
96}
97
98void
99cifs_sb_deactive(struct super_block *sb)
100{
101 struct cifs_sb_info *server = CIFS_SB(sb);
102
103 if (atomic_dec_and_test(&server->active))
104 deactivate_super(sb);
105}
106
107static int 89static int
108cifs_read_super(struct super_block *sb) 90cifs_read_super(struct super_block *sb)
109{ 91{
@@ -581,6 +563,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
581 mutex_unlock(&dir->i_mutex); 563 mutex_unlock(&dir->i_mutex);
582 dput(dentry); 564 dput(dentry);
583 dentry = child; 565 dentry = child;
566 if (!dentry->d_inode) {
567 dput(dentry);
568 dentry = ERR_PTR(-ENOENT);
569 }
584 } while (!IS_ERR(dentry)); 570 } while (!IS_ERR(dentry));
585 _FreeXid(xid); 571 _FreeXid(xid);
586 kfree(full_path); 572 kfree(full_path);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index fbd050c8d52a..95da8027983d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type;
41extern const struct address_space_operations cifs_addr_ops; 41extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */
45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_sb_deactive(struct super_block *sb);
47
48/* Functions related to inodes */ 44/* Functions related to inodes */
49extern const struct inode_operations cifs_dir_inode_ops; 45extern const struct inode_operations cifs_dir_inode_ops;
50extern struct inode *cifs_root_iget(struct super_block *); 46extern struct inode *cifs_root_iget(struct super_block *);
@@ -129,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
129extern const struct export_operations cifs_export_ops; 125extern const struct export_operations cifs_export_ops;
130#endif /* CIFS_NFSD_EXPORT */ 126#endif /* CIFS_NFSD_EXPORT */
131 127
132#define CIFS_VERSION "1.74" 128#define CIFS_VERSION "1.75"
133#endif /* _CIFSFS_H */ 129#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6255fa812c7a..95dad9d14cf1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -291,7 +291,7 @@ struct TCP_Server_Info {
291 struct fscache_cookie *fscache; /* client index cache cookie */ 291 struct fscache_cookie *fscache; /* client index cache cookie */
292#endif 292#endif
293#ifdef CONFIG_CIFS_STATS2 293#ifdef CONFIG_CIFS_STATS2
294 atomic_t inSend; /* requests trying to send */ 294 atomic_t in_send; /* requests trying to send */
295 atomic_t num_waiters; /* blocked waiting to get in sendrecv */ 295 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
296#endif 296#endif
297}; 297};
@@ -501,7 +501,7 @@ struct cifs_search_info {
501 char *ntwrk_buf_start; 501 char *ntwrk_buf_start;
502 char *srch_entries_start; 502 char *srch_entries_start;
503 char *last_entry; 503 char *last_entry;
504 char *presume_name; 504 const char *presume_name;
505 unsigned int resume_name_len; 505 unsigned int resume_name_len;
506 bool endOfSearch:1; 506 bool endOfSearch:1;
507 bool emptyDir:1; 507 bool emptyDir:1;
@@ -672,12 +672,54 @@ struct mid_q_entry {
672 bool multiEnd:1; /* both received */ 672 bool multiEnd:1; /* both received */
673}; 673};
674 674
675struct oplock_q_entry { 675/* Make code in transport.c a little cleaner by moving
676 struct list_head qhead; 676 update of optional stats into function below */
677 struct inode *pinode; 677#ifdef CONFIG_CIFS_STATS2
678 struct cifs_tcon *tcon; 678
679 __u16 netfid; 679static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
680}; 680{
681 atomic_inc(&server->in_send);
682}
683
684static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
685{
686 atomic_dec(&server->in_send);
687}
688
689static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
690{
691 atomic_inc(&server->num_waiters);
692}
693
694static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
695{
696 atomic_dec(&server->num_waiters);
697}
698
699static inline void cifs_save_when_sent(struct mid_q_entry *mid)
700{
701 mid->when_sent = jiffies;
702}
703#else
704static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
705{
706}
707static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
708{
709}
710
711static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
712{
713}
714
715static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
716{
717}
718
719static inline void cifs_save_when_sent(struct mid_q_entry *mid)
720{
721}
722#endif
681 723
682/* for pending dnotify requests */ 724/* for pending dnotify requests */
683struct dir_notify_req { 725struct dir_notify_req {
@@ -942,8 +984,6 @@ GLOBAL_EXTERN spinlock_t siduidlock;
942GLOBAL_EXTERN spinlock_t sidgidlock; 984GLOBAL_EXTERN spinlock_t sidgidlock;
943 985
944void cifs_oplock_break(struct work_struct *work); 986void cifs_oplock_break(struct work_struct *work);
945void cifs_oplock_break_get(struct cifsFileInfo *cfile);
946void cifs_oplock_break_put(struct cifsFileInfo *cfile);
947 987
948extern const struct slow_work_ops cifs_oplock_break_ops; 988extern const struct slow_work_ops cifs_oplock_break_ops;
949 989
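The inline helpers above compile to nothing when CONFIG_CIFS_STATS2 is off, so call sites need no #ifdef clutter. A sketch of the intended transport.c usage (the exact call site is outside this section; smb_send stands in for the real send path):

	cifs_num_waiters_inc(server);
	/* ... wait for a free request slot ... */
	cifs_num_waiters_dec(server);

	cifs_in_send_inc(server);
	rc = smb_send(server, in_buf, len);
	cifs_in_send_dec(server);
	cifs_save_when_sent(mid);	/* stamps mid->when_sent */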
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1a9fe7f816d1..aac37d99a487 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -107,7 +107,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon)
107static int 107static int
108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) 108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
109{ 109{
110 int rc = 0; 110 int rc;
111 struct cifs_ses *ses; 111 struct cifs_ses *ses;
112 struct TCP_Server_Info *server; 112 struct TCP_Server_Info *server;
113 struct nls_table *nls_codepage; 113 struct nls_table *nls_codepage;
@@ -5720,6 +5720,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
5720 char *temp_ptr; 5720 char *temp_ptr;
5721 char *end_of_smb; 5721 char *end_of_smb;
5722 __u16 params, byte_count, data_offset; 5722 __u16 params, byte_count, data_offset;
5723 unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
5723 5724
5724 cFYI(1, "In Query All EAs path %s", searchName); 5725 cFYI(1, "In Query All EAs path %s", searchName);
5725QAllEAsRetry: 5726QAllEAsRetry:
@@ -5837,7 +5838,8 @@ QAllEAsRetry:
5837 } 5838 }
5838 5839
5839 if (ea_name) { 5840 if (ea_name) {
5840 if (strncmp(ea_name, temp_ptr, name_len) == 0) { 5841 if (ea_name_len == name_len &&
5842 strncmp(ea_name, temp_ptr, name_len) == 0) {
5841 temp_ptr += name_len + 1; 5843 temp_ptr += name_len + 1;
5842 rc = value_len; 5844 rc = value_len;
5843 if (buf_size == 0) 5845 if (buf_size == 0)
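The added length check matters because strncmp() bounded by name_len only proves a prefix match. A standalone illustration of the failure mode the hunk fixes (userspace, compilable as-is):

#include <stdio.h>
#include <string.h>

static int ea_matches(const char *wanted, const char *entry)
{
	size_t name_len = strlen(entry);

	/* the old check was just strncmp(wanted, entry, name_len) == 0,
	 * which wrongly accepts wanted == "user.foobar" for entry
	 * "user.foo" */
	return strlen(wanted) == name_len &&
	       strncmp(wanted, entry, name_len) == 0;
}

int main(void)
{
	printf("%d\n", ea_matches("user.foobar", "user.foo")); /* 0 (was 1) */
	printf("%d\n", ea_matches("user.foo", "user.foo"));    /* 1 */
	return 0;
}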
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e66297bad412..633c246b6775 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -319,25 +319,328 @@ requeue_echo:
319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); 319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
320} 320}
321 321
322static bool
323allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
324 bool is_large_buf)
325{
326 char *bbuf = *bigbuf, *sbuf = *smallbuf;
327
328 if (bbuf == NULL) {
329 bbuf = (char *)cifs_buf_get();
330 if (!bbuf) {
331 cERROR(1, "No memory for large SMB response");
332 msleep(3000);
333 /* retry will check if exiting */
334 return false;
335 }
336 } else if (is_large_buf) {
337 /* we are reusing a dirty large buf, clear its start */
338 memset(bbuf, 0, size);
339 }
340
341 if (sbuf == NULL) {
342 sbuf = (char *)cifs_small_buf_get();
343 if (!sbuf) {
344 cERROR(1, "No memory for SMB response");
345 msleep(1000);
346 /* retry will check if exiting */
347 return false;
348 }
349 /* beginning of smb buffer is cleared in our buf_get */
350 } else {
351 /* if existing small buf clear beginning */
352 memset(sbuf, 0, size);
353 }
354
355 *bigbuf = bbuf;
356 *smallbuf = sbuf;
357
358 return true;
359}
360
361static int
362read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
363 struct kvec *iov, unsigned int to_read,
364 unsigned int *ptotal_read, bool is_header_read)
365{
366 int length, rc = 0;
367 unsigned int total_read;
368 char *buf = iov->iov_base;
369
370 for (total_read = 0; total_read < to_read; total_read += length) {
371 length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
372 to_read - total_read, 0);
373 if (server->tcpStatus == CifsExiting) {
374 /* then will exit */
375 rc = 2;
376 break;
377 } else if (server->tcpStatus == CifsNeedReconnect) {
378 cifs_reconnect(server);
 379			/* Reconnect wakes up the response queue */
 380			/* Now we will reread from the socket */
381 rc = 1;
382 break;
383 } else if (length == -ERESTARTSYS ||
384 length == -EAGAIN ||
385 length == -EINTR) {
386 /*
387 * Minimum sleep to prevent looping, allowing socket
388 * to clear and app threads to set tcpStatus
389 * CifsNeedReconnect if server hung.
390 */
391 usleep_range(1000, 2000);
392 length = 0;
393 if (!is_header_read)
394 continue;
395 /* Special handling for header read */
396 if (total_read) {
397 iov->iov_base = (to_read - total_read) +
398 buf;
399 iov->iov_len = to_read - total_read;
400 smb_msg->msg_control = NULL;
401 smb_msg->msg_controllen = 0;
402 rc = 3;
403 } else
404 rc = 1;
405 break;
406 } else if (length <= 0) {
407 cERROR(1, "Received no data, expecting %d",
408 to_read - total_read);
409 cifs_reconnect(server);
410 rc = 1;
411 break;
412 }
413 }
414
415 *ptotal_read = total_read;
416 return rc;
417}
418
419static bool
420check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
421{
422 char temp = *buf;
423 unsigned int pdu_length = be32_to_cpu(
424 ((struct smb_hdr *)buf)->smb_buf_length);
425
426 /*
 427	 * The first byte of the big-endian length field is not
 428	 * actually part of the length but the RFC 1002 frame type;
 429	 * the most common type, zero, marks regular session data.
430 */
431 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
432 return false;
433 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
434 cFYI(1, "Good RFC 1002 session rsp");
435 return false;
436 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
437 /*
438 * We get this from Windows 98 instead of an error on
439 * SMB negprot response.
440 */
 441		cFYI(1, "Negative RFC1002 Session Response Error 0x%x",
442 pdu_length);
443 /* give server a second to clean up */
444 msleep(1000);
445 /*
 446	 * Always try 445 first on reconnect since we get a NACK
 447	 * on some servers if we ever connected to port 139 (the
 448	 * NACK comes because we do not begin with an RFC 1001
 449	 * session initialize frame).
450 */
451 cifs_set_port((struct sockaddr *)
452 &server->dstaddr, CIFS_PORT);
453 cifs_reconnect(server);
454 wake_up(&server->response_q);
455 return false;
456 } else if (temp != (char) 0) {
457 cERROR(1, "Unknown RFC 1002 frame");
458 cifs_dump_mem(" Received Data: ", buf, 4);
459 cifs_reconnect(server);
460 return false;
461 }
462
463 /* else we have an SMB response */
464 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
465 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
466 cERROR(1, "Invalid size SMB length %d pdu_length %d",
467 4, pdu_length+4);
468 cifs_reconnect(server);
469 wake_up(&server->response_q);
470 return false;
471 }
472
473 return true;
474}
475
476static struct mid_q_entry *
477find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
478 int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
479{
480 struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
481
482 spin_lock(&GlobalMid_Lock);
483 list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
484 if (mid->mid != buf->Mid ||
485 mid->midState != MID_REQUEST_SUBMITTED ||
486 mid->command != buf->Command)
487 continue;
488
489 if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
490 /* We have a multipart transact2 resp */
491 *is_multi_rsp = true;
492 if (mid->resp_buf) {
493 /* merge response - fix up 1st*/
494 *length = coalesce_t2(buf, mid->resp_buf);
495 if (*length > 0) {
496 *length = 0;
497 mid->multiRsp = true;
498 break;
499 }
500 /* All parts received or packet is malformed. */
501 mid->multiEnd = true;
502 goto multi_t2_fnd;
503 }
504 if (!is_large_buf) {
505 /*FIXME: switch to already allocated largebuf?*/
506 cERROR(1, "1st trans2 resp needs bigbuf");
507 } else {
508 /* Have first buffer */
509 mid->resp_buf = buf;
510 mid->largeBuf = true;
511 *bigbuf = NULL;
512 }
513 break;
514 }
515 mid->resp_buf = buf;
516 mid->largeBuf = is_large_buf;
517multi_t2_fnd:
518 if (*length == 0)
519 mid->midState = MID_RESPONSE_RECEIVED;
520 else
521 mid->midState = MID_RESPONSE_MALFORMED;
522#ifdef CONFIG_CIFS_STATS2
523 mid->when_received = jiffies;
524#endif
525 list_del_init(&mid->qhead);
526 ret = mid;
527 break;
528 }
529 spin_unlock(&GlobalMid_Lock);
530
531 return ret;
532}
533
534static void clean_demultiplex_info(struct TCP_Server_Info *server)
535{
536 int length;
537
538 /* take it off the list, if it's not already */
539 spin_lock(&cifs_tcp_ses_lock);
540 list_del_init(&server->tcp_ses_list);
541 spin_unlock(&cifs_tcp_ses_lock);
542
543 spin_lock(&GlobalMid_Lock);
544 server->tcpStatus = CifsExiting;
545 spin_unlock(&GlobalMid_Lock);
546 wake_up_all(&server->response_q);
547
548 /*
 549	 * Check if we have blocked requests that need to be freed up. Note that
550 * cifs_max_pending is normally 50, but can be set at module install
551 * time to as little as two.
552 */
553 spin_lock(&GlobalMid_Lock);
554 if (atomic_read(&server->inFlight) >= cifs_max_pending)
555 atomic_set(&server->inFlight, cifs_max_pending - 1);
556 /*
557 * We do not want to set the max_pending too low or we could end up
558 * with the counter going negative.
559 */
560 spin_unlock(&GlobalMid_Lock);
561 /*
 562	 * Although there should not be any requests blocked on this queue, it
 563	 * cannot hurt to be paranoid and try to wake up requests that may
 564	 * have been blocked when more than 50 at a time were on the wire to
 565	 * the same server - they will now see the session is in the exit
 566	 * state and get out of SendReceive.
567 */
568 wake_up_all(&server->request_q);
569 /* give those requests time to exit */
570 msleep(125);
571
572 if (server->ssocket) {
573 sock_release(server->ssocket);
574 server->ssocket = NULL;
575 }
576
577 if (!list_empty(&server->pending_mid_q)) {
578 struct list_head dispose_list;
579 struct mid_q_entry *mid_entry;
580 struct list_head *tmp, *tmp2;
581
582 INIT_LIST_HEAD(&dispose_list);
583 spin_lock(&GlobalMid_Lock);
584 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
585 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
586 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
587 mid_entry->midState = MID_SHUTDOWN;
588 list_move(&mid_entry->qhead, &dispose_list);
589 }
590 spin_unlock(&GlobalMid_Lock);
591
592 /* now walk dispose list and issue callbacks */
593 list_for_each_safe(tmp, tmp2, &dispose_list) {
594 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
595 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
596 list_del_init(&mid_entry->qhead);
597 mid_entry->callback(mid_entry);
598 }
599 /* 1/8th of sec is more than enough time for them to exit */
600 msleep(125);
601 }
602
603 if (!list_empty(&server->pending_mid_q)) {
604 /*
 605	 * mpx threads have not exited yet; give them at least the smb
 606	 * send timeout time for long ops.
607 *
608 * Due to delays on oplock break requests, we need to wait at
609 * least 45 seconds before giving up on a request getting a
610 * response and going ahead and killing cifsd.
611 */
612 cFYI(1, "Wait for exit from demultiplex thread");
613 msleep(46000);
614 /*
 615	 * If threads still have not exited, they are probably never
 616	 * coming home; not much else we can do but free the memory.
617 */
618 }
619
620 kfree(server->hostname);
621 kfree(server);
622
623 length = atomic_dec_return(&tcpSesAllocCount);
624 if (length > 0)
625 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
626 GFP_KERNEL);
627}
628
322static int 629static int
323cifs_demultiplex_thread(void *p) 630cifs_demultiplex_thread(void *p)
324{ 631{
325 int length; 632 int length;
326 struct TCP_Server_Info *server = p; 633 struct TCP_Server_Info *server = p;
327 unsigned int pdu_length, total_read; 634 unsigned int pdu_length, total_read;
635 char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
328 struct smb_hdr *smb_buffer = NULL; 636 struct smb_hdr *smb_buffer = NULL;
329 struct smb_hdr *bigbuf = NULL;
330 struct smb_hdr *smallbuf = NULL;
331 struct msghdr smb_msg; 637 struct msghdr smb_msg;
332 struct kvec iov; 638 struct kvec iov;
333 struct socket *csocket = server->ssocket;
334 struct list_head *tmp, *tmp2;
335 struct task_struct *task_to_wake = NULL; 639 struct task_struct *task_to_wake = NULL;
336 struct mid_q_entry *mid_entry; 640 struct mid_q_entry *mid_entry;
337 char temp;
338 bool isLargeBuf = false; 641 bool isLargeBuf = false;
339 bool isMultiRsp; 642 bool isMultiRsp = false;
340 int reconnect; 643 int rc;
341 644
342 current->flags |= PF_MEMALLOC; 645 current->flags |= PF_MEMALLOC;
343 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); 646 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -351,35 +654,16 @@ cifs_demultiplex_thread(void *p)
351 while (server->tcpStatus != CifsExiting) { 654 while (server->tcpStatus != CifsExiting) {
352 if (try_to_freeze()) 655 if (try_to_freeze())
353 continue; 656 continue;
354 if (bigbuf == NULL) {
355 bigbuf = cifs_buf_get();
356 if (!bigbuf) {
357 cERROR(1, "No memory for large SMB response");
358 msleep(3000);
359 /* retry will check if exiting */
360 continue;
361 }
362 } else if (isLargeBuf) {
363 /* we are reusing a dirty large buf, clear its start */
364 memset(bigbuf, 0, sizeof(struct smb_hdr));
365 }
366 657
367 if (smallbuf == NULL) { 658 if (!allocate_buffers(&bigbuf, &smallbuf,
368 smallbuf = cifs_small_buf_get(); 659 sizeof(struct smb_hdr), isLargeBuf))
369 if (!smallbuf) { 660 continue;
370 cERROR(1, "No memory for SMB response");
371 msleep(1000);
372 /* retry will check if exiting */
373 continue;
374 }
375 /* beginning of smb buffer is cleared in our buf_get */
376 } else /* if existing small buf clear beginning */
377 memset(smallbuf, 0, sizeof(struct smb_hdr));
378 661
379 isLargeBuf = false; 662 isLargeBuf = false;
380 isMultiRsp = false; 663 isMultiRsp = false;
381 smb_buffer = smallbuf; 664 smb_buffer = (struct smb_hdr *)smallbuf;
382 iov.iov_base = smb_buffer; 665 buf = smallbuf;
666 iov.iov_base = buf;
383 iov.iov_len = 4; 667 iov.iov_len = 4;
384 smb_msg.msg_control = NULL; 668 smb_msg.msg_control = NULL;
385 smb_msg.msg_controllen = 0; 669 smb_msg.msg_controllen = 0;
@@ -393,158 +677,50 @@ incomplete_rcv:
393 "Reconnecting...", server->hostname, 677 "Reconnecting...", server->hostname,
394 (echo_retries * SMB_ECHO_INTERVAL / HZ)); 678 (echo_retries * SMB_ECHO_INTERVAL / HZ));
395 cifs_reconnect(server); 679 cifs_reconnect(server);
396 csocket = server->ssocket;
397 wake_up(&server->response_q); 680 wake_up(&server->response_q);
398 continue; 681 continue;
399 } 682 }
400 683
401 length = 684 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
402 kernel_recvmsg(csocket, &smb_msg, 685 &total_read, true /* header read */);
403 &iov, 1, pdu_length, 0 /* BB other flags? */); 686 if (rc == 3)
404 687 goto incomplete_rcv;
405 if (server->tcpStatus == CifsExiting) { 688 else if (rc == 2)
406 break; 689 break;
407 } else if (server->tcpStatus == CifsNeedReconnect) { 690 else if (rc == 1)
408 cFYI(1, "Reconnect after server stopped responding");
409 cifs_reconnect(server);
410 cFYI(1, "call to reconnect done");
411 csocket = server->ssocket;
412 continue;
413 } else if (length == -ERESTARTSYS ||
414 length == -EAGAIN ||
415 length == -EINTR) {
416 msleep(1); /* minimum sleep to prevent looping
417 allowing socket to clear and app threads to set
418 tcpStatus CifsNeedReconnect if server hung */
419 if (pdu_length < 4) {
420 iov.iov_base = (4 - pdu_length) +
421 (char *)smb_buffer;
422 iov.iov_len = pdu_length;
423 smb_msg.msg_control = NULL;
424 smb_msg.msg_controllen = 0;
425 goto incomplete_rcv;
426 } else
427 continue;
428 } else if (length <= 0) {
429 cFYI(1, "Reconnect after unexpected peek error %d",
430 length);
431 cifs_reconnect(server);
432 csocket = server->ssocket;
433 wake_up(&server->response_q);
434 continue; 691 continue;
435 } else if (length < pdu_length) {
436 cFYI(1, "requested %d bytes but only got %d bytes",
437 pdu_length, length);
438 pdu_length -= length;
439 msleep(1);
440 goto incomplete_rcv;
441 }
442
443 /* The right amount was read from socket - 4 bytes */
444 /* so we can now interpret the length field */
445 692
446 /* the first byte big endian of the length field, 693 /*
447 is actually not part of the length but the type 694 * The right amount was read from socket - 4 bytes,
448 with the most common, zero, as regular data */ 695 * so we can now interpret the length field.
449 temp = *((char *) smb_buffer); 696 */
450 697
451 /* Note that FC 1001 length is big endian on the wire, 698 /*
452 but we convert it here so it is always manipulated 699 * Note that RFC 1001 length is big endian on the wire,
453 as host byte order */ 700 * but we convert it here so it is always manipulated
701 * as host byte order.
702 */
454 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); 703 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
455 704
456 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 705 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
457 706 if (!check_rfc1002_header(server, buf))
458 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
459 continue;
460 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
461 cFYI(1, "Good RFC 1002 session rsp");
462 continue;
463 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
464 /* we get this from Windows 98 instead of
465 an error on SMB negprot response */
466 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
467 pdu_length);
468 /* give server a second to clean up */
469 msleep(1000);
470 /* always try 445 first on reconnect since we get NACK
471 * on some if we ever connected to port 139 (the NACK
472 * is since we do not begin with RFC1001 session
473 * initialize frame)
474 */
475 cifs_set_port((struct sockaddr *)
476 &server->dstaddr, CIFS_PORT);
477 cifs_reconnect(server);
478 csocket = server->ssocket;
479 wake_up(&server->response_q);
480 continue;
481 } else if (temp != (char) 0) {
482 cERROR(1, "Unknown RFC 1002 frame");
483 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
484 length);
485 cifs_reconnect(server);
486 csocket = server->ssocket;
487 continue; 707 continue;
488 }
489
490 /* else we have an SMB response */
491 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
492 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
493 cERROR(1, "Invalid size SMB length %d pdu_length %d",
494 length, pdu_length+4);
495 cifs_reconnect(server);
496 csocket = server->ssocket;
497 wake_up(&server->response_q);
498 continue;
499 }
500 708
501 /* else length ok */ 709 /* else length ok */
502 reconnect = 0;
503
504 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { 710 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
505 isLargeBuf = true; 711 isLargeBuf = true;
506 memcpy(bigbuf, smallbuf, 4); 712 memcpy(bigbuf, smallbuf, 4);
507 smb_buffer = bigbuf; 713 smb_buffer = (struct smb_hdr *)bigbuf;
714 buf = bigbuf;
508 } 715 }
509 length = 0; 716
510 iov.iov_base = 4 + (char *)smb_buffer; 717 iov.iov_base = 4 + buf;
511 iov.iov_len = pdu_length; 718 iov.iov_len = pdu_length;
512 for (total_read = 0; total_read < pdu_length; 719 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
513 total_read += length) { 720 &total_read, false);
514 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 721 if (rc == 2)
515 pdu_length - total_read, 0);
516 if (server->tcpStatus == CifsExiting) {
517 /* then will exit */
518 reconnect = 2;
519 break;
520 } else if (server->tcpStatus == CifsNeedReconnect) {
521 cifs_reconnect(server);
522 csocket = server->ssocket;
523 /* Reconnect wakes up rspns q */
524 /* Now we will reread sock */
525 reconnect = 1;
526 break;
527 } else if (length == -ERESTARTSYS ||
528 length == -EAGAIN ||
529 length == -EINTR) {
530 msleep(1); /* minimum sleep to prevent looping,
531 allowing socket to clear and app
532 threads to set tcpStatus
533 CifsNeedReconnect if server hung*/
534 length = 0;
535 continue;
536 } else if (length <= 0) {
537 cERROR(1, "Received no data, expecting %d",
538 pdu_length - total_read);
539 cifs_reconnect(server);
540 csocket = server->ssocket;
541 reconnect = 1;
542 break;
543 }
544 }
545 if (reconnect == 2)
546 break; 722 break;
547 else if (reconnect == 1) 723 else if (rc == 1)
548 continue; 724 continue;
549 725
550 total_read += 4; /* account for rfc1002 hdr */ 726 total_read += 4; /* account for rfc1002 hdr */
@@ -562,75 +738,13 @@ incomplete_rcv:
562 */ 738 */
563 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); 739 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
564 if (length != 0) 740 if (length != 0)
565 cifs_dump_mem("Bad SMB: ", smb_buffer, 741 cifs_dump_mem("Bad SMB: ", buf,
566 min_t(unsigned int, total_read, 48)); 742 min_t(unsigned int, total_read, 48));
567 743
568 mid_entry = NULL;
569 server->lstrp = jiffies; 744 server->lstrp = jiffies;
570 745
571 spin_lock(&GlobalMid_Lock); 746 mid_entry = find_cifs_mid(server, smb_buffer, &length,
572 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { 747 isLargeBuf, &isMultiRsp, &bigbuf);
573 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
574
575 if (mid_entry->mid != smb_buffer->Mid ||
576 mid_entry->midState != MID_REQUEST_SUBMITTED ||
577 mid_entry->command != smb_buffer->Command) {
578 mid_entry = NULL;
579 continue;
580 }
581
582 if (length == 0 &&
583 check2ndT2(smb_buffer, server->maxBuf) > 0) {
584 /* We have a multipart transact2 resp */
585 isMultiRsp = true;
586 if (mid_entry->resp_buf) {
587 /* merge response - fix up 1st*/
588 length = coalesce_t2(smb_buffer,
589 mid_entry->resp_buf);
590 if (length > 0) {
591 length = 0;
592 mid_entry->multiRsp = true;
593 break;
594 } else {
595 /* all parts received or
596 * packet is malformed
597 */
598 mid_entry->multiEnd = true;
599 goto multi_t2_fnd;
600 }
601 } else {
602 if (!isLargeBuf) {
603 /*
604 * FIXME: switch to already
605 * allocated largebuf?
606 */
607 cERROR(1, "1st trans2 resp "
608 "needs bigbuf");
609 } else {
610 /* Have first buffer */
611 mid_entry->resp_buf =
612 smb_buffer;
613 mid_entry->largeBuf = true;
614 bigbuf = NULL;
615 }
616 }
617 break;
618 }
619 mid_entry->resp_buf = smb_buffer;
620 mid_entry->largeBuf = isLargeBuf;
621multi_t2_fnd:
622 if (length == 0)
623 mid_entry->midState = MID_RESPONSE_RECEIVED;
624 else
625 mid_entry->midState = MID_RESPONSE_MALFORMED;
626#ifdef CONFIG_CIFS_STATS2
627 mid_entry->when_received = jiffies;
628#endif
629 list_del_init(&mid_entry->qhead);
630 break;
631 }
632 spin_unlock(&GlobalMid_Lock);
633
634 if (mid_entry != NULL) { 748 if (mid_entry != NULL) {
635 mid_entry->callback(mid_entry); 749 mid_entry->callback(mid_entry);
636 /* Was previous buf put in mpx struct for multi-rsp? */ 750 /* Was previous buf put in mpx struct for multi-rsp? */
@@ -648,7 +762,7 @@ multi_t2_fnd:
648 !isMultiRsp) { 762 !isMultiRsp) {
649 cERROR(1, "No task to wake, unknown frame received! " 763 cERROR(1, "No task to wake, unknown frame received! "
650 "NumMids %d", atomic_read(&midCount)); 764 "NumMids %d", atomic_read(&midCount));
651 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 765 cifs_dump_mem("Received Data is: ", buf,
652 sizeof(struct smb_hdr)); 766 sizeof(struct smb_hdr));
653#ifdef CONFIG_CIFS_DEBUG2 767#ifdef CONFIG_CIFS_DEBUG2
654 cifs_dump_detail(smb_buffer); 768 cifs_dump_detail(smb_buffer);
@@ -658,88 +772,13 @@ multi_t2_fnd:
658 } 772 }
659 } /* end while !EXITING */ 773 } /* end while !EXITING */
660 774
661 /* take it off the list, if it's not already */
662 spin_lock(&cifs_tcp_ses_lock);
663 list_del_init(&server->tcp_ses_list);
664 spin_unlock(&cifs_tcp_ses_lock);
665
666 spin_lock(&GlobalMid_Lock);
667 server->tcpStatus = CifsExiting;
668 spin_unlock(&GlobalMid_Lock);
669 wake_up_all(&server->response_q);
670
671 /* check if we have blocked requests that need to free */
672 /* Note that cifs_max_pending is normally 50, but
673 can be set at module install time to as little as two */
674 spin_lock(&GlobalMid_Lock);
675 if (atomic_read(&server->inFlight) >= cifs_max_pending)
676 atomic_set(&server->inFlight, cifs_max_pending - 1);
677 /* We do not want to set the max_pending too low or we
678 could end up with the counter going negative */
679 spin_unlock(&GlobalMid_Lock);
680 /* Although there should not be any requests blocked on
681 this queue it can not hurt to be paranoid and try to wake up requests
682 that may haven been blocked when more than 50 at time were on the wire
683 to the same server - they now will see the session is in exit state
684 and get out of SendReceive. */
685 wake_up_all(&server->request_q);
686 /* give those requests time to exit */
687 msleep(125);
688
689 if (server->ssocket) {
690 sock_release(csocket);
691 server->ssocket = NULL;
692 }
693 /* buffer usually freed in free_mid - need to free it here on exit */ 775 /* buffer usually freed in free_mid - need to free it here on exit */
694 cifs_buf_release(bigbuf); 776 cifs_buf_release(bigbuf);
695 if (smallbuf) /* no sense logging a debug message if NULL */ 777 if (smallbuf) /* no sense logging a debug message if NULL */
696 cifs_small_buf_release(smallbuf); 778 cifs_small_buf_release(smallbuf);
697 779
698 if (!list_empty(&server->pending_mid_q)) {
699 struct list_head dispose_list;
700
701 INIT_LIST_HEAD(&dispose_list);
702 spin_lock(&GlobalMid_Lock);
703 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
704 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
705 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
706 mid_entry->midState = MID_SHUTDOWN;
707 list_move(&mid_entry->qhead, &dispose_list);
708 }
709 spin_unlock(&GlobalMid_Lock);
710
711 /* now walk dispose list and issue callbacks */
712 list_for_each_safe(tmp, tmp2, &dispose_list) {
713 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
714 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
715 list_del_init(&mid_entry->qhead);
716 mid_entry->callback(mid_entry);
717 }
718 /* 1/8th of sec is more than enough time for them to exit */
719 msleep(125);
720 }
721
722 if (!list_empty(&server->pending_mid_q)) {
723 /* mpx threads have not exited yet give them
724 at least the smb send timeout time for long ops */
725 /* due to delays on oplock break requests, we need
726 to wait at least 45 seconds before giving up
727 on a request getting a response and going ahead
728 and killing cifsd */
729 cFYI(1, "Wait for exit from demultiplex thread");
730 msleep(46000);
731 /* if threads still have not exited they are probably never
732 coming home not much else we can do but free the memory */
733 }
734
735 kfree(server->hostname);
736 task_to_wake = xchg(&server->tsk, NULL); 780 task_to_wake = xchg(&server->tsk, NULL);
737 kfree(server); 781 clean_demultiplex_info(server);
738
739 length = atomic_dec_return(&tcpSesAllocCount);
740 if (length > 0)
741 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
742 GFP_KERNEL);
743 782
744 /* if server->tsk was NULL then wait for a signal before exiting */ 783 /* if server->tsk was NULL then wait for a signal before exiting */
745 if (!task_to_wake) { 784 if (!task_to_wake) {
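
The rewritten receive loop replaces two open-coded kernel_recvmsg() blocks with read_from_socket(), whose return value encodes the caller's next step: 3 means a short header read that resumes at incomplete_rcv, 2 means the server is exiting and the thread breaks out, 1 means a reconnect happened and the loop restarts, and 0 means the requested bytes all arrived. The helper itself is defined outside this hunk; a rough sketch of that contract, with the real helper's EAGAIN/ERESTARTSYS retry handling elided, might look like:

    /* Sketch only; the return codes match the call sites above. */
    static int read_from_socket(struct TCP_Server_Info *server,
                                struct msghdr *smb_msg, struct kvec *iov,
                                unsigned int to_read, unsigned int *total_read,
                                bool is_header_read)
    {
            int length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
                                        to_read, 0);

            if (server->tcpStatus == CifsExiting)
                    return 2;               /* caller breaks out of the loop */
            if (server->tcpStatus == CifsNeedReconnect || length <= 0) {
                    cifs_reconnect(server);
                    return 1;               /* caller restarts the loop */
            }
            *total_read += length;
            if (is_header_read && *total_read < to_read)
                    return 3;               /* resume at incomplete_rcv */
            return 0;                       /* full read completed */
    }
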
@@ -2839,7 +2878,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
2839 kfree(volume_info->username); 2878 kfree(volume_info->username);
2840 kzfree(volume_info->password); 2879 kzfree(volume_info->password);
2841 kfree(volume_info->UNC); 2880 kfree(volume_info->UNC);
2842 kfree(volume_info->UNCip); 2881 if (volume_info->UNCip != volume_info->UNC + 2)
2882 kfree(volume_info->UNCip);
2843 kfree(volume_info->domainname); 2883 kfree(volume_info->domainname);
2844 kfree(volume_info->iocharset); 2884 kfree(volume_info->iocharset);
2845 kfree(volume_info->prepath); 2885 kfree(volume_info->prepath);
@@ -3193,15 +3233,9 @@ mount_fail_check:
3193 else 3233 else
3194 cifs_put_tcp_session(srvTcp); 3234 cifs_put_tcp_session(srvTcp);
3195 bdi_destroy(&cifs_sb->bdi); 3235 bdi_destroy(&cifs_sb->bdi);
3196 goto out;
3197 } 3236 }
3198 3237
3199 /* volume_info->password is freed above when existing session found
3200 (in which case it is not needed anymore) but when new sesion is created
3201 the password ptr is put in the new session structure (in which case the
3202 password will be freed at unmount time) */
3203out: 3238out:
3204 /* zero out password before freeing */
3205 FreeXid(xid); 3239 FreeXid(xid);
3206 return rc; 3240 return rc;
3207} 3241}
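
The new test in cleanup_volume_info_contents() guards against a double free: UNCip is not always its own allocation. When no ip= mount option is given, the parsing code can point UNCip into the UNC string itself, just past the leading two backslashes (hence UNC + 2), so only an independently allocated UNCip may be passed to kfree(). In miniature (illustrative values only):

    char *UNC = kstrdup("\\\\server\\share", GFP_KERNEL);
    char *UNCip = UNC + 2;      /* aliases "server\share"; not a new allocation */

    /* ... later, on cleanup ... */
    if (UNCip != UNC + 2)       /* free only what was allocated separately */
            kfree(UNCip);
    kfree(UNC);
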
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 499f27fc8576..72d448bf96ce 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry)
57 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 57 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
58 unsigned seq; 58 unsigned seq;
59 59
60 if (direntry == NULL)
61 return NULL; /* not much we can do if dentry is freed and
62 we need to reopen the file after it was closed implicitly
63 when the server crashed */
64
65 dirsep = CIFS_DIR_SEP(cifs_sb); 60 dirsep = CIFS_DIR_SEP(cifs_sb);
66 if (tcon->Flags & SMB_SHARE_IS_IN_DFS) 61 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
67 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); 62 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
@@ -110,8 +105,8 @@ cifs_bp_rename_retry:
110 } 105 }
111 rcu_read_unlock(); 106 rcu_read_unlock();
112 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { 107 if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
113 cERROR(1, "did not end path lookup where expected namelen is %d", 108 cFYI(1, "did not end path lookup where expected. namelen=%d "
114 namelen); 109 "dfsplen=%d", namelen, dfsplen);
115 /* presumably this is only possible if racing with a rename 110 /* presumably this is only possible if racing with a rename
116 of one of the parent directories (we can not lock the dentries 111 of one of the parent directories (we can not lock the dentries
117 above us to prevent this, but retrying should be harmless) */ 112 above us to prevent this, but retrying should be harmless) */
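
The message demoted from cERROR to cFYI above fires in build_path_from_dentry() when a lockless walk up d_parent races with a concurrent rename; the function retries under rename_lock's sequence counter rather than treating the race as an error. The standard seqlock read-side pattern it relies on, as a generic sketch:

    unsigned seq;

    do {
            seq = read_seqbegin(&rename_lock);
            /* lockless walk up the dentry chain, assembling the path */
    } while (read_seqretry(&rename_lock, seq));  /* a rename ran: redo the walk */
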
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 548f06230a6d..1d2d91d9bf65 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
79 /* Perform the upcall */ 79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); 80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0) 81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s", 82 cFYI(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname); 83 __func__, len, len, hostname);
84 else 84 else
85 cFYI(1, "%s: resolved: %*.*s to %s", 85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr); 86 __func__, len, len, hostname, *ip_addr);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 378acdafa356..9f41a10523a1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -314,6 +314,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
314 } 314 }
315 spin_unlock(&cifs_file_list_lock); 315 spin_unlock(&cifs_file_list_lock);
316 316
317 cancel_work_sync(&cifs_file->oplock_break);
318
317 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 319 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
318 int xid, rc; 320 int xid, rc;
319 321
@@ -2418,31 +2420,6 @@ void cifs_oplock_break(struct work_struct *work)
2418 cinode->clientCanCacheRead ? 1 : 0); 2420 cinode->clientCanCacheRead ? 1 : 0);
2419 cFYI(1, "Oplock release rc = %d", rc); 2421 cFYI(1, "Oplock release rc = %d", rc);
2420 } 2422 }
2421
2422 /*
2423 * We might have kicked in before is_valid_oplock_break()
2424 * finished grabbing reference for us. Make sure it's done by
2425 * waiting for cifs_file_list_lock.
2426 */
2427 spin_lock(&cifs_file_list_lock);
2428 spin_unlock(&cifs_file_list_lock);
2429
2430 cifs_oplock_break_put(cfile);
2431}
2432
2433/* must be called while holding cifs_file_list_lock */
2434void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2435{
2436 cifs_sb_active(cfile->dentry->d_sb);
2437 cifsFileInfo_get(cfile);
2438}
2439
2440void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2441{
2442 struct super_block *sb = cfile->dentry->d_sb;
2443
2444 cifsFileInfo_put(cfile);
2445 cifs_sb_deactive(sb);
2446} 2423}
2447 2424
2448const struct address_space_operations cifs_addr_ops = { 2425const struct address_space_operations cifs_addr_ops = {
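
This change retires the cifs_oplock_break_get()/cifs_oplock_break_put() reference dance (and the matching queue_work() dance in misc.c below) in favor of one cancel_work_sync() call in cifsFileInfo_put(): once the file is unlinked from the lists, the put path waits for any queued or still-running oplock_break work before the structure can be freed, so the work item no longer needs to pin the file. A condensed sketch of the resulting ordering:

    spin_lock(&cifs_file_list_lock);
    list_del(&cifs_file->flist);        /* no new oplock break can find us */
    spin_unlock(&cifs_file_list_lock);

    cancel_work_sync(&cifs_file->oplock_break); /* wait out a running worker */

    /* server-side close and kfree(cifs_file) are now safe */
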
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9b018c8334fa..a7b2dcd4a53e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
764 if (full_path == NULL) 764 if (full_path == NULL)
765 return full_path; 765 return full_path;
766 766
767 if (dfsplen) { 767 if (dfsplen)
768 strncpy(full_path, tcon->treeName, dfsplen); 768 strncpy(full_path, tcon->treeName, dfsplen);
769 /* switch slash direction in prepath depending on whether
770 * windows or posix style path names
771 */
772 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
773 int i;
774 for (i = 0; i < dfsplen; i++) {
775 if (full_path[i] == '\\')
776 full_path[i] = '/';
777 }
778 }
779 }
780 strncpy(full_path + dfsplen, vol->prepath, pplen); 769 strncpy(full_path + dfsplen, vol->prepath, pplen);
770 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
781 full_path[dfsplen + pplen] = 0; /* add trailing null */ 771 full_path[dfsplen + pplen] = 0; /* add trailing null */
782 return full_path; 772 return full_path;
783} 773}
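
cifs_build_path_to_root() now normalizes separators with convert_delimiter() over the whole assembled path instead of hand-flipping slashes in the DFS prefix only. Assuming the helper keeps its usual cifs shape (replace every occurrence of the opposite separator in place), it is roughly:

    /* Sketch of the helper this hunk relies on. */
    static inline void convert_delimiter(char *path, char delim)
    {
            char old = (delim == '/') ? '\\' : '/';
            char *pos;

            for (pos = path; *pos; pos++)
                    if (*pos == old)
                            *pos = delim;
    }

    /* usage, as above: pick the separator the mount is configured for */
    convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
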
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 556b1a0b54de..db3f18cdf024 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
74 cERROR(1, "%s: Could not init md5 shash\n", __func__); 74 cERROR(1, "%s: Could not init md5 shash\n", __func__);
75 goto symlink_hash_err; 75 goto symlink_hash_err;
76 } 76 }
77 crypto_shash_update(&sdescmd5->shash, link_str, link_len); 77 rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
78 if (rc) {
79 cERROR(1, "%s: Could not update with link_str\n", __func__);
80 goto symlink_hash_err;
81 }
78 rc = crypto_shash_final(&sdescmd5->shash, md5_hash); 82 rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
83 if (rc)
84 cERROR(1, "%s: Could not generate md5 hash\n", __func__);
79 85
80symlink_hash_err: 86symlink_hash_err:
81 crypto_free_shash(md5); 87 crypto_free_shash(md5);
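
This fix, like the matching one in smbencrypt.c below, applies the rule that every step of a shash computation can fail and must be checked, not just crypto_shash_final(). Pulled out of the cifs context, the full allocate/init/update/final pattern looks roughly like this self-contained helper (a sketch; descriptor handling details vary by kernel version):

    #include <crypto/hash.h>
    #include <linux/err.h>
    #include <linux/slab.h>

    static int md5_digest(const u8 *data, unsigned int len, u8 *out)
    {
            struct crypto_shash *md5;
            struct shash_desc *desc;
            int rc;

            md5 = crypto_alloc_shash("md5", 0, 0);
            if (IS_ERR(md5))
                    return PTR_ERR(md5);

            desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(md5),
                           GFP_KERNEL);
            if (!desc) {
                    rc = -ENOMEM;
                    goto free_tfm;
            }
            desc->tfm = md5;

            rc = crypto_shash_init(desc);
            if (rc)
                    goto free_desc;
            rc = crypto_shash_update(desc, data, len); /* checked, unlike before */
            if (rc)
                    goto free_desc;
            rc = crypto_shash_final(desc, out);

    free_desc:
            kfree(desc);
    free_tfm:
            crypto_free_shash(md5);
            return rc;
    }
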
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 03a1f491d39b..7c1693392598 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -585,15 +585,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
585 585
586 cifs_set_oplock_level(pCifsInode, 586 cifs_set_oplock_level(pCifsInode,
587 pSMB->OplockLevel ? OPLOCK_READ : 0); 587 pSMB->OplockLevel ? OPLOCK_READ : 0);
588 /* 588 queue_work(system_nrt_wq,
589 * cifs_oplock_break_put() can't be called 589 &netfile->oplock_break);
590 * from here. Get reference after queueing
591 * succeeded. cifs_oplock_break() will
592 * synchronize using cifs_file_list_lock.
593 */
594 if (queue_work(system_nrt_wq,
595 &netfile->oplock_break))
596 cifs_oplock_break_get(netfile);
597 netfile->oplock_break_cancelled = false; 590 netfile->oplock_break_cancelled = false;
598 591
599 spin_unlock(&cifs_file_list_lock); 592 spin_unlock(&cifs_file_list_lock);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 965a3af186a1..5de03ec20144 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -4,6 +4,7 @@
4 * Directory search handling 4 * Directory search handling
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2004, 2008 6 * Copyright (C) International Business Machines Corp., 2004, 2008
7 * Copyright (C) Red Hat, Inc., 2011
7 * Author(s): Steve French (sfrench@us.ibm.com) 8 * Author(s): Steve French (sfrench@us.ibm.com)
8 * 9 *
9 * This library is free software; you can redistribute it and/or modify 10 * This library is free software; you can redistribute it and/or modify
@@ -290,10 +291,10 @@ error_exit:
290} 291}
291 292
292/* return length of unicode string in bytes */ 293/* return length of unicode string in bytes */
293static int cifs_unicode_bytelen(char *str) 294static int cifs_unicode_bytelen(const char *str)
294{ 295{
295 int len; 296 int len;
296 __le16 *ustr = (__le16 *)str; 297 const __le16 *ustr = (const __le16 *)str;
297 298
298 for (len = 0; len <= PATH_MAX; len++) { 299 for (len = 0; len <= PATH_MAX; len++) {
299 if (ustr[len] == 0) 300 if (ustr[len] == 0)
@@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
334 335
335} 336}
336 337
338struct cifs_dirent {
339 const char *name;
340 size_t namelen;
341 u32 resume_key;
342 u64 ino;
343};
344
345static void cifs_fill_dirent_unix(struct cifs_dirent *de,
346 const FILE_UNIX_INFO *info, bool is_unicode)
347{
348 de->name = &info->FileName[0];
349 if (is_unicode)
350 de->namelen = cifs_unicode_bytelen(de->name);
351 else
352 de->namelen = strnlen(de->name, PATH_MAX);
353 de->resume_key = info->ResumeKey;
354 de->ino = le64_to_cpu(info->basic.UniqueId);
355}
356
357static void cifs_fill_dirent_dir(struct cifs_dirent *de,
358 const FILE_DIRECTORY_INFO *info)
359{
360 de->name = &info->FileName[0];
361 de->namelen = le32_to_cpu(info->FileNameLength);
362 de->resume_key = info->FileIndex;
363}
364
365static void cifs_fill_dirent_full(struct cifs_dirent *de,
366 const FILE_FULL_DIRECTORY_INFO *info)
367{
368 de->name = &info->FileName[0];
369 de->namelen = le32_to_cpu(info->FileNameLength);
370 de->resume_key = info->FileIndex;
371}
372
373static void cifs_fill_dirent_search(struct cifs_dirent *de,
374 const SEARCH_ID_FULL_DIR_INFO *info)
375{
376 de->name = &info->FileName[0];
377 de->namelen = le32_to_cpu(info->FileNameLength);
378 de->resume_key = info->FileIndex;
379 de->ino = le64_to_cpu(info->UniqueId);
380}
381
382static void cifs_fill_dirent_both(struct cifs_dirent *de,
383 const FILE_BOTH_DIRECTORY_INFO *info)
384{
385 de->name = &info->FileName[0];
386 de->namelen = le32_to_cpu(info->FileNameLength);
387 de->resume_key = info->FileIndex;
388}
389
390static void cifs_fill_dirent_std(struct cifs_dirent *de,
391 const FIND_FILE_STANDARD_INFO *info)
392{
393 de->name = &info->FileName[0];
394 /* one byte length, no endianness conversion */
395 de->namelen = info->FileNameLength;
396 de->resume_key = info->ResumeKey;
397}
398
399static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
400 u16 level, bool is_unicode)
401{
402 memset(de, 0, sizeof(*de));
403
404 switch (level) {
405 case SMB_FIND_FILE_UNIX:
406 cifs_fill_dirent_unix(de, info, is_unicode);
407 break;
408 case SMB_FIND_FILE_DIRECTORY_INFO:
409 cifs_fill_dirent_dir(de, info);
410 break;
411 case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
412 cifs_fill_dirent_full(de, info);
413 break;
414 case SMB_FIND_FILE_ID_FULL_DIR_INFO:
415 cifs_fill_dirent_search(de, info);
416 break;
417 case SMB_FIND_FILE_BOTH_DIRECTORY_INFO:
418 cifs_fill_dirent_both(de, info);
419 break;
420 case SMB_FIND_FILE_INFO_STANDARD:
421 cifs_fill_dirent_std(de, info);
422 break;
423 default:
424 cFYI(1, "Unknown findfirst level %d", level);
425 return -EINVAL;
426 }
427
428 return 0;
429}
430
337#define UNICODE_DOT cpu_to_le16(0x2e) 431#define UNICODE_DOT cpu_to_le16(0x2e)
338 432
339/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ 433/* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */
340static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) 434static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
341{ 435{
342 int rc = 0; 436 int rc = 0;
343 char *filename = NULL;
344 int len = 0;
345
346 if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
347 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
348 filename = &pFindData->FileName[0];
349 if (cfile->srch_inf.unicode) {
350 len = cifs_unicode_bytelen(filename);
351 } else {
352 /* BB should we make this strnlen of PATH_MAX? */
353 len = strnlen(filename, 5);
354 }
355 } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
356 FILE_DIRECTORY_INFO *pFindData =
357 (FILE_DIRECTORY_INFO *)current_entry;
358 filename = &pFindData->FileName[0];
359 len = le32_to_cpu(pFindData->FileNameLength);
360 } else if (cfile->srch_inf.info_level ==
361 SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
362 FILE_FULL_DIRECTORY_INFO *pFindData =
363 (FILE_FULL_DIRECTORY_INFO *)current_entry;
364 filename = &pFindData->FileName[0];
365 len = le32_to_cpu(pFindData->FileNameLength);
366 } else if (cfile->srch_inf.info_level ==
367 SMB_FIND_FILE_ID_FULL_DIR_INFO) {
368 SEARCH_ID_FULL_DIR_INFO *pFindData =
369 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
370 filename = &pFindData->FileName[0];
371 len = le32_to_cpu(pFindData->FileNameLength);
372 } else if (cfile->srch_inf.info_level ==
373 SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
374 FILE_BOTH_DIRECTORY_INFO *pFindData =
375 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
376 filename = &pFindData->FileName[0];
377 len = le32_to_cpu(pFindData->FileNameLength);
378 } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
379 FIND_FILE_STANDARD_INFO *pFindData =
380 (FIND_FILE_STANDARD_INFO *)current_entry;
381 filename = &pFindData->FileName[0];
382 len = pFindData->FileNameLength;
383 } else {
384 cFYI(1, "Unknown findfirst level %d",
385 cfile->srch_inf.info_level);
386 }
387 437
388 if (filename) { 438 if (!de->name)
389 if (cfile->srch_inf.unicode) { 439 return 0;
390 __le16 *ufilename = (__le16 *)filename; 440
391 if (len == 2) { 441 if (is_unicode) {
392 /* check for . */ 442 __le16 *ufilename = (__le16 *)de->name;
393 if (ufilename[0] == UNICODE_DOT) 443 if (de->namelen == 2) {
394 rc = 1; 444 /* check for . */
395 } else if (len == 4) { 445 if (ufilename[0] == UNICODE_DOT)
396 /* check for .. */ 446 rc = 1;
397 if ((ufilename[0] == UNICODE_DOT) 447 } else if (de->namelen == 4) {
398 && (ufilename[1] == UNICODE_DOT)) 448 /* check for .. */
399 rc = 2; 449 if (ufilename[0] == UNICODE_DOT &&
400 } 450 ufilename[1] == UNICODE_DOT)
401 } else /* ASCII */ { 451 rc = 2;
402 if (len == 1) { 452 }
403 if (filename[0] == '.') 453 } else /* ASCII */ {
404 rc = 1; 454 if (de->namelen == 1) {
405 } else if (len == 2) { 455 if (de->name[0] == '.')
406 if ((filename[0] == '.') && (filename[1] == '.')) 456 rc = 1;
407 rc = 2; 457 } else if (de->namelen == 2) {
408 } 458 if (de->name[0] == '.' && de->name[1] == '.')
459 rc = 2;
409 } 460 }
410 } 461 }
411 462
@@ -427,66 +478,18 @@ static int is_dir_changed(struct file *file)
427} 478}
428 479
429static int cifs_save_resume_key(const char *current_entry, 480static int cifs_save_resume_key(const char *current_entry,
430 struct cifsFileInfo *cifsFile) 481 struct cifsFileInfo *file_info)
431{ 482{
432 int rc = 0; 483 struct cifs_dirent de;
433 unsigned int len = 0; 484 int rc;
434 __u16 level;
435 char *filename;
436
437 if ((cifsFile == NULL) || (current_entry == NULL))
438 return -EINVAL;
439
440 level = cifsFile->srch_inf.info_level;
441
442 if (level == SMB_FIND_FILE_UNIX) {
443 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
444 485
445 filename = &pFindData->FileName[0]; 486 rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level,
446 if (cifsFile->srch_inf.unicode) { 487 file_info->srch_inf.unicode);
447 len = cifs_unicode_bytelen(filename); 488 if (!rc) {
448 } else { 489 file_info->srch_inf.presume_name = de.name;
449 /* BB should we make this strnlen of PATH_MAX? */ 490 file_info->srch_inf.resume_name_len = de.namelen;
450 len = strnlen(filename, PATH_MAX); 491 file_info->srch_inf.resume_key = de.resume_key;
451 }
452 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
453 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
454 FILE_DIRECTORY_INFO *pFindData =
455 (FILE_DIRECTORY_INFO *)current_entry;
456 filename = &pFindData->FileName[0];
457 len = le32_to_cpu(pFindData->FileNameLength);
458 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
459 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
460 FILE_FULL_DIRECTORY_INFO *pFindData =
461 (FILE_FULL_DIRECTORY_INFO *)current_entry;
462 filename = &pFindData->FileName[0];
463 len = le32_to_cpu(pFindData->FileNameLength);
464 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
465 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
466 SEARCH_ID_FULL_DIR_INFO *pFindData =
467 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
468 filename = &pFindData->FileName[0];
469 len = le32_to_cpu(pFindData->FileNameLength);
470 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
471 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
472 FILE_BOTH_DIRECTORY_INFO *pFindData =
473 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
474 filename = &pFindData->FileName[0];
475 len = le32_to_cpu(pFindData->FileNameLength);
476 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
477 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
478 FIND_FILE_STANDARD_INFO *pFindData =
479 (FIND_FILE_STANDARD_INFO *)current_entry;
480 filename = &pFindData->FileName[0];
481 /* one byte length, no name conversion */
482 len = (unsigned int)pFindData->FileNameLength;
483 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
484 } else {
485 cFYI(1, "Unknown findfirst level %d", level);
486 return -EINVAL;
487 } 492 }
488 cifsFile->srch_inf.resume_name_len = len;
489 cifsFile->srch_inf.presume_name = filename;
490 return rc; 493 return rc;
491} 494}
492 495
@@ -605,136 +608,70 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon,
605 return rc; 608 return rc;
606} 609}
607 610
608/* inode num, inode type and filename returned */ 611static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
609static int cifs_get_name_from_search_buf(struct qstr *pqst, 612 void *dirent, char *scratch_buf, unsigned int max_len)
610 char *current_entry, __u16 level, unsigned int unicode,
611 struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum)
612{ 613{
614 struct cifsFileInfo *file_info = file->private_data;
615 struct super_block *sb = file->f_path.dentry->d_sb;
616 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
617 struct cifs_dirent de = { NULL, };
618 struct cifs_fattr fattr;
619 struct dentry *dentry;
620 struct qstr name;
613 int rc = 0; 621 int rc = 0;
614 unsigned int len = 0; 622 ino_t ino;
615 char *filename;
616 struct nls_table *nlt = cifs_sb->local_nls;
617
618 *pinum = 0;
619
620 if (level == SMB_FIND_FILE_UNIX) {
621 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
622
623 filename = &pFindData->FileName[0];
624 if (unicode) {
625 len = cifs_unicode_bytelen(filename);
626 } else {
627 /* BB should we make this strnlen of PATH_MAX? */
628 len = strnlen(filename, PATH_MAX);
629 }
630 623
631 *pinum = le64_to_cpu(pFindData->basic.UniqueId); 624 rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
632 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { 625 file_info->srch_inf.unicode);
633 FILE_DIRECTORY_INFO *pFindData = 626 if (rc)
634 (FILE_DIRECTORY_INFO *)current_entry; 627 return rc;
635 filename = &pFindData->FileName[0];
636 len = le32_to_cpu(pFindData->FileNameLength);
637 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
638 FILE_FULL_DIRECTORY_INFO *pFindData =
639 (FILE_FULL_DIRECTORY_INFO *)current_entry;
640 filename = &pFindData->FileName[0];
641 len = le32_to_cpu(pFindData->FileNameLength);
642 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
643 SEARCH_ID_FULL_DIR_INFO *pFindData =
644 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
645 filename = &pFindData->FileName[0];
646 len = le32_to_cpu(pFindData->FileNameLength);
647 *pinum = le64_to_cpu(pFindData->UniqueId);
648 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
649 FILE_BOTH_DIRECTORY_INFO *pFindData =
650 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
651 filename = &pFindData->FileName[0];
652 len = le32_to_cpu(pFindData->FileNameLength);
653 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
654 FIND_FILE_STANDARD_INFO *pFindData =
655 (FIND_FILE_STANDARD_INFO *)current_entry;
656 filename = &pFindData->FileName[0];
657 /* one byte length, no name conversion */
658 len = (unsigned int)pFindData->FileNameLength;
659 } else {
660 cFYI(1, "Unknown findfirst level %d", level);
661 return -EINVAL;
662 }
663 628
664 if (len > max_len) { 629 if (de.namelen > max_len) {
665 cERROR(1, "bad search response length %d past smb end", len); 630 cERROR(1, "bad search response length %zd past smb end",
631 de.namelen);
666 return -EINVAL; 632 return -EINVAL;
667 } 633 }
668 634
669 if (unicode) {
670 pqst->len = cifs_from_ucs2((char *) pqst->name,
671 (__le16 *) filename,
672 UNICODE_NAME_MAX,
673 min(len, max_len), nlt,
674 cifs_sb->mnt_cifs_flags &
675 CIFS_MOUNT_MAP_SPECIAL_CHR);
676 pqst->len -= nls_nullsize(nlt);
677 } else {
678 pqst->name = filename;
679 pqst->len = len;
680 }
681 return rc;
682}
683
684static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
685 void *direntry, char *scratch_buf, unsigned int max_len)
686{
687 int rc = 0;
688 struct qstr qstring;
689 struct cifsFileInfo *pCifsF;
690 u64 inum;
691 ino_t ino;
692 struct super_block *sb;
693 struct cifs_sb_info *cifs_sb;
694 struct dentry *tmp_dentry;
695 struct cifs_fattr fattr;
696
697 /* get filename and len into qstring */
698 /* get dentry */
699 /* decide whether to create and populate ionde */
700 if ((direntry == NULL) || (file == NULL))
701 return -EINVAL;
702
703 pCifsF = file->private_data;
704
705 if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL))
706 return -ENOENT;
707
708 rc = cifs_entry_is_dot(pfindEntry, pCifsF);
709 /* skip . and .. since we added them first */ 635 /* skip . and .. since we added them first */
710 if (rc != 0) 636 if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode))
711 return 0; 637 return 0;
712 638
713 sb = file->f_path.dentry->d_sb; 639 if (file_info->srch_inf.unicode) {
714 cifs_sb = CIFS_SB(sb); 640 struct nls_table *nlt = cifs_sb->local_nls;
715
716 qstring.name = scratch_buf;
717 rc = cifs_get_name_from_search_buf(&qstring, pfindEntry,
718 pCifsF->srch_inf.info_level,
719 pCifsF->srch_inf.unicode, cifs_sb,
720 max_len, &inum /* returned */);
721 641
722 if (rc) 642 name.name = scratch_buf;
723 return rc; 643 name.len =
644 cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
645 UNICODE_NAME_MAX,
646 min(de.namelen, (size_t)max_len), nlt,
647 cifs_sb->mnt_cifs_flags &
648 CIFS_MOUNT_MAP_SPECIAL_CHR);
649 name.len -= nls_nullsize(nlt);
650 } else {
651 name.name = de.name;
652 name.len = de.namelen;
653 }
724 654
725 if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) 655 switch (file_info->srch_inf.info_level) {
656 case SMB_FIND_FILE_UNIX:
726 cifs_unix_basic_to_fattr(&fattr, 657 cifs_unix_basic_to_fattr(&fattr,
727 &((FILE_UNIX_INFO *) pfindEntry)->basic, 658 &((FILE_UNIX_INFO *)find_entry)->basic,
728 cifs_sb); 659 cifs_sb);
729 else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) 660 break;
730 cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) 661 case SMB_FIND_FILE_INFO_STANDARD:
731 pfindEntry, cifs_sb); 662 cifs_std_info_to_fattr(&fattr,
732 else 663 (FIND_FILE_STANDARD_INFO *)find_entry,
733 cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) 664 cifs_sb);
734 pfindEntry, cifs_sb); 665 break;
666 default:
667 cifs_dir_info_to_fattr(&fattr,
668 (FILE_DIRECTORY_INFO *)find_entry,
669 cifs_sb);
670 break;
671 }
735 672
736 if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { 673 if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
737 fattr.cf_uniqueid = inum; 674 fattr.cf_uniqueid = de.ino;
738 } else { 675 } else {
739 fattr.cf_uniqueid = iunique(sb, ROOT_I); 676 fattr.cf_uniqueid = iunique(sb, ROOT_I);
740 cifs_autodisable_serverino(cifs_sb); 677 cifs_autodisable_serverino(cifs_sb);
@@ -750,12 +687,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
750 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 687 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
751 688
752 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 689 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
753 tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); 690 dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
754 691
755 rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, 692 rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
756 ino, fattr.cf_dtype); 693 fattr.cf_dtype);
757 694
758 dput(tmp_dentry); 695 dput(dentry);
759 return rc; 696 return rc;
760} 697}
761 698
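
The net effect of the readdir rewrite is that three near-identical info-level ladders collapse into one normalized struct cifs_dirent plus a single cifs_fill_dirent() dispatcher, so cifs_entry_is_dot(), cifs_save_resume_key() and cifs_filldir() all decode search entries the same way. Typical use, following the new call sites:

    struct cifs_dirent de;
    int rc;

    rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
                          file_info->srch_inf.unicode);
    if (rc)
            return rc;          /* unknown findfirst level */

    /* de.name, de.namelen, de.resume_key and de.ino are level-independent */
    if (de.namelen > max_len)
            return -EINVAL;     /* name would run past the end of the SMB */
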
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 1c5b770c3141..42b9fff48751 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
157 cERROR(1, "%s: Could not init md4 shash\n", __func__); 157 cERROR(1, "%s: Could not init md4 shash\n", __func__);
158 goto mdfour_err; 158 goto mdfour_err;
159 } 159 }
160 crypto_shash_update(&sdescmd4->shash, link_str, link_len); 160 rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
161 if (rc) {
162 cERROR(1, "%s: Could not update with link_str\n", __func__);
163 goto mdfour_err;
164 }
161 rc = crypto_shash_final(&sdescmd4->shash, md4_hash); 165 rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
166 if (rc)
167 cERROR(1, "%s: Could not generate md4 hash\n", __func__);
162 168
163mdfour_err: 169mdfour_err:
164 crypto_free_shash(md4); 170 crypto_free_shash(md4);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 147aa22c3c3a..10ca6b2c26b7 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -266,15 +266,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
266 while (1) { 266 while (1) {
267 if (atomic_read(&server->inFlight) >= cifs_max_pending) { 267 if (atomic_read(&server->inFlight) >= cifs_max_pending) {
268 spin_unlock(&GlobalMid_Lock); 268 spin_unlock(&GlobalMid_Lock);
269#ifdef CONFIG_CIFS_STATS2 269 cifs_num_waiters_inc(server);
270 atomic_inc(&server->num_waiters);
271#endif
272 wait_event(server->request_q, 270 wait_event(server->request_q,
273 atomic_read(&server->inFlight) 271 atomic_read(&server->inFlight)
274 < cifs_max_pending); 272 < cifs_max_pending);
275#ifdef CONFIG_CIFS_STATS2 273 cifs_num_waiters_dec(server);
276 atomic_dec(&server->num_waiters);
277#endif
278 spin_lock(&GlobalMid_Lock); 274 spin_lock(&GlobalMid_Lock);
279 } else { 275 } else {
280 if (server->tcpStatus == CifsExiting) { 276 if (server->tcpStatus == CifsExiting) {
@@ -362,6 +358,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
362 mid = AllocMidQEntry(hdr, server); 358 mid = AllocMidQEntry(hdr, server);
363 if (mid == NULL) { 359 if (mid == NULL) {
364 mutex_unlock(&server->srv_mutex); 360 mutex_unlock(&server->srv_mutex);
361 atomic_dec(&server->inFlight);
362 wake_up(&server->request_q);
365 return -ENOMEM; 363 return -ENOMEM;
366 } 364 }
367 365
@@ -379,15 +377,13 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
379 mid->callback = callback; 377 mid->callback = callback;
380 mid->callback_data = cbdata; 378 mid->callback_data = cbdata;
381 mid->midState = MID_REQUEST_SUBMITTED; 379 mid->midState = MID_REQUEST_SUBMITTED;
382#ifdef CONFIG_CIFS_STATS2 380
383 atomic_inc(&server->inSend); 381 cifs_in_send_inc(server);
384#endif
385 rc = smb_sendv(server, iov, nvec); 382 rc = smb_sendv(server, iov, nvec);
386#ifdef CONFIG_CIFS_STATS2 383 cifs_in_send_dec(server);
387 atomic_dec(&server->inSend); 384 cifs_save_when_sent(mid);
388 mid->when_sent = jiffies;
389#endif
390 mutex_unlock(&server->srv_mutex); 385 mutex_unlock(&server->srv_mutex);
386
391 if (rc) 387 if (rc)
392 goto out_err; 388 goto out_err;
393 389
@@ -573,14 +569,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
573 } 569 }
574 570
575 midQ->midState = MID_REQUEST_SUBMITTED; 571 midQ->midState = MID_REQUEST_SUBMITTED;
576#ifdef CONFIG_CIFS_STATS2 572 cifs_in_send_inc(ses->server);
577 atomic_inc(&ses->server->inSend);
578#endif
579 rc = smb_sendv(ses->server, iov, n_vec); 573 rc = smb_sendv(ses->server, iov, n_vec);
580#ifdef CONFIG_CIFS_STATS2 574 cifs_in_send_dec(ses->server);
581 atomic_dec(&ses->server->inSend); 575 cifs_save_when_sent(midQ);
582 midQ->when_sent = jiffies;
583#endif
584 576
585 mutex_unlock(&ses->server->srv_mutex); 577 mutex_unlock(&ses->server->srv_mutex);
586 578
@@ -701,14 +693,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
701 } 693 }
702 694
703 midQ->midState = MID_REQUEST_SUBMITTED; 695 midQ->midState = MID_REQUEST_SUBMITTED;
704#ifdef CONFIG_CIFS_STATS2 696
705 atomic_inc(&ses->server->inSend); 697 cifs_in_send_inc(ses->server);
706#endif
707 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); 698 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
708#ifdef CONFIG_CIFS_STATS2 699 cifs_in_send_dec(ses->server);
709 atomic_dec(&ses->server->inSend); 700 cifs_save_when_sent(midQ);
710 midQ->when_sent = jiffies;
711#endif
712 mutex_unlock(&ses->server->srv_mutex); 701 mutex_unlock(&ses->server->srv_mutex);
713 702
714 if (rc < 0) 703 if (rc < 0)
@@ -841,14 +830,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
841 } 830 }
842 831
843 midQ->midState = MID_REQUEST_SUBMITTED; 832 midQ->midState = MID_REQUEST_SUBMITTED;
844#ifdef CONFIG_CIFS_STATS2 833 cifs_in_send_inc(ses->server);
845 atomic_inc(&ses->server->inSend);
846#endif
847 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); 834 rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
848#ifdef CONFIG_CIFS_STATS2 835 cifs_in_send_dec(ses->server);
849 atomic_dec(&ses->server->inSend); 836 cifs_save_when_sent(midQ);
850 midQ->when_sent = jiffies;
851#endif
852 mutex_unlock(&ses->server->srv_mutex); 837 mutex_unlock(&ses->server->srv_mutex);
853 838
854 if (rc < 0) { 839 if (rc < 0) {
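
Each transport.c hunk swaps a repeated #ifdef CONFIG_CIFS_STATS2 block for the cifs_num_waiters_inc/dec, cifs_in_send_inc/dec and cifs_save_when_sent helpers, keeping the hot paths ifdef-free. The helpers are defined elsewhere in the series; one plausible shape, mirroring the code removed here, is a set of static inlines that compile away when the option is off:

    #ifdef CONFIG_CIFS_STATS2
    static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
    {
            atomic_inc(&server->inSend);
    }
    static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
    {
            atomic_dec(&server->inSend);
    }
    static inline void cifs_save_when_sent(struct mid_q_entry *mid)
    {
            mid->when_sent = jiffies;
    }
    #else
    static inline void cifs_in_send_inc(struct TCP_Server_Info *server) {}
    static inline void cifs_in_send_dec(struct TCP_Server_Info *server) {}
    static inline void cifs_save_when_sent(struct mid_q_entry *mid) {}
    #endif
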
diff --git a/fs/compat.c b/fs/compat.c
index 0b48d018e38a..58b1da459893 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1675,11 +1675,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
1675} 1675}
1676#endif /* HAVE_SET_RESTORE_SIGMASK */ 1676#endif /* HAVE_SET_RESTORE_SIGMASK */
1677 1677
1678long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2)
1679{
1680 return sys_ni_syscall();
1681}
1682
1683#ifdef CONFIG_EPOLL 1678#ifdef CONFIG_EPOLL
1684 1679
1685#ifdef HAVE_SET_RESTORE_SIGMASK 1680#ifdef HAVE_SET_RESTORE_SIGMASK
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8be086e9abe4..51352de88ef1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT)
1003COMPATIBLE_IOCTL(PPPIOCDISCONN) 1003COMPATIBLE_IOCTL(PPPIOCDISCONN)
1004COMPATIBLE_IOCTL(PPPIOCATTCHAN) 1004COMPATIBLE_IOCTL(PPPIOCATTCHAN)
1005COMPATIBLE_IOCTL(PPPIOCGCHAN) 1005COMPATIBLE_IOCTL(PPPIOCGCHAN)
1006COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
1006/* PPPOX */ 1007/* PPPOX */
1007COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1008COMPATIBLE_IOCTL(PPPOEIOCSFWD)
1008COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1009COMPATIBLE_IOCTL(PPPOEIOCDFWD)
diff --git a/fs/dcache.c b/fs/dcache.c
index be18598c7fd7..a88948b8bd17 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
301 return parent; 301 return parent;
302} 302}
303 303
304/*
305 * Unhash a dentry without inserting an RCU walk barrier or checking that
306 * dentry->d_lock is locked. The caller must take care of that, if
307 * appropriate.
308 */
309static void __d_shrink(struct dentry *dentry)
310{
311 if (!d_unhashed(dentry)) {
312 struct hlist_bl_head *b;
313 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
314 b = &dentry->d_sb->s_anon;
315 else
316 b = d_hash(dentry->d_parent, dentry->d_name.hash);
317
318 hlist_bl_lock(b);
319 __hlist_bl_del(&dentry->d_hash);
320 dentry->d_hash.pprev = NULL;
321 hlist_bl_unlock(b);
322 }
323}
324
304/** 325/**
305 * d_drop - drop a dentry 326 * d_drop - drop a dentry
306 * @dentry: dentry to drop 327 * @dentry: dentry to drop
@@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
319void __d_drop(struct dentry *dentry) 340void __d_drop(struct dentry *dentry)
320{ 341{
321 if (!d_unhashed(dentry)) { 342 if (!d_unhashed(dentry)) {
322 struct hlist_bl_head *b; 343 __d_shrink(dentry);
323 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
324 b = &dentry->d_sb->s_anon;
325 else
326 b = d_hash(dentry->d_parent, dentry->d_name.hash);
327
328 hlist_bl_lock(b);
329 __hlist_bl_del(&dentry->d_hash);
330 dentry->d_hash.pprev = NULL;
331 hlist_bl_unlock(b);
332
333 dentry_rcuwalk_barrier(dentry); 344 dentry_rcuwalk_barrier(dentry);
334 } 345 }
335} 346}
@@ -784,6 +795,7 @@ relock:
784 795
785/** 796/**
786 * prune_dcache_sb - shrink the dcache 797 * prune_dcache_sb - shrink the dcache
798 * @sb: superblock
787 * @nr_to_scan: number of entries to try to free 799 * @nr_to_scan: number of entries to try to free
788 * 800 *
789 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 801 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
@@ -828,44 +840,24 @@ EXPORT_SYMBOL(shrink_dcache_sb);
828static void shrink_dcache_for_umount_subtree(struct dentry *dentry) 840static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
829{ 841{
830 struct dentry *parent; 842 struct dentry *parent;
831 unsigned detached = 0;
832 843
833 BUG_ON(!IS_ROOT(dentry)); 844 BUG_ON(!IS_ROOT(dentry));
834 845
835 /* detach this root from the system */
836 spin_lock(&dentry->d_lock);
837 dentry_lru_del(dentry);
838 __d_drop(dentry);
839 spin_unlock(&dentry->d_lock);
840
841 for (;;) { 846 for (;;) {
842 /* descend to the first leaf in the current subtree */ 847 /* descend to the first leaf in the current subtree */
843 while (!list_empty(&dentry->d_subdirs)) { 848 while (!list_empty(&dentry->d_subdirs))
844 struct dentry *loop;
845
846 /* this is a branch with children - detach all of them
847 * from the system in one go */
848 spin_lock(&dentry->d_lock);
849 list_for_each_entry(loop, &dentry->d_subdirs,
850 d_u.d_child) {
851 spin_lock_nested(&loop->d_lock,
852 DENTRY_D_LOCK_NESTED);
853 dentry_lru_del(loop);
854 __d_drop(loop);
855 spin_unlock(&loop->d_lock);
856 }
857 spin_unlock(&dentry->d_lock);
858
859 /* move to the first child */
860 dentry = list_entry(dentry->d_subdirs.next, 849 dentry = list_entry(dentry->d_subdirs.next,
861 struct dentry, d_u.d_child); 850 struct dentry, d_u.d_child);
862 }
863 851
864 /* consume the dentries from this leaf up through its parents 852 /* consume the dentries from this leaf up through its parents
865 * until we find one with children or run out altogether */ 853 * until we find one with children or run out altogether */
866 do { 854 do {
867 struct inode *inode; 855 struct inode *inode;
868 856
857 /* detach from the system */
858 dentry_lru_del(dentry);
859 __d_shrink(dentry);
860
869 if (dentry->d_count != 0) { 861 if (dentry->d_count != 0) {
870 printk(KERN_ERR 862 printk(KERN_ERR
871 "BUG: Dentry %p{i=%lx,n=%s}" 863 "BUG: Dentry %p{i=%lx,n=%s}"
@@ -886,14 +878,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
886 list_del(&dentry->d_u.d_child); 878 list_del(&dentry->d_u.d_child);
887 } else { 879 } else {
888 parent = dentry->d_parent; 880 parent = dentry->d_parent;
889 spin_lock(&parent->d_lock);
890 parent->d_count--; 881 parent->d_count--;
891 list_del(&dentry->d_u.d_child); 882 list_del(&dentry->d_u.d_child);
892 spin_unlock(&parent->d_lock);
893 } 883 }
894 884
895 detached++;
896
897 inode = dentry->d_inode; 885 inode = dentry->d_inode;
898 if (inode) { 886 if (inode) {
899 dentry->d_inode = NULL; 887 dentry->d_inode = NULL;
@@ -938,9 +926,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
938 926
939 dentry = sb->s_root; 927 dentry = sb->s_root;
940 sb->s_root = NULL; 928 sb->s_root = NULL;
941 spin_lock(&dentry->d_lock);
942 dentry->d_count--; 929 dentry->d_count--;
943 spin_unlock(&dentry->d_lock);
944 shrink_dcache_for_umount_subtree(dentry); 930 shrink_dcache_for_umount_subtree(dentry);
945 931
946 while (!hlist_bl_empty(&sb->s_anon)) { 932 while (!hlist_bl_empty(&sb->s_anon)) {
@@ -1743,7 +1729,7 @@ seqretry:
1743 */ 1729 */
1744 if (read_seqcount_retry(&dentry->d_seq, *seq)) 1730 if (read_seqcount_retry(&dentry->d_seq, *seq))
1745 goto seqretry; 1731 goto seqretry;
1746 if (parent->d_flags & DCACHE_OP_COMPARE) { 1732 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1747 if (parent->d_op->d_compare(parent, *inode, 1733 if (parent->d_op->d_compare(parent, *inode,
1748 dentry, i, 1734 dentry, i,
1749 tlen, tname, name)) 1735 tlen, tname, name))
@@ -2138,8 +2124,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
2138 * @target: new dentry 2124 * @target: new dentry
2139 * 2125 *
2140 * Update the dcache to reflect the move of a file name. Negative 2126 * Update the dcache to reflect the move of a file name. Negative
2141 * dcache entries should not be moved in this way. Caller hold 2127 * dcache entries should not be moved in this way. Caller must hold
2142 * rename_lock. 2128 * rename_lock, the i_mutex of the source and target directories,
2129 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
2143 */ 2130 */
2144static void __d_move(struct dentry * dentry, struct dentry * target) 2131static void __d_move(struct dentry * dentry, struct dentry * target)
2145{ 2132{
@@ -2202,7 +2189,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2202 * @target: new dentry 2189 * @target: new dentry
2203 * 2190 *
2204 * Update the dcache to reflect the move of a file name. Negative 2191 * Update the dcache to reflect the move of a file name. Negative
2205 * dcache entries should not be moved in this way. 2192 * dcache entries should not be moved in this way. See the locking
2193 * requirements for __d_move.
2206 */ 2194 */
2207void d_move(struct dentry *dentry, struct dentry *target) 2195void d_move(struct dentry *dentry, struct dentry *target)
2208{ 2196{
@@ -2320,7 +2308,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2320 * @inode: inode to bind to the dentry, to which aliases may be attached 2308 * @inode: inode to bind to the dentry, to which aliases may be attached
2321 * 2309 *
2322 * Introduces a dentry into the tree, substituting an extant disconnected 2310
2323 * root directory alias in its place if there is one 2311 * root directory alias in its place if there is one. Caller must hold the
2312 * i_mutex of the parent directory.
2324 */ 2313 */
2325struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) 2314struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2326{ 2315{
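
The expanded kerneldoc makes the implicit locking contract explicit: d_move() callers must hold the i_mutex of both parent directories and, when the parents differ, the per-sb s_vfs_rename_mutex, which is exactly what lock_rename() takes. A sketch of the expected calling convention:

    struct dentry *trap;

    trap = lock_rename(new_dir, old_dir); /* i_mutexes + s_vfs_rename_mutex */
    /* revalidate; bail out if old_dentry or new_dentry equals trap */
    d_move(old_dentry, new_dentry);
    unlock_rename(new_dir, old_dir);
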
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 01d2d9ef609c..44a360ca8046 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -35,7 +35,7 @@
35#include <linux/buffer_head.h> 35#include <linux/buffer_head.h>
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <asm/atomic.h> 38#include <linux/atomic.h>
39 39
40/* 40/*
41 * How many user pages to map in one call to get_user_pages(). This determines 41 * How many user pages to map in one call to get_user_pages(). This determines
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1cd6d9d3e29a..cc16562654de 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
1config ECRYPT_FS 1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)" 2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO 3 depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
4 select CRYPTO_ECB 4 select CRYPTO_ECB
5 select CRYPTO_CBC 5 select CRYPTO_CBC
6 select CRYPTO_MD5 6 select CRYPTO_MD5
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 43c7c43b06f5..b36c5572b3f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -29,6 +29,7 @@
29#define ECRYPTFS_KERNEL_H 29#define ECRYPTFS_KERNEL_H
30 30
31#include <keys/user-type.h> 31#include <keys/user-type.h>
32#include <keys/encrypted-type.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
33#include <linux/fs_stack.h> 34#include <linux/fs_stack.h>
34#include <linux/namei.h> 35#include <linux/namei.h>
@@ -36,125 +37,18 @@
36#include <linux/hash.h> 37#include <linux/hash.h>
37#include <linux/nsproxy.h> 38#include <linux/nsproxy.h>
38#include <linux/backing-dev.h> 39#include <linux/backing-dev.h>
40#include <linux/ecryptfs.h>
39 41
40/* Version verification for shared data structures w/ userspace */
41#define ECRYPTFS_VERSION_MAJOR 0x00
42#define ECRYPTFS_VERSION_MINOR 0x04
43#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03
44/* These flags indicate which features are supported by the kernel
45 * module; userspace tools such as the mount helper read
46 * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
47 * how to behave. */
48#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
49#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
50#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
51#define ECRYPTFS_VERSIONING_POLICY 0x00000008
52#define ECRYPTFS_VERSIONING_XATTR 0x00000010
53#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
54#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
55#define ECRYPTFS_VERSIONING_HMAC 0x00000080
56#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
57#define ECRYPTFS_VERSIONING_GCM 0x00000200
58#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
59 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
60 | ECRYPTFS_VERSIONING_PUBKEY \
61 | ECRYPTFS_VERSIONING_XATTR \
62 | ECRYPTFS_VERSIONING_MULTKEY \
63 | ECRYPTFS_VERSIONING_DEVMISC \
64 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
65#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
66#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
67#define ECRYPTFS_SALT_SIZE 8
68#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
69/* The original signature size is only for what is stored on disk; all
70 * in-memory representations are expanded hex, so it better adapted to
71 * be passed around or referenced on the command line */
72#define ECRYPTFS_SIG_SIZE 8
73#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
74#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
75#define ECRYPTFS_MAX_KEY_BYTES 64
76#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
77#define ECRYPTFS_DEFAULT_IV_BYTES 16 42#define ECRYPTFS_DEFAULT_IV_BYTES 16
78#define ECRYPTFS_FILE_VERSION 0x03
79#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 43#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
80#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 44#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
81#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 45#define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32
82#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ 46#define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ
83#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) 47#define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3)
84#define ECRYPTFS_MAX_PKI_NAME_BYTES 16
85#define ECRYPTFS_DEFAULT_NUM_USERS 4 48#define ECRYPTFS_DEFAULT_NUM_USERS 4
86#define ECRYPTFS_MAX_NUM_USERS 32768 49#define ECRYPTFS_MAX_NUM_USERS 32768
87#define ECRYPTFS_XATTR_NAME "user.ecryptfs" 50#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
88 51
89#define RFC2440_CIPHER_DES3_EDE 0x02
90#define RFC2440_CIPHER_CAST_5 0x03
91#define RFC2440_CIPHER_BLOWFISH 0x04
92#define RFC2440_CIPHER_AES_128 0x07
93#define RFC2440_CIPHER_AES_192 0x08
94#define RFC2440_CIPHER_AES_256 0x09
95#define RFC2440_CIPHER_TWOFISH 0x0a
96#define RFC2440_CIPHER_CAST_6 0x0b
97
98#define RFC2440_CIPHER_RSA 0x01
99
100/**
101 * For convenience, we may need to pass around the encrypted session
102 * key between kernel and userspace because the authentication token
103 * may not be extractable. For example, the TPM may not release the
104 * private key, instead requiring the encrypted data and returning the
105 * decrypted data.
106 */
107struct ecryptfs_session_key {
108#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
109#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
110#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
111#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
112 u32 flags;
113 u32 encrypted_key_size;
114 u32 decrypted_key_size;
115 u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
116 u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
117};
118
119struct ecryptfs_password {
120 u32 password_bytes;
121 s32 hash_algo;
122 u32 hash_iterations;
123 u32 session_key_encryption_key_bytes;
124#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
125#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
126 u32 flags;
127 /* Iterated-hash concatenation of salt and passphrase */
128 u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
129 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
130 /* Always in expanded hex */
131 u8 salt[ECRYPTFS_SALT_SIZE];
132};
133
134enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
135
136struct ecryptfs_private_key {
137 u32 key_size;
138 u32 data_len;
139 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
140 char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1];
141 u8 data[];
142};
143
144/* May be a password or a private key */
145struct ecryptfs_auth_tok {
146 u16 version; /* 8-bit major and 8-bit minor */
147 u16 token_type;
148#define ECRYPTFS_ENCRYPT_ONLY 0x00000001
149 u32 flags;
150 struct ecryptfs_session_key session_key;
151 u8 reserved[32];
152 union {
153 struct ecryptfs_password password;
154 struct ecryptfs_private_key private_key;
155 } token;
156} __attribute__ ((packed));
157
158void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); 52void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
159extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); 53extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
160extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); 54extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
@@ -185,11 +79,47 @@ struct ecryptfs_page_crypt_context {
185 } param; 79 } param;
186}; 80};
187 81
82#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
83static inline struct ecryptfs_auth_tok *
84ecryptfs_get_encrypted_key_payload_data(struct key *key)
85{
86 if (key->type == &key_type_encrypted)
87 return (struct ecryptfs_auth_tok *)
88 (&((struct encrypted_key_payload *)key->payload.data)->payload_data);
89 else
90 return NULL;
91}
92
93static inline struct key *ecryptfs_get_encrypted_key(char *sig)
94{
95 return request_key(&key_type_encrypted, sig, NULL);
96}
97
98#else
99static inline struct ecryptfs_auth_tok *
100ecryptfs_get_encrypted_key_payload_data(struct key *key)
101{
102 return NULL;
103}
104
105static inline struct key *ecryptfs_get_encrypted_key(char *sig)
106{
107 return ERR_PTR(-ENOKEY);
108}
109
110#endif /* CONFIG_ENCRYPTED_KEYS */
111
188static inline struct ecryptfs_auth_tok * 112static inline struct ecryptfs_auth_tok *
189ecryptfs_get_key_payload_data(struct key *key) 113ecryptfs_get_key_payload_data(struct key *key)
190{ 114{
191 return (struct ecryptfs_auth_tok *) 115 struct ecryptfs_auth_tok *auth_tok;
192 (((struct user_key_payload*)key->payload.data)->data); 116
117 auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
118 if (!auth_tok)
119 return (struct ecryptfs_auth_tok *)
120 (((struct user_key_payload *)key->payload.data)->data);
121 else
122 return auth_tok;
193} 123}
194 124
195#define ECRYPTFS_MAX_KEYSET_SIZE 1024 125#define ECRYPTFS_MAX_KEYSET_SIZE 1024
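
With the fallback above, callers never need to know which key type backed a signature. A sketch of the intended lookup order (a hypothetical helper mirroring ecryptfs_keyring_auth_tok_for_sig() in keystore.c; type verification and error handling trimmed):

#include <linux/err.h>
#include <linux/key.h>
#include <keys/user-type.h>
#include "ecryptfs_kernel.h"

/* Try the "user" key type first, then fall back to an "encrypted"
 * key of the same description.  On success the caller owns the key
 * reference returned in *keyp and must key_put() it when done.
 */
static struct ecryptfs_auth_tok *example_auth_tok_for_sig(char *sig,
							  struct key **keyp)
{
	struct key *key;

	key = request_key(&key_type_user, sig, NULL);
	if (IS_ERR(key))
		key = ecryptfs_get_encrypted_key(sig);
	if (IS_ERR(key))
		return NULL;
	*keyp = key;
	return ecryptfs_get_key_payload_data(key);
}
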
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 340c657a108c..11f8582d7218 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -69,6 +69,7 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
69 inode->i_ino = lower_inode->i_ino; 69 inode->i_ino = lower_inode->i_ino;
70 inode->i_version++; 70 inode->i_version++;
71 inode->i_mapping->a_ops = &ecryptfs_aops; 71 inode->i_mapping->a_ops = &ecryptfs_aops;
72 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
72 73
73 if (S_ISLNK(inode->i_mode)) 74 if (S_ISLNK(inode->i_mode))
74 inode->i_op = &ecryptfs_symlink_iops; 75 inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index fa8049ecdc64..ac1ad48c2376 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1635 1635
1636 (*auth_tok_key) = request_key(&key_type_user, sig, NULL); 1636 (*auth_tok_key) = request_key(&key_type_user, sig, NULL);
1637 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { 1637 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
1638 printk(KERN_ERR "Could not find key with description: [%s]\n", 1638 (*auth_tok_key) = ecryptfs_get_encrypted_key(sig);
1639 sig); 1639 if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
1640 rc = process_request_key_err(PTR_ERR(*auth_tok_key)); 1640 printk(KERN_ERR "Could not find key with description: [%s]\n",
1641 (*auth_tok_key) = NULL; 1641 sig);
1642 goto out; 1642 rc = process_request_key_err(PTR_ERR(*auth_tok_key));
1643 (*auth_tok_key) = NULL;
1644 goto out;
1645 }
1643 } 1646 }
1644 down_write(&(*auth_tok_key)->sem); 1647 down_write(&(*auth_tok_key)->sem);
1645 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); 1648 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok);
@@ -1868,11 +1871,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1868 * just one will be sufficient to decrypt to get the FEK. */ 1871 * just one will be sufficient to decrypt to get the FEK. */
1869find_next_matching_auth_tok: 1872find_next_matching_auth_tok:
1870 found_auth_tok = 0; 1873 found_auth_tok = 0;
1871 if (auth_tok_key) {
1872 up_write(&(auth_tok_key->sem));
1873 key_put(auth_tok_key);
1874 auth_tok_key = NULL;
1875 }
1876 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { 1874 list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) {
1877 candidate_auth_tok = &auth_tok_list_item->auth_tok; 1875 candidate_auth_tok = &auth_tok_list_item->auth_tok;
1878 if (unlikely(ecryptfs_verbosity > 0)) { 1876 if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1909,14 +1907,22 @@ found_matching_auth_tok:
1909 memcpy(&(candidate_auth_tok->token.private_key), 1907 memcpy(&(candidate_auth_tok->token.private_key),
1910 &(matching_auth_tok->token.private_key), 1908 &(matching_auth_tok->token.private_key),
1911 sizeof(struct ecryptfs_private_key)); 1909 sizeof(struct ecryptfs_private_key));
1910 up_write(&(auth_tok_key->sem));
1911 key_put(auth_tok_key);
1912 rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, 1912 rc = decrypt_pki_encrypted_session_key(candidate_auth_tok,
1913 crypt_stat); 1913 crypt_stat);
1914 } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { 1914 } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) {
1915 memcpy(&(candidate_auth_tok->token.password), 1915 memcpy(&(candidate_auth_tok->token.password),
1916 &(matching_auth_tok->token.password), 1916 &(matching_auth_tok->token.password),
1917 sizeof(struct ecryptfs_password)); 1917 sizeof(struct ecryptfs_password));
1918 up_write(&(auth_tok_key->sem));
1919 key_put(auth_tok_key);
1918 rc = decrypt_passphrase_encrypted_session_key( 1920 rc = decrypt_passphrase_encrypted_session_key(
1919 candidate_auth_tok, crypt_stat); 1921 candidate_auth_tok, crypt_stat);
1922 } else {
1923 up_write(&(auth_tok_key->sem));
1924 key_put(auth_tok_key);
1925 rc = -EINVAL;
1920 } 1926 }
1921 if (rc) { 1927 if (rc) {
1922 struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; 1928 struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp;
@@ -1956,21 +1962,18 @@ found_matching_auth_tok:
1956out_wipe_list: 1962out_wipe_list:
1957 wipe_auth_tok_list(&auth_tok_list); 1963 wipe_auth_tok_list(&auth_tok_list);
1958out: 1964out:
1959 if (auth_tok_key) {
1960 up_write(&(auth_tok_key->sem));
1961 key_put(auth_tok_key);
1962 }
1963 return rc; 1965 return rc;
1964} 1966}
1965 1967
1966static int 1968static int
1967pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, 1969pki_encrypt_session_key(struct key *auth_tok_key,
1970 struct ecryptfs_auth_tok *auth_tok,
1968 struct ecryptfs_crypt_stat *crypt_stat, 1971 struct ecryptfs_crypt_stat *crypt_stat,
1969 struct ecryptfs_key_record *key_rec) 1972 struct ecryptfs_key_record *key_rec)
1970{ 1973{
1971 struct ecryptfs_msg_ctx *msg_ctx = NULL; 1974 struct ecryptfs_msg_ctx *msg_ctx = NULL;
1972 char *payload = NULL; 1975 char *payload = NULL;
1973 size_t payload_len; 1976 size_t payload_len = 0;
1974 struct ecryptfs_message *msg; 1977 struct ecryptfs_message *msg;
1975 int rc; 1978 int rc;
1976 1979
@@ -1979,6 +1982,8 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1979 crypt_stat->cipher, 1982 crypt_stat->cipher,
1980 crypt_stat->key_size), 1983 crypt_stat->key_size),
1981 crypt_stat, &payload, &payload_len); 1984 crypt_stat, &payload, &payload_len);
1985 up_write(&(auth_tok_key->sem));
1986 key_put(auth_tok_key);
1982 if (rc) { 1987 if (rc) {
1983 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1988 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
1984 goto out; 1989 goto out;
@@ -2008,6 +2013,8 @@ out:
2008 * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet 2013 * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet
2009 * @dest: Buffer into which to write the packet 2014 * @dest: Buffer into which to write the packet
2010 * @remaining_bytes: Maximum number of bytes that can be written 2015 * @remaining_bytes: Maximum number of bytes that can be written
2016 * @auth_tok_key: The authentication token key to unlock and put when done with
2017 * @auth_tok
2011 * @auth_tok: The authentication token used for generating the tag 1 packet 2018 * @auth_tok: The authentication token used for generating the tag 1 packet
2012 * @crypt_stat: The cryptographic context 2019 * @crypt_stat: The cryptographic context
2013 * @key_rec: The key record struct for the tag 1 packet 2020 * @key_rec: The key record struct for the tag 1 packet
@@ -2018,7 +2025,7 @@ out:
2018 */ 2025 */
2019static int 2026static int
2020write_tag_1_packet(char *dest, size_t *remaining_bytes, 2027write_tag_1_packet(char *dest, size_t *remaining_bytes,
2021 struct ecryptfs_auth_tok *auth_tok, 2028 struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok,
2022 struct ecryptfs_crypt_stat *crypt_stat, 2029 struct ecryptfs_crypt_stat *crypt_stat,
2023 struct ecryptfs_key_record *key_rec, size_t *packet_size) 2030 struct ecryptfs_key_record *key_rec, size_t *packet_size)
2024{ 2031{
@@ -2039,12 +2046,15 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes,
2039 memcpy(key_rec->enc_key, 2046 memcpy(key_rec->enc_key,
2040 auth_tok->session_key.encrypted_key, 2047 auth_tok->session_key.encrypted_key,
2041 auth_tok->session_key.encrypted_key_size); 2048 auth_tok->session_key.encrypted_key_size);
2049 up_write(&(auth_tok_key->sem));
2050 key_put(auth_tok_key);
2042 goto encrypted_session_key_set; 2051 goto encrypted_session_key_set;
2043 } 2052 }
2044 if (auth_tok->session_key.encrypted_key_size == 0) 2053 if (auth_tok->session_key.encrypted_key_size == 0)
2045 auth_tok->session_key.encrypted_key_size = 2054 auth_tok->session_key.encrypted_key_size =
2046 auth_tok->token.private_key.key_size; 2055 auth_tok->token.private_key.key_size;
2047 rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); 2056 rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat,
2057 key_rec);
2048 if (rc) { 2058 if (rc) {
2049 printk(KERN_ERR "Failed to encrypt session key via a key " 2059 printk(KERN_ERR "Failed to encrypt session key via a key "
2050 "module; rc = [%d]\n", rc); 2060 "module; rc = [%d]\n", rc);
@@ -2421,6 +2431,8 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2421 &max, auth_tok, 2431 &max, auth_tok,
2422 crypt_stat, key_rec, 2432 crypt_stat, key_rec,
2423 &written); 2433 &written);
2434 up_write(&(auth_tok_key->sem));
2435 key_put(auth_tok_key);
2424 if (rc) { 2436 if (rc) {
2425 ecryptfs_printk(KERN_WARNING, "Error " 2437 ecryptfs_printk(KERN_WARNING, "Error "
2426 "writing tag 3 packet\n"); 2438 "writing tag 3 packet\n");
@@ -2438,8 +2450,8 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2438 } 2450 }
2439 (*len) += written; 2451 (*len) += written;
2440 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { 2452 } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) {
2441 rc = write_tag_1_packet(dest_base + (*len), 2453 rc = write_tag_1_packet(dest_base + (*len), &max,
2442 &max, auth_tok, 2454 auth_tok_key, auth_tok,
2443 crypt_stat, key_rec, &written); 2455 crypt_stat, key_rec, &written);
2444 if (rc) { 2456 if (rc) {
2445 ecryptfs_printk(KERN_WARNING, "Error " 2457 ecryptfs_printk(KERN_WARNING, "Error "
@@ -2448,14 +2460,13 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2448 } 2460 }
2449 (*len) += written; 2461 (*len) += written;
2450 } else { 2462 } else {
2463 up_write(&(auth_tok_key->sem));
2464 key_put(auth_tok_key);
2451 ecryptfs_printk(KERN_WARNING, "Unsupported " 2465 ecryptfs_printk(KERN_WARNING, "Unsupported "
2452 "authentication token type\n"); 2466 "authentication token type\n");
2453 rc = -EINVAL; 2467 rc = -EINVAL;
2454 goto out_free; 2468 goto out_free;
2455 } 2469 }
2456 up_write(&(auth_tok_key->sem));
2457 key_put(auth_tok_key);
2458 auth_tok_key = NULL;
2459 } 2470 }
2460 if (likely(max > 0)) { 2471 if (likely(max > 0)) {
2461 dest_base[(*len)] = 0x00; 2472 dest_base[(*len)] = 0x00;
@@ -2468,11 +2479,6 @@ out_free:
2468out: 2479out:
2469 if (rc) 2480 if (rc)
2470 (*len) = 0; 2481 (*len) = 0;
2471 if (auth_tok_key) {
2472 up_write(&(auth_tok_key->sem));
2473 key_put(auth_tok_key);
2474 }
2475
2476 mutex_unlock(&crypt_stat->keysig_list_mutex); 2482 mutex_unlock(&crypt_stat->keysig_list_mutex);
2477 return rc; 2483 return rc;
2478} 2484}
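
The pattern throughout this file is that a function handed auth_tok_key now drops the write-held semaphore and the key reference itself, before the round trip to the userspace key module; holding key->sem across that upcall could deadlock if ecryptfsd needs the same key. Stated as a sketch (hypothetical helper, naming the rule only):

#include <linux/key.h>

/* The callee, not the caller, releases the lock and the reference
 * once it has copied what it needs out of the key payload.
 */
static void example_release_before_upcall(struct key *auth_tok_key)
{
	up_write(&auth_tok_key->sem);
	key_put(auth_tok_key);
}
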
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f1bb747d77d..b4a6befb1216 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, 175 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, 176 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, 177 ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only,
178 ecryptfs_opt_check_dev_ruid,
178 ecryptfs_opt_err }; 179 ecryptfs_opt_err };
179 180
180static const match_table_t tokens = { 181static const match_table_t tokens = {
@@ -191,6 +192,7 @@ static const match_table_t tokens = {
191 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, 192 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
192 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, 193 {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"},
193 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, 194 {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"},
195 {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"},
194 {ecryptfs_opt_err, NULL} 196 {ecryptfs_opt_err, NULL}
195}; 197};
196 198
@@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat(
236 * ecryptfs_parse_options 238 * ecryptfs_parse_options
237 * @sb: The ecryptfs super block 239 * @sb: The ecryptfs super block
238 * @options: The options passed to the kernel 240 * @options: The options passed to the kernel
241 * @check_ruid: set to 1 if device uid should be checked against the ruid
239 * 242 *
240 * Parse mount options: 243 * Parse mount options:
241 * debug=N - ecryptfs_verbosity level for debug output 244 * debug=N - ecryptfs_verbosity level for debug output
@@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat(
251 * 254 *
252 * Returns zero on success; non-zero on error 255 * Returns zero on success; non-zero on error
253 */ 256 */
254static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) 257static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
258 uid_t *check_ruid)
255{ 259{
256 char *p; 260 char *p;
257 int rc = 0; 261 int rc = 0;
@@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
276 char *cipher_key_bytes_src; 280 char *cipher_key_bytes_src;
277 char *fn_cipher_key_bytes_src; 281 char *fn_cipher_key_bytes_src;
278 282
283 *check_ruid = 0;
284
279 if (!options) { 285 if (!options) {
280 rc = -EINVAL; 286 rc = -EINVAL;
281 goto out; 287 goto out;
@@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
380 mount_crypt_stat->flags |= 386 mount_crypt_stat->flags |=
381 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; 387 ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY;
382 break; 388 break;
389 case ecryptfs_opt_check_dev_ruid:
390 *check_ruid = 1;
391 break;
383 case ecryptfs_opt_err: 392 case ecryptfs_opt_err:
384 default: 393 default:
385 printk(KERN_WARNING 394 printk(KERN_WARNING
@@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
475 const char *err = "Getting sb failed"; 484 const char *err = "Getting sb failed";
476 struct inode *inode; 485 struct inode *inode;
477 struct path path; 486 struct path path;
487 uid_t check_ruid;
478 int rc; 488 int rc;
479 489
480 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); 490 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
483 goto out; 493 goto out;
484 } 494 }
485 495
486 rc = ecryptfs_parse_options(sbi, raw_data); 496 rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid);
487 if (rc) { 497 if (rc) {
488 err = "Error parsing options"; 498 err = "Error parsing options";
489 goto out; 499 goto out;
@@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
521 "known incompatibilities\n"); 531 "known incompatibilities\n");
522 goto out_free; 532 goto out_free;
523 } 533 }
534
535 if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) {
536 rc = -EPERM;
537 printk(KERN_ERR "Mount of device (uid: %d) not owned by "
538 "requested user (uid: %d)\n",
539 path.dentry->d_inode->i_uid, current_uid());
540 goto out_free;
541 }
542
524 ecryptfs_set_superblock_lower(s, path.dentry->d_sb); 543 ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
525 s->s_maxbytes = path.dentry->d_sb->s_maxbytes; 544 s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
526 s->s_blocksize = path.dentry->d_sb->s_blocksize; 545 s->s_blocksize = path.dentry->d_sb->s_blocksize;
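
The new option rides along in the ordinary data string of mount(2). A hedged user-space example (the paths, signature, and cipher values are placeholders, not from this patch):

#include <sys/mount.h>

/* Fails with EPERM unless /lower is owned by the caller's real uid. */
int example_mount(void)
{
	return mount("/lower", "/mnt/secret", "ecryptfs", 0,
		     "ecryptfs_sig=0123456789abcdef,"
		     "ecryptfs_cipher=aes,ecryptfs_key_bytes=16,"
		     "ecryptfs_check_dev_ruid");
}
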
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 85d430963116..3745f7c2b9c2 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -39,15 +39,16 @@
39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, 39int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
40 loff_t offset, size_t size) 40 loff_t offset, size_t size)
41{ 41{
42 struct ecryptfs_inode_info *inode_info; 42 struct file *lower_file;
43 mm_segment_t fs_save; 43 mm_segment_t fs_save;
44 ssize_t rc; 44 ssize_t rc;
45 45
46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
47 BUG_ON(!inode_info->lower_file); 47 if (!lower_file)
48 return -EIO;
48 fs_save = get_fs(); 49 fs_save = get_fs();
49 set_fs(get_ds()); 50 set_fs(get_ds());
50 rc = vfs_write(inode_info->lower_file, data, size, &offset); 51 rc = vfs_write(lower_file, data, size, &offset);
51 set_fs(fs_save); 52 set_fs(fs_save);
52 mark_inode_dirty_sync(ecryptfs_inode); 53 mark_inode_dirty_sync(ecryptfs_inode);
53 return rc; 54 return rc;
@@ -225,15 +226,16 @@ out:
225int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 226int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
226 struct inode *ecryptfs_inode) 227 struct inode *ecryptfs_inode)
227{ 228{
228 struct ecryptfs_inode_info *inode_info = 229 struct file *lower_file;
229 ecryptfs_inode_to_private(ecryptfs_inode);
230 mm_segment_t fs_save; 230 mm_segment_t fs_save;
231 ssize_t rc; 231 ssize_t rc;
232 232
233 BUG_ON(!inode_info->lower_file); 233 lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
234 if (!lower_file)
235 return -EIO;
234 fs_save = get_fs(); 236 fs_save = get_fs();
235 set_fs(get_ds()); 237 set_fs(get_ds());
236 rc = vfs_read(inode_info->lower_file, data, size, &offset); 238 rc = vfs_read(lower_file, data, size, &offset);
237 set_fs(fs_save); 239 set_fs(fs_save);
238 return rc; 240 return rc;
239} 241}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5e480d555049..9026fc91fe3b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,7 +37,7 @@
37#include <asm/system.h> 37#include <asm/system.h>
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/mman.h> 39#include <asm/mman.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41 41
42/* 42/*
43 * LOCKING: 43 * LOCKING:
diff --git a/fs/exec.c b/fs/exec.c
index 842d5700c155..25dcbe5fc356 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
181 return; 181 return;
182 182
183 bprm->vma_pages = pages; 183 bprm->vma_pages = pages;
184
185#ifdef SPLIT_RSS_COUNTING
186 add_mm_counter(mm, MM_ANONPAGES, diff);
187#else
188 spin_lock(&mm->page_table_lock);
189 add_mm_counter(mm, MM_ANONPAGES, diff); 184 add_mm_counter(mm, MM_ANONPAGES, diff);
190 spin_unlock(&mm->page_table_lock);
191#endif
192} 185}
193 186
194static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, 187static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -277,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
277 * use STACK_TOP because that can depend on attributes which aren't 270 * use STACK_TOP because that can depend on attributes which aren't
278 * configured yet. 271 * configured yet.
279 */ 272 */
280 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 273 BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
281 vma->vm_end = STACK_TOP_MAX; 274 vma->vm_end = STACK_TOP_MAX;
282 vma->vm_start = vma->vm_end - PAGE_SIZE; 275 vma->vm_start = vma->vm_end - PAGE_SIZE;
283 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; 276 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
@@ -1430,9 +1423,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1430 } 1423 }
1431 } 1424 }
1432 read_unlock(&binfmt_lock); 1425 read_unlock(&binfmt_lock);
1426#ifdef CONFIG_MODULES
1433 if (retval != -ENOEXEC || bprm->mm == NULL) { 1427 if (retval != -ENOEXEC || bprm->mm == NULL) {
1434 break; 1428 break;
1435#ifdef CONFIG_MODULES
1436 } else { 1429 } else {
1437#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) 1430#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1438 if (printable(bprm->buf[0]) && 1431 if (printable(bprm->buf[0]) &&
@@ -1440,9 +1433,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1440 printable(bprm->buf[2]) && 1433 printable(bprm->buf[2]) &&
1441 printable(bprm->buf[3])) 1434 printable(bprm->buf[3]))
1442 break; /* -ENOEXEC */ 1435 break; /* -ENOEXEC */
1436 if (try)
1437 break; /* -ENOEXEC */
1443 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); 1438 request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1444#endif
1445 } 1439 }
1440#else
1441 break;
1442#endif
1446 } 1443 }
1447 return retval; 1444 return retval;
1448} 1445}
@@ -1462,6 +1459,23 @@ static int do_execve_common(const char *filename,
1462 struct files_struct *displaced; 1459 struct files_struct *displaced;
1463 bool clear_in_exec; 1460 bool clear_in_exec;
1464 int retval; 1461 int retval;
1462 const struct cred *cred = current_cred();
1463
1464 /*
1465 * We move the actual failure in case of RLIMIT_NPROC excess from
1466 * set*uid() to execve() because too many poorly written programs
1467 * don't check setuid() return code. Here we additionally recheck
1468 * whether NPROC limit is still exceeded.
1469 */
1470 if ((current->flags & PF_NPROC_EXCEEDED) &&
1471 atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
1472 retval = -EAGAIN;
1473 goto out_ret;
1474 }
1475
1476 /* We're below the limit (still or again), so we don't want to make
1477 * further execve() calls fail. */
1478 current->flags &= ~PF_NPROC_EXCEEDED;
1465 1479
1466 retval = unshare_files(&displaced); 1480 retval = unshare_files(&displaced);
1467 if (retval) 1481 if (retval)
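
PF_NPROC_EXCEEDED is raised on the set*uid() side by the companion change to set_user() in kernel/sys.c; roughly as follows (a sketch of that counterpart patch, reproduced from memory, not part of this diff):

static int set_user(struct cred *new)
{
	struct user_struct *new_user;

	new_user = alloc_uid(current_user_ns(), new->uid);
	if (!new_user)
		return -EAGAIN;

	/* Defer the RLIMIT_NPROC failure to execve(): just flag it. */
	if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
	    new_user != INIT_USER)
		current->flags |= PF_NPROC_EXCEEDED;
	else
		current->flags &= ~PF_NPROC_EXCEEDED;

	free_uid(new->user);
	new->user = new_user;
	return 0;
}
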
@@ -1649,15 +1663,26 @@ expand_fail:
1649 return ret; 1663 return ret;
1650} 1664}
1651 1665
1666static void cn_escape(char *str)
1667{
1668 for (; *str; str++)
1669 if (*str == '/')
1670 *str = '!';
1671}
1672
1652static int cn_print_exe_file(struct core_name *cn) 1673static int cn_print_exe_file(struct core_name *cn)
1653{ 1674{
1654 struct file *exe_file; 1675 struct file *exe_file;
1655 char *pathbuf, *path, *p; 1676 char *pathbuf, *path;
1656 int ret; 1677 int ret;
1657 1678
1658 exe_file = get_mm_exe_file(current->mm); 1679 exe_file = get_mm_exe_file(current->mm);
1659 if (!exe_file) 1680 if (!exe_file) {
1660 return cn_printf(cn, "(unknown)"); 1681 char *commstart = cn->corename + cn->used;
1682 ret = cn_printf(cn, "%s (path unknown)", current->comm);
1683 cn_escape(commstart);
1684 return ret;
1685 }
1661 1686
1662 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); 1687 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
1663 if (!pathbuf) { 1688 if (!pathbuf) {
@@ -1671,9 +1696,7 @@ static int cn_print_exe_file(struct core_name *cn)
1671 goto free_buf; 1696 goto free_buf;
1672 } 1697 }
1673 1698
1674 for (p = path; *p; p++) 1699 cn_escape(path);
1675 if (*p == '/')
1676 *p = '!';
1677 1700
1678 ret = cn_printf(cn, "%s", path); 1701 ret = cn_printf(cn, "%s", path);
1679 1702
@@ -1745,16 +1768,22 @@ static int format_corename(struct core_name *cn, long signr)
1745 break; 1768 break;
1746 } 1769 }
1747 /* hostname */ 1770 /* hostname */
1748 case 'h': 1771 case 'h': {
1772 char *namestart = cn->corename + cn->used;
1749 down_read(&uts_sem); 1773 down_read(&uts_sem);
1750 err = cn_printf(cn, "%s", 1774 err = cn_printf(cn, "%s",
1751 utsname()->nodename); 1775 utsname()->nodename);
1752 up_read(&uts_sem); 1776 up_read(&uts_sem);
1777 cn_escape(namestart);
1753 break; 1778 break;
1779 }
1754 /* executable */ 1780 /* executable */
1755 case 'e': 1781 case 'e': {
1782 char *commstart = cn->corename + cn->used;
1756 err = cn_printf(cn, "%s", current->comm); 1783 err = cn_printf(cn, "%s", current->comm);
1784 cn_escape(commstart);
1757 break; 1785 break;
1786 }
1758 case 'E': 1787 case 'E':
1759 err = cn_print_exe_file(cn); 1788 err = cn_print_exe_file(cn);
1760 break; 1789 break;
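
cn_escape() keeps every expanded field a single path component, so a '/' in the hostname or comm cannot redirect the core file into a subdirectory. A user-space replica for illustration (values hypothetical):

#include <stdio.h>

static void cn_escape(char *str)
{
	for (; *str; str++)
		if (*str == '/')
			*str = '!';
}

int main(void)
{
	char name[] = "core.build/7.a.out";	/* as if %h expanded to "build/7" */

	cn_escape(name + 5);			/* escape only the expanded part */
	printf("%s\n", name);			/* prints core.build!7.a.out */
	return 0;
}
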
@@ -2118,16 +2147,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
2118 2147
2119 ispipe = format_corename(&cn, signr); 2148 ispipe = format_corename(&cn, signr);
2120 2149
2121 if (ispipe == -ENOMEM) {
2122 printk(KERN_WARNING "format_corename failed\n");
2123 printk(KERN_WARNING "Aborting core\n");
2124 goto fail_corename;
2125 }
2126
2127 if (ispipe) { 2150 if (ispipe) {
2128 int dump_count; 2151 int dump_count;
2129 char **helper_argv; 2152 char **helper_argv;
2130 2153
2154 if (ispipe < 0) {
2155 printk(KERN_WARNING "format_corename failed\n");
2156 printk(KERN_WARNING "Aborting core\n");
2157 goto fail_corename;
2158 }
2159
2131 if (cprm.limit == 1) { 2160 if (cprm.limit == 1) {
2132 /* 2161 /*
2133 * Normally core limits are irrelevant to pipes, since 2162 * Normally core limits are irrelevant to pipes, since
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 2d0f757fda3e..c5a5855a6c44 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,8 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o 15# ore module library
16obj-$(CONFIG_ORE) += ore.o
17
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 19obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 86194b2f799d..70bae4149291 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,6 +1,10 @@
1config ORE
2 tristate
3
1config EXOFS_FS 4config EXOFS_FS
2 tristate "exofs: OSD based file system support" 5 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD 6 depends on SCSI_OSD_ULD
7 select ORE
4 help 8 help
5 EXOFS is a file system that uses an OSD storage device, 9 EXOFS is a file system that uses an OSD storage device,
6 as its backing storage. 10 as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c965806c2821..f4e442ec7445 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -36,12 +36,9 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h> 38#include <linux/backing-dev.h>
39#include "common.h" 39#include <scsi/osd_ore.h>
40 40
41/* FIXME: Remove once pnfs hits mainline 41#include "common.h"
42 * #include <linux/exportfs/pnfs_osd_xdr.h>
43 */
44#include "pnfs.h"
45 42
46#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
47 44
@@ -56,27 +53,11 @@
56/* u64 has problems with printk this will cast it to unsigned long long */ 53/* u64 has problems with printk this will cast it to unsigned long long */
57#define _LLU(x) (unsigned long long)(x) 54#define _LLU(x) (unsigned long long)(x)
58 55
59struct exofs_layout {
60 osd_id s_pid; /* partition ID of file system*/
61
62 /* Our way of looking at the data_map */
63 unsigned stripe_unit;
64 unsigned mirrors_p1;
65
66 unsigned group_width;
67 u64 group_depth;
68 unsigned group_count;
69
70 enum exofs_inode_layout_gen_functions lay_func;
71
72 unsigned s_numdevs; /* Num of devices in array */
73 struct osd_dev *s_ods[0]; /* Variable length */
74};
75
76/* 56/*
77 * our extension to the in-memory superblock 57 * our extension to the in-memory superblock
78 */ 58 */
79struct exofs_sb_info { 59struct exofs_sb_info {
60 struct backing_dev_info bdi; /* register our bdi with VFS */
80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ 61 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 62 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 63 uint64_t s_nextid; /* highest object ID used */
@@ -84,16 +65,13 @@ struct exofs_sb_info {
84 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
85 u32 s_next_generation; /* next gen # to use */ 66 u32 s_next_generation; /* next gen # to use */
86 atomic_t s_curr_pending; /* number of pending commands */ 67 atomic_t s_curr_pending; /* number of pending commands */
87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
89 68
90 struct pnfs_osd_data_map data_map; /* Default raid to use 69 struct pnfs_osd_data_map data_map; /* Default raid to use
91 * FIXME: Needed ? 70 * FIXME: Needed ?
92 */ 71 */
93/* struct exofs_layout dir_layout;*/ /* Default dir layout */ 72 struct ore_layout layout; /* Default files layout */
94 struct exofs_layout layout; /* Default files layout, 73 struct ore_comp one_comp; /* id & cred of partition id=0*/
95 * contains the variable osd_dev 74 struct ore_components comps; /* comps for the partition */
96 * array. Keep last */
97 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ 75 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
98}; 76};
99 77
@@ -107,7 +85,8 @@ struct exofs_i_info {
107 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 85 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
108 uint32_t i_dir_start_lookup; /* which page to start lookup */ 86 uint32_t i_dir_start_lookup; /* which page to start lookup */
109 uint64_t i_commit_size; /* the object's written length */ 87 uint64_t i_commit_size; /* the object's written length */
110 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 88 struct ore_comp one_comp; /* same component for all devices */
89 struct ore_components comps; /* inode view of the device table */
111}; 90};
112 91
113static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
115 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; 94 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
116} 95}
117 96
118struct exofs_io_state;
119typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
120
121struct exofs_io_state {
122 struct kref kref;
123
124 void *private;
125 exofs_io_done_fn done;
126
127 struct exofs_layout *layout;
128 struct osd_obj_id obj;
129 u8 *cred;
130
131 /* Global read/write IO*/
132 loff_t offset;
133 unsigned long length;
134 void *kern_buff;
135
136 struct page **pages;
137 unsigned nr_pages;
138 unsigned pgbase;
139 unsigned pages_consumed;
140
141 /* Attributes */
142 unsigned in_attr_len;
143 struct osd_attr *in_attr;
144 unsigned out_attr_len;
145 struct osd_attr *out_attr;
146
147 /* Variable array of size numdevs */
148 unsigned numdevs;
149 struct exofs_per_dev_state {
150 struct osd_request *or;
151 struct bio *bio;
152 loff_t offset;
153 unsigned length;
154 unsigned dev;
155 } per_dev[];
156};
157
158static inline unsigned exofs_io_state_size(unsigned numdevs)
159{
160 return sizeof(struct exofs_io_state) +
161 sizeof(struct exofs_per_dev_state) * numdevs;
162}
163
164/* 97/*
165 * our inode flags 98 * our inode flags
166 */ 99 */
@@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
205} 138}
206 139
207/* 140/*
208 * Given a layout, object_number and stripe_index return the associated global
209 * dev_index
210 */
211unsigned exofs_layout_od_id(struct exofs_layout *layout,
212 osd_id obj_no, unsigned layout_index);
213/*
214 * Maximum count of links to a file 141 * Maximum count of links to a file
215 */ 142 */
216#define EXOFS_LINK_MAX 32000 143#define EXOFS_LINK_MAX 32000
@@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout,
219 * function declarations * 146 * function declarations *
220 *************************/ 147 *************************/
221 148
222/* ios.c */
223void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
224 const struct osd_obj_id *obj);
225int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
226 u64 offset, void *p, unsigned length);
227
228int exofs_get_io_state(struct exofs_layout *layout,
229 struct exofs_io_state **ios);
230void exofs_put_io_state(struct exofs_io_state *ios);
231
232int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
233
234int exofs_sbi_create(struct exofs_io_state *ios);
235int exofs_sbi_remove(struct exofs_io_state *ios);
236int exofs_sbi_write(struct exofs_io_state *ios);
237int exofs_sbi_read(struct exofs_io_state *ios);
238
239int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
240
241int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
242static inline int exofs_oi_write(struct exofs_i_info *oi,
243 struct exofs_io_state *ios)
244{
245 ios->obj.id = exofs_oi_objno(oi);
246 ios->cred = oi->i_cred;
247 return exofs_sbi_write(ios);
248}
249
250static inline int exofs_oi_read(struct exofs_i_info *oi,
251 struct exofs_io_state *ios)
252{
253 ios->obj.id = exofs_oi_objno(oi);
254 ios->cred = oi->i_cred;
255 return exofs_sbi_read(ios);
256}
257
258/* inode.c */ 149/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout, 150unsigned exofs_max_io_pages(struct ore_layout *layout,
260 unsigned expected_pages); 151 unsigned expected_pages);
261int exofs_setattr(struct dentry *, struct iattr *); 152int exofs_setattr(struct dentry *, struct iattr *);
262int exofs_write_begin(struct file *file, struct address_space *mapping, 153int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
281 struct inode *); 172 struct inode *);
282 173
283/* super.c */ 174/* super.c */
175void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
176 const struct osd_obj_id *obj);
284int exofs_sbi_write_stats(struct exofs_sb_info *sbi); 177int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
285 178
286/********************* 179/*********************
@@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations;
295 188
296/* inode.c */ 189/* inode.c */
297extern const struct address_space_operations exofs_aops; 190extern const struct address_space_operations exofs_aops;
298extern const struct osd_attr g_attr_logical_length;
299 191
300/* namei.c */ 192/* namei.c */
301extern const struct inode_operations exofs_dir_inode_operations; 193extern const struct inode_operations exofs_dir_inode_operations;
@@ -305,4 +197,33 @@ extern const struct inode_operations exofs_special_inode_operations;
305extern const struct inode_operations exofs_symlink_inode_operations; 197extern const struct inode_operations exofs_symlink_inode_operations;
306extern const struct inode_operations exofs_fast_symlink_inode_operations; 198extern const struct inode_operations exofs_fast_symlink_inode_operations;
307 199
200/* exofs_init_comps will initialize an ore_components device array
201 * pointing to a single ore_comp struct, and a round-robin view
202 * of the device table.
203 * The first device of each inode is the [inode->ino % num_devices]
 204 * and the rest of the devices follow sequentially, wrapping
 205 * around after the last device in the table.
206 * It is assumed that the global device array at @sbi is twice
207 * bigger and that the device table repeats twice.
208 * See: exofs_read_lookup_dev_table()
209 */
210static inline void exofs_init_comps(struct ore_components *comps,
211 struct ore_comp *one_comp,
212 struct exofs_sb_info *sbi, osd_id oid)
213{
214 unsigned dev_mod = (unsigned)oid, first_dev;
215
216 one_comp->obj.partition = sbi->one_comp.obj.partition;
217 one_comp->obj.id = oid;
218 exofs_make_credential(one_comp->cred, &one_comp->obj);
219
220 comps->numdevs = sbi->comps.numdevs;
221 comps->single_comp = EC_SINGLE_COMP;
222 comps->comps = one_comp;
223
224 /* Round robin device view of the table */
225 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
226 comps->ods = sbi->comps.ods + first_dev;
227}
228
308#endif 229#endif
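
A worked example of the round-robin start computed by exofs_init_comps() (all numbers hypothetical): with layout.mirrors_p1 = 2 and comps.numdevs = 6, oid 5 gives first_dev = (5 * 2) % 6 = 4, while oid 6 wraps back to first_dev = 0. Because the device table behind sbi->comps.ods is assumed to be twice as big and to repeat, the numdevs entries starting at any such offset are all addressable.
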
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 8472c098445d..f39a38fc2349 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,7 +43,7 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout, 46unsigned exofs_max_io_pages(struct ore_layout *layout,
47 unsigned expected_pages) 47 unsigned expected_pages)
48{ 48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
@@ -58,7 +58,7 @@ struct page_collect {
58 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
59 struct inode *inode; 59 struct inode *inode;
60 unsigned expected_pages; 60 unsigned expected_pages;
61 struct exofs_io_state *ios; 61 struct ore_io_state *ios;
62 62
63 struct page **pages; 63 struct page **pages;
64 unsigned alloc_pages; 64 unsigned alloc_pages;
@@ -110,13 +110,6 @@ static int pcol_try_alloc(struct page_collect *pcol)
110{ 110{
111 unsigned pages; 111 unsigned pages;
112 112
113 if (!pcol->ios) { /* First time allocate io_state */
114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
115
116 if (ret)
117 return ret;
118 }
119
120 /* TODO: easily support bio chaining */ 113 /* TODO: easily support bio chaining */
121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); 114 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
122 115
@@ -140,7 +133,7 @@ static void pcol_free(struct page_collect *pcol)
140 pcol->pages = NULL; 133 pcol->pages = NULL;
141 134
142 if (pcol->ios) { 135 if (pcol->ios) {
143 exofs_put_io_state(pcol->ios); 136 ore_put_io_state(pcol->ios);
144 pcol->ios = NULL; 137 pcol->ios = NULL;
145 } 138 }
146} 139}
@@ -200,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol)
200 u64 resid; 193 u64 resid;
201 u64 good_bytes; 194 u64 good_bytes;
202 u64 length = 0; 195 u64 length = 0;
203 int ret = exofs_check_io(pcol->ios, &resid); 196 int ret = ore_check_io(pcol->ios, &resid);
204 197
205 if (likely(!ret)) 198 if (likely(!ret))
206 good_bytes = pcol->length; 199 good_bytes = pcol->length;
@@ -241,7 +234,7 @@ static int __readpages_done(struct page_collect *pcol)
241} 234}
242 235
243/* callback of async reads */ 236/* callback of async reads */
244static void readpages_done(struct exofs_io_state *ios, void *p) 237static void readpages_done(struct ore_io_state *ios, void *p)
245{ 238{
246 struct page_collect *pcol = p; 239 struct page_collect *pcol = p;
247 240
@@ -269,20 +262,28 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
269static int read_exec(struct page_collect *pcol) 262static int read_exec(struct page_collect *pcol)
270{ 263{
271 struct exofs_i_info *oi = exofs_i(pcol->inode); 264 struct exofs_i_info *oi = exofs_i(pcol->inode);
272 struct exofs_io_state *ios = pcol->ios; 265 struct ore_io_state *ios;
273 struct page_collect *pcol_copy = NULL; 266 struct page_collect *pcol_copy = NULL;
274 int ret; 267 int ret;
275 268
276 if (!pcol->pages) 269 if (!pcol->pages)
277 return 0; 270 return 0;
278 271
272 if (!pcol->ios) {
273 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
274 pcol->pg_first << PAGE_CACHE_SHIFT,
275 pcol->length, &pcol->ios);
276
277 if (ret)
278 return ret;
279 }
280
281 ios = pcol->ios;
279 ios->pages = pcol->pages; 282 ios->pages = pcol->pages;
280 ios->nr_pages = pcol->nr_pages; 283 ios->nr_pages = pcol->nr_pages;
281 ios->length = pcol->length;
282 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
283 284
284 if (pcol->read_4_write) { 285 if (pcol->read_4_write) {
285 exofs_oi_read(oi, pcol->ios); 286 ore_read(pcol->ios);
286 return __readpages_done(pcol); 287 return __readpages_done(pcol);
287 } 288 }
288 289
@@ -295,14 +296,14 @@ static int read_exec(struct page_collect *pcol)
295 *pcol_copy = *pcol; 296 *pcol_copy = *pcol;
296 ios->done = readpages_done; 297 ios->done = readpages_done;
297 ios->private = pcol_copy; 298 ios->private = pcol_copy;
298 ret = exofs_oi_read(oi, ios); 299 ret = ore_read(ios);
299 if (unlikely(ret)) 300 if (unlikely(ret))
300 goto err; 301 goto err;
301 302
302 atomic_inc(&pcol->sbi->s_curr_pending); 303 atomic_inc(&pcol->sbi->s_curr_pending);
303 304
304 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 305 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
305 ios->obj.id, _LLU(ios->offset), pcol->length); 306 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
306 307
307 /* pages ownership was passed to pcol_copy */ 308 /* pages ownership was passed to pcol_copy */
308 _pcol_reset(pcol); 309 _pcol_reset(pcol);
@@ -457,14 +458,14 @@ static int exofs_readpage(struct file *file, struct page *page)
457} 458}
458 459
459/* Callback for osd_write. All writes are asynchronous */ 460/* Callback for osd_write. All writes are asynchronous */
460static void writepages_done(struct exofs_io_state *ios, void *p) 461static void writepages_done(struct ore_io_state *ios, void *p)
461{ 462{
462 struct page_collect *pcol = p; 463 struct page_collect *pcol = p;
463 int i; 464 int i;
464 u64 resid; 465 u64 resid;
465 u64 good_bytes; 466 u64 good_bytes;
466 u64 length = 0; 467 u64 length = 0;
467 int ret = exofs_check_io(ios, &resid); 468 int ret = ore_check_io(ios, &resid);
468 469
469 atomic_dec(&pcol->sbi->s_curr_pending); 470 atomic_dec(&pcol->sbi->s_curr_pending);
470 471
@@ -507,13 +508,21 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
507static int write_exec(struct page_collect *pcol) 508static int write_exec(struct page_collect *pcol)
508{ 509{
509 struct exofs_i_info *oi = exofs_i(pcol->inode); 510 struct exofs_i_info *oi = exofs_i(pcol->inode);
510 struct exofs_io_state *ios = pcol->ios; 511 struct ore_io_state *ios;
511 struct page_collect *pcol_copy = NULL; 512 struct page_collect *pcol_copy = NULL;
512 int ret; 513 int ret;
513 514
514 if (!pcol->pages) 515 if (!pcol->pages)
515 return 0; 516 return 0;
516 517
518 BUG_ON(pcol->ios);
519 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
520 pcol->pg_first << PAGE_CACHE_SHIFT,
521 pcol->length, &pcol->ios);
522
523 if (unlikely(ret))
524 goto err;
525
517 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 526 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
518 if (!pcol_copy) { 527 if (!pcol_copy) {
519 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 528 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
@@ -523,16 +532,15 @@ static int write_exec(struct page_collect *pcol)
523 532
524 *pcol_copy = *pcol; 533 *pcol_copy = *pcol;
525 534
535 ios = pcol->ios;
526 ios->pages = pcol_copy->pages; 536 ios->pages = pcol_copy->pages;
527 ios->nr_pages = pcol_copy->nr_pages; 537 ios->nr_pages = pcol_copy->nr_pages;
528 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
529 ios->length = pcol_copy->length;
530 ios->done = writepages_done; 538 ios->done = writepages_done;
531 ios->private = pcol_copy; 539 ios->private = pcol_copy;
532 540
533 ret = exofs_oi_write(oi, ios); 541 ret = ore_write(ios);
534 if (unlikely(ret)) { 542 if (unlikely(ret)) {
535 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); 543 EXOFS_ERR("write_exec: ore_write() Failed\n");
536 goto err; 544 goto err;
537 } 545 }
538 546
@@ -844,17 +852,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
844 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 852 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
845} 853}
846 854
847const struct osd_attr g_attr_logical_length = ATTR_DEF(
848 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
849
850static int _do_truncate(struct inode *inode, loff_t newsize) 855static int _do_truncate(struct inode *inode, loff_t newsize)
851{ 856{
852 struct exofs_i_info *oi = exofs_i(inode); 857 struct exofs_i_info *oi = exofs_i(inode);
858 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
853 int ret; 859 int ret;
854 860
855 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 861 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
856 862
857 ret = exofs_oi_truncate(oi, (u64)newsize); 863 ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
858 if (likely(!ret)) 864 if (likely(!ret))
859 truncate_setsize(inode, newsize); 865 truncate_setsize(inode, newsize);
860 866
@@ -917,30 +923,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
917 [1] = g_attr_inode_file_layout, 923 [1] = g_attr_inode_file_layout,
918 [2] = g_attr_inode_dir_layout, 924 [2] = g_attr_inode_dir_layout,
919 }; 925 };
920 struct exofs_io_state *ios; 926 struct ore_io_state *ios;
921 struct exofs_on_disk_inode_layout *layout; 927 struct exofs_on_disk_inode_layout *layout;
922 int ret; 928 int ret;
923 929
924 ret = exofs_get_io_state(&sbi->layout, &ios); 930 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
925 if (unlikely(ret)) { 931 if (unlikely(ret)) {
926 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 932 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
927 return ret; 933 return ret;
928 } 934 }
929 935
930 ios->obj.id = exofs_oi_objno(oi); 936 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
931 exofs_make_credential(oi->i_cred, &ios->obj); 937 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
932 ios->cred = oi->i_cred;
933
934 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
935 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
936 938
937 ios->in_attr = attrs; 939 ios->in_attr = attrs;
938 ios->in_attr_len = ARRAY_SIZE(attrs); 940 ios->in_attr_len = ARRAY_SIZE(attrs);
939 941
940 ret = exofs_sbi_read(ios); 942 ret = ore_read(ios);
941 if (unlikely(ret)) { 943 if (unlikely(ret)) {
942 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", 944 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
943 _LLU(ios->obj.id), ret); 945 _LLU(oi->one_comp.obj.id), ret);
944 memset(inode, 0, sizeof(*inode)); 946 memset(inode, 0, sizeof(*inode));
945 inode->i_mode = 0040000 | (0777 & ~022); 947 inode->i_mode = 0040000 | (0777 & ~022);
946 /* If object is lost on target we might as well enable its 948 /* If object is lost on target we might as well enable its
@@ -990,7 +992,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
990 } 992 }
991 993
992out: 994out:
993 exofs_put_io_state(ios); 995 ore_put_io_state(ios);
994 return ret; 996 return ret;
995} 997}
996 998
@@ -1016,6 +1018,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1016 return inode; 1018 return inode;
1017 oi = exofs_i(inode); 1019 oi = exofs_i(inode);
1018 __oi_init(oi); 1020 __oi_init(oi);
1021 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1022 exofs_oi_objno(oi));
1019 1023
1020 /* read the inode from the osd */ 1024 /* read the inode from the osd */
1021 ret = exofs_get_inode(sb, oi, &fcb); 1025 ret = exofs_get_inode(sb, oi, &fcb);
@@ -1107,21 +1111,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1107 * set the obj_created flag so that other methods know that the object exists on 1111 * set the obj_created flag so that other methods know that the object exists on
1108 * the OSD. 1112 * the OSD.
1109 */ 1113 */
1110static void create_done(struct exofs_io_state *ios, void *p) 1114static void create_done(struct ore_io_state *ios, void *p)
1111{ 1115{
1112 struct inode *inode = p; 1116 struct inode *inode = p;
1113 struct exofs_i_info *oi = exofs_i(inode); 1117 struct exofs_i_info *oi = exofs_i(inode);
1114 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1118 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1115 int ret; 1119 int ret;
1116 1120
1117 ret = exofs_check_io(ios, NULL); 1121 ret = ore_check_io(ios, NULL);
1118 exofs_put_io_state(ios); 1122 ore_put_io_state(ios);
1119 1123
1120 atomic_dec(&sbi->s_curr_pending); 1124 atomic_dec(&sbi->s_curr_pending);
1121 1125
1122 if (unlikely(ret)) { 1126 if (unlikely(ret)) {
1123 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", 1127 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1124 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1128 _LLU(exofs_oi_objno(oi)),
1129 _LLU(oi->one_comp.obj.partition));
1125 /*TODO: When FS is corrupted creation can fail, object already 1130 /*TODO: When FS is corrupted creation can fail, object already
1126 * exist. Get rid of this asynchronous creation, if exist 1131 * exist. Get rid of this asynchronous creation, if exist
1127 * increment the obj counter and try the next object. Until we 1132 * increment the obj counter and try the next object. Until we
@@ -1140,14 +1145,13 @@ static void create_done(struct exofs_io_state *ios, void *p)
1140 */ 1145 */
1141struct inode *exofs_new_inode(struct inode *dir, int mode) 1146struct inode *exofs_new_inode(struct inode *dir, int mode)
1142{ 1147{
1143 struct super_block *sb; 1148 struct super_block *sb = dir->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1144 struct inode *inode; 1150 struct inode *inode;
1145 struct exofs_i_info *oi; 1151 struct exofs_i_info *oi;
1146 struct exofs_sb_info *sbi; 1152 struct ore_io_state *ios;
1147 struct exofs_io_state *ios;
1148 int ret; 1153 int ret;
1149 1154
1150 sb = dir->i_sb;
1151 inode = new_inode(sb); 1155 inode = new_inode(sb);
1152 if (!inode) 1156 if (!inode)
1153 return ERR_PTR(-ENOMEM); 1157 return ERR_PTR(-ENOMEM);
@@ -1157,8 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 1161
1158 set_obj_2bcreated(oi); 1162 set_obj_2bcreated(oi);
1159 1163
1160 sbi = sb->s_fs_info;
1161
1162 inode->i_mapping->backing_dev_info = sb->s_bdi; 1164 inode->i_mapping->backing_dev_info = sb->s_bdi;
1163 inode_init_owner(inode, dir, mode); 1165 inode_init_owner(inode, dir, mode);
1164 inode->i_ino = sbi->s_nextid++; 1166 inode->i_ino = sbi->s_nextid++;
@@ -1170,25 +1172,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1170 spin_unlock(&sbi->s_next_gen_lock); 1172 spin_unlock(&sbi->s_next_gen_lock);
1171 insert_inode_hash(inode); 1173 insert_inode_hash(inode);
1172 1174
1175 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1176 exofs_oi_objno(oi));
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1177 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174 1178
1175 mark_inode_dirty(inode); 1179 mark_inode_dirty(inode);
1176 1180
1177 ret = exofs_get_io_state(&sbi->layout, &ios); 1181 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1178 if (unlikely(ret)) { 1182 if (unlikely(ret)) {
1179 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1183 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
1180 return ERR_PTR(ret); 1184 return ERR_PTR(ret);
1181 } 1185 }
1182 1186
1183 ios->obj.id = exofs_oi_objno(oi);
1184 exofs_make_credential(oi->i_cred, &ios->obj);
1185
1186 ios->done = create_done; 1187 ios->done = create_done;
1187 ios->private = inode; 1188 ios->private = inode;
1188 ios->cred = oi->i_cred; 1189
1189 ret = exofs_sbi_create(ios); 1190 ret = ore_create(ios);
1190 if (ret) { 1191 if (ret) {
1191 exofs_put_io_state(ios); 1192 ore_put_io_state(ios);
1192 return ERR_PTR(ret); 1193 return ERR_PTR(ret);
1193 } 1194 }
1194 atomic_inc(&sbi->s_curr_pending); 1195 atomic_inc(&sbi->s_curr_pending);
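The hunk above is the heart of the conversion: the object id and credential no longer live on the io_state (ios->obj.id and ios->cred are gone) but travel in oi->comps, filled once by exofs_init_comps(). A minimal sketch of the resulting async-create pattern, using only names from this hunk; error paths are compressed, so treat it as an illustration rather than the committed function:

    /* Sketch: ORE async object creation as now done by exofs_new_inode().
     * oi->comps already carries obj id + credential via exofs_init_comps().
     */
    static int sketch_create_object(struct exofs_sb_info *sbi,
                                    struct exofs_i_info *oi,
                                    struct inode *inode)
    {
            struct ore_io_state *ios;
            int ret;

            ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
            if (unlikely(ret))
                    return ret;

            ios->done = create_done;        /* async; create_done frees ios */
            ios->private = inode;

            ret = ore_create(ios);
            if (ret) {
                    ore_put_io_state(ios);  /* sync failure: we still own ios */
                    return ret;
            }
            atomic_inc(&sbi->s_curr_pending); /* create_done() decrements */
            return 0;
    }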
@@ -1207,11 +1208,11 @@ struct updatei_args {
1207/* 1208/*
1208 * Callback function from exofs_update_inode(). 1209 * Callback function from exofs_update_inode().
1209 */ 1210 */
1210static void updatei_done(struct exofs_io_state *ios, void *p) 1211static void updatei_done(struct ore_io_state *ios, void *p)
1211{ 1212{
1212 struct updatei_args *args = p; 1213 struct updatei_args *args = p;
1213 1214
1214 exofs_put_io_state(ios); 1215 ore_put_io_state(ios);
1215 1216
1216 atomic_dec(&args->sbi->s_curr_pending); 1217 atomic_dec(&args->sbi->s_curr_pending);
1217 1218
@@ -1227,7 +1228,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1227 struct exofs_i_info *oi = exofs_i(inode); 1228 struct exofs_i_info *oi = exofs_i(inode);
1228 struct super_block *sb = inode->i_sb; 1229 struct super_block *sb = inode->i_sb;
1229 struct exofs_sb_info *sbi = sb->s_fs_info; 1230 struct exofs_sb_info *sbi = sb->s_fs_info;
1230 struct exofs_io_state *ios; 1231 struct ore_io_state *ios;
1231 struct osd_attr attr; 1232 struct osd_attr attr;
1232 struct exofs_fcb *fcb; 1233 struct exofs_fcb *fcb;
1233 struct updatei_args *args; 1234 struct updatei_args *args;
@@ -1266,9 +1267,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1266 } else 1267 } else
1267 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1268 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1268 1269
1269 ret = exofs_get_io_state(&sbi->layout, &ios); 1270 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1270 if (unlikely(ret)) { 1271 if (unlikely(ret)) {
1271 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
1272 goto free_args; 1273 goto free_args;
1273 } 1274 }
1274 1275
@@ -1285,13 +1286,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1285 ios->private = args; 1286 ios->private = args;
1286 } 1287 }
1287 1288
1288 ret = exofs_oi_write(oi, ios); 1289 ret = ore_write(ios);
1289 if (!do_sync && !ret) { 1290 if (!do_sync && !ret) {
1290 atomic_inc(&sbi->s_curr_pending); 1291 atomic_inc(&sbi->s_curr_pending);
1291 goto out; /* deallocation in updatei_done */ 1292 goto out; /* deallocation in updatei_done */
1292 } 1293 }
1293 1294
1294 exofs_put_io_state(ios); 1295 ore_put_io_state(ios);
1295free_args: 1296free_args:
1296 kfree(args); 1297 kfree(args);
1297out: 1298out:
@@ -1310,11 +1311,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1310 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1311 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1311 * do. 1312 * do.
1312 */ 1313 */
1313static void delete_done(struct exofs_io_state *ios, void *p) 1314static void delete_done(struct ore_io_state *ios, void *p)
1314{ 1315{
1315 struct exofs_sb_info *sbi = p; 1316 struct exofs_sb_info *sbi = p;
1316 1317
1317 exofs_put_io_state(ios); 1318 ore_put_io_state(ios);
1318 1319
1319 atomic_dec(&sbi->s_curr_pending); 1320 atomic_dec(&sbi->s_curr_pending);
1320} 1321}
@@ -1329,7 +1330,7 @@ void exofs_evict_inode(struct inode *inode)
1329 struct exofs_i_info *oi = exofs_i(inode); 1330 struct exofs_i_info *oi = exofs_i(inode);
1330 struct super_block *sb = inode->i_sb; 1331 struct super_block *sb = inode->i_sb;
1331 struct exofs_sb_info *sbi = sb->s_fs_info; 1332 struct exofs_sb_info *sbi = sb->s_fs_info;
1332 struct exofs_io_state *ios; 1333 struct ore_io_state *ios;
1333 int ret; 1334 int ret;
1334 1335
1335 truncate_inode_pages(&inode->i_data, 0); 1336 truncate_inode_pages(&inode->i_data, 0);
@@ -1349,20 +1350,19 @@ void exofs_evict_inode(struct inode *inode)
1349 /* ignore the error, attempt a remove anyway */ 1350 /* ignore the error, attempt a remove anyway */
1350 1351
1351 /* Now Remove the OSD objects */ 1352 /* Now Remove the OSD objects */
1352 ret = exofs_get_io_state(&sbi->layout, &ios); 1353 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1353 if (unlikely(ret)) { 1354 if (unlikely(ret)) {
1354 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1355 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
1355 return; 1356 return;
1356 } 1357 }
1357 1358
1358 ios->obj.id = exofs_oi_objno(oi);
1359 ios->done = delete_done; 1359 ios->done = delete_done;
1360 ios->private = sbi; 1360 ios->private = sbi;
1361 ios->cred = oi->i_cred; 1361
1362 ret = exofs_sbi_remove(ios); 1362 ret = ore_remove(ios);
1363 if (ret) { 1363 if (ret) {
1364 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); 1364 EXOFS_ERR("%s: ore_remove failed\n", __func__);
1365 exofs_put_io_state(ios); 1365 ore_put_io_state(ios);
1366 return; 1366 return;
1367 } 1367 }
1368 atomic_inc(&sbi->s_curr_pending); 1368 atomic_inc(&sbi->s_curr_pending);
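exofs_evict_inode() follows the same fire-and-forget shape as creation: every async submit pairs an atomic_inc of sbi->s_curr_pending at the call site with an atomic_dec in the ->done callback, which is what lets unmount drain in-flight I/O (see the wait loop in exofs_put_super() further down). The accounting contract, condensed from the hunk above:

    /* Shared by the create/update/delete paths above: */
    ios->done = delete_done;                /* delete_done() drops the count */
    ios->private = sbi;
    ret = ore_remove(ios);
    if (!ret)
            atomic_inc(&sbi->s_curr_pending); /* matched in delete_done() */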
diff --git a/fs/exofs/ios.c b/fs/exofs/ore.c
index f74a2ec027a6..25305af88198 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ore.c
@@ -23,81 +23,87 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <scsi/scsi_device.h>
27#include <asm/div64.h> 26#include <asm/div64.h>
28 27
29#include "exofs.h" 28#include <scsi/osd_ore.h>
30 29
31#define EXOFS_DBGMSG2(M...) do {} while (0) 30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33 31
34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) 32#ifdef CONFIG_EXOFS_DEBUG
35{ 33#define ORE_DBGMSG(fmt, a...) \
36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
37} 35#else
36#define ORE_DBGMSG(fmt, a...) \
37 do { if (0) printk(fmt, ##a); } while (0)
38#endif
38 39
39int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 40/* u64 has problems with printk; this will cast it to unsigned long long */
40 u64 offset, void *p, unsigned length) 41#define _LLU(x) (unsigned long long)(x)
41{
42 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
43/* struct osd_sense_info osi = {.key = 0};*/
44 int ret;
45 42
46 if (unlikely(!or)) { 43#define ORE_DBGMSG2(M...) do {} while (0)
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); 44/* #define ORE_DBGMSG2 ORE_DBGMSG */
48 return -ENOMEM;
49 }
50 ret = osd_req_read_kern(or, obj, offset, p, length);
51 if (unlikely(ret)) {
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
53 goto out;
54 }
55 45
56 ret = osd_finalize_request(or, 0, cred, NULL); 46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
57 if (unlikely(ret)) { 47MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); 48MODULE_LICENSE("GPL");
59 goto out;
60 }
61 49
62 ret = osd_execute_request(or); 50static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
63 if (unlikely(ret)) 51{
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); 52 return ios->comps->comps[index & ios->comps->single_comp].cred;
65 /* osd_req_decode_sense(or, ret); */ 53}
66 54
67out: 55static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
68 osd_end_request(or); 56{
69 return ret; 57 return &ios->comps->comps[index & ios->comps->single_comp].obj;
70} 58}
71 59
72int exofs_get_io_state(struct exofs_layout *layout, 60static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
73 struct exofs_io_state **pios)
74{ 61{
75 struct exofs_io_state *ios; 62 return ios->comps->ods[index];
63}
64
65int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
66 bool is_reading, u64 offset, u64 length,
67 struct ore_io_state **pios)
68{
69 struct ore_io_state *ios;
76 70
77 /*TODO: Maybe use a kmem_cache per sbi of size 71 /*TODO: Maybe use a kmem_cache per sbi of size
78 * exofs_io_state_size(layout->s_numdevs) 72 * exofs_io_state_size(layout->s_numdevs)
79 */ 73 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 74 ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 75 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", 76 ORE_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 77 ore_io_state_size(comps->numdevs));
84 *pios = NULL; 78 *pios = NULL;
85 return -ENOMEM; 79 return -ENOMEM;
86 } 80 }
87 81
88 ios->layout = layout; 82 ios->layout = layout;
89 ios->obj.partition = layout->s_pid; 83 ios->comps = comps;
84 ios->offset = offset;
85 ios->length = length;
86 ios->reading = is_reading;
87
90 *pios = ios; 88 *pios = ios;
91 return 0; 89 return 0;
92} 90}
91EXPORT_SYMBOL(ore_get_rw_state);
92
93int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
94 struct ore_io_state **ios)
95{
96 return ore_get_rw_state(layout, comps, true, 0, 0, ios);
97}
98EXPORT_SYMBOL(ore_get_io_state);
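ore_get_io_state() is now just ore_get_rw_state() with offset and length zeroed, and the per-device helpers above resolve credential, object id and device from the component table. The index & ios->comps->single_comp mask lets one table serve both shapes: EC_SINGLE_COMP (set in the super.c hunk below) presumably reads as 0, collapsing every index onto comps[0], while a full per-device table would use an all-ones mask. A hedged usage sketch; the kern_buff choice is an assumption modeled on the callers in inode.c and super.c:

    /* Sketch: synchronous kernel-buffer read through the new API. */
    static int sketch_sync_read(struct ore_layout *layout,
                                struct ore_components *comps,
                                void *buf, u64 offset, u64 length)
    {
            struct ore_io_state *ios;
            int ret;

            ret = ore_get_rw_state(layout, comps, true /* reading */,
                                   offset, length, &ios);
            if (unlikely(ret))
                    return ret;

            ios->kern_buff = buf;   /* kernel-buffer (non-pages) path */
            ret = ore_read(ios);    /* ios->done == NULL => synchronous */
            ore_put_io_state(ios);
            return ret;
    }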
93 99
94void exofs_put_io_state(struct exofs_io_state *ios) 100void ore_put_io_state(struct ore_io_state *ios)
95{ 101{
96 if (ios) { 102 if (ios) {
97 unsigned i; 103 unsigned i;
98 104
99 for (i = 0; i < ios->numdevs; i++) { 105 for (i = 0; i < ios->numdevs; i++) {
100 struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; 106 struct ore_per_dev_state *per_dev = &ios->per_dev[i];
101 107
102 if (per_dev->or) 108 if (per_dev->or)
103 osd_end_request(per_dev->or); 109 osd_end_request(per_dev->or);
@@ -108,31 +114,9 @@ void exofs_put_io_state(struct exofs_io_state *ios)
108 kfree(ios); 114 kfree(ios);
109 } 115 }
110} 116}
117EXPORT_SYMBOL(ore_put_io_state);
111 118
112unsigned exofs_layout_od_id(struct exofs_layout *layout, 119static void _sync_done(struct ore_io_state *ios, void *p)
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
135static void _sync_done(struct exofs_io_state *ios, void *p)
136{ 120{
137 struct completion *waiting = p; 121 struct completion *waiting = p;
138 122
@@ -141,20 +125,20 @@ static void _sync_done(struct exofs_io_state *ios, void *p)
141 125
142static void _last_io(struct kref *kref) 126static void _last_io(struct kref *kref)
143{ 127{
144 struct exofs_io_state *ios = container_of( 128 struct ore_io_state *ios = container_of(
145 kref, struct exofs_io_state, kref); 129 kref, struct ore_io_state, kref);
146 130
147 ios->done(ios, ios->private); 131 ios->done(ios, ios->private);
148} 132}
149 133
150static void _done_io(struct osd_request *or, void *p) 134static void _done_io(struct osd_request *or, void *p)
151{ 135{
152 struct exofs_io_state *ios = p; 136 struct ore_io_state *ios = p;
153 137
154 kref_put(&ios->kref, _last_io); 138 kref_put(&ios->kref, _last_io);
155} 139}
156 140
157static int exofs_io_execute(struct exofs_io_state *ios) 141static int ore_io_execute(struct ore_io_state *ios)
158{ 142{
159 DECLARE_COMPLETION_ONSTACK(wait); 143 DECLARE_COMPLETION_ONSTACK(wait);
160 bool sync = (ios->done == NULL); 144 bool sync = (ios->done == NULL);
@@ -170,9 +154,9 @@ static int exofs_io_execute(struct exofs_io_state *ios)
170 if (unlikely(!or)) 154 if (unlikely(!or))
171 continue; 155 continue;
172 156
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 157 ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
174 if (unlikely(ret)) { 158 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", 159 ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 160 ret);
177 return ret; 161 return ret;
178 } 162 }
@@ -194,7 +178,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
194 178
195 if (sync) { 179 if (sync) {
196 wait_for_completion(&wait); 180 wait_for_completion(&wait);
197 ret = exofs_check_io(ios, NULL); 181 ret = ore_check_io(ios, NULL);
198 } 182 }
199 return ret; 183 return ret;
200} 184}
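ore_io_execute() preserves the old dual personality: with ios->done == NULL the call is synchronous. The elided middle of the function presumably installs _sync_done() with an on-stack completion, takes one kref per submitted request, and _last_io() fires ios->done once the last request drops its reference. Condensed control flow, a sketch rather than the committed body:

    DECLARE_COMPLETION_ONSTACK(wait);
    bool sync = (ios->done == NULL);

    if (sync) {
            ios->done = _sync_done;
            ios->private = &wait;
    }
    /* ... submit every per_dev[i].or, one kref_get(&ios->kref) each ... */
    if (sync) {
            wait_for_completion(&wait);     /* _last_io() -> _sync_done() */
            ret = ore_check_io(ios, NULL);  /* harvest per-device errors */
    }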
@@ -214,7 +198,7 @@ static void _clear_bio(struct bio *bio)
214 } 198 }
215} 199}
216 200
217int exofs_check_io(struct exofs_io_state *ios, u64 *resid) 201int ore_check_io(struct ore_io_state *ios, u64 *resid)
218{ 202{
219 enum osd_err_priority acumulated_osd_err = 0; 203 enum osd_err_priority acumulated_osd_err = 0;
220 int acumulated_lin_err = 0; 204 int acumulated_lin_err = 0;
@@ -235,7 +219,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 219 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
236 /* start read offset past end of file */ 220 /* start read offset past end of file */
237 _clear_bio(ios->per_dev[i].bio); 221 _clear_bio(ios->per_dev[i].bio);
238 EXOFS_DBGMSG("start read offset past end of file " 222 ORE_DBGMSG("start read offset past end of file "
239 "offset=0x%llx, length=0x%llx\n", 223 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset), 224 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length)); 225 _LLU(ios->per_dev[i].length));
@@ -259,6 +243,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
259 243
260 return acumulated_lin_err; 244 return acumulated_lin_err;
261} 245}
246EXPORT_SYMBOL(ore_check_io);
262 247
263/* 248/*
264 * L - logical offset into the file 249 * L - logical offset into the file
@@ -305,20 +290,21 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 290struct _striping_info {
306 u64 obj_offset; 291 u64 obj_offset;
307 u64 group_length; 292 u64 group_length;
293 u64 M; /* for truncate */
308 unsigned dev; 294 unsigned dev;
309 unsigned unit_off; 295 unsigned unit_off;
310}; 296};
311 297
312static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, 298static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
313 struct _striping_info *si) 299 struct _striping_info *si)
314{ 300{
315 u32 stripe_unit = ios->layout->stripe_unit; 301 u32 stripe_unit = layout->stripe_unit;
316 u32 group_width = ios->layout->group_width; 302 u32 group_width = layout->group_width;
317 u64 group_depth = ios->layout->group_depth; 303 u64 group_depth = layout->group_depth;
318 304
319 u32 U = stripe_unit * group_width; 305 u32 U = stripe_unit * group_width;
320 u64 T = U * group_depth; 306 u64 T = U * group_depth;
321 u64 S = T * ios->layout->group_count; 307 u64 S = T * layout->group_count;
322 u64 M = div64_u64(file_offset, S); 308 u64 M = div64_u64(file_offset, S);
323 309
324 /* 310 /*
@@ -333,7 +319,7 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
333 319
334 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 320 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
335 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 321 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
336 si->dev *= ios->layout->mirrors_p1; 322 si->dev *= layout->mirrors_p1;
337 323
338 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 324 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
339 325
@@ -341,15 +327,16 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
341 (M * group_depth * stripe_unit); 327 (M * group_depth * stripe_unit);
342 328
343 si->group_length = T - H; 329 si->group_length = T - H;
330 si->M = M;
344} 331}
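_calc_stripe_info() now takes the layout directly instead of an io_state, which is what lets ore_truncate() below reuse it without building any I/O. The mapping walks the layout hierarchy: stripe_unit slices make a stripe (U = stripe_unit * group_width), group_depth stripes make a group (T), and group_count groups make one full cycle (S). A worked instance follows; the layout numbers are illustrative only, and G/H/N are computed in lines elided from this hunk, following the formulas documented in the source comment block:

    /* Worked example for _calc_stripe_info(). Assumed layout:
     *   stripe_unit = 64K, group_width = 4, group_depth = 2,
     *   group_count = 2, mirrors_p1 = 1, file_offset = 600K
     *
     *   U = 64K * 4   = 256K      one full stripe
     *   T = 256K * 2  = 512K      one group
     *   S = 512K * 2  = 1M        one cycle over all groups
     *   M = 600K / 1M = 0         cycle number (kept in si->M for truncate)
     *   G = (600K - 0*1M) / 512K = 1     group within the cycle
     *   H = 600K - 1*512K        = 88K   offset within the group
     *   N = 88K / 256K           = 0     stripe within the group
     *
     *   si->dev          = (88K - 0*256K)/64K + 1*4 = 5, then *1 (mirrors_p1)
     *   si->unit_off     = 600K % 64K               = 24K
     *   si->obj_offset   = 24K + 0*64K + 0*2*64K    = 24K
     *   si->group_length = 512K - 88K               = 424K
     */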
345 332
346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 333static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
347 unsigned pgbase, struct exofs_per_dev_state *per_dev, 334 unsigned pgbase, struct ore_per_dev_state *per_dev,
348 int cur_len) 335 int cur_len)
349{ 336{
350 unsigned pg = *cur_pg; 337 unsigned pg = *cur_pg;
351 struct request_queue *q = 338 struct request_queue *q =
352 osd_request_queue(exofs_ios_od(ios, per_dev->dev)); 339 osd_request_queue(_ios_od(ios, per_dev->dev));
353 340
354 per_dev->length += cur_len; 341 per_dev->length += cur_len;
355 342
@@ -361,7 +348,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 348
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 349 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 350 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", 351 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 352 bio_size);
366 return -ENOMEM; 353 return -ENOMEM;
367 } 354 }
@@ -387,7 +374,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
387 return 0; 374 return 0;
388} 375}
389 376
390static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 377static int _prepare_one_group(struct ore_io_state *ios, u64 length,
391 struct _striping_info *si) 378 struct _striping_info *si)
392{ 379{
393 unsigned stripe_unit = ios->layout->stripe_unit; 380 unsigned stripe_unit = ios->layout->stripe_unit;
@@ -400,7 +387,7 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
400 int ret = 0; 387 int ret = 0;
401 388
402 while (length) { 389 while (length) {
403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; 390 struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
404 unsigned cur_len, page_off = 0; 391 unsigned cur_len, page_off = 0;
405 392
406 if (!per_dev->length) { 393 if (!per_dev->length) {
@@ -443,7 +430,7 @@ out:
443 return ret; 430 return ret;
444} 431}
445 432
446static int _prepare_for_striping(struct exofs_io_state *ios) 433static int _prepare_for_striping(struct ore_io_state *ios)
447{ 434{
448 u64 length = ios->length; 435 u64 length = ios->length;
449 u64 offset = ios->offset; 436 u64 offset = ios->offset;
@@ -452,9 +439,9 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
452 439
453 if (!ios->pages) { 440 if (!ios->pages) {
454 if (ios->kern_buff) { 441 if (ios->kern_buff) {
455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 442 struct ore_per_dev_state *per_dev = &ios->per_dev[0];
456 443
457 _calc_stripe_info(ios, ios->offset, &si); 444 _calc_stripe_info(ios->layout, ios->offset, &si);
458 per_dev->offset = si.obj_offset; 445 per_dev->offset = si.obj_offset;
459 per_dev->dev = si.dev; 446 per_dev->dev = si.dev;
460 447
@@ -468,7 +455,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
468 } 455 }
469 456
470 while (length) { 457 while (length) {
471 _calc_stripe_info(ios, offset, &si); 458 _calc_stripe_info(ios->layout, offset, &si);
472 459
473 if (length < si.group_length) 460 if (length < si.group_length)
474 si.group_length = length; 461 si.group_length = length;
@@ -485,57 +472,59 @@ out:
485 return ret; 472 return ret;
486} 473}
487 474
488int exofs_sbi_create(struct exofs_io_state *ios) 475int ore_create(struct ore_io_state *ios)
489{ 476{
490 int i, ret; 477 int i, ret;
491 478
492 for (i = 0; i < ios->layout->s_numdevs; i++) { 479 for (i = 0; i < ios->comps->numdevs; i++) {
493 struct osd_request *or; 480 struct osd_request *or;
494 481
495 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 482 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
496 if (unlikely(!or)) { 483 if (unlikely(!or)) {
497 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 484 ORE_ERR("%s: osd_start_request failed\n", __func__);
498 ret = -ENOMEM; 485 ret = -ENOMEM;
499 goto out; 486 goto out;
500 } 487 }
501 ios->per_dev[i].or = or; 488 ios->per_dev[i].or = or;
502 ios->numdevs++; 489 ios->numdevs++;
503 490
504 osd_req_create_object(or, &ios->obj); 491 osd_req_create_object(or, _ios_obj(ios, i));
505 } 492 }
506 ret = exofs_io_execute(ios); 493 ret = ore_io_execute(ios);
507 494
508out: 495out:
509 return ret; 496 return ret;
510} 497}
498EXPORT_SYMBOL(ore_create);
511 499
512int exofs_sbi_remove(struct exofs_io_state *ios) 500int ore_remove(struct ore_io_state *ios)
513{ 501{
514 int i, ret; 502 int i, ret;
515 503
516 for (i = 0; i < ios->layout->s_numdevs; i++) { 504 for (i = 0; i < ios->comps->numdevs; i++) {
517 struct osd_request *or; 505 struct osd_request *or;
518 506
519 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 507 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
520 if (unlikely(!or)) { 508 if (unlikely(!or)) {
521 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 509 ORE_ERR("%s: osd_start_request failed\n", __func__);
522 ret = -ENOMEM; 510 ret = -ENOMEM;
523 goto out; 511 goto out;
524 } 512 }
525 ios->per_dev[i].or = or; 513 ios->per_dev[i].or = or;
526 ios->numdevs++; 514 ios->numdevs++;
527 515
528 osd_req_remove_object(or, &ios->obj); 516 osd_req_remove_object(or, _ios_obj(ios, i));
529 } 517 }
530 ret = exofs_io_execute(ios); 518 ret = ore_io_execute(ios);
531 519
532out: 520out:
533 return ret; 521 return ret;
534} 522}
523EXPORT_SYMBOL(ore_remove);
535 524
536static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) 525static int _write_mirror(struct ore_io_state *ios, int cur_comp)
537{ 526{
538 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; 527 struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
539 unsigned dev = ios->per_dev[cur_comp].dev; 528 unsigned dev = ios->per_dev[cur_comp].dev;
540 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 529 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
541 int ret = 0; 530 int ret = 0;
@@ -544,12 +533,12 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
544 return 0; /* Just an empty slot */ 533 return 0; /* Just an empty slot */
545 534
546 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 535 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
547 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 536 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
548 struct osd_request *or; 537 struct osd_request *or;
549 538
550 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); 539 or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
551 if (unlikely(!or)) { 540 if (unlikely(!or)) {
552 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 541 ORE_ERR("%s: osd_start_request failed\n", __func__);
553 ret = -ENOMEM; 542 ret = -ENOMEM;
554 goto out; 543 goto out;
555 } 544 }
@@ -563,7 +552,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
563 bio = bio_kmalloc(GFP_KERNEL, 552 bio = bio_kmalloc(GFP_KERNEL,
564 master_dev->bio->bi_max_vecs); 553 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 554 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 555 ORE_DBGMSG(
567 "Failed to allocate BIO size=%u\n", 556 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 557 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 558 ret = -ENOMEM;
@@ -582,25 +571,29 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
582 bio->bi_rw |= REQ_WRITE; 571 bio->bi_rw |= REQ_WRITE;
583 } 572 }
584 573
585 osd_req_write(or, &ios->obj, per_dev->offset, bio, 574 osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
586 per_dev->length); 575 bio, per_dev->length);
587 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " 576 ORE_DBGMSG("write(0x%llx) offset=0x%llx "
588 "length=0x%llx dev=%d\n", 577 "length=0x%llx dev=%d\n",
589 _LLU(ios->obj.id), _LLU(per_dev->offset), 578 _LLU(_ios_obj(ios, dev)->id),
579 _LLU(per_dev->offset),
590 _LLU(per_dev->length), dev); 580 _LLU(per_dev->length), dev);
591 } else if (ios->kern_buff) { 581 } else if (ios->kern_buff) {
592 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, 582 ret = osd_req_write_kern(or, _ios_obj(ios, dev),
593 ios->kern_buff, ios->length); 583 per_dev->offset,
584 ios->kern_buff, ios->length);
594 if (unlikely(ret)) 585 if (unlikely(ret))
595 goto out; 586 goto out;
596 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 587 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
597 "length=0x%llx dev=%d\n", 588 "length=0x%llx dev=%d\n",
598 _LLU(ios->obj.id), _LLU(per_dev->offset), 589 _LLU(_ios_obj(ios, dev)->id),
590 _LLU(per_dev->offset),
599 _LLU(ios->length), dev); 591 _LLU(ios->length), dev);
600 } else { 592 } else {
601 osd_req_set_attributes(or, &ios->obj); 593 osd_req_set_attributes(or, _ios_obj(ios, dev));
602 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 594 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
603 _LLU(ios->obj.id), ios->out_attr_len, dev); 595 _LLU(_ios_obj(ios, dev)->id),
596 ios->out_attr_len, dev);
604 } 597 }
605 598
606 if (ios->out_attr) 599 if (ios->out_attr)
@@ -616,7 +609,7 @@ out:
616 return ret; 609 return ret;
617} 610}
618 611
619int exofs_sbi_write(struct exofs_io_state *ios) 612int ore_write(struct ore_io_state *ios)
620{ 613{
621 int i; 614 int i;
622 int ret; 615 int ret;
@@ -626,52 +619,55 @@ int exofs_sbi_write(struct exofs_io_state *ios)
626 return ret; 619 return ret;
627 620
628 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 621 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
629 ret = _sbi_write_mirror(ios, i); 622 ret = _write_mirror(ios, i);
630 if (unlikely(ret)) 623 if (unlikely(ret))
631 return ret; 624 return ret;
632 } 625 }
633 626
634 ret = exofs_io_execute(ios); 627 ret = ore_io_execute(ios);
635 return ret; 628 return ret;
636} 629}
630EXPORT_SYMBOL(ore_write);
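ore_write() walks per_dev[] in strides of mirrors_p1 because _prepare_for_striping() places stripe components only at the master slots; _write_mirror() then clones the master bio into each following mirror slot. Schematically, an assumption spelled out from the stride above:

    /* per_dev[] layout under mirroring, e.g. group_width = 2, mirrors_p1 = 2:
     *
     *   per_dev[0]  stripe comp 0, master bio
     *   per_dev[1]    mirror of comp 0 (bio cloned in _write_mirror)
     *   per_dev[2]  stripe comp 1, master bio
     *   per_dev[3]    mirror of comp 1
     *
     * hence:
     *   for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1)
     *           _write_mirror(ios, i);
     */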
637 631
638static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) 632static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
639{ 633{
640 struct osd_request *or; 634 struct osd_request *or;
641 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 635 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
642 unsigned first_dev = (unsigned)ios->obj.id; 636 struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
637 unsigned first_dev = (unsigned)obj->id;
643 638
644 if (ios->pages && !per_dev->length) 639 if (ios->pages && !per_dev->length)
645 return 0; /* Just an empty slot */ 640 return 0; /* Just an empty slot */
646 641
647 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; 642 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
648 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); 643 or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
649 if (unlikely(!or)) { 644 if (unlikely(!or)) {
650 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 645 ORE_ERR("%s: osd_start_request failed\n", __func__);
651 return -ENOMEM; 646 return -ENOMEM;
652 } 647 }
653 per_dev->or = or; 648 per_dev->or = or;
654 649
655 if (ios->pages) { 650 if (ios->pages) {
656 osd_req_read(or, &ios->obj, per_dev->offset, 651 osd_req_read(or, obj, per_dev->offset,
657 per_dev->bio, per_dev->length); 652 per_dev->bio, per_dev->length);
658 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 653 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
659 " dev=%d\n", _LLU(ios->obj.id), 654 " dev=%d\n", _LLU(obj->id),
660 _LLU(per_dev->offset), _LLU(per_dev->length), 655 _LLU(per_dev->offset), _LLU(per_dev->length),
661 first_dev); 656 first_dev);
662 } else if (ios->kern_buff) { 657 } else if (ios->kern_buff) {
663 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, 658 int ret = osd_req_read_kern(or, obj, per_dev->offset,
664 ios->kern_buff, ios->length); 659 ios->kern_buff, ios->length);
665 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " 660 ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
666 "length=0x%llx dev=%d ret=>%d\n", 661 "length=0x%llx dev=%d ret=>%d\n",
667 _LLU(ios->obj.id), _LLU(per_dev->offset), 662 _LLU(obj->id), _LLU(per_dev->offset),
668 _LLU(ios->length), first_dev, ret); 663 _LLU(ios->length), first_dev, ret);
669 if (unlikely(ret)) 664 if (unlikely(ret))
670 return ret; 665 return ret;
671 } else { 666 } else {
672 osd_req_get_attributes(or, &ios->obj); 667 osd_req_get_attributes(or, obj);
673 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 668 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
674 _LLU(ios->obj.id), ios->in_attr_len, first_dev); 669 _LLU(obj->id),
670 ios->in_attr_len, first_dev);
675 } 671 }
676 if (ios->out_attr) 672 if (ios->out_attr)
677 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); 673 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
@@ -682,7 +678,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
682 return 0; 678 return 0;
683} 679}
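Unlike writes, a read touches a single replica per component. _read_mirror() picks it by folding the object id into the mirror set, so reads of different objects spread across the mirrors:

    /* From _read_mirror() above: per_dev->dev is the master slot; obj->id
     * selects which replica services this read.
     * Example with mirrors_p1 = 3 and per_dev->dev = 6:
     *   obj->id % 3 == 0  ->  device 6
     *   obj->id % 3 == 1  ->  device 7
     *   obj->id % 3 == 2  ->  device 8
     */
    first_dev = per_dev->dev + (unsigned)obj->id % ios->layout->mirrors_p1;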
684 680
685int exofs_sbi_read(struct exofs_io_state *ios) 681int ore_read(struct ore_io_state *ios)
686{ 682{
687 int i; 683 int i;
688 int ret; 684 int ret;
@@ -692,16 +688,17 @@ int exofs_sbi_read(struct exofs_io_state *ios)
692 return ret; 688 return ret;
693 689
694 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 690 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
695 ret = _sbi_read_mirror(ios, i); 691 ret = _read_mirror(ios, i);
696 if (unlikely(ret)) 692 if (unlikely(ret))
697 return ret; 693 return ret;
698 } 694 }
699 695
700 ret = exofs_io_execute(ios); 696 ret = ore_io_execute(ios);
701 return ret; 697 return ret;
702} 698}
699EXPORT_SYMBOL(ore_read);
703 700
704int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) 701int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
705{ 702{
706 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ 703 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
707 void *iter = NULL; 704 void *iter = NULL;
@@ -721,83 +718,118 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
721 718
722 return -EIO; 719 return -EIO;
723} 720}
721EXPORT_SYMBOL(extract_attr_from_ios);
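extract_attr_from_ios() scans the decoded get-attr list of the first per-device request for a matching page/id and hands back its value pointer. Together with the attributes-only branch of _read_mirror() above (no pages, no kern_buff), this gives a compact attribute-fetch pattern. A hedged sketch; the get_unaligned_be64() decode is an assumption based on the 8-byte ATTR_DEF of g_attr_logical_length at the bottom of this file:

    #include <asm/unaligned.h>

    /* Sketch: read one OSD attribute. ios comes from ore_get_io_state()
     * with no pages/kern_buff set, so ore_read() issues get_attributes.
     */
    static int sketch_read_length(struct ore_io_state *ios, u64 *len)
    {
            struct osd_attr attr = g_attr_logical_length;
            int ret;

            ios->in_attr = &attr;
            ios->in_attr_len = 1;

            ret = ore_read(ios);
            if (unlikely(ret))
                    return ret;

            ret = extract_attr_from_ios(ios, &attr);
            if (unlikely(ret))
                    return ret;

            *len = get_unaligned_be64(attr.val_ptr); /* 8 bytes, big-endian */
            return 0;
    }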
724 722
725static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, 723static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
726 struct osd_attr *attr) 724 struct osd_attr *attr)
727{ 725{
728 int last_comp = cur_comp + ios->layout->mirrors_p1; 726 int last_comp = cur_comp + ios->layout->mirrors_p1;
729 727
730 for (; cur_comp < last_comp; ++cur_comp) { 728 for (; cur_comp < last_comp; ++cur_comp) {
731 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 729 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
732 struct osd_request *or; 730 struct osd_request *or;
733 731
734 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); 732 or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
735 if (unlikely(!or)) { 733 if (unlikely(!or)) {
736 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 734 ORE_ERR("%s: osd_start_request failed\n", __func__);
737 return -ENOMEM; 735 return -ENOMEM;
738 } 736 }
739 per_dev->or = or; 737 per_dev->or = or;
740 738
741 osd_req_set_attributes(or, &ios->obj); 739 osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
742 osd_req_add_set_attr_list(or, attr, 1); 740 osd_req_add_set_attr_list(or, attr, 1);
743 } 741 }
744 742
745 return 0; 743 return 0;
746} 744}
747 745
748int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 746struct _trunc_info {
747 struct _striping_info si;
748 u64 prev_group_obj_off;
749 u64 next_group_obj_off;
750
751 unsigned first_group_dev;
752 unsigned nex_group_dev;
753 unsigned max_devs;
754};
755
756void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
757 struct _trunc_info *ti)
758{
759 unsigned stripe_unit = layout->stripe_unit;
760
761 _calc_stripe_info(layout, file_offset, &ti->si);
762
763 ti->prev_group_obj_off = ti->si.M * stripe_unit;
764 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
765
766 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
767 ti->nex_group_dev = ti->first_group_dev + layout->group_width;
768 ti->max_devs = layout->group_width * layout->group_count;
769}
770
771int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
772 u64 size)
749{ 773{
750 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 774 struct ore_io_state *ios;
751 struct exofs_io_state *ios;
752 struct exofs_trunc_attr { 775 struct exofs_trunc_attr {
753 struct osd_attr attr; 776 struct osd_attr attr;
754 __be64 newsize; 777 __be64 newsize;
755 } *size_attrs; 778 } *size_attrs;
756 struct _striping_info si; 779 struct _trunc_info ti;
757 int i, ret; 780 int i, ret;
758 781
759 ret = exofs_get_io_state(&sbi->layout, &ios); 782 ret = ore_get_io_state(layout, comps, &ios);
760 if (unlikely(ret)) 783 if (unlikely(ret))
761 return ret; 784 return ret;
762 785
763 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), 786 _calc_trunk_info(ios->layout, size, &ti);
787
788 size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
764 GFP_KERNEL); 789 GFP_KERNEL);
765 if (unlikely(!size_attrs)) { 790 if (unlikely(!size_attrs)) {
766 ret = -ENOMEM; 791 ret = -ENOMEM;
767 goto out; 792 goto out;
768 } 793 }
769 794
770 ios->obj.id = exofs_oi_objno(oi); 795 ios->numdevs = ios->comps->numdevs;
771 ios->cred = oi->i_cred;
772 796
773 ios->numdevs = ios->layout->s_numdevs; 797 for (i = 0; i < ti.max_devs; ++i) {
774 _calc_stripe_info(ios, size, &si);
775
776 for (i = 0; i < ios->layout->group_width; ++i) {
777 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 798 struct exofs_trunc_attr *size_attr = &size_attrs[i];
778 u64 obj_size; 799 u64 obj_size;
779 800
780 if (i < si.dev) 801 if (i < ti.first_group_dev)
781 obj_size = si.obj_offset + 802 obj_size = ti.prev_group_obj_off;
782 ios->layout->stripe_unit - si.unit_off; 803 else if (i >= ti.nex_group_dev)
783 else if (i == si.dev) 804 obj_size = ti.next_group_obj_off;
784 obj_size = si.obj_offset; 805 else if (i < ti.si.dev) /* dev within this group */
785 else /* i > si.dev */ 806 obj_size = ti.si.obj_offset +
786 obj_size = si.obj_offset - si.unit_off; 807 ios->layout->stripe_unit - ti.si.unit_off;
808 else if (i == ti.si.dev)
809 obj_size = ti.si.obj_offset;
810 else /* i > ti.dev */
811 obj_size = ti.si.obj_offset - ti.si.unit_off;
787 812
788 size_attr->newsize = cpu_to_be64(obj_size); 813 size_attr->newsize = cpu_to_be64(obj_size);
789 size_attr->attr = g_attr_logical_length; 814 size_attr->attr = g_attr_logical_length;
790 size_attr->attr.val_ptr = &size_attr->newsize; 815 size_attr->attr.val_ptr = &size_attr->newsize;
791 816
817 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
818 _LLU(comps->comps->obj.id), _LLU(obj_size), i);
792 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 819 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
793 &size_attr->attr); 820 &size_attr->attr);
794 if (unlikely(ret)) 821 if (unlikely(ret))
795 goto out; 822 goto out;
796 } 823 }
797 ret = exofs_io_execute(ios); 824 ret = ore_io_execute(ios);
798 825
799out: 826out:
800 kfree(size_attrs); 827 kfree(size_attrs);
801 exofs_put_io_state(ios); 828 ore_put_io_state(ios);
802 return ret; 829 return ret;
803} 830}
831EXPORT_SYMBOL(ore_truncate);
832
833const struct osd_attr g_attr_logical_length = ATTR_DEF(
834 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
835EXPORT_SYMBOL(g_attr_logical_length);
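ore_truncate() is the one path that must touch every device in the layout, not just those an I/O lands on, because each device's object needs its own logical length. The selection above, restated as a table with a worked instance; the layout numbers are the same illustrative ones used after _calc_stripe_info() above:

    /* Per-device size selection in ore_truncate():
     *
     *   i <  ti.first_group_dev        -> ti.prev_group_obj_off
     *   i >= ti.nex_group_dev          -> ti.next_group_obj_off
     *   i <  ti.si.dev (same group)    -> obj_offset + stripe_unit - unit_off
     *   i == ti.si.dev                 -> obj_offset
     *   i >  ti.si.dev                 -> obj_offset - unit_off
     *
     * Worked instance inside the target group (stripe_unit = 64K,
     * group_width = 4, mirrors_p1 = 1, size = 600K, which gives
     * si.dev = 5, obj_offset = 24K, unit_off = 24K):
     *   dev 4 -> 24K + 64K - 24K = 64K   (full stripe unit kept)
     *   dev 5 -> 24K                     (partial unit)
     *   dev 6 -> 24K - 24K = 0
     *   dev 7 -> 0
     * Devices outside the group get the prev/next_group_obj_off values
     * from _calc_trunk_info(); _truncate_mirrors() then applies each size
     * to the device and all of its mirrors via OSD_ATTR_OI_LOGICAL_LENGTH.
     */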
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
deleted file mode 100644
index c52e9888b8ab..000000000000
--- a/fs/exofs/pnfs.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if ! defined(__PNFS_OSD_XDR_H__)
19
20enum pnfs_iomode {
21 IOMODE_READ = 1,
22 IOMODE_RW = 2,
23 IOMODE_ANY = 3,
24};
25
26/* Layout Structure */
27enum pnfs_osd_raid_algorithm4 {
28 PNFS_OSD_RAID_0 = 1,
29 PNFS_OSD_RAID_4 = 2,
30 PNFS_OSD_RAID_5 = 3,
31 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
32};
33
34struct pnfs_osd_data_map {
35 u32 odm_num_comps;
36 u64 odm_stripe_unit;
37 u32 odm_group_width;
38 u32 odm_group_depth;
39 u32 odm_mirror_cnt;
40 u32 odm_raid_algorithm;
41};
42
43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
44
45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index c57beddcc217..274894053b02 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -40,6 +40,8 @@
40 40
41#include "exofs.h" 41#include "exofs.h"
42 42
43#define EXOFS_DBGMSG2(M...) do {} while (0)
44
43/****************************************************************************** 45/******************************************************************************
44 * MOUNT OPTIONS 46 * MOUNT OPTIONS
45 *****************************************************************************/ 47 *****************************************************************************/
@@ -208,10 +210,48 @@ static void destroy_inodecache(void)
208} 210}
209 211
210/****************************************************************************** 212/******************************************************************************
211 * SUPERBLOCK FUNCTIONS 213 * Some osd helpers
212 *****************************************************************************/ 214 *****************************************************************************/
213static const struct super_operations exofs_sops; 215void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
214static const struct export_operations exofs_export_ops; 216{
217 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
218}
219
220static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
221 u64 offset, void *p, unsigned length)
222{
223 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
224/* struct osd_sense_info osi = {.key = 0};*/
225 int ret;
226
227 if (unlikely(!or)) {
228 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
229 return -ENOMEM;
230 }
231 ret = osd_req_read_kern(or, obj, offset, p, length);
232 if (unlikely(ret)) {
233 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
234 goto out;
235 }
236
237 ret = osd_finalize_request(or, 0, cred, NULL);
238 if (unlikely(ret)) {
239 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
240 goto out;
241 }
242
243 ret = osd_execute_request(or);
244 if (unlikely(ret))
245 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
246 /* osd_req_decode_sense(or, ret); */
247
248out:
249 osd_end_request(or);
250 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
251 "length=0x%llx dev=%p ret=>%d\n",
252 _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
253 return ret;
254}
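exofs_read_kern() moves here from the old ios.c essentially unchanged: it is the small synchronous bootstrap helper used before any ORE state exists. Condensed from exofs_fill_super() further down, the call pattern is:

    /* Condensed from exofs_fill_super() below: read the on-disk superblock
     * with a locally built component; no ore_io_state is needed yet.
     */
    struct exofs_fscb fscb;
    struct ore_comp comp;
    int ret;

    comp.obj.partition = sbi->one_comp.obj.partition;
    comp.obj.id = EXOFS_SUPER_ID;
    exofs_make_credential(comp.cred, &comp.obj);

    ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));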
215 255
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF( 256static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA, 257 EXOFS_APAGE_SB_DATA,
@@ -223,21 +263,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
223 struct osd_attr attrs[] = { 263 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats, 264 [0] = g_attr_sb_stats,
225 }; 265 };
226 struct exofs_io_state *ios; 266 struct ore_io_state *ios;
227 int ret; 267 int ret;
228 268
229 ret = exofs_get_io_state(&sbi->layout, &ios); 269 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
230 if (unlikely(ret)) { 270 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
232 return ret; 272 return ret;
233 } 273 }
234 274
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs; 275 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs); 276 ios->in_attr_len = ARRAY_SIZE(attrs);
239 277
240 ret = exofs_sbi_read(ios); 278 ret = ore_read(ios);
241 if (unlikely(ret)) { 279 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret); 280 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out; 281 goto out;
@@ -264,13 +302,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
264 } 302 }
265 303
266out: 304out:
267 exofs_put_io_state(ios); 305 ore_put_io_state(ios);
268 return ret; 306 return ret;
269} 307}
270 308
271static void stats_done(struct exofs_io_state *ios, void *p) 309static void stats_done(struct ore_io_state *ios, void *p)
272{ 310{
273 exofs_put_io_state(ios); 311 ore_put_io_state(ios);
274 /* Good, thanks. Nothing to do anymore */ 312 /* Good, thanks. Nothing to do anymore */
275} 313}
276 314
@@ -280,12 +318,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
280 struct osd_attr attrs[] = { 318 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats, 319 [0] = g_attr_sb_stats,
282 }; 320 };
283 struct exofs_io_state *ios; 321 struct ore_io_state *ios;
284 int ret; 322 int ret;
285 323
286 ret = exofs_get_io_state(&sbi->layout, &ios); 324 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
287 if (unlikely(ret)) { 325 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
289 return ret; 327 return ret;
290 } 328 }
291 329
@@ -293,21 +331,27 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); 331 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess; 332 attrs[0].val_ptr = &sbi->s_ess;
295 333
296 ios->cred = sbi->s_cred; 334
297 ios->done = stats_done; 335 ios->done = stats_done;
298 ios->private = sbi; 336 ios->private = sbi;
299 ios->out_attr = attrs; 337 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs); 338 ios->out_attr_len = ARRAY_SIZE(attrs);
301 339
302 ret = exofs_sbi_write(ios); 340 ret = ore_write(ios);
303 if (unlikely(ret)) { 341 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 342 EXOFS_ERR("%s: ore_write failed.\n", __func__);
305 exofs_put_io_state(ios); 343 ore_put_io_state(ios);
306 } 344 }
307 345
308 return ret; 346 return ret;
309} 347}
310 348
349/******************************************************************************
350 * SUPERBLOCK FUNCTIONS
351 *****************************************************************************/
352static const struct super_operations exofs_sops;
353static const struct export_operations exofs_export_ops;
354
311/* 355/*
312 * Write the superblock to the OSD 356 * Write the superblock to the OSD
313 */ 357 */
@@ -315,7 +359,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
315{ 359{
316 struct exofs_sb_info *sbi; 360 struct exofs_sb_info *sbi;
317 struct exofs_fscb *fscb; 361 struct exofs_fscb *fscb;
318 struct exofs_io_state *ios; 362 struct ore_comp one_comp;
363 struct ore_components comps;
364 struct ore_io_state *ios;
319 int ret = -ENOMEM; 365 int ret = -ENOMEM;
320 366
321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); 367 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
@@ -331,7 +377,10 @@ int exofs_sync_fs(struct super_block *sb, int wait)
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All 377 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above. 378 * the writeable info is set in exofs_sbi_write_stats() above.
333 */ 379 */
334 ret = exofs_get_io_state(&sbi->layout, &ios); 380
381 exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
382
383 ret = ore_get_io_state(&sbi->layout, &comps, &ios);
335 if (unlikely(ret)) 384 if (unlikely(ret))
336 goto out; 385 goto out;
337 386
@@ -345,14 +394,12 @@ int exofs_sync_fs(struct super_block *sb, int wait)
345 fscb->s_newfs = 0; 394 fscb->s_newfs = 0;
346 fscb->s_version = EXOFS_FSCB_VER; 395 fscb->s_version = EXOFS_FSCB_VER;
347 396
348 ios->obj.id = EXOFS_SUPER_ID;
349 ios->offset = 0; 397 ios->offset = 0;
350 ios->kern_buff = fscb; 398 ios->kern_buff = fscb;
351 ios->cred = sbi->s_cred;
352 399
353 ret = exofs_sbi_write(ios); 400 ret = ore_write(ios);
354 if (unlikely(ret)) 401 if (unlikely(ret))
355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 402 EXOFS_ERR("%s: ore_write failed.\n", __func__);
356 else 403 else
357 sb->s_dirt = 0; 404 sb->s_dirt = 0;
358 405
@@ -360,7 +407,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
360 unlock_super(sb); 407 unlock_super(sb);
361out: 408out:
362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 409 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
363 exofs_put_io_state(ios); 410 ore_put_io_state(ios);
364 kfree(fscb); 411 kfree(fscb);
365 return ret; 412 return ret;
366} 413}
@@ -384,15 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
384 431
385void exofs_free_sbi(struct exofs_sb_info *sbi) 432void exofs_free_sbi(struct exofs_sb_info *sbi)
386{ 433{
387 while (sbi->layout.s_numdevs) { 434 while (sbi->comps.numdevs) {
388 int i = --sbi->layout.s_numdevs; 435 int i = --sbi->comps.numdevs;
389 struct osd_dev *od = sbi->layout.s_ods[i]; 436 struct osd_dev *od = sbi->comps.ods[i];
390 437
391 if (od) { 438 if (od) {
392 sbi->layout.s_ods[i] = NULL; 439 sbi->comps.ods[i] = NULL;
393 osduld_put_device(od); 440 osduld_put_device(od);
394 } 441 }
395 } 442 }
443 if (sbi->comps.ods != sbi->_min_one_dev)
444 kfree(sbi->comps.ods);
396 kfree(sbi); 445 kfree(sbi);
397} 446}
398 447
@@ -419,8 +468,8 @@ static void exofs_put_super(struct super_block *sb)
419 msecs_to_jiffies(100)); 468 msecs_to_jiffies(100));
420 } 469 }
421 470
422 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 471 _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
423 sbi->layout.s_pid); 472 sbi->one_comp.obj.partition);
424 473
425 bdi_destroy(&sbi->bdi); 474 bdi_destroy(&sbi->bdi);
426 exofs_free_sbi(sbi); 475 exofs_free_sbi(sbi);
@@ -501,10 +550,19 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
501 return -EINVAL; 550 return -EINVAL;
502 } 551 }
503 552
553 EXOFS_DBGMSG("exofs: layout: "
554 "num_comps=%u stripe_unit=0x%x group_width=%u "
555 "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
556 numdevs,
557 sbi->layout.stripe_unit,
558 sbi->layout.group_width,
559 _LLU(sbi->layout.group_depth),
560 sbi->layout.mirrors_p1,
561 sbi->data_map.odm_raid_algorithm);
504 return 0; 562 return 0;
505} 563}
506 564
507static unsigned __ra_pages(struct exofs_layout *layout) 565static unsigned __ra_pages(struct ore_layout *layout)
508{ 566{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */ 567 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit / 568 unsigned ra_pages = layout->group_width * layout->stripe_unit /
@@ -547,13 +605,11 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
547 return !(odi->systemid_len || odi->osdname_len); 605 return !(odi->systemid_len || odi->osdname_len);
548} 606}
549 607
550static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, 608static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
609 struct osd_dev *fscb_od,
551 unsigned table_count) 610 unsigned table_count)
552{ 611{
553 struct exofs_sb_info *sbi = *psbi; 612 struct ore_comp comp;
554 struct osd_dev *fscb_od;
555 struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
556 .id = EXOFS_DEVTABLE_ID};
557 struct exofs_device_table *dt; 613 struct exofs_device_table *dt;
558 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 614 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
559 sizeof(*dt); 615 sizeof(*dt);
@@ -567,10 +623,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
567 return -ENOMEM; 623 return -ENOMEM;
568 } 624 }
569 625
570 fscb_od = sbi->layout.s_ods[0]; 626 sbi->comps.numdevs = 0;
571 sbi->layout.s_ods[0] = NULL; 627
572 sbi->layout.s_numdevs = 0; 628 comp.obj.partition = sbi->one_comp.obj.partition;
573 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); 629 comp.obj.id = EXOFS_DEVTABLE_ID;
630 exofs_make_credential(comp.cred, &comp.obj);
631
632 ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
633 table_bytes);
574 if (unlikely(ret)) { 634 if (unlikely(ret)) {
575 EXOFS_ERR("ERROR: reading device table\n"); 635 EXOFS_ERR("ERROR: reading device table\n");
576 goto out; 636 goto out;
@@ -588,16 +648,18 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
588 goto out; 648 goto out;
589 649
590 if (likely(numdevs > 1)) { 650 if (likely(numdevs > 1)) {
591 unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); 651 unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
592 652
593 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); 653 /* Twice-bigger table: see exofs_init_comps() and the
594 if (unlikely(!sbi)) { 654 * comment below
655 */
656 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
657 if (unlikely(!sbi->comps.ods)) {
658 EXOFS_ERR("ERROR: failed allocating Device array[%d]\n",
659 numdevs);
595 ret = -ENOMEM; 660 ret = -ENOMEM;
596 goto out; 661 goto out;
597 } 662 }
598 memset(&sbi->layout.s_ods[1], 0,
599 size - sizeof(sbi->layout.s_ods[0]));
600 *psbi = sbi;
601 } 663 }
602 664
603 for (i = 0; i < numdevs; i++) { 665 for (i = 0; i < numdevs; i++) {
@@ -619,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
619 * line. We always keep them in device-table order. 681 * line. We always keep them in device-table order.
620 */ 682 */
621 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 683 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
622 sbi->layout.s_ods[i] = fscb_od; 684 sbi->comps.ods[i] = fscb_od;
623 ++sbi->layout.s_numdevs; 685 ++sbi->comps.numdevs;
624 fscb_od = NULL; 686 fscb_od = NULL;
625 continue; 687 continue;
626 } 688 }
@@ -633,13 +695,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
633 goto out; 695 goto out;
634 } 696 }
635 697
636 sbi->layout.s_ods[i] = od; 698 sbi->comps.ods[i] = od;
637 ++sbi->layout.s_numdevs; 699 ++sbi->comps.numdevs;
638 700
639 /* Read the fscb of the other devices to make sure the FS 701 /* Read the fscb of the other devices to make sure the FS
640 * partition is there. 702 * partition is there.
641 */ 703 */
642 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, 704 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
643 sizeof(fscb)); 705 sizeof(fscb));
644 if (unlikely(ret)) { 706 if (unlikely(ret)) {
645 EXOFS_ERR("ERROR: Malformed participating device " 707 EXOFS_ERR("ERROR: Malformed participating device "
@@ -656,13 +718,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
656 718
657out: 719out:
658 kfree(dt); 720 kfree(dt);
659 if (unlikely(!ret && fscb_od)) { 721 if (likely(!ret)) {
660 EXOFS_ERR( 722 unsigned numdevs = sbi->comps.numdevs;
661 "ERROR: Bad device-table container device not present\n");
662 osduld_put_device(fscb_od);
663 ret = -EINVAL;
664 }
665 723
724 if (unlikely(fscb_od)) {
725 EXOFS_ERR("ERROR: Bad device-table container device not present\n");
726 osduld_put_device(fscb_od);
727 return -EINVAL;
728 }
729 /* exofs round-robins the device-table view according to inode
730 * number. We hold a twice-bigger table, so an inode can point
731 * to any device and still have a sequential view of the table
732 * starting at that device. See exofs_init_comps()
733 */
734 for (i = 0; i < numdevs - 1; ++i)
735 sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
736 }
666 return ret; 737 return ret;
667} 738}
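An illustration of the table built above, with numdevs = 4 and devices A B C D:

    /* ods[]: [0]=A [1]=B [2]=C [3]=D [4]=A [5]=B [6]=C   (2*numdevs - 1)
     *
     * An inode whose round-robin start works out to device 2 can use
     * &ods[2] directly as a numdevs-long window (C D A B), so the I/O
     * path needs no modulo arithmetic; see exofs_init_comps(), sketched
     * below at the fill_super hunk.
     */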
668 739
@@ -676,7 +747,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
676 struct exofs_sb_info *sbi; /*extended info */ 747 struct exofs_sb_info *sbi; /*extended info */
677 struct osd_dev *od; /* Master device */ 748 struct osd_dev *od; /* Master device */
678 struct exofs_fscb fscb; /*on-disk superblock info */ 749 struct exofs_fscb fscb; /*on-disk superblock info */
679 struct osd_obj_id obj; 750 struct ore_comp comp;
680 unsigned table_count; 751 unsigned table_count;
681 int ret; 752 int ret;
682 753
@@ -684,10 +755,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
684 if (!sbi) 755 if (!sbi)
685 return -ENOMEM; 756 return -ENOMEM;
686 757
687 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
688 if (ret)
689 goto free_bdi;
690
691 /* use mount options to fill superblock */ 758 /* use mount options to fill superblock */
692 if (opts->is_osdname) { 759 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0}; 760 struct osd_dev_info odi = {.systemid_len = 0};
@@ -695,6 +762,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
695 odi.osdname_len = strlen(opts->dev_name); 762 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name; 763 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi); 764 od = osduld_info_lookup(&odi);
765 kfree(opts->dev_name);
766 opts->dev_name = NULL;
698 } else { 767 } else {
699 od = osduld_path_lookup(opts->dev_name); 768 od = osduld_path_lookup(opts->dev_name);
700 } 769 }
@@ -709,11 +778,16 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
709 sbi->layout.group_width = 1; 778 sbi->layout.group_width = 1;
710 sbi->layout.group_depth = -1; 779 sbi->layout.group_depth = -1;
711 sbi->layout.group_count = 1; 780 sbi->layout.group_count = 1;
712 sbi->layout.s_ods[0] = od;
713 sbi->layout.s_numdevs = 1;
714 sbi->layout.s_pid = opts->pid;
715 sbi->s_timeout = opts->timeout; 781 sbi->s_timeout = opts->timeout;
716 782
783 sbi->one_comp.obj.partition = opts->pid;
784 sbi->one_comp.obj.id = 0;
785 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
786 sbi->comps.numdevs = 1;
787 sbi->comps.single_comp = EC_SINGLE_COMP;
788 sbi->comps.comps = &sbi->one_comp;
789 sbi->comps.ods = sbi->_min_one_dev;
790
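Both inode.c and super.c now call exofs_init_comps(), which lives in exofs.h and is therefore absent from this diff. A plausible reconstruction from its call sites, the removed exofs_layout_od_id() round-robin, and the doubled device table above; treat every detail as an assumption, not the committed code:

    /* Hypothetical reconstruction, NOT part of this diff. */
    static inline void exofs_init_comps(struct ore_components *comps,
                                        struct ore_comp *one_comp,
                                        struct exofs_sb_info *sbi, osd_id oid)
    {
            unsigned dev_mod = (unsigned)oid;

            one_comp->obj.partition = sbi->one_comp.obj.partition;
            one_comp->obj.id = oid;
            exofs_make_credential(one_comp->cred, &one_comp->obj);

            comps->numdevs = sbi->comps.numdevs;
            comps->single_comp = EC_SINGLE_COMP; /* one comp for all devs */
            comps->comps = one_comp;
            /* per-inode device window into the doubled table */
            comps->ods = &sbi->comps.ods[(dev_mod * sbi->layout.mirrors_p1) %
                                                    sbi->comps.numdevs];
    }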
717 /* fill in some other data by hand */ 791 /* fill in some other data by hand */
718 memset(sb->s_id, 0, sizeof(sb->s_id)); 792 memset(sb->s_id, 0, sizeof(sb->s_id));
719 strcpy(sb->s_id, "exofs"); 793 strcpy(sb->s_id, "exofs");
@@ -724,11 +798,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
724 sb->s_bdev = NULL; 798 sb->s_bdev = NULL;
725 sb->s_dev = 0; 799 sb->s_dev = 0;
726 800
727 obj.partition = sbi->layout.s_pid; 801 comp.obj.partition = sbi->one_comp.obj.partition;
728 obj.id = EXOFS_SUPER_ID; 802 comp.obj.id = EXOFS_SUPER_ID;
729 exofs_make_credential(sbi->s_cred, &obj); 803 exofs_make_credential(comp.cred, &comp.obj);
730 804
731 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); 805 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
732 if (unlikely(ret)) 806 if (unlikely(ret))
733 goto free_sbi; 807 goto free_sbi;
734 808
@@ -757,9 +831,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
757 831
758 table_count = le64_to_cpu(fscb.s_dev_table_count); 832 table_count = le64_to_cpu(fscb.s_dev_table_count);
759 if (table_count) { 833 if (table_count) {
760 ret = exofs_read_lookup_dev_table(&sbi, table_count); 834 ret = exofs_read_lookup_dev_table(sbi, od, table_count);
761 if (unlikely(ret)) 835 if (unlikely(ret))
762 goto free_sbi; 836 goto free_sbi;
837 } else {
838 sbi->comps.ods[0] = od;
763 } 839 }
764 840
765 __sbi_read_stats(sbi); 841 __sbi_read_stats(sbi);
@@ -793,20 +869,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
793 goto free_sbi; 869 goto free_sbi;
794 } 870 }
795 871
796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 872 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
797 sbi->layout.s_pid); 873 if (ret) {
798 if (opts->is_osdname) 874 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
799 kfree(opts->dev_name); 875 goto free_sbi;
876 }
877
878 _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
879 sbi->one_comp.obj.partition);
800 return 0; 880 return 0;
801 881
802free_sbi: 882free_sbi:
803 bdi_destroy(&sbi->bdi);
804free_bdi:
805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 883 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
806 opts->dev_name, sbi->layout.s_pid, ret); 884 opts->dev_name, sbi->one_comp.obj.partition, ret);
807 exofs_free_sbi(sbi); 885 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
810 return ret; 886 return ret;
811} 887}
812 888
@@ -837,7 +913,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
837{ 913{
838 struct super_block *sb = dentry->d_sb; 914 struct super_block *sb = dentry->d_sb;
839 struct exofs_sb_info *sbi = sb->s_fs_info; 915 struct exofs_sb_info *sbi = sb->s_fs_info;
840 struct exofs_io_state *ios; 916 struct ore_io_state *ios;
841 struct osd_attr attrs[] = { 917 struct osd_attr attrs[] = {
842 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, 918 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
843 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), 919 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -846,21 +922,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
846 }; 922 };
847 uint64_t capacity = ULLONG_MAX; 923 uint64_t capacity = ULLONG_MAX;
848 uint64_t used = ULLONG_MAX; 924 uint64_t used = ULLONG_MAX;
849 uint8_t cred_a[OSD_CAP_LEN];
850 int ret; 925 int ret;
851 926
852 ret = exofs_get_io_state(&sbi->layout, &ios); 927 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
853 if (ret) { 928 if (ret) {
854 EXOFS_DBGMSG("exofs_get_io_state failed.\n"); 929 EXOFS_DBGMSG("ore_get_io_state failed.\n");
855 return ret; 930 return ret;
856 } 931 }
857 932
858 exofs_make_credential(cred_a, &ios->obj);
859 ios->cred = sbi->s_cred;
860 ios->in_attr = attrs; 933 ios->in_attr = attrs;
861 ios->in_attr_len = ARRAY_SIZE(attrs); 934 ios->in_attr_len = ARRAY_SIZE(attrs);
862 935
863 ret = exofs_sbi_read(ios); 936 ret = ore_read(ios);
864 if (unlikely(ret)) 937 if (unlikely(ret))
865 goto out; 938 goto out;
866 939
@@ -889,7 +962,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
889 buf->f_namelen = EXOFS_NAME_LEN; 962 buf->f_namelen = EXOFS_NAME_LEN;
890 963
891out: 964out:
892 exofs_put_io_state(ios); 965 ore_put_io_state(ios);
893 return ret; 966 return ret;
894} 967}
895 968
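
Condensed from the statfs hunk above: the ORE conversion replaces the per-sb credential juggling with an ore_io_state taken against the layout and the component table that exofs_fill_super() now wires up (sbi->one_comp / sbi->comps). A sketch of the read pattern, assuming only the ore_* calls visible in this diff; error handling abbreviated:

    struct ore_io_state *ios;
    int ret;

    ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
    if (ret)
            return ret;

    ios->in_attr = attrs;               /* attributes to read back */
    ios->in_attr_len = ARRAY_SIZE(attrs);

    ret = ore_read(ios);                /* synchronous attribute read */

    ore_put_io_state(ios);              /* released on success and error alike */
    return ret;
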
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 52c053763942..35d6a3cfd9ff 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 case ACL_TYPE_ACCESS: 194 case ACL_TYPE_ACCESS:
195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
196 if (acl) { 196 if (acl) {
197 mode_t mode = inode->i_mode; 197 error = posix_acl_equiv_mode(acl, &inode->i_mode);
198 error = posix_acl_equiv_mode(acl, &mode);
199 if (error < 0) 198 if (error < 0)
200 return error; 199 return error;
201 else { 200 else {
202 inode->i_mode = mode;
203 inode->i_ctime = CURRENT_TIME_SEC; 201 inode->i_ctime = CURRENT_TIME_SEC;
204 mark_inode_dirty(inode); 202 mark_inode_dirty(inode);
205 if (error == 0) 203 if (error == 0)
@@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
253 inode->i_mode &= ~current_umask(); 251 inode->i_mode &= ~current_umask();
254 } 252 }
255 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 253 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
256 mode_t mode = inode->i_mode;
257 if (S_ISDIR(inode->i_mode)) { 254 if (S_ISDIR(inode->i_mode)) {
258 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); 255 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
259 if (error) 256 if (error)
260 goto cleanup; 257 goto cleanup;
261 } 258 }
262 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 259 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
263 if (error < 0) 260 if (error < 0)
264 return error; 261 return error;
265 inode->i_mode = mode;
266 if (error > 0) { 262 if (error > 0) {
267 /* This is an extended ACL */ 263 /* This is an extended ACL */
268 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); 264 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
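
All of the acl.c conversions in this series (ext2 here, ext3 and ext4 below) reduce to the same shape: posix_acl_equiv_mode() and posix_acl_create() now write the computed mode straight into inode->i_mode, so the local mode_t temporary and its copy-back disappear. The ACL_TYPE_ACCESS branch, condensed from the hunk above:

    error = posix_acl_equiv_mode(acl, &inode->i_mode);
    if (error < 0)
            return error;
    inode->i_ctime = CURRENT_TIME_SEC;
    mark_inode_dirty(inode);
    /* error == 0: the mode bits fully represent the ACL */
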
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 5c0a6a4fb052..503bfb0ed79b 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -61,7 +61,6 @@ extern int ext2_init_acl (struct inode *, struct inode *);
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_get_acl NULL 63#define ext2_get_acl NULL
64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 64#define ext2_set_acl NULL
66 65
67static inline int 66static inline int
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 529970617a21..d27b71f1d183 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
161 161
162 if (name == NULL) 162 if (name == NULL)
163 return -EINVAL; 163 return -EINVAL;
164 name_len = strlen(name);
165 if (name_len > 255)
166 return -ERANGE;
167
164 down_read(&EXT2_I(inode)->xattr_sem); 168 down_read(&EXT2_I(inode)->xattr_sem);
165 error = -ENODATA; 169 error = -ENODATA;
166 if (!EXT2_I(inode)->i_file_acl) 170 if (!EXT2_I(inode)->i_file_acl)
@@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
181 error = -EIO; 185 error = -EIO;
182 goto cleanup; 186 goto cleanup;
183 } 187 }
184 /* find named attribute */
185 name_len = strlen(name);
186 188
187 error = -ERANGE; 189 /* find named attribute */
188 if (name_len > 255)
189 goto cleanup;
190 entry = FIRST_ENTRY(bh); 190 entry = FIRST_ENTRY(bh);
191 while (!IS_LAST_ENTRY(entry)) { 191 while (!IS_LAST_ENTRY(entry)) {
192 struct ext2_xattr_entry *next = 192 struct ext2_xattr_entry *next =
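
The xattr.c hunk above is a fail-fast move: the 255-byte name-length check now runs before xattr_sem is taken, so a bad name returns -ERANGE without any locking or block I/O. The resulting entry sequence, condensed:

    if (name == NULL)
            return -EINVAL;
    name_len = strlen(name);
    if (name_len > 255)
            return -ERANGE;

    down_read(&EXT2_I(inode)->xattr_sem);
    /* ... i_file_acl lookup and entry scan, unchanged ... */
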
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 6c29bf0df04a..3091f62e55b6 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
199 case ACL_TYPE_ACCESS: 199 case ACL_TYPE_ACCESS:
200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
201 if (acl) { 201 if (acl) {
202 mode_t mode = inode->i_mode; 202 error = posix_acl_equiv_mode(acl, &inode->i_mode);
203 error = posix_acl_equiv_mode(acl, &mode);
204 if (error < 0) 203 if (error < 0)
205 return error; 204 return error;
206 else { 205 else {
207 inode->i_mode = mode;
208 inode->i_ctime = CURRENT_TIME_SEC; 206 inode->i_ctime = CURRENT_TIME_SEC;
209 ext3_mark_inode_dirty(handle, inode); 207 ext3_mark_inode_dirty(handle, inode);
210 if (error == 0) 208 if (error == 0)
@@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
261 inode->i_mode &= ~current_umask(); 259 inode->i_mode &= ~current_umask();
262 } 260 }
263 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
264 mode_t mode = inode->i_mode;
265
266 if (S_ISDIR(inode->i_mode)) { 262 if (S_ISDIR(inode->i_mode)) {
267 error = ext3_set_acl(handle, inode, 263 error = ext3_set_acl(handle, inode,
268 ACL_TYPE_DEFAULT, acl); 264 ACL_TYPE_DEFAULT, acl);
269 if (error) 265 if (error)
270 goto cleanup; 266 goto cleanup;
271 } 267 }
272 error = posix_acl_create(&acl, GFP_NOFS, &mode); 268 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
273 if (error < 0) 269 if (error < 0)
274 return error; 270 return error;
275 271
276 inode->i_mode = mode;
277 if (error > 0) { 272 if (error > 0) {
278 /* This is an extended ACL */ 273 /* This is an extended ACL */
279 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 274 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index fe52297e31ad..6386d76f44a7 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -21,6 +21,7 @@
21#include <linux/quotaops.h> 21#include <linux/quotaops.h>
22#include <linux/buffer_head.h> 22#include <linux/buffer_head.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <trace/events/ext3.h>
24 25
25/* 26/*
26 * balloc.c contains the blocks allocation and deallocation routines 27 * balloc.c contains the blocks allocation and deallocation routines
@@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
161 desc = ext3_get_group_desc(sb, block_group, NULL); 162 desc = ext3_get_group_desc(sb, block_group, NULL);
162 if (!desc) 163 if (!desc)
163 return NULL; 164 return NULL;
165 trace_ext3_read_block_bitmap(sb, block_group);
164 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); 166 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
165 bh = sb_getblk(sb, bitmap_blk); 167 bh = sb_getblk(sb, bitmap_blk);
166 if (unlikely(!bh)) { 168 if (unlikely(!bh)) {
@@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb,
351 struct rb_node * parent = NULL; 353 struct rb_node * parent = NULL;
352 struct ext3_reserve_window_node *this; 354 struct ext3_reserve_window_node *this;
353 355
356 trace_ext3_rsv_window_add(sb, rsv);
354 while (*p) 357 while (*p)
355 { 358 {
356 parent = *p; 359 parent = *p;
@@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode)
476 rsv = &block_i->rsv_window_node; 479 rsv = &block_i->rsv_window_node;
477 if (!rsv_is_empty(&rsv->rsv_window)) { 480 if (!rsv_is_empty(&rsv->rsv_window)) {
478 spin_lock(rsv_lock); 481 spin_lock(rsv_lock);
479 if (!rsv_is_empty(&rsv->rsv_window)) 482 if (!rsv_is_empty(&rsv->rsv_window)) {
483 trace_ext3_discard_reservation(inode, rsv);
480 rsv_window_remove(inode->i_sb, rsv); 484 rsv_window_remove(inode->i_sb, rsv);
485 }
481 spin_unlock(rsv_lock); 486 spin_unlock(rsv_lock);
482 } 487 }
483} 488}
@@ -683,14 +688,10 @@ error_return:
683void ext3_free_blocks(handle_t *handle, struct inode *inode, 688void ext3_free_blocks(handle_t *handle, struct inode *inode,
684 ext3_fsblk_t block, unsigned long count) 689 ext3_fsblk_t block, unsigned long count)
685{ 690{
686 struct super_block * sb; 691 struct super_block *sb = inode->i_sb;
687 unsigned long dquot_freed_blocks; 692 unsigned long dquot_freed_blocks;
688 693
689 sb = inode->i_sb; 694 trace_ext3_free_blocks(inode, block, count);
690 if (!sb) {
691 printk ("ext3_free_blocks: nonexistent device");
692 return;
693 }
694 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 695 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
695 if (dquot_freed_blocks) 696 if (dquot_freed_blocks)
696 dquot_free_block(inode, dquot_freed_blocks); 697 dquot_free_block(inode, dquot_freed_blocks);
@@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1136 else 1137 else
1137 start_block = grp_goal + group_first_block; 1138 start_block = grp_goal + group_first_block;
1138 1139
1140 trace_ext3_alloc_new_reservation(sb, start_block);
1139 size = my_rsv->rsv_goal_size; 1141 size = my_rsv->rsv_goal_size;
1140 1142
1141 if (!rsv_is_empty(&my_rsv->rsv_window)) { 1143 if (!rsv_is_empty(&my_rsv->rsv_window)) {
@@ -1230,8 +1232,11 @@ retry:
1230 * check if the first free block is within the 1232 * check if the first free block is within the
1231 * free space we just reserved 1233 * free space we just reserved
1232 */ 1234 */
1233 if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) 1235 if (start_block >= my_rsv->rsv_start &&
1236 start_block <= my_rsv->rsv_end) {
1237 trace_ext3_reserved(sb, start_block, my_rsv);
1234 return 0; /* success */ 1238 return 0; /* success */
1239 }
1235 /* 1240 /*
1236 * if the first free bit we found is out of the reservable space 1241 * if the first free bit we found is out of the reservable space
1237 * continue search for next reservable space, 1242 * continue search for next reservable space,
@@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1514 1519
1515 *errp = -ENOSPC; 1520 *errp = -ENOSPC;
1516 sb = inode->i_sb; 1521 sb = inode->i_sb;
1517 if (!sb) {
1518 printk("ext3_new_block: nonexistent device");
1519 return 0;
1520 }
1521 1522
1522 /* 1523 /*
1523 * Check quota for allocation of this block. 1524 * Check quota for allocation of this block.
@@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1528 return 0; 1529 return 0;
1529 } 1530 }
1530 1531
1532 trace_ext3_request_blocks(inode, goal, num);
1533
1531 sbi = EXT3_SB(sb); 1534 sbi = EXT3_SB(sb);
1532 es = EXT3_SB(sb)->s_es; 1535 es = sbi->s_es;
1533 ext3_debug("goal=%lu.\n", goal); 1536 ext3_debug("goal=%lu.\n", goal);
1534 /* 1537 /*
1535 * Allocate a block from reservation only when 1538 * Allocate a block from reservation only when
@@ -1742,6 +1745,10 @@ allocated:
1742 brelse(bitmap_bh); 1745 brelse(bitmap_bh);
1743 dquot_free_block(inode, *count-num); 1746 dquot_free_block(inode, *count-num);
1744 *count = num; 1747 *count = num;
1748
1749 trace_ext3_allocate_blocks(inode, goal, num,
1750 (unsigned long long)ret_block);
1751
1745 return ret_block; 1752 return ret_block;
1746 1753
1747io_error: 1754io_error:
@@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1996 if ((next - start) < minblocks) 2003 if ((next - start) < minblocks)
1997 goto free_extent; 2004 goto free_extent;
1998 2005
2006 trace_ext3_discard_blocks(sb, discard_block, next - start);
1999 /* Send the TRIM command down to the device */ 2007 /* Send the TRIM command down to the device */
2000 err = sb_issue_discard(sb, discard_block, next - start, 2008 err = sb_issue_discard(sb, discard_block, next - start,
2001 GFP_NOFS, 0); 2009 GFP_NOFS, 0);
@@ -2100,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2100 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) 2108 if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2101 return -EINVAL; 2109 return -EINVAL;
2102 if (start >= max_blks) 2110 if (start >= max_blks)
2103 goto out; 2111 return -EINVAL;
2104 if (start + len > max_blks) 2112 if (start + len > max_blks)
2105 len = max_blks - start; 2113 len = max_blks - start;
2106 2114
@@ -2148,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2148 2156
2149 if (ret >= 0) 2157 if (ret >= 0)
2150 ret = 0; 2158 ret = 0;
2151
2152out:
2153 range->len = trimmed * sb->s_blocksize; 2159 range->len = trimmed * sb->s_blocksize;
2154 2160
2155 return ret; 2161 return ret;
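
The trace_ext3_*() calls sprinkled through balloc.c cost next to nothing unless the tracepoint is enabled at run time. Their definitions live in the new include/trace/events/ext3.h, which this listing does not show; the sketch below is a hypothetical layout for one of them using the standard TRACE_EVENT() macro, with illustrative field names only:

    TRACE_EVENT(ext3_free_blocks,
            TP_PROTO(struct inode *inode, unsigned long block,
                     unsigned long count),
            TP_ARGS(inode, block, count),
            TP_STRUCT__entry(
                    __field(dev_t,          dev)
                    __field(ino_t,          ino)
                    __field(unsigned long,  block)
                    __field(unsigned long,  count)
            ),
            TP_fast_assign(
                    __entry->dev    = inode->i_sb->s_dev;
                    __entry->ino    = inode->i_ino;
                    __entry->block  = block;
                    __entry->count  = count;
            ),
            TP_printk("dev %d,%d ino %lu block %lu count %lu",
                      MAJOR(__entry->dev), MINOR(__entry->dev),
                      (unsigned long) __entry->ino,
                      __entry->block, __entry->count)
    );
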
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 2be5b99097f1..724df69847dc 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = {
71}; 71};
72 72
73const struct inode_operations ext3_file_inode_operations = { 73const struct inode_operations ext3_file_inode_operations = {
74 .truncate = ext3_truncate,
75 .setattr = ext3_setattr, 74 .setattr = ext3_setattr,
76#ifdef CONFIG_EXT3_FS_XATTR 75#ifdef CONFIG_EXT3_FS_XATTR
77 .setxattr = generic_setxattr, 76 .setxattr = generic_setxattr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 0bcf63adb80a..d494c554c6e6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -30,6 +30,7 @@
30#include <linux/jbd.h> 30#include <linux/jbd.h>
31#include <linux/ext3_fs.h> 31#include <linux/ext3_fs.h>
32#include <linux/ext3_jbd.h> 32#include <linux/ext3_jbd.h>
33#include <trace/events/ext3.h>
33 34
34/* 35/*
35 * akpm: A new design for ext3_sync_file(). 36 * akpm: A new design for ext3_sync_file().
@@ -51,12 +52,14 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
51 int ret, needs_barrier = 0; 52 int ret, needs_barrier = 0;
52 tid_t commit_tid; 53 tid_t commit_tid;
53 54
55 trace_ext3_sync_file_enter(file, datasync);
56
54 if (inode->i_sb->s_flags & MS_RDONLY) 57 if (inode->i_sb->s_flags & MS_RDONLY)
55 return 0; 58 return 0;
56 59
57 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 60 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
58 if (ret) 61 if (ret)
59 return ret; 62 goto out;
60 63
61 /* 64 /*
62 * Taking the mutex here just to keep consistent with how fsync was 65 * Taking the mutex here just to keep consistent with how fsync was
@@ -83,7 +86,8 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
83 */ 86 */
84 if (ext3_should_journal_data(inode)) { 87 if (ext3_should_journal_data(inode)) {
85 mutex_unlock(&inode->i_mutex); 88 mutex_unlock(&inode->i_mutex);
86 return ext3_force_commit(inode->i_sb); 89 ret = ext3_force_commit(inode->i_sb);
90 goto out;
87 } 91 }
88 92
89 if (datasync) 93 if (datasync)
@@ -104,6 +108,9 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
104 */ 108 */
105 if (needs_barrier) 109 if (needs_barrier)
106 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 110 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
111
107 mutex_unlock(&inode->i_mutex); 112 mutex_unlock(&inode->i_mutex);
113out:
114 trace_ext3_sync_file_exit(inode, ret);
108 return ret; 115 return ret;
109} 116}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index bfc2dc43681d..bf09cbf938cc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -23,6 +23,7 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/random.h> 24#include <linux/random.h>
25#include <linux/bitops.h> 25#include <linux/bitops.h>
26#include <trace/events/ext3.h>
26 27
27#include <asm/byteorder.h> 28#include <asm/byteorder.h>
28 29
@@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
118 119
119 ino = inode->i_ino; 120 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino); 121 ext3_debug ("freeing inode %lu\n", ino);
122 trace_ext3_free_inode(inode);
121 123
122 is_directory = S_ISDIR(inode->i_mode); 124 is_directory = S_ISDIR(inode->i_mode);
123 125
@@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
426 return ERR_PTR(-EPERM); 428 return ERR_PTR(-EPERM);
427 429
428 sb = dir->i_sb; 430 sb = dir->i_sb;
431 trace_ext3_request_inode(dir, mode);
429 inode = new_inode(sb); 432 inode = new_inode(sb);
430 if (!inode) 433 if (!inode)
431 return ERR_PTR(-ENOMEM); 434 return ERR_PTR(-ENOMEM);
@@ -601,6 +604,7 @@ got:
601 } 604 }
602 605
603 ext3_debug("allocating inode %lu\n", inode->i_ino); 606 ext3_debug("allocating inode %lu\n", inode->i_ino);
607 trace_ext3_allocate_inode(inode, dir, mode);
604 goto really_out; 608 goto really_out;
605fail: 609fail:
606 ext3_std_error(sb, err); 610 ext3_std_error(sb, err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2978a2a17a59..04da6acde85d 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -38,10 +38,12 @@
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h> 40#include <linux/namei.h>
41#include <trace/events/ext3.h>
41#include "xattr.h" 42#include "xattr.h"
42#include "acl.h" 43#include "acl.h"
43 44
44static int ext3_writepage_trans_blocks(struct inode *inode); 45static int ext3_writepage_trans_blocks(struct inode *inode);
46static int ext3_block_truncate_page(struct inode *inode, loff_t from);
45 47
46/* 48/*
47 * Test whether an inode is a fast symlink. 49 * Test whether an inode is a fast symlink.
@@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
70 72
71 might_sleep(); 73 might_sleep();
72 74
75 trace_ext3_forget(inode, is_metadata, blocknr);
73 BUFFER_TRACE(bh, "enter"); 76 BUFFER_TRACE(bh, "enter");
74 77
75 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 78 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
194 */ 197 */
195void ext3_evict_inode (struct inode *inode) 198void ext3_evict_inode (struct inode *inode)
196{ 199{
200 struct ext3_inode_info *ei = EXT3_I(inode);
197 struct ext3_block_alloc_info *rsv; 201 struct ext3_block_alloc_info *rsv;
198 handle_t *handle; 202 handle_t *handle;
199 int want_delete = 0; 203 int want_delete = 0;
200 204
205 trace_ext3_evict_inode(inode);
201 if (!inode->i_nlink && !is_bad_inode(inode)) { 206 if (!inode->i_nlink && !is_bad_inode(inode)) {
202 dquot_initialize(inode); 207 dquot_initialize(inode);
203 want_delete = 1; 208 want_delete = 1;
204 } 209 }
205 210
211 /*
212 * When journalling data, dirty buffers are tracked only in the journal.
213 * So although mm thinks everything is clean and ready for reaping the
214 * inode might still have some pages to write in the running
215 * transaction or waiting to be checkpointed. Thus calling
216 * journal_invalidatepage() (via truncate_inode_pages()) to discard
217 * these buffers can cause data loss. Also even if we did not discard
218 * these buffers, we would have no way to find them after the inode
219 * is reaped and thus the user could see stale data when trying to read
220 * them before the transaction is checkpointed. So be careful and
221 * force everything to disk here... We use ei->i_datasync_tid to
222 * store the newest transaction containing inode's data.
223 *
224 * Note that directories do not have this problem because they don't
225 * use page cache.
226 */
227 if (inode->i_nlink && ext3_should_journal_data(inode) &&
228 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
229 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
230 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
231
232 log_start_commit(journal, commit_tid);
233 log_wait_commit(journal, commit_tid);
234 filemap_write_and_wait(&inode->i_data);
235 }
206 truncate_inode_pages(&inode->i_data, 0); 236 truncate_inode_pages(&inode->i_data, 0);
207 237
208 ext3_discard_reservation(inode); 238 ext3_discard_reservation(inode);
209 rsv = EXT3_I(inode)->i_block_alloc_info; 239 rsv = ei->i_block_alloc_info;
210 EXT3_I(inode)->i_block_alloc_info = NULL; 240 ei->i_block_alloc_info = NULL;
211 if (unlikely(rsv)) 241 if (unlikely(rsv))
212 kfree(rsv); 242 kfree(rsv);
213 243
@@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode)
231 if (inode->i_blocks) 261 if (inode->i_blocks)
232 ext3_truncate(inode); 262 ext3_truncate(inode);
233 /* 263 /*
234 * Kill off the orphan record which ext3_truncate created. 264 * Kill off the orphan record created when the inode lost the last
235 * AKPM: I think this can be inside the above `if'. 265 * link. Note that ext3_orphan_del() has to be able to cope with the
236 * Note that ext3_orphan_del() has to be able to cope with the 266 * deletion of a non-existent orphan - ext3_truncate() could
237 * deletion of a non-existent orphan - this is because we don't 267 * have removed the record.
238 * know if ext3_truncate() actually created an orphan record.
239 * (Well, we could do this if we need to, but heck - it works)
240 */ 268 */
241 ext3_orphan_del(handle, inode); 269 ext3_orphan_del(handle, inode);
242 EXT3_I(inode)->i_dtime = get_seconds(); 270 ei->i_dtime = get_seconds();
243 271
244 /* 272 /*
245 * One subtle ordering requirement: if anything has gone wrong 273 * One subtle ordering requirement: if anything has gone wrong
@@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
842 ext3_fsblk_t first_block = 0; 870 ext3_fsblk_t first_block = 0;
843 871
844 872
873 trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
845 J_ASSERT(handle != NULL || create == 0); 874 J_ASSERT(handle != NULL || create == 0);
846 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); 875 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
847 876
@@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
886 if (!create || err == -EIO) 915 if (!create || err == -EIO)
887 goto cleanup; 916 goto cleanup;
888 917
918 /*
919 * Block out ext3_truncate while we alter the tree
920 */
889 mutex_lock(&ei->truncate_mutex); 921 mutex_lock(&ei->truncate_mutex);
890 922
891 /* 923 /*
@@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
934 */ 966 */
935 count = ext3_blks_to_allocate(partial, indirect_blks, 967 count = ext3_blks_to_allocate(partial, indirect_blks,
936 maxblocks, blocks_to_boundary); 968 maxblocks, blocks_to_boundary);
937 /*
938 * Block out ext3_truncate while we alter the tree
939 */
940 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, 969 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
941 offsets + (partial - chain), partial); 970 offsets + (partial - chain), partial);
942 971
@@ -970,6 +999,9 @@ cleanup:
970 } 999 }
971 BUFFER_TRACE(bh_result, "returned"); 1000 BUFFER_TRACE(bh_result, "returned");
972out: 1001out:
1002 trace_ext3_get_blocks_exit(inode, iblock,
1003 depth ? le32_to_cpu(chain[depth-1].key) : 0,
1004 count, err);
973 return err; 1005 return err;
974} 1006}
975 1007
@@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode)
1202 ext3_truncate(inode); 1234 ext3_truncate(inode);
1203} 1235}
1204 1236
1237/*
1238 * Truncate blocks that were not used by direct IO write. We have to zero out
1239 * the last file block as well because direct IO might have written to it.
1240 */
1241static void ext3_truncate_failed_direct_write(struct inode *inode)
1242{
1243 ext3_block_truncate_page(inode, inode->i_size);
1244 ext3_truncate(inode);
1245}
1246
1205static int ext3_write_begin(struct file *file, struct address_space *mapping, 1247static int ext3_write_begin(struct file *file, struct address_space *mapping,
1206 loff_t pos, unsigned len, unsigned flags, 1248 loff_t pos, unsigned len, unsigned flags,
1207 struct page **pagep, void **fsdata) 1249 struct page **pagep, void **fsdata)
@@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1217 * we allocate blocks but write fails for some reason */ 1259 * we allocate blocks but write fails for some reason */
1218 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; 1260 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1219 1261
1262 trace_ext3_write_begin(inode, pos, len, flags);
1263
1220 index = pos >> PAGE_CACHE_SHIFT; 1264 index = pos >> PAGE_CACHE_SHIFT;
1221 from = pos & (PAGE_CACHE_SIZE - 1); 1265 from = pos & (PAGE_CACHE_SIZE - 1);
1222 to = from + len; 1266 to = from + len;
@@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file,
1332 unsigned from, to; 1376 unsigned from, to;
1333 int ret = 0, ret2; 1377 int ret = 0, ret2;
1334 1378
1379 trace_ext3_ordered_write_end(inode, pos, len, copied);
1335 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1380 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1336 1381
1337 from = pos & (PAGE_CACHE_SIZE - 1); 1382 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file,
1367 struct inode *inode = file->f_mapping->host; 1412 struct inode *inode = file->f_mapping->host;
1368 int ret; 1413 int ret;
1369 1414
1415 trace_ext3_writeback_write_end(inode, pos, len, copied);
1370 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 1416 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1371 update_file_sizes(inode, pos, copied); 1417 update_file_sizes(inode, pos, copied);
1372 /* 1418 /*
@@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file,
1391{ 1437{
1392 handle_t *handle = ext3_journal_current_handle(); 1438 handle_t *handle = ext3_journal_current_handle();
1393 struct inode *inode = mapping->host; 1439 struct inode *inode = mapping->host;
1440 struct ext3_inode_info *ei = EXT3_I(inode);
1394 int ret = 0, ret2; 1441 int ret = 0, ret2;
1395 int partial = 0; 1442 int partial = 0;
1396 unsigned from, to; 1443 unsigned from, to;
1397 1444
1445 trace_ext3_journalled_write_end(inode, pos, len, copied);
1398 from = pos & (PAGE_CACHE_SIZE - 1); 1446 from = pos & (PAGE_CACHE_SIZE - 1);
1399 to = from + len; 1447 to = from + len;
1400 1448
@@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file,
1419 if (pos + len > inode->i_size && ext3_can_truncate(inode)) 1467 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1420 ext3_orphan_add(handle, inode); 1468 ext3_orphan_add(handle, inode);
1421 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1469 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1422 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1470 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
1423 EXT3_I(inode)->i_disksize = inode->i_size; 1471 if (inode->i_size > ei->i_disksize) {
1472 ei->i_disksize = inode->i_size;
1424 ret2 = ext3_mark_inode_dirty(handle, inode); 1473 ret2 = ext3_mark_inode_dirty(handle, inode);
1425 if (!ret) 1474 if (!ret)
1426 ret = ret2; 1475 ret = ret2;
@@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page,
1577 if (ext3_journal_current_handle()) 1626 if (ext3_journal_current_handle())
1578 goto out_fail; 1627 goto out_fail;
1579 1628
1629 trace_ext3_ordered_writepage(page);
1580 if (!page_has_buffers(page)) { 1630 if (!page_has_buffers(page)) {
1581 create_empty_buffers(page, inode->i_sb->s_blocksize, 1631 create_empty_buffers(page, inode->i_sb->s_blocksize,
1582 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1632 (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page,
1647 if (ext3_journal_current_handle()) 1697 if (ext3_journal_current_handle())
1648 goto out_fail; 1698 goto out_fail;
1649 1699
1700 trace_ext3_writeback_writepage(page);
1650 if (page_has_buffers(page)) { 1701 if (page_has_buffers(page)) {
1651 if (!walk_page_buffers(NULL, page_buffers(page), 0, 1702 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1652 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { 1703 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
@@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page,
1689 if (ext3_journal_current_handle()) 1740 if (ext3_journal_current_handle())
1690 goto no_write; 1741 goto no_write;
1691 1742
1743 trace_ext3_journalled_writepage(page);
1692 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); 1744 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1693 if (IS_ERR(handle)) { 1745 if (IS_ERR(handle)) {
1694 ret = PTR_ERR(handle); 1746 ret = PTR_ERR(handle);
@@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page,
1715 if (ret == 0) 1767 if (ret == 0)
1716 ret = err; 1768 ret = err;
1717 ext3_set_inode_state(inode, EXT3_STATE_JDATA); 1769 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1770 atomic_set(&EXT3_I(inode)->i_datasync_tid,
1771 handle->h_transaction->t_tid);
1718 unlock_page(page); 1772 unlock_page(page);
1719 } else { 1773 } else {
1720 /* 1774 /*
@@ -1739,6 +1793,7 @@ out_unlock:
1739 1793
1740static int ext3_readpage(struct file *file, struct page *page) 1794static int ext3_readpage(struct file *file, struct page *page)
1741{ 1795{
1796 trace_ext3_readpage(page);
1742 return mpage_readpage(page, ext3_get_block); 1797 return mpage_readpage(page, ext3_get_block);
1743} 1798}
1744 1799
@@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset)
1753{ 1808{
1754 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1809 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1755 1810
1811 trace_ext3_invalidatepage(page, offset);
1812
1756 /* 1813 /*
1757 * If it's a full truncate we just forget about the pending dirtying 1814 * If it's a full truncate we just forget about the pending dirtying
1758 */ 1815 */
@@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
1766{ 1823{
1767 journal_t *journal = EXT3_JOURNAL(page->mapping->host); 1824 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1768 1825
1826 trace_ext3_releasepage(page);
1769 WARN_ON(PageChecked(page)); 1827 WARN_ON(PageChecked(page));
1770 if (!page_has_buffers(page)) 1828 if (!page_has_buffers(page))
1771 return 0; 1829 return 0;
@@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1794 size_t count = iov_length(iov, nr_segs); 1852 size_t count = iov_length(iov, nr_segs);
1795 int retries = 0; 1853 int retries = 0;
1796 1854
1855 trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
1856
1797 if (rw == WRITE) { 1857 if (rw == WRITE) {
1798 loff_t final_size = offset + count; 1858 loff_t final_size = offset + count;
1799 1859
@@ -1827,7 +1887,7 @@ retry:
1827 loff_t end = offset + iov_length(iov, nr_segs); 1887 loff_t end = offset + iov_length(iov, nr_segs);
1828 1888
1829 if (end > isize) 1889 if (end > isize)
1830 vmtruncate(inode, isize); 1890 ext3_truncate_failed_direct_write(inode);
1831 } 1891 }
1832 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1892 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1833 goto retry; 1893 goto retry;
@@ -1841,7 +1901,7 @@ retry:
1841 /* This is really bad luck. We've written the data 1901 /* This is really bad luck. We've written the data
1842 * but cannot extend i_size. Truncate allocated blocks 1902 * but cannot extend i_size. Truncate allocated blocks
1843 * and pretend the write failed... */ 1903 * and pretend the write failed... */
1844 ext3_truncate(inode); 1904 ext3_truncate_failed_direct_write(inode);
1845 ret = PTR_ERR(handle); 1905 ret = PTR_ERR(handle);
1846 goto out; 1906 goto out;
1847 } 1907 }
@@ -1867,6 +1927,8 @@ retry:
1867 ret = err; 1927 ret = err;
1868 } 1928 }
1869out: 1929out:
1930 trace_ext3_direct_IO_exit(inode, offset,
1931 iov_length(iov, nr_segs), rw, ret);
1870 return ret; 1932 return ret;
1871} 1933}
1872 1934
@@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode)
1949 * This is required during truncate. We need to physically zero the tail end 2011 * This is required during truncate. We need to physically zero the tail end
1950 * of that block so it doesn't yield old data if the file is later grown. 2012 * of that block so it doesn't yield old data if the file is later grown.
1951 */ 2013 */
1952static int ext3_block_truncate_page(handle_t *handle, struct page *page, 2014static int ext3_block_truncate_page(struct inode *inode, loff_t from)
1953 struct address_space *mapping, loff_t from)
1954{ 2015{
1955 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2016 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1956 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2017 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
1957 unsigned blocksize, iblock, length, pos; 2018 unsigned blocksize, iblock, length, pos;
1958 struct inode *inode = mapping->host; 2019 struct page *page;
2020 handle_t *handle = NULL;
1959 struct buffer_head *bh; 2021 struct buffer_head *bh;
1960 int err = 0; 2022 int err = 0;
1961 2023
2024 /* Truncated on block boundary - nothing to do */
1962 blocksize = inode->i_sb->s_blocksize; 2025 blocksize = inode->i_sb->s_blocksize;
2026 if ((from & (blocksize - 1)) == 0)
2027 return 0;
2028
2029 page = grab_cache_page(inode->i_mapping, index);
2030 if (!page)
2031 return -ENOMEM;
1963 length = blocksize - (offset & (blocksize - 1)); 2032 length = blocksize - (offset & (blocksize - 1));
1964 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2033 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1965 2034
@@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
2004 goto unlock; 2073 goto unlock;
2005 } 2074 }
2006 2075
2076 /* data=writeback mode doesn't need a transaction to zero out data */
2077 if (!ext3_should_writeback_data(inode)) {
2078 /* We journal at most one block */
2079 handle = ext3_journal_start(inode, 1);
2080 if (IS_ERR(handle)) {
2081 clear_highpage(page);
2082 flush_dcache_page(page);
2083 err = PTR_ERR(handle);
2084 goto unlock;
2085 }
2086 }
2087
2007 if (ext3_should_journal_data(inode)) { 2088 if (ext3_should_journal_data(inode)) {
2008 BUFFER_TRACE(bh, "get write access"); 2089 BUFFER_TRACE(bh, "get write access");
2009 err = ext3_journal_get_write_access(handle, bh); 2090 err = ext3_journal_get_write_access(handle, bh);
2010 if (err) 2091 if (err)
2011 goto unlock; 2092 goto stop;
2012 } 2093 }
2013 2094
2014 zero_user(page, offset, length); 2095 zero_user(page, offset, length);
@@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
2022 err = ext3_journal_dirty_data(handle, bh); 2103 err = ext3_journal_dirty_data(handle, bh);
2023 mark_buffer_dirty(bh); 2104 mark_buffer_dirty(bh);
2024 } 2105 }
2106stop:
2107 if (handle)
2108 ext3_journal_stop(handle);
2025 2109
2026unlock: 2110unlock:
2027 unlock_page(page); 2111 unlock_page(page);
@@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2390 2474
2391int ext3_can_truncate(struct inode *inode) 2475int ext3_can_truncate(struct inode *inode)
2392{ 2476{
2393 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2394 return 0;
2395 if (S_ISREG(inode->i_mode)) 2477 if (S_ISREG(inode->i_mode))
2396 return 1; 2478 return 1;
2397 if (S_ISDIR(inode->i_mode)) 2479 if (S_ISDIR(inode->i_mode))
@@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode)
2435 struct ext3_inode_info *ei = EXT3_I(inode); 2517 struct ext3_inode_info *ei = EXT3_I(inode);
2436 __le32 *i_data = ei->i_data; 2518 __le32 *i_data = ei->i_data;
2437 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); 2519 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2438 struct address_space *mapping = inode->i_mapping;
2439 int offsets[4]; 2520 int offsets[4];
2440 Indirect chain[4]; 2521 Indirect chain[4];
2441 Indirect *partial; 2522 Indirect *partial;
@@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode)
2443 int n; 2524 int n;
2444 long last_block; 2525 long last_block;
2445 unsigned blocksize = inode->i_sb->s_blocksize; 2526 unsigned blocksize = inode->i_sb->s_blocksize;
2446 struct page *page; 2527
2528 trace_ext3_truncate_enter(inode);
2447 2529
2448 if (!ext3_can_truncate(inode)) 2530 if (!ext3_can_truncate(inode))
2449 goto out_notrans; 2531 goto out_notrans;
@@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode)
2451 if (inode->i_size == 0 && ext3_should_writeback_data(inode)) 2533 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2452 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); 2534 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2453 2535
2454 /*
2455 * We have to lock the EOF page here, because lock_page() nests
2456 * outside journal_start().
2457 */
2458 if ((inode->i_size & (blocksize - 1)) == 0) {
2459 /* Block boundary? Nothing to do */
2460 page = NULL;
2461 } else {
2462 page = grab_cache_page(mapping,
2463 inode->i_size >> PAGE_CACHE_SHIFT);
2464 if (!page)
2465 goto out_notrans;
2466 }
2467
2468 handle = start_transaction(inode); 2536 handle = start_transaction(inode);
2469 if (IS_ERR(handle)) { 2537 if (IS_ERR(handle))
2470 if (page) {
2471 clear_highpage(page);
2472 flush_dcache_page(page);
2473 unlock_page(page);
2474 page_cache_release(page);
2475 }
2476 goto out_notrans; 2538 goto out_notrans;
2477 }
2478 2539
2479 last_block = (inode->i_size + blocksize-1) 2540 last_block = (inode->i_size + blocksize-1)
2480 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); 2541 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2481
2482 if (page)
2483 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2484
2485 n = ext3_block_to_path(inode, last_block, offsets, NULL); 2542 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2486 if (n == 0) 2543 if (n == 0)
2487 goto out_stop; /* error */ 2544 goto out_stop; /* error */
@@ -2596,6 +2653,7 @@ out_stop:
2596 ext3_orphan_del(handle, inode); 2653 ext3_orphan_del(handle, inode);
2597 2654
2598 ext3_journal_stop(handle); 2655 ext3_journal_stop(handle);
2656 trace_ext3_truncate_exit(inode);
2599 return; 2657 return;
2600out_notrans: 2658out_notrans:
2601 /* 2659 /*
@@ -2604,6 +2662,7 @@ out_notrans:
2604 */ 2662 */
2605 if (inode->i_nlink) 2663 if (inode->i_nlink)
2606 ext3_orphan_del(NULL, inode); 2664 ext3_orphan_del(NULL, inode);
2665 trace_ext3_truncate_exit(inode);
2607} 2666}
2608 2667
2609static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, 2668static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2745,6 +2804,7 @@ make_io:
2745 * has in-inode xattrs, or we don't have this inode in memory. 2804 * has in-inode xattrs, or we don't have this inode in memory.
2746 * Read the block from disk. 2805 * Read the block from disk.
2747 */ 2806 */
2807 trace_ext3_load_inode(inode);
2748 get_bh(bh); 2808 get_bh(bh);
2749 bh->b_end_io = end_buffer_read_sync; 2809 bh->b_end_io = end_buffer_read_sync;
2750 submit_bh(READ_META, bh); 2810 submit_bh(READ_META, bh);
@@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3229 } 3289 }
3230 3290
3231 error = ext3_orphan_add(handle, inode); 3291 error = ext3_orphan_add(handle, inode);
3292 if (error) {
3293 ext3_journal_stop(handle);
3294 goto err_out;
3295 }
3232 EXT3_I(inode)->i_disksize = attr->ia_size; 3296 EXT3_I(inode)->i_disksize = attr->ia_size;
3233 rc = ext3_mark_inode_dirty(handle, inode); 3297 error = ext3_mark_inode_dirty(handle, inode);
3234 if (!error)
3235 error = rc;
3236 ext3_journal_stop(handle); 3298 ext3_journal_stop(handle);
3299 if (error) {
3300 /* Some hard fs error must have happened. Bail out. */
3301 ext3_orphan_del(NULL, inode);
3302 goto err_out;
3303 }
3304 rc = ext3_block_truncate_page(inode, attr->ia_size);
3305 if (rc) {
3306 /* Cleanup orphan list and exit */
3307 handle = ext3_journal_start(inode, 3);
3308 if (IS_ERR(handle)) {
3309 ext3_orphan_del(NULL, inode);
3310 goto err_out;
3311 }
3312 ext3_orphan_del(handle, inode);
3313 ext3_journal_stop(handle);
3314 goto err_out;
3315 }
3237 } 3316 }
3238 3317
3239 if ((attr->ia_valid & ATTR_SIZE) && 3318 if ((attr->ia_valid & ATTR_SIZE) &&
3240 attr->ia_size != i_size_read(inode)) { 3319 attr->ia_size != i_size_read(inode)) {
3241 rc = vmtruncate(inode, attr->ia_size); 3320 truncate_setsize(inode, attr->ia_size);
3242 if (rc) 3321 ext3_truncate(inode);
3243 goto err_out;
3244 } 3322 }
3245 3323
3246 setattr_copy(inode, attr); 3324 setattr_copy(inode, attr);
@@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3374 int err; 3452 int err;
3375 3453
3376 might_sleep(); 3454 might_sleep();
3455 trace_ext3_mark_inode_dirty(inode, _RET_IP_);
3377 err = ext3_reserve_inode_write(handle, inode, &iloc); 3456 err = ext3_reserve_inode_write(handle, inode, &iloc);
3378 if (!err) 3457 if (!err)
3379 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 3458 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
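
The ext3_setattr() hunk above drops vmtruncate() in favour of an explicit sequence; condensed to the success path, the ordering the patch establishes for a size-changing setattr is:

    error = ext3_orphan_add(handle, inode);      /* crash recovery hook first */
    if (error) {
            ext3_journal_stop(handle);
            goto err_out;
    }
    EXT3_I(inode)->i_disksize = attr->ia_size;
    error = ext3_mark_inode_dirty(handle, inode);
    ext3_journal_stop(handle);
    rc = ext3_block_truncate_page(inode, attr->ia_size); /* zero the tail block */
    truncate_setsize(inode, attr->ia_size);      /* i_size and page cache */
    ext3_truncate(inode);                        /* free the data blocks */
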
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index f4090bd2f345..c7f43944f160 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -285,7 +285,7 @@ group_add_out:
285 if (!capable(CAP_SYS_ADMIN)) 285 if (!capable(CAP_SYS_ADMIN))
286 return -EPERM; 286 return -EPERM;
287 287
288 if (copy_from_user(&range, (struct fstrim_range *)arg, 288 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
289 sizeof(range))) 289 sizeof(range)))
290 return -EFAULT; 290 return -EFAULT;
291 291
@@ -293,7 +293,7 @@ group_add_out:
293 if (ret < 0) 293 if (ret < 0)
294 return ret; 294 return ret;
295 295
296 if (copy_to_user((struct fstrim_range *)arg, &range, 296 if (copy_to_user((struct fstrim_range __user *)arg, &range,
297 sizeof(range))) 297 sizeof(range)))
298 return -EFAULT; 298 return -EFAULT;
299 299
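
The only ioctl.c change is annotation: __user marks a pointer as a userspace address so that sparse (make C=1) can warn about direct dereferences, while copy_from_user()/copy_to_user() stay the sanctioned accessors. The FITRIM shape, condensed (the ext3_trim_fs() call between the two copies is implied by the surrounding code, not shown in the hunk):

    struct fstrim_range range;
    struct fstrim_range __user *user_range =
            (struct fstrim_range __user *) arg;

    if (copy_from_user(&range, user_range, sizeof(range)))
            return -EFAULT;
    ret = ext3_trim_fs(sb, &range);
    if (ret < 0)
            return ret;
    if (copy_to_user(user_range, &range, sizeof(range)))
            return -EFAULT;
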
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3b57230a17bb..5571708b6a58 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,6 +36,7 @@
36#include <linux/quotaops.h> 36#include <linux/quotaops.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <trace/events/ext3.h>
39 40
40#include "namei.h" 41#include "namei.h"
41#include "xattr.h" 42#include "xattr.h"
@@ -287,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
287 while (len--) printk("%c", *name++); 288 while (len--) printk("%c", *name++);
288 ext3fs_dirhash(de->name, de->name_len, &h); 289 ext3fs_dirhash(de->name, de->name_len, &h);
289 printk(":%x.%u ", h.hash, 290 printk(":%x.%u ", h.hash,
290 ((char *) de - base)); 291 (unsigned) ((char *) de - base));
291 } 292 }
292 space += EXT3_DIR_REC_LEN(de->name_len); 293 space += EXT3_DIR_REC_LEN(de->name_len);
293 names++; 294 names++;
@@ -1013,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
1013 1014
1014 *err = -ENOENT; 1015 *err = -ENOENT;
1015errout: 1016errout:
1016 dxtrace(printk("%s not found\n", name)); 1017 dxtrace(printk("%s not found\n", entry->name));
1017 dx_release (frames); 1018 dx_release (frames);
1018 return NULL; 1019 return NULL;
1019} 1020}
@@ -2140,6 +2141,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2140 struct ext3_dir_entry_2 * de; 2141 struct ext3_dir_entry_2 * de;
2141 handle_t *handle; 2142 handle_t *handle;
2142 2143
2144 trace_ext3_unlink_enter(dir, dentry);
2143 /* Initialize quotas before so that eventual writes go 2145 /* Initialize quotas before so that eventual writes go
2144 * in separate transaction */ 2146 * in separate transaction */
2145 dquot_initialize(dir); 2147 dquot_initialize(dir);
@@ -2185,6 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2185end_unlink: 2187end_unlink:
2186 ext3_journal_stop(handle); 2188 ext3_journal_stop(handle);
2187 brelse (bh); 2189 brelse (bh);
2190 trace_ext3_unlink_exit(dentry, retval);
2188 return retval; 2191 return retval;
2189} 2192}
2190 2193
@@ -2206,9 +2209,11 @@ static int ext3_symlink (struct inode * dir,
2206 /* 2209 /*
2207 * For non-fast symlinks, we just allocate inode and put it on 2210 * For non-fast symlinks, we just allocate inode and put it on
2208 * orphan list in the first transaction => we need bitmap, 2211 * orphan list in the first transaction => we need bitmap,
2209 * group descriptor, sb, inode block, quota blocks. 2212 * group descriptor, sb, inode block, quota blocks, and
2213 * possibly selinux xattr blocks.
2210 */ 2214 */
2211 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2215 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2216 EXT3_XATTR_TRANS_BLOCKS;
2212 } else { 2217 } else {
2213 /* 2218 /*
2214 * Fast symlink. We have to add entry to directory 2219 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b57ea2f91269..7beb69ae0015 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,9 @@
44#include "acl.h" 44#include "acl.h"
45#include "namei.h" 45#include "namei.h"
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/ext3.h>
49
47#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED 50#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
48 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA 51 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
49#else 52#else
@@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
497 return &ei->vfs_inode; 500 return &ei->vfs_inode;
498} 501}
499 502
503static int ext3_drop_inode(struct inode *inode)
504{
505 int drop = generic_drop_inode(inode);
506
507 trace_ext3_drop_inode(inode, drop);
508 return drop;
509}
510
500static void ext3_i_callback(struct rcu_head *head) 511static void ext3_i_callback(struct rcu_head *head)
501{ 512{
502 struct inode *inode = container_of(head, struct inode, i_rcu); 513 struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = {
788 .destroy_inode = ext3_destroy_inode, 799 .destroy_inode = ext3_destroy_inode,
789 .write_inode = ext3_write_inode, 800 .write_inode = ext3_write_inode,
790 .dirty_inode = ext3_dirty_inode, 801 .dirty_inode = ext3_dirty_inode,
802 .drop_inode = ext3_drop_inode,
791 .evict_inode = ext3_evict_inode, 803 .evict_inode = ext3_evict_inode,
792 .put_super = ext3_put_super, 804 .put_super = ext3_put_super,
793 .sync_fs = ext3_sync_fs, 805 .sync_fs = ext3_sync_fs,
@@ -2509,6 +2521,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2509{ 2521{
2510 tid_t target; 2522 tid_t target;
2511 2523
2524 trace_ext3_sync_fs(sb, wait);
2512 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { 2525 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2513 if (wait) 2526 if (wait)
2514 log_wait_commit(EXT3_SB(sb)->s_journal, target); 2527 log_wait_commit(EXT3_SB(sb)->s_journal, target);
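
CREATE_TRACE_POINTS must be defined in exactly one compilation unit before the trace header is included: that one include expands the TRACE_EVENT() descriptions into real tracepoint definitions, while every other file includes the header plainly and gets declarations only. That is why super.c carries the #define and the balloc.c, fsync.c, ialloc.c, inode.c and namei.c hunks above add only the bare #include:

    /* fs/ext3/super.c - the single TU that instantiates the events */
    #define CREATE_TRACE_POINTS
    #include <trace/events/ext3.h>

    /* everywhere else - declarations only */
    #include <trace/events/ext3.h>
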
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 32e6cc23bd9a..d565759d82ee 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -803,8 +803,16 @@ inserted:
803 /* We need to allocate a new block */ 803 /* We need to allocate a new block */
804 ext3_fsblk_t goal = ext3_group_first_block_no(sb, 804 ext3_fsblk_t goal = ext3_group_first_block_no(sb,
805 EXT3_I(inode)->i_block_group); 805 EXT3_I(inode)->i_block_group);
806 ext3_fsblk_t block = ext3_new_block(handle, inode, 806 ext3_fsblk_t block;
807 goal, &error); 807
808 /*
809 * Protect us against concurrent allocations to the
810 * same inode from ext3_..._writepage(). Reservation
811 * code does not expect racing allocations.
812 */
813 mutex_lock(&EXT3_I(inode)->truncate_mutex);
814 block = ext3_new_block(handle, inode, goal, &error);
815 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
808 if (error) 816 if (error)
809 goto cleanup; 817 goto cleanup;
810 ea_idebug(inode, "creating block %d", block); 818 ea_idebug(inode, "creating block %d", block);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 04109460ba9e..56fd8f865930 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o 10 mmp.o indirect.o
11 11
12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dca2d1ded931..a5c29bb3b835 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
198 case ACL_TYPE_ACCESS: 198 case ACL_TYPE_ACCESS:
199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; 199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
200 if (acl) { 200 if (acl) {
201 mode_t mode = inode->i_mode; 201 error = posix_acl_equiv_mode(acl, &inode->i_mode);
202 error = posix_acl_equiv_mode(acl, &mode);
203 if (error < 0) 202 if (error < 0)
204 return error; 203 return error;
205 else { 204 else {
206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode); 205 inode->i_ctime = ext4_current_time(inode);
208 ext4_mark_inode_dirty(handle, inode); 206 ext4_mark_inode_dirty(handle, inode);
209 if (error == 0) 207 if (error == 0)
@@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
259 inode->i_mode &= ~current_umask(); 257 inode->i_mode &= ~current_umask();
260 } 258 }
261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 259 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
262 mode_t mode = inode->i_mode;
263
264 if (S_ISDIR(inode->i_mode)) { 260 if (S_ISDIR(inode->i_mode)) {
265 error = ext4_set_acl(handle, inode, 261 error = ext4_set_acl(handle, inode,
266 ACL_TYPE_DEFAULT, acl); 262 ACL_TYPE_DEFAULT, acl);
267 if (error) 263 if (error)
268 goto cleanup; 264 goto cleanup;
269 } 265 }
270 error = posix_acl_create(&acl, GFP_NOFS, &mode); 266 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
271 if (error < 0) 267 if (error < 0)
272 return error; 268 return error;
273 269
274 inode->i_mode = mode;
275 if (error > 0) { 270 if (error > 0) {
276 /* This is an extended ACL */ 271 /* This is an extended ACL */
277 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 272 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 264f6949511e..f8224adf496e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
620 620
621} 621}
622 622
623/**
624 * ext4_inode_to_goal_block - return a hint for block allocation
625 * @inode: inode for block allocation
626 *
627 * Return the ideal location to start allocating blocks for a
628 * newly created inode.
629 */
630ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
631{
632 struct ext4_inode_info *ei = EXT4_I(inode);
633 ext4_group_t block_group;
634 ext4_grpblk_t colour;
635 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
636 ext4_fsblk_t bg_start;
637 ext4_fsblk_t last_block;
638
639 block_group = ei->i_block_group;
640 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
641 /*
642 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
643 * block groups per flexgroup, reserve the first block
644 * group for directories and special files. Regular
645 * files will start at the second block group. This
646 * tends to speed up directory access and improves
647 * fsck times.
648 */
649 block_group &= ~(flex_size-1);
650 if (S_ISREG(inode->i_mode))
651 block_group++;
652 }
653 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
654 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
655
656 /*
657 * If we are doing delayed allocation, we don't need to take
658 * colour into account.
659 */
660 if (test_opt(inode->i_sb, DELALLOC))
661 return bg_start;
662
663 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
664 colour = (current->pid % 16) *
665 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
666 else
667 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
668 return bg_start + colour;
669}
670
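
A worked example of the goal computation in ext4_inode_to_goal_block(), assuming a flexgroup size of 16 (large enough to pass the EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME test); the numbers are illustrative:

    /*
     * Regular file whose inode sits in block group 37:
     *   block_group = 37 & ~(16 - 1) = 32    first group of the flexgroup
     *   block_group++              -> 33     S_ISREG skips group 32, which
     *                                        is kept for directories
     *   goal = first block of group 33
     *        + (current->pid % 16) * (blocks_per_group / 16)
     *   (the colour term is skipped entirely under delalloc)
     */
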
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba80..8efb2f0a3447 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
246 return 1; 246 return 1;
247} 247}
248 248
249int ext4_check_blockref(const char *function, unsigned int line,
250 struct inode *inode, __le32 *p, unsigned int max)
251{
252 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
253 __le32 *bref = p;
254 unsigned int blk;
255
256 while (bref < p+max) {
257 blk = le32_to_cpu(*bref++);
258 if (blk &&
259 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
260 blk, 1))) {
261 es->s_last_error_block = cpu_to_le64(blk);
262 ext4_error_inode(inode, function, line, blk,
263 "invalid block");
264 return -EIO;
265 }
266 }
267 return 0;
268}
269
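
ext4_check_blockref() walks an array of little-endian block references and, on the first invalid one, records it in the superblock error fields, logs it against the inode, and returns -EIO. A hypothetical caller, checking the direct block pointers cached in the in-core inode (illustrative only; the real callers live in the new indirect.c, outside this listing):

    int err;

    err = ext4_check_blockref(__func__, __LINE__, inode,
                              EXT4_I(inode)->i_data, EXT4_NDIR_BLOCKS);
    if (err)
            return err;     /* -EIO; the bad block was already reported */
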
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa44df879711..b7d7bd0f066e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -175,6 +175,7 @@ struct mpage_da_data {
175 */ 175 */
176#define EXT4_IO_END_UNWRITTEN 0x0001 176#define EXT4_IO_END_UNWRITTEN 0x0001
177#define EXT4_IO_END_ERROR 0x0002 177#define EXT4_IO_END_ERROR 0x0002
178#define EXT4_IO_END_QUEUED 0x0004
178 179
179struct ext4_io_page { 180struct ext4_io_page {
180 struct page *p_page; 181 struct page *p_page;
@@ -526,6 +527,7 @@ struct ext4_new_group_data {
526#define EXT4_FREE_BLOCKS_METADATA 0x0001 527#define EXT4_FREE_BLOCKS_METADATA 0x0001
527#define EXT4_FREE_BLOCKS_FORGET 0x0002 528#define EXT4_FREE_BLOCKS_FORGET 0x0002
528#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 529#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
530#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
529 531
530/* 532/*
531 * ioctl commands 533 * ioctl commands
@@ -939,6 +941,8 @@ struct ext4_inode_info {
939#define ext4_find_next_zero_bit find_next_zero_bit_le 941#define ext4_find_next_zero_bit find_next_zero_bit_le
940#define ext4_find_next_bit find_next_bit_le 942#define ext4_find_next_bit find_next_bit_le
941 943
944extern void ext4_set_bits(void *bm, int cur, int len);
945
942/* 946/*
943 * Maximal mount counts between two filesystem checks 947 * Maximal mount counts between two filesystem checks
944 */ 948 */
@@ -1126,7 +1130,8 @@ struct ext4_sb_info {
1130 struct journal_s *s_journal;
1131 struct list_head s_orphan;
1132 struct mutex s_orphan_lock;
1129 struct mutex s_resize_lock;
1133 unsigned long s_resize_flags; /* Flags indicating if there
1134 is a resizer */
1135 unsigned long s_commit_interval;
1136 u32 s_max_batch_time;
1137 u32 s_min_batch_time;
@@ -1214,6 +1219,9 @@ struct ext4_sb_info {
1219
1220 /* Kernel thread for multiple mount protection */
1221 struct task_struct *s_mmp_tsk;
1222
1223 /* record the last minlen when FITRIM is called. */
1224 atomic_t s_last_trim_minblks;
1225};
1226
1227static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1743,6 +1751,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1751 struct ext4_group_desc *desc);
1752#define ext4_free_blocks_after_init(sb, group, desc) \
1753 ext4_init_block_bitmap(sb, NULL, group, desc)
1754ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1755
1756/* dir.c */
1757extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -1793,7 +1802,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1802 unsigned long count, int flags);
1803extern int ext4_mb_add_groupinfo(struct super_block *sb,
1804 ext4_group_t i, struct ext4_group_desc *desc);
1796extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1797 ext4_fsblk_t block, unsigned long count);
1805extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
1806 ext4_fsblk_t block, unsigned long count);
1807extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1808
@@ -1834,6 +1843,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1843extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1844extern void ext4_da_update_reserve_space(struct inode *inode,
1845 int used, int quota_claim);
1846
1847/* indirect.c */
1848extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
1849 struct ext4_map_blocks *map, int flags);
1850extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
1851 const struct iovec *iov, loff_t offset,
1852 unsigned long nr_segs);
1853extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
1854extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
1855extern void ext4_ind_truncate(struct inode *inode);
1856
1857/* ioctl.c */
1858extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1859extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1855,6 +1875,9 @@ extern int ext4_group_extend(struct super_block *sb,
1875 ext4_fsblk_t n_blocks_count);
1876
1877/* super.c */
1878extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1879extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1880extern void ext4_kvfree(void *ptr);
1881extern void __ext4_error(struct super_block *, const char *, unsigned int,
1882 const char *, ...)
1883 __attribute__ ((format (printf, 4, 5)));
@@ -2067,11 +2090,19 @@ struct ext4_group_info {
2090 * 5 free 8-block regions. */
2091};
2092
2093#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
2094#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
2095
2096#define EXT4_MB_GRP_NEED_INIT(grp) \
2097 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
2098
2099#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
2100 (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2101#define EXT4_MB_GRP_SET_TRIMMED(grp) \
2102 (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2103#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
2104 (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2105
2106#define EXT4_MAX_CONTENTION 8
2107#define EXT4_CONTENTION_THRESHOLD 2
2108
@@ -2123,6 +2154,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb)
2154}
2155
2156/*
2157 * Block validity checking
2158 */
2159#define ext4_check_indirect_blockref(inode, bh) \
2160 ext4_check_blockref(__func__, __LINE__, inode, \
2161 (__le32 *)(bh)->b_data, \
2162 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
2163
2164#define ext4_ind_check_inode(inode) \
2165 ext4_check_blockref(__func__, __LINE__, inode, \
2166 EXT4_I(inode)->i_data, \
2167 EXT4_NDIR_BLOCKS)
2168
2169/*
2170 * Inodes and files operations
2171 */
2172
@@ -2151,6 +2195,8 @@ extern void ext4_exit_system_zone(void);
2195extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
2196 ext4_fsblk_t start_blk,
2197 unsigned int count);
2198extern int ext4_check_blockref(const char *, unsigned int,
2199 struct inode *, __le32 *, unsigned int);
2200
2201/* extents.c */
2202extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
@@ -2230,6 +2276,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2276extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2277extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2278
2279#define EXT4_RESIZING 0
2280extern int ext4_resize_begin(struct super_block *sb);
2281extern void ext4_resize_end(struct super_block *sb);
2282
2283#endif	/* __KERNEL__ */
2284
2285#endif	/* _EXT4_H */
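Taken together, the new EXT4_MB_GRP_*_TRIMMED macros and s_last_trim_minblks let FITRIM skip block groups that were already trimmed and whose free space has not changed since. One way a caller might compose them (assumed caller logic for illustration; the actual mballoc changes are not shown in this hunk):

	/* Sketch: skip a group already trimmed with an equal or smaller
	 * minimum extent length; otherwise trim it and mark it. */
	static int maybe_trim_group(struct ext4_sb_info *sbi,
				    struct ext4_group_info *grp,
				    unsigned long minblks)
	{
		if (EXT4_MB_GRP_WAS_TRIMMED(grp) &&
		    minblks >= atomic_read(&sbi->s_last_trim_minblks))
			return 0;	/* nothing new to discard */

		/* ... issue discards for the group's free extents ... */

		EXT4_MB_GRP_SET_TRIMMED(grp);
		return 1;
	}

EXT4_MB_GRP_CLEAR_TRIMMED() would then be invoked from the block-freeing path, since freeing creates new, untrimmed free extents.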
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index bb85757689b6..5802fa1dab18 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode)
289
290static inline int ext4_should_writeback_data(struct inode *inode)
291{
292 if (!S_ISREG(inode->i_mode))
293 return 0;
292	if (EXT4_JOURNAL(inode) == NULL)
293		return 1;
294 if (!S_ISREG(inode->i_mode))
295 return 0;
296	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
297		return 0;
298	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
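The reordering above changes behaviour only for non-regular inodes on no-journal filesystems: with EXT4_JOURNAL() == NULL, writeback semantics must be reported for every inode, so that check now runs first. The full post-patch predicate, with the unchanged tail filled in from context (a sketch, assuming the function ends by returning 1 for data=writeback and 0 otherwise, as in mainline):

	static inline int should_writeback_data(struct inode *inode)
	{
		if (EXT4_JOURNAL(inode) == NULL)
			return 1;	/* no journal: always writeback */
		if (!S_ISREG(inode->i_mode))
			return 0;
		if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
			return 0;
		if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			return 1;
		return 0;
	}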
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc81e7a2..57cf568a98ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
114				struct ext4_ext_path *path,
115				ext4_lblk_t block)
116{
117 struct ext4_inode_info *ei = EXT4_I(inode);
118 ext4_fsblk_t bg_start;
119 ext4_fsblk_t last_block;
120 ext4_grpblk_t colour;
121 ext4_group_t block_group;
122 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
117	int depth;
118
119	if (path) {
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
155	}
156
157	/* OK. use inode's group */
164	block_group = ei->i_block_group;
165 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
166 /*
167 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
168 * block groups per flexgroup, reserve the first block
169 * group for directories and special files. Regular
170 * files will start at the second block group. This
171 * tends to speed up directory access and improves
172 * fsck times.
173 */
174 block_group &= ~(flex_size-1);
175 if (S_ISREG(inode->i_mode))
176 block_group++;
177 }
178 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
179 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
180
181 /*
182	 * If we are doing delayed allocation, we don't need to take
183 * colour into account.
184 */
185 if (test_opt(inode->i_sb, DELALLOC))
186 return bg_start;
187
188 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
189 colour = (current->pid % 16) *
190 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
191 else
192 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
193 return bg_start + colour + block;
158	return ext4_inode_to_goal_block(inode);
159}
160
161/*
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
776 logical, le32_to_cpu(curp->p_idx->ei_block)); 741 logical, le32_to_cpu(curp->p_idx->ei_block));
777 return -EIO; 742 return -EIO;
778 } 743 }
744
745 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
746 >= le16_to_cpu(curp->p_hdr->eh_max))) {
747 EXT4_ERROR_INODE(inode,
748 "eh_entries %d >= eh_max %d!",
749 le16_to_cpu(curp->p_hdr->eh_entries),
750 le16_to_cpu(curp->p_hdr->eh_max));
751 return -EIO;
752 }
753
754	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
755	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
756		/* insert after */
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
780	ext4_idx_store_pblock(ix, ptr);
781	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
782
808 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
809 > le16_to_cpu(curp->p_hdr->eh_max))) {
810 EXT4_ERROR_INODE(inode,
811 "logical %d == ei_block %d!",
812 logical, le32_to_cpu(curp->p_idx->ei_block));
813 return -EIO;
814 }
783	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
784		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
785		return -EIO;
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1414 * ext4_ext_next_leaf_block:
1415 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1416 */
1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1450					struct ext4_ext_path *path)
1417static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1418{
1419	int depth;
1420
@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1724		goto merge;
1725	}
1726
1760repeat:
1727	depth = ext_depth(inode);
1728	eh = path[depth].p_hdr;
1729	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1765,9 +1731,10 @@ repeat:
1731
1732	/* probably next leaf has space for us? */
1733	fex = EXT_LAST_EXTENT(eh);
1768	next = ext4_ext_next_leaf_block(inode, path);
1769	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1770	    && next != EXT_MAX_BLOCKS) {
1734	next = EXT_MAX_BLOCKS;
1735	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1736		next = ext4_ext_next_leaf_block(path);
1737	if (next != EXT_MAX_BLOCKS) {
1738		ext_debug("next leaf block - %d\n", next);
1739		BUG_ON(npath != NULL);
1740		npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1779,7 +1746,7 @@ repeat:
1746			ext_debug("next leaf isn't full(%d)\n",
1747				  le16_to_cpu(eh->eh_entries));
1748			path = npath;
1782			goto repeat;
1749			goto has_space;
1750		}
1751		ext_debug("next leaf has no free space(%d,%d)\n",
1752			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1839,7 +1806,7 @@ has_space:
1806				ext4_ext_pblock(newext),
1807				ext4_ext_is_uninitialized(newext),
1808				ext4_ext_get_actual_len(newext),
1842				nearex, len, nearex + 1, nearex + 2);
1809				nearex, len, nearex, nearex + 1);
1810		memmove(nearex + 1, nearex, len);
1811		path[depth].p_ext = nearex;
1812	}
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2019}
2020
2021/*
2055 * ext4_ext_in_cache()
2022 * ext4_ext_check_cache()
2023 * Checks to see if the given block is in the cache.
2024 * If it is, the cached extent is stored in the given
2025 * cache extent pointer.  If the cached extent is a hole,
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2101/*
2102 * ext4_ext_rm_idx:
2103 * removes index from the index block.
2137 * It's used in truncate case only, thus all requests are for
2138 * last index in the block only.
2104 */
2105static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2106			struct ext4_ext_path *path)
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2118	err = ext4_ext_get_access(handle, inode, path);
2119	if (err)
2120		return err;
2121
2122 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2123 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2124 len *= sizeof(struct ext4_extent_idx);
2125 memmove(path->p_idx, path->p_idx + 1, len);
2126 }
2127
2128	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2129	err = ext4_ext_dirty(handle, inode, path);
2130	if (err)
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2506		return 1;
2507}
2508
2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538				ext4_lblk_t end)
2509static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2510{
2511	struct super_block *sb = inode->i_sb;
2512	int depth = ext_depth(inode);
@@ -2575,7 +2546,7 @@ again:
2546		if (i == depth) {
2547			/* this is leaf block */
2548			err = ext4_ext_rm_leaf(handle, inode, path,
2578					start, end);
2549					start, EXT_MAX_BLOCKS - 1);
2550			/* root level has p_bh == NULL, brelse() eats this */
2551			brelse(path[i].p_bh);
2552			path[i].p_bh = NULL;
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3078					struct ext4_ext_path *path)
3079{
3080	struct ext4_extent *ex;
3110	struct ext4_extent_header *eh;
3081	int depth;
3082	int err = 0;
3083
3084	depth = ext_depth(inode);
3115	eh = path[depth].p_hdr;
3085	ex = path[depth].p_ext;
3086
3087	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3326	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3327
3328	/* check in cache */
3360	if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361		((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3329	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3330	    ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3331		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3332			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3333				/*
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3466
3467			ext4_ext_mark_uninitialized(ex);
3468
3500			err = ext4_ext_remove_space(inode, map->m_lblk,
3501				map->m_lblk + punched_out);
3469			ext4_ext_invalidate_cache(inode);
3470
3471 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out);
3473
3474 if (!err && path->p_hdr->eh_entries == 0) {
3475 /*
3476 * Punch hole freed all of this sub tree,
3477 * so we need to correct eh_depth
3478 */
3479 err = ext4_ext_get_access(handle, inode, path);
3480 if (err == 0) {
3481 ext_inode_hdr(inode)->eh_depth = 0;
3482 ext_inode_hdr(inode)->eh_max =
3483 cpu_to_le16(ext4_ext_space_root(
3484 inode, 0));
3485
3486 err = ext4_ext_dirty(
3487 handle, inode, path);
3488 }
3489 }
3490
3491			goto out2;
3492		}
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3584	}
3585
3586	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3599	if (err)
3600		goto out2;
3601
3602	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3587	if (!err)
3588		err = ext4_ext_insert_extent(handle, inode, path,
3589					     &newex, flags);
3590	if (err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3593		/* free data blocks we just allocated */
3594		/* not a good idea to call discard here directly,
3595		 * but otherwise we'd need to call it every free() */
3596		ext4_discard_preallocations(inode);
3597		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3609				 ext4_ext_get_actual_len(&newex), 0);
3598				 ext4_ext_get_actual_len(&newex), fb_flags);
3599		goto out2;
3600	}
3601
@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode)
3688
3689	last_block = (inode->i_size + sb->s_blocksize - 1)
3690			>> EXT4_BLOCK_SIZE_BITS(sb);
3702	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
3691	err = ext4_ext_remove_space(inode, last_block);
3692
3693	/* In a multi-transaction truncate, we only make the final
3694	 * transaction synchronous.
@@ -3835,7 +3824,7 @@ retry:
3824						blkbits) >> blkbits))
3825			new_size = offset + len;
3826		else
3838			new_size = (map.m_lblk + ret) << blkbits;
3827			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
3828
3829		ext4_falloc_update_inode(inode, mode, new_size,
3830					 (map.m_flags & EXT4_MAP_NEW));
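Among the extents.c changes above, the new memmove in ext4_ext_rm_idx() is what lets punch-hole delete an index from the middle of a node: entries to the right of p_idx are shifted left before eh_entries is decremented, where truncate previously only ever removed the last index (hence the deleted comment). The shift in isolation (generic types and a hypothetical struct, for illustration):

	#include <string.h>

	struct idx { unsigned int ei_block; unsigned long long pblk; };

	/* Remove entries[pos] from an n-entry array by shifting the tail
	 * left - the same memmove ext4_ext_rm_idx() now performs when the
	 * victim is not the last index; the caller then drops eh_entries. */
	static void rm_idx(struct idx *entries, int n, int pos)
	{
		if (pos < n - 1)
			memmove(&entries[pos], &entries[pos + 1],
				(n - 1 - pos) * sizeof(struct idx));
	}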
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index da3bed3e0c29..036f78f7a1ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode)
129{
130	struct writeback_control wbc;
131	struct dentry *dentry = NULL;
132	struct inode *next;
133	int ret = 0;
134
134	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
135	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
136 return 0;
137 inode = igrab(inode);
138 while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
139		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
136		dentry = list_entry(inode->i_dentry.next,
137					struct dentry, d_alias);
138		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
140		dentry = NULL;
141		spin_lock(&inode->i_lock);
142		if (!list_empty(&inode->i_dentry)) {
143 dentry = list_first_entry(&inode->i_dentry,
144 struct dentry, d_alias);
145 dget(dentry);
146 }
147 spin_unlock(&inode->i_lock);
148 if (!dentry)
149			break;
140		inode = dentry->d_parent->d_inode;
150		next = igrab(dentry->d_parent->d_inode);
151 dput(dentry);
152 if (!next)
153 break;
154 iput(inode);
155 inode = next;
156		ret = sync_mapping_buffers(inode->i_mapping);
157		if (ret)
158			break;
@@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode)
163		if (ret)
164			break;
165	}
166 iput(inode);
167	return ret;
168}
169
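The rewritten ext4_sync_parent() walks toward the root hand-over-hand: it pins the parent (igrab()) before releasing the child (iput()), and only touches i_dentry under i_lock with the dentry pinned by dget(). The reference pattern, reduced to a self-contained sketch (generic refcounted nodes, not the VFS API):

	struct node {
		struct node *parent;
		int refcount;
	};

	static struct node *node_get(struct node *n) { n->refcount++; return n; }
	static void node_put(struct node *n) { n->refcount--; }

	/* Pin the next node before unpinning the current one, so an
	 * unreferenced pointer is never dereferenced - the shape of the
	 * igrab()/iput() dance in ext4_sync_parent() above. */
	static void walk_to_root(struct node *start)
	{
		struct node *cur = node_get(start);

		while (cur->parent) {
			struct node *next = node_get(cur->parent);

			node_put(cur);
			cur = next;
			/* ... per-node work (sync_mapping_buffers() etc.) ... */
		}
		node_put(cur);
	}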
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f61e502..9c63f273b550 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1287			       group, used_blks,
1288			       ext4_itable_unused_count(sb, gdp));
1289		ret = 1;
1290		goto out;
1290		goto err_out;
1291	}
1292
1293	blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 000000000000..0962642119c0
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1487 @@
1/*
2 * linux/fs/ext4/indirect.c
3 *
4 * from
5 *
6 * linux/fs/ext4/inode.c
7 *
8 * Copyright (C) 1992, 1993, 1994, 1995
9 * Remy Card (card@masi.ibp.fr)
10 * Laboratoire MASI - Institut Blaise Pascal
11 * Universite Pierre et Marie Curie (Paris VI)
12 *
13 * from
14 *
15 * linux/fs/minix/inode.c
16 *
17 * Copyright (C) 1991, 1992 Linus Torvalds
18 *
19 * Goal-directed block allocation by Stephen Tweedie
20 * (sct@redhat.com), 1993, 1998
21 */
22
23#include <linux/module.h>
24#include "ext4_jbd2.h"
25#include "truncate.h"
26
27#include <trace/events/ext4.h>
28
29typedef struct {
30 __le32 *p;
31 __le32 key;
32 struct buffer_head *bh;
33} Indirect;
34
35static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
36{
37 p->key = *(p->p = v);
38 p->bh = bh;
39}
40
41/**
42 * ext4_block_to_path - parse the block number into array of offsets
43 * @inode: inode in question (we are only interested in its superblock)
44 * @i_block: block number to be parsed
45 * @offsets: array to store the offsets in
46 * @boundary: set this non-zero if the referred-to block is likely to be
47 * followed (on disk) by an indirect block.
48 *
49 * To store the locations of a file's data, ext4 uses a data structure common
50 * to UNIX filesystems - a tree of pointers anchored in the inode, with
51 * data blocks at the leaves and indirect blocks in intermediate nodes.
52 * This function translates the block number into a path in that tree -
53 * the return value is the path length and @offsets[n] is the offset of
54 * the pointer to the (n+1)th node in the nth one. If @block is out of range
55 * (negative or too large), a warning is printed and zero is returned.
56 *
57 * Note: function doesn't find node addresses, so no IO is needed. All
58 * we need to know is the capacity of indirect blocks (taken from the
59 * inode->i_sb).
60 */
61
62/*
63 * Portability note: the last comparison (check that we fit into triple
64 * indirect block) is spelled differently, because otherwise on an
65 * architecture with 32-bit longs and 8Kb pages we might get into trouble
66 * if our filesystem had 8Kb blocks. We might use long long, but that would
67 * kill us on x86. Oh, well, at least the sign propagation does not matter -
68 * i_block would have to be negative in the very beginning, so we would not
69 * get there at all.
70 */
71
72static int ext4_block_to_path(struct inode *inode,
73 ext4_lblk_t i_block,
74 ext4_lblk_t offsets[4], int *boundary)
75{
76 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
77 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
78 const long direct_blocks = EXT4_NDIR_BLOCKS,
79 indirect_blocks = ptrs,
80 double_blocks = (1 << (ptrs_bits * 2));
81 int n = 0;
82 int final = 0;
83
84 if (i_block < direct_blocks) {
85 offsets[n++] = i_block;
86 final = direct_blocks;
87 } else if ((i_block -= direct_blocks) < indirect_blocks) {
88 offsets[n++] = EXT4_IND_BLOCK;
89 offsets[n++] = i_block;
90 final = ptrs;
91 } else if ((i_block -= indirect_blocks) < double_blocks) {
92 offsets[n++] = EXT4_DIND_BLOCK;
93 offsets[n++] = i_block >> ptrs_bits;
94 offsets[n++] = i_block & (ptrs - 1);
95 final = ptrs;
96 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
97 offsets[n++] = EXT4_TIND_BLOCK;
98 offsets[n++] = i_block >> (ptrs_bits * 2);
99 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
100 offsets[n++] = i_block & (ptrs - 1);
101 final = ptrs;
102 } else {
103 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
104 i_block + direct_blocks +
105 indirect_blocks + double_blocks, inode->i_ino);
106 }
107 if (boundary)
108 *boundary = final - 1 - (i_block & (ptrs - 1));
109 return n;
110}
111
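With 4KiB blocks, EXT4_ADDR_PER_BLOCK is 1024 (ptrs_bits = 10), so the function above classifies logical blocks into four bands: 0-11 direct, then 1024 singly-indirect, 1024^2 doubly-indirect and 1024^3 triply-indirect blocks. A worked check of the path computation (user-space re-implementation under those assumptions, first three bands only):

	#include <stdio.h>

	#define NDIR 12		/* EXT4_NDIR_BLOCKS */
	#define PTRS 1024	/* EXT4_ADDR_PER_BLOCK with 4KiB blocks */
	#define PTRS_BITS 10

	/* Mirror of ext4_block_to_path() for the first three bands. */
	static int block_to_path(long i_block, long offsets[4])
	{
		int n = 0;

		if (i_block < NDIR) {
			offsets[n++] = i_block;
		} else if ((i_block -= NDIR) < PTRS) {
			offsets[n++] = 12;		/* EXT4_IND_BLOCK */
			offsets[n++] = i_block;
		} else if ((i_block -= PTRS) < (1L << (PTRS_BITS * 2))) {
			offsets[n++] = 13;		/* EXT4_DIND_BLOCK */
			offsets[n++] = i_block >> PTRS_BITS;
			offsets[n++] = i_block & (PTRS - 1);
		}
		return n;
	}

	int main(void)
	{
		long off[4] = {0};
		int depth = block_to_path(5000, off);

		/* 5000 - 12 - 1024 = 3964 -> slot 3 of the dindirect block,
		 * entry 892 of that indirect block */
		printf("depth %d: %ld %ld %ld\n", depth, off[0], off[1], off[2]);
		return 0;
	}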
112/**
113 * ext4_get_branch - read the chain of indirect blocks leading to data
114 * @inode: inode in question
115 * @depth: depth of the chain (1 - direct pointer, etc.)
116 * @offsets: offsets of pointers in inode/indirect blocks
117 * @chain: place to store the result
118 * @err: here we store the error value
119 *
120 * Function fills the array of triples <key, p, bh> and returns %NULL
121 * if everything went OK or the pointer to the last filled triple
122 * (incomplete one) otherwise. Upon the return chain[i].key contains
123 * the number of (i+1)-th block in the chain (as it is stored in memory,
124 * i.e. little-endian 32-bit), chain[i].p contains the address of that
125 * number (it points into struct inode for i==0 and into the bh->b_data
126 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
127 * block for i>0 and NULL for i==0. In other words, it holds the block
128 * numbers of the chain, addresses they were taken from (and where we can
129 * verify that chain did not change) and buffer_heads hosting these
130 * numbers.
131 *
132 * Function stops when it stumbles upon zero pointer (absent block)
133 * (pointer to last triple returned, *@err == 0)
134 * or when it gets an IO error reading an indirect block
135 * (ditto, *@err == -EIO)
136 * or when it reads all @depth-1 indirect blocks successfully and finds
137 * the whole chain, all way to the data (returns %NULL, *err == 0).
138 *
139 * Need to be called with
140 * down_read(&EXT4_I(inode)->i_data_sem)
141 */
142static Indirect *ext4_get_branch(struct inode *inode, int depth,
143 ext4_lblk_t *offsets,
144 Indirect chain[4], int *err)
145{
146 struct super_block *sb = inode->i_sb;
147 Indirect *p = chain;
148 struct buffer_head *bh;
149
150 *err = 0;
151 /* i_data is not going away, no lock needed */
152 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
153 if (!p->key)
154 goto no_block;
155 while (--depth) {
156 bh = sb_getblk(sb, le32_to_cpu(p->key));
157 if (unlikely(!bh))
158 goto failure;
159
160 if (!bh_uptodate_or_lock(bh)) {
161 if (bh_submit_read(bh) < 0) {
162 put_bh(bh);
163 goto failure;
164 }
165 /* validate block references */
166 if (ext4_check_indirect_blockref(inode, bh)) {
167 put_bh(bh);
168 goto failure;
169 }
170 }
171
172 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
173 /* Reader: end */
174 if (!p->key)
175 goto no_block;
176 }
177 return NULL;
178
179failure:
180 *err = -EIO;
181no_block:
182 return p;
183}
184
185/**
186 * ext4_find_near - find a place for allocation with sufficient locality
187 * @inode: owner
188 * @ind: descriptor of indirect block.
189 *
190 * This function returns the preferred place for block allocation.
191 * It is used when heuristic for sequential allocation fails.
192 * Rules are:
193 * + if there is a block to the left of our position - allocate near it.
194 * + if pointer will live in indirect block - allocate near that block.
195 * + if pointer will live in inode - allocate in the same
196 * cylinder group.
197 *
198 * In the latter case we colour the starting block by the callers PID to
199 * prevent it from clashing with concurrent allocations for a different inode
200 * in the same block group. The PID is used here so that functionally related
201 * files will be close-by on-disk.
202 *
203 * Caller must make sure that @ind is valid and will stay that way.
204 */
205static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
206{
207 struct ext4_inode_info *ei = EXT4_I(inode);
208 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
209 __le32 *p;
210
211 /* Try to find previous block */
212 for (p = ind->p - 1; p >= start; p--) {
213 if (*p)
214 return le32_to_cpu(*p);
215 }
216
217 /* No such thing, so let's try location of indirect block */
218 if (ind->bh)
219 return ind->bh->b_blocknr;
220
221 /*
222 * It is going to be referred to from the inode itself? OK, just put it
223 * into the same cylinder group then.
224 */
225 return ext4_inode_to_goal_block(inode);
226}
227
228/**
229 * ext4_find_goal - find a preferred place for allocation.
230 * @inode: owner
231 * @block: block we want
232 * @partial: pointer to the last triple within a chain
233 *
234 * Normally this function finds the preferred place for block allocation
235 * and returns it.
236 * Because this is only used for non-extent files, we limit the block nr
237 * to 32 bits.
238 */
239static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
240 Indirect *partial)
241{
242 ext4_fsblk_t goal;
243
244 /*
245 * XXX need to get goal block from mballoc's data structures
246 */
247
248 goal = ext4_find_near(inode, partial);
249 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
250 return goal;
251}
252
253/**
254 * ext4_blks_to_allocate - Look up the block map and count the number
255 * of direct blocks that need to be allocated for the given branch.
256 *
257 * @branch: chain of indirect blocks
258 * @k: number of blocks needed for indirect blocks
259 * @blks: number of data blocks to be mapped.
260 * @blocks_to_boundary: the offset in the indirect block
261 *
262 * Return the total number of blocks to be allocated, including the
263 * direct and indirect blocks.
264 */
265static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
266 int blocks_to_boundary)
267{
268 unsigned int count = 0;
269
270 /*
271	 * Simple case: the [t,d]Indirect block(s) have not been allocated yet,
272	 * so it is clear that the blocks on that path have not been allocated either.
273 */
274 if (k > 0) {
275 /* right now we don't handle cross boundary allocation */
276 if (blks < blocks_to_boundary + 1)
277 count += blks;
278 else
279 count += blocks_to_boundary + 1;
280 return count;
281 }
282
283 count++;
284 while (count < blks && count <= blocks_to_boundary &&
285 le32_to_cpu(*(branch[0].p + count)) == 0) {
286 count++;
287 }
288 return count;
289}
290
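Two cases fall out of the function above: if any indirect block on the path is missing (k > 0), the whole branch is new, so the count is bounded only by the request and the boundary; with k == 0 it scans the existing pointer array and stops at the first slot that is already mapped. The k == 0 scan on its own (user-space sketch, host-endian instead of __le32):

	/* Count leading holes in an existing pointer array, capped by the
	 * request size and by the indirect-block boundary. */
	static unsigned int blks_to_allocate(const unsigned int *map,
					     unsigned int blks,
					     int blocks_to_boundary)
	{
		unsigned int count = 1;	/* the first direct block is always needed */

		while (count < blks && (int)count <= blocks_to_boundary &&
		       map[count] == 0)
			count++;
		return count;
	}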
291/**
292 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
293 * @handle: handle for this transaction
294 * @inode: inode which needs allocated blocks
295 * @iblock: the logical block to start allocated at
296 * @goal: preferred physical block of allocation
297 *	@indirect_blks: the number of blocks that need to be allocated for
298 *	indirect blocks
299 * @blks: number of desired blocks
300 * @new_blocks: on return it will store the new block numbers for
301 * the indirect blocks(if needed) and the first direct block,
302 * @err: on return it will store the error code
303 *
304 * This function will return the number of blocks allocated as
305 * requested by the passed-in parameters.
306 */
307static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
308 ext4_lblk_t iblock, ext4_fsblk_t goal,
309 int indirect_blks, int blks,
310 ext4_fsblk_t new_blocks[4], int *err)
311{
312 struct ext4_allocation_request ar;
313 int target, i;
314 unsigned long count = 0, blk_allocated = 0;
315 int index = 0;
316 ext4_fsblk_t current_block = 0;
317 int ret = 0;
318
319 /*
320 * Here we try to allocate the requested multiple blocks at once,
321 * on a best-effort basis.
322 * To build a branch, we should allocate blocks for
323	 * the indirect blocks (if not allocated yet) and at least
324	 * the first direct block of this branch. That's the
325	 * minimum number of blocks we need to allocate (required).
326 */
327 /* first we try to allocate the indirect blocks */
328 target = indirect_blks;
329 while (target > 0) {
330 count = target;
331 /* allocating blocks for indirect blocks and direct blocks */
332 current_block = ext4_new_meta_blocks(handle, inode, goal,
333 0, &count, err);
334 if (*err)
335 goto failed_out;
336
337 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
338 EXT4_ERROR_INODE(inode,
339 "current_block %llu + count %lu > %d!",
340 current_block, count,
341 EXT4_MAX_BLOCK_FILE_PHYS);
342 *err = -EIO;
343 goto failed_out;
344 }
345
346 target -= count;
347 /* allocate blocks for indirect blocks */
348 while (index < indirect_blks && count) {
349 new_blocks[index++] = current_block++;
350 count--;
351 }
352 if (count > 0) {
353 /*
354 * save the new block number
355 * for the first direct block
356 */
357 new_blocks[index] = current_block;
358 printk(KERN_INFO "%s returned more blocks than "
359 "requested\n", __func__);
360 WARN_ON(1);
361 break;
362 }
363 }
364
365	target = blks - count;
366 blk_allocated = count;
367 if (!target)
368 goto allocated;
369 /* Now allocate data blocks */
370 memset(&ar, 0, sizeof(ar));
371 ar.inode = inode;
372 ar.goal = goal;
373 ar.len = target;
374 ar.logical = iblock;
375 if (S_ISREG(inode->i_mode))
376 /* enable in-core preallocation only for regular files */
377 ar.flags = EXT4_MB_HINT_DATA;
378
379 current_block = ext4_mb_new_blocks(handle, &ar, err);
380 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
381 EXT4_ERROR_INODE(inode,
382 "current_block %llu + ar.len %d > %d!",
383 current_block, ar.len,
384 EXT4_MAX_BLOCK_FILE_PHYS);
385 *err = -EIO;
386 goto failed_out;
387 }
388
389 if (*err && (target == blks)) {
390 /*
391 * if the allocation failed and we didn't allocate
392 * any blocks before
393 */
394 goto failed_out;
395 }
396 if (!*err) {
397 if (target == blks) {
398 /*
399 * save the new block number
400 * for the first direct block
401 */
402 new_blocks[index] = current_block;
403 }
404 blk_allocated += ar.len;
405 }
406allocated:
407 /* total number of blocks allocated for direct blocks */
408 ret = blk_allocated;
409 *err = 0;
410 return ret;
411failed_out:
412 for (i = 0; i < index; i++)
413 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
414 return ret;
415}
416
417/**
418 * ext4_alloc_branch - allocate and set up a chain of blocks.
419 * @handle: handle for this transaction
420 * @inode: owner
421 * @indirect_blks: number of allocated indirect blocks
422 * @blks: number of allocated direct blocks
423 * @goal: preferred place for allocation
424 * @offsets: offsets (in the blocks) to store the pointers to next.
425 * @branch: place to store the chain in.
426 *
427 * This function allocates blocks, zeroes out all but the last one,
428 * links them into chain and (if we are synchronous) writes them to disk.
429 * In other words, it prepares a branch that can be spliced onto the
430 * inode. It stores the information about that chain in the branch[], in
431 * the same format as ext4_get_branch() would do. We are calling it after
432 * we had read the existing part of chain and partial points to the last
433 * triple of that (one with zero ->key). Upon the exit we have the same
434 * picture as after the successful ext4_get_block(), except that in one
435 * place chain is disconnected - *branch->p is still zero (we did not
436 * set the last link), but branch->key contains the number that should
437 * be placed into *branch->p to fill that gap.
438 *
439 * If allocation fails we free all blocks we've allocated (and forget
440 * their buffer_heads) and return the error value from the failed
441 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
442 * as described above and return 0.
443 */
444static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
445 ext4_lblk_t iblock, int indirect_blks,
446 int *blks, ext4_fsblk_t goal,
447 ext4_lblk_t *offsets, Indirect *branch)
448{
449 int blocksize = inode->i_sb->s_blocksize;
450 int i, n = 0;
451 int err = 0;
452 struct buffer_head *bh;
453 int num;
454 ext4_fsblk_t new_blocks[4];
455 ext4_fsblk_t current_block;
456
457 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
458 *blks, new_blocks, &err);
459 if (err)
460 return err;
461
462 branch[0].key = cpu_to_le32(new_blocks[0]);
463 /*
464 * metadata blocks and data blocks are allocated.
465 */
466 for (n = 1; n <= indirect_blks; n++) {
467 /*
468 * Get buffer_head for parent block, zero it out
469 * and set the pointer to new one, then send
470 * parent to disk.
471 */
472 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
473 if (unlikely(!bh)) {
474 err = -EIO;
475 goto failed;
476 }
477
478 branch[n].bh = bh;
479 lock_buffer(bh);
480 BUFFER_TRACE(bh, "call get_create_access");
481 err = ext4_journal_get_create_access(handle, bh);
482 if (err) {
483 /* Don't brelse(bh) here; it's done in
484 * ext4_journal_forget() below */
485 unlock_buffer(bh);
486 goto failed;
487 }
488
489 memset(bh->b_data, 0, blocksize);
490 branch[n].p = (__le32 *) bh->b_data + offsets[n];
491 branch[n].key = cpu_to_le32(new_blocks[n]);
492 *branch[n].p = branch[n].key;
493 if (n == indirect_blks) {
494 current_block = new_blocks[n];
495 /*
496 * End of chain, update the last new metablock of
497 * the chain to point to the new allocated
498 * data blocks numbers
499 */
500 for (i = 1; i < num; i++)
501 *(branch[n].p + i) = cpu_to_le32(++current_block);
502 }
503 BUFFER_TRACE(bh, "marking uptodate");
504 set_buffer_uptodate(bh);
505 unlock_buffer(bh);
506
507 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
508 err = ext4_handle_dirty_metadata(handle, inode, bh);
509 if (err)
510 goto failed;
511 }
512 *blks = num;
513 return err;
514failed:
515 /* Allocation failed, free what we already allocated */
516 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
517 for (i = 1; i <= n ; i++) {
518 /*
519 * branch[i].bh is newly allocated, so there is no
520 * need to revoke the block, which is why we don't
521 * need to set EXT4_FREE_BLOCKS_METADATA.
522 */
523 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
524 EXT4_FREE_BLOCKS_FORGET);
525 }
526 for (i = n+1; i < indirect_blks; i++)
527 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
528
529 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
530
531 return err;
532}
533
534/**
535 * ext4_splice_branch - splice the allocated branch onto inode.
536 * @handle: handle for this transaction
537 * @inode: owner
538 * @block: (logical) number of block we are adding
539 * @chain: chain of indirect blocks (with a missing link - see
540 * ext4_alloc_branch)
541 * @where: location of missing link
542 * @num: number of indirect blocks we are adding
543 * @blks: number of direct blocks we are adding
544 *
545 * This function fills the missing link and does all housekeeping needed in
546 * inode (->i_blocks, etc.). In case of success we end up with the full
547 * chain to new block and return 0.
548 */
549static int ext4_splice_branch(handle_t *handle, struct inode *inode,
550 ext4_lblk_t block, Indirect *where, int num,
551 int blks)
552{
553 int i;
554 int err = 0;
555 ext4_fsblk_t current_block;
556
557 /*
558 * If we're splicing into a [td]indirect block (as opposed to the
559 * inode) then we need to get write access to the [td]indirect block
560 * before the splice.
561 */
562 if (where->bh) {
563 BUFFER_TRACE(where->bh, "get_write_access");
564 err = ext4_journal_get_write_access(handle, where->bh);
565 if (err)
566 goto err_out;
567 }
568 /* That's it */
569
570 *where->p = where->key;
571
572 /*
573	 * Update the host buffer_head or inode to point to the just-allocated
574	 * direct blocks
575 */
576 if (num == 0 && blks > 1) {
577 current_block = le32_to_cpu(where->key) + 1;
578 for (i = 1; i < blks; i++)
579 *(where->p + i) = cpu_to_le32(current_block++);
580 }
581
582 /* We are done with atomic stuff, now do the rest of housekeeping */
583 /* had we spliced it onto indirect block? */
584 if (where->bh) {
585 /*
586 * If we spliced it onto an indirect block, we haven't
587 * altered the inode. Note however that if it is being spliced
588 * onto an indirect block at the very end of the file (the
589 * file is growing) then we *will* alter the inode to reflect
590 * the new i_size. But that is not done here - it is done in
591 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
592 */
593 jbd_debug(5, "splicing indirect only\n");
594 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
595 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
596 if (err)
597 goto err_out;
598 } else {
599 /*
600 * OK, we spliced it into the inode itself on a direct block.
601 */
602 ext4_mark_inode_dirty(handle, inode);
603 jbd_debug(5, "splicing direct\n");
604 }
605 return err;
606
607err_out:
608 for (i = 1; i <= num; i++) {
609 /*
610 * branch[i].bh is newly allocated, so there is no
611 * need to revoke the block, which is why we don't
612 * need to set EXT4_FREE_BLOCKS_METADATA.
613 */
614 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
615 EXT4_FREE_BLOCKS_FORGET);
616 }
617 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
618 blks, 0);
619
620 return err;
621}
622
623/*
624 * The ext4_ind_map_blocks() function handles non-extents inodes
625 * (i.e., using the traditional indirect/double-indirect i_blocks
626 * scheme) for ext4_map_blocks().
627 *
628 * Allocation strategy is simple: if we have to allocate something, we will
629 * have to go the whole way to leaf. So let's do it before attaching anything
630 * to tree, set linkage between the newborn blocks, write them if sync is
631 * required, recheck the path, free and repeat if check fails, otherwise
632 * set the last missing link (that will protect us from any truncate-generated
633 * removals - all blocks on the path are immune now) and possibly force the
634 * write on the parent block.
635 * That has a nice additional property: no special recovery from the failed
636 * allocations is needed - we simply release blocks and do not touch anything
637 * reachable from inode.
638 *
639 * `handle' can be NULL if create == 0.
640 *
641 * return > 0, # of blocks mapped or allocated.
642 * return = 0, if plain lookup failed.
643 * return < 0, error case.
644 *
645 * The ext4_ind_map_blocks() function should be called with
646 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
647 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
648 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
649 * blocks.
650 */
651int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
652 struct ext4_map_blocks *map,
653 int flags)
654{
655 int err = -EIO;
656 ext4_lblk_t offsets[4];
657 Indirect chain[4];
658 Indirect *partial;
659 ext4_fsblk_t goal;
660 int indirect_blks;
661 int blocks_to_boundary = 0;
662 int depth;
663 int count = 0;
664 ext4_fsblk_t first_block = 0;
665
666 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
667 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
668 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
669 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
670 &blocks_to_boundary);
671
672 if (depth == 0)
673 goto out;
674
675 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
676
677 /* Simplest case - block found, no allocation needed */
678 if (!partial) {
679 first_block = le32_to_cpu(chain[depth - 1].key);
680 count++;
681 /*map more blocks*/
682 while (count < map->m_len && count <= blocks_to_boundary) {
683 ext4_fsblk_t blk;
684
685 blk = le32_to_cpu(*(chain[depth-1].p + count));
686
687 if (blk == first_block + count)
688 count++;
689 else
690 break;
691 }
692 goto got_it;
693 }
694
695 /* Next simple case - plain lookup or failed read of indirect block */
696 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
697 goto cleanup;
698
699 /*
700 * Okay, we need to do block allocation.
701 */
702 goal = ext4_find_goal(inode, map->m_lblk, partial);
703
704 /* the number of blocks need to allocate for [d,t]indirect blocks */
705 indirect_blks = (chain + depth) - partial - 1;
706
707 /*
708	 * Next look up the indirect map to count the total number of
709 * direct blocks to allocate for this branch.
710 */
711 count = ext4_blks_to_allocate(partial, indirect_blks,
712 map->m_len, blocks_to_boundary);
713 /*
714 * Block out ext4_truncate while we alter the tree
715 */
716 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
717 &count, goal,
718 offsets + (partial - chain), partial);
719
720 /*
721 * The ext4_splice_branch call will free and forget any buffers
722 * on the new chain if there is a failure, but that risks using
723 * up transaction credits, especially for bitmaps where the
724 * credits cannot be returned. Can we handle this somehow? We
725 * may need to return -EAGAIN upwards in the worst case. --sct
726 */
727 if (!err)
728 err = ext4_splice_branch(handle, inode, map->m_lblk,
729 partial, indirect_blks, count);
730 if (err)
731 goto cleanup;
732
733 map->m_flags |= EXT4_MAP_NEW;
734
735 ext4_update_inode_fsync_trans(handle, inode, 1);
736got_it:
737 map->m_flags |= EXT4_MAP_MAPPED;
738 map->m_pblk = le32_to_cpu(chain[depth-1].key);
739 map->m_len = count;
740 if (count > blocks_to_boundary)
741 map->m_flags |= EXT4_MAP_BOUNDARY;
742 err = count;
743 /* Clean up and exit */
744 partial = chain + depth - 1; /* the whole chain */
745cleanup:
746 while (partial > chain) {
747 BUFFER_TRACE(partial->bh, "call brelse");
748 brelse(partial->bh);
749 partial--;
750 }
751out:
752 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
753 map->m_pblk, map->m_len, err);
754 return err;
755}
756
757/*
758 * O_DIRECT for ext3 (or indirect map) based files
759 *
760 * If the O_DIRECT write will extend the file then add this inode to the
761 * orphan list. So recovery will truncate it back to the original size
762 * if the machine crashes during the write.
763 *
764 * If the O_DIRECT write is instantiating holes inside i_size and the machine
765 * crashes then stale disk data _may_ be exposed inside the file. But current
766 * VFS code falls back into buffered path in that case so we are safe.
767 */
768ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
769 const struct iovec *iov, loff_t offset,
770 unsigned long nr_segs)
771{
772 struct file *file = iocb->ki_filp;
773 struct inode *inode = file->f_mapping->host;
774 struct ext4_inode_info *ei = EXT4_I(inode);
775 handle_t *handle;
776 ssize_t ret;
777 int orphan = 0;
778 size_t count = iov_length(iov, nr_segs);
779 int retries = 0;
780
781 if (rw == WRITE) {
782 loff_t final_size = offset + count;
783
784 if (final_size > inode->i_size) {
785 /* Credits for sb + inode write */
786 handle = ext4_journal_start(inode, 2);
787 if (IS_ERR(handle)) {
788 ret = PTR_ERR(handle);
789 goto out;
790 }
791 ret = ext4_orphan_add(handle, inode);
792 if (ret) {
793 ext4_journal_stop(handle);
794 goto out;
795 }
796 orphan = 1;
797 ei->i_disksize = inode->i_size;
798 ext4_journal_stop(handle);
799 }
800 }
801
802retry:
803 if (rw == READ && ext4_should_dioread_nolock(inode)) {
804 if (unlikely(!list_empty(&ei->i_completed_io_list))) {
805 mutex_lock(&inode->i_mutex);
806 ext4_flush_completed_IO(inode);
807 mutex_unlock(&inode->i_mutex);
808 }
809 ret = __blockdev_direct_IO(rw, iocb, inode,
810 inode->i_sb->s_bdev, iov,
811 offset, nr_segs,
812 ext4_get_block, NULL, NULL, 0);
813 } else {
814 ret = blockdev_direct_IO(rw, iocb, inode, iov,
815 offset, nr_segs, ext4_get_block);
816
817 if (unlikely((rw & WRITE) && ret < 0)) {
818 loff_t isize = i_size_read(inode);
819 loff_t end = offset + iov_length(iov, nr_segs);
820
821 if (end > isize)
822 ext4_truncate_failed_write(inode);
823 }
824 }
825 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
826 goto retry;
827
828 if (orphan) {
829 int err;
830
831 /* Credits for sb + inode write */
832 handle = ext4_journal_start(inode, 2);
833 if (IS_ERR(handle)) {
834 /* This is really bad luck. We've written the data
835 * but cannot extend i_size. Bail out and pretend
836 * the write failed... */
837 ret = PTR_ERR(handle);
838 if (inode->i_nlink)
839 ext4_orphan_del(NULL, inode);
840
841 goto out;
842 }
843 if (inode->i_nlink)
844 ext4_orphan_del(handle, inode);
845 if (ret > 0) {
846 loff_t end = offset + ret;
847 if (end > inode->i_size) {
848 ei->i_disksize = end;
849 i_size_write(inode, end);
850 /*
851 * We're going to return a positive `ret'
852 * here due to non-zero-length I/O, so there's
853 * no way of reporting error returns from
854 * ext4_mark_inode_dirty() to userspace. So
855 * ignore it.
856 */
857 ext4_mark_inode_dirty(handle, inode);
858 }
859 }
860 err = ext4_journal_stop(handle);
861 if (ret == 0)
862 ret = err;
863 }
864out:
865 return ret;
866}
867
868/*
869 * Calculate the number of metadata blocks we need to reserve
870 * to allocate a new block at @lblock for a non-extent-based file
871 */
872int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
873{
874 struct ext4_inode_info *ei = EXT4_I(inode);
875 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
876 int blk_bits;
877
878 if (lblock < EXT4_NDIR_BLOCKS)
879 return 0;
880
881 lblock -= EXT4_NDIR_BLOCKS;
882
883 if (ei->i_da_metadata_calc_len &&
884 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
885 ei->i_da_metadata_calc_len++;
886 return 0;
887 }
888 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
889 ei->i_da_metadata_calc_len = 1;
890 blk_bits = order_base_2(lblock);
891 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
892}
893
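The caching above works at indirect-block granularity: dind_mask clears the low ptrs_bits bits, so every lblock that lands in the same indirect block reuses the previous answer and reserves nothing extra. A simplified user-space rendering (1024 pointers per 4KiB block assumed; the i_da_metadata_calc_len bookkeeping is omitted):

	#include <stdio.h>

	#define NDIR 12			/* EXT4_NDIR_BLOCKS */
	#define PTRS_BITS 10		/* 1024 pointers per 4KiB block */

	static long last_chunk = -1;	/* i_da_metadata_calc_last_lblock */

	static int calc_metadata(long lblock)
	{
		int bits = 0;

		if (lblock < NDIR)
			return 0;
		lblock -= NDIR;
		if ((lblock & ~((1L << PTRS_BITS) - 1)) == last_chunk)
			return 0;		/* same indirect block as before */
		last_chunk = lblock & ~((1L << PTRS_BITS) - 1);
		while ((1L << bits) < lblock)	/* order_base_2() */
			bits++;
		return bits / PTRS_BITS + 1;
	}

	int main(void)
	{
		printf("%d\n", calc_metadata(5000));	/* 2: indirect + dindirect */
		printf("%d\n", calc_metadata(5001));	/* 0: same chunk */
		return 0;
	}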
894int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
895{
896 int indirects;
897
898 /* if nrblocks are contiguous */
899 if (chunk) {
900 /*
901 * With N contiguous data blocks, we need at most
902 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
903 * 2 dindirect blocks, and 1 tindirect block
904 */
905 return DIV_ROUND_UP(nrblocks,
906 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
907 }
908 /*
909	 * if nrblocks are not contiguous, worst case, each block touches
910	 * an indirect block, and each indirect block touches a double indirect
911	 * block, plus a triple indirect block
912 */
913 indirects = nrblocks * 2 + 1;
914 return indirects;
915}
916
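Spelled out, the contiguous case above charges one indirect block per EXT4_ADDR_PER_BLOCK data blocks plus a fixed 4 (one extra indirect, two double-indirect, one triple-indirect); the discontiguous case pessimistically charges two blocks per data block plus one. For example, with 1024 pointers per block (hypothetical figures):

	#include <stdio.h>

	#define ADDR_PER_BLOCK 1024	/* 4KiB blocks */

	static int ind_trans_blocks(int nrblocks, int chunk)
	{
		if (chunk)	/* contiguous run */
			return (nrblocks + ADDR_PER_BLOCK - 1) / ADDR_PER_BLOCK + 4;
		return nrblocks * 2 + 1;	/* worst case: all discontiguous */
	}

	int main(void)
	{
		printf("%d\n", ind_trans_blocks(2048, 1));	/* 2 + 4 = 6 */
		printf("%d\n", ind_trans_blocks(8, 0));		/* 17 */
		return 0;
	}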
917/*
918 * Truncate transactions can be complex and absolutely huge. So we need to
919 * be able to restart the transaction at a convenient checkpoint to make
920 * sure we don't overflow the journal.
921 *
922 * start_transaction gets us a new handle for a truncate transaction,
923 * and extend_transaction tries to extend the existing one a bit. If
924 * extend fails, we need to propagate the failure up and restart the
925 * transaction in the top-level truncate loop. --sct
926 */
927static handle_t *start_transaction(struct inode *inode)
928{
929 handle_t *result;
930
931 result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
932 if (!IS_ERR(result))
933 return result;
934
935 ext4_std_error(inode->i_sb, PTR_ERR(result));
936 return result;
937}
938
939/*
940 * Try to extend this transaction for the purposes of truncation.
941 *
942 * Returns 0 if we managed to create more room. If we can't create more
943 * room, and the transaction must be restarted we return 1.
944 */
945static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
946{
947 if (!ext4_handle_valid(handle))
948 return 0;
949 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
950 return 0;
951 if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
952 return 0;
953 return 1;
954}
955
956/*
957 * Probably it should be a library function... search for first non-zero word
958 * or memcmp with zero_page, whatever is better for particular architecture.
959 * Linus?
960 */
961static inline int all_zeroes(__le32 *p, __le32 *q)
962{
963 while (p < q)
964 if (*p++)
965 return 0;
966 return 1;
967}
968
969/**
970 * ext4_find_shared - find the indirect blocks for partial truncation.
971 * @inode: inode in question
972 * @depth: depth of the affected branch
973 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
974 * @chain: place to store the pointers to partial indirect blocks
975 * @top: place to the (detached) top of branch
976 *
977 * This is a helper function used by ext4_truncate().
978 *
979 * When we do truncate() we may have to clean the ends of several
980 * indirect blocks but leave the blocks themselves alive. Block is
981 * partially truncated if some data below the new i_size is referred
982 * from it (and it is on the path to the first completely truncated
983 * data block, indeed). We have to free the top of that path along
984 * with everything to the right of the path. Since no allocation
985 * past the truncation point is possible until ext4_truncate()
986 * finishes, we may safely do the latter, but top of branch may
987 * require special attention - pageout below the truncation point
988 * might try to populate it.
989 *
990 * We atomically detach the top of branch from the tree, store the
991 * block number of its root in *@top, pointers to buffer_heads of
992 * partially truncated blocks - in @chain[].bh and pointers to
993 * their last elements that should not be removed - in
994 * @chain[].p. Return value is the pointer to last filled element
995 * of @chain.
996 *
997 *	The work left to the caller is the actual freeing of the subtrees:
998 * a) free the subtree starting from *@top
999 * b) free the subtrees whose roots are stored in
1000 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1001 * c) free the subtrees growing from the inode past the @chain[0].
1002 * (no partially truncated stuff there). */
1003
1004static Indirect *ext4_find_shared(struct inode *inode, int depth,
1005 ext4_lblk_t offsets[4], Indirect chain[4],
1006 __le32 *top)
1007{
1008 Indirect *partial, *p;
1009 int k, err;
1010
1011 *top = 0;
1012 /* Make k index the deepest non-null offset + 1 */
1013 for (k = depth; k > 1 && !offsets[k-1]; k--)
1014 ;
1015 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1016 /* Writer: pointers */
1017 if (!partial)
1018 partial = chain + k-1;
1019 /*
1020 * If the branch acquired continuation since we've looked at it -
1021 * fine, it should all survive and (new) top doesn't belong to us.
1022 */
1023 if (!partial->key && *partial->p)
1024 /* Writer: end */
1025 goto no_top;
1026 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
1027 ;
1028 /*
1029 * OK, we've found the last block that must survive. The rest of our
1030 * branch should be detached before unlocking. However, if that rest
1031 * of branch is all ours and does not grow immediately from the inode
1032 * it's easier to cheat and just decrement partial->p.
1033 */
1034 if (p == chain + k - 1 && p > chain) {
1035 p->p--;
1036 } else {
1037 *top = *p->p;
1038 /* Nope, don't do this in ext4. Must leave the tree intact */
1039#if 0
1040 *p->p = 0;
1041#endif
1042 }
1043 /* Writer: end */
1044
1045 while (partial > p) {
1046 brelse(partial->bh);
1047 partial--;
1048 }
1049no_top:
1050 return partial;
1051}
1052
1053/*
1054 * Zero a number of block pointers in either an inode or an indirect block.
1055 * If we restart the transaction we must again get write access to the
1056 * indirect block for further modification.
1057 *
1058 * We release `count' blocks on disk, but (last - first) may be greater
1059 * than `count' because there can be holes in there.
1060 *
1061 * Return 0 on success, 1 on invalid block range
1062 * and < 0 on fatal error.
1063 */
1064static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
1065 struct buffer_head *bh,
1066 ext4_fsblk_t block_to_free,
1067 unsigned long count, __le32 *first,
1068 __le32 *last)
1069{
1070 __le32 *p;
1071 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
1072 int err;
1073
1074 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1075 flags |= EXT4_FREE_BLOCKS_METADATA;
1076
1077 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
1078 count)) {
1079 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
1080 "blocks %llu len %lu",
1081 (unsigned long long) block_to_free, count);
1082 return 1;
1083 }
1084
1085 if (try_to_extend_transaction(handle, inode)) {
1086 if (bh) {
1087 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1088 err = ext4_handle_dirty_metadata(handle, inode, bh);
1089 if (unlikely(err))
1090 goto out_err;
1091 }
1092 err = ext4_mark_inode_dirty(handle, inode);
1093 if (unlikely(err))
1094 goto out_err;
1095 err = ext4_truncate_restart_trans(handle, inode,
1096 ext4_blocks_for_truncate(inode));
1097 if (unlikely(err))
1098 goto out_err;
1099 if (bh) {
1100 BUFFER_TRACE(bh, "retaking write access");
1101 err = ext4_journal_get_write_access(handle, bh);
1102 if (unlikely(err))
1103 goto out_err;
1104 }
1105 }
1106
1107 for (p = first; p < last; p++)
1108 *p = 0;
1109
1110 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
1111 return 0;
1112out_err:
1113 ext4_std_error(inode->i_sb, err);
1114 return err;
1115}
1116
1117/**
1118 * ext4_free_data - free a list of data blocks
1119 * @handle: handle for this transaction
1120 * @inode: inode we are dealing with
1121 * @this_bh: indirect buffer_head which contains *@first and *@last
1122 * @first: array of block numbers
1123 * @last: points immediately past the end of array
1124 *
1125 * We are freeing all blocks referred from that array (numbers are stored as
1126 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1127 *
1128 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1129 * blocks are contiguous then releasing them at one time will only affect one
1130 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1131 * actually use a lot of journal space.
1132 *
1133 * @this_bh will be %NULL if @first and @last point into the inode's direct
1134 * block pointers.
1135 */
1136static void ext4_free_data(handle_t *handle, struct inode *inode,
1137 struct buffer_head *this_bh,
1138 __le32 *first, __le32 *last)
1139{
1140 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
1141 unsigned long count = 0; /* Number of blocks in the run */
1142 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
1143 corresponding to
1144 block_to_free */
1145 ext4_fsblk_t nr; /* Current block # */
1146 __le32 *p; /* Pointer into inode/ind
1147 for current block */
1148 int err = 0;
1149
1150 if (this_bh) { /* For indirect block */
1151 BUFFER_TRACE(this_bh, "get_write_access");
1152 err = ext4_journal_get_write_access(handle, this_bh);
1153 /* Important: if we can't update the indirect pointers
1154 * to the blocks, we can't free them. */
1155 if (err)
1156 return;
1157 }
1158
1159 for (p = first; p < last; p++) {
1160 nr = le32_to_cpu(*p);
1161 if (nr) {
1162 /* accumulate blocks to free if they're contiguous */
1163 if (count == 0) {
1164 block_to_free = nr;
1165 block_to_free_p = p;
1166 count = 1;
1167 } else if (nr == block_to_free + count) {
1168 count++;
1169 } else {
1170 err = ext4_clear_blocks(handle, inode, this_bh,
1171 block_to_free, count,
1172 block_to_free_p, p);
1173 if (err)
1174 break;
1175 block_to_free = nr;
1176 block_to_free_p = p;
1177 count = 1;
1178 }
1179 }
1180 }
1181
1182 if (!err && count > 0)
1183 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
1184 count, block_to_free_p, p);
1185 if (err < 0)
1186 /* fatal error */
1187 return;
1188
1189 if (this_bh) {
1190 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
1191
1192 /*
1193 * The buffer head should have an attached journal head at this
1194 * point. However, if the data is corrupted and an indirect
1195 * block pointed to itself, it would have been detached when
1196 * the block was cleared. Check for this instead of OOPSing.
1197 */
1198 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
1199 ext4_handle_dirty_metadata(handle, inode, this_bh);
1200 else
1201 EXT4_ERROR_INODE(inode,
1202 "circular indirect block detected at "
1203 "block %llu",
1204 (unsigned long long) this_bh->b_blocknr);
1205 }
1206}
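
The accumulation above is plain run-length coalescing over the pointer array. The same scan in standalone form, with a hypothetical emit() in place of ext4_clear_blocks() and the le32 conversion elided:

#include <stdio.h>
#include <stdint.h>

static void emit(uint32_t start, unsigned long count)
{
        printf("free %lu block(s) starting at %u\n", count, start);
}

/* Coalesce contiguous block numbers into runs; 0 entries are holes. */
static void free_runs(const uint32_t *first, const uint32_t *last)
{
        uint32_t block_to_free = 0;
        unsigned long count = 0;

        for (const uint32_t *p = first; p < last; p++) {
                if (!*p)
                        continue;                       /* a hole */
                if (count && *p == block_to_free + count) {
                        count++;                        /* run continues */
                        continue;
                }
                if (count)
                        emit(block_to_free, count);     /* flush prior run */
                block_to_free = *p;
                count = 1;
        }
        if (count)
                emit(block_to_free, count);
}

int main(void)
{
        uint32_t map[] = { 100, 101, 102, 0, 200, 201, 50 };

        free_runs(map, map + sizeof(map) / sizeof(map[0]));
        return 0;
}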
1207
1208/**
1209 * ext4_free_branches - free an array of branches
1210 * @handle: JBD handle for this transaction
1211 * @inode: inode we are dealing with
1212 * @parent_bh: the buffer_head which contains *@first and *@last
1213 * @first: array of block numbers
1214 * @last: pointer immediately past the end of array
1215 * @depth: depth of the branches to free
1216 *
1217 * We are freeing all blocks referred from these branches (numbers are
1218 * stored as little-endian 32-bit) and updating @inode->i_blocks
1219 * appropriately.
1220 */
1221static void ext4_free_branches(handle_t *handle, struct inode *inode,
1222 struct buffer_head *parent_bh,
1223 __le32 *first, __le32 *last, int depth)
1224{
1225 ext4_fsblk_t nr;
1226 __le32 *p;
1227
1228 if (ext4_handle_is_aborted(handle))
1229 return;
1230
1231 if (depth--) {
1232 struct buffer_head *bh;
1233 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1234 p = last;
1235 while (--p >= first) {
1236 nr = le32_to_cpu(*p);
1237 if (!nr)
1238 continue; /* A hole */
1239
1240 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
1241 nr, 1)) {
1242 EXT4_ERROR_INODE(inode,
1243 "invalid indirect mapped "
1244 "block %lu (level %d)",
1245 (unsigned long) nr, depth);
1246 break;
1247 }
1248
1249 /* Go read the buffer for the next level down */
1250 bh = sb_bread(inode->i_sb, nr);
1251
1252 /*
1253 * A read failure? Report error and clear slot
1254 * (should be rare).
1255 */
1256 if (!bh) {
1257 EXT4_ERROR_INODE_BLOCK(inode, nr,
1258 "Read failure");
1259 continue;
1260 }
1261
1262 /* This zaps the entire block. Bottom up. */
1263 BUFFER_TRACE(bh, "free child branches");
1264 ext4_free_branches(handle, inode, bh,
1265 (__le32 *) bh->b_data,
1266 (__le32 *) bh->b_data + addr_per_block,
1267 depth);
1268 brelse(bh);
1269
1270 /*
1271 * Everything below this pointer has been
1272 * released. Now let this top-of-subtree go.
1273 *
1274 * We want the freeing of this indirect block to be
1275 * atomic in the journal with the updating of the
1276 * bitmap block which owns it. So make some room in
1277 * the journal.
1278 *
1279 * We zero the parent pointer *after* freeing its
1280 * pointee in the bitmaps, so if extend_transaction()
1281 * for some reason fails to put the bitmap changes and
1282 * the release into the same transaction, recovery
1283 * will merely complain about releasing a free block,
1284 * rather than leaking blocks.
1285 */
1286 if (ext4_handle_is_aborted(handle))
1287 return;
1288 if (try_to_extend_transaction(handle, inode)) {
1289 ext4_mark_inode_dirty(handle, inode);
1290 ext4_truncate_restart_trans(handle, inode,
1291 ext4_blocks_for_truncate(inode));
1292 }
1293
1294 /*
1295 * The forget flag here is critical because if
1296 * we are journaling (and not doing data
1297 * journaling), we have to make sure a revoke
1298 * record is written to prevent the journal
1299 * replay from overwriting the (former)
1300 * indirect block if it gets reallocated as a
1301 * data block. This must happen in the same
1302 * transaction where the data blocks are
1303 * actually freed.
1304 */
1305 ext4_free_blocks(handle, inode, NULL, nr, 1,
1306 EXT4_FREE_BLOCKS_METADATA|
1307 EXT4_FREE_BLOCKS_FORGET);
1308
1309 if (parent_bh) {
1310 /*
1311 * The block which we have just freed is
1312 * pointed to by an indirect block: journal it
1313 */
1314 BUFFER_TRACE(parent_bh, "get_write_access");
1315 if (!ext4_journal_get_write_access(handle,
1316 parent_bh)){
1317 *p = 0;
1318 BUFFER_TRACE(parent_bh,
1319 "call ext4_handle_dirty_metadata");
1320 ext4_handle_dirty_metadata(handle,
1321 inode,
1322 parent_bh);
1323 }
1324 }
1325 }
1326 } else {
1327 /* We have reached the bottom of the tree. */
1328 BUFFER_TRACE(parent_bh, "free data blocks");
1329 ext4_free_data(handle, inode, parent_bh, first, last);
1330 }
1331}
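
The recursion is easier to see on a toy in-memory tree: depth drops by one per level, children are visited bottom-up and right-to-left, and a parent slot is cleared only after its whole subtree is gone. Here free() stands in for ext4_free_blocks(), and there is, of course, no journaling:

#include <stdio.h>
#include <stdlib.h>

#define FANOUT 4  /* stands in for EXT4_ADDR_PER_BLOCK() */

struct node {
        struct node *child[FANOUT];  /* NULL means a hole */
};

static void free_branches(struct node *n, int depth)
{
        if (depth--) {
                /* children are indirect blocks: recurse, then free them */
                for (int i = FANOUT - 1; i >= 0; i--) {
                        if (!n->child[i])
                                continue;
                        free_branches(n->child[i], depth);
                        printf("free indirect block at depth %d\n", depth);
                        free(n->child[i]);
                        n->child[i] = NULL;  /* zero the parent pointer */
                }
        } else {
                /* bottom of the tree: the entries are data blocks */
                printf("free data blocks\n");
        }
}

int main(void)
{
        struct node *root = calloc(1, sizeof(*root));

        root->child[0] = calloc(1, sizeof(*root));
        root->child[2] = calloc(1, sizeof(*root));  /* child[1] is a hole */
        free_branches(root, 1);
        free(root);
        return 0;
}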
1332
1333void ext4_ind_truncate(struct inode *inode)
1334{
1335 handle_t *handle;
1336 struct ext4_inode_info *ei = EXT4_I(inode);
1337 __le32 *i_data = ei->i_data;
1338 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1339 struct address_space *mapping = inode->i_mapping;
1340 ext4_lblk_t offsets[4];
1341 Indirect chain[4];
1342 Indirect *partial;
1343 __le32 nr = 0;
1344 int n = 0;
1345 ext4_lblk_t last_block, max_block;
1346 unsigned blocksize = inode->i_sb->s_blocksize;
1347
1348 handle = start_transaction(inode);
1349 if (IS_ERR(handle))
1350 return; /* AKPM: return what? */
1351
1352 last_block = (inode->i_size + blocksize-1)
1353 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1354 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1355 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1356
1357 if (inode->i_size & (blocksize - 1))
1358 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
1359 goto out_stop;
1360
1361 if (last_block != max_block) {
1362 n = ext4_block_to_path(inode, last_block, offsets, NULL);
1363 if (n == 0)
1364 goto out_stop; /* error */
1365 }
1366
1367 /*
1368 * OK. This truncate is going to happen. We add the inode to the
1369 * orphan list, so that if this truncate spans multiple transactions,
1370 * and we crash, we will resume the truncate when the filesystem
1371 * recovers. It also marks the inode dirty, to catch the new size.
1372 *
1373 * Implication: the file must always be in a sane, consistent
1374 * truncatable state while each transaction commits.
1375 */
1376 if (ext4_orphan_add(handle, inode))
1377 goto out_stop;
1378
1379 /*
1380 * From here we block out all ext4_get_block() callers who want to
1381 * modify the block allocation tree.
1382 */
1383 down_write(&ei->i_data_sem);
1384
1385 ext4_discard_preallocations(inode);
1386
1387 /*
1388 * The orphan list entry will now protect us from any crash which
1389 * occurs before the truncate completes, so it is now safe to propagate
1390 * the new, shorter inode size (held for now in i_size) into the
1391 * on-disk inode. We do this via i_disksize, which is the value which
1392 * ext4 *really* writes onto the disk inode.
1393 */
1394 ei->i_disksize = inode->i_size;
1395
1396 if (last_block == max_block) {
1397 /*
1398 * It is unnecessary to free any data blocks if last_block is
1399 * equal to the indirect block limit.
1400 */
1401 goto out_unlock;
1402 } else if (n == 1) { /* direct blocks */
1403 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
1404 i_data + EXT4_NDIR_BLOCKS);
1405 goto do_indirects;
1406 }
1407
1408 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1409 /* Kill the top of shared branch (not detached) */
1410 if (nr) {
1411 if (partial == chain) {
1412 /* Shared branch grows from the inode */
1413 ext4_free_branches(handle, inode, NULL,
1414 &nr, &nr+1, (chain+n-1) - partial);
1415 *partial->p = 0;
1416 /*
1417 * We mark the inode dirty prior to restart,
1418 * and prior to stop. No need for it here.
1419 */
1420 } else {
1421 /* Shared branch grows from an indirect block */
1422 BUFFER_TRACE(partial->bh, "get_write_access");
1423 ext4_free_branches(handle, inode, partial->bh,
1424 partial->p,
1425 partial->p+1, (chain+n-1) - partial);
1426 }
1427 }
1428 /* Clear the ends of indirect blocks on the shared branch */
1429 while (partial > chain) {
1430 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
1431 (__le32*)partial->bh->b_data+addr_per_block,
1432 (chain+n-1) - partial);
1433 BUFFER_TRACE(partial->bh, "call brelse");
1434 brelse(partial->bh);
1435 partial--;
1436 }
1437do_indirects:
1438 /* Kill the remaining (whole) subtrees */
1439 switch (offsets[0]) {
1440 default:
1441 nr = i_data[EXT4_IND_BLOCK];
1442 if (nr) {
1443 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
1444 i_data[EXT4_IND_BLOCK] = 0;
1445 }
1446 case EXT4_IND_BLOCK:
1447 nr = i_data[EXT4_DIND_BLOCK];
1448 if (nr) {
1449 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
1450 i_data[EXT4_DIND_BLOCK] = 0;
1451 }
1452 case EXT4_DIND_BLOCK:
1453 nr = i_data[EXT4_TIND_BLOCK];
1454 if (nr) {
1455 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
1456 i_data[EXT4_TIND_BLOCK] = 0;
1457 }
1458 case EXT4_TIND_BLOCK:
1459 ;
1460 }
1461
1462out_unlock:
1463 up_write(&ei->i_data_sem);
1464 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1465 ext4_mark_inode_dirty(handle, inode);
1466
1467 /*
1468 * In a multi-transaction truncate, we only make the final transaction
1469 * synchronous
1470 */
1471 if (IS_SYNC(inode))
1472 ext4_handle_sync(handle);
1473out_stop:
1474 /*
1475 * If this was a simple ftruncate(), and the file will remain alive
1476 * then we need to clear up the orphan record which we created above.
1477 * However, if this was a real unlink then we were called by
1478 * ext4_delete_inode(), and we allow that function to clean up the
1479 * orphan info for us.
1480 */
1481 if (inode->i_nlink)
1482 ext4_orphan_del(handle, inode);
1483
1484 ext4_journal_stop(handle);
1485 trace_ext4_truncate_exit(inode);
1486}
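
The do_indirects switch above leans on deliberate case fallthrough: entering at the level where the partial truncation happened frees every whole tree above it, and entering at default (a truncation point among the direct blocks) frees all three. A compact model of that dispatch, with slot numbers matching ext4's i_data layout:

#include <stdio.h>

enum { IND = 12, DIND = 13, TIND = 14 };  /* i_data slots, as in ext4 */

static void kill_whole_tree(int slot, int depth)
{
        printf("free entire tree rooted at slot %d (depth %d)\n",
               slot, depth);
}

/* offset0 is offsets[0]: the i_data slot where truncation begins. */
static void do_indirects(int offset0)
{
        switch (offset0) {
        default:        /* truncation point is in the direct blocks */
                kill_whole_tree(IND, 1);
                /* fall through */
        case IND:       /* single-indirect tree partially truncated */
                kill_whole_tree(DIND, 2);
                /* fall through */
        case DIND:
                kill_whole_tree(TIND, 3);
                /* fall through */
        case TIND:
                ;       /* nothing lives above the triple-indirect tree */
        }
}

int main(void)
{
        do_indirects(IND);  /* frees only the DIND and TIND trees */
        return 0;
}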
1487
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 678cde834f19..18d2558b7624 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,10 +12,6 @@
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 15 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 16 * (jj@sunsite.ms.mff.cuni.cz)
21 * 17 *
@@ -47,6 +43,7 @@
47#include "xattr.h" 43#include "xattr.h"
48#include "acl.h" 44#include "acl.h"
49#include "ext4_extents.h" 45#include "ext4_extents.h"
46#include "truncate.h"
50 47
51#include <trace/events/ext4.h> 48#include <trace/events/ext4.h>
52 49
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
89} 86}
90 87
91/* 88/*
92 * Work out how many blocks we need to proceed with the next chunk of a
93 * truncate transaction.
94 */
95static unsigned long blocks_for_truncate(struct inode *inode)
96{
97 ext4_lblk_t needed;
98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100
101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext4 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */
107 if (needed < 2)
108 needed = 2;
109
110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */
112 if (needed > EXT4_MAX_TRANS_DATA)
113 needed = EXT4_MAX_TRANS_DATA;
114
115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
116}
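
The estimate is plain arithmetic: i_blocks counts 512-byte sectors, so shifting by (blocksize_bits - 9) converts it to filesystem blocks, which are then clamped to a sane range and topped up with the fixed per-transaction overhead. A worked user-space version; the two constants are placeholders, since the real values depend on the filesystem configuration:

#include <stdio.h>

#define MAX_TRANS_DATA    64  /* placeholder for EXT4_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS 12  /* placeholder for EXT4_DATA_TRANS_BLOCKS() */

static unsigned long credits_for_truncate(unsigned long long i_blocks,
                                          unsigned int blocksize_bits)
{
        /* i_blocks counts 512-byte sectors; convert to fs blocks */
        unsigned long needed = i_blocks >> (blocksize_bits - 9);

        if (needed < 2)               /* guard against corrupt i_blocks */
                needed = 2;
        if (needed > MAX_TRANS_DATA)  /* bound the transaction size */
                needed = MAX_TRANS_DATA;
        return DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
        /* a 1 MiB file with 4 KiB blocks: i_blocks = 2048 sectors */
        printf("%lu credits\n", credits_for_truncate(2048, 12));
        return 0;
}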
117
118/*
119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal.
122 *
123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct
127 */
128static handle_t *start_transaction(struct inode *inode)
129{
130 handle_t *result;
131
132 result = ext4_journal_start(inode, blocks_for_truncate(inode));
133 if (!IS_ERR(result))
134 return result;
135
136 ext4_std_error(inode->i_sb, PTR_ERR(result));
137 return result;
138}
139
140/*
141 * Try to extend this transaction for the purposes of truncation.
142 *
143 * Returns 0 if we managed to create more room. If we can't create more
144 * room and the transaction must be restarted, we return 1.
145 */
146static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
147{
148 if (!ext4_handle_valid(handle))
149 return 0;
150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
151 return 0;
152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
153 return 0;
154 return 1;
155}
156
157/*
158 * Restart the transaction associated with *handle. This does a commit, 89 * Restart the transaction associated with *handle. This does a commit,
159 * so before we call here everything must be consistently dirtied against 90 * so before we call here everything must be consistently dirtied against
160 * this transaction. 91 * this transaction.
@@ -189,7 +120,37 @@ void ext4_evict_inode(struct inode *inode)
189 int err; 120 int err;
190 121
191 trace_ext4_evict_inode(inode); 122 trace_ext4_evict_inode(inode);
123
124 ext4_ioend_wait(inode);
125
192 if (inode->i_nlink) { 126 if (inode->i_nlink) {
127 /*
128 * When journalling data dirty buffers are tracked only in the
129 * journal. So although mm thinks everything is clean and
130 * ready for reaping the inode might still have some pages to
131 * write in the running transaction or waiting to be
132 * checkpointed. Thus calling jbd2_journal_invalidatepage()
133 * (via truncate_inode_pages()) to discard these buffers can
134 * cause data loss. Also even if we did not discard these
135 * buffers, we would have no way to find them after the inode
136 * is reaped and thus user could see stale data if he tries to
137 * read them before the transaction is checkpointed. So be
138 * careful and force everything to disk here... We use
139 * ei->i_datasync_tid to store the newest transaction
140 * containing inode's data.
141 *
142 * Note that directories do not have this problem because they
143 * don't use page cache.
144 */
145 if (ext4_should_journal_data(inode) &&
146 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
147 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
148 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
149
150 jbd2_log_start_commit(journal, commit_tid);
151 jbd2_log_wait_commit(journal, commit_tid);
152 filemap_write_and_wait(&inode->i_data);
153 }
193 truncate_inode_pages(&inode->i_data, 0); 154 truncate_inode_pages(&inode->i_data, 0);
194 goto no_delete; 155 goto no_delete;
195 } 156 }
@@ -204,7 +165,7 @@ void ext4_evict_inode(struct inode *inode)
204 if (is_bad_inode(inode)) 165 if (is_bad_inode(inode))
205 goto no_delete; 166 goto no_delete;
206 167
207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 168 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
208 if (IS_ERR(handle)) { 169 if (IS_ERR(handle)) {
209 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 170 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210 /* 171 /*
@@ -277,793 +238,6 @@ no_delete:
277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 238 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
278} 239}
279 240
280typedef struct {
281 __le32 *p;
282 __le32 key;
283 struct buffer_head *bh;
284} Indirect;
285
286static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
287{
288 p->key = *(p->p = v);
289 p->bh = bh;
290}
291
292/**
293 * ext4_block_to_path - parse the block number into array of offsets
294 * @inode: inode in question (we are only interested in its superblock)
295 * @i_block: block number to be parsed
296 * @offsets: array to store the offsets in
297 * @boundary: set this non-zero if the referred-to block is likely to be
298 * followed (on disk) by an indirect block.
299 *
300 * To store the locations of file's data ext4 uses a data structure common
301 * for UNIX filesystems - tree of pointers anchored in the inode, with
302 * data blocks at leaves and indirect blocks in intermediate nodes.
303 * This function translates the block number into path in that tree -
304 * return value is the path length and @offsets[n] is the offset of
305 * pointer to (n+1)th node in the nth one. If @block is out of range
306 * (negative or too large) warning is printed and zero returned.
307 *
308 * Note: function doesn't find node addresses, so no IO is needed. All
309 * we need to know is the capacity of indirect blocks (taken from the
310 * inode->i_sb).
311 */
312
313/*
314 * Portability note: the last comparison (check that we fit into triple
315 * indirect block) is spelled differently, because otherwise on an
316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
317 * if our filesystem had 8Kb blocks. We might use long long, but that would
318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
319 * i_block would have to be negative in the very beginning, so we would not
320 * get there at all.
321 */
322
323static int ext4_block_to_path(struct inode *inode,
324 ext4_lblk_t i_block,
325 ext4_lblk_t offsets[4], int *boundary)
326{
327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
329 const long direct_blocks = EXT4_NDIR_BLOCKS,
330 indirect_blocks = ptrs,
331 double_blocks = (1 << (ptrs_bits * 2));
332 int n = 0;
333 int final = 0;
334
335 if (i_block < direct_blocks) {
336 offsets[n++] = i_block;
337 final = direct_blocks;
338 } else if ((i_block -= direct_blocks) < indirect_blocks) {
339 offsets[n++] = EXT4_IND_BLOCK;
340 offsets[n++] = i_block;
341 final = ptrs;
342 } else if ((i_block -= indirect_blocks) < double_blocks) {
343 offsets[n++] = EXT4_DIND_BLOCK;
344 offsets[n++] = i_block >> ptrs_bits;
345 offsets[n++] = i_block & (ptrs - 1);
346 final = ptrs;
347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
348 offsets[n++] = EXT4_TIND_BLOCK;
349 offsets[n++] = i_block >> (ptrs_bits * 2);
350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
351 offsets[n++] = i_block & (ptrs - 1);
352 final = ptrs;
353 } else {
354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
355 i_block + direct_blocks +
356 indirect_blocks + double_blocks, inode->i_ino);
357 }
358 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1));
360 return n;
361}
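
The translation is pure arithmetic on the fanout: subtract each region's capacity until the block number fits, then split the remainder into one index per level. The same computation standalone, for 1 KiB blocks (ptrs = 256, ptrs_bits = 8); logical block 300 lands in the double-indirect tree at offsets {13, 0, 32}:

#include <stdio.h>

#define NDIR 12
#define IND  12
#define DIND 13
#define TIND 14

static int block_to_path(long i_block, int ptrs_bits, long offsets[4])
{
        const long ptrs = 1L << ptrs_bits;
        const long dbl = 1L << (ptrs_bits * 2);
        int n = 0;

        if (i_block < NDIR) {
                offsets[n++] = i_block;
        } else if ((i_block -= NDIR) < ptrs) {
                offsets[n++] = IND;
                offsets[n++] = i_block;
        } else if ((i_block -= ptrs) < dbl) {
                offsets[n++] = DIND;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
        } else if (((i_block -= dbl) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = TIND;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
        }
        return n;  /* 0 means the block is out of range */
}

int main(void)
{
        long off[4];
        int n = block_to_path(300, 8, off);

        for (int i = 0; i < n; i++)
                printf("level %d: offset %ld\n", i, off[i]);
        return 0;
}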
362
363static int __ext4_check_blockref(const char *function, unsigned int line,
364 struct inode *inode,
365 __le32 *p, unsigned int max)
366{
367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 __le32 *bref = p;
369 unsigned int blk;
370
371 while (bref < p+max) {
372 blk = le32_to_cpu(*bref++);
373 if (blk &&
374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
375 blk, 1))) {
376 es->s_last_error_block = cpu_to_le64(blk);
377 ext4_error_inode(inode, function, line, blk,
378 "invalid block");
379 return -EIO;
380 }
381 }
382 return 0;
383}
384
385
386#define ext4_check_indirect_blockref(inode, bh) \
387 __ext4_check_blockref(__func__, __LINE__, inode, \
388 (__le32 *)(bh)->b_data, \
389 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
390
391#define ext4_check_inode_blockref(inode) \
392 __ext4_check_blockref(__func__, __LINE__, inode, \
393 EXT4_I(inode)->i_data, \
394 EXT4_NDIR_BLOCKS)
395
396/**
397 * ext4_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result
402 * @err: here we store the error value
403 *
404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these
414 * numbers.
415 *
416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO)
420 * or when it reads all @depth-1 indirect blocks successfully and finds
421 * the whole chain, all the way to the data (returns %NULL, *err == 0).
422 *
423 * Need to be called with
424 * down_read(&EXT4_I(inode)->i_data_sem)
425 */
426static Indirect *ext4_get_branch(struct inode *inode, int depth,
427 ext4_lblk_t *offsets,
428 Indirect chain[4], int *err)
429{
430 struct super_block *sb = inode->i_sb;
431 Indirect *p = chain;
432 struct buffer_head *bh;
433
434 *err = 0;
435 /* i_data is not going away, no lock needed */
436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
437 if (!p->key)
438 goto no_block;
439 while (--depth) {
440 bh = sb_getblk(sb, le32_to_cpu(p->key));
441 if (unlikely(!bh))
442 goto failure;
443
444 if (!bh_uptodate_or_lock(bh)) {
445 if (bh_submit_read(bh) < 0) {
446 put_bh(bh);
447 goto failure;
448 }
449 /* validate block references */
450 if (ext4_check_indirect_blockref(inode, bh)) {
451 put_bh(bh);
452 goto failure;
453 }
454 }
455
456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
457 /* Reader: end */
458 if (!p->key)
459 goto no_block;
460 }
461 return NULL;
462
463failure:
464 *err = -EIO;
465no_block:
466 return p;
467}
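
Each level of the walk yields one <address, key, buffer> triple, and the walk stops early at the first zero key (a hole). A toy version over in-memory "blocks"; read_block() is a hypothetical stand-in for sb_getblk() plus validation, and error paths are dropped:

#include <stdio.h>
#include <stdint.h>

struct indirect {
        uint32_t *p;    /* where the key was read from */
        uint32_t key;   /* block number of the next level */
        uint32_t *bh;   /* the "buffer" holding *p (NULL at level 0) */
};

/* Hypothetical block store: block number -> array of 4 pointers. */
static uint32_t blocks[8][4] = {
        [2] = { 3, 0, 0, 0 },  /* block 2, slot 0 -> block 3 */
        [3] = { 0, 7, 0, 0 },  /* block 3, slot 1 -> block 7 */
};

static uint32_t *read_block(uint32_t nr)
{
        return blocks[nr];
}

/* Returns NULL on full success, else the last (incomplete) triple. */
static struct indirect *get_branch(uint32_t *i_data, int depth,
                                   const int *offsets,
                                   struct indirect chain[4])
{
        struct indirect *p = chain;

        p->bh = NULL;
        p->key = *(p->p = i_data + *offsets);
        if (!p->key)
                return p;
        while (--depth) {
                uint32_t *bh = read_block(p->key);

                ++p;
                p->bh = bh;
                p->key = *(p->p = bh + *++offsets);
                if (!p->key)
                        return p;
        }
        return NULL;
}

int main(void)
{
        uint32_t i_data[15] = { [13] = 2 };  /* DIND slot points at block 2 */
        int offsets[3] = { 13, 0, 1 };       /* as block_to_path() computes */
        struct indirect chain[4];

        if (!get_branch(i_data, 3, offsets, chain))
                printf("mapped to physical block %u\n", chain[2].key);
        return 0;
}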
468
469/**
470 * ext4_find_near - find a place for allocation with sufficient locality
471 * @inode: owner
472 * @ind: descriptor of indirect block.
473 *
474 * This function returns the preferred place for block allocation.
475 * It is used when heuristic for sequential allocation fails.
476 * Rules are:
477 * + if there is a block to the left of our position - allocate near it.
478 * + if pointer will live in indirect block - allocate near that block.
479 * + if pointer will live in inode - allocate in the same
480 * cylinder group.
481 *
482 * In the latter case we colour the starting block by the callers PID to
483 * prevent it from clashing with concurrent allocations for a different inode
484 * in the same block group. The PID is used here so that functionally related
485 * files will be close-by on-disk.
486 *
487 * Caller must make sure that @ind is valid and will stay that way.
488 */
489static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
490{
491 struct ext4_inode_info *ei = EXT4_I(inode);
492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
493 __le32 *p;
494 ext4_fsblk_t bg_start;
495 ext4_fsblk_t last_block;
496 ext4_grpblk_t colour;
497 ext4_group_t block_group;
498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
499
500 /* Try to find previous block */
501 for (p = ind->p - 1; p >= start; p--) {
502 if (*p)
503 return le32_to_cpu(*p);
504 }
505
506 /* No such thing, so let's try location of indirect block */
507 if (ind->bh)
508 return ind->bh->b_blocknr;
509
510 /*
511 * It is going to be referred to from the inode itself? OK, just put it
512 * into the same cylinder group then.
513 */
514 block_group = ei->i_block_group;
515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
516 block_group &= ~(flex_size-1);
517 if (S_ISREG(inode->i_mode))
518 block_group++;
519 }
520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
522
523 /*
524 * If we are doing delayed allocation, we don't need to take
525 * colour into account.
526 */
527 if (test_opt(inode->i_sb, DELALLOC))
528 return bg_start;
529
530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
531 colour = (current->pid % 16) *
532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
533 else
534 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
535 return bg_start + colour;
536}
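
The heuristic has three rungs: the nearest allocated block to the left, then the indirect block that will hold the new pointer, then a PID-coloured slot inside the inode's group. A condensed sketch with illustrative group geometry:

#include <stdio.h>
#include <stdint.h>

#define BLOCKS_PER_GROUP 8192  /* illustrative geometry */

static uint64_t find_near(const uint32_t *start, const uint32_t *pos,
                          uint64_t ind_block, uint64_t bg_start, int pid)
{
        /* 1. nearest allocated block to the left of our position */
        for (const uint32_t *p = pos - 1; p >= start; p--)
                if (*p)
                        return *p;

        /* 2. the indirect block that will hold the new pointer */
        if (ind_block)
                return ind_block;

        /* 3. colour a slot in the inode's group by the caller's PID */
        return bg_start + (pid % 16) * (BLOCKS_PER_GROUP / 16);
}

int main(void)
{
        uint32_t map[4] = { 0, 500, 0, 0 };

        /* position = slot 3: slot 1 holds block 500, so allocate near it */
        printf("goal = %llu\n",
               (unsigned long long)find_near(map, map + 3, 0, 0, 1234));
        return 0;
}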
537
538/**
539 * ext4_find_goal - find a preferred place for allocation.
540 * @inode: owner
541 * @block: block we want
542 * @partial: pointer to the last triple within a chain
543 *
544 * Normally this function finds the preferred place for block allocation
545 * and returns it.
546 * Because this is only used for non-extent files, we limit the block nr
547 * to 32 bits.
548 */
549static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550 Indirect *partial)
551{
552 ext4_fsblk_t goal;
553
554 /*
555 * XXX need to get goal block from mballoc's data structures
556 */
557
558 goal = ext4_find_near(inode, partial);
559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
560 return goal;
561}
562
563/**
564 * ext4_blks_to_allocate - Look up the block map and count the number
565 * of direct blocks need to be allocated for the given branch.
566 *
567 * @branch: chain of indirect blocks
568 * @k: number of blocks need for indirect blocks
569 * @blks: number of data blocks to be mapped.
570 * @blocks_to_boundary: the offset in the indirect block
571 *
572 * return the total number of blocks to be allocated, including the
573 * direct and indirect blocks.
574 */
575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary)
577{
578 unsigned int count = 0;
579
580 /*
581 * Simple case: if the [t,d]indirect block(s) have not been allocated
582 * yet, it's clear the blocks on that path have not been allocated
583 */
584 if (k > 0) {
585 /* right now we don't handle cross boundary allocation */
586 if (blks < blocks_to_boundary + 1)
587 count += blks;
588 else
589 count += blocks_to_boundary + 1;
590 return count;
591 }
592
593 count++;
594 while (count < blks && count <= blocks_to_boundary &&
595 le32_to_cpu(*(branch[0].p + count)) == 0) {
596 count++;
597 }
598 return count;
599}
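
With no indirect blocks missing (k == 0), the count starts at the first direct block and extends only across slots that are still zero and still inside the boundary. The counting in standalone form:

#include <stdio.h>
#include <stdint.h>

static unsigned blks_to_allocate(const uint32_t *direct_p, int k,
                                 unsigned blks, unsigned blocks_to_boundary)
{
        unsigned count = 0;

        if (k > 0) {
                /* indirect blocks missing: never cross the boundary */
                return blks < blocks_to_boundary + 1 ?
                       blks : blocks_to_boundary + 1;
        }
        count++;
        while (count < blks && count <= blocks_to_boundary &&
               direct_p[count] == 0)
                count++;
        return count;
}

int main(void)
{
        /* slot 0 is being allocated; slot 2 is already mapped */
        uint32_t slots[4] = { 0, 0, 777, 0 };

        printf("%u blocks to allocate\n", blks_to_allocate(slots, 0, 4, 3));
        return 0;
}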
600
601/**
602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
607 * @indirect_blks: the number of blocks need to allocate for indirect
608 * blocks
609 * @blks: number of desired blocks
610 * @new_blocks: on return it will store the new block numbers for
611 * the indirect blocks(if needed) and the first direct block,
612 * @err: on return it will store the error code
613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
616 */
617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
618 ext4_lblk_t iblock, ext4_fsblk_t goal,
619 int indirect_blks, int blks,
620 ext4_fsblk_t new_blocks[4], int *err)
621{
622 struct ext4_allocation_request ar;
623 int target, i;
624 unsigned long count = 0, blk_allocated = 0;
625 int index = 0;
626 ext4_fsblk_t current_block = 0;
627 int ret = 0;
628
629 /*
630 * Here we try to allocate the requested multiple blocks at once,
631 * on a best-effort basis.
632 * To build a branch, we should allocate blocks for
633 * the indirect blocks (if not allocated yet), and at least
634 * the first direct block of this branch. That's the
635 * minimum number of blocks we need to allocate (required).
636 */
637 /* first we try to allocate the indirect blocks */
638 target = indirect_blks;
639 while (target > 0) {
640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 0, &count, err);
644 if (*err)
645 goto failed_out;
646
647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
648 EXT4_ERROR_INODE(inode,
649 "current_block %llu + count %lu > %d!",
650 current_block, count,
651 EXT4_MAX_BLOCK_FILE_PHYS);
652 *err = -EIO;
653 goto failed_out;
654 }
655
656 target -= count;
657 /* allocate blocks for indirect blocks */
658 while (index < indirect_blks && count) {
659 new_blocks[index++] = current_block++;
660 count--;
661 }
662 if (count > 0) {
663 /*
664 * save the new block number
665 * for the first direct block
666 */
667 new_blocks[index] = current_block;
668 printk(KERN_INFO "%s returned more blocks than "
669 "requested\n", __func__);
670 WARN_ON(1);
671 break;
672 }
673 }
674
675 target = blks - count ;
676 blk_allocated = count;
677 if (!target)
678 goto allocated;
679 /* Now allocate data blocks */
680 memset(&ar, 0, sizeof(ar));
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = target;
684 ar.logical = iblock;
685 if (S_ISREG(inode->i_mode))
686 /* enable in-core preallocation only for regular files */
687 ar.flags = EXT4_MB_HINT_DATA;
688
689 current_block = ext4_mb_new_blocks(handle, &ar, err);
690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
691 EXT4_ERROR_INODE(inode,
692 "current_block %llu + ar.len %d > %d!",
693 current_block, ar.len,
694 EXT4_MAX_BLOCK_FILE_PHYS);
695 *err = -EIO;
696 goto failed_out;
697 }
698
699 if (*err && (target == blks)) {
700 /*
701 * if the allocation failed and we didn't allocate
702 * any blocks before
703 */
704 goto failed_out;
705 }
706 if (!*err) {
707 if (target == blks) {
708 /*
709 * save the new block number
710 * for the first direct block
711 */
712 new_blocks[index] = current_block;
713 }
714 blk_allocated += ar.len;
715 }
716allocated:
717 /* total number of blocks allocated for direct blocks */
718 ret = blk_allocated;
719 *err = 0;
720 return ret;
721failed_out:
722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret;
725}
726
727/**
728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
730 * @inode: owner
731 * @indirect_blks: number of allocated indirect blocks
732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
734 * @offsets: offsets (in the blocks) to store the pointers to next.
735 * @branch: place to store the chain in.
736 *
737 * This function allocates blocks, zeroes out all but the last one,
738 * links them into chain and (if we are synchronous) writes them to disk.
739 * In other words, it prepares a branch that can be spliced onto the
740 * inode. It stores the information about that chain in the branch[], in
741 * the same format as ext4_get_branch() would do. We are calling it after
742 * we had read the existing part of chain and partial points to the last
743 * triple of that (one with zero ->key). Upon the exit we have the same
744 * picture as after the successful ext4_get_block(), except that in one
745 * place chain is disconnected - *branch->p is still zero (we did not
746 * set the last link), but branch->key contains the number that should
747 * be placed into *branch->p to fill that gap.
748 *
749 * If allocation fails we free all blocks we've allocated (and forget
750 * their buffer_heads) and return the error value from the failed
751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
752 * as described above and return 0.
753 */
754static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch)
758{
759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0;
761 int err = 0;
762 struct buffer_head *bh;
763 int num;
764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block;
766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err);
769 if (err)
770 return err;
771
772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /*
774 * metadata blocks and data blocks are allocated.
775 */
776 for (n = 1; n <= indirect_blks; n++) {
777 /*
778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send
780 * parent to disk.
781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
788 branch[n].bh = bh;
789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) {
793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */
795 unlock_buffer(bh);
796 goto failed;
797 }
798
799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) {
804 current_block = new_blocks[n];
805 /*
806 * End of chain, update the last new metablock of
807 * the chain to point to the new allocated
808 * data blocks numbers
809 */
810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 }
813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh);
815 unlock_buffer(bh);
816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err)
820 goto failed;
821 }
822 *blks = num;
823 return err;
824failed:
825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) {
828 /*
829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET);
835 }
836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840
841 return err;
842}
843
844/**
845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
847 * @inode: owner
848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch)
851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding
854 *
855 * This function fills the missing link and does all housekeeping needed in
856 * inode (->i_blocks, etc.). In case of success we end up with the full
857 * chain to new block and return 0.
858 */
859static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num,
861 int blks)
862{
863 int i;
864 int err = 0;
865 ext4_fsblk_t current_block;
866
867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice.
871 */
872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err)
876 goto err_out;
877 }
878 /* That's it */
879
880 *where->p = where->key;
881
882 /*
883 * Update the host buffer_head or inode to point to the just-allocated
884 * direct blocks
885 */
886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++);
890 }
891
892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* had we spliced it onto indirect block? */
894 if (where->bh) {
895 /*
896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */
903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err)
907 goto err_out;
908 } else {
909 /*
910 * OK, we spliced it into the inode itself on a direct block.
911 */
912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n");
914 }
915 return err;
916
917err_out:
918 for (i = 1; i <= num; i++) {
919 /*
920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET);
926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0);
929
930 return err;
931}
932
933/*
934 * The ext4_ind_map_blocks() function handles non-extent inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks().
937 *
938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to leaf. So let's do it before attaching anything
940 * to tree, set linkage between the newborn blocks, write them if sync is
941 * required, recheck the path, free and repeat if check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode.
948 *
949 * `handle' can be NULL if create == 0.
950 *
951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed.
953 * return < 0, error case.
954 *
955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks.
960 */
961static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map,
963 int flags)
964{
965 int err = -EIO;
966 ext4_lblk_t offsets[4];
967 Indirect chain[4];
968 Indirect *partial;
969 ext4_fsblk_t goal;
970 int indirect_blks;
971 int blocks_to_boundary = 0;
972 int depth;
973 int count = 0;
974 ext4_fsblk_t first_block = 0;
975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary);
981
982 if (depth == 0)
983 goto out;
984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986
987 /* Simplest case - block found, no allocation needed */
988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++;
991 /*map more blocks*/
992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk;
994
995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996
997 if (blk == first_block + count)
998 count++;
999 else
1000 break;
1001 }
1002 goto got_it;
1003 }
1004
1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup;
1008
1009 /*
1010 * Okay, we need to do block allocation.
1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013
1014 /* the number of blocks need to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1;
1016
1017 /*
1018 * Next look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch.
1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary);
1023 /*
1024 * Block out ext4_truncate while we alter the tree
1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal,
1028 offsets + (partial - chain), partial);
1029
1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */
1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count);
1040 if (err)
1041 goto cleanup;
1042
1043 map->m_flags |= EXT4_MAP_NEW;
1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count;
1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count;
1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */
1055cleanup:
1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh);
1059 partial--;
1060 }
1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1064 return err;
1065}
1066
1067#ifdef CONFIG_QUOTA 241#ifdef CONFIG_QUOTA
1068qsize_t *ext4_get_reserved_space(struct inode *inode) 242qsize_t *ext4_get_reserved_space(struct inode *inode)
1069{ 243{
@@ -1073,33 +247,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
1073 247
1074/* 248/*
1075 * Calculate the number of metadata blocks needed to reserve 249 * Calculate the number of metadata blocks needed to reserve
1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */
1078static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock)
1080{
1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits;
1084
1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0;
1087
1088 lblock -= EXT4_NDIR_BLOCKS;
1089
1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++;
1093 return 0;
1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099}
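
The point of the removed helper is the caching: metadata is charged once when delayed allocation first touches a new double-indirect "window" of the file, and i_da_metadata_calc_len amortizes every later block in the same window down to zero. A reduced, single-threaded model with 256-pointer blocks assumed and order_base_2() reimplemented:

#include <stdio.h>

#define NDIR      12
#define PTRS_BITS 8  /* 256 pointers per block */

static long last_lblock = -1;  /* models i_da_metadata_calc_last_lblock */
static int  calc_len;          /* models i_da_metadata_calc_len */

static int order_base_2(unsigned long n)  /* smallest b with 2^b >= n */
{
        int b = 0;

        while ((1UL << b) < n)
                b++;
        return b;
}

static int ind_calc_metadata_amount(long lblock)
{
        long dind_mask = ~((1L << PTRS_BITS) - 1);

        if (lblock < NDIR)
                return 0;  /* direct blocks need no metadata */
        lblock -= NDIR;
        if (calc_len && (lblock & dind_mask) == last_lblock) {
                calc_len++;  /* same window: already charged */
                return 0;
        }
        last_lblock = lblock & dind_mask;
        calc_len = 1;
        return order_base_2(lblock) / PTRS_BITS + 1;
}

int main(void)
{
        for (long b = 10; b < 16; b++)
                printf("lblock %ld -> %d metadata block(s)\n",
                       b, ind_calc_metadata_amount(b));
        return 0;
}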
1100
1101/*
1102 * Calculate the number of metadata blocks needed to reserve
1103 * to allocate a block located at @lblock 250 * to allocate a block located at @lblock
1104 */ 251 */
1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 252static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
@@ -1107,7 +254,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 254 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 255 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 256
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 257 return ext4_ind_calc_metadata_amount(inode, lblock);
1111} 258}
1112 259
1113/* 260/*
@@ -1589,16 +736,6 @@ static int do_journal_get_write_access(handle_t *handle,
1589 return ret; 736 return ret;
1590} 737}
1591 738
1592/*
1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */
1596static void ext4_truncate_failed_write(struct inode *inode)
1597{
1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode);
1600}
1601
1602static int ext4_get_block_write(struct inode *inode, sector_t iblock, 739static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 740 struct buffer_head *bh_result, int create);
1604static int ext4_write_begin(struct file *file, struct address_space *mapping, 741static int ext4_write_begin(struct file *file, struct address_space *mapping,
@@ -1849,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file,
1849 from = pos & (PAGE_CACHE_SIZE - 1); 986 from = pos & (PAGE_CACHE_SIZE - 1);
1850 to = from + len; 987 to = from + len;
1851 988
989 BUG_ON(!ext4_handle_valid(handle));
990
1852 if (copied < len) { 991 if (copied < len) {
1853 if (!PageUptodate(page)) 992 if (!PageUptodate(page))
1854 copied = 0; 993 copied = 0;
@@ -1863,6 +1002,7 @@ static int ext4_journalled_write_end(struct file *file,
1863 if (new_i_size > inode->i_size) 1002 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 1003 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1004 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1005 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1006 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1007 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1008 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2148,7 +1288,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2148 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 1288 else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
2149 err = ext4_bio_write_page(&io_submit, page, 1289 err = ext4_bio_write_page(&io_submit, page,
2150 len, mpd->wbc); 1290 len, mpd->wbc);
2151 else 1291 else if (buffer_uninit(page_bufs)) {
1292 ext4_set_bh_endio(page_bufs, inode);
1293 err = block_write_full_page_endio(page,
1294 noalloc_get_block_write,
1295 mpd->wbc, ext4_end_io_buffer_write);
1296 } else
2152 err = block_write_full_page(page, 1297 err = block_write_full_page(page,
2153 noalloc_get_block_write, mpd->wbc); 1298 noalloc_get_block_write, mpd->wbc);
2154 1299
@@ -2564,6 +1709,8 @@ static int __ext4_journalled_writepage(struct page *page,
2564 goto out; 1709 goto out;
2565 } 1710 }
2566 1711
1712 BUG_ON(!ext4_handle_valid(handle));
1713
2567 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1714 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
2568 do_journal_get_write_access); 1715 do_journal_get_write_access);
2569 1716
@@ -2571,6 +1718,7 @@ static int __ext4_journalled_writepage(struct page *page,
2571 write_end_fn); 1718 write_end_fn);
2572 if (ret == 0) 1719 if (ret == 0)
2573 ret = err; 1720 ret = err;
1721 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
2574 err = ext4_journal_stop(handle); 1722 err = ext4_journal_stop(handle);
2575 if (!ret) 1723 if (!ret)
2576 ret = err; 1724 ret = err;
@@ -2741,7 +1889,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2741 index = wbc->range_start >> PAGE_CACHE_SHIFT; 1889 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2742 end = wbc->range_end >> PAGE_CACHE_SHIFT; 1890 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2743 1891
2744 if (wbc->sync_mode == WB_SYNC_ALL) 1892 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2745 tag = PAGECACHE_TAG_TOWRITE; 1893 tag = PAGECACHE_TAG_TOWRITE;
2746 else 1894 else
2747 tag = PAGECACHE_TAG_DIRTY; 1895 tag = PAGECACHE_TAG_DIRTY;
@@ -2973,7 +2121,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2973 } 2121 }
2974 2122
2975retry: 2123retry:
2976 if (wbc->sync_mode == WB_SYNC_ALL) 2124 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2977 tag_pages_for_writeback(mapping, index, end); 2125 tag_pages_for_writeback(mapping, index, end);
2978 2126
2979 while (!ret && wbc->nr_to_write > 0) { 2127 while (!ret && wbc->nr_to_write > 0) {
@@ -3450,112 +2598,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3450} 2598}
3451 2599
3452/* 2600/*
3453 * O_DIRECT for ext3 (or indirect map) based files
3454 *
3455 * If the O_DIRECT write will extend the file then add this inode to the
3456 * orphan list. So recovery will truncate it back to the original size
3457 * if the machine crashes during the write.
3458 *
3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3460 * crashes then stale disk data _may_ be exposed inside the file. But current
3461 * VFS code falls back into buffered path in that case so we are safe.
3462 */
3463static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3464 const struct iovec *iov, loff_t offset,
3465 unsigned long nr_segs)
3466{
3467 struct file *file = iocb->ki_filp;
3468 struct inode *inode = file->f_mapping->host;
3469 struct ext4_inode_info *ei = EXT4_I(inode);
3470 handle_t *handle;
3471 ssize_t ret;
3472 int orphan = 0;
3473 size_t count = iov_length(iov, nr_segs);
3474 int retries = 0;
3475
3476 if (rw == WRITE) {
3477 loff_t final_size = offset + count;
3478
3479 if (final_size > inode->i_size) {
3480 /* Credits for sb + inode write */
3481 handle = ext4_journal_start(inode, 2);
3482 if (IS_ERR(handle)) {
3483 ret = PTR_ERR(handle);
3484 goto out;
3485 }
3486 ret = ext4_orphan_add(handle, inode);
3487 if (ret) {
3488 ext4_journal_stop(handle);
3489 goto out;
3490 }
3491 orphan = 1;
3492 ei->i_disksize = inode->i_size;
3493 ext4_journal_stop(handle);
3494 }
3495 }
3496
3497retry:
3498 if (rw == READ && ext4_should_dioread_nolock(inode))
3499 ret = __blockdev_direct_IO(rw, iocb, inode,
3500 inode->i_sb->s_bdev, iov,
3501 offset, nr_segs,
3502 ext4_get_block, NULL, NULL, 0);
3503 else {
3504 ret = blockdev_direct_IO(rw, iocb, inode, iov,
3505 offset, nr_segs, ext4_get_block);
3506
3507 if (unlikely((rw & WRITE) && ret < 0)) {
3508 loff_t isize = i_size_read(inode);
3509 loff_t end = offset + iov_length(iov, nr_segs);
3510
3511 if (end > isize)
3512 ext4_truncate_failed_write(inode);
3513 }
3514 }
3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3516 goto retry;
3517
3518 if (orphan) {
3519 int err;
3520
3521 /* Credits for sb + inode write */
3522 handle = ext4_journal_start(inode, 2);
3523 if (IS_ERR(handle)) {
3524 /* This is really bad luck. We've written the data
3525 * but cannot extend i_size. Bail out and pretend
3526 * the write failed... */
3527 ret = PTR_ERR(handle);
3528 if (inode->i_nlink)
3529 ext4_orphan_del(NULL, inode);
3530
3531 goto out;
3532 }
3533 if (inode->i_nlink)
3534 ext4_orphan_del(handle, inode);
3535 if (ret > 0) {
3536 loff_t end = offset + ret;
3537 if (end > inode->i_size) {
3538 ei->i_disksize = end;
3539 i_size_write(inode, end);
3540 /*
3541 * We're going to return a positive `ret'
3542 * here due to non-zero-length I/O, so there's
3543 * no way of reporting error returns from
3544 * ext4_mark_inode_dirty() to userspace. So
3545 * ignore it.
3546 */
3547 ext4_mark_inode_dirty(handle, inode);
3548 }
3549 }
3550 err = ext4_journal_stop(handle);
3551 if (ret == 0)
3552 ret = err;
3553 }
3554out:
3555 return ret;
3556}
3557
3558/*
3559 * ext4_get_block used when preparing for a DIO write or buffer write. 2601 * ext4_get_block used when preparing for a DIO write or buffer write.
3560 * We allocate an uninitialized extent if blocks haven't been allocated. 2602 * We allocate an uninitialized extent if blocks haven't been allocated.
3561 * The extent will be converted to initialized after the IO is complete. 2603 * The extent will be converted to initialized after the IO is complete.
@@ -3638,8 +2680,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
3638 goto out; 2680 goto out;
3639 } 2681 }
3640 2682
3641 io_end->flag = EXT4_IO_END_UNWRITTEN; 2683 /*
2684 * It may be over-defensive to check EXT4_IO_END_UNWRITTEN here,
2685 * but being more careful is always safe for future changes.
2686 */
3642 inode = io_end->inode; 2687 inode = io_end->inode;
2688 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2689 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2690 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
2691 }
3643 2692
3644 /* Add the io_end to per-inode completed io list*/ 2693 /* Add the io_end to per-inode completed io list*/
3645 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 2694 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -4033,383 +3082,6 @@ unlock:
4033 return err; 3082 return err;
4034} 3083}
4035 3084
4036/*
4037 * Probably it should be a library function... search for first non-zero word
4038 * or memcmp with zero_page, whatever is better for particular architecture.
4039 * Linus?
4040 */
4041static inline int all_zeroes(__le32 *p, __le32 *q)
4042{
4043 while (p < q)
4044 if (*p++)
4045 return 0;
4046 return 1;
4047}
4048
4049/**
4050 * ext4_find_shared - find the indirect blocks for partial truncation.
4051 * @inode: inode in question
4052 * @depth: depth of the affected branch
4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4054 * @chain: place to store the pointers to partial indirect blocks
4055 * @top: place to the (detached) top of branch
4056 *
4057 * This is a helper function used by ext4_truncate().
4058 *
4059 * When we do truncate() we may have to clean the ends of several
4060 * indirect blocks but leave the blocks themselves alive. Block is
4061 * partially truncated if some data below the new i_size is referred
4062 * from it (and it is on the path to the first completely truncated
4063 * data block, indeed). We have to free the top of that path along
4064 * with everything to the right of the path. Since no allocation
4065 * past the truncation point is possible until ext4_truncate()
4066 * finishes, we may safely do the latter, but top of branch may
4067 * require special attention - pageout below the truncation point
4068 * might try to populate it.
4069 *
4070 * We atomically detach the top of branch from the tree, store the
4071 * block number of its root in *@top, pointers to buffer_heads of
4072 * partially truncated blocks - in @chain[].bh and pointers to
4073 * their last elements that should not be removed - in
4074 * @chain[].p. Return value is the pointer to last filled element
4075 * of @chain.
4076 *
4077 * The work left to caller to do the actual freeing of subtrees:
4078 * a) free the subtree starting from *@top
4079 * b) free the subtrees whose roots are stored in
4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4081 * c) free the subtrees growing from the inode past the @chain[0].
4082 * (no partially truncated stuff there). */
4083
4084static Indirect *ext4_find_shared(struct inode *inode, int depth,
4085 ext4_lblk_t offsets[4], Indirect chain[4],
4086 __le32 *top)
4087{
4088 Indirect *partial, *p;
4089 int k, err;
4090
4091 *top = 0;
4092 /* Make k index the deepest non-null offset + 1 */
4093 for (k = depth; k > 1 && !offsets[k-1]; k--)
4094 ;
4095 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4096 /* Writer: pointers */
4097 if (!partial)
4098 partial = chain + k-1;
4099 /*
4100 * If the branch acquired continuation since we've looked at it -
4101 * fine, it should all survive and (new) top doesn't belong to us.
4102 */
4103 if (!partial->key && *partial->p)
4104 /* Writer: end */
4105 goto no_top;
4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4107 ;
4108 /*
4109 * OK, we've found the last block that must survive. The rest of our
4110 * branch should be detached before unlocking. However, if that rest
4111 * of branch is all ours and does not grow immediately from the inode
4112 * it's easier to cheat and just decrement partial->p.
4113 */
4114 if (p == chain + k - 1 && p > chain) {
4115 p->p--;
4116 } else {
4117 *top = *p->p;
4118 /* Nope, don't do this in ext4. Must leave the tree intact */
4119#if 0
4120 *p->p = 0;
4121#endif
4122 }
4123 /* Writer: end */
4124
4125 while (partial > p) {
4126 brelse(partial->bh);
4127 partial--;
4128 }
4129no_top:
4130 return partial;
4131}
4132
4133/*
4134 * Zero a number of block pointers in either an inode or an indirect block.
4135 * If we restart the transaction we must again get write access to the
4136 * indirect block for further modification.
4137 *
4138 * We release `count' blocks on disk, but (last - first) may be greater
4139 * than `count' because there can be holes in there.
4140 *
4141 * Return 0 on success, 1 on invalid block range
4142 * and < 0 on fatal error.
4143 */
4144static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4145 struct buffer_head *bh,
4146 ext4_fsblk_t block_to_free,
4147 unsigned long count, __le32 *first,
4148 __le32 *last)
4149{
4150 __le32 *p;
4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4152 int err;
4153
4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4155 flags |= EXT4_FREE_BLOCKS_METADATA;
4156
4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4158 count)) {
4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4160 "blocks %llu len %lu",
4161 (unsigned long long) block_to_free, count);
4162 return 1;
4163 }
4164
4165 if (try_to_extend_transaction(handle, inode)) {
4166 if (bh) {
4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4168 err = ext4_handle_dirty_metadata(handle, inode, bh);
4169 if (unlikely(err))
4170 goto out_err;
4171 }
4172 err = ext4_mark_inode_dirty(handle, inode);
4173 if (unlikely(err))
4174 goto out_err;
4175 err = ext4_truncate_restart_trans(handle, inode,
4176 blocks_for_truncate(inode));
4177 if (unlikely(err))
4178 goto out_err;
4179 if (bh) {
4180 BUFFER_TRACE(bh, "retaking write access");
4181 err = ext4_journal_get_write_access(handle, bh);
4182 if (unlikely(err))
4183 goto out_err;
4184 }
4185 }
4186
4187 for (p = first; p < last; p++)
4188 *p = 0;
4189
4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4191 return 0;
4192out_err:
4193 ext4_std_error(inode->i_sb, err);
4194 return err;
4195}
4196
4197/**
4198 * ext4_free_data - free a list of data blocks
4199 * @handle: handle for this transaction
4200 * @inode: inode we are dealing with
4201 * @this_bh: indirect buffer_head which contains *@first and *@last
4202 * @first: array of block numbers
4203 * @last: points immediately past the end of array
4204 *
4205 * We are freeing all blocks referred from that array (numbers are stored as
4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4207 *
4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4209 * blocks are contiguous then releasing them at one time will only affect one
4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4211 * actually use a lot of journal space.
4212 *
4213 * @this_bh will be %NULL if @first and @last point into the inode's direct
4214 * block pointers.
4215 */
4216static void ext4_free_data(handle_t *handle, struct inode *inode,
4217 struct buffer_head *this_bh,
4218 __le32 *first, __le32 *last)
4219{
4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4221 unsigned long count = 0; /* Number of blocks in the run */
4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4223 corresponding to
4224 block_to_free */
4225 ext4_fsblk_t nr; /* Current block # */
4226 __le32 *p; /* Pointer into inode/ind
4227 for current block */
4228 int err = 0;
4229
4230 if (this_bh) { /* For indirect block */
4231 BUFFER_TRACE(this_bh, "get_write_access");
4232 err = ext4_journal_get_write_access(handle, this_bh);
4233 /* Important: if we can't update the indirect pointers
4234 * to the blocks, we can't free them. */
4235 if (err)
4236 return;
4237 }
4238
4239 for (p = first; p < last; p++) {
4240 nr = le32_to_cpu(*p);
4241 if (nr) {
4242 /* accumulate blocks to free if they're contiguous */
4243 if (count == 0) {
4244 block_to_free = nr;
4245 block_to_free_p = p;
4246 count = 1;
4247 } else if (nr == block_to_free + count) {
4248 count++;
4249 } else {
4250 err = ext4_clear_blocks(handle, inode, this_bh,
4251 block_to_free, count,
4252 block_to_free_p, p);
4253 if (err)
4254 break;
4255 block_to_free = nr;
4256 block_to_free_p = p;
4257 count = 1;
4258 }
4259 }
4260 }
4261
4262 if (!err && count > 0)
4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4264 count, block_to_free_p, p);
4265 if (err < 0)
4266 /* fatal error */
4267 return;
4268
4269 if (this_bh) {
4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4271
4272 /*
4273 * The buffer head should have an attached journal head at this
4274 * point. However, if the data is corrupted and an indirect
4275 * block pointed to itself, it would have been detached when
4276 * the block was cleared. Check for this instead of OOPSing.
4277 */
4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4279 ext4_handle_dirty_metadata(handle, inode, this_bh);
4280 else
4281 EXT4_ERROR_INODE(inode,
4282 "circular indirect block detected at "
4283 "block %llu",
4284 (unsigned long long) this_bh->b_blocknr);
4285 }
4286}
4287
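ext4_free_data() above batches pointers into contiguous runs so that one ext4_clear_blocks() call covers each extent, touching at most a bitmap block or two per run. A compilable userspace sketch of the same run-accumulation loop, with flush_run() as a hypothetical stand-in for ext4_clear_blocks():

#include <stdint.h>
#include <stdio.h>

/* Stand-in for ext4_clear_blocks(): release one contiguous run. */
static void flush_run(uint32_t start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, (unsigned) start);
}

int main(void)
{
	/* zero entries are holes, as in an indirect block */
	uint32_t blocks[] = { 100, 101, 102, 0, 200, 201, 0, 300 };
	uint32_t run_start = 0;
	unsigned long count = 0;
	size_t i;

	for (i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		uint32_t nr = blocks[i];

		if (!nr)
			continue;		/* skip the hole */
		if (count == 0) {
			run_start = nr;		/* open a new run */
			count = 1;
		} else if (nr == run_start + count) {
			count++;		/* extend the current run */
		} else {
			flush_run(run_start, count);
			run_start = nr;		/* start over at nr */
			count = 1;
		}
	}
	if (count)
		flush_run(run_start, count);	/* flush the trailing run */
	return 0;
}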
4288/**
4289 * ext4_free_branches - free an array of branches
4290 * @handle: JBD handle for this transaction
4291 * @inode: inode we are dealing with
4292 * @parent_bh: the buffer_head which contains *@first and *@last
4293 * @first: array of block numbers
4294 * @last: pointer immediately past the end of array
4295 * @depth: depth of the branches to free
4296 *
4297 * We are freeing all blocks referred from these branches (numbers are
4298 * stored as little-endian 32-bit) and updating @inode->i_blocks
4299 * appropriately.
4300 */
4301static void ext4_free_branches(handle_t *handle, struct inode *inode,
4302 struct buffer_head *parent_bh,
4303 __le32 *first, __le32 *last, int depth)
4304{
4305 ext4_fsblk_t nr;
4306 __le32 *p;
4307
4308 if (ext4_handle_is_aborted(handle))
4309 return;
4310
4311 if (depth--) {
4312 struct buffer_head *bh;
4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4314 p = last;
4315 while (--p >= first) {
4316 nr = le32_to_cpu(*p);
4317 if (!nr)
4318 continue; /* A hole */
4319
4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4321 nr, 1)) {
4322 EXT4_ERROR_INODE(inode,
4323 "invalid indirect mapped "
4324 "block %lu (level %d)",
4325 (unsigned long) nr, depth);
4326 break;
4327 }
4328
4329 /* Go read the buffer for the next level down */
4330 bh = sb_bread(inode->i_sb, nr);
4331
4332 /*
4333 * A read failure? Report error and clear slot
4334 * (should be rare).
4335 */
4336 if (!bh) {
4337 EXT4_ERROR_INODE_BLOCK(inode, nr,
4338 "Read failure");
4339 continue;
4340 }
4341
4342 /* This zaps the entire block. Bottom up. */
4343 BUFFER_TRACE(bh, "free child branches");
4344 ext4_free_branches(handle, inode, bh,
4345 (__le32 *) bh->b_data,
4346 (__le32 *) bh->b_data + addr_per_block,
4347 depth);
4348 brelse(bh);
4349
4350 /*
4351 * Everything below this pointer has been
4352 * released. Now let this top-of-subtree go.
4353 *
4354 * We want the freeing of this indirect block to be
4355 * atomic in the journal with the updating of the
4356 * bitmap block which owns it. So make some room in
4357 * the journal.
4358 *
4359 * We zero the parent pointer *after* freeing its
4360 * pointee in the bitmaps, so if extend_transaction()
4361 * for some reason fails to put the bitmap changes and
4362 * the release into the same transaction, recovery
4363 * will merely complain about releasing a free block,
4364 * rather than leaking blocks.
4365 */
4366 if (ext4_handle_is_aborted(handle))
4367 return;
4368 if (try_to_extend_transaction(handle, inode)) {
4369 ext4_mark_inode_dirty(handle, inode);
4370 ext4_truncate_restart_trans(handle, inode,
4371 blocks_for_truncate(inode));
4372 }
4373
4374 /*
4375 * The forget flag here is critical because if
4376 * we are journaling (and not doing data
4377 * journaling), we have to make sure a revoke
4378 * record is written to prevent the journal
4379 * replay from overwriting the (former)
4380 * indirect block if it gets reallocated as a
4381 * data block. This must happen in the same
4382 * transaction where the data blocks are
4383 * actually freed.
4384 */
4385 ext4_free_blocks(handle, inode, NULL, nr, 1,
4386 EXT4_FREE_BLOCKS_METADATA|
4387 EXT4_FREE_BLOCKS_FORGET);
4388
4389 if (parent_bh) {
4390 /*
4391 * The block which we have just freed is
4392 * pointed to by an indirect block: journal it
4393 */
4394 BUFFER_TRACE(parent_bh, "get_write_access");
4395 if (!ext4_journal_get_write_access(handle,
4396 parent_bh)){
4397 *p = 0;
4398 BUFFER_TRACE(parent_bh,
4399 "call ext4_handle_dirty_metadata");
4400 ext4_handle_dirty_metadata(handle,
4401 inode,
4402 parent_bh);
4403 }
4404 }
4405 }
4406 } else {
4407 /* We have reached the bottom of the tree. */
4408 BUFFER_TRACE(parent_bh, "free data blocks");
4409 ext4_free_data(handle, inode, parent_bh, first, last);
4410 }
4411}
4412
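ext4_free_branches() frees depth-first and bottom-up: every child subtree is released before the block that points to it, so the on-disk tree stays consistent at each step. A small userspace sketch of that recursion, with a toy two-child node standing in for an indirect block:

#include <stdio.h>

/* A toy "indirect block" with up to two children; depth counts the
 * remaining levels of indirection, as in ext4_free_branches(). */
struct blk {
	struct blk *child[2];
	int nr;			/* pretend block number */
};

static void free_branches(struct blk *b, int depth)
{
	int i;

	if (!b)
		return;		/* a hole */
	if (depth)
		for (i = 0; i < 2; i++)
			free_branches(b->child[i], depth - 1);
	printf("free block %d\n", b->nr);	/* parent goes last */
}

int main(void)
{
	struct blk leaf1 = { { NULL, NULL }, 11 };
	struct blk leaf2 = { { NULL, NULL }, 12 };
	struct blk ind   = { { &leaf1, &leaf2 }, 1 };

	free_branches(&ind, 1);	/* prints 11, 12, then 1 */
	return 0;
}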
4413int ext4_can_truncate(struct inode *inode) 3085int ext4_can_truncate(struct inode *inode)
4414{ 3086{
4415 if (S_ISREG(inode->i_mode)) 3087 if (S_ISREG(inode->i_mode))
@@ -4476,19 +3148,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4476 */ 3148 */
4477void ext4_truncate(struct inode *inode) 3149void ext4_truncate(struct inode *inode)
4478{ 3150{
4479 handle_t *handle;
4480 struct ext4_inode_info *ei = EXT4_I(inode);
4481 __le32 *i_data = ei->i_data;
4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4483 struct address_space *mapping = inode->i_mapping;
4484 ext4_lblk_t offsets[4];
4485 Indirect chain[4];
4486 Indirect *partial;
4487 __le32 nr = 0;
4488 int n = 0;
4489 ext4_lblk_t last_block, max_block;
4490 unsigned blocksize = inode->i_sb->s_blocksize;
4491
4492 trace_ext4_truncate_enter(inode); 3151 trace_ext4_truncate_enter(inode);
4493 3152
4494 if (!ext4_can_truncate(inode)) 3153 if (!ext4_can_truncate(inode))
@@ -4499,149 +3158,11 @@ void ext4_truncate(struct inode *inode)
4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3158 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3159 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4501 3160
4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3161 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4503 ext4_ext_truncate(inode); 3162 ext4_ext_truncate(inode);
4504 trace_ext4_truncate_exit(inode); 3163 else
4505 return; 3164 ext4_ind_truncate(inode);
4506 }
4507
4508 handle = start_transaction(inode);
4509 if (IS_ERR(handle))
4510 return; /* AKPM: return what? */
4511
4512 last_block = (inode->i_size + blocksize-1)
4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4516
4517 if (inode->i_size & (blocksize - 1))
4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4519 goto out_stop;
4520
4521 if (last_block != max_block) {
4522 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4523 if (n == 0)
4524 goto out_stop; /* error */
4525 }
4526
4527 /*
4528 * OK. This truncate is going to happen. We add the inode to the
4529 * orphan list, so that if this truncate spans multiple transactions,
4530 * and we crash, we will resume the truncate when the filesystem
4531 * recovers. It also marks the inode dirty, to catch the new size.
4532 *
4533 * Implication: the file must always be in a sane, consistent
4534 * truncatable state while each transaction commits.
4535 */
4536 if (ext4_orphan_add(handle, inode))
4537 goto out_stop;
4538
4539 /*
4540 * From here we block out all ext4_get_block() callers who want to
4541 * modify the block allocation tree.
4542 */
4543 down_write(&ei->i_data_sem);
4544
4545 ext4_discard_preallocations(inode);
4546
4547 /*
4548 * The orphan list entry will now protect us from any crash which
4549 * occurs before the truncate completes, so it is now safe to propagate
4550 * the new, shorter inode size (held for now in i_size) into the
4551 * on-disk inode. We do this via i_disksize, which is the value which
4552 * ext4 *really* writes onto the disk inode.
4553 */
4554 ei->i_disksize = inode->i_size;
4555
4556 if (last_block == max_block) {
4557 /*
4558 * It is unnecessary to free any data blocks if last_block is
4559 * equal to the indirect block limit.
4560 */
4561 goto out_unlock;
4562 } else if (n == 1) { /* direct blocks */
4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4564 i_data + EXT4_NDIR_BLOCKS);
4565 goto do_indirects;
4566 }
4567
4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4569 /* Kill the top of shared branch (not detached) */
4570 if (nr) {
4571 if (partial == chain) {
4572 /* Shared branch grows from the inode */
4573 ext4_free_branches(handle, inode, NULL,
4574 &nr, &nr+1, (chain+n-1) - partial);
4575 *partial->p = 0;
4576 /*
4577 * We mark the inode dirty prior to restart,
4578 * and prior to stop. No need for it here.
4579 */
4580 } else {
4581 /* Shared branch grows from an indirect block */
4582 BUFFER_TRACE(partial->bh, "get_write_access");
4583 ext4_free_branches(handle, inode, partial->bh,
4584 partial->p,
4585 partial->p+1, (chain+n-1) - partial);
4586 }
4587 }
4588 /* Clear the ends of indirect blocks on the shared branch */
4589 while (partial > chain) {
4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4591 (__le32*)partial->bh->b_data+addr_per_block,
4592 (chain+n-1) - partial);
4593 BUFFER_TRACE(partial->bh, "call brelse");
4594 brelse(partial->bh);
4595 partial--;
4596 }
4597do_indirects:
4598 /* Kill the remaining (whole) subtrees */
4599 switch (offsets[0]) {
4600 default:
4601 nr = i_data[EXT4_IND_BLOCK];
4602 if (nr) {
4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4604 i_data[EXT4_IND_BLOCK] = 0;
4605 }
4606 case EXT4_IND_BLOCK:
4607 nr = i_data[EXT4_DIND_BLOCK];
4608 if (nr) {
4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4610 i_data[EXT4_DIND_BLOCK] = 0;
4611 }
4612 case EXT4_DIND_BLOCK:
4613 nr = i_data[EXT4_TIND_BLOCK];
4614 if (nr) {
4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4616 i_data[EXT4_TIND_BLOCK] = 0;
4617 }
4618 case EXT4_TIND_BLOCK:
4619 ;
4620 }
4621
4622out_unlock:
4623 up_write(&ei->i_data_sem);
4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4625 ext4_mark_inode_dirty(handle, inode);
4626
4627 /*
4628 * In a multi-transaction truncate, we only make the final transaction
4629 * synchronous
4630 */
4631 if (IS_SYNC(inode))
4632 ext4_handle_sync(handle);
4633out_stop:
4634 /*
4635 * If this was a simple ftruncate(), and the file will remain alive
4636 * then we need to clear up the orphan record which we created above.
4637 * However, if this was a real unlink then we were called by
4638 * ext4_delete_inode(), and we allow that function to clean up the
4639 * orphan info for us.
4640 */
4641 if (inode->i_nlink)
4642 ext4_orphan_del(handle, inode);
4643 3165
4644 ext4_journal_stop(handle);
4645 trace_ext4_truncate_exit(inode); 3166 trace_ext4_truncate_exit(inode);
4646} 3167}
4647 3168
@@ -5012,7 +3533,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5012 (S_ISLNK(inode->i_mode) && 3533 (S_ISLNK(inode->i_mode) &&
5013 !ext4_inode_is_fast_symlink(inode))) { 3534 !ext4_inode_is_fast_symlink(inode))) {
5014 /* Validate block references which are part of inode */ 3535 /* Validate block references which are part of inode */
5015 ret = ext4_check_inode_blockref(inode); 3536 ret = ext4_ind_check_inode(inode);
5016 } 3537 }
5017 if (ret) 3538 if (ret)
5018 goto bad_inode; 3539 goto bad_inode;
@@ -5459,34 +3980,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5459 return 0; 3980 return 0;
5460} 3981}
5461 3982
5462static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5463 int chunk)
5464{
5465 int indirects;
5466
5467 /* if nrblocks are contiguous */
5468 if (chunk) {
5469 /*
5470 * With N contiguous data blocks, we need at most
5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5472 * 2 dindirect blocks, and 1 tindirect block
5473 */
5474 return DIV_ROUND_UP(nrblocks,
5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5476 }
5477 /*
5478 * if nrblocks are not contiguous, worst case, each block touches
5479 * an indirect block, and each indirect block touches a double indirect
5480 * block, plus a triple indirect block
5481 */
5482 indirects = nrblocks * 2 + 1;
5483 return indirects;
5484}
5485
5486static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 3983static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5487{ 3984{
5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3985 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 3986 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 3987 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5491} 3988}
5492 3989
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 808c554e773f..f18bfe37aff8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,8 +202,9 @@ setversion_out:
202 struct super_block *sb = inode->i_sb; 202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 203 int err, err2=0;
204 204
205 if (!capable(CAP_SYS_RESOURCE)) 205 err = ext4_resize_begin(sb);
206 return -EPERM; 206 if (err)
207 return err;
207 208
208 if (get_user(n_blocks_count, (__u32 __user *)arg)) 209 if (get_user(n_blocks_count, (__u32 __user *)arg))
209 return -EFAULT; 210 return -EFAULT;
@@ -221,6 +222,7 @@ setversion_out:
221 if (err == 0) 222 if (err == 0)
222 err = err2; 223 err = err2;
223 mnt_drop_write(filp->f_path.mnt); 224 mnt_drop_write(filp->f_path.mnt);
225 ext4_resize_end(sb);
224 226
225 return err; 227 return err;
226 } 228 }
@@ -271,8 +273,9 @@ mext_out:
271 struct super_block *sb = inode->i_sb; 273 struct super_block *sb = inode->i_sb;
272 int err, err2=0; 274 int err, err2=0;
273 275
274 if (!capable(CAP_SYS_RESOURCE)) 276 err = ext4_resize_begin(sb);
275 return -EPERM; 277 if (err)
278 return err;
276 279
277 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 280 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
278 sizeof(input))) 281 sizeof(input)))
@@ -291,6 +294,7 @@ mext_out:
291 if (err == 0) 294 if (err == 0)
292 err = err2; 295 err = err2;
293 mnt_drop_write(filp->f_path.mnt); 296 mnt_drop_write(filp->f_path.mnt);
297 ext4_resize_end(sb);
294 298
295 return err; 299 return err;
296 } 300 }
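The ioctl changes above replace the bare capable() check with an ext4_resize_begin()/ext4_resize_end() pair, which also serializes concurrent resizers. A hedged sketch of that begin/end guard using a C11 atomic flag (the names are analogues, not the real ext4 helpers):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

/* One flag serializes all resizers; testing and setting it atomically
 * rejects a second resize while the first is still running. */
static atomic_flag resizing = ATOMIC_FLAG_INIT;

static int resize_begin(void)
{
	if (atomic_flag_test_and_set(&resizing))
		return -EBUSY;	/* another resize is already running */
	return 0;
}

static void resize_end(void)
{
	atomic_flag_clear(&resizing);
}

int main(void)
{
	int err = resize_begin();

	if (err) {
		printf("resize busy: %d\n", err);
		return 1;
	}
	/* ... grow the filesystem here ... */
	resize_end();
	return 0;
}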
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d56850..17a5a57c415a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -75,8 +75,8 @@
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
78 * space we will consume the particular prealloc space. This make sure that 78 * space we will consume the particular prealloc space. This makes sure that
79 * that the we have contiguous physical blocks representing the file blocks 79 * we have contiguous physical blocks representing the file blocks
80 * 80 *
81 * The important thing to be noted in case of inode prealloc space is that 81 * The important thing to be noted in case of inode prealloc space is that
82 * we don't modify the values associated to inode prealloc space except 82 * we don't modify the values associated to inode prealloc space except
@@ -84,7 +84,7 @@
84 * 84 *
85 * If we are not able to find blocks in the inode prealloc space and if we 85 * If we are not able to find blocks in the inode prealloc space and if we
86 * have the group allocation flag set then we look at the locality group 86 * have the group allocation flag set then we look at the locality group
87 * prealloc space. These are per CPU prealloc list repreasented as 87 * prealloc space. These are per CPU prealloc list represented as
88 * 88 *
89 * ext4_sb_info.s_locality_groups[smp_processor_id()] 89 * ext4_sb_info.s_locality_groups[smp_processor_id()]
90 * 90 *
@@ -128,12 +128,13 @@
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
130 * 512 blocks. This can be tuned via 130 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in 131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 132 * terms of number of blocks. If we have mounted the file system with -O
133 * stripe=<value> option the group prealloc request is normalized to the 133 * stripe=<value> option the group prealloc request is normalized to the
134 stripe value (sbi->s_stripe) 134 smallest multiple of the stripe value (sbi->s_stripe) which is
135 * greater than the default mb_group_prealloc.
135 * 136 *
136 * The regular allocator(using the buddy cache) supports few tunables. 137 * The regular allocator (using the buddy cache) supports a few tunables.
137 * 138 *
138 * /sys/fs/ext4/<partition>/mb_min_to_scan 139 * /sys/fs/ext4/<partition>/mb_min_to_scan
139 * /sys/fs/ext4/<partition>/mb_max_to_scan 140 * /sys/fs/ext4/<partition>/mb_max_to_scan
@@ -152,7 +153,7 @@
152 * best extent in the found extents. Searching for the blocks starts with 153 * best extent in the found extents. Searching for the blocks starts with
153 * the group specified as the goal value in allocation context via 154 * the group specified as the goal value in allocation context via
154 * ac_g_ex. Each group is first checked based on the criteria whether it 155 * ac_g_ex. Each group is first checked based on the criteria whether it
155 * can used for allocation. ext4_mb_good_group explains how the groups are 156 * can be used for allocation. ext4_mb_good_group explains how the groups are
156 * checked. 157 * checked.
157 * 158 *
158 * Both the prealloc space are getting populated as above. So for the first 159 * Both the prealloc space are getting populated as above. So for the first
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
492 b2 = (unsigned char *) bitmap; 493 b2 = (unsigned char *) bitmap;
493 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 494 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
494 if (b1[i] != b2[i]) { 495 if (b1[i] != b2[i]) {
495 printk(KERN_ERR "corruption in group %u " 496 ext4_msg(e4b->bd_sb, KERN_ERR,
496 "at byte %u(%u): %x in copy != %x " 497 "corruption in group %u "
497 "on disk/prealloc\n", 498 "at byte %u(%u): %x in copy != %x "
498 e4b->bd_group, i, i * 8, b1[i], b2[i]); 499 "on disk/prealloc",
500 e4b->bd_group, i, i * 8, b1[i], b2[i]);
499 BUG(); 501 BUG();
500 } 502 }
501 } 503 }
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1125 grp = ext4_get_group_info(sb, group); 1127 grp = ext4_get_group_info(sb, group);
1126 1128
1127 e4b->bd_blkbits = sb->s_blocksize_bits; 1129 e4b->bd_blkbits = sb->s_blocksize_bits;
1128 e4b->bd_info = ext4_get_group_info(sb, group); 1130 e4b->bd_info = grp;
1129 e4b->bd_sb = sb; 1131 e4b->bd_sb = sb;
1130 e4b->bd_group = group; 1132 e4b->bd_group = group;
1131 e4b->bd_buddy_page = NULL; 1133 e4b->bd_buddy_page = NULL;
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len)
1281 } 1283 }
1282} 1284}
1283 1285
1284static void mb_set_bits(void *bm, int cur, int len) 1286void ext4_set_bits(void *bm, int cur, int len)
1285{ 1287{
1286 __u32 *addr; 1288 __u32 *addr;
1287 1289
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1510 } 1512 }
1511 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1513 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1512 1514
1513 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1515 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1514 mb_check_buddy(e4b); 1516 mb_check_buddy(e4b);
1515 1517
1516 return ret; 1518 return ret;
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2223 EXT4_DESC_PER_BLOCK_BITS(sb); 2225 EXT4_DESC_PER_BLOCK_BITS(sb);
2224 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2226 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2225 if (meta_group_info == NULL) { 2227 if (meta_group_info == NULL) {
2226 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2228 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
2227 "buddy group\n"); 2229 "for a buddy group");
2228 goto exit_meta_group_info; 2230 goto exit_meta_group_info;
2229 } 2231 }
2230 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2232 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2237 2239
2238 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2240 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2239 if (meta_group_info[i] == NULL) { 2241 if (meta_group_info[i] == NULL) {
2240 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2242 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
2241 goto exit_group_info; 2243 goto exit_group_info;
2242 } 2244 }
2243 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2245 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2279 2281
2280exit_group_info: 2282exit_group_info:
2281 /* If a meta_group_info table has been allocated, release it now */ 2283 /* If a meta_group_info table has been allocated, release it now */
2282 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2284 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2283 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2285 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2286 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2287 }
2284exit_meta_group_info: 2288exit_meta_group_info:
2285 return -ENOMEM; 2289 return -ENOMEM;
2286} /* ext4_mb_add_groupinfo */ 2290} /* ext4_mb_add_groupinfo */
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
2328 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2332 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2329 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2333 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2330 * So a two level scheme suffices for now. */ 2334 * So a two level scheme suffices for now. */
2331 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); 2335 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
2332 if (sbi->s_group_info == NULL) { 2336 if (sbi->s_group_info == NULL) {
2333 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2337 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2334 return -ENOMEM; 2338 return -ENOMEM;
2335 } 2339 }
2336 sbi->s_buddy_cache = new_inode(sb); 2340 sbi->s_buddy_cache = new_inode(sb);
2337 if (sbi->s_buddy_cache == NULL) { 2341 if (sbi->s_buddy_cache == NULL) {
2338 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2342 ext4_msg(sb, KERN_ERR, "can't get new inode");
2339 goto err_freesgi; 2343 goto err_freesgi;
2340 } 2344 }
2341 sbi->s_buddy_cache->i_ino = get_next_ino(); 2345 /* To avoid potentially colliding with a valid on-disk inode number,
2346 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
2347 * not in the inode hash, so it should never be found by iget(), but
2348 * this will avoid confusion if it ever shows up during debugging. */
2349 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2342 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2350 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2343 for (i = 0; i < ngroups; i++) { 2351 for (i = 0; i < ngroups; i++) {
2344 desc = ext4_get_group_desc(sb, i, NULL); 2352 desc = ext4_get_group_desc(sb, i, NULL);
2345 if (desc == NULL) { 2353 if (desc == NULL) {
2346 printk(KERN_ERR 2354 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2347 "EXT4-fs: can't read descriptor %u\n", i);
2348 goto err_freebuddy; 2355 goto err_freebuddy;
2349 } 2356 }
2350 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2357 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2362,7 +2369,7 @@ err_freebuddy:
2362 kfree(sbi->s_group_info[i]); 2369 kfree(sbi->s_group_info[i]);
2363 iput(sbi->s_buddy_cache); 2370 iput(sbi->s_buddy_cache);
2364err_freesgi: 2371err_freesgi:
2365 kfree(sbi->s_group_info); 2372 ext4_kvfree(sbi->s_group_info);
2366 return -ENOMEM; 2373 return -ENOMEM;
2367} 2374}
2368 2375
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size)
2404 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2411 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405 NULL); 2412 NULL);
2406 2413
2414 ext4_groupinfo_caches[cache_index] = cachep;
2415
2407 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2416 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408 if (!cachep) { 2417 if (!cachep) {
2409 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); 2418 printk(KERN_EMERG
2419 "EXT4-fs: no memory for groupinfo slab cache\n");
2410 return -ENOMEM; 2420 return -ENOMEM;
2411 } 2421 }
2412 2422
2413 ext4_groupinfo_caches[cache_index] = cachep;
2414
2415 return 0; 2423 return 0;
2416} 2424}
2417 2425
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2457 i++; 2465 i++;
2458 } while (i <= sb->s_blocksize_bits + 1); 2466 } while (i <= sb->s_blocksize_bits + 1);
2459 2467
2460 /* init file for buddy data */
2461 ret = ext4_mb_init_backend(sb);
2462 if (ret != 0) {
2463 goto out;
2464 }
2465
2466 spin_lock_init(&sbi->s_md_lock); 2468 spin_lock_init(&sbi->s_md_lock);
2467 spin_lock_init(&sbi->s_bal_lock); 2469 spin_lock_init(&sbi->s_bal_lock);
2468 2470
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2472 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2473 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2474 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2477 /*
2478 * If there is an s_stripe > 1, then we set s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than
2480 * the s_mb_group_prealloc as determined above. We want
2481 * the preallocation size to be an exact multiple of the
2482 * RAID stripe size so that preallocations don't fragment
2483 * the stripes.
2484 */
2485 if (sbi->s_stripe > 1) {
2486 sbi->s_mb_group_prealloc = roundup(
2487 sbi->s_mb_group_prealloc, sbi->s_stripe);
2488 }
2475 2489
2476 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2477 if (sbi->s_locality_groups == NULL) { 2491 if (sbi->s_locality_groups == NULL) {
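The stripe normalization above relies on roundup(), which returns the smallest multiple of its second argument that is not below the first. A tiny standalone check of that arithmetic (the stripe width of 48 is a made-up example value):

#include <stdio.h>

/* Matches the kernel's roundup() arithmetic: smallest multiple of y >= x */
#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned int prealloc = 512;	/* default mb_group_prealloc */
	unsigned int stripe = 48;	/* hypothetical -o stripe=48 */

	/* 512 is not a multiple of 48, so this prints 528 (11 * 48) */
	printf("group prealloc: %u -> %u blocks\n",
	       prealloc, roundup(prealloc, stripe));
	return 0;
}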
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2487 spin_lock_init(&lg->lg_prealloc_lock); 2501 spin_lock_init(&lg->lg_prealloc_lock);
2488 } 2502 }
2489 2503
2504 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) {
2507 goto out;
2508 }
2509
2490 if (sbi->s_proc) 2510 if (sbi->s_proc)
2491 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2492 &ext4_mb_seq_groups_fops, sb); 2512 &ext4_mb_seq_groups_fops, sb);
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb)
2544 EXT4_DESC_PER_BLOCK_BITS(sb); 2564 EXT4_DESC_PER_BLOCK_BITS(sb);
2545 for (i = 0; i < num_meta_group_infos; i++) 2565 for (i = 0; i < num_meta_group_infos; i++)
2546 kfree(sbi->s_group_info[i]); 2566 kfree(sbi->s_group_info[i]);
2547 kfree(sbi->s_group_info); 2567 ext4_kvfree(sbi->s_group_info);
2548 } 2568 }
2549 kfree(sbi->s_mb_offsets); 2569 kfree(sbi->s_mb_offsets);
2550 kfree(sbi->s_mb_maxs); 2570 kfree(sbi->s_mb_maxs);
2551 if (sbi->s_buddy_cache) 2571 if (sbi->s_buddy_cache)
2552 iput(sbi->s_buddy_cache); 2572 iput(sbi->s_buddy_cache);
2553 if (sbi->s_mb_stats) { 2573 if (sbi->s_mb_stats) {
2554 printk(KERN_INFO 2574 ext4_msg(sb, KERN_INFO,
2555 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2575 "mballoc: %u blocks %u reqs (%u success)",
2556 atomic_read(&sbi->s_bal_allocated), 2576 atomic_read(&sbi->s_bal_allocated),
2557 atomic_read(&sbi->s_bal_reqs), 2577 atomic_read(&sbi->s_bal_reqs),
2558 atomic_read(&sbi->s_bal_success)); 2578 atomic_read(&sbi->s_bal_success));
2559 printk(KERN_INFO 2579 ext4_msg(sb, KERN_INFO,
2560 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2580 "mballoc: %u extents scanned, %u goal hits, "
2561 "%u 2^N hits, %u breaks, %u lost\n", 2581 "%u 2^N hits, %u breaks, %u lost",
2562 atomic_read(&sbi->s_bal_ex_scanned), 2582 atomic_read(&sbi->s_bal_ex_scanned),
2563 atomic_read(&sbi->s_bal_goals), 2583 atomic_read(&sbi->s_bal_goals),
2564 atomic_read(&sbi->s_bal_2orders), 2584 atomic_read(&sbi->s_bal_2orders),
2565 atomic_read(&sbi->s_bal_breaks), 2585 atomic_read(&sbi->s_bal_breaks),
2566 atomic_read(&sbi->s_mb_lost_chunks)); 2586 atomic_read(&sbi->s_mb_lost_chunks));
2567 printk(KERN_INFO 2587 ext4_msg(sb, KERN_INFO,
2568 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2588 "mballoc: %lu generated and it took %Lu",
2569 sbi->s_mb_buddies_generated++, 2589 sbi->s_mb_buddies_generated,
2570 sbi->s_mb_generation_time); 2590 sbi->s_mb_generation_time);
2571 printk(KERN_INFO 2591 ext4_msg(sb, KERN_INFO,
2572 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2592 "mballoc: %u preallocated, %u discarded",
2573 atomic_read(&sbi->s_mb_preallocated), 2593 atomic_read(&sbi->s_mb_preallocated),
2574 atomic_read(&sbi->s_mb_discarded)); 2594 atomic_read(&sbi->s_mb_discarded));
2575 } 2595 }
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2628 rb_erase(&entry->node, &(db->bb_free_root)); 2648 rb_erase(&entry->node, &(db->bb_free_root));
2629 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2630 2650
2651 /*
2652 * Clear the trimmed flag for the group so that the next
2653 * ext4_trim_fs can trim it.
2654 * If the volume is mounted with -o discard, online discard
2655 * is supported and the free blocks will be trimmed online.
2656 */
2657 if (!test_opt(sb, DISCARD))
2658 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2659
2631 if (!db->bb_free_root.rb_node) { 2660 if (!db->bb_free_root.rb_node) {
2632 /* No more items in the per group rb tree 2661 /* No more items in the per group rb tree
2633 * balance refcounts from ext4_mb_free_metadata() 2662 * balance refcounts from ext4_mb_free_metadata()
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2771 * We leak some of the blocks here. 2800 * We leak some of the blocks here.
2772 */ 2801 */
2773 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2802 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2774 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2803 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2775 ac->ac_b_ex.fe_len); 2804 ac->ac_b_ex.fe_len);
2776 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2805 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2777 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2806 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2778 if (!err) 2807 if (!err)
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2790 } 2819 }
2791 } 2820 }
2792#endif 2821#endif
2793 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); 2822 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2823 ac->ac_b_ex.fe_len);
2794 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2795 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2796 ext4_free_blks_set(sb, gdp, 2826 ext4_free_blks_set(sb, gdp,
@@ -2830,8 +2860,9 @@ out_err:
2830 2860
2831/* 2861/*
2832 * here we normalize request for locality group 2862 * here we normalize request for locality group
2833 * Group requests are normalized to s_stripe size if we set the same via mount 2863 * Group requests are normalized to s_mb_group_prealloc, which goes to
2834 * option. If not we set it to s_mb_group_prealloc which can be configured via 2864 * s_stripe if we set the same via mount option.
2865 * s_mb_group_prealloc can be configured via
2835 * /sys/fs/ext4/<partition>/mb_group_prealloc 2866 * /sys/fs/ext4/<partition>/mb_group_prealloc
2836 * 2867 *
2837 * XXX: should we try to preallocate more than the group has now? 2868 * XXX: should we try to preallocate more than the group has now?
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2842 struct ext4_locality_group *lg = ac->ac_lg; 2873 struct ext4_locality_group *lg = ac->ac_lg;
2843 2874
2844 BUG_ON(lg == NULL); 2875 BUG_ON(lg == NULL);
2845 if (EXT4_SB(sb)->s_stripe) 2876 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2846 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2847 else
2848 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2849 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2877 mb_debug(1, "#%u: goal %u blocks for locality group\n",
2850 current->pid, ac->ac_g_ex.fe_len); 2878 current->pid, ac->ac_g_ex.fe_len);
2851} 2879}
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3001 3029
3002 if (start + size <= ac->ac_o_ex.fe_logical && 3030 if (start + size <= ac->ac_o_ex.fe_logical &&
3003 start > ac->ac_o_ex.fe_logical) { 3031 start > ac->ac_o_ex.fe_logical) {
3004 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3032 ext4_msg(ac->ac_sb, KERN_ERR,
3005 (unsigned long) start, (unsigned long) size, 3033 "start %lu, size %lu, fe_logical %lu",
3006 (unsigned long) ac->ac_o_ex.fe_logical); 3034 (unsigned long) start, (unsigned long) size,
3035 (unsigned long) ac->ac_o_ex.fe_logical);
3007 } 3036 }
3008 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3009 start > ac->ac_o_ex.fe_logical); 3038 start > ac->ac_o_ex.fe_logical);
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3262 3291
3263 while (n) { 3292 while (n) {
3264 entry = rb_entry(n, struct ext4_free_data, node); 3293 entry = rb_entry(n, struct ext4_free_data, node);
3265 mb_set_bits(bitmap, entry->start_blk, entry->count); 3294 ext4_set_bits(bitmap, entry->start_blk, entry->count);
3266 n = rb_next(n); 3295 n = rb_next(n);
3267 } 3296 }
3268 return; 3297 return;
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3304 if (unlikely(len == 0)) 3333 if (unlikely(len == 0))
3305 continue; 3334 continue;
3306 BUG_ON(groupnr != group); 3335 BUG_ON(groupnr != group);
3307 mb_set_bits(bitmap, start, len); 3336 ext4_set_bits(bitmap, start, len);
3308 preallocated += len; 3337 preallocated += len;
3309 count++; 3338 count++;
3310 } 3339 }
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3584 bit = next + 1; 3613 bit = next + 1;
3585 } 3614 }
3586 if (free != pa->pa_free) { 3615 if (free != pa->pa_free) {
3587 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3616 ext4_msg(e4b->bd_sb, KERN_CRIT,
3588 pa, (unsigned long) pa->pa_lstart, 3617 "pa %p: logic %lu, phys. %lu, len %lu",
3589 (unsigned long) pa->pa_pstart, 3618 pa, (unsigned long) pa->pa_lstart,
3590 (unsigned long) pa->pa_len); 3619 (unsigned long) pa->pa_pstart,
3620 (unsigned long) pa->pa_len);
3591 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3621 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3592 free, pa->pa_free); 3622 free, pa->pa_free);
3593 /* 3623 /*
@@ -3775,7 +3805,8 @@ repeat:
3775 * use preallocation while we're discarding it */ 3805 * use preallocation while we're discarding it */
3776 spin_unlock(&pa->pa_lock); 3806 spin_unlock(&pa->pa_lock);
3777 spin_unlock(&ei->i_prealloc_lock); 3807 spin_unlock(&ei->i_prealloc_lock);
3778 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 3808 ext4_msg(sb, KERN_ERR,
3809 "uh-oh! used pa while discarding");
3779 WARN_ON(1); 3810 WARN_ON(1);
3780 schedule_timeout_uninterruptible(HZ); 3811 schedule_timeout_uninterruptible(HZ);
3781 goto repeat; 3812 goto repeat;
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3852 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3883 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3853 return; 3884 return;
3854 3885
3855 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3886 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
3856 " Allocation context details:\n"); 3887 " Allocation context details:");
3857 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3888 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
3858 ac->ac_status, ac->ac_flags); 3889 ac->ac_status, ac->ac_flags);
3859 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 3890 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
3860 "best %lu/%lu/%lu@%lu cr %d\n", 3891 "goal %lu/%lu/%lu@%lu, "
3892 "best %lu/%lu/%lu@%lu cr %d",
3861 (unsigned long)ac->ac_o_ex.fe_group, 3893 (unsigned long)ac->ac_o_ex.fe_group,
3862 (unsigned long)ac->ac_o_ex.fe_start, 3894 (unsigned long)ac->ac_o_ex.fe_start,
3863 (unsigned long)ac->ac_o_ex.fe_len, 3895 (unsigned long)ac->ac_o_ex.fe_len,
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3871 (unsigned long)ac->ac_b_ex.fe_len, 3903 (unsigned long)ac->ac_b_ex.fe_len,
3872 (unsigned long)ac->ac_b_ex.fe_logical, 3904 (unsigned long)ac->ac_b_ex.fe_logical,
3873 (int)ac->ac_criteria); 3905 (int)ac->ac_criteria);
3874 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 3906 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
3875 ac->ac_found); 3907 ac->ac_ex_scanned, ac->ac_found);
3876 printk(KERN_ERR "EXT4-fs: groups: \n"); 3908 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
3877 ngroups = ext4_get_groups_count(sb); 3909 ngroups = ext4_get_groups_count(sb);
3878 for (i = 0; i < ngroups; i++) { 3910 for (i = 0; i < ngroups; i++) {
3879 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3911 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4637,7 +4669,7 @@ do_more:
4637 } 4669 }
4638 ext4_mark_super_dirty(sb); 4670 ext4_mark_super_dirty(sb);
4639error_return: 4671error_return:
4640 if (freed) 4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4641 dquot_free_block(inode, freed); 4673 dquot_free_block(inode, freed);
4642 brelse(bitmap_bh); 4674 brelse(bitmap_bh);
4643 ext4_std_error(sb, err); 4675 ext4_std_error(sb, err);
@@ -4645,7 +4677,7 @@ error_return:
4645} 4677}
4646 4678
4647/** 4679/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group 4680 * ext4_group_add_blocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction 4681 * @handle: handle to this transaction
4650 * @sb: super block 4682 * @sb: super block
4651 * @block: start physical block to add to the block group 4683 * @block: start physical block to add to the block group
@@ -4653,7 +4685,7 @@ error_return:
4653 * 4685 *
4654 * This marks the blocks as free in the bitmap and buddy. 4686 * This marks the blocks as free in the bitmap and buddy.
4655 */ 4687 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 4688int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count) 4689 ext4_fsblk_t block, unsigned long count)
4658{ 4690{
4659 struct buffer_head *bitmap_bh = NULL; 4691 struct buffer_head *bitmap_bh = NULL;
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4666 struct ext4_buddy e4b; 4698 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count; 4699 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed; 4700 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670 4701
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 4702 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672 4703
4704 if (count == 0)
4705 return 0;
4706
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4707 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /* 4708 /*
4676 * Check to see if we are freeing blocks across a group 4709 * Check to see if we are freeing blocks across a group
4677 * boundary. 4710 * boundary.
4678 */ 4711 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) 4712 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4713 ext4_warning(sb, "too many blocks added to group %u\n",
4714 block_group);
4715 err = -EINVAL;
4680 goto error_return; 4716 goto error_return;
4717 }
4681 4718
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4719 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh) 4720 if (!bitmap_bh) {
4721 err = -EIO;
4684 goto error_return; 4722 goto error_return;
4723 }
4724
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 4725 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc) 4726 if (!desc) {
4727 err = -EIO;
4687 goto error_return; 4728 goto error_return;
4729 }
4688 4730
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) || 4731 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) || 4732 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4694 ext4_error(sb, "Adding blocks in system zones - " 4736 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu", 4737 "Block = %llu, count = %lu",
4696 block, count); 4738 block, count);
4739 err = -EINVAL;
4697 goto error_return; 4740 goto error_return;
4698 } 4741 }
4699 4742
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4762error_return: 4805error_return:
4763 brelse(bitmap_bh); 4806 brelse(bitmap_bh);
4764 ext4_std_error(sb, err); 4807 ext4_std_error(sb, err);
4765 return; 4808 return err;
4766} 4809}
4767 4810
4768/** 4811/**
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4782{ 4825{
4783 struct ext4_free_extent ex; 4826 struct ext4_free_extent ex;
4784 4827
4828 trace_ext4_trim_extent(sb, group, start, count);
4829
4785 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4830 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786 4831
4787 ex.fe_start = start; 4832 ex.fe_start = start;
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4802/** 4847/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group 4848 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb: super block for file system 4849 * @sb: super block for file system
4805 * @e4b: ext4 buddy 4850 * @group: group to be trimmed
4806 * @start: first group block to examine 4851 * @start: first group block to examine
4807 * @max: last group block to examine 4852 * @max: last group block to examine
4808 * @minblocks: minimum extent block count 4853 * @minblocks: minimum extent block count
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4823 ext4_grpblk_t minblocks) 4868 ext4_grpblk_t minblocks)
4824{ 4869{
4825 void *bitmap; 4870 void *bitmap;
4826 ext4_grpblk_t next, count = 0; 4871 ext4_grpblk_t next, count = 0, free_count = 0;
4827 struct ext4_buddy e4b; 4872 struct ext4_buddy e4b;
4828 int ret; 4873 int ret;
4829 4874
4875 trace_ext4_trim_all_free(sb, group, start, max);
4876
4830 ret = ext4_mb_load_buddy(sb, group, &e4b); 4877 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) { 4878 if (ret) {
4832 ext4_error(sb, "Error in loading buddy " 4879 ext4_error(sb, "Error in loading buddy "
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4836 bitmap = e4b.bd_bitmap; 4883 bitmap = e4b.bd_bitmap;
4837 4884
4838 ext4_lock_group(sb, group); 4885 ext4_lock_group(sb, group);
4886 if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
4887 minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
4888 goto out;
4889
4839 start = (e4b.bd_info->bb_first_free > start) ? 4890 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start; 4891 e4b.bd_info->bb_first_free : start;
4841 4892
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4850 next - start, group, &e4b); 4901 next - start, group, &e4b);
4851 count += next - start; 4902 count += next - start;
4852 } 4903 }
4904 free_count += next - start;
4853 start = next + 1; 4905 start = next + 1;
4854 4906
4855 if (fatal_signal_pending(current)) { 4907 if (fatal_signal_pending(current)) {
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4863 ext4_lock_group(sb, group); 4915 ext4_lock_group(sb, group);
4864 } 4916 }
4865 4917
4866 if ((e4b.bd_info->bb_free - count) < minblocks) 4918 if ((e4b.bd_info->bb_free - free_count) < minblocks)
4867 break; 4919 break;
4868 } 4920 }
4921
4922 if (!ret)
4923 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4924out:
4869 ext4_unlock_group(sb, group); 4925 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b); 4926 ext4_mb_unload_buddy(&e4b);
4871 4927
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4904 4960
4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906 return -EINVAL; 4962 return -EINVAL;
4963 if (start + len <= first_data_blk)
4964 goto out;
4907 if (start < first_data_blk) { 4965 if (start < first_data_blk) {
4908 len -= first_data_blk - start; 4966 len -= first_data_blk - start;
4909 start = first_data_blk; 4967 start = first_data_blk;
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4952 } 5010 }
4953 range->len = trimmed * sb->s_blocksize; 5011 range->len = trimmed * sb->s_blocksize;
4954 5012
5013 if (!ret)
5014 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5015
5016out:
4955 return ret; 5017 return ret;
4956} 5018}
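Taken together, the trim changes cache the last successful minlen in s_last_trim_minblks and skip groups whose WAS_TRIMMED flag is still set, clearing the flag again when blocks are freed without online discard. A userspace sketch of that skip logic under simplified assumptions (one group, no locking; the names are analogues):

#include <stdio.h>

struct group {
	int was_trimmed;	/* EXT4_MB_GRP_WAS_TRIMMED analogue */
};

static unsigned long last_trim_minblks;	/* s_last_trim_minblks analogue */

/* Returns 1 if the group was actually trimmed, 0 if it was skipped. */
static int trim_group(struct group *g, unsigned long minblocks)
{
	if (g->was_trimmed && minblocks >= last_trim_minblks)
		return 0;	/* already trimmed at least this finely */
	/* ... discard every free extent of >= minblocks blocks ... */
	g->was_trimmed = 1;
	return 1;
}

int main(void)
{
	struct group g = { 0 };

	printf("first pass trimmed: %d\n", trim_group(&g, 16));
	last_trim_minblks = 16;	/* recorded after a successful run */
	printf("second pass trimmed: %d\n", trim_group(&g, 32));
	g.was_trimmed = 0;	/* blocks were freed without -o discard */
	printf("third pass trimmed: %d\n", trim_group(&g, 32));
	return 0;
}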
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 20b5e7bfebd1..9d4a636b546c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,7 +187,6 @@ struct ext4_allocation_context {
187 __u16 ac_flags; /* allocation hints */ 187 __u16 ac_flags; /* allocation hints */
188 __u8 ac_status; 188 __u8 ac_status;
189 __u8 ac_criteria; 189 __u8 ac_criteria;
190 __u8 ac_repeats;
191 __u8 ac_2order; /* if request is to allocate 2^N blocks and 190 __u8 ac_2order; /* if request is to allocate 2^N blocks and
192 * N > 0, the field stores N, otherwise 0 */ 191 * N > 0, the field stores N, otherwise 0 */
193 __u8 ac_op; /* operation, for history only */ 192 __u8 ac_op; /* operation, for history only */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8c9babac43dc..f8068c7bae9f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
289 while (len--) printk("%c", *name++); 289 while (len--) printk("%c", *name++);
290 ext4fs_dirhash(de->name, de->name_len, &h); 290 ext4fs_dirhash(de->name, de->name_len, &h);
291 printk(":%x.%u ", h.hash, 291 printk(":%x.%u ", h.hash,
292 ((char *) de - base)); 292 (unsigned) ((char *) de - base));
293 } 293 }
294 space += EXT4_DIR_REC_LEN(de->name_len); 294 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 295 names++;
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1013 1013
1014 *err = -ENOENT; 1014 *err = -ENOENT;
1015errout: 1015errout:
1016 dxtrace(printk(KERN_DEBUG "%s not found\n", name)); 1016 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1017 dx_release (frames); 1017 dx_release (frames);
1018 return NULL; 1018 return NULL;
1019} 1019}
@@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1985 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1985 if (!list_empty(&EXT4_I(inode)->i_orphan))
1986 goto out_unlock; 1986 goto out_unlock;
1987 1987
1988 /* Orphan handling is only valid for files with data blocks 1988 /*
1989 * being truncated, or files being unlinked. */ 1989 * Orphan handling is only valid for files with data blocks
1990 1990 * being truncated, or files being unlinked. Note that we either
1991 /* @@@ FIXME: Observation from aviro: 1991 * hold i_mutex, or the inode can not be referenced from outside,
1992 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block 1992 * so i_nlink should not be bumped due to race
1993 * here (on s_orphan_lock), so race with ext4_link() which might bump
1994 * ->i_nlink. For, say it, character device. Not a regular file,
1995 * not a directory, not a symlink and ->i_nlink > 0.
1996 *
1997 * tytso, 4/25/2009: I'm not sure how that could happen;
1998 * shouldn't the fs core protect us from these sort of
1999 * unlink()/link() races?
2000 */ 1993 */
2001 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1994 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2002 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1995 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2260,9 +2253,11 @@ static int ext4_symlink(struct inode *dir,
2260 /* 2253 /*
2261 * For non-fast symlinks, we just allocate inode and put it on 2254 * For non-fast symlinks, we just allocate inode and put it on
2262 * orphan list in the first transaction => we need bitmap, 2255 * orphan list in the first transaction => we need bitmap,
2263 * group descriptor, sb, inode block, quota blocks. 2256 * group descriptor, sb, inode block, quota blocks, and
2257 * possibly selinux xattr blocks.
2264 */ 2258 */
2265 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); 2259 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2260 EXT4_XATTR_TRANS_BLOCKS;
2266 } else { 2261 } else {
2267 /* 2262 /*
2268 * Fast symlink. We have to add entry to directory 2263 * Fast symlink. We have to add entry to directory
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7bb8f76d470a..92f38ee13f8a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work)
142 unsigned long flags; 142 unsigned long flags;
143 int ret; 143 int ret;
144 144
145 mutex_lock(&inode->i_mutex); 145 if (!mutex_trylock(&inode->i_mutex)) {
146 /*
147 * Requeue the work instead of waiting so that the work
148 * items queued after this can be processed.
149 */
150 queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
151 /*
152 * To prevent the ext4-dio-unwritten thread from keeping
153 * requeueing end_io requests and occupying cpu for too long,
154 * yield the cpu if it sees an end_io request that has already
155 * been requeued.
156 */
157 if (io->flag & EXT4_IO_END_QUEUED)
158 yield();
159 io->flag |= EXT4_IO_END_QUEUED;
160 return;
161 }
146 ret = ext4_end_io_nolock(io); 162 ret = ext4_end_io_nolock(io);
147 if (ret < 0) { 163 if (ret < 0) {
148 mutex_unlock(&inode->i_mutex); 164 mutex_unlock(&inode->i_mutex);
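
The page-io hunk above swaps an unconditional mutex_lock() for a trylock that requeues the end_io work on contention, and yields once a work item has already been requeued so a hot requeue loop cannot pin a CPU. The sketch below models that shape with pthreads; requeue() and complete_io() are stand-in stubs, not ext4 functions.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct io_work {
	bool requeued;			/* models EXT4_IO_END_QUEUED */
};

/* Stubs; in ext4 these are queue_work() and ext4_end_io_nolock(). */
static void requeue(struct io_work *w) { (void)w; }
static void complete_io(struct io_work *w) { (void)w; }

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

void end_io_work(struct io_work *w)
{
	if (pthread_mutex_trylock(&inode_lock) != 0) {
		/* Contended: requeue so later work items still get a turn. */
		requeue(w);
		/* An item already requeued once yields before retrying. */
		if (w->requeued)
			sched_yield();
		w->requeued = true;
		return;
	}
	complete_io(w);
	pthread_mutex_unlock(&inode_lock);
}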
@@ -285,11 +301,7 @@ static int io_submit_init(struct ext4_io_submit *io,
285 io_end = ext4_init_io_end(inode, GFP_NOFS); 301 io_end = ext4_init_io_end(inode, GFP_NOFS);
286 if (!io_end) 302 if (!io_end)
287 return -ENOMEM; 303 return -ENOMEM;
288 do { 304 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (bio == NULL);
292
293 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 305 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
294 bio->bi_bdev = bh->b_bdev; 306 bio->bi_bdev = bh->b_bdev;
295 bio->bi_private = io->io_end = io_end; 307 bio->bi_private = io->io_end = io_end;
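
The io_submit_init() change drops the old "halve nvecs and retry until bio_alloc() succeeds" loop in favour of clamping the request to the allocator's hard limit up front; with the size valid, the GFP_NOIO allocation can block instead of failing. The same clamp in isolation, with MAX_VECS as an assumed stand-in for BIO_MAX_PAGES:

#include <stddef.h>

#define MAX_VECS 256	/* assumed stand-in for BIO_MAX_PAGES */

static inline size_t clamp_nvecs(size_t nvecs)
{
	/* One up-front clamp replaces the halve-and-retry loop. */
	return nvecs < MAX_VECS ? nvecs : MAX_VECS;
}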
@@ -338,8 +350,10 @@ submit_and_retry:
338 if ((io_end->num_io_pages >= MAX_IO_PAGES) && 350 if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
339 (io_end->pages[io_end->num_io_pages-1] != io_page)) 351 (io_end->pages[io_end->num_io_pages-1] != io_page))
340 goto submit_and_retry; 352 goto submit_and_retry;
341 if (buffer_uninit(bh)) 353 if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
342 io->io_end->flag |= EXT4_IO_END_UNWRITTEN; 354 io_end->flag |= EXT4_IO_END_UNWRITTEN;
355 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
356 }
343 io->io_end->size += bh->b_size; 357 io->io_end->size += bh->b_size;
344 io->io_next_block++; 358 io->io_next_block++;
345 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 359 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 80bbc9c60c24..707d3f16f7ce 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,35 @@
16 16
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18 18
19int ext4_resize_begin(struct super_block *sb)
20{
21 int ret = 0;
22
23 if (!capable(CAP_SYS_RESOURCE))
24 return -EPERM;
25
26 /*
27 * We are not allowed to do online-resizing on a filesystem mounted
28 * with error, because it can destroy the filesystem easily.
29 */
30 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
31 ext4_warning(sb, "There are errors in the filesystem, "
32 "so online resizing is not allowed\n");
33 return -EPERM;
34 }
35
36 if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
37 ret = -EBUSY;
38
39 return ret;
40}
41
42void ext4_resize_end(struct super_block *sb)
43{
44 clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
45 smp_mb__after_clear_bit();
46}
47
19#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 48#define outside(b, first, last) ((b) < (first) || (b) >= (last))
20#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 49#define inside(b, first, last) ((b) >= (first) && (b) < (last))
21 50
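
ext4_resize_begin()/ext4_resize_end() above replace the old s_resize_lock mutex with a single EXT4_RESIZING bit that admits one resizer at a time and fails fast for everyone else. A userspace model of the same guard, using C11 atomic_flag in place of test_and_set_bit_lock()/clear_bit_unlock():

#include <errno.h>
#include <stdatomic.h>

static atomic_flag resizing = ATOMIC_FLAG_INIT;

int resize_begin(void)
{
	/* acquire semantics pair with the release in resize_end() */
	if (atomic_flag_test_and_set_explicit(&resizing,
					      memory_order_acquire))
		return -EBUSY;	/* another resizer is already running */
	return 0;
}

void resize_end(void)
{
	atomic_flag_clear_explicit(&resizing, memory_order_release);
}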
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 brelse(bh); 147 brelse(bh);
119 bh = ERR_PTR(err); 148 bh = ERR_PTR(err);
120 } else { 149 } else {
121 lock_buffer(bh);
122 memset(bh->b_data, 0, sb->s_blocksize); 150 memset(bh->b_data, 0, sb->s_blocksize);
123 set_buffer_uptodate(bh); 151 set_buffer_uptodate(bh);
124 unlock_buffer(bh);
125 } 152 }
126 153
127 return bh; 154 return bh;
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
132 * If that fails, restart the transaction & regain write access for the 159 * If that fails, restart the transaction & regain write access for the
133 * buffer head which is used for block_bitmap modifications. 160 * buffer head which is used for block_bitmap modifications.
134 */ 161 */
135static int extend_or_restart_transaction(handle_t *handle, int thresh, 162static int extend_or_restart_transaction(handle_t *handle, int thresh)
136 struct buffer_head *bh)
137{ 163{
138 int err; 164 int err;
139 165
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
144 if (err < 0) 170 if (err < 0)
145 return err; 171 return err;
146 if (err) { 172 if (err) {
147 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 173 err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
148 return err; 174 if (err)
149 if ((err = ext4_journal_get_write_access(handle, bh)))
150 return err; 175 return err;
151 } 176 }
152 177
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
181 if (IS_ERR(handle)) 206 if (IS_ERR(handle))
182 return PTR_ERR(handle); 207 return PTR_ERR(handle);
183 208
184 mutex_lock(&sbi->s_resize_lock); 209 BUG_ON(input->group != sbi->s_groups_count);
185 if (input->group != sbi->s_groups_count) {
186 err = -EBUSY;
187 goto exit_journal;
188 }
189
190 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
191 err = PTR_ERR(bh);
192 goto exit_journal;
193 }
194
195 if (ext4_bg_has_super(sb, input->group)) {
196 ext4_debug("mark backup superblock %#04llx (+0)\n", start);
197 ext4_set_bit(0, bh->b_data);
198 }
199 210
200 /* Copy all of the GDT blocks into the backup in this group */ 211 /* Copy all of the GDT blocks into the backup in this group */
201 for (i = 0, bit = 1, block = start + 1; 212 for (i = 0, bit = 1, block = start + 1;
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb,
203 struct buffer_head *gdb; 214 struct buffer_head *gdb;
204 215
205 ext4_debug("update backup group %#04llx (+%d)\n", block, bit); 216 ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
206 217 err = extend_or_restart_transaction(handle, 1);
207 if ((err = extend_or_restart_transaction(handle, 1, bh))) 218 if (err)
208 goto exit_bh; 219 goto exit_journal;
209 220
210 gdb = sb_getblk(sb, block); 221 gdb = sb_getblk(sb, block);
211 if (!gdb) { 222 if (!gdb) {
212 err = -EIO; 223 err = -EIO;
213 goto exit_bh; 224 goto exit_journal;
214 } 225 }
215 if ((err = ext4_journal_get_write_access(handle, gdb))) { 226 if ((err = ext4_journal_get_write_access(handle, gdb))) {
216 brelse(gdb); 227 brelse(gdb);
217 goto exit_bh; 228 goto exit_journal;
218 } 229 }
219 lock_buffer(gdb);
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 230 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 231 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb);
223 err = ext4_handle_dirty_metadata(handle, NULL, gdb); 232 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) { 233 if (unlikely(err)) {
225 brelse(gdb); 234 brelse(gdb);
226 goto exit_bh; 235 goto exit_journal;
227 } 236 }
228 ext4_set_bit(bit, bh->b_data);
229 brelse(gdb); 237 brelse(gdb);
230 } 238 }
231 239
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb,
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 243 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 244 GFP_NOFS);
237 if (err) 245 if (err)
238 goto exit_bh; 246 goto exit_journal;
239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) 247
240 ext4_set_bit(bit, bh->b_data); 248 err = extend_or_restart_transaction(handle, 2);
249 if (err)
250 goto exit_journal;
251
252 bh = bclean(handle, sb, input->block_bitmap);
253 if (IS_ERR(bh)) {
254 err = PTR_ERR(bh);
255 goto exit_journal;
256 }
257
258 if (ext4_bg_has_super(sb, input->group)) {
259 ext4_debug("mark backup group tables %#04llx (+0)\n", start);
260 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
261 }
241 262
242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 263 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
243 input->block_bitmap - start); 264 input->block_bitmap - start);
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb,
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 274 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 275 if (err)
255 goto exit_bh; 276 goto exit_bh;
256 for (i = 0, bit = input->inode_table - start; 277 ext4_set_bits(bh->b_data, input->inode_table - start,
257 i < sbi->s_itb_per_group; i++, bit++) 278 sbi->s_itb_per_group);
258 ext4_set_bit(bit, bh->b_data);
259 279
260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
261 goto exit_bh;
262 280
263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 281 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
264 bh->b_data); 282 bh->b_data);
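
The conversions in setup_new_group_blocks() replace per-bit ext4_set_bit() loops with one ext4_set_bits() call per contiguous range. The sketch below shows the intended API shape with a deliberately naive per-bit body; the real helper can work a word at a time, which is what makes the one-call-per-range interface worth having.

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Set len consecutive bits starting at start. */
static void set_bits(unsigned long *map, unsigned int start, unsigned int len)
{
	unsigned int b;

	for (b = start; b < start + len; b++)
		map[b / BITS_PER_LONG] |= 1UL << (b % BITS_PER_LONG);
}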
@@ -285,7 +303,6 @@ exit_bh:
285 brelse(bh); 303 brelse(bh);
286 304
287exit_journal: 305exit_journal:
288 mutex_unlock(&sbi->s_resize_lock);
289 if ((err2 = ext4_journal_stop(handle)) && !err) 306 if ((err2 = ext4_journal_stop(handle)) && !err)
290 err = err2; 307 err = err2;
291 308
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb,
377 * fail once we start modifying the data on disk, because JBD has no rollback. 394 * fail once we start modifying the data on disk, because JBD has no rollback.
378 */ 395 */
379static int add_new_gdb(handle_t *handle, struct inode *inode, 396static int add_new_gdb(handle_t *handle, struct inode *inode,
380 struct ext4_new_group_data *input, 397 ext4_group_t group)
381 struct buffer_head **primary)
382{ 398{
383 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
384 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 400 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
385 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 401 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
386 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; 402 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
387 struct buffer_head **o_group_desc, **n_group_desc; 403 struct buffer_head **o_group_desc, **n_group_desc;
388 struct buffer_head *dind; 404 struct buffer_head *dind;
405 struct buffer_head *gdb_bh;
389 int gdbackups; 406 int gdbackups;
390 struct ext4_iloc iloc; 407 struct ext4_iloc iloc;
391 __le32 *data; 408 __le32 *data;
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
408 return -EPERM; 425 return -EPERM;
409 } 426 }
410 427
411 *primary = sb_bread(sb, gdblock); 428 gdb_bh = sb_bread(sb, gdblock);
412 if (!*primary) 429 if (!gdb_bh)
413 return -EIO; 430 return -EIO;
414 431
415 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { 432 gdbackups = verify_reserved_gdb(sb, gdb_bh);
433 if (gdbackups < 0) {
416 err = gdbackups; 434 err = gdbackups;
417 goto exit_bh; 435 goto exit_bh;
418 } 436 }
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
427 data = (__le32 *)dind->b_data; 445 data = (__le32 *)dind->b_data;
428 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
429 ext4_warning(sb, "new group %u GDT block %llu not reserved", 447 ext4_warning(sb, "new group %u GDT block %llu not reserved",
430 input->group, gdblock); 448 group, gdblock);
431 err = -EINVAL; 449 err = -EINVAL;
432 goto exit_dind; 450 goto exit_dind;
433 } 451 }
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
436 if (unlikely(err)) 454 if (unlikely(err))
437 goto exit_dind; 455 goto exit_dind;
438 456
439 err = ext4_journal_get_write_access(handle, *primary); 457 err = ext4_journal_get_write_access(handle, gdb_bh);
440 if (unlikely(err)) 458 if (unlikely(err))
441 goto exit_sbh; 459 goto exit_sbh;
442 460
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
449 if (unlikely(err)) 467 if (unlikely(err))
450 goto exit_dindj; 468 goto exit_dindj;
451 469
452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 470 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
453 GFP_NOFS); 471 sizeof(struct buffer_head *),
472 GFP_NOFS);
454 if (!n_group_desc) { 473 if (!n_group_desc) {
455 err = -ENOMEM; 474 err = -ENOMEM;
456 ext4_warning(sb, 475 ext4_warning(sb, "not enough memory for %lu groups",
457 "not enough memory for %lu groups", gdb_num + 1); 476 gdb_num + 1);
458 goto exit_inode; 477 goto exit_inode;
459 } 478 }
460 479
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
475 } 494 }
476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 495 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
477 ext4_mark_iloc_dirty(handle, inode, &iloc); 496 ext4_mark_iloc_dirty(handle, inode, &iloc);
478 memset((*primary)->b_data, 0, sb->s_blocksize); 497 memset(gdb_bh->b_data, 0, sb->s_blocksize);
479 err = ext4_handle_dirty_metadata(handle, NULL, *primary); 498 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
480 if (unlikely(err)) { 499 if (unlikely(err)) {
481 ext4_std_error(sb, err); 500 ext4_std_error(sb, err);
482 goto exit_inode; 501 goto exit_inode;
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 o_group_desc = EXT4_SB(sb)->s_group_desc; 505 o_group_desc = EXT4_SB(sb)->s_group_desc;
487 memcpy(n_group_desc, o_group_desc, 506 memcpy(n_group_desc, o_group_desc,
488 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); 507 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
489 n_group_desc[gdb_num] = *primary; 508 n_group_desc[gdb_num] = gdb_bh;
490 EXT4_SB(sb)->s_group_desc = n_group_desc; 509 EXT4_SB(sb)->s_group_desc = n_group_desc;
491 EXT4_SB(sb)->s_gdb_count++; 510 EXT4_SB(sb)->s_gdb_count++;
492 kfree(o_group_desc); 511 ext4_kvfree(o_group_desc);
493 512
494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 513 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 514 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 518 return err;
500 519
501exit_inode: 520exit_inode:
521 ext4_kvfree(n_group_desc);
502 /* ext4_handle_release_buffer(handle, iloc.bh); */ 522 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 523 brelse(iloc.bh);
504exit_dindj: 524exit_dindj:
@@ -508,7 +528,7 @@ exit_sbh:
508exit_dind: 528exit_dind:
509 brelse(dind); 529 brelse(dind);
510exit_bh: 530exit_bh:
511 brelse(*primary); 531 brelse(gdb_bh);
512 532
513 ext4_debug("leaving with error %d\n", err); 533 ext4_debug("leaving with error %d\n", err);
514 return err; 534 return err;
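
add_new_gdb() grows s_group_desc by allocating a larger array, copying the old pointers, publishing the new array, and only then freeing the old one; the exit_inode hunk further down also frees the new array if a later step fails. A userspace sketch of that grow-and-publish pattern, with illustrative names:

#include <stdlib.h>
#include <string.h>

static int grow_table(void ***table, size_t *count, void *new_entry)
{
	void **old = *table;
	void **new_tbl = malloc((*count + 1) * sizeof(*new_tbl));

	if (!new_tbl)
		return -1;	/* old table stays valid on failure */
	if (*count)
		memcpy(new_tbl, old, *count * sizeof(*new_tbl));
	new_tbl[*count] = new_entry;
	*table = new_tbl;	/* publish the new table */
	(*count)++;
	free(old);		/* safe only after publishing */
	return 0;
}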
@@ -528,7 +548,7 @@ exit_bh:
528 * backup GDT blocks are stored in their reserved primary GDT block. 548 * backup GDT blocks are stored in their reserved primary GDT block.
529 */ 549 */
530static int reserve_backup_gdb(handle_t *handle, struct inode *inode, 550static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
531 struct ext4_new_group_data *input) 551 ext4_group_t group)
532{ 552{
533 struct super_block *sb = inode->i_sb; 553 struct super_block *sb = inode->i_sb;
534 int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); 554 int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
599 * Finally we can add each of the reserved backup GDT blocks from 619 * Finally we can add each of the reserved backup GDT blocks from
600 * the new group to its reserved primary GDT block. 620 * the new group to its reserved primary GDT block.
601 */ 621 */
602 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); 622 blk = group * EXT4_BLOCKS_PER_GROUP(sb);
603 for (i = 0; i < reserved_gdb; i++) { 623 for (i = 0; i < reserved_gdb; i++) {
604 int err2; 624 int err2;
605 data = (__le32 *)primary[i]->b_data; 625 data = (__le32 *)primary[i]->b_data;
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
799 goto exit_put; 819 goto exit_put;
800 } 820 }
801 821
802 mutex_lock(&sbi->s_resize_lock);
803 if (input->group != sbi->s_groups_count) {
804 ext4_warning(sb, "multiple resizers run on filesystem!");
805 err = -EBUSY;
806 goto exit_journal;
807 }
808
809 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) 822 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
810 goto exit_journal; 823 goto exit_journal;
811 824
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
820 if ((err = ext4_journal_get_write_access(handle, primary))) 833 if ((err = ext4_journal_get_write_access(handle, primary)))
821 goto exit_journal; 834 goto exit_journal;
822 835
823 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && 836 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
824 (err = reserve_backup_gdb(handle, inode, input))) 837 err = reserve_backup_gdb(handle, inode, input->group);
838 if (err)
839 goto exit_journal;
840 }
841 } else {
842 /*
843 * Note that we can access new group descriptor block safely
844 * only if add_new_gdb() succeeds.
845 */
846 err = add_new_gdb(handle, inode, input->group);
847 if (err)
825 goto exit_journal; 848 goto exit_journal;
826 } else if ((err = add_new_gdb(handle, inode, input, &primary))) 849 primary = sbi->s_group_desc[gdb_num];
827 goto exit_journal; 850 }
828 851
829 /* 852 /*
830 * OK, now we've set up the new group. Time to make it active. 853 * OK, now we've set up the new group. Time to make it active.
831 * 854 *
832 * We do not lock all allocations via s_resize_lock
833 * so we have to be safe wrt. concurrent accesses the group 855 * so we have to be safe wrt. concurrent accesses the group
834 * data. So we need to be careful to set all of the relevant 856 * data. So we need to be careful to set all of the relevant
835 * group descriptor data etc. *before* we enable the group. 857 * group descriptor data etc. *before* we enable the group.
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
886 * 908 *
887 * The precise rules we use are: 909 * The precise rules we use are:
888 * 910 *
889 * * Writers of s_groups_count *must* hold s_resize_lock
890 * AND
891 * * Writers must perform a smp_wmb() after updating all dependent 911 * * Writers must perform a smp_wmb() after updating all dependent
892 * data and before modifying the groups count 912 * data and before modifying the groups count
893 * 913 *
894 * * Readers must hold s_resize_lock over the access
895 * OR
896 * * Readers must perform an smp_rmb() after reading the groups count 914 * * Readers must perform an smp_rmb() after reading the groups count
897 * and before reading any dependent data. 915 * and before reading any dependent data.
898 * 916 *
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 ext4_handle_dirty_super(handle, sb); 955 ext4_handle_dirty_super(handle, sb);
938 956
939exit_journal: 957exit_journal:
940 mutex_unlock(&sbi->s_resize_lock);
941 if ((err2 = ext4_journal_stop(handle)) && !err) 958 if ((err2 = ext4_journal_stop(handle)) && !err)
942 err = err2; 959 err = err2;
943 if (!err) { 960 if (!err && primary) {
944 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 961 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
945 sizeof(struct ext4_super_block)); 962 sizeof(struct ext4_super_block));
946 update_backups(sb, primary->b_blocknr, primary->b_data, 963 update_backups(sb, primary->b_blocknr, primary->b_data,
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
969 ext4_grpblk_t add; 986 ext4_grpblk_t add;
970 struct buffer_head *bh; 987 struct buffer_head *bh;
971 handle_t *handle; 988 handle_t *handle;
972 int err; 989 int err, err2;
973 ext4_group_t group; 990 ext4_group_t group;
974 991
975 /* We don't need to worry about locking wrt other resizers just
976 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */
978 o_blocks_count = ext4_blocks_count(es); 992 o_blocks_count = ext4_blocks_count(es);
979 993
980 if (test_opt(sb, DEBUG)) 994 if (test_opt(sb, DEBUG))
981 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", 995 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
982 o_blocks_count, n_blocks_count); 996 o_blocks_count, n_blocks_count);
983 997
984 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 998 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
995 1009
996 if (n_blocks_count < o_blocks_count) { 1010 if (n_blocks_count < o_blocks_count) {
997 ext4_warning(sb, "can't shrink FS - resize aborted"); 1011 ext4_warning(sb, "can't shrink FS - resize aborted");
998 return -EBUSY; 1012 return -EINVAL;
999 } 1013 }
1000 1014
1001 /* Handle the remaining blocks in the last group only. */ 1015 /* Handle the remaining blocks in the last group only. */
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1038 goto exit_put; 1052 goto exit_put;
1039 } 1053 }
1040 1054
1041 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1042 if (o_blocks_count != ext4_blocks_count(es)) {
1043 ext4_warning(sb, "multiple resizers run on filesystem!");
1044 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1045 ext4_journal_stop(handle);
1046 err = -EBUSY;
1047 goto exit_put;
1048 }
1049
1050 if ((err = ext4_journal_get_write_access(handle, 1055 if ((err = ext4_journal_get_write_access(handle,
1051 EXT4_SB(sb)->s_sbh))) { 1056 EXT4_SB(sb)->s_sbh))) {
1052 ext4_warning(sb, "error %d on journal write access", err); 1057 ext4_warning(sb, "error %d on journal write access", err);
1053 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1054 ext4_journal_stop(handle); 1058 ext4_journal_stop(handle);
1055 goto exit_put; 1059 goto exit_put;
1056 } 1060 }
1057 ext4_blocks_count_set(es, o_blocks_count + add); 1061 ext4_blocks_count_set(es, o_blocks_count + add);
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1062 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1060 o_blocks_count + add); 1063 o_blocks_count + add);
1061 /* We add the blocks to the bitmap and set the group need init bit */ 1064 /* We add the blocks to the bitmap and set the group need init bit */
1062 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1065 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
1063 ext4_handle_dirty_super(handle, sb); 1066 ext4_handle_dirty_super(handle, sb);
1064 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1067 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1065 o_blocks_count + add); 1068 o_blocks_count + add);
1066 if ((err = ext4_journal_stop(handle))) 1069 err2 = ext4_journal_stop(handle);
1070 if (!err && err2)
1071 err = err2;
1072
1073 if (err)
1067 goto exit_put; 1074 goto exit_put;
1068 1075
1069 if (test_opt(sb, DEBUG)) 1076 if (test_opt(sb, DEBUG))
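
The ext4_group_extend() rewrite at the end of resize.c keeps the first error while still stopping the journal, using the err/err2 idiom instead of returning straight from ext4_journal_stop(). A standalone rendering of that idiom, with placeholder do_op()/do_cleanup() standing in for the block-count update and ext4_journal_stop():

static int do_op(void) { return 0; }		/* may fail */
static int do_cleanup(void) { return 0; }	/* must always run */

int op_with_cleanup(void)
{
	int err = do_op();
	int err2 = do_cleanup();

	if (!err && err2)
		err = err2;	/* surface the cleanup failure only if the
				 * operation itself succeeded */
	return err;
}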
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ea71aa864b3..44d0c8db2239 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = {
110#define IS_EXT3_SB(sb) (0) 110#define IS_EXT3_SB(sb) (0)
111#endif 111#endif
112 112
113void *ext4_kvmalloc(size_t size, gfp_t flags)
114{
115 void *ret;
116
117 ret = kmalloc(size, flags);
118 if (!ret)
119 ret = __vmalloc(size, flags, PAGE_KERNEL);
120 return ret;
121}
122
123void *ext4_kvzalloc(size_t size, gfp_t flags)
124{
125 void *ret;
126
127 ret = kzalloc(size, flags);
128 if (!ret)
129 ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
130 return ret;
131}
132
133void ext4_kvfree(void *ptr)
134{
135 if (is_vmalloc_addr(ptr))
136 vfree(ptr);
137 else
138 kfree(ptr);
139
140}
141
113ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 142ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
114 struct ext4_group_desc *bg) 143 struct ext4_group_desc *bg)
115{ 144{
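
ext4_kvmalloc()/ext4_kvzalloc()/ext4_kvfree(), added above, try the slab allocator first and fall back to vmalloc for allocations too large to be contiguous, with the free routed by is_vmalloc_addr(). The userspace sketch below mimics the pair: a fixed pool stands in for kmalloc, malloc() for __vmalloc(), and a pool-range test for is_vmalloc_addr(). The bump pool never reclaims, so this is illustration only.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static char pool[4096];		/* fixed pool standing in for the slab */
static size_t pool_used;

static bool from_pool(const void *p)
{
	uintptr_t a = (uintptr_t)p;

	return a >= (uintptr_t)pool && a < (uintptr_t)(pool + sizeof(pool));
}

void *kv_alloc(size_t size)
{
	if (pool_used + size <= sizeof(pool)) {		/* fast path */
		void *p = pool + pool_used;

		pool_used += size;
		return p;
	}
	return malloc(size);	/* fallback allocator */
}

void kv_free(void *p)
{
	if (!from_pool(p))	/* stands in for is_vmalloc_addr() */
		free(p);
	/* pool memory is bump-allocated and never reclaimed here */
}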
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
269 journal_t *journal; 298 journal_t *journal;
270 handle_t *handle; 299 handle_t *handle;
271 300
301 trace_ext4_journal_start(sb, nblocks, _RET_IP_);
272 if (sb->s_flags & MS_RDONLY) 302 if (sb->s_flags & MS_RDONLY)
273 return ERR_PTR(-EROFS); 303 return ERR_PTR(-EROFS);
274 304
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb)
789 819
790 for (i = 0; i < sbi->s_gdb_count; i++) 820 for (i = 0; i < sbi->s_gdb_count; i++)
791 brelse(sbi->s_group_desc[i]); 821 brelse(sbi->s_group_desc[i]);
792 kfree(sbi->s_group_desc); 822 ext4_kvfree(sbi->s_group_desc);
793 if (is_vmalloc_addr(sbi->s_flex_groups)) 823 ext4_kvfree(sbi->s_flex_groups);
794 vfree(sbi->s_flex_groups);
795 else
796 kfree(sbi->s_flex_groups);
797 percpu_counter_destroy(&sbi->s_freeblocks_counter); 824 percpu_counter_destroy(&sbi->s_freeblocks_counter);
798 percpu_counter_destroy(&sbi->s_freeinodes_counter); 825 percpu_counter_destroy(&sbi->s_freeinodes_counter);
799 percpu_counter_destroy(&sbi->s_dirs_counter); 826 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -892,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head)
892 919
893static void ext4_destroy_inode(struct inode *inode) 920static void ext4_destroy_inode(struct inode *inode)
894{ 921{
895 ext4_ioend_wait(inode);
896 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 922 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
897 ext4_msg(inode->i_sb, KERN_ERR, 923 ext4_msg(inode->i_sb, KERN_ERR,
898 "Inode %lu (%p): orphan list check failed!", 924 "Inode %lu (%p): orphan list check failed!",
@@ -1976,15 +2002,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1976 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 2002 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1977 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; 2003 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1978 size = flex_group_count * sizeof(struct flex_groups); 2004 size = flex_group_count * sizeof(struct flex_groups);
1979 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 2005 sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
1980 if (sbi->s_flex_groups == NULL) { 2006 if (sbi->s_flex_groups == NULL) {
1981 sbi->s_flex_groups = vzalloc(size); 2007 ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
1982 if (sbi->s_flex_groups == NULL) { 2008 flex_group_count);
1983 ext4_msg(sb, KERN_ERR, 2009 goto failed;
1984 "not enough memory for %u flex groups",
1985 flex_group_count);
1986 goto failed;
1987 }
1988 } 2010 }
1989 2011
1990 for (i = 0; i < sbi->s_groups_count; i++) { 2012 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2383,17 +2405,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2383 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); 2405 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2384 unsigned long stripe_width = 2406 unsigned long stripe_width =
2385 le32_to_cpu(sbi->s_es->s_raid_stripe_width); 2407 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2408 int ret;
2386 2409
2387 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) 2410 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2388 return sbi->s_stripe; 2411 ret = sbi->s_stripe;
2389 2412 else if (stripe_width <= sbi->s_blocks_per_group)
2390 if (stripe_width <= sbi->s_blocks_per_group) 2413 ret = stripe_width;
2391 return stripe_width; 2414 else if (stride <= sbi->s_blocks_per_group)
2415 ret = stride;
2416 else
2417 ret = 0;
2392 2418
2393 if (stride <= sbi->s_blocks_per_group) 2419 /*
2394 return stride; 2420 * If the stripe width is 1, this makes no sense and
2421 * we set it to 0 to turn off stripe handling code.
2422 */
2423 if (ret <= 1)
2424 ret = 0;
2395 2425
2396 return 0; 2426 return ret;
2397} 2427}
2398 2428
2399/* sysfs support */ 2429/* sysfs support */
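
The ext4_get_stripe_size() rewrite funnels all candidates through a single exit so the final sanity clamp applies uniformly: a stripe of 1 would only add overhead, and 0 disables the stripe code. The same selection, lifted out as a standalone function:

static unsigned long pick_stripe(unsigned long stripe, unsigned long width,
				 unsigned long stride,
				 unsigned long blocks_per_group)
{
	unsigned long ret;

	if (stripe && stripe <= blocks_per_group)
		ret = stripe;
	else if (width <= blocks_per_group)
		ret = width;
	else if (stride <= blocks_per_group)
		ret = stride;
	else
		ret = 0;

	if (ret <= 1)		/* a stripe of 1 makes no sense */
		ret = 0;	/* 0 turns off stripe handling entirely */
	return ret;
}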
@@ -3408,8 +3438,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3408 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 3438 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3409 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 3439 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3410 EXT4_DESC_PER_BLOCK(sb); 3440 EXT4_DESC_PER_BLOCK(sb);
3411 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 3441 sbi->s_group_desc = ext4_kvmalloc(db_count *
3412 GFP_KERNEL); 3442 sizeof(struct buffer_head *),
3443 GFP_KERNEL);
3413 if (sbi->s_group_desc == NULL) { 3444 if (sbi->s_group_desc == NULL) {
3414 ext4_msg(sb, KERN_ERR, "not enough memory"); 3445 ext4_msg(sb, KERN_ERR, "not enough memory");
3415 goto failed_mount; 3446 goto failed_mount;
@@ -3491,7 +3522,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3491 3522
3492 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3523 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3493 mutex_init(&sbi->s_orphan_lock); 3524 mutex_init(&sbi->s_orphan_lock);
3494 mutex_init(&sbi->s_resize_lock); 3525 sbi->s_resize_flags = 0;
3495 3526
3496 sb->s_root = NULL; 3527 sb->s_root = NULL;
3497 3528
@@ -3741,12 +3772,8 @@ failed_mount_wq:
3741 } 3772 }
3742failed_mount3: 3773failed_mount3:
3743 del_timer(&sbi->s_err_report); 3774 del_timer(&sbi->s_err_report);
3744 if (sbi->s_flex_groups) { 3775 if (sbi->s_flex_groups)
3745 if (is_vmalloc_addr(sbi->s_flex_groups)) 3776 ext4_kvfree(sbi->s_flex_groups);
3746 vfree(sbi->s_flex_groups);
3747 else
3748 kfree(sbi->s_flex_groups);
3749 }
3750 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3777 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3751 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3778 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3752 percpu_counter_destroy(&sbi->s_dirs_counter); 3779 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -3756,7 +3783,7 @@ failed_mount3:
3756failed_mount2: 3783failed_mount2:
3757 for (i = 0; i < db_count; i++) 3784 for (i = 0; i < db_count; i++)
3758 brelse(sbi->s_group_desc[i]); 3785 brelse(sbi->s_group_desc[i]);
3759 kfree(sbi->s_group_desc); 3786 ext4_kvfree(sbi->s_group_desc);
3760failed_mount: 3787failed_mount:
3761 if (sbi->s_proc) { 3788 if (sbi->s_proc) {
3762 remove_proc_entry(sb->s_id, ext4_proc_root); 3789 remove_proc_entry(sb->s_id, ext4_proc_root);
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 000000000000..011ba6670d99
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
1/*
2 * linux/fs/ext4/truncate.h
3 *
4 * Common inline functions needed for truncate support
5 */
6
7/*
8 * Truncate blocks that were not used by write. We have to truncate the
9 * pagecache as well so that corresponding buffers get properly unmapped.
10 */
11static inline void ext4_truncate_failed_write(struct inode *inode)
12{
13 truncate_inode_pages(inode->i_mapping, inode->i_size);
14 ext4_truncate(inode);
15}
16
17/*
18 * Work out how many blocks we need to proceed with the next chunk of a
19 * truncate transaction.
20 */
21static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
22{
23 ext4_lblk_t needed;
24
25 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
26
27 /* Give ourselves just enough room to cope with inodes in which
28 * i_blocks is corrupt: we've seen disk corruptions in the past
29 * which resulted in random data in an inode which looked enough
30 * like a regular file for ext4 to try to delete it. Things
31 * will go a bit crazy if that happens, but at least we should
32 * try not to panic the whole kernel. */
33 if (needed < 2)
34 needed = 2;
35
36 /* But we need to bound the transaction so we don't overflow the
37 * journal. */
38 if (needed > EXT4_MAX_TRANS_DATA)
39 needed = EXT4_MAX_TRANS_DATA;
40
41 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
42}
43
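
The new ext4_blocks_for_truncate() clamps a possibly-corrupt i_blocks-derived estimate into [2, EXT4_MAX_TRANS_DATA] before adding the fixed per-transaction cost, so one bad inode cannot size a transaction past the journal. The clamp in isolation:

/* Guard against a corrupt (tiny or huge) block estimate. */
static unsigned long clamp_credits(unsigned long needed, unsigned long max)
{
	if (needed < 2)		/* enough even for a corrupt-looking inode */
		needed = 2;
	if (needed > max)	/* never overflow the journal */
		needed = max;
	return needed;
}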
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4ad64732cbce..5efbd5d7701a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
1231 struct super_block *sb = dir->i_sb; 1231 struct super_block *sb = dir->i_sb;
1232 struct msdos_sb_info *sbi = MSDOS_SB(sb); 1232 struct msdos_sb_info *sbi = MSDOS_SB(sb);
1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ 1233 struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
1234 struct msdos_dir_entry *de; 1234 struct msdos_dir_entry *uninitialized_var(de);
1235 int err, free_slots, i, nr_bhs; 1235 int err, free_slots, i, nr_bhs;
1236 loff_t pos, i_pos; 1236 loff_t pos, i_pos;
1237 1237
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5942fec22c65..1726d7303047 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
1188out: 1188out:
1189 /* UTF-8 doesn't provide FAT semantics */ 1189 /* UTF-8 doesn't provide FAT semantics */
1190 if (!strcmp(opts->iocharset, "utf8")) { 1190 if (!strcmp(opts->iocharset, "utf8")) {
1191 fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" 1191 fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
1192 " for FAT filesystems, filesystem will be " 1192 " for FAT filesystems, filesystem will be "
1193 "case sensitive!\n"); 1193 "case sensitive!");
1194 } 1194 }
1195 1195
1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */ 1196 /* If user doesn't specify allow_utime, it's initialized from dmask. */
@@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1367 sbi->free_clusters = -1; /* Don't know yet */ 1367 sbi->free_clusters = -1; /* Don't know yet */
1368 sbi->free_clus_valid = 0; 1368 sbi->free_clus_valid = 0;
1369 sbi->prev_free = FAT_START_ENT; 1369 sbi->prev_free = FAT_START_ENT;
1370 sb->s_maxbytes = 0xffffffff;
1370 1371
1371 if (!sbi->fat_length && b->fat32_length) { 1372 if (!sbi->fat_length && b->fat32_length) {
1372 struct fat_boot_fsinfo *fsinfo; 1373 struct fat_boot_fsinfo *fsinfo;
@@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1377 sbi->fat_length = le32_to_cpu(b->fat32_length); 1378 sbi->fat_length = le32_to_cpu(b->fat32_length);
1378 sbi->root_cluster = le32_to_cpu(b->root_cluster); 1379 sbi->root_cluster = le32_to_cpu(b->root_cluster);
1379 1380
1380 sb->s_maxbytes = 0xffffffff;
1381
1382 /* MC - if info_sector is 0, don't multiply by 0 */ 1381 /* MC - if info_sector is 0, don't multiply by 0 */
1383 sbi->fsinfo_sector = le16_to_cpu(b->info_sector); 1382 sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
1384 if (sbi->fsinfo_sector == 0) 1383 if (sbi->fsinfo_sector == 0)
diff --git a/fs/file_table.c b/fs/file_table.c
index 01e4c1e8e6b6..c322794f7360 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,7 +25,7 @@
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/ima.h> 26#include <linux/ima.h>
27 27
28#include <asm/atomic.h> 28#include <linux/atomic.h>
29 29
30#include "internal.h" 30#include "internal.h"
31 31
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b8c507ca42f7..04cf3b91e501 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,7 +35,9 @@
35struct wb_writeback_work { 35struct wb_writeback_work {
36 long nr_pages; 36 long nr_pages;
37 struct super_block *sb; 37 struct super_block *sb;
38 unsigned long *older_than_this;
38 enum writeback_sync_modes sync_mode; 39 enum writeback_sync_modes sync_mode;
40 unsigned int tagged_writepages:1;
39 unsigned int for_kupdate:1; 41 unsigned int for_kupdate:1;
40 unsigned int range_cyclic:1; 42 unsigned int range_cyclic:1;
41 unsigned int for_background:1; 43 unsigned int for_background:1;
@@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
180 */ 182 */
181void inode_wb_list_del(struct inode *inode) 183void inode_wb_list_del(struct inode *inode)
182{ 184{
183 spin_lock(&inode_wb_list_lock); 185 struct backing_dev_info *bdi = inode_to_bdi(inode);
186
187 spin_lock(&bdi->wb.list_lock);
184 list_del_init(&inode->i_wb_list); 188 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock); 189 spin_unlock(&bdi->wb.list_lock);
186} 190}
187 191
188
189/* 192/*
190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 193 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
191 * furthest end of its superblock's dirty-inode list. 194 * furthest end of its superblock's dirty-inode list.
@@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode)
195 * the case then the inode must have been redirtied while it was being written 198 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 199 * out and we don't reset its dirtied_when.
197 */ 200 */
198static void redirty_tail(struct inode *inode) 201static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
199{ 202{
200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 203 assert_spin_locked(&wb->list_lock);
201
202 assert_spin_locked(&inode_wb_list_lock);
203 if (!list_empty(&wb->b_dirty)) { 204 if (!list_empty(&wb->b_dirty)) {
204 struct inode *tail; 205 struct inode *tail;
205 206
@@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode)
213/* 214/*
214 * requeue inode for re-scanning after bdi->b_io list is exhausted. 215 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 216 */
216static void requeue_io(struct inode *inode) 217static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
217{ 218{
218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 219 assert_spin_locked(&wb->list_lock);
219
220 assert_spin_locked(&inode_wb_list_lock);
221 list_move(&inode->i_wb_list, &wb->b_more_io); 220 list_move(&inode->i_wb_list, &wb->b_more_io);
222} 221}
223 222
@@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode)
225{ 224{
226 /* 225 /*
227 * Prevent speculative execution through 226 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock); 227 * spin_unlock(&wb->list_lock);
229 */ 228 */
230 229
231 smp_mb(); 230 smp_mb();
@@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
250/* 249/*
251 * Move expired dirty inodes from @delaying_queue to @dispatch_queue. 250 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
252 */ 251 */
253static void move_expired_inodes(struct list_head *delaying_queue, 252static int move_expired_inodes(struct list_head *delaying_queue,
254 struct list_head *dispatch_queue, 253 struct list_head *dispatch_queue,
255 unsigned long *older_than_this) 254 unsigned long *older_than_this)
256{ 255{
257 LIST_HEAD(tmp); 256 LIST_HEAD(tmp);
258 struct list_head *pos, *node; 257 struct list_head *pos, *node;
259 struct super_block *sb = NULL; 258 struct super_block *sb = NULL;
260 struct inode *inode; 259 struct inode *inode;
261 int do_sb_sort = 0; 260 int do_sb_sort = 0;
261 int moved = 0;
262 262
263 while (!list_empty(delaying_queue)) { 263 while (!list_empty(delaying_queue)) {
264 inode = wb_inode(delaying_queue->prev); 264 inode = wb_inode(delaying_queue->prev);
@@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
269 do_sb_sort = 1; 269 do_sb_sort = 1;
270 sb = inode->i_sb; 270 sb = inode->i_sb;
271 list_move(&inode->i_wb_list, &tmp); 271 list_move(&inode->i_wb_list, &tmp);
272 moved++;
272 } 273 }
273 274
274 /* just one sb in list, splice to dispatch_queue and we're done */ 275 /* just one sb in list, splice to dispatch_queue and we're done */
275 if (!do_sb_sort) { 276 if (!do_sb_sort) {
276 list_splice(&tmp, dispatch_queue); 277 list_splice(&tmp, dispatch_queue);
277 return; 278 goto out;
278 } 279 }
279 280
280 /* Move inodes from one superblock together */ 281 /* Move inodes from one superblock together */
@@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 list_move(&inode->i_wb_list, dispatch_queue); 287 list_move(&inode->i_wb_list, dispatch_queue);
287 } 288 }
288 } 289 }
290out:
291 return moved;
289} 292}
290 293
291/* 294/*
@@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
301 */ 304 */
302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 305static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
303{ 306{
304 assert_spin_locked(&inode_wb_list_lock); 307 int moved;
308 assert_spin_locked(&wb->list_lock);
305 list_splice_init(&wb->b_more_io, &wb->b_io); 309 list_splice_init(&wb->b_more_io, &wb->b_io);
306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 310 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
311 trace_writeback_queue_io(wb, older_than_this, moved);
307} 312}
308 313
309static int write_inode(struct inode *inode, struct writeback_control *wbc) 314static int write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
316/* 321/*
317 * Wait for writeback on an inode to complete. 322 * Wait for writeback on an inode to complete.
318 */ 323 */
319static void inode_wait_for_writeback(struct inode *inode) 324static void inode_wait_for_writeback(struct inode *inode,
325 struct bdi_writeback *wb)
320{ 326{
321 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 327 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
322 wait_queue_head_t *wqh; 328 wait_queue_head_t *wqh;
@@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode)
324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 330 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
325 while (inode->i_state & I_SYNC) { 331 while (inode->i_state & I_SYNC) {
326 spin_unlock(&inode->i_lock); 332 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock); 333 spin_unlock(&wb->list_lock);
328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 334 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
329 spin_lock(&inode_wb_list_lock); 335 spin_lock(&wb->list_lock);
330 spin_lock(&inode->i_lock); 336 spin_lock(&inode->i_lock);
331 } 337 }
332} 338}
333 339
334/* 340/*
335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 341 * Write out an inode's dirty pages. Called under wb->list_lock and
336 * inode->i_lock. Either the caller has an active reference on the inode or 342 * inode->i_lock. Either the caller has an active reference on the inode or
337 * the inode has I_WILL_FREE set. 343 * the inode has I_WILL_FREE set.
338 * 344 *
@@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode)
343 * livelocks, etc. 349 * livelocks, etc.
344 */ 350 */
345static int 351static int
346writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 352writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
353 struct writeback_control *wbc)
347{ 354{
348 struct address_space *mapping = inode->i_mapping; 355 struct address_space *mapping = inode->i_mapping;
356 long nr_to_write = wbc->nr_to_write;
349 unsigned dirty; 357 unsigned dirty;
350 int ret; 358 int ret;
351 359
352 assert_spin_locked(&inode_wb_list_lock); 360 assert_spin_locked(&wb->list_lock);
353 assert_spin_locked(&inode->i_lock); 361 assert_spin_locked(&inode->i_lock);
354 362
355 if (!atomic_read(&inode->i_count)) 363 if (!atomic_read(&inode->i_count))
@@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
367 * completed a full scan of b_io. 375 * completed a full scan of b_io.
368 */ 376 */
369 if (wbc->sync_mode != WB_SYNC_ALL) { 377 if (wbc->sync_mode != WB_SYNC_ALL) {
370 requeue_io(inode); 378 requeue_io(inode, wb);
379 trace_writeback_single_inode_requeue(inode, wbc,
380 nr_to_write);
371 return 0; 381 return 0;
372 } 382 }
373 383
374 /* 384 /*
375 * It's a data-integrity sync. We must wait. 385 * It's a data-integrity sync. We must wait.
376 */ 386 */
377 inode_wait_for_writeback(inode); 387 inode_wait_for_writeback(inode, wb);
378 } 388 }
379 389
380 BUG_ON(inode->i_state & I_SYNC); 390 BUG_ON(inode->i_state & I_SYNC);
@@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 inode->i_state |= I_SYNC; 393 inode->i_state |= I_SYNC;
384 inode->i_state &= ~I_DIRTY_PAGES; 394 inode->i_state &= ~I_DIRTY_PAGES;
385 spin_unlock(&inode->i_lock); 395 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock); 396 spin_unlock(&wb->list_lock);
387 397
388 ret = do_writepages(mapping, wbc); 398 ret = do_writepages(mapping, wbc);
389 399
@@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
414 ret = err; 424 ret = err;
415 } 425 }
416 426
417 spin_lock(&inode_wb_list_lock); 427 spin_lock(&wb->list_lock);
418 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
419 inode->i_state &= ~I_SYNC; 429 inode->i_state &= ~I_SYNC;
420 if (!(inode->i_state & I_FREEING)) { 430 if (!(inode->i_state & I_FREEING)) {
431 /*
432 * Sync livelock prevention. Each inode is tagged and synced in
433 * one shot. If still dirty, it will be redirty_tail()'ed below.
434 * Update the dirty time to prevent enqueue and sync it again.
435 */
436 if ((inode->i_state & I_DIRTY) &&
437 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
438 inode->dirtied_when = jiffies;
439
421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 440 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
422 /* 441 /*
423 * We didn't write back all the pages. nfs_writepages() 442 * We didn't write back all the pages. nfs_writepages()
@@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
428 /* 447 /*
429 * slice used up: queue for next turn 448 * slice used up: queue for next turn
430 */ 449 */
431 requeue_io(inode); 450 requeue_io(inode, wb);
432 } else { 451 } else {
433 /* 452 /*
434 * Writeback blocked by something other than 453 * Writeback blocked by something other than
@@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
437 * retrying writeback of the dirty page/inode 456 * retrying writeback of the dirty page/inode
438 * that cannot be performed immediately. 457 * that cannot be performed immediately.
439 */ 458 */
440 redirty_tail(inode); 459 redirty_tail(inode, wb);
441 } 460 }
442 } else if (inode->i_state & I_DIRTY) { 461 } else if (inode->i_state & I_DIRTY) {
443 /* 462 /*
@@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
446 * submission or metadata updates after data IO 465 * submission or metadata updates after data IO
447 * completion. 466 * completion.
448 */ 467 */
449 redirty_tail(inode); 468 redirty_tail(inode, wb);
450 } else { 469 } else {
451 /* 470 /*
452 * The inode is clean. At this point we either have 471 * The inode is clean. At this point we either have
@@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
457 } 476 }
458 } 477 }
459 inode_sync_complete(inode); 478 inode_sync_complete(inode);
479 trace_writeback_single_inode(inode, wbc, nr_to_write);
460 return ret; 480 return ret;
461} 481}
462 482
483static long writeback_chunk_size(struct backing_dev_info *bdi,
484 struct wb_writeback_work *work)
485{
486 long pages;
487
488 /*
489 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
490 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
491 * here avoids calling into writeback_inodes_wb() more than once.
492 *
493 * The intended call sequence for WB_SYNC_ALL writeback is:
494 *
495 * wb_writeback()
496 * writeback_sb_inodes() <== called only once
497 * write_cache_pages() <== called once for each inode
498 * (quickly) tag currently dirty pages
499 * (maybe slowly) sync all tagged pages
500 */
501 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
502 pages = LONG_MAX;
503 else {
504 pages = min(bdi->avg_write_bandwidth / 2,
505 global_dirty_limit / DIRTY_SCOPE);
506 pages = min(pages, work->nr_pages);
507 pages = round_down(pages + MIN_WRITEBACK_PAGES,
508 MIN_WRITEBACK_PAGES);
509 }
510
511 return pages;
512}
513
463/* 514/*
464 * Write a portion of b_io inodes which belong to @sb. 515 * Write a portion of b_io inodes which belong to @sb.
465 * 516 *
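
writeback_chunk_size(), added above, returns LONG_MAX for data-integrity sync (tag once, write everything) and otherwise a bandwidth-scaled chunk rounded onto MIN_WRITEBACK_PAGES boundaries. Equivalent arithmetic in a standalone function; the constants here are illustrative, not the kernel's values.

#include <limits.h>

#define MIN_WB_PAGES	1024L	/* illustrative stand-in */
#define DIRTY_SCOPE	8L	/* likewise */

static long chunk_size(int sync_all, long avg_write_bw, long dirty_limit,
		       long nr_pages)
{
	long pages;

	if (sync_all)
		return LONG_MAX;	/* one big tagged pass, no chunking */

	pages = avg_write_bw / 2;
	if (pages > dirty_limit / DIRTY_SCOPE)
		pages = dirty_limit / DIRTY_SCOPE;
	if (pages > nr_pages)
		pages = nr_pages;
	/* round_down(pages + MIN, MIN): at least MIN, and a multiple of it */
	pages = (pages + MIN_WB_PAGES) / MIN_WB_PAGES * MIN_WB_PAGES;
	return pages;
}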
@@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
467 * inodes. Otherwise write only ones which go sequentially 518 * inodes. Otherwise write only ones which go sequentially
468 * in reverse order. 519 * in reverse order.
469 * 520 *
470 * Return 1, if the caller writeback routine should be 521 * Return the number of pages and/or inodes written.
471 * interrupted. Otherwise return 0.
472 */ 522 */
473static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, 523static long writeback_sb_inodes(struct super_block *sb,
474 struct writeback_control *wbc, bool only_this_sb) 524 struct bdi_writeback *wb,
525 struct wb_writeback_work *work)
475{ 526{
527 struct writeback_control wbc = {
528 .sync_mode = work->sync_mode,
529 .tagged_writepages = work->tagged_writepages,
530 .for_kupdate = work->for_kupdate,
531 .for_background = work->for_background,
532 .range_cyclic = work->range_cyclic,
533 .range_start = 0,
534 .range_end = LLONG_MAX,
535 };
536 unsigned long start_time = jiffies;
537 long write_chunk;
538 long wrote = 0; /* count both pages and inodes */
539
476 while (!list_empty(&wb->b_io)) { 540 while (!list_empty(&wb->b_io)) {
477 long pages_skipped;
478 struct inode *inode = wb_inode(wb->b_io.prev); 541 struct inode *inode = wb_inode(wb->b_io.prev);
479 542
480 if (inode->i_sb != sb) { 543 if (inode->i_sb != sb) {
481 if (only_this_sb) { 544 if (work->sb) {
482 /* 545 /*
483 * We only want to write back data for this 546 * We only want to write back data for this
484 * superblock, move all inodes not belonging 547 * superblock, move all inodes not belonging
485 * to it back onto the dirty list. 548 * to it back onto the dirty list.
486 */ 549 */
487 redirty_tail(inode); 550 redirty_tail(inode, wb);
488 continue; 551 continue;
489 } 552 }
490 553
@@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
493 * Bounce back to the caller to unpin this and 556 * Bounce back to the caller to unpin this and
494 * pin the next superblock. 557 * pin the next superblock.
495 */ 558 */
496 return 0; 559 break;
497 } 560 }
498 561
499 /* 562 /*
@@ -504,95 +567,96 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
504 spin_lock(&inode->i_lock); 567 spin_lock(&inode->i_lock);
505 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 568 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
506 spin_unlock(&inode->i_lock); 569 spin_unlock(&inode->i_lock);
507 requeue_io(inode); 570 redirty_tail(inode, wb);
508 continue; 571 continue;
509 } 572 }
510
511 /*
512 * Was this inode dirtied after sync_sb_inodes was called?
513 * This keeps sync from extra jobs and livelock.
514 */
515 if (inode_dirtied_after(inode, wbc->wb_start)) {
516 spin_unlock(&inode->i_lock);
517 return 1;
518 }
519
520 __iget(inode); 573 __iget(inode);
574 write_chunk = writeback_chunk_size(wb->bdi, work);
575 wbc.nr_to_write = write_chunk;
576 wbc.pages_skipped = 0;
577
578 writeback_single_inode(inode, wb, &wbc);
521 579
522 pages_skipped = wbc->pages_skipped; 580 work->nr_pages -= write_chunk - wbc.nr_to_write;
523 writeback_single_inode(inode, wbc); 581 wrote += write_chunk - wbc.nr_to_write;
524 if (wbc->pages_skipped != pages_skipped) { 582 if (!(inode->i_state & I_DIRTY))
583 wrote++;
584 if (wbc.pages_skipped) {
525 /* 585 /*
526 * writeback is not making progress due to locked 586 * writeback is not making progress due to locked
527 * buffers. Skip this inode for now. 587 * buffers. Skip this inode for now.
528 */ 588 */
529 redirty_tail(inode); 589 redirty_tail(inode, wb);
530 } 590 }
531 spin_unlock(&inode->i_lock); 591 spin_unlock(&inode->i_lock);
532 spin_unlock(&inode_wb_list_lock); 592 spin_unlock(&wb->list_lock);
533 iput(inode); 593 iput(inode);
534 cond_resched(); 594 cond_resched();
535 spin_lock(&inode_wb_list_lock); 595 spin_lock(&wb->list_lock);
536 if (wbc->nr_to_write <= 0) { 596 /*
537 wbc->more_io = 1; 597 * bail out to wb_writeback() often enough to check
538 return 1; 598 * background threshold and other termination conditions.
599 */
600 if (wrote) {
601 if (time_is_before_jiffies(start_time + HZ / 10UL))
602 break;
603 if (work->nr_pages <= 0)
604 break;
539 } 605 }
540 if (!list_empty(&wb->b_more_io))
541 wbc->more_io = 1;
542 } 606 }
543 /* b_io is empty */ 607 return wrote;
544 return 1;
545} 608}
546 609
547void writeback_inodes_wb(struct bdi_writeback *wb, 610static long __writeback_inodes_wb(struct bdi_writeback *wb,
548 struct writeback_control *wbc) 611 struct wb_writeback_work *work)
549{ 612{
550 int ret = 0; 613 unsigned long start_time = jiffies;
551 614 long wrote = 0;
552 if (!wbc->wb_start)
553 wbc->wb_start = jiffies; /* livelock avoidance */
554 spin_lock(&inode_wb_list_lock);
555 if (!wbc->for_kupdate || list_empty(&wb->b_io))
556 queue_io(wb, wbc->older_than_this);
557 615
558 while (!list_empty(&wb->b_io)) { 616 while (!list_empty(&wb->b_io)) {
559 struct inode *inode = wb_inode(wb->b_io.prev); 617 struct inode *inode = wb_inode(wb->b_io.prev);
560 struct super_block *sb = inode->i_sb; 618 struct super_block *sb = inode->i_sb;
561 619
562 if (!grab_super_passive(sb)) { 620 if (!grab_super_passive(sb)) {
563 requeue_io(inode); 621 /*
622 * grab_super_passive() may fail consistently due to
623 * s_umount being grabbed by someone else. Don't use
624 * requeue_io() to avoid busy retrying the inode/sb.
625 */
626 redirty_tail(inode, wb);
564 continue; 627 continue;
565 } 628 }
566 ret = writeback_sb_inodes(sb, wb, wbc, false); 629 wrote += writeback_sb_inodes(sb, wb, work);
567 drop_super(sb); 630 drop_super(sb);
568 631
569 if (ret) 632 /* refer to the same tests at the end of writeback_sb_inodes */
570 break; 633 if (wrote) {
634 if (time_is_before_jiffies(start_time + HZ / 10UL))
635 break;
636 if (work->nr_pages <= 0)
637 break;
638 }
571 } 639 }
572 spin_unlock(&inode_wb_list_lock);
573 /* Leave any unwritten inodes on b_io */ 640 /* Leave any unwritten inodes on b_io */
641 return wrote;
574} 642}
575 643
576static void __writeback_inodes_sb(struct super_block *sb, 644long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
577 struct bdi_writeback *wb, struct writeback_control *wbc)
578{ 645{
579 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 646 struct wb_writeback_work work = {
647 .nr_pages = nr_pages,
648 .sync_mode = WB_SYNC_NONE,
649 .range_cyclic = 1,
650 };
580 651
581 spin_lock(&inode_wb_list_lock); 652 spin_lock(&wb->list_lock);
582 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 653 if (list_empty(&wb->b_io))
583 queue_io(wb, wbc->older_than_this); 654 queue_io(wb, NULL);
584 writeback_sb_inodes(sb, wb, wbc, true); 655 __writeback_inodes_wb(wb, &work);
585 spin_unlock(&inode_wb_list_lock); 656 spin_unlock(&wb->list_lock);
586}
587 657
588/* 658 return nr_pages - work.nr_pages;
589 * The maximum number of pages to writeout in a single bdi flush/kupdate 659}
590 * operation. We do this so we don't hold I_SYNC against an inode for
591 * enormous amounts of time, which would block a userspace task which has
592 * been forced to throttle against that inode. Also, the code reevaluates
593 * the dirty each time it has written this many pages.
594 */
595#define MAX_WRITEBACK_PAGES 1024
596 660
597static inline bool over_bground_thresh(void) 661static inline bool over_bground_thresh(void)
598{ 662{
@@ -605,6 +669,16 @@ static inline bool over_bground_thresh(void)
605} 669}
606 670
607/* 671/*
672 * Called under wb->list_lock. If there are multiple wb per bdi,
673 * only the flusher working on the first wb should do it.
674 */
675static void wb_update_bandwidth(struct bdi_writeback *wb,
676 unsigned long start_time)
677{
678 __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
679}
680
681/*
608 * Explicit flushing or periodic writeback of "old" data. 682 * Explicit flushing or periodic writeback of "old" data.
609 * 683 *
610 * Define "old": the first time one of an inode's pages is dirtied, we mark the 684 * Define "old": the first time one of an inode's pages is dirtied, we mark the
@@ -622,47 +696,16 @@ static inline bool over_bground_thresh(void)
622static long wb_writeback(struct bdi_writeback *wb, 696static long wb_writeback(struct bdi_writeback *wb,
623 struct wb_writeback_work *work) 697 struct wb_writeback_work *work)
624{ 698{
625 struct writeback_control wbc = { 699 unsigned long wb_start = jiffies;
626 .sync_mode = work->sync_mode, 700 long nr_pages = work->nr_pages;
627 .older_than_this = NULL,
628 .for_kupdate = work->for_kupdate,
629 .for_background = work->for_background,
630 .range_cyclic = work->range_cyclic,
631 };
632 unsigned long oldest_jif; 701 unsigned long oldest_jif;
633 long wrote = 0;
634 long write_chunk;
635 struct inode *inode; 702 struct inode *inode;
703 long progress;
636 704
637 if (wbc.for_kupdate) { 705 oldest_jif = jiffies;
638 wbc.older_than_this = &oldest_jif; 706 work->older_than_this = &oldest_jif;
639 oldest_jif = jiffies -
640 msecs_to_jiffies(dirty_expire_interval * 10);
641 }
642 if (!wbc.range_cyclic) {
643 wbc.range_start = 0;
644 wbc.range_end = LLONG_MAX;
645 }
646
647 /*
648 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
649 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
650 * here avoids calling into writeback_inodes_wb() more than once.
651 *
652 * The intended call sequence for WB_SYNC_ALL writeback is:
653 *
654 * wb_writeback()
655 * __writeback_inodes_sb() <== called only once
656 * write_cache_pages() <== called once for each inode
657 * (quickly) tag currently dirty pages
658 * (maybe slowly) sync all tagged pages
659 */
660 if (wbc.sync_mode == WB_SYNC_NONE)
661 write_chunk = MAX_WRITEBACK_PAGES;
662 else
663 write_chunk = LONG_MAX;
664 707
665 wbc.wb_start = jiffies; /* livelock avoidance */ 708 spin_lock(&wb->list_lock);
666 for (;;) { 709 for (;;) {
667 /* 710 /*
668 * Stop writeback when nr_pages has been consumed 711 * Stop writeback when nr_pages has been consumed
@@ -687,52 +730,54 @@ static long wb_writeback(struct bdi_writeback *wb,
687 if (work->for_background && !over_bground_thresh()) 730 if (work->for_background && !over_bground_thresh())
688 break; 731 break;
689 732
690 wbc.more_io = 0; 733 if (work->for_kupdate) {
691 wbc.nr_to_write = write_chunk; 734 oldest_jif = jiffies -
692 wbc.pages_skipped = 0; 735 msecs_to_jiffies(dirty_expire_interval * 10);
736 work->older_than_this = &oldest_jif;
737 }
693 738
694 trace_wbc_writeback_start(&wbc, wb->bdi); 739 trace_writeback_start(wb->bdi, work);
740 if (list_empty(&wb->b_io))
741 queue_io(wb, work->older_than_this);
695 if (work->sb) 742 if (work->sb)
696 __writeback_inodes_sb(work->sb, wb, &wbc); 743 progress = writeback_sb_inodes(work->sb, wb, work);
697 else 744 else
698 writeback_inodes_wb(wb, &wbc); 745 progress = __writeback_inodes_wb(wb, work);
699 trace_wbc_writeback_written(&wbc, wb->bdi); 746 trace_writeback_written(wb->bdi, work);
700 747
701 work->nr_pages -= write_chunk - wbc.nr_to_write; 748 wb_update_bandwidth(wb, wb_start);
702 wrote += write_chunk - wbc.nr_to_write;
703 749
704 /* 750 /*
705 * If we consumed everything, see if we have more 751 * Did we write something? Try for more
752 *
753 * Dirty inodes are moved to b_io for writeback in batches.
754 * The completion of the current batch does not necessarily
755 * mean the overall work is done. So we keep looping as long
 756 * as we made some progress on cleaning pages or inodes.
706 */ 757 */
707 if (wbc.nr_to_write <= 0) 758 if (progress)
708 continue; 759 continue;
709 /* 760 /*
710 * Didn't write everything and we don't have more IO, bail 761 * No more inodes for IO, bail
711 */ 762 */
712 if (!wbc.more_io) 763 if (list_empty(&wb->b_more_io))
713 break; 764 break;
714 /* 765 /*
715 * Did we write something? Try for more
716 */
717 if (wbc.nr_to_write < write_chunk)
718 continue;
719 /*
720 * Nothing written. Wait for some inode to 766 * Nothing written. Wait for some inode to
721 * become available for writeback. Otherwise 767 * become available for writeback. Otherwise
722 * we'll just busyloop. 768 * we'll just busyloop.
723 */ 769 */
724 spin_lock(&inode_wb_list_lock);
725 if (!list_empty(&wb->b_more_io)) { 770 if (!list_empty(&wb->b_more_io)) {
771 trace_writeback_wait(wb->bdi, work);
726 inode = wb_inode(wb->b_more_io.prev); 772 inode = wb_inode(wb->b_more_io.prev);
727 trace_wbc_writeback_wait(&wbc, wb->bdi);
728 spin_lock(&inode->i_lock); 773 spin_lock(&inode->i_lock);
729 inode_wait_for_writeback(inode); 774 inode_wait_for_writeback(inode, wb);
730 spin_unlock(&inode->i_lock); 775 spin_unlock(&inode->i_lock);
731 } 776 }
732 spin_unlock(&inode_wb_list_lock);
733 } 777 }
778 spin_unlock(&wb->list_lock);
734 779
735 return wrote; 780 return nr_pages - work->nr_pages;
736} 781}
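Note: with the writeback_control bookkeeping gone, the retry policy of wb_writeback() reduces to three outcomes per pass. A compilable restatement, with the enum and function invented purely for illustration:

    enum wb_next { WB_CONTINUE, WB_DONE, WB_WAIT };

    /* One decision point of the wb_writeback() loop, locking elided. */
    static enum wb_next wb_retry_policy(long progress, int b_more_io_empty)
    {
            if (progress)
                    return WB_CONTINUE;  /* queue_io() again, keep writing */
            if (b_more_io_empty)
                    return WB_DONE;      /* no inode could yield more pages */
            return WB_WAIT;              /* wait on a busy inode, then retry */
    }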
737 782
738/* 783/*
@@ -1063,10 +1108,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1063 } 1108 }
1064 1109
1065 spin_unlock(&inode->i_lock); 1110 spin_unlock(&inode->i_lock);
1066 spin_lock(&inode_wb_list_lock); 1111 spin_lock(&bdi->wb.list_lock);
1067 inode->dirtied_when = jiffies; 1112 inode->dirtied_when = jiffies;
1068 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1113 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1069 spin_unlock(&inode_wb_list_lock); 1114 spin_unlock(&bdi->wb.list_lock);
1070 1115
1071 if (wakeup_bdi) 1116 if (wakeup_bdi)
1072 bdi_wakeup_thread_delayed(bdi); 1117 bdi_wakeup_thread_delayed(bdi);
@@ -1162,10 +1207,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
1162{ 1207{
1163 DECLARE_COMPLETION_ONSTACK(done); 1208 DECLARE_COMPLETION_ONSTACK(done);
1164 struct wb_writeback_work work = { 1209 struct wb_writeback_work work = {
1165 .sb = sb, 1210 .sb = sb,
1166 .sync_mode = WB_SYNC_NONE, 1211 .sync_mode = WB_SYNC_NONE,
1167 .done = &done, 1212 .tagged_writepages = 1,
1168 .nr_pages = nr, 1213 .done = &done,
1214 .nr_pages = nr,
1169 }; 1215 };
1170 1216
1171 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1217 WARN_ON(!rwsem_is_locked(&sb->s_umount));
@@ -1267,6 +1313,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
1267 */ 1313 */
1268int write_inode_now(struct inode *inode, int sync) 1314int write_inode_now(struct inode *inode, int sync)
1269{ 1315{
1316 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1270 int ret; 1317 int ret;
1271 struct writeback_control wbc = { 1318 struct writeback_control wbc = {
1272 .nr_to_write = LONG_MAX, 1319 .nr_to_write = LONG_MAX,
@@ -1279,11 +1326,11 @@ int write_inode_now(struct inode *inode, int sync)
1279 wbc.nr_to_write = 0; 1326 wbc.nr_to_write = 0;
1280 1327
1281 might_sleep(); 1328 might_sleep();
1282 spin_lock(&inode_wb_list_lock); 1329 spin_lock(&wb->list_lock);
1283 spin_lock(&inode->i_lock); 1330 spin_lock(&inode->i_lock);
1284 ret = writeback_single_inode(inode, &wbc); 1331 ret = writeback_single_inode(inode, wb, &wbc);
1285 spin_unlock(&inode->i_lock); 1332 spin_unlock(&inode->i_lock);
1286 spin_unlock(&inode_wb_list_lock); 1333 spin_unlock(&wb->list_lock);
1287 if (sync) 1334 if (sync)
1288 inode_sync_wait(inode); 1335 inode_sync_wait(inode);
1289 return ret; 1336 return ret;
@@ -1303,13 +1350,14 @@ EXPORT_SYMBOL(write_inode_now);
1303 */ 1350 */
1304int sync_inode(struct inode *inode, struct writeback_control *wbc) 1351int sync_inode(struct inode *inode, struct writeback_control *wbc)
1305{ 1352{
1353 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1306 int ret; 1354 int ret;
1307 1355
1308 spin_lock(&inode_wb_list_lock); 1356 spin_lock(&wb->list_lock);
1309 spin_lock(&inode->i_lock); 1357 spin_lock(&inode->i_lock);
1310 ret = writeback_single_inode(inode, wbc); 1358 ret = writeback_single_inode(inode, wb, wbc);
1311 spin_unlock(&inode->i_lock); 1359 spin_unlock(&inode->i_lock);
1312 spin_unlock(&inode_wb_list_lock); 1360 spin_unlock(&wb->list_lock);
1313 return ret; 1361 return ret;
1314} 1362}
1315EXPORT_SYMBOL(sync_inode); 1363EXPORT_SYMBOL(sync_inode);
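Note: throughout this file the global inode_wb_list_lock is replaced by the per-bdi wb->list_lock, always acquired before inode->i_lock. A toy pthread model of that ordering (types invented; the real locks are spinlocks, so this is a sketch of the discipline, not the implementation):

    #include <pthread.h>

    struct toy_wb    { pthread_mutex_t list_lock; };
    struct toy_inode { pthread_mutex_t i_lock; };

    static void toy_sync_inode(struct toy_wb *wb, struct toy_inode *inode)
    {
            pthread_mutex_lock(&wb->list_lock);   /* 1st: per-bdi lock */
            pthread_mutex_lock(&inode->i_lock);   /* 2nd: per-inode lock */
            /* writeback_single_inode() runs here in the real code */
            pthread_mutex_unlock(&inode->i_lock);
            pthread_mutex_unlock(&wb->list_lock);
    }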
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 640fc229df10..5cb8614508c3 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
258 forget->forget_one.nlookup = nlookup; 258 forget->forget_one.nlookup = nlookup;
259 259
260 spin_lock(&fc->lock); 260 spin_lock(&fc->lock);
261 fc->forget_list_tail->next = forget; 261 if (fc->connected) {
262 fc->forget_list_tail = forget; 262 fc->forget_list_tail->next = forget;
263 wake_up(&fc->waitq); 263 fc->forget_list_tail = forget;
264 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 264 wake_up(&fc->waitq);
265 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
266 } else {
267 kfree(forget);
268 }
265 spin_unlock(&fc->lock); 269 spin_unlock(&fc->lock);
266} 270}
267 271
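Note: the fix above guards against queueing a FORGET to a connection that has already been aborted, in which case no reader would ever drain the list. The generic shape of the pattern, as a hedged sketch with invented types:

    #include <pthread.h>
    #include <stdlib.h>

    struct conn { pthread_mutex_t lock; int connected; /* queue, waitq */ };

    static void queue_or_drop(struct conn *c, void *item)
    {
            pthread_mutex_lock(&c->lock);
            if (c->connected) {
                    /* enqueue and wake readers, as fuse_queue_forget() does */
            } else {
                    free(item);   /* nobody will ever dequeue it: drop now */
            }
            pthread_mutex_unlock(&c->lock);
    }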
@@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1358 if (outarg.namelen > FUSE_NAME_MAX) 1362 if (outarg.namelen > FUSE_NAME_MAX)
1359 goto err; 1363 goto err;
1360 1364
1365 err = -EINVAL;
1366 if (size != sizeof(outarg) + outarg.namelen + 1)
1367 goto err;
1368
1361 name.name = buf; 1369 name.name = buf;
1362 name.len = outarg.namelen; 1370 name.len = outarg.namelen;
1363 err = fuse_copy_one(cs, buf, outarg.namelen + 1); 1371 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
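Note: the added -EINVAL check pins the total message size to what the header announces, so extra or missing bytes can no longer desynchronize the device stream. The check in isolation (standalone sketch; the helper name is invented, FUSE_NAME_MAX is the real limit from fs/fuse/fuse_i.h):

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    #define FUSE_NAME_MAX 1024   /* value from fs/fuse/fuse_i.h */

    static int check_inval_entry_size(size_t size, size_t arg_size,
                                      uint32_t namelen)
    {
            if (namelen > FUSE_NAME_MAX)
                    return -ENAMETOOLONG;        /* pre-existing check */
            if (size != arg_size + namelen + 1)  /* new: +1 for the NUL */
                    return -EINVAL;
            return 0;
    }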
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d480d9af46c9..594f07a81c28 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/swap.h>
17 18
18static const struct file_operations fuse_direct_io_file_operations; 19static const struct file_operations fuse_direct_io_file_operations;
19 20
@@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode)
245 req = ff->reserved_req; 246 req = ff->reserved_req;
246 fuse_prepare_release(ff, file->f_flags, opcode); 247 fuse_prepare_release(ff, file->f_flags, opcode);
247 248
249 if (ff->flock) {
250 struct fuse_release_in *inarg = &req->misc.release.in;
251 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
252 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
253 (fl_owner_t) file);
254 }
248 /* Hold vfsmount and dentry until release is finished */ 255 /* Hold vfsmount and dentry until release is finished */
249 path_get(&file->f_path); 256 path_get(&file->f_path);
250 req->misc.release.path = file->f_path; 257 req->misc.release.path = file->f_path;
@@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
755 return req->misc.write.out.size; 762 return req->misc.write.out.size;
756} 763}
757 764
758static int fuse_write_begin(struct file *file, struct address_space *mapping,
759 loff_t pos, unsigned len, unsigned flags,
760 struct page **pagep, void **fsdata)
761{
762 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
763
764 *pagep = grab_cache_page_write_begin(mapping, index, flags);
765 if (!*pagep)
766 return -ENOMEM;
767 return 0;
768}
769
770void fuse_write_update_size(struct inode *inode, loff_t pos) 765void fuse_write_update_size(struct inode *inode, loff_t pos)
771{ 766{
772 struct fuse_conn *fc = get_fuse_conn(inode); 767 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos)
779 spin_unlock(&fc->lock); 774 spin_unlock(&fc->lock);
780} 775}
781 776
782static int fuse_buffered_write(struct file *file, struct inode *inode,
783 loff_t pos, unsigned count, struct page *page)
784{
785 int err;
786 size_t nres;
787 struct fuse_conn *fc = get_fuse_conn(inode);
788 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
789 struct fuse_req *req;
790
791 if (is_bad_inode(inode))
792 return -EIO;
793
794 /*
795 * Make sure writepages on the same page are not mixed up with
796 * plain writes.
797 */
798 fuse_wait_on_page_writeback(inode, page->index);
799
800 req = fuse_get_req(fc);
801 if (IS_ERR(req))
802 return PTR_ERR(req);
803
804 req->in.argpages = 1;
805 req->num_pages = 1;
806 req->pages[0] = page;
807 req->page_offset = offset;
808 nres = fuse_send_write(req, file, pos, count, NULL);
809 err = req->out.h.error;
810 fuse_put_request(fc, req);
811 if (!err && !nres)
812 err = -EIO;
813 if (!err) {
814 pos += nres;
815 fuse_write_update_size(inode, pos);
816 if (count == PAGE_CACHE_SIZE)
817 SetPageUptodate(page);
818 }
819 fuse_invalidate_attr(inode);
820 return err ? err : nres;
821}
822
823static int fuse_write_end(struct file *file, struct address_space *mapping,
824 loff_t pos, unsigned len, unsigned copied,
825 struct page *page, void *fsdata)
826{
827 struct inode *inode = mapping->host;
828 int res = 0;
829
830 if (copied)
831 res = fuse_buffered_write(file, inode, pos, copied, page);
832
833 unlock_page(page);
834 page_cache_release(page);
835 return res;
836}
837
838static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 777static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
839 struct inode *inode, loff_t pos, 778 struct inode *inode, loff_t pos,
840 size_t count) 779 size_t count)
@@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
908 pagefault_enable(); 847 pagefault_enable();
909 flush_dcache_page(page); 848 flush_dcache_page(page);
910 849
850 mark_page_accessed(page);
851
911 if (!tmp) { 852 if (!tmp) {
912 unlock_page(page); 853 unlock_page(page);
913 page_cache_release(page); 854 page_cache_release(page);
@@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1559 struct fuse_conn *fc = get_fuse_conn(inode); 1500 struct fuse_conn *fc = get_fuse_conn(inode);
1560 int err; 1501 int err;
1561 1502
1562 if (fc->no_lock) { 1503 if (fc->no_flock) {
1563 err = flock_lock_file_wait(file, fl); 1504 err = flock_lock_file_wait(file, fl);
1564 } else { 1505 } else {
1506 struct fuse_file *ff = file->private_data;
1507
1565 /* emulate flock with POSIX locks */ 1508 /* emulate flock with POSIX locks */
1566 fl->fl_owner = (fl_owner_t) file; 1509 fl->fl_owner = (fl_owner_t) file;
1510 ff->flock = true;
1567 err = fuse_setlk(file, fl, 1); 1511 err = fuse_setlk(file, fl, 1);
1568 } 1512 }
1569 1513
@@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = {
2201 .readpage = fuse_readpage, 2145 .readpage = fuse_readpage,
2202 .writepage = fuse_writepage, 2146 .writepage = fuse_writepage,
2203 .launder_page = fuse_launder_page, 2147 .launder_page = fuse_launder_page,
2204 .write_begin = fuse_write_begin,
2205 .write_end = fuse_write_end,
2206 .readpages = fuse_readpages, 2148 .readpages = fuse_readpages,
2207 .set_page_dirty = __set_page_dirty_nobuffers, 2149 .set_page_dirty = __set_page_dirty_nobuffers,
2208 .bmap = fuse_bmap, 2150 .bmap = fuse_bmap,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c6aa2d4b8517..cf6db0a93219 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -135,6 +135,9 @@ struct fuse_file {
135 135
136 /** Wait queue head for poll */ 136 /** Wait queue head for poll */
137 wait_queue_head_t poll_wait; 137 wait_queue_head_t poll_wait;
138
139 /** Has flock been performed on this file? */
140 bool flock:1;
138}; 141};
139 142
140/** One input argument of a request */ 143/** One input argument of a request */
@@ -448,7 +451,7 @@ struct fuse_conn {
448 /** Is removexattr not implemented by fs? */ 451 /** Is removexattr not implemented by fs? */
449 unsigned no_removexattr:1; 452 unsigned no_removexattr:1;
450 453
451 /** Are file locking primitives not implemented by fs? */ 454 /** Are posix file locking primitives not implemented by fs? */
452 unsigned no_lock:1; 455 unsigned no_lock:1;
453 456
454 /** Is access not implemented by fs? */ 457 /** Is access not implemented by fs? */
@@ -472,6 +475,9 @@ struct fuse_conn {
472 /** Don't apply umask to creation modes */ 475 /** Don't apply umask to creation modes */
473 unsigned dont_mask:1; 476 unsigned dont_mask:1;
474 477
478 /** Are BSD file locking primitives not implemented by fs? */
479 unsigned no_flock:1;
480
475 /** The number of requests waiting for completion */ 481 /** The number of requests waiting for completion */
476 atomic_t num_waiting; 482 atomic_t num_waiting;
477 483
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 38f84cd48b67..add96f6ffda5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -71,7 +71,7 @@ struct fuse_mount_data {
71 unsigned blksize; 71 unsigned blksize;
72}; 72};
73 73
74struct fuse_forget_link *fuse_alloc_forget() 74struct fuse_forget_link *fuse_alloc_forget(void)
75{ 75{
76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); 76 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
77} 77}
@@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
809 fc->async_read = 1; 809 fc->async_read = 1;
810 if (!(arg->flags & FUSE_POSIX_LOCKS)) 810 if (!(arg->flags & FUSE_POSIX_LOCKS))
811 fc->no_lock = 1; 811 fc->no_lock = 1;
812 if (arg->minor >= 17) {
813 if (!(arg->flags & FUSE_FLOCK_LOCKS))
814 fc->no_flock = 1;
815 } else {
816 if (!(arg->flags & FUSE_POSIX_LOCKS))
817 fc->no_flock = 1;
818 }
812 if (arg->flags & FUSE_ATOMIC_O_TRUNC) 819 if (arg->flags & FUSE_ATOMIC_O_TRUNC)
813 fc->atomic_o_trunc = 1; 820 fc->atomic_o_trunc = 1;
814 if (arg->minor >= 9) { 821 if (arg->minor >= 9) {
@@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
823 } else { 830 } else {
824 ra_pages = fc->max_read / PAGE_CACHE_SIZE; 831 ra_pages = fc->max_read / PAGE_CACHE_SIZE;
825 fc->no_lock = 1; 832 fc->no_lock = 1;
833 fc->no_flock = 1;
826 } 834 }
827 835
828 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); 836 fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
@@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
843 arg->minor = FUSE_KERNEL_MINOR_VERSION; 851 arg->minor = FUSE_KERNEL_MINOR_VERSION;
844 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; 852 arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
845 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 853 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
846 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; 854 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
855 FUSE_FLOCK_LOCKS;
847 req->in.h.opcode = FUSE_INIT; 856 req->in.h.opcode = FUSE_INIT;
848 req->in.numargs = 1; 857 req->in.numargs = 1;
849 req->in.args[0].size = sizeof(*arg); 858 req->in.args[0].size = sizeof(*arg);
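Note: FUSE_FLOCK_LOCKS negotiation is gated on protocol minor 17. Newer servers must set the new flag explicitly; older ones are assumed to cover flock through their POSIX-lock support. Restated as a small predicate (helper name invented; the flag constants are the real ones from include/linux/fuse.h):

    #include <stdbool.h>
    #include <stdint.h>

    #define FUSE_POSIX_LOCKS (1 << 1)
    #define FUSE_FLOCK_LOCKS (1 << 10)

    static bool server_lacks_flock(unsigned minor, uint32_t flags)
    {
            if (minor >= 17)
                    return !(flags & FUSE_FLOCK_LOCKS);
            /* older servers: flock support was implied by POSIX locks */
            return !(flags & FUSE_POSIX_LOCKS);
    }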
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index d5e33a077a67..d0dddaceac59 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
82 return PTR_ERR(acl); 82 return PTR_ERR(acl);
83 } 83 }
84 if (acl) { 84 if (acl) {
85 mode_t mode;
86
87 error = posix_acl_valid(acl); 85 error = posix_acl_valid(acl);
88 if (error) 86 if (error)
89 goto failed; 87 goto failed;
90 switch (type) { 88 switch (type) {
91 case ACL_TYPE_ACCESS: 89 case ACL_TYPE_ACCESS:
92 mode = inode->i_mode; 90 error = posix_acl_equiv_mode(acl, &inode->i_mode);
93 error = posix_acl_equiv_mode(acl, &mode);
94 if (error < 0) 91 if (error < 0)
95 goto failed; 92 goto failed;
96 inode->i_mode = mode;
97 inode->i_ctime = CURRENT_TIME; 93 inode->i_ctime = CURRENT_TIME;
98 if (error == 0) { 94 if (error == 0) {
99 posix_acl_release(acl); 95 posix_acl_release(acl);
@@ -125,21 +121,20 @@ int
125generic_acl_init(struct inode *inode, struct inode *dir) 121generic_acl_init(struct inode *inode, struct inode *dir)
126{ 122{
127 struct posix_acl *acl = NULL; 123 struct posix_acl *acl = NULL;
128 mode_t mode = inode->i_mode;
129 int error; 124 int error;
130 125
131 inode->i_mode = mode & ~current_umask();
132 if (!S_ISLNK(inode->i_mode)) 126 if (!S_ISLNK(inode->i_mode))
133 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); 127 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
134 if (acl) { 128 if (acl) {
135 if (S_ISDIR(inode->i_mode)) 129 if (S_ISDIR(inode->i_mode))
136 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); 130 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
137 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 131 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
138 if (error < 0) 132 if (error < 0)
139 return error; 133 return error;
140 inode->i_mode = mode;
141 if (error > 0) 134 if (error > 0)
142 set_cached_acl(inode, ACL_TYPE_ACCESS, acl); 135 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
136 } else {
137 inode->i_mode &= ~current_umask();
143 } 138 }
144 error = 0; 139 error = 0;
145 140
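Note: after the cleanup, generic_acl_init() follows a single rule: a default ACL on the parent directory decides the new inode's mode via posix_acl_create(), which now writes into i_mode directly, and the umask applies only when no default ACL exists. A condensed view of the new flow, taken from the hunk above (kernel context assumed, error paths elided):

    if (!S_ISLNK(inode->i_mode))
            acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
    if (acl) {
            /* posix_acl_create() writes the result into i_mode itself */
            error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
            if (error > 0)
                    set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
    } else {
            inode->i_mode &= ~current_umask();  /* no default ACL: umask */
    }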
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 884c9af0542f..34501b64bc47 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
72 return gfs2_acl_get(GFS2_I(inode), type); 72 return gfs2_acl_get(GFS2_I(inode), type);
73} 73}
74 74
75static int gfs2_set_mode(struct inode *inode, mode_t mode) 75static int gfs2_set_mode(struct inode *inode, umode_t mode)
76{ 76{
77 int error = 0; 77 int error = 0;
78 78
@@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
117{ 117{
118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
119 struct posix_acl *acl; 119 struct posix_acl *acl;
120 mode_t mode = inode->i_mode; 120 umode_t mode = inode->i_mode;
121 int error = 0; 121 int error = 0;
122 122
123 if (!sdp->sd_args.ar_posix_acl) 123 if (!sdp->sd_args.ar_posix_acl)
@@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
276 goto out_release; 276 goto out_release;
277 277
278 if (type == ACL_TYPE_ACCESS) { 278 if (type == ACL_TYPE_ACCESS) {
279 mode_t mode = inode->i_mode; 279 umode_t mode = inode->i_mode;
280 error = posix_acl_equiv_mode(acl, &mode); 280 error = posix_acl_equiv_mode(acl, &mode);
281 281
282 if (error <= 0) { 282 if (error <= 0) {
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 29e1ace7953d..8a139ff1919f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -16,7 +16,7 @@
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h> 18#include <linux/rculist_bl.h>
19#include <asm/atomic.h> 19#include <linux/atomic.h>
20 20
21#include "gfs2.h" 21#include "gfs2.h"
22#include "incore.h" 22#include "incore.h"
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 516516e0c2a2..3bc073a4cf82 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1018,13 +1018,13 @@ hostdata_error:
1018 fsname++; 1018 fsname++;
1019 if (lm->lm_mount == NULL) { 1019 if (lm->lm_mount == NULL) {
1020 fs_info(sdp, "Now mounting FS...\n"); 1020 fs_info(sdp, "Now mounting FS...\n");
1021 complete(&sdp->sd_locking_init); 1021 complete_all(&sdp->sd_locking_init);
1022 return 0; 1022 return 0;
1023 } 1023 }
1024 ret = lm->lm_mount(sdp, fsname); 1024 ret = lm->lm_mount(sdp, fsname);
1025 if (ret == 0) 1025 if (ret == 0)
1026 fs_info(sdp, "Joined cluster. Now mounting FS...\n"); 1026 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
1027 complete(&sdp->sd_locking_init); 1027 complete_all(&sdp->sd_locking_init);
1028 return ret; 1028 return ret;
1029} 1029}
1030 1030
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 8635be5ffd97..970ea987b3f6 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -16,6 +16,7 @@
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include "os.h" 21#include "os.h"
21 22
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87b6e0421c12..ec889538e5a6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
491 inode->i_op = &page_symlink_inode_operations; 491 inode->i_op = &page_symlink_inode_operations;
492 break; 492 break;
493 } 493 }
494 lockdep_annotate_inode_mutex_key(inode);
494 } 495 }
495 return inode; 496 return inode;
496} 497}
diff --git a/fs/inode.c b/fs/inode.c
index 96c77b81167c..ec7924696a13 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -37,7 +37,7 @@
37 * inode->i_sb->s_inode_lru, inode->i_lru 37 * inode->i_sb->s_inode_lru, inode->i_lru
38 * inode_sb_list_lock protects: 38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list 39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects: 40 * bdi->wb.list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects: 42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash 43 * inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
48 * inode->i_lock 48 * inode->i_lock
49 * inode->i_sb->s_inode_lru_lock 49 * inode->i_sb->s_inode_lru_lock
50 * 50 *
51 * inode_wb_list_lock 51 * bdi->wb.list_lock
52 * inode->i_lock 52 * inode->i_lock
53 * 53 *
54 * inode_hash_lock 54 * inode_hash_lock
@@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly;
65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
66 66
67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); 67__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
68__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
69 68
70/* 69/*
71 * Empty aops. Can be used for the cases where the user does not 70 * Empty aops. Can be used for the cases where the user does not
@@ -144,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
144 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
145 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
146 inode->i_nlink = 1; 145 inode->i_nlink = 1;
146 inode->i_opflags = 0;
147 inode->i_uid = 0; 147 inode->i_uid = 0;
148 inode->i_gid = 0; 148 inode->i_gid = 0;
149 atomic_set(&inode->i_writecount, 0); 149 atomic_set(&inode->i_writecount, 0);
@@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add);
362 362
363static inline void inode_sb_list_del(struct inode *inode) 363static inline void inode_sb_list_del(struct inode *inode)
364{ 364{
365 spin_lock(&inode_sb_list_lock); 365 if (!list_empty(&inode->i_sb_list)) {
366 list_del_init(&inode->i_sb_list); 366 spin_lock(&inode_sb_list_lock);
367 spin_unlock(&inode_sb_list_lock); 367 list_del_init(&inode->i_sb_list);
368 spin_unlock(&inode_sb_list_lock);
369 }
368} 370}
369 371
370static unsigned long hash(struct super_block *sb, unsigned long hashval) 372static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -398,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
398EXPORT_SYMBOL(__insert_inode_hash); 400EXPORT_SYMBOL(__insert_inode_hash);
399 401
400/** 402/**
401 * remove_inode_hash - remove an inode from the hash 403 * __remove_inode_hash - remove an inode from the hash
402 * @inode: inode to unhash 404 * @inode: inode to unhash
403 * 405 *
404 * Remove an inode from the superblock. 406 * Remove an inode from the superblock.
405 */ 407 */
406void remove_inode_hash(struct inode *inode) 408void __remove_inode_hash(struct inode *inode)
407{ 409{
408 spin_lock(&inode_hash_lock); 410 spin_lock(&inode_hash_lock);
409 spin_lock(&inode->i_lock); 411 spin_lock(&inode->i_lock);
@@ -411,7 +413,7 @@ void remove_inode_hash(struct inode *inode)
411 spin_unlock(&inode->i_lock); 413 spin_unlock(&inode->i_lock);
412 spin_unlock(&inode_hash_lock); 414 spin_unlock(&inode_hash_lock);
413} 415}
414EXPORT_SYMBOL(remove_inode_hash); 416EXPORT_SYMBOL(__remove_inode_hash);
415 417
416void end_writeback(struct inode *inode) 418void end_writeback(struct inode *inode)
417{ 419{
@@ -453,7 +455,9 @@ static void evict(struct inode *inode)
453 BUG_ON(!(inode->i_state & I_FREEING)); 455 BUG_ON(!(inode->i_state & I_FREEING));
454 BUG_ON(!list_empty(&inode->i_lru)); 456 BUG_ON(!list_empty(&inode->i_lru));
455 457
456 inode_wb_list_del(inode); 458 if (!list_empty(&inode->i_wb_list))
459 inode_wb_list_del(inode);
460
457 inode_sb_list_del(inode); 461 inode_sb_list_del(inode);
458 462
459 if (op->evict_inode) { 463 if (op->evict_inode) {
@@ -797,6 +801,29 @@ unsigned int get_next_ino(void)
797EXPORT_SYMBOL(get_next_ino); 801EXPORT_SYMBOL(get_next_ino);
798 802
799/** 803/**
804 * new_inode_pseudo - obtain an inode
805 * @sb: superblock
806 *
807 * Allocates a new inode for given superblock.
808 * Inode wont be chained in superblock s_inodes list
809 * This means :
810 * - fs can't be unmount
811 * - quotas, fsnotify, writeback can't work
812 */
813struct inode *new_inode_pseudo(struct super_block *sb)
814{
815 struct inode *inode = alloc_inode(sb);
816
817 if (inode) {
818 spin_lock(&inode->i_lock);
819 inode->i_state = 0;
820 spin_unlock(&inode->i_lock);
821 INIT_LIST_HEAD(&inode->i_sb_list);
822 }
823 return inode;
824}
825
826/**
800 * new_inode - obtain an inode 827 * new_inode - obtain an inode
801 * @sb: superblock 828 * @sb: superblock
802 * 829 *
@@ -814,27 +841,16 @@ struct inode *new_inode(struct super_block *sb)
814 841
815 spin_lock_prefetch(&inode_sb_list_lock); 842 spin_lock_prefetch(&inode_sb_list_lock);
816 843
817 inode = alloc_inode(sb); 844 inode = new_inode_pseudo(sb);
818 if (inode) { 845 if (inode)
819 spin_lock(&inode->i_lock);
820 inode->i_state = 0;
821 spin_unlock(&inode->i_lock);
822 inode_sb_list_add(inode); 846 inode_sb_list_add(inode);
823 }
824 return inode; 847 return inode;
825} 848}
826EXPORT_SYMBOL(new_inode); 849EXPORT_SYMBOL(new_inode);
827 850
828/**
829 * unlock_new_inode - clear the I_NEW state and wake up any waiters
830 * @inode: new inode to unlock
831 *
832 * Called when the inode is fully initialised to clear the new state of the
833 * inode and wake up anyone waiting for the inode to finish initialisation.
834 */
835void unlock_new_inode(struct inode *inode)
836{
837#ifdef CONFIG_DEBUG_LOCK_ALLOC 851#ifdef CONFIG_DEBUG_LOCK_ALLOC
852void lockdep_annotate_inode_mutex_key(struct inode *inode)
853{
838 if (S_ISDIR(inode->i_mode)) { 854 if (S_ISDIR(inode->i_mode)) {
839 struct file_system_type *type = inode->i_sb->s_type; 855 struct file_system_type *type = inode->i_sb->s_type;
840 856
@@ -850,7 +866,20 @@ void unlock_new_inode(struct inode *inode)
850 &type->i_mutex_dir_key); 866 &type->i_mutex_dir_key);
851 } 867 }
852 } 868 }
869}
870EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
853#endif 871#endif
872
873/**
874 * unlock_new_inode - clear the I_NEW state and wake up any waiters
875 * @inode: new inode to unlock
876 *
877 * Called when the inode is fully initialised to clear the new state of the
878 * inode and wake up anyone waiting for the inode to finish initialisation.
879 */
880void unlock_new_inode(struct inode *inode)
881{
882 lockdep_annotate_inode_mutex_key(inode);
854 spin_lock(&inode->i_lock); 883 spin_lock(&inode->i_lock);
855 WARN_ON(!(inode->i_state & I_NEW)); 884 WARN_ON(!(inode->i_state & I_NEW));
856 inode->i_state &= ~I_NEW; 885 inode->i_state &= ~I_NEW;
@@ -1308,7 +1337,8 @@ static void iput_final(struct inode *inode)
1308 } 1337 }
1309 1338
1310 inode->i_state |= I_FREEING; 1339 inode->i_state |= I_FREEING;
1311 inode_lru_list_del(inode); 1340 if (!list_empty(&inode->i_lru))
1341 inode_lru_list_del(inode);
1312 spin_unlock(&inode->i_lock); 1342 spin_unlock(&inode->i_lock);
1313 1343
1314 evict(inode); 1344 evict(inode);
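Note: new_inode_pseudo() gives pseudo filesystems an inode that is never on the superblock's s_inodes list, which is why evict() above can now skip the list deletion when the list head is empty. The intended call pattern (hypothetical caller, kernel context assumed):

    struct inode *inode = new_inode_pseudo(sb);

    if (!inode)
            return NULL;
    inode->i_ino = get_next_ino();
    /*
     * i_sb_list stays empty, so umount, quota and fsnotify walks never
     * see this inode and inode_sb_list_del() is a no-op at eviction.
     */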
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index e4b87bc1fa56..f94fc48ff3a0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -22,6 +22,8 @@
22#include <linux/jbd.h> 22#include <linux/jbd.h>
23#include <linux/errno.h> 23#include <linux/errno.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/blkdev.h>
26#include <trace/events/jbd.h>
25 27
26/* 28/*
27 * Unlink a buffer from a transaction checkpoint list. 29 * Unlink a buffer from a transaction checkpoint list.
@@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
95 97
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && 98 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
97 !buffer_dirty(bh) && !buffer_write_io_error(bh)) { 99 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
100 /*
101 * Get our reference so that bh cannot be freed before
102 * we unlock it
103 */
104 get_bh(bh);
98 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 JBUFFER_TRACE(jh, "remove from checkpoint list");
99 ret = __journal_remove_checkpoint(jh) + 1; 106 ret = __journal_remove_checkpoint(jh) + 1;
100 jbd_unlock_bh_state(bh); 107 jbd_unlock_bh_state(bh);
101 journal_remove_journal_head(bh);
102 BUFFER_TRACE(bh, "release"); 108 BUFFER_TRACE(bh, "release");
103 __brelse(bh); 109 __brelse(bh);
104 } else { 110 } else {
@@ -220,8 +226,8 @@ restart:
220 spin_lock(&journal->j_list_lock); 226 spin_lock(&journal->j_list_lock);
221 goto restart; 227 goto restart;
222 } 228 }
229 get_bh(bh);
223 if (buffer_locked(bh)) { 230 if (buffer_locked(bh)) {
224 get_bh(bh);
225 spin_unlock(&journal->j_list_lock); 231 spin_unlock(&journal->j_list_lock);
226 jbd_unlock_bh_state(bh); 232 jbd_unlock_bh_state(bh);
227 wait_on_buffer(bh); 233 wait_on_buffer(bh);
@@ -240,7 +246,6 @@ restart:
240 */ 246 */
241 released = __journal_remove_checkpoint(jh); 247 released = __journal_remove_checkpoint(jh);
242 jbd_unlock_bh_state(bh); 248 jbd_unlock_bh_state(bh);
243 journal_remove_journal_head(bh);
244 __brelse(bh); 249 __brelse(bh);
245 } 250 }
246 251
@@ -253,9 +258,12 @@ static void
253__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 258__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 259{
255 int i; 260 int i;
261 struct blk_plug plug;
256 262
263 blk_start_plug(&plug);
257 for (i = 0; i < *batch_count; i++) 264 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE); 265 write_dirty_buffer(bhs[i], WRITE_SYNC);
266 blk_finish_plug(&plug);
259 267
260 for (i = 0; i < *batch_count; i++) { 268 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 269 struct buffer_head *bh = bhs[i];
@@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
304 ret = 1; 312 ret = 1;
305 if (unlikely(buffer_write_io_error(bh))) 313 if (unlikely(buffer_write_io_error(bh)))
306 ret = -EIO; 314 ret = -EIO;
315 get_bh(bh);
307 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 316 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
308 BUFFER_TRACE(bh, "remove from checkpoint"); 317 BUFFER_TRACE(bh, "remove from checkpoint");
309 __journal_remove_checkpoint(jh); 318 __journal_remove_checkpoint(jh);
310 spin_unlock(&journal->j_list_lock); 319 spin_unlock(&journal->j_list_lock);
311 jbd_unlock_bh_state(bh); 320 jbd_unlock_bh_state(bh);
312 journal_remove_journal_head(bh);
313 __brelse(bh); 321 __brelse(bh);
314 } else { 322 } else {
315 /* 323 /*
@@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal)
358 * journal straight away. 366 * journal straight away.
359 */ 367 */
360 result = cleanup_journal_tail(journal); 368 result = cleanup_journal_tail(journal);
369 trace_jbd_checkpoint(journal, result);
361 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 370 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
362 if (result <= 0) 371 if (result <= 0)
363 return result; 372 return result;
@@ -503,6 +512,7 @@ int cleanup_journal_tail(journal_t *journal)
503 if (blocknr < journal->j_tail) 512 if (blocknr < journal->j_tail)
504 freed = freed + journal->j_last - journal->j_first; 513 freed = freed + journal->j_last - journal->j_first;
505 514
515 trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
506 jbd_debug(1, 516 jbd_debug(1,
507 "Cleaning journal tail from %d to %d (offset %u), " 517 "Cleaning journal tail from %d to %d (offset %u), "
508 "freeing %u\n", 518 "freeing %u\n",
@@ -523,9 +533,9 @@ int cleanup_journal_tail(journal_t *journal)
523/* 533/*
524 * journal_clean_one_cp_list 534 * journal_clean_one_cp_list
525 * 535 *
526 * Find all the written-back checkpoint buffers in the given list and release them. 536 * Find all the written-back checkpoint buffers in the given list and release
537 * them.
527 * 538 *
528 * Called with the journal locked.
529 * Called with j_list_lock held. 539 * Called with j_list_lock held.
 530 * Returns number of buffers reaped (for debug) 540 * Returns number of buffers reaped (for debug)
531 */ 541 */
@@ -632,8 +642,8 @@ out:
632 * checkpoint lists. 642 * checkpoint lists.
633 * 643 *
634 * The function returns 1 if it frees the transaction, 0 otherwise. 644 * The function returns 1 if it frees the transaction, 0 otherwise.
645 * The function can free jh and bh.
635 * 646 *
636 * This function is called with the journal locked.
637 * This function is called with j_list_lock held. 647 * This function is called with j_list_lock held.
638 * This function is called with jbd_lock_bh_state(jh2bh(jh)) 648 * This function is called with jbd_lock_bh_state(jh2bh(jh))
639 */ 649 */
@@ -652,13 +662,14 @@ int __journal_remove_checkpoint(struct journal_head *jh)
652 } 662 }
653 journal = transaction->t_journal; 663 journal = transaction->t_journal;
654 664
665 JBUFFER_TRACE(jh, "removing from transaction");
655 __buffer_unlink(jh); 666 __buffer_unlink(jh);
656 jh->b_cp_transaction = NULL; 667 jh->b_cp_transaction = NULL;
668 journal_put_journal_head(jh);
657 669
658 if (transaction->t_checkpoint_list != NULL || 670 if (transaction->t_checkpoint_list != NULL ||
659 transaction->t_checkpoint_io_list != NULL) 671 transaction->t_checkpoint_io_list != NULL)
660 goto out; 672 goto out;
661 JBUFFER_TRACE(jh, "transaction has no more buffers");
662 673
663 /* 674 /*
664 * There is one special case to worry about: if we have just pulled the 675 * There is one special case to worry about: if we have just pulled the
@@ -669,10 +680,8 @@ int __journal_remove_checkpoint(struct journal_head *jh)
669 * The locking here around t_state is a bit sleazy. 680 * The locking here around t_state is a bit sleazy.
670 * See the comment at the end of journal_commit_transaction(). 681 * See the comment at the end of journal_commit_transaction().
671 */ 682 */
672 if (transaction->t_state != T_FINISHED) { 683 if (transaction->t_state != T_FINISHED)
673 JBUFFER_TRACE(jh, "belongs to running/committing transaction");
674 goto out; 684 goto out;
675 }
676 685
677 /* OK, that was the last buffer for the transaction: we can now 686 /* OK, that was the last buffer for the transaction: we can now
678 safely remove this transaction from the log */ 687 safely remove this transaction from the log */
@@ -684,7 +693,6 @@ int __journal_remove_checkpoint(struct journal_head *jh)
684 wake_up(&journal->j_wait_logspace); 693 wake_up(&journal->j_wait_logspace);
685 ret = 1; 694 ret = 1;
686out: 695out:
687 JBUFFER_TRACE(jh, "exit");
688 return ret; 696 return ret;
689} 697}
690 698
@@ -703,6 +711,8 @@ void __journal_insert_checkpoint(struct journal_head *jh,
703 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); 711 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
704 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 712 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
705 713
714 /* Get reference for checkpointing transaction */
715 journal_grab_journal_head(jh2bh(jh));
706 jh->b_cp_transaction = transaction; 716 jh->b_cp_transaction = transaction;
707 717
708 if (!transaction->t_checkpoint_list) { 718 if (!transaction->t_checkpoint_list) {
@@ -752,6 +762,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
752 J_ASSERT(journal->j_committing_transaction != transaction); 762 J_ASSERT(journal->j_committing_transaction != transaction);
753 J_ASSERT(journal->j_running_transaction != transaction); 763 J_ASSERT(journal->j_running_transaction != transaction);
754 764
765 trace_jbd_drop_transaction(journal, transaction);
755 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 766 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
756 kfree(transaction); 767 kfree(transaction);
757} 768}
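Note: every call site that used journal_remove_journal_head() now follows one convention: pin the buffer_head first, let __journal_remove_checkpoint() drop the checkpoint's journal_head reference, and release the pin last. The pattern, extracted from the hunks above (kernel context assumed):

    get_bh(bh);                        /* keep bh alive across the drop */
    __journal_remove_checkpoint(jh);   /* may free jh: b_jcount can hit 0 */
    jbd_unlock_bh_state(bh);
    __brelse(bh);                      /* release our pin */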
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 72ffa974b0b8..8799207df058 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <trace/events/jbd.h>
24 25
25/* 26/*
26 * Default IO end handler for temporary BJ_IO buffer_heads. 27 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -204,6 +205,8 @@ write_out_data:
204 if (!trylock_buffer(bh)) { 205 if (!trylock_buffer(bh)) {
205 BUFFER_TRACE(bh, "needs blocking lock"); 206 BUFFER_TRACE(bh, "needs blocking lock");
206 spin_unlock(&journal->j_list_lock); 207 spin_unlock(&journal->j_list_lock);
208 trace_jbd_do_submit_data(journal,
209 commit_transaction);
207 /* Write out all data to prevent deadlocks */ 210 /* Write out all data to prevent deadlocks */
208 journal_do_submit_data(wbuf, bufs, write_op); 211 journal_do_submit_data(wbuf, bufs, write_op);
209 bufs = 0; 212 bufs = 0;
@@ -236,6 +239,8 @@ write_out_data:
236 jbd_unlock_bh_state(bh); 239 jbd_unlock_bh_state(bh);
237 if (bufs == journal->j_wbufsize) { 240 if (bufs == journal->j_wbufsize) {
238 spin_unlock(&journal->j_list_lock); 241 spin_unlock(&journal->j_list_lock);
242 trace_jbd_do_submit_data(journal,
243 commit_transaction);
239 journal_do_submit_data(wbuf, bufs, write_op); 244 journal_do_submit_data(wbuf, bufs, write_op);
240 bufs = 0; 245 bufs = 0;
241 goto write_out_data; 246 goto write_out_data;
@@ -253,10 +258,6 @@ write_out_data:
253 jbd_unlock_bh_state(bh); 258 jbd_unlock_bh_state(bh);
254 if (locked) 259 if (locked)
255 unlock_buffer(bh); 260 unlock_buffer(bh);
256 journal_remove_journal_head(bh);
257 /* One for our safety reference, other for
258 * journal_remove_journal_head() */
259 put_bh(bh);
260 release_data_buffer(bh); 261 release_data_buffer(bh);
261 } 262 }
262 263
@@ -266,6 +267,7 @@ write_out_data:
266 } 267 }
267 } 268 }
268 spin_unlock(&journal->j_list_lock); 269 spin_unlock(&journal->j_list_lock);
270 trace_jbd_do_submit_data(journal, commit_transaction);
269 journal_do_submit_data(wbuf, bufs, write_op); 271 journal_do_submit_data(wbuf, bufs, write_op);
270 272
271 return err; 273 return err;
@@ -316,12 +318,14 @@ void journal_commit_transaction(journal_t *journal)
316 commit_transaction = journal->j_running_transaction; 318 commit_transaction = journal->j_running_transaction;
317 J_ASSERT(commit_transaction->t_state == T_RUNNING); 319 J_ASSERT(commit_transaction->t_state == T_RUNNING);
318 320
321 trace_jbd_start_commit(journal, commit_transaction);
319 jbd_debug(1, "JBD: starting commit of transaction %d\n", 322 jbd_debug(1, "JBD: starting commit of transaction %d\n",
320 commit_transaction->t_tid); 323 commit_transaction->t_tid);
321 324
322 spin_lock(&journal->j_state_lock); 325 spin_lock(&journal->j_state_lock);
323 commit_transaction->t_state = T_LOCKED; 326 commit_transaction->t_state = T_LOCKED;
324 327
328 trace_jbd_commit_locking(journal, commit_transaction);
325 spin_lock(&commit_transaction->t_handle_lock); 329 spin_lock(&commit_transaction->t_handle_lock);
326 while (commit_transaction->t_updates) { 330 while (commit_transaction->t_updates) {
327 DEFINE_WAIT(wait); 331 DEFINE_WAIT(wait);
@@ -392,6 +396,7 @@ void journal_commit_transaction(journal_t *journal)
392 */ 396 */
393 journal_switch_revoke_table(journal); 397 journal_switch_revoke_table(journal);
394 398
399 trace_jbd_commit_flushing(journal, commit_transaction);
395 commit_transaction->t_state = T_FLUSH; 400 commit_transaction->t_state = T_FLUSH;
396 journal->j_committing_transaction = commit_transaction; 401 journal->j_committing_transaction = commit_transaction;
397 journal->j_running_transaction = NULL; 402 journal->j_running_transaction = NULL;
@@ -446,14 +451,9 @@ void journal_commit_transaction(journal_t *journal)
446 } 451 }
447 if (buffer_jbd(bh) && bh2jh(bh) == jh && 452 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
448 jh->b_transaction == commit_transaction && 453 jh->b_transaction == commit_transaction &&
449 jh->b_jlist == BJ_Locked) { 454 jh->b_jlist == BJ_Locked)
450 __journal_unfile_buffer(jh); 455 __journal_unfile_buffer(jh);
451 jbd_unlock_bh_state(bh); 456 jbd_unlock_bh_state(bh);
452 journal_remove_journal_head(bh);
453 put_bh(bh);
454 } else {
455 jbd_unlock_bh_state(bh);
456 }
457 release_data_buffer(bh); 457 release_data_buffer(bh);
458 cond_resched_lock(&journal->j_list_lock); 458 cond_resched_lock(&journal->j_list_lock);
459 } 459 }
@@ -493,6 +493,7 @@ void journal_commit_transaction(journal_t *journal)
493 commit_transaction->t_state = T_COMMIT; 493 commit_transaction->t_state = T_COMMIT;
494 spin_unlock(&journal->j_state_lock); 494 spin_unlock(&journal->j_state_lock);
495 495
496 trace_jbd_commit_logging(journal, commit_transaction);
496 J_ASSERT(commit_transaction->t_nr_buffers <= 497 J_ASSERT(commit_transaction->t_nr_buffers <=
497 commit_transaction->t_outstanding_credits); 498 commit_transaction->t_outstanding_credits);
498 499
@@ -797,10 +798,16 @@ restart_loop:
797 while (commit_transaction->t_forget) { 798 while (commit_transaction->t_forget) {
798 transaction_t *cp_transaction; 799 transaction_t *cp_transaction;
799 struct buffer_head *bh; 800 struct buffer_head *bh;
801 int try_to_free = 0;
800 802
801 jh = commit_transaction->t_forget; 803 jh = commit_transaction->t_forget;
802 spin_unlock(&journal->j_list_lock); 804 spin_unlock(&journal->j_list_lock);
803 bh = jh2bh(jh); 805 bh = jh2bh(jh);
806 /*
807 * Get a reference so that bh cannot be freed before we are
808 * done with it.
809 */
810 get_bh(bh);
804 jbd_lock_bh_state(bh); 811 jbd_lock_bh_state(bh);
805 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || 812 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
806 jh->b_transaction == journal->j_running_transaction); 813 jh->b_transaction == journal->j_running_transaction);
@@ -858,28 +865,27 @@ restart_loop:
858 __journal_insert_checkpoint(jh, commit_transaction); 865 __journal_insert_checkpoint(jh, commit_transaction);
859 if (is_journal_aborted(journal)) 866 if (is_journal_aborted(journal))
860 clear_buffer_jbddirty(bh); 867 clear_buffer_jbddirty(bh);
861 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
862 __journal_refile_buffer(jh);
863 jbd_unlock_bh_state(bh);
864 } else { 868 } else {
865 J_ASSERT_BH(bh, !buffer_dirty(bh)); 869 J_ASSERT_BH(bh, !buffer_dirty(bh));
866 /* The buffer on BJ_Forget list and not jbddirty means 870 /*
871 * The buffer on BJ_Forget list and not jbddirty means
867 * it has been freed by this transaction and hence it 872 * it has been freed by this transaction and hence it
868 * could not have been reallocated until this 873 * could not have been reallocated until this
869 * transaction has committed. *BUT* it could be 874 * transaction has committed. *BUT* it could be
870 * reallocated once we have written all the data to 875 * reallocated once we have written all the data to
871 * disk and before we process the buffer on BJ_Forget 876 * disk and before we process the buffer on BJ_Forget
872 * list. */ 877 * list.
873 JBUFFER_TRACE(jh, "refile or unfile freed buffer"); 878 */
874 __journal_refile_buffer(jh); 879 if (!jh->b_next_transaction)
875 if (!jh->b_transaction) { 880 try_to_free = 1;
876 jbd_unlock_bh_state(bh);
877 /* needs a brelse */
878 journal_remove_journal_head(bh);
879 release_buffer_page(bh);
880 } else
881 jbd_unlock_bh_state(bh);
882 } 881 }
882 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
883 __journal_refile_buffer(jh);
884 jbd_unlock_bh_state(bh);
885 if (try_to_free)
886 release_buffer_page(bh);
887 else
888 __brelse(bh);
883 cond_resched_lock(&journal->j_list_lock); 889 cond_resched_lock(&journal->j_list_lock);
884 } 890 }
885 spin_unlock(&journal->j_list_lock); 891 spin_unlock(&journal->j_list_lock);
@@ -946,6 +952,7 @@ restart_loop:
946 } 952 }
947 spin_unlock(&journal->j_list_lock); 953 spin_unlock(&journal->j_list_lock);
948 954
955 trace_jbd_end_commit(journal, commit_transaction);
949 jbd_debug(1, "JBD: commit %d complete, head %d\n", 956 jbd_debug(1, "JBD: commit %d complete, head %d\n",
950 journal->j_commit_sequence, journal->j_tail_sequence); 957 journal->j_commit_sequence, journal->j_tail_sequence);
951 958
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e2d4285fbe90..9fe061fb8779 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -38,6 +38,9 @@
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40 40
41#define CREATE_TRACE_POINTS
42#include <trace/events/jbd.h>
43
41#include <asm/uaccess.h> 44#include <asm/uaccess.h>
42#include <asm/page.h> 45#include <asm/page.h>
43 46
@@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait)
1065 } else 1068 } else
1066 write_dirty_buffer(bh, WRITE); 1069 write_dirty_buffer(bh, WRITE);
1067 1070
1071 trace_jbd_update_superblock_end(journal, wait);
1068out: 1072out:
1069 /* If we have just flushed the log (by marking s_start==0), then 1073 /* If we have just flushed the log (by marking s_start==0), then
1070 * any future commit will have to be careful to update the 1074 * any future commit will have to be careful to update the
@@ -1799,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh)
1799 * When a buffer has its BH_JBD bit set it is immune from being released by 1803 * When a buffer has its BH_JBD bit set it is immune from being released by
1800 * core kernel code, mainly via ->b_count. 1804 * core kernel code, mainly via ->b_count.
1801 * 1805 *
1802 * A journal_head may be detached from its buffer_head when the journal_head's 1806 * A journal_head is detached from its buffer_head when the journal_head's
1803 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. 1807 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
1804 * Various places in JBD call journal_remove_journal_head() to indicate that the 1808 * transaction (b_cp_transaction) hold their references to b_jcount.
1805 * journal_head can be dropped if needed.
1806 * 1809 *
1807 * Various places in the kernel want to attach a journal_head to a buffer_head 1810 * Various places in the kernel want to attach a journal_head to a buffer_head
1808 * _before_ attaching the journal_head to a transaction. To protect the 1811 * _before_ attaching the journal_head to a transaction. To protect the
@@ -1815,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh)
1815 * (Attach a journal_head if needed. Increments b_jcount) 1818 * (Attach a journal_head if needed. Increments b_jcount)
1816 * struct journal_head *jh = journal_add_journal_head(bh); 1819 * struct journal_head *jh = journal_add_journal_head(bh);
1817 * ... 1820 * ...
1818 * jh->b_transaction = xxx; 1821 * (Get another reference for transaction)
1819 * journal_put_journal_head(jh); 1822 * journal_grab_journal_head(bh);
1820 * 1823 * jh->b_transaction = xxx;
1821 * Now, the journal_head's b_jcount is zero, but it is safe from being released 1824 * (Put original reference)
1822 * because it has a non-zero b_transaction. 1825 * journal_put_journal_head(jh);
1823 */ 1826 */
1824 1827
1825/* 1828/*
1826 * Give a buffer_head a journal_head. 1829 * Give a buffer_head a journal_head.
1827 * 1830 *
1828 * Doesn't need the journal lock.
1829 * May sleep. 1831 * May sleep.
1830 */ 1832 */
1831struct journal_head *journal_add_journal_head(struct buffer_head *bh) 1833struct journal_head *journal_add_journal_head(struct buffer_head *bh)
@@ -1889,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
1889 struct journal_head *jh = bh2jh(bh); 1891 struct journal_head *jh = bh2jh(bh);
1890 1892
1891 J_ASSERT_JH(jh, jh->b_jcount >= 0); 1893 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1892 1894 J_ASSERT_JH(jh, jh->b_transaction == NULL);
1893 get_bh(bh); 1895 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1894 if (jh->b_jcount == 0) { 1896 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
1895 if (jh->b_transaction == NULL && 1897 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1896 jh->b_next_transaction == NULL && 1898 J_ASSERT_BH(bh, buffer_jbd(bh));
1897 jh->b_cp_transaction == NULL) { 1899 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1898 J_ASSERT_JH(jh, jh->b_jlist == BJ_None); 1900 BUFFER_TRACE(bh, "remove journal_head");
1899 J_ASSERT_BH(bh, buffer_jbd(bh)); 1901 if (jh->b_frozen_data) {
1900 J_ASSERT_BH(bh, jh2bh(jh) == bh); 1902 printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
1901 BUFFER_TRACE(bh, "remove journal_head"); 1903 jbd_free(jh->b_frozen_data, bh->b_size);
1902 if (jh->b_frozen_data) {
1903 printk(KERN_WARNING "%s: freeing "
1904 "b_frozen_data\n",
1905 __func__);
1906 jbd_free(jh->b_frozen_data, bh->b_size);
1907 }
1908 if (jh->b_committed_data) {
1909 printk(KERN_WARNING "%s: freeing "
1910 "b_committed_data\n",
1911 __func__);
1912 jbd_free(jh->b_committed_data, bh->b_size);
1913 }
1914 bh->b_private = NULL;
1915 jh->b_bh = NULL; /* debug, really */
1916 clear_buffer_jbd(bh);
1917 __brelse(bh);
1918 journal_free_journal_head(jh);
1919 } else {
1920 BUFFER_TRACE(bh, "journal_head was locked");
1921 }
1922 } 1904 }
1905 if (jh->b_committed_data) {
1906 printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
1907 jbd_free(jh->b_committed_data, bh->b_size);
1908 }
1909 bh->b_private = NULL;
1910 jh->b_bh = NULL; /* debug, really */
1911 clear_buffer_jbd(bh);
1912 journal_free_journal_head(jh);
1923} 1913}
1924 1914
1925/* 1915/*
1926 * journal_remove_journal_head(): if the buffer isn't attached to a transaction 1916 * Drop a reference on the passed journal_head. If it fell to zero then
1927 * and has a zero b_jcount then remove and release its journal_head. If we did
1928 * see that the buffer is not used by any transaction we also "logically"
1929 * decrement ->b_count.
1930 *
1931 * We in fact take an additional increment on ->b_count as a convenience,
1932 * because the caller usually wants to do additional things with the bh
1933 * after calling here.
1934 * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
1935 * time. Once the caller has run __brelse(), the buffer is eligible for
1936 * reaping by try_to_free_buffers().
1937 */
1938void journal_remove_journal_head(struct buffer_head *bh)
1939{
1940 jbd_lock_bh_journal_head(bh);
1941 __journal_remove_journal_head(bh);
1942 jbd_unlock_bh_journal_head(bh);
1943}
1944
1945/*
1946 * Drop a reference on the passed journal_head. If it fell to zero then try to
1947 * release the journal_head from the buffer_head. 1917 * release the journal_head from the buffer_head.
1948 */ 1918 */
1949void journal_put_journal_head(struct journal_head *jh) 1919void journal_put_journal_head(struct journal_head *jh)
@@ -1953,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh)
1953 jbd_lock_bh_journal_head(bh); 1923 jbd_lock_bh_journal_head(bh);
1954 J_ASSERT_JH(jh, jh->b_jcount > 0); 1924 J_ASSERT_JH(jh, jh->b_jcount > 0);
1955 --jh->b_jcount; 1925 --jh->b_jcount;
1956 if (!jh->b_jcount && !jh->b_transaction) { 1926 if (!jh->b_jcount) {
1957 __journal_remove_journal_head(bh); 1927 __journal_remove_journal_head(bh);
1928 jbd_unlock_bh_journal_head(bh);
1958 __brelse(bh); 1929 __brelse(bh);
1959 } 1930 } else
1960 jbd_unlock_bh_journal_head(bh); 1931 jbd_unlock_bh_journal_head(bh);
1961} 1932}
1962 1933
1963/* 1934/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index f7ee81a065da..7e59c6e66f9b 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,6 +26,7 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h>
29 30
30static void __journal_temp_unlink_buffer(struct journal_head *jh); 31static void __journal_temp_unlink_buffer(struct journal_head *jh);
31 32
@@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
99 100
100alloc_transaction: 101alloc_transaction:
101 if (!journal->j_running_transaction) { 102 if (!journal->j_running_transaction) {
102 new_transaction = kzalloc(sizeof(*new_transaction), 103 new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
103 GFP_NOFS|__GFP_NOFAIL);
104 if (!new_transaction) { 104 if (!new_transaction) {
105 ret = -ENOMEM; 105 congestion_wait(BLK_RW_ASYNC, HZ/50);
106 goto out; 106 goto alloc_transaction;
107 } 107 }
108 } 108 }
109 109
@@ -696,7 +696,6 @@ repeat:
696 if (!jh->b_transaction) { 696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction"); 697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction); 698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 699 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock); 700 spin_lock(&journal->j_list_lock);
702 __journal_file_buffer(jh, transaction, BJ_Reserved); 701 __journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
818 * committed and so it's safe to clear the dirty bit. 817 * committed and so it's safe to clear the dirty bit.
819 */ 818 */
820 clear_buffer_dirty(jh2bh(jh)); 819 clear_buffer_dirty(jh2bh(jh));
821 jh->b_transaction = transaction;
822 820
823 /* first access by this transaction */ 821 /* first access by this transaction */
824 jh->b_modified = 0; 822 jh->b_modified = 0;
@@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
844 */ 842 */
845 JBUFFER_TRACE(jh, "cancelling revoke"); 843 JBUFFER_TRACE(jh, "cancelling revoke");
846 journal_cancel_revoke(handle, jh); 844 journal_cancel_revoke(handle, jh);
847 journal_put_journal_head(jh);
848out: 845out:
846 journal_put_journal_head(jh);
849 return err; 847 return err;
850} 848}
851 849
@@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1069 ret = -EIO; 1067 ret = -EIO;
1070 goto no_journal; 1068 goto no_journal;
1071 } 1069 }
1072 1070 /* We might have slept, so the buffer could have been refiled by now */
1073 if (jh->b_transaction != NULL) { 1071 if (jh->b_transaction != NULL &&
1072 jh->b_transaction != handle->h_transaction) {
1074 JBUFFER_TRACE(jh, "unfile from commit"); 1073 JBUFFER_TRACE(jh, "unfile from commit");
1075 __journal_temp_unlink_buffer(jh); 1074 __journal_temp_unlink_buffer(jh);
1076 /* It still points to the committing 1075 /* It still points to the committing
@@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1091 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { 1090 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1092 JBUFFER_TRACE(jh, "not on correct data list: unfile"); 1091 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1093 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); 1092 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1094 __journal_temp_unlink_buffer(jh);
1095 jh->b_transaction = handle->h_transaction;
1096 JBUFFER_TRACE(jh, "file as data"); 1093 JBUFFER_TRACE(jh, "file as data");
1097 __journal_file_buffer(jh, handle->h_transaction, 1094 __journal_file_buffer(jh, handle->h_transaction,
1098 BJ_SyncData); 1095 BJ_SyncData);
@@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1300 __journal_file_buffer(jh, transaction, BJ_Forget); 1297 __journal_file_buffer(jh, transaction, BJ_Forget);
1301 } else { 1298 } else {
1302 __journal_unfile_buffer(jh); 1299 __journal_unfile_buffer(jh);
1303 journal_remove_journal_head(bh);
1304 __brelse(bh);
1305 if (!buffer_jbd(bh)) { 1300 if (!buffer_jbd(bh)) {
1306 spin_unlock(&journal->j_list_lock); 1301 spin_unlock(&journal->j_list_lock);
1307 jbd_unlock_bh_state(bh); 1302 jbd_unlock_bh_state(bh);
@@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh)
1622 mark_buffer_dirty(bh); /* Expose it to the VM */ 1617 mark_buffer_dirty(bh); /* Expose it to the VM */
1623} 1618}
1624 1619
1620/*
1621 * Remove buffer from all transactions.
1622 *
1623 * Called with bh_state lock and j_list_lock held
1624 *
1625 * jh and bh may already be freed when this function returns.
1626 */
1625void __journal_unfile_buffer(struct journal_head *jh) 1627void __journal_unfile_buffer(struct journal_head *jh)
1626{ 1628{
1627 __journal_temp_unlink_buffer(jh); 1629 __journal_temp_unlink_buffer(jh);
1628 jh->b_transaction = NULL; 1630 jh->b_transaction = NULL;
1631 journal_put_journal_head(jh);
1629} 1632}
1630 1633
1631void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) 1634void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1632{ 1635{
1633 jbd_lock_bh_state(jh2bh(jh)); 1636 struct buffer_head *bh = jh2bh(jh);
1637
1638 /* Get reference so that buffer cannot be freed before we unlock it */
1639 get_bh(bh);
1640 jbd_lock_bh_state(bh);
1634 spin_lock(&journal->j_list_lock); 1641 spin_lock(&journal->j_list_lock);
1635 __journal_unfile_buffer(jh); 1642 __journal_unfile_buffer(jh);
1636 spin_unlock(&journal->j_list_lock); 1643 spin_unlock(&journal->j_list_lock);
1637 jbd_unlock_bh_state(jh2bh(jh)); 1644 jbd_unlock_bh_state(bh);
1645 __brelse(bh);
1638} 1646}
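Since __journal_unfile_buffer() may now put the last journal_head reference,
the buffer_head could be torn down while the wrapper still holds its state
lock; the get_bh()/__brelse() pair pins it across the critical section. The
shape of the guard, annotated (a sketch of the code above):

	get_bh(bh);			/* pin: +1 on b_count */
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);
	__journal_unfile_buffer(jh);	/* may drop the last jh reference */
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);	/* still safe: bh is pinned */
	__brelse(bh);			/* unpin; bh may be freed from here on */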
1639 1647
1640/* 1648/*
@@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1661 /* A written-back ordered data buffer */ 1669 /* A written-back ordered data buffer */
1662 JBUFFER_TRACE(jh, "release data"); 1670 JBUFFER_TRACE(jh, "release data");
1663 __journal_unfile_buffer(jh); 1671 __journal_unfile_buffer(jh);
1664 journal_remove_journal_head(bh);
1665 __brelse(bh);
1666 } 1672 }
1667 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1673 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1668 /* written-back checkpointed metadata buffer */ 1674 /* written-back checkpointed metadata buffer */
1669 if (jh->b_jlist == BJ_None) { 1675 if (jh->b_jlist == BJ_None) {
1670 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1676 JBUFFER_TRACE(jh, "remove from checkpoint list");
1671 __journal_remove_checkpoint(jh); 1677 __journal_remove_checkpoint(jh);
1672 journal_remove_journal_head(bh);
1673 __brelse(bh);
1674 } 1678 }
1675 } 1679 }
1676 spin_unlock(&journal->j_list_lock); 1680 spin_unlock(&journal->j_list_lock);
@@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal,
1733 /* 1737 /*
1734 * We take our own ref against the journal_head here to avoid 1738 * We take our own ref against the journal_head here to avoid
1735 * having to add tons of locking around each instance of 1739 * having to add tons of locking around each instance of
1736 * journal_remove_journal_head() and journal_put_journal_head(). 1740 * journal_put_journal_head().
1737 */ 1741 */
1738 jh = journal_grab_journal_head(bh); 1742 jh = journal_grab_journal_head(bh);
1739 if (!jh) 1743 if (!jh)
@@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1770 int may_free = 1; 1774 int may_free = 1;
1771 struct buffer_head *bh = jh2bh(jh); 1775 struct buffer_head *bh = jh2bh(jh);
1772 1776
1773 __journal_unfile_buffer(jh);
1774
1775 if (jh->b_cp_transaction) { 1777 if (jh->b_cp_transaction) {
1776 JBUFFER_TRACE(jh, "on running+cp transaction"); 1778 JBUFFER_TRACE(jh, "on running+cp transaction");
1779 __journal_temp_unlink_buffer(jh);
1777 /* 1780 /*
1778 * We don't want to write the buffer anymore, clear the 1781 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in 1782 * bit so that we don't confuse checks in
@@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1784 may_free = 0; 1787 may_free = 0;
1785 } else { 1788 } else {
1786 JBUFFER_TRACE(jh, "on running transaction"); 1789 JBUFFER_TRACE(jh, "on running transaction");
1787 journal_remove_journal_head(bh); 1790 __journal_unfile_buffer(jh);
1788 __brelse(bh);
1789 } 1791 }
1790 return may_free; 1792 return may_free;
1791} 1793}
@@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh,
2070 2072
2071 if (jh->b_transaction) 2073 if (jh->b_transaction)
2072 __journal_temp_unlink_buffer(jh); 2074 __journal_temp_unlink_buffer(jh);
2075 else
2076 journal_grab_journal_head(bh);
2073 jh->b_transaction = transaction; 2077 jh->b_transaction = transaction;
2074 2078
2075 switch (jlist) { 2079 switch (jlist) {
@@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh,
2127 * already started to be used by a subsequent transaction, refile the 2131 * already started to be used by a subsequent transaction, refile the
2128 * buffer on that transaction's metadata list. 2132 * buffer on that transaction's metadata list.
2129 * 2133 *
2130 * Called under journal->j_list_lock 2134 * Called under j_list_lock
2131 *
2132 * Called under jbd_lock_bh_state(jh2bh(jh)) 2135 * Called under jbd_lock_bh_state(jh2bh(jh))
2136 *
2137 * jh and bh may already be freed when this function returns
2133 */ 2138 */
2134void __journal_refile_buffer(struct journal_head *jh) 2139void __journal_refile_buffer(struct journal_head *jh)
2135{ 2140{
@@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh)
2153 2158
2154 was_dirty = test_clear_buffer_jbddirty(bh); 2159 was_dirty = test_clear_buffer_jbddirty(bh);
2155 __journal_temp_unlink_buffer(jh); 2160 __journal_temp_unlink_buffer(jh);
2161 /*
2162 * We set b_transaction here because b_next_transaction will inherit
2163 * our jh reference and thus __journal_file_buffer() must not take a
2164 * new one.
2165 */
2156 jh->b_transaction = jh->b_next_transaction; 2166 jh->b_transaction = jh->b_next_transaction;
2157 jh->b_next_transaction = NULL; 2167 jh->b_next_transaction = NULL;
2158 if (buffer_freed(bh)) 2168 if (buffer_freed(bh))
@@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh)
2169} 2179}
2170 2180
2171/* 2181/*
2172 * For the unlocked version of this call, also make sure that any 2182 * __journal_refile_buffer() with necessary locking added. We take our bh
2173 * hanging journal_head is cleaned up if necessary. 2183 * reference so that we can safely unlock bh.
2174 * 2184 *
2175 * __journal_refile_buffer is usually called as part of a single locked 2185 * The jh and bh may be freed by this call.
2176 * operation on a buffer_head, in which the caller is probably going to
2177 * be hooking the journal_head onto other lists. In that case it is up
2178 * to the caller to remove the journal_head if necessary. For the
2179 * unlocked journal_refile_buffer call, the caller isn't going to be
2180 * doing anything else to the buffer so we need to do the cleanup
2181 * ourselves to avoid a jh leak.
2182 *
2183 * *** The journal_head may be freed by this call! ***
2184 */ 2186 */
2185void journal_refile_buffer(journal_t *journal, struct journal_head *jh) 2187void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2186{ 2188{
2187 struct buffer_head *bh = jh2bh(jh); 2189 struct buffer_head *bh = jh2bh(jh);
2188 2190
2191 /* Get reference so that buffer cannot be freed before we unlock it */
2192 get_bh(bh);
2189 jbd_lock_bh_state(bh); 2193 jbd_lock_bh_state(bh);
2190 spin_lock(&journal->j_list_lock); 2194 spin_lock(&journal->j_list_lock);
2191
2192 __journal_refile_buffer(jh); 2195 __journal_refile_buffer(jh);
2193 jbd_unlock_bh_state(bh); 2196 jbd_unlock_bh_state(bh);
2194 journal_remove_journal_head(bh);
2195
2196 spin_unlock(&journal->j_list_lock); 2197 spin_unlock(&journal->j_list_lock);
2197 __brelse(bh); 2198 __brelse(bh);
2198} 2199}
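The subtlety called out in __journal_refile_buffer() is reference ownership:
the jh reference held on behalf of b_transaction is inherited by the next
transaction, so b_transaction is assigned before __journal_file_buffer()
runs and no second reference is taken. The core move, condensed (a sketch;
jlist selection from buffer_freed() is as in the full function):

	__journal_temp_unlink_buffer(jh);
	jh->b_transaction = jh->b_next_transaction;	/* inherit the reference */
	jh->b_next_transaction = NULL;
	__journal_file_buffer(jh, jh->b_transaction, jlist);
	/* b_transaction was non-NULL above, so no journal_grab_journal_head() */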
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 2c62c5aae82f..16a698bd906d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -257,9 +257,12 @@ static void
257__flush_batch(journal_t *journal, int *batch_count) 257__flush_batch(journal_t *journal, int *batch_count)
258{ 258{
259 int i; 259 int i;
260 struct blk_plug plug;
260 261
262 blk_start_plug(&plug);
261 for (i = 0; i < *batch_count; i++) 263 for (i = 0; i < *batch_count; i++)
262 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); 264 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
265 blk_finish_plug(&plug);
263 266
264 for (i = 0; i < *batch_count; i++) { 267 for (i = 0; i < *batch_count; i++) {
265 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 268 struct buffer_head *bh = journal->j_chkpt_bhs[i];
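Two improvements land together in __flush_batch(): the writes are issued
under a block plug, letting the block layer merge adjacent checkpoint
buffers into fewer requests, and they are tagged WRITE_SYNC because a
checkpoint stalls the journal until it completes. The plugging idiom in
isolation (a hedged sketch):

	#include <linux/blkdev.h>
	#include <linux/buffer_head.h>

	/* Batch-submit dirty buffers under one plug; requests are merged
	 * where possible and dispatched at blk_finish_plug(). */
	static void flush_batch(struct buffer_head **bhs, int count)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);
		for (i = 0; i < count; i++)
			write_dirty_buffer(bhs[i], WRITE_SYNC);
		blk_finish_plug(&plug);
	}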
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0dfa5b598e68..f24df13adc4e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2390,73 +2390,6 @@ static void __exit journal_exit(void)
2390 jbd2_journal_destroy_caches(); 2390 jbd2_journal_destroy_caches();
2391} 2391}
2392 2392
2393/*
2394 * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
2395 * tracing infrastructure to map a dev_t to a device name.
2396 *
2397 * The caller should use rcu_read_lock() in order to make sure the
2398 * device name stays valid until its done with it. We use
2399 * rcu_read_lock() as well to make sure we're safe in case the caller
2400 * gets sloppy, and because rcu_read_lock() is cheap and can be safely
2401 * nested.
2402 */
2403struct devname_cache {
2404 struct rcu_head rcu;
2405 dev_t device;
2406 char devname[BDEVNAME_SIZE];
2407};
2408#define CACHE_SIZE_BITS 6
2409static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
2410static DEFINE_SPINLOCK(devname_cache_lock);
2411
2412static void free_devcache(struct rcu_head *rcu)
2413{
2414 kfree(rcu);
2415}
2416
2417const char *jbd2_dev_to_name(dev_t device)
2418{
2419 int i = hash_32(device, CACHE_SIZE_BITS);
2420 char *ret;
2421 struct block_device *bd;
2422 static struct devname_cache *new_dev;
2423
2424 rcu_read_lock();
2425 if (devcache[i] && devcache[i]->device == device) {
2426 ret = devcache[i]->devname;
2427 rcu_read_unlock();
2428 return ret;
2429 }
2430 rcu_read_unlock();
2431
2432 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2433 if (!new_dev)
2434 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2435 bd = bdget(device);
2436 spin_lock(&devname_cache_lock);
2437 if (devcache[i]) {
2438 if (devcache[i]->device == device) {
2439 kfree(new_dev);
2440 bdput(bd);
2441 ret = devcache[i]->devname;
2442 spin_unlock(&devname_cache_lock);
2443 return ret;
2444 }
2445 call_rcu(&devcache[i]->rcu, free_devcache);
2446 }
2447 devcache[i] = new_dev;
2448 devcache[i]->device = device;
2449 if (bd) {
2450 bdevname(bd, devcache[i]->devname);
2451 bdput(bd);
2452 } else
2453 __bdevname(device, devcache[i]->devname);
2454 ret = devcache[i]->devname;
2455 spin_unlock(&devname_cache_lock);
2456 return ret;
2457}
2458EXPORT_SYMBOL(jbd2_dev_to_name);
2459
2460MODULE_LICENSE("GPL"); 2393MODULE_LICENSE("GPL");
2461module_init(journal_init); 2394module_init(journal_init);
2462module_exit(journal_exit); 2395module_exit(journal_exit);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 27c511a1cf05..926d02068a14 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
227 case ACL_TYPE_ACCESS: 227 case ACL_TYPE_ACCESS:
228 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 228 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
229 if (acl) { 229 if (acl) {
230 mode_t mode = inode->i_mode; 230 umode_t mode = inode->i_mode;
231 rc = posix_acl_equiv_mode(acl, &mode); 231 rc = posix_acl_equiv_mode(acl, &mode);
232 if (rc < 0) 232 if (rc < 0)
233 return rc; 233 return rc;
@@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) 262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index b3421c78d9f8..9b477246f2a6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
28 28
29struct posix_acl *jffs2_get_acl(struct inode *inode, int type); 29struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern const struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index eeead33d8ef0..bbcb9755dd2b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
80 ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); 80 ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
81 if (ret) { 81 if (ret) {
82 jffs2_free_raw_inode(ri); 82 jffs2_free_raw_inode(ri);
83 if (S_ISLNK(inode->i_mode & S_IFMT)) 83 if (S_ISLNK(inode->i_mode))
84 kfree(mdata); 84 kfree(mdata);
85 return ret; 85 return ret;
86 } 86 }
@@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
406 406
407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, 407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
408 fill in the raw_inode while you're at it. */ 408 fill in the raw_inode while you're at it. */
409struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) 409struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri)
410{ 410{
411 struct inode *inode; 411 struct inode *inode;
412 struct super_block *sb = dir_i->i_sb; 412 struct super_block *sb = dir_i->i_sb;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 526979c607b6..6c1755c59c0f 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_evict_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode, int flags); 175void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_remount_fs (struct super_block *, int *, char *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index b3a32caf2b45..45559dc3ea2f 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
127 return PTR_ERR(acl); 127 return PTR_ERR(acl);
128 128
129 if (acl) { 129 if (acl) {
130 mode_t mode = inode->i_mode;
131 if (S_ISDIR(inode->i_mode)) { 130 if (S_ISDIR(inode->i_mode)) {
132 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); 131 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl);
133 if (rc) 132 if (rc)
134 goto cleanup; 133 goto cleanup;
135 } 134 }
136 rc = posix_acl_create(&acl, GFP_KERNEL, &mode); 135 rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
137 if (rc < 0) 136 if (rc < 0)
138 goto cleanup; /* posix_acl_release(NULL) is no-op */ 137 goto cleanup; /* posix_acl_release(NULL) is no-op */
139 inode->i_mode = mode;
140 if (rc > 0) 138 if (rc > 0)
141 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); 139 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
142cleanup: 140cleanup:
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 4496872cf4e7..9cbd11a3f804 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3161{ 3161{
3162 int rc; 3162 int rc;
3163 int dbitno, word, rembits, nb, nwords, wbitno, agno; 3163 int dbitno, word, rembits, nb, nwords, wbitno, agno;
3164 s8 oldroot, *leaf; 3164 s8 oldroot;
3165 struct dmaptree *tp = (struct dmaptree *) & dp->tree; 3165 struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3166 3166
3167 /* save the current value of the root (i.e. maximum free string) 3167 /* save the current value of the root (i.e. maximum free string)
@@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3169 */ 3169 */
3170 oldroot = tp->stree[ROOT]; 3170 oldroot = tp->stree[ROOT];
3171 3171
3172 /* pick up a pointer to the leaves of the dmap tree */
3173 leaf = tp->stree + LEAFIND;
3174
3175 /* determine the bit number and word within the dmap of the 3172 /* determine the bit number and word within the dmap of the
3176 * starting block. 3173 * starting block.
3177 */ 3174 */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f6cc0c09ec63..af9606057dde 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */
1143 struct jfs_log *log; 1143 struct jfs_log *log;
1144 struct tblock *tblk; 1144 struct tblock *tblk;
1145 struct lrd *lrd; 1145 struct lrd *lrd;
1146 int lsn;
1147 struct inode *ip; 1146 struct inode *ip;
1148 struct jfs_inode_info *jfs_ip; 1147 struct jfs_inode_info *jfs_ip;
1149 int k, n; 1148 int k, n;
@@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */
1310 */ 1309 */
1311 lrd->type = cpu_to_le16(LOG_COMMIT); 1310 lrd->type = cpu_to_le16(LOG_COMMIT);
1312 lrd->length = 0; 1311 lrd->length = 0;
1313 lsn = lmLog(log, tblk, lrd, NULL); 1312 lmLog(log, tblk, lrd, NULL);
1314 1313
1315 lmGroupCommit(log, tblk); 1314 lmGroupCommit(log, tblk);
1316 1315
@@ -2935,7 +2934,6 @@ int jfs_sync(void *arg)
2935{ 2934{
2936 struct inode *ip; 2935 struct inode *ip;
2937 struct jfs_inode_info *jfs_ip; 2936 struct jfs_inode_info *jfs_ip;
2938 int rc;
2939 tid_t tid; 2937 tid_t tid;
2940 2938
2941 do { 2939 do {
@@ -2961,7 +2959,7 @@ int jfs_sync(void *arg)
2961 */ 2959 */
2962 TXN_UNLOCK(); 2960 TXN_UNLOCK();
2963 tid = txBegin(ip->i_sb, COMMIT_INODE); 2961 tid = txBegin(ip->i_sb, COMMIT_INODE);
2964 rc = txCommit(tid, 1, &ip, 0); 2962 txCommit(tid, 1, &ip, 0);
2965 txEnd(tid); 2963 txEnd(tid);
2966 mutex_unlock(&jfs_ip->commit_mutex); 2964 mutex_unlock(&jfs_ip->commit_mutex);
2967 2965
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index adcf92d3b603..7971f37534a3 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
68 /* 68 /*
69 * Wait for outstanding transactions to be written to log: 69 * Wait for outstanding transactions to be written to log:
70 */ 70 */
71 jfs_flush_journal(log, 1); 71 jfs_flush_journal(log, 2);
72 72
73 /* 73 /*
74 * close fileset inode allocation map (aka fileset inode) 74 * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
146 * 146 *
147 * remove file system from log active file system list. 147 * remove file system from log active file system list.
148 */ 148 */
149 jfs_flush_journal(log, 1); 149 jfs_flush_journal(log, 2);
150 150
151 /* 151 /*
152 * Make sure all metadata makes it to disk 152 * Make sure all metadata makes it to disk
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 29b1f1a21142..e17545e15664 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
893 unchar *i_fastsymlink; 893 unchar *i_fastsymlink;
894 s64 xlen = 0; 894 s64 xlen = 0;
895 int bmask = 0, xsize; 895 int bmask = 0, xsize;
896 s64 extent = 0, xaddr; 896 s64 xaddr;
897 struct metapage *mp; 897 struct metapage *mp;
898 struct super_block *sb; 898 struct super_block *sb;
899 struct tblock *tblk; 899 struct tblock *tblk;
@@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
993 txAbort(tid, 0); 993 txAbort(tid, 0);
994 goto out3; 994 goto out3;
995 } 995 }
996 extent = xaddr;
997 ip->i_size = ssize - 1; 996 ip->i_size = ssize - 1;
998 while (ssize) { 997 while (ssize) {
999 /* This is kind of silly since PATH_MAX == 4K */ 998 /* This is kind of silly since PATH_MAX == 4K */
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24838f1eeee5..e87fedef23db 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
693 return rc; 693 return rc;
694 } 694 }
695 if (acl) { 695 if (acl) {
696 mode_t mode = inode->i_mode; 696 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
697 rc = posix_acl_equiv_mode(acl, &mode);
698 posix_acl_release(acl); 697 posix_acl_release(acl);
699 if (rc < 0) { 698 if (rc < 0) {
700 printk(KERN_ERR 699 printk(KERN_ERR
@@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
702 rc); 701 rc);
703 return rc; 702 return rc;
704 } 703 }
705 inode->i_mode = mode;
706 mark_inode_dirty(inode); 704 mark_inode_dirty(inode);
707 } 705 }
708 /* 706 /*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index e374050a911c..8392cb85bd54 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -302,7 +302,8 @@ nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc)
302 /* We appear to be out of the grace period */ 302 /* We appear to be out of the grace period */
303 wake_up_all(&host->h_gracewait); 303 wake_up_all(&host->h_gracewait);
304 } 304 }
305 dprintk("lockd: server returns status %d\n", resp->status); 305 dprintk("lockd: server returns status %d\n",
306 ntohl(resp->status));
306 return 0; /* Okay, call complete */ 307 return 0; /* Okay, call complete */
307 } 308 }
308 309
@@ -690,7 +691,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
690 goto out; 691 goto out;
691 692
692 if (resp->status != nlm_lck_denied_nolocks) 693 if (resp->status != nlm_lck_denied_nolocks)
693 printk("lockd: unexpected unlock status: %d\n", resp->status); 694 printk("lockd: unexpected unlock status: %d\n",
695 ntohl(resp->status));
694 /* What to do now? I'm out of my depth... */ 696 /* What to do now? I'm out of my depth... */
695 status = -ENOLCK; 697 status = -ENOLCK;
696out: 698out:
@@ -843,6 +845,7 @@ nlm_stat_to_errno(__be32 status)
843 return -ENOLCK; 845 return -ENOLCK;
844#endif 846#endif
845 } 847 }
846 printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); 848 printk(KERN_NOTICE "lockd: unexpected server status %d\n",
849 ntohl(status));
847 return -ENOLCK; 850 return -ENOLCK;
848} 851}
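All three lockd hunks fix the same cosmetic bug: NLM status codes travel as
big-endian __be32, so printing them raw yields byte-swapped numbers on
little-endian machines. The conversion belongs at the print site:

	/* resp->status is wire format (__be32); convert for readable logs. */
	dprintk("lockd: server returns status %d\n", ntohl(resp->status));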
diff --git a/fs/namei.c b/fs/namei.c
index f8c69d373793..f4788365ea22 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -179,19 +179,14 @@ static int check_acl(struct inode *inode, int mask)
179#ifdef CONFIG_FS_POSIX_ACL 179#ifdef CONFIG_FS_POSIX_ACL
180 struct posix_acl *acl; 180 struct posix_acl *acl;
181 181
182 /*
183 * Under RCU walk, we cannot even do a "get_cached_acl()",
184 * because that involves locking and getting a refcount on
185 * a cached ACL.
186 *
187 * So the only case we handle during RCU walking is the
188 * case of a cached "no ACL at all", which needs no locks
189 * or refcounts.
190 */
191 if (mask & MAY_NOT_BLOCK) { 182 if (mask & MAY_NOT_BLOCK) {
192 if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) 183 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
184 if (!acl)
193 return -EAGAIN; 185 return -EAGAIN;
194 return -ECHILD; 186 /* no ->get_acl() calls in RCU mode... */
187 if (acl == ACL_NOT_CACHED)
188 return -ECHILD;
189 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
195 } 190 }
196 191
197 acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 192 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
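The rewritten RCU branch relies on get_cached_acl_rcu() distinguishing three
cases, none of which may block during RCU walk. Annotated (the sentinel is
the VFS's ACL_NOT_CACHED value):

	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	if (!acl)			/* cached "no ACL": decide from mode bits */
		return -EAGAIN;
	if (acl == ACL_NOT_CACHED)	/* unknown: would need ->get_acl(),
					 * so drop out of RCU walk */
		return -ECHILD;
	/* real ACL, safe to consult locklessly */
	return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);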
@@ -313,6 +308,26 @@ int generic_permission(struct inode *inode, int mask)
313 return -EACCES; 308 return -EACCES;
314} 309}
315 310
311/*
312 * We _really_ want to just do "generic_permission()" without
313 * even looking at the inode->i_op values. So we keep a cache
314 * flag in inode->i_opflags that says "this has no special
315 * permission function, use the fast case".
316 */
317static inline int do_inode_permission(struct inode *inode, int mask)
318{
319 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
320 if (likely(inode->i_op->permission))
321 return inode->i_op->permission(inode, mask);
322
323 /* This gets set once for the inode lifetime */
324 spin_lock(&inode->i_lock);
325 inode->i_opflags |= IOP_FASTPERM;
326 spin_unlock(&inode->i_lock);
327 }
328 return generic_permission(inode, mask);
329}
330
316/** 331/**
317 * inode_permission - check for access rights to a given inode 332 * inode_permission - check for access rights to a given inode
318 * @inode: inode to check permission on 333 * @inode: inode to check permission on
@@ -327,7 +342,7 @@ int inode_permission(struct inode *inode, int mask)
327{ 342{
328 int retval; 343 int retval;
329 344
330 if (mask & MAY_WRITE) { 345 if (unlikely(mask & MAY_WRITE)) {
331 umode_t mode = inode->i_mode; 346 umode_t mode = inode->i_mode;
332 347
333 /* 348 /*
@@ -344,11 +359,7 @@ int inode_permission(struct inode *inode, int mask)
344 return -EACCES; 359 return -EACCES;
345 } 360 }
346 361
347 if (inode->i_op->permission) 362 retval = do_inode_permission(inode, mask);
348 retval = inode->i_op->permission(inode, mask);
349 else
350 retval = generic_permission(inode, mask);
351
352 if (retval) 363 if (retval)
353 return retval; 364 return retval;
354 365
@@ -716,17 +727,20 @@ static int follow_automount(struct path *path, unsigned flags,
716 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) 727 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
717 return -EISDIR; /* we actually want to stop here */ 728 return -EISDIR; /* we actually want to stop here */
718 729
719 /* We want to mount if someone is trying to open/create a file of any 730 /* We don't want to mount if someone's just doing a stat -
720 * type under the mountpoint, wants to traverse through the mountpoint 731 * unless they're stat'ing a directory and appended a '/' to
721 * or wants to open the mounted directory. 732 * the name.
722 * 733 *
723 * We don't want to mount if someone's just doing a stat and they've 734 * We do, however, want to mount if someone wants to open or
724 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and 735 * create a file of any type under the mountpoint, wants to
725 * appended a '/' to the name. 736 * traverse through the mountpoint or wants to open the
737 * mounted directory. Also, autofs may mark negative dentries
738 * as being automount points. These will need the attentions
739 * of the daemon to instantiate them before they can be used.
726 */ 740 */
727 if (!(flags & LOOKUP_FOLLOW) && 741 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
728 !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 742 LOOKUP_OPEN | LOOKUP_CREATE)) &&
729 LOOKUP_OPEN | LOOKUP_CREATE))) 743 path->dentry->d_inode)
730 return -EISDIR; 744 return -EISDIR;
731 745
732 current->total_link_count++; 746 current->total_link_count++;
@@ -1244,6 +1258,26 @@ static void terminate_walk(struct nameidata *nd)
1244 } 1258 }
1245} 1259}
1246 1260
1261/*
1262 * Do we need to follow links? We _really_ want to be able
1263 * to do this check without having to look at inode->i_op,
1264 * so we keep a cache of "no, this doesn't need follow_link"
1265 * for the common case.
1266 */
1267static inline int should_follow_link(struct inode *inode, int follow)
1268{
1269 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1270 if (likely(inode->i_op->follow_link))
1271 return follow;
1272
1273 /* This gets set once for the inode lifetime */
1274 spin_lock(&inode->i_lock);
1275 inode->i_opflags |= IOP_NOFOLLOW;
1276 spin_unlock(&inode->i_lock);
1277 }
1278 return 0;
1279}
1280
1247static inline int walk_component(struct nameidata *nd, struct path *path, 1281static inline int walk_component(struct nameidata *nd, struct path *path,
1248 struct qstr *name, int type, int follow) 1282 struct qstr *name, int type, int follow)
1249{ 1283{
@@ -1266,7 +1300,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1266 terminate_walk(nd); 1300 terminate_walk(nd);
1267 return -ENOENT; 1301 return -ENOENT;
1268 } 1302 }
1269 if (unlikely(inode->i_op->follow_link) && follow) { 1303 if (should_follow_link(inode, follow)) {
1270 if (nd->flags & LOOKUP_RCU) { 1304 if (nd->flags & LOOKUP_RCU) {
1271 if (unlikely(unlazy_walk(nd, path->dentry))) { 1305 if (unlikely(unlazy_walk(nd, path->dentry))) {
1272 terminate_walk(nd); 1306 terminate_walk(nd);
@@ -1319,6 +1353,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1319} 1353}
1320 1354
1321/* 1355/*
1356 * We really don't want to look at inode->i_op->lookup
1357 * when we don't have to. So we keep a cache bit in
1358 * the inode ->i_opflags field that says "yes, we can
1359 * do lookup on this inode".
1360 */
1361static inline int can_lookup(struct inode *inode)
1362{
1363 if (likely(inode->i_opflags & IOP_LOOKUP))
1364 return 1;
1365 if (likely(!inode->i_op->lookup))
1366 return 0;
1367
1368 /* We do this once for the lifetime of the inode */
1369 spin_lock(&inode->i_lock);
1370 inode->i_opflags |= IOP_LOOKUP;
1371 spin_unlock(&inode->i_lock);
1372 return 1;
1373}
1374
1375/*
1322 * Name resolution. 1376 * Name resolution.
1323 * This is the basic name resolution function, turning a pathname into 1377 * This is the basic name resolution function, turning a pathname into
1324 * the final dentry. We expect 'base' to be positive and a directory. 1378 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1397,10 +1451,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1397 if (err) 1451 if (err)
1398 return err; 1452 return err;
1399 } 1453 }
1454 if (can_lookup(nd->inode))
1455 continue;
1400 err = -ENOTDIR; 1456 err = -ENOTDIR;
1401 if (!nd->inode->i_op->lookup) 1457 break;
1402 break;
1403 continue;
1404 /* here ends the main loop */ 1458 /* here ends the main loop */
1405 1459
1406last_component: 1460last_component:
@@ -2562,6 +2616,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2562 if (!dir->i_op->rmdir) 2616 if (!dir->i_op->rmdir)
2563 return -EPERM; 2617 return -EPERM;
2564 2618
2619 dget(dentry);
2565 mutex_lock(&dentry->d_inode->i_mutex); 2620 mutex_lock(&dentry->d_inode->i_mutex);
2566 2621
2567 error = -EBUSY; 2622 error = -EBUSY;
@@ -2582,6 +2637,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2582 2637
2583out: 2638out:
2584 mutex_unlock(&dentry->d_inode->i_mutex); 2639 mutex_unlock(&dentry->d_inode->i_mutex);
2640 dput(dentry);
2585 if (!error) 2641 if (!error)
2586 d_delete(dentry); 2642 d_delete(dentry);
2587 return error; 2643 return error;
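vfs_rmdir() (and vfs_rename_dir() below) now pin the dentry before taking
its inode's i_mutex, so a concurrent final dput() cannot free the dentry
while it is locked. The bracketing pattern:

	dget(dentry);				/* pin across the lock */
	mutex_lock(&dentry->d_inode->i_mutex);
	/* ... ->rmdir() and unhashing work ... */
	mutex_unlock(&dentry->d_inode->i_mutex);
	dput(dentry);				/* safe: mutex released */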
@@ -2971,6 +3027,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2971 if (error) 3027 if (error)
2972 return error; 3028 return error;
2973 3029
3030 dget(new_dentry);
2974 if (target) 3031 if (target)
2975 mutex_lock(&target->i_mutex); 3032 mutex_lock(&target->i_mutex);
2976 3033
@@ -2991,6 +3048,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2991out: 3048out:
2992 if (target) 3049 if (target)
2993 mutex_unlock(&target->i_mutex); 3050 mutex_unlock(&target->i_mutex);
3051 dput(new_dentry);
2994 if (!error) 3052 if (!error)
2995 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3053 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2996 d_move(old_dentry,new_dentry); 3054 d_move(old_dentry,new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 81515545ba75..dbcd82126aed 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -77,6 +77,7 @@ config NFS_V4
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL 79 depends on NFS_FS && NFS_V4 && EXPERIMENTAL
80 select SUNRPC_BACKCHANNEL
80 select PNFS_FILE_LAYOUT 81 select PNFS_FILE_LAYOUT
81 help 82 help
82 This option enables support for minor version 1 of the NFSv4 protocol 83 This option enables support for minor version 1 of the NFSv4 protocol
@@ -87,15 +88,15 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 88config PNFS_FILE_LAYOUT
88 tristate 89 tristate
89 90
91config PNFS_BLOCK
92 tristate
93 depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM
94 default m
95
90config PNFS_OBJLAYOUT 96config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" 97 tristate
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD 98 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help 99 default m
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99 100
100config ROOT_NFS 101config ROOT_NFS
101 bool "Root file system on NFS" 102 bool "Root file system on NFS"
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7dd0e6f..b58613d0abb3 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24 24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
26obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 000000000000..d5815505c020
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS block layout driver kernel module
3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 000000000000..9561c8fc8bdb
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,1020 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/module.h>
34#include <linux/init.h>
35#include <linux/mount.h>
36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h>
40
41#include "blocklayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44
45MODULE_LICENSE("GPL");
46MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
47MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
48
49struct dentry *bl_device_pipe;
50wait_queue_head_t bl_wq;
51
52static void print_page(struct page *page)
53{
54 dprintk("PRINTPAGE page %p\n", page);
55 dprintk(" PagePrivate %d\n", PagePrivate(page));
56 dprintk(" PageUptodate %d\n", PageUptodate(page));
57 dprintk(" PageError %d\n", PageError(page));
58 dprintk(" PageDirty %d\n", PageDirty(page));
59 dprintk(" PageReferenced %d\n", PageReferenced(page));
60 dprintk(" PageLocked %d\n", PageLocked(page));
61 dprintk(" PageWriteback %d\n", PageWriteback(page));
62 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
63 dprintk("\n");
64}
65
66/* Given the be associated with isect, determine if page data needs to be
67 * initialized.
68 */
69static int is_hole(struct pnfs_block_extent *be, sector_t isect)
70{
71 if (be->be_state == PNFS_BLOCK_NONE_DATA)
72 return 1;
73 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
74 return 0;
75 else
76 return !bl_is_sector_init(be->be_inval, isect);
77}
78
79/* Given the be associated with isect, determine if page data can be
80 * written to disk.
81 */
82static int is_writable(struct pnfs_block_extent *be, sector_t isect)
83{
84 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
85 be->be_state == PNFS_BLOCK_INVALID_DATA);
86}
87
88/* The data we are handed might be spread across several bios. We need
89 * to track when the last one is finished.
90 */
91struct parallel_io {
92 struct kref refcnt;
93 struct rpc_call_ops call_ops;
94 void (*pnfs_callback) (void *data);
95 void *data;
96};
97
98static inline struct parallel_io *alloc_parallel(void *data)
99{
100 struct parallel_io *rv;
101
102 rv = kmalloc(sizeof(*rv), GFP_NOFS);
103 if (rv) {
104 rv->data = data;
105 kref_init(&rv->refcnt);
106 }
107 return rv;
108}
109
110static inline void get_parallel(struct parallel_io *p)
111{
112 kref_get(&p->refcnt);
113}
114
115static void destroy_parallel(struct kref *kref)
116{
117 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118
119 dprintk("%s enter\n", __func__);
120 p->pnfs_callback(p->data);
121 kfree(p);
122}
123
124static inline void put_parallel(struct parallel_io *p)
125{
126 kref_put(&p->refcnt, destroy_parallel);
127}
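parallel_io is a kref-based completion tracker: every submitted bio holds a
reference, and the pNFS callback runs exactly once, from destroy_parallel(),
when the final reference is put. Typical use by a pagelist routine (a sketch
using the helpers above):

	struct parallel_io *par = alloc_parallel(rdata);

	par->pnfs_callback = bl_end_par_io_read;
	/* bl_submit_bio() does get_parallel(par) per bio before submit_bio();
	 * each bio's end_io does put_parallel(par) on completion */
	put_parallel(par);	/* drop the submitter's reference; the last
				 * put, wherever it happens, fires the callback */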
128
129static struct bio *
130bl_submit_bio(int rw, struct bio *bio)
131{
132 if (bio) {
133 get_parallel(bio->bi_private);
134 dprintk("%s submitting %s bio %u@%llu\n", __func__,
135 rw == READ ? "read" : "write",
136 bio->bi_size, (unsigned long long)bio->bi_sector);
137 submit_bio(rw, bio);
138 }
139 return NULL;
140}
141
142static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
143 struct pnfs_block_extent *be,
144 void (*end_io)(struct bio *, int err),
145 struct parallel_io *par)
146{
147 struct bio *bio;
148
149 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio)
151 return NULL;
152
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
154 bio->bi_bdev = be->be_mdev;
155 bio->bi_end_io = end_io;
156 bio->bi_private = par;
157 return bio;
158}
159
160static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
161 sector_t isect, struct page *page,
162 struct pnfs_block_extent *be,
163 void (*end_io)(struct bio *, int err),
164 struct parallel_io *par)
165{
166retry:
167 if (!bio) {
168 bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
169 if (!bio)
170 return ERR_PTR(-ENOMEM);
171 }
172 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
173 bio = bl_submit_bio(rw, bio);
174 goto retry;
175 }
176 return bio;
177}
178
179static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
180{
181 if (lseg->pls_range.iomode == IOMODE_RW) {
182 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
183 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
184 } else {
185 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
186 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
187 }
188}
189
190/* This is basically copied from mpage_end_io_read */
191static void bl_end_io_read(struct bio *bio, int err)
192{
193 struct parallel_io *par = bio->bi_private;
194 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
195 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
196 struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
197
198 do {
199 struct page *page = bvec->bv_page;
200
201 if (--bvec >= bio->bi_io_vec)
202 prefetchw(&bvec->bv_page->flags);
203 if (uptodate)
204 SetPageUptodate(page);
205 } while (bvec >= bio->bi_io_vec);
206 if (!uptodate) {
207 if (!rdata->pnfs_error)
208 rdata->pnfs_error = -EIO;
209 bl_set_lo_fail(rdata->lseg);
210 }
211 bio_put(bio);
212 put_parallel(par);
213}
214
215static void bl_read_cleanup(struct work_struct *work)
216{
217 struct rpc_task *task;
218 struct nfs_read_data *rdata;
219 dprintk("%s enter\n", __func__);
220 task = container_of(work, struct rpc_task, u.tk_work);
221 rdata = container_of(task, struct nfs_read_data, task);
222 pnfs_ld_read_done(rdata);
223}
224
225static void
226bl_end_par_io_read(void *data)
227{
228 struct nfs_read_data *rdata = data;
229
230 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
231 schedule_work(&rdata->task.u.tk_work);
232}
233
234/* We don't want normal .rpc_call_done callback used, so we replace it
235 * with this stub.
236 */
237static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
238{
239 return;
240}
241
242static enum pnfs_try_status
243bl_read_pagelist(struct nfs_read_data *rdata)
244{
245 int i, hole;
246 struct bio *bio = NULL;
247 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
248 sector_t isect, extent_length = 0;
249 struct parallel_io *par;
250 loff_t f_offset = rdata->args.offset;
251 size_t count = rdata->args.count;
252 struct page **pages = rdata->args.pages;
253 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
254
255 dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
256 rdata->npages, f_offset, count);
257
258 par = alloc_parallel(rdata);
259 if (!par)
260 goto use_mds;
261 par->call_ops = *rdata->mds_ops;
262 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
263 par->pnfs_callback = bl_end_par_io_read;
264 /* At this point, we can no longer jump to use_mds */
265
266 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
267 /* Code assumes extents are page-aligned */
268 for (i = pg_index; i < rdata->npages; i++) {
269 if (!extent_length) {
270 /* We've used up the previous extent */
271 bl_put_extent(be);
272 bl_put_extent(cow_read);
273 bio = bl_submit_bio(READ, bio);
274 /* Get the next one */
275 be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
276 isect, &cow_read);
277 if (!be) {
278 rdata->pnfs_error = -EIO;
279 goto out;
280 }
281 extent_length = be->be_length -
282 (isect - be->be_f_offset);
283 if (cow_read) {
284 sector_t cow_length = cow_read->be_length -
285 (isect - cow_read->be_f_offset);
286 extent_length = min(extent_length, cow_length);
287 }
288 }
289 hole = is_hole(be, isect);
290 if (hole && !cow_read) {
291 bio = bl_submit_bio(READ, bio);
292 /* Fill hole w/ zeroes w/o accessing device */
293 dprintk("%s Zeroing page for hole\n", __func__);
294 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
295 print_page(pages[i]);
296 SetPageUptodate(pages[i]);
297 } else {
298 struct pnfs_block_extent *be_read;
299
300 be_read = (hole && cow_read) ? cow_read : be;
301 bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
302 isect, pages[i], be_read,
303 bl_end_io_read, par);
304 if (IS_ERR(bio)) {
305 rdata->pnfs_error = PTR_ERR(bio);
306 goto out;
307 }
308 }
309 isect += PAGE_CACHE_SECTORS;
310 extent_length -= PAGE_CACHE_SECTORS;
311 }
312 if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
313 rdata->res.eof = 1;
314 rdata->res.count = rdata->inode->i_size - f_offset;
315 } else {
316 rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
317 }
318out:
319 bl_put_extent(be);
320 bl_put_extent(cow_read);
321 bl_submit_bio(READ, bio);
322 put_parallel(par);
323 return PNFS_ATTEMPTED;
324
325 use_mds:
326 dprintk("Giving up and using normal NFS\n");
327 return PNFS_NOT_ATTEMPTED;
328}
329
330static void mark_extents_written(struct pnfs_block_layout *bl,
331 __u64 offset, __u32 count)
332{
333 sector_t isect, end;
334 struct pnfs_block_extent *be;
335
336 dprintk("%s(%llu, %u)\n", __func__, offset, count);
337 if (count == 0)
338 return;
339 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
340 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
341 end >>= SECTOR_SHIFT;
342 while (isect < end) {
343 sector_t len;
344 be = bl_find_get_extent(bl, isect, NULL);
345 BUG_ON(!be); /* FIXME */
346 len = min(end, be->be_f_offset + be->be_length) - isect;
347 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
348 bl_mark_for_commit(be, isect, len); /* What if fails? */
349 isect += len;
350 bl_put_extent(be);
351 }
352}
353
354static void bl_end_io_write_zero(struct bio *bio, int err)
355{
356 struct parallel_io *par = bio->bi_private;
357 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
358 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
359 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
360
361 do {
362 struct page *page = bvec->bv_page;
363
364 if (--bvec >= bio->bi_io_vec)
365 prefetchw(&bvec->bv_page->flags);
366 /* This is the zeroing page we added */
367 end_page_writeback(page);
368 page_cache_release(page);
369 } while (bvec >= bio->bi_io_vec);
370 if (!uptodate) {
371 if (!wdata->pnfs_error)
372 wdata->pnfs_error = -EIO;
373 bl_set_lo_fail(wdata->lseg);
374 }
375 bio_put(bio);
376 put_parallel(par);
377}
378
379/* This is basically copied from mpage_end_io_read */
380static void bl_end_io_write(struct bio *bio, int err)
381{
382 struct parallel_io *par = bio->bi_private;
383 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
384 struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
385
386 if (!uptodate) {
387 if (!wdata->pnfs_error)
388 wdata->pnfs_error = -EIO;
389 bl_set_lo_fail(wdata->lseg);
390 }
391 bio_put(bio);
392 put_parallel(par);
393}
394
395/* Function scheduled for call during bl_end_par_io_write,
396 * it marks sectors as written and extends the commitlist.
397 */
398static void bl_write_cleanup(struct work_struct *work)
399{
400 struct rpc_task *task;
401 struct nfs_write_data *wdata;
402 dprintk("%s enter\n", __func__);
403 task = container_of(work, struct rpc_task, u.tk_work);
404 wdata = container_of(task, struct nfs_write_data, task);
405 if (!wdata->pnfs_error) {
406 /* Marks for LAYOUTCOMMIT */
407 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
408 wdata->args.offset, wdata->args.count);
409 }
410 pnfs_ld_write_done(wdata);
411}
412
413/* Called when last of bios associated with a bl_write_pagelist call finishes */
414static void bl_end_par_io_write(void *data)
415{
416 struct nfs_write_data *wdata = data;
417
418 wdata->task.tk_status = 0;
419 wdata->verf.committed = NFS_FILE_SYNC;
420 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
421 schedule_work(&wdata->task.u.tk_work);
422}
423
424/* FIXME STUB - mark intersection of layout and page as bad, so is not
425 * used again.
426 */
427static void mark_bad_read(void)
428{
429 return;
430}
431
432/*
433 * map_block: map a requested I/O block (isect) into an offset in the LVM
434 * block_device
435 */
436static void
437map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
438{
439 dprintk("%s enter be=%p\n", __func__, be);
440
441 set_buffer_mapped(bh);
442 bh->b_bdev = be->be_mdev;
443 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
444 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
445
446 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
447 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
448 bh->b_size);
449 return;
450}
451
452/* Given an unmapped page, zero it or read in page for COW, page is locked
453 * by caller.
454 */
455static int
456init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
457{
458 struct buffer_head *bh = NULL;
459 int ret = 0;
460 sector_t isect;
461
462 dprintk("%s enter, %p\n", __func__, page);
463 BUG_ON(PageUptodate(page));
464 if (!cow_read) {
465 zero_user_segment(page, 0, PAGE_SIZE);
466 SetPageUptodate(page);
467 goto cleanup;
468 }
469
470 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
471 if (!bh) {
472 ret = -ENOMEM;
473 goto cleanup;
474 }
475
476 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
477 map_block(bh, isect, cow_read);
478 if (!bh_uptodate_or_lock(bh))
479 ret = bh_submit_read(bh);
480 if (ret)
481 goto cleanup;
482 SetPageUptodate(page);
483
484cleanup:
485 bl_put_extent(cow_read);
486 if (bh)
487 free_buffer_head(bh);
488 if (ret) {
489 /* Need to mark layout with bad read...should now
490 * just use nfs4 for reads and writes.
491 */
492 mark_bad_read();
493 }
494 return ret;
495}
496
497static enum pnfs_try_status
498bl_write_pagelist(struct nfs_write_data *wdata, int sync)
499{
500 int i, ret, npg_zero, pg_index, last = 0;
501 struct bio *bio = NULL;
502 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
503 sector_t isect, last_isect = 0, extent_length = 0;
504 struct parallel_io *par;
505 loff_t offset = wdata->args.offset;
506 size_t count = wdata->args.count;
507 struct page **pages = wdata->args.pages;
508 struct page *page;
509 pgoff_t index;
510 u64 temp;
511 int npg_per_block =
512 NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
513
514 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
515 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
516 * We want to write each, and if there is an error set pnfs_error
517 * to have it redone using nfs.
518 */
519 par = alloc_parallel(wdata);
520 if (!par)
521 return PNFS_NOT_ATTEMPTED;
522 par->call_ops = *wdata->mds_ops;
523 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
524 par->pnfs_callback = bl_end_par_io_write;
525 /* At this point, have to be more careful with error handling */
526
527 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
528 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
529 if (!be || !is_writable(be, isect)) {
530 dprintk("%s no matching extents!\n", __func__);
531 wdata->pnfs_error = -EINVAL;
532 goto out;
533 }
534
535 /* First page inside INVALID extent */
536 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
537 temp = offset >> PAGE_CACHE_SHIFT;
538 npg_zero = do_div(temp, npg_per_block);
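		/* do_div() divides temp in place and returns the remainder,
		 * so npg_zero is how many pages the write offset sits past
		 * the previous server-block boundary; those leading pages
		 * may need zeroing.
		 */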
539 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
540 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
541 extent_length = be->be_length - (isect - be->be_f_offset);
542
543fill_invalid_ext:
544 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
545		for (; npg_zero > 0; npg_zero--) {
546 /* page ref released in bl_end_io_write_zero */
547 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
548 dprintk("%s zero %dth page: index %lu isect %llu\n",
549 __func__, npg_zero, index,
550 (unsigned long long)isect);
551 page =
552 find_or_create_page(wdata->inode->i_mapping, index,
553 GFP_NOFS);
554 if (!page) {
555 dprintk("%s oom\n", __func__);
556 wdata->pnfs_error = -ENOMEM;
557 goto out;
558 }
559
560			/* PageDirty: someone else will write this out
561			 * PageWriteback: someone else is writing this out
562			 * PageUptodate: it was read before
563			 * sector_initialized: already written out
564			 */
565 if (PageDirty(page) || PageWriteback(page) ||
566 bl_is_sector_init(be->be_inval, isect)) {
567 print_page(page);
568 unlock_page(page);
569 page_cache_release(page);
570 goto next_page;
571 }
572 if (!PageUptodate(page)) {
573				/* New page: read it in or zero it */
574 init_page_for_write(page, cow_read);
575 }
576 set_page_writeback(page);
577 unlock_page(page);
578
579 ret = bl_mark_sectors_init(be->be_inval, isect,
580 PAGE_CACHE_SECTORS,
581 NULL);
582 if (unlikely(ret)) {
583 dprintk("%s bl_mark_sectors_init fail %d\n",
584 __func__, ret);
585 end_page_writeback(page);
586 page_cache_release(page);
587 wdata->pnfs_error = ret;
588 goto out;
589 }
590 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
591 isect, page, be,
592 bl_end_io_write_zero, par);
593 if (IS_ERR(bio)) {
594 wdata->pnfs_error = PTR_ERR(bio);
595 goto out;
596 }
597 /* FIXME: This should be done in bi_end_io */
598 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
599 page->index << PAGE_CACHE_SHIFT,
600 PAGE_CACHE_SIZE);
601next_page:
602 isect += PAGE_CACHE_SECTORS;
603 extent_length -= PAGE_CACHE_SECTORS;
604 }
605 if (last)
606 goto write_done;
607 }
608 bio = bl_submit_bio(WRITE, bio);
609
610 /* Middle pages */
611 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
612 for (i = pg_index; i < wdata->npages; i++) {
613 if (!extent_length) {
614 /* We've used up the previous extent */
615 bl_put_extent(be);
616 bio = bl_submit_bio(WRITE, bio);
617 /* Get the next one */
618 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
619 isect, NULL);
620 if (!be || !is_writable(be, isect)) {
621 wdata->pnfs_error = -EINVAL;
622 goto out;
623 }
624 extent_length = be->be_length -
625 (isect - be->be_f_offset);
626 }
627 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
628 ret = bl_mark_sectors_init(be->be_inval, isect,
629 PAGE_CACHE_SECTORS,
630 NULL);
631 if (unlikely(ret)) {
632 dprintk("%s bl_mark_sectors_init fail %d\n",
633 __func__, ret);
634 wdata->pnfs_error = ret;
635 goto out;
636 }
637 }
638 bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
639 isect, pages[i], be,
640 bl_end_io_write, par);
641 if (IS_ERR(bio)) {
642 wdata->pnfs_error = PTR_ERR(bio);
643 goto out;
644 }
645 isect += PAGE_CACHE_SECTORS;
646 last_isect = isect;
647 extent_length -= PAGE_CACHE_SECTORS;
648 }
649
650 /* Last page inside INVALID extent */
651 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
652 bio = bl_submit_bio(WRITE, bio);
653 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
654 npg_zero = npg_per_block - do_div(temp, npg_per_block);
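		/* The do_div() remainder counts pages already covered in this
		 * block, so the difference is the number of trailing pages to
		 * zero; if the write ended exactly on a block boundary the
		 * difference equals npg_per_block and nothing is zeroed.
		 */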
655 if (npg_zero < npg_per_block) {
656 last = 1;
657 goto fill_invalid_ext;
658 }
659 }
660
661write_done:
662 wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
663 if (count < wdata->res.count) {
664 wdata->res.count = count;
665 }
666out:
667 bl_put_extent(be);
668 bl_submit_bio(WRITE, bio);
669 put_parallel(par);
670 return PNFS_ATTEMPTED;
671}
672
673/* FIXME - range ignored */
674static void
675release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
676{
677 int i;
678 struct pnfs_block_extent *be;
679
680 spin_lock(&bl->bl_ext_lock);
681 for (i = 0; i < EXTENT_LISTS; i++) {
682 while (!list_empty(&bl->bl_extents[i])) {
683 be = list_first_entry(&bl->bl_extents[i],
684 struct pnfs_block_extent,
685 be_node);
686 list_del(&be->be_node);
687 bl_put_extent(be);
688 }
689 }
690 spin_unlock(&bl->bl_ext_lock);
691}
692
693static void
694release_inval_marks(struct pnfs_inval_markings *marks)
695{
696 struct pnfs_inval_tracking *pos, *temp;
697
698 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
699 list_del(&pos->it_link);
700 kfree(pos);
701 }
702 return;
703}
704
705static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
706{
707 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
708
709 dprintk("%s enter\n", __func__);
710 release_extents(bl, NULL);
711 release_inval_marks(&bl->bl_inval);
712 kfree(bl);
713}
714
715static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
716 gfp_t gfp_flags)
717{
718 struct pnfs_block_layout *bl;
719
720 dprintk("%s enter\n", __func__);
721 bl = kzalloc(sizeof(*bl), gfp_flags);
722 if (!bl)
723 return NULL;
724 spin_lock_init(&bl->bl_ext_lock);
725 INIT_LIST_HEAD(&bl->bl_extents[0]);
726 INIT_LIST_HEAD(&bl->bl_extents[1]);
727 INIT_LIST_HEAD(&bl->bl_commit);
728 INIT_LIST_HEAD(&bl->bl_committing);
729 bl->bl_count = 0;
730 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
731 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
732 return &bl->bl_layout;
733}
734
735static void bl_free_lseg(struct pnfs_layout_segment *lseg)
736{
737 dprintk("%s enter\n", __func__);
738 kfree(lseg);
739}
740
741/* We pretty much ignore lseg, and store all data layout-wide, so we
742 * can merge extents correctly.
743 */
744static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
745 struct nfs4_layoutget_res *lgr,
746 gfp_t gfp_flags)
747{
748 struct pnfs_layout_segment *lseg;
749 int status;
750
751 dprintk("%s enter\n", __func__);
752 lseg = kzalloc(sizeof(*lseg), gfp_flags);
753 if (!lseg)
754 return ERR_PTR(-ENOMEM);
755 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
756 if (status) {
757 /* We don't want to call the full-blown bl_free_lseg,
758 * since on error extents were not touched.
759 */
760 kfree(lseg);
761 return ERR_PTR(status);
762 }
763 return lseg;
764}
765
766static void
767bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
768 const struct nfs4_layoutcommit_args *arg)
769{
770 dprintk("%s enter\n", __func__);
771 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
772}
773
774static void
775bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
776{
777 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
778
779 dprintk("%s enter\n", __func__);
780 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
781}
782
783static void free_blk_mountid(struct block_mount_id *mid)
784{
785 if (mid) {
786 struct pnfs_block_dev *dev;
787 spin_lock(&mid->bm_lock);
788 while (!list_empty(&mid->bm_devlist)) {
789 dev = list_first_entry(&mid->bm_devlist,
790 struct pnfs_block_dev,
791 bm_node);
792 list_del(&dev->bm_node);
793 bl_free_block_dev(dev);
794 }
795 spin_unlock(&mid->bm_lock);
796 kfree(mid);
797 }
798}
799
800/* This is mostly copied from the filelayout's get_device_info function.
801 * It seems much of this should be at the generic pnfs level.
802 */
803static struct pnfs_block_dev *
804nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
805 struct nfs4_deviceid *d_id)
806{
807 struct pnfs_device *dev;
808 struct pnfs_block_dev *rv = NULL;
809 u32 max_resp_sz;
810 int max_pages;
811 struct page **pages = NULL;
812 int i, rc;
813
814 /*
815 * Use the session max response size as the basis for setting
816 * GETDEVICEINFO's maxcount
817 */
818 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
819 max_pages = max_resp_sz >> PAGE_SHIFT;
820 dprintk("%s max_resp_sz %u max_pages %d\n",
821 __func__, max_resp_sz, max_pages);
822
823 dev = kmalloc(sizeof(*dev), GFP_NOFS);
824 if (!dev) {
825 dprintk("%s kmalloc failed\n", __func__);
826 return NULL;
827 }
828
829 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
830 if (pages == NULL) {
831 kfree(dev);
832 return NULL;
833 }
834 for (i = 0; i < max_pages; i++) {
835 pages[i] = alloc_page(GFP_NOFS);
836 if (!pages[i])
837 goto out_free;
838 }
839
840 memcpy(&dev->dev_id, d_id, sizeof(*d_id));
841 dev->layout_type = LAYOUT_BLOCK_VOLUME;
842 dev->pages = pages;
843 dev->pgbase = 0;
844 dev->pglen = PAGE_SIZE * max_pages;
845 dev->mincount = 0;
846
847 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
848 rc = nfs4_proc_getdeviceinfo(server, dev);
849 dprintk("%s getdevice info returns %d\n", __func__, rc);
850 if (rc)
851 goto out_free;
852
853 rv = nfs4_blk_decode_device(server, dev);
854 out_free:
855 for (i = 0; i < max_pages; i++)
856 __free_page(pages[i]);
857 kfree(pages);
858 kfree(dev);
859 return rv;
860}
861
862static int
863bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
864{
865 struct block_mount_id *b_mt_id = NULL;
866 struct pnfs_devicelist *dlist = NULL;
867 struct pnfs_block_dev *bdev;
868 LIST_HEAD(block_disklist);
869 int status = 0, i;
870
871 dprintk("%s enter\n", __func__);
872
873 if (server->pnfs_blksize == 0) {
874 dprintk("%s Server did not return blksize\n", __func__);
875 return -EINVAL;
876 }
877 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
878 if (!b_mt_id) {
879 status = -ENOMEM;
880 goto out_error;
881 }
882 /* Initialize nfs4 block layout mount id */
883 spin_lock_init(&b_mt_id->bm_lock);
884 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
885
886 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
887 if (!dlist) {
888 status = -ENOMEM;
889 goto out_error;
890 }
891 dlist->eof = 0;
892 while (!dlist->eof) {
893 status = nfs4_proc_getdevicelist(server, fh, dlist);
894 if (status)
895 goto out_error;
896 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
897 __func__, dlist->num_devs, dlist->eof);
898 for (i = 0; i < dlist->num_devs; i++) {
899 bdev = nfs4_blk_get_deviceinfo(server, fh,
900 &dlist->dev_id[i]);
901 if (!bdev) {
902 status = -ENODEV;
903 goto out_error;
904 }
905 spin_lock(&b_mt_id->bm_lock);
906 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
907 spin_unlock(&b_mt_id->bm_lock);
908 }
909 }
910 dprintk("%s SUCCESS\n", __func__);
911 server->pnfs_ld_data = b_mt_id;
912
913 out_return:
914 kfree(dlist);
915 return status;
916
917 out_error:
918 free_blk_mountid(b_mt_id);
919 goto out_return;
920}
921
922static int
923bl_clear_layoutdriver(struct nfs_server *server)
924{
925 struct block_mount_id *b_mt_id = server->pnfs_ld_data;
926
927 dprintk("%s enter\n", __func__);
928 free_blk_mountid(b_mt_id);
929 dprintk("%s RETURNS\n", __func__);
930 return 0;
931}
932
933static const struct nfs_pageio_ops bl_pg_read_ops = {
934 .pg_init = pnfs_generic_pg_init_read,
935 .pg_test = pnfs_generic_pg_test,
936 .pg_doio = pnfs_generic_pg_readpages,
937};
938
939static const struct nfs_pageio_ops bl_pg_write_ops = {
940 .pg_init = pnfs_generic_pg_init_write,
941 .pg_test = pnfs_generic_pg_test,
942 .pg_doio = pnfs_generic_pg_writepages,
943};
944
945static struct pnfs_layoutdriver_type blocklayout_type = {
946 .id = LAYOUT_BLOCK_VOLUME,
947 .name = "LAYOUT_BLOCK_VOLUME",
948 .read_pagelist = bl_read_pagelist,
949 .write_pagelist = bl_write_pagelist,
950 .alloc_layout_hdr = bl_alloc_layout_hdr,
951 .free_layout_hdr = bl_free_layout_hdr,
952 .alloc_lseg = bl_alloc_lseg,
953 .free_lseg = bl_free_lseg,
954 .encode_layoutcommit = bl_encode_layoutcommit,
955 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
956 .set_layoutdriver = bl_set_layoutdriver,
957 .clear_layoutdriver = bl_clear_layoutdriver,
958 .pg_read_ops = &bl_pg_read_ops,
959 .pg_write_ops = &bl_pg_write_ops,
960};
961
962static const struct rpc_pipe_ops bl_upcall_ops = {
963 .upcall = bl_pipe_upcall,
964 .downcall = bl_pipe_downcall,
965 .destroy_msg = bl_pipe_destroy_msg,
966};
967
968static int __init nfs4blocklayout_init(void)
969{
970 struct vfsmount *mnt;
971 struct path path;
972 int ret;
973
974 dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
975
976 ret = pnfs_register_layoutdriver(&blocklayout_type);
977 if (ret)
978 goto out;
979
980 init_waitqueue_head(&bl_wq);
981
982 mnt = rpc_get_mount();
983 if (IS_ERR(mnt)) {
984 ret = PTR_ERR(mnt);
985 goto out_remove;
986 }
987
988 ret = vfs_path_lookup(mnt->mnt_root,
989 mnt,
990 NFS_PIPE_DIRNAME, 0, &path);
991 if (ret)
992 goto out_remove;
993
994 bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
995 &bl_upcall_ops, 0);
996 if (IS_ERR(bl_device_pipe)) {
997 ret = PTR_ERR(bl_device_pipe);
998 goto out_remove;
999 }
1000out:
1001 return ret;
1002
1003out_remove:
1004 pnfs_unregister_layoutdriver(&blocklayout_type);
1005 return ret;
1006}
1007
1008static void __exit nfs4blocklayout_exit(void)
1009{
1010 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1011 __func__);
1012
1013 pnfs_unregister_layoutdriver(&blocklayout_type);
1014 rpc_unlink(bl_device_pipe);
1015}
1016
1017MODULE_ALIAS("nfs-layouttype4-3");
1018
1019module_init(nfs4blocklayout_init);
1020module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 000000000000..f27d827960a3
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,207 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
33#define FS_NFS_NFS4BLOCKLAYOUT_H
34
35#include <linux/device-mapper.h>
36#include <linux/nfs_fs.h>
37#include <linux/sunrpc/rpc_pipe_fs.h>
38
39#include "../pnfs.h"
40
41#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
42#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
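/* For example, with 4K pages and 512-byte sectors these work out to
 * PAGE_CACHE_SECTORS == 8 and PAGE_CACHE_SECTOR_SHIFT == 3.
 */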
43
44struct block_mount_id {
45 spinlock_t bm_lock; /* protects list */
46 struct list_head bm_devlist; /* holds pnfs_block_dev */
47};
48
49struct pnfs_block_dev {
50 struct list_head bm_node;
51 struct nfs4_deviceid bm_mdevid; /* associated devid */
52 struct block_device *bm_mdev; /* meta device itself */
53};
54
55enum exstate4 {
56 PNFS_BLOCK_READWRITE_DATA = 0,
57 PNFS_BLOCK_READ_DATA = 1,
58 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
59 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
60};
61
62#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
63
64struct my_tree {
65 sector_t mtt_step_size; /* Internal sector alignment */
66 struct list_head mtt_stub; /* Should be a radix tree */
67};
68
69struct pnfs_inval_markings {
70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */
73};
74
75struct pnfs_inval_tracking {
76 struct list_head it_link;
77 int it_sector;
78 int it_tags;
79};
80
81/* sector_t fields are all in 512-byte sectors */
82struct pnfs_block_extent {
83 struct kref be_refcnt;
84 struct list_head be_node; /* link into lseg list */
85 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
86 struct block_device *be_mdev;
87 sector_t be_f_offset; /* the starting offset in the file */
88 sector_t be_length; /* the size of the extent */
89 sector_t be_v_offset; /* the starting offset in the volume */
90 enum exstate4 be_state; /* the state of this extent */
91 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
92};
93
94/* Shortened extent used by LAYOUTCOMMIT */
95struct pnfs_block_short_extent {
96 struct list_head bse_node;
97 struct nfs4_deviceid bse_devid;
98 struct block_device *bse_mdev;
99 sector_t bse_f_offset; /* the starting offset in the file */
100 sector_t bse_length; /* the size of the extent */
101};
102
103static inline void
104BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{
106 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
108 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize);
111}
112
113enum extentclass4 {
114	RW_EXTENT	= 0, /* READWRITE and INVAL */
115 RO_EXTENT = 1, /* READ and NONE */
116 EXTENT_LISTS = 2,
117};
118
119static inline int bl_choose_list(enum exstate4 state)
120{
121 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
122 return RO_EXTENT;
123 else
124 return RW_EXTENT;
125}
126
127struct pnfs_block_layout {
128 struct pnfs_layout_hdr bl_layout;
129 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
130 spinlock_t bl_ext_lock; /* Protects list manipulation */
131 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
132 struct list_head bl_commit; /* Needs layout commit */
133 struct list_head bl_committing; /* Layout committing */
134 unsigned int bl_count; /* entries in bl_commit */
135 sector_t bl_blocksize; /* Server blocksize in sectors */
136};
137
138#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
139
140static inline struct pnfs_block_layout *
141BLK_LO2EXT(struct pnfs_layout_hdr *lo)
142{
143 return container_of(lo, struct pnfs_block_layout, bl_layout);
144}
145
146static inline struct pnfs_block_layout *
147BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
148{
149 return BLK_LO2EXT(lseg->pls_layout);
150}
151
152struct bl_dev_msg {
153 int status;
154 uint32_t major, minor;
155};
156
157struct bl_msg_hdr {
158 u8 type;
159 u16 totallen; /* length of entire message, including hdr itself */
160};
161
162extern struct dentry *bl_device_pipe;
163extern wait_queue_head_t bl_wq;
164
165#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
166#define BL_DEVICE_MOUNT               0x1 /* Mount--create devices */
167#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
168#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
169#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
170
171/* blocklayoutdev.c */
172ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
173 char __user *, size_t);
174ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
175void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
176struct block_device *nfs4_blkdev_get(dev_t dev);
177int nfs4_blkdev_put(struct block_device *bdev);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
179 struct pnfs_device *dev);
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
182
183/* blocklayoutdm.c */
184void bl_free_block_dev(struct pnfs_block_dev *bdev);
185
186/* extents.c */
187struct pnfs_block_extent *
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
189 struct pnfs_block_extent **cow_read);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
191 sector_t offset, sector_t length,
192 sector_t **pages);
193void bl_put_extent(struct pnfs_block_extent *be);
194struct pnfs_block_extent *bl_alloc_extent(void);
195int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
196int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
197 struct xdr_stream *xdr,
198 const struct nfs4_layoutcommit_args *arg);
199void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200 const struct nfs4_layoutcommit_args *arg,
201 int status);
202int bl_add_merge_extent(struct pnfs_block_layout *bl,
203 struct pnfs_block_extent *new);
204int bl_mark_for_commit(struct pnfs_block_extent *be,
205 sector_t offset, sector_t length);
206
207#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 000000000000..a83b393fb01c
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,410 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
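/* The server expresses offsets and lengths in bytes; we keep them as
 * 512-byte sector counts. The 0x1ff mask tests the low 9 bits, rejecting
 * any value that is not sector-aligned before shifting by SECTOR_SHIFT.
 */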
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "%s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/* Open a block_device by device number. */
57struct block_device *nfs4_blkdev_get(dev_t dev)
58{
59 struct block_device *bd;
60
61 dprintk("%s enter\n", __func__);
62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
63 if (IS_ERR(bd))
64 goto fail;
65 return bd;
66fail:
67 dprintk("%s failed to open device : %ld\n",
68 __func__, PTR_ERR(bd));
69 return NULL;
70}
71
72/*
73 * Release the block device
74 */
75int nfs4_blkdev_put(struct block_device *bdev)
76{
77 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
78 MINOR(bdev->bd_dev));
79 return blkdev_put(bdev, FMODE_READ);
80}
81
82/*
83 * Shouldn't there be an rpc_generic_upcall() to do this for us?
84 */
85ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
86 char __user *dst, size_t buflen)
87{
88 char *data = (char *)msg->data + msg->copied;
89 size_t mlen = min(msg->len - msg->copied, buflen);
90 unsigned long left;
91
92 left = copy_to_user(dst, data, mlen);
93 if (left == mlen) {
94 msg->errno = -EFAULT;
95 return -EFAULT;
96 }
97
98 mlen -= left;
99 msg->copied += mlen;
100 msg->errno = 0;
101 return mlen;
102}
103
104static struct bl_dev_msg bl_mount_reply;
105
106ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
107 size_t mlen)
108{
109 if (mlen != sizeof (struct bl_dev_msg))
110 return -EINVAL;
111
112 if (copy_from_user(&bl_mount_reply, src, mlen) != 0)
113 return -EFAULT;
114
115 wake_up(&bl_wq);
116
117 return mlen;
118}
119
120void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
121{
122 if (msg->errno >= 0)
123 return;
124 wake_up(&bl_wq);
125}
126
127/*
128 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
129 */
130struct pnfs_block_dev *
131nfs4_blk_decode_device(struct nfs_server *server,
132 struct pnfs_device *dev)
133{
134 struct pnfs_block_dev *rv = NULL;
135 struct block_device *bd = NULL;
136 struct rpc_pipe_msg msg;
137 struct bl_msg_hdr bl_msg = {
138 .type = BL_DEVICE_MOUNT,
139 .totallen = dev->mincount,
140 };
141 uint8_t *dataptr;
142 DECLARE_WAITQUEUE(wq, current);
143 struct bl_dev_msg *reply = &bl_mount_reply;
144 int offset, len, i;
145
146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
148 dev->mincount);
149
150 memset(&msg, 0, sizeof(msg));
151 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
152 if (!msg.data) {
153 rv = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
158 dataptr = (uint8_t *) msg.data;
159 len = dev->mincount;
160 offset = sizeof(bl_msg);
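	/* Flatten the XDR response pages into one contiguous upcall message
	 * after the header; the final page may be partial, hence the
	 * min()-style copy length.
	 */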
161 for (i = 0; len > 0; i++) {
162 memcpy(&dataptr[offset], page_address(dev->pages[i]),
163 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
164 len -= PAGE_CACHE_SIZE;
165 offset += PAGE_CACHE_SIZE;
166 }
167 msg.len = sizeof(bl_msg) + dev->mincount;
168
169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
170 add_wait_queue(&bl_wq, &wq);
171 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
172 remove_wait_queue(&bl_wq, &wq);
173 goto out;
174 }
175
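	/* Sleep until bl_pipe_downcall() copies the daemon's reply into the
	 * global bl_mount_reply and wakes bl_wq; this appears to rely on the
	 * mount path serializing concurrent device upcalls.
	 */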
176 set_current_state(TASK_UNINTERRUPTIBLE);
177 schedule();
178 __set_current_state(TASK_RUNNING);
179 remove_wait_queue(&bl_wq, &wq);
180
181 if (reply->status != BL_DEVICE_REQUEST_PROC) {
182 dprintk("%s failed to open device: %d\n",
183 __func__, reply->status);
184 rv = ERR_PTR(-EINVAL);
185 goto out;
186 }
187
188 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
189 if (IS_ERR(bd)) {
190 dprintk("%s failed to open device : %ld\n",
191 __func__, PTR_ERR(bd));
192 goto out;
193 }
194
195 rv = kzalloc(sizeof(*rv), GFP_NOFS);
196 if (!rv) {
197 rv = ERR_PTR(-ENOMEM);
198 goto out;
199 }
200
201 rv->bm_mdev = bd;
202 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
203 dprintk("%s Created device %s with bd_block_size %u\n",
204 __func__,
205 bd->bd_disk->disk_name,
206 bd->bd_block_size);
207
208out:
209 kfree(msg.data);
210 return rv;
211}
212
213/* Map deviceid returned by the server to constructed block_device */
214static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
215 struct nfs4_deviceid *id)
216{
217 struct block_device *rv = NULL;
218 struct block_mount_id *mid;
219 struct pnfs_block_dev *dev;
220
221 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
222 mid = BLK_ID(lo);
223 spin_lock(&mid->bm_lock);
224 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
225 if (memcmp(id->data, dev->bm_mdevid.data,
226 NFS4_DEVICEID4_SIZE) == 0) {
227 rv = dev->bm_mdev;
228 goto out;
229 }
230 }
231 out:
232 spin_unlock(&mid->bm_lock);
233 dprintk("%s returning %p\n", __func__, rv);
234 return rv;
235}
236
237/* Tracks info needed to ensure extents in layout obey constraints of spec */
238struct layout_verification {
239 u32 mode; /* R or RW */
240 u64 start; /* Expected start of next non-COW extent */
241 u64 inval; /* Start of INVAL coverage */
242 u64 cowread; /* End of COW read coverage */
243};
244
245/* Verify the extent meets the layout requirements of the pnfs-block draft,
246 * section 2.3.1.
247 */
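/* Informally: in RW mode the READWRITE/INVALID extents must tile the range
 * in order, while READ extents may only appear as a COW source over an
 * INVALID span, tracked via lv->inval and lv->cowread.
 */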
248static int verify_extent(struct pnfs_block_extent *be,
249 struct layout_verification *lv)
250{
251 if (lv->mode == IOMODE_READ) {
252 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
253 be->be_state == PNFS_BLOCK_INVALID_DATA)
254 return -EIO;
255 if (be->be_f_offset != lv->start)
256 return -EIO;
257 lv->start += be->be_length;
258 return 0;
259 }
260 /* lv->mode == IOMODE_RW */
261 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
262 if (be->be_f_offset != lv->start)
263 return -EIO;
264 if (lv->cowread > lv->start)
265 return -EIO;
266 lv->start += be->be_length;
267 lv->inval = lv->start;
268 return 0;
269 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
270 if (be->be_f_offset != lv->start)
271 return -EIO;
272 lv->start += be->be_length;
273 return 0;
274 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
275 if (be->be_f_offset > lv->start)
276 return -EIO;
277 if (be->be_f_offset < lv->inval)
278 return -EIO;
279 if (be->be_f_offset < lv->cowread)
280 return -EIO;
281 /* It looks like you might want to min this with lv->start,
282 * but you really don't.
283 */
284 lv->inval = lv->inval + be->be_length;
285 lv->cowread = be->be_f_offset + be->be_length;
286 return 0;
287 } else
288 return -EIO;
289}
290
291/* XDR decode pnfs_block_layout4 structure */
292int
293nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
294 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
295{
296 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
297 int i, status = -EIO;
298 uint32_t count;
299 struct pnfs_block_extent *be = NULL, *save;
300 struct xdr_stream stream;
301 struct xdr_buf buf;
302 struct page *scratch;
303 __be32 *p;
304 struct layout_verification lv = {
305 .mode = lgr->range.iomode,
306 .start = lgr->range.offset >> SECTOR_SHIFT,
307 .inval = lgr->range.offset >> SECTOR_SHIFT,
308 .cowread = lgr->range.offset >> SECTOR_SHIFT,
309 };
310 LIST_HEAD(extents);
311
312 dprintk("---> %s\n", __func__);
313
314 scratch = alloc_page(gfp_flags);
315 if (!scratch)
316 return -ENOMEM;
317
318 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
319 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
320
321 p = xdr_inline_decode(&stream, 4);
322 if (unlikely(!p))
323 goto out_err;
324
325 count = be32_to_cpup(p++);
326
327 dprintk("%s enter, number of extents %i\n", __func__, count);
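	/* Each extent on the wire is a deviceid followed by three 64-bit
	 * sector values and a 32-bit state: NFS4_DEVICEID4_SIZE + 3 * 8 + 4
	 * = NFS4_DEVICEID4_SIZE + 28 bytes.
	 */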
328 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
329 if (unlikely(!p))
330 goto out_err;
331
332 /* Decode individual extents, putting them in temporary
333 * staging area until whole layout is decoded to make error
334 * recovery easier.
335 */
336 for (i = 0; i < count; i++) {
337 be = bl_alloc_extent();
338 if (!be) {
339 status = -ENOMEM;
340 goto out_err;
341 }
342 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
343 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
344 be->be_mdev = translate_devid(lo, &be->be_devid);
345 if (!be->be_mdev)
346 goto out_err;
347
348 /* The next three values are read in as bytes,
349 * but stored as 512-byte sector lengths
350 */
351 if (decode_sector_number(&p, &be->be_f_offset) < 0)
352 goto out_err;
353 if (decode_sector_number(&p, &be->be_length) < 0)
354 goto out_err;
355 if (decode_sector_number(&p, &be->be_v_offset) < 0)
356 goto out_err;
357 be->be_state = be32_to_cpup(p++);
358 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
359 be->be_inval = &bl->bl_inval;
360 if (verify_extent(be, &lv)) {
361 dprintk("%s verify failed\n", __func__);
362 goto out_err;
363 }
364 list_add_tail(&be->be_node, &extents);
365 }
366 if (lgr->range.offset + lgr->range.length !=
367 lv.start << SECTOR_SHIFT) {
368 dprintk("%s Final length mismatch\n", __func__);
369 be = NULL;
370 goto out_err;
371 }
372 if (lv.start < lv.cowread) {
373 dprintk("%s Final uncovered COW extent\n", __func__);
374 be = NULL;
375 goto out_err;
376 }
377 /* Extents decoded properly, now try to merge them in to
378 * existing layout extents.
379 */
380 spin_lock(&bl->bl_ext_lock);
381 list_for_each_entry_safe(be, save, &extents, be_node) {
382 list_del(&be->be_node);
383 status = bl_add_merge_extent(bl, be);
384 if (status) {
385 spin_unlock(&bl->bl_ext_lock);
386 /* This is a fairly catastrophic error, as the
387 * entire layout extent lists are now corrupted.
388 * We should have some way to distinguish this.
389 */
390 be = NULL;
391 goto out_err;
392 }
393 }
394 spin_unlock(&bl->bl_ext_lock);
395 status = 0;
396 out:
397 __free_page(scratch);
398 dprintk("%s returns %i\n", __func__, status);
399 return status;
400
401 out_err:
402 bl_put_extent(be);
403 while (!list_empty(&extents)) {
404 be = list_first_entry(&extents, struct pnfs_block_extent,
405 be_node);
406 list_del(&be->be_node);
407 bl_put_extent(be);
408 }
409 goto out;
410}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 000000000000..d055c7558073
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(dev_t dev)
42{
43 struct rpc_pipe_msg msg;
44 struct bl_dev_msg bl_umount_request;
45 struct bl_msg_hdr bl_msg = {
46 .type = BL_DEVICE_UMOUNT,
47 .totallen = sizeof(bl_umount_request),
48 };
49 uint8_t *dataptr;
50 DECLARE_WAITQUEUE(wq, current);
51
52 dprintk("Entering %s\n", __func__);
53
54 memset(&msg, 0, sizeof(msg));
55 msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
56 if (!msg.data)
57 goto out;
58
59 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
60 bl_umount_request.major = MAJOR(dev);
61 bl_umount_request.minor = MINOR(dev);
62
63 memcpy(msg.data, &bl_msg, sizeof(bl_msg));
64 dataptr = (uint8_t *) msg.data;
65 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
66 msg.len = sizeof(bl_msg) + bl_msg.totallen;
67
68 add_wait_queue(&bl_wq, &wq);
69 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
70 remove_wait_queue(&bl_wq, &wq);
71 goto out;
72 }
73
74 set_current_state(TASK_UNINTERRUPTIBLE);
75 schedule();
76 __set_current_state(TASK_RUNNING);
77 remove_wait_queue(&bl_wq, &wq);
78
79out:
80 kfree(msg.data);
81}
82
83/*
84 * Release meta device
85 */
86static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
87{
88 int rv;
89
90 dprintk("%s Releasing\n", __func__);
91 rv = nfs4_blkdev_put(bdev->bm_mdev);
92 if (rv)
93 printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
94 __func__, rv);
95
96 dev_remove(bdev->bm_mdev->bd_dev);
97}
98
99void bl_free_block_dev(struct pnfs_block_dev *bdev)
100{
101 if (bdev) {
102 if (bdev->bm_mdev) {
103 dprintk("%s Removing DM device: %d:%d\n",
104 __func__,
105 MAJOR(bdev->bm_mdev->bd_dev),
106 MINOR(bdev->bm_mdev->bd_dev));
107 nfs4_blk_metadev_release(bdev);
108 }
109 kfree(bdev);
110 }
111}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 000000000000..19fa7b0b8c00
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,935 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - do_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
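/* For example, with base 8: normalize(13, 8) == 8 and
 * normalize_up(13, 8) == 16, while normalize_up(16, 8) == 16.
 */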
54
55/* Complete stub using a list while we determine the API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates an entry with the tag or, if the entry already exists, ORs the
89 * tag into it. If storage is not NULL, a newly created entry will use it.
90 * Returns the number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 if (storage)
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s;
121 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link);
123 return 1;
124 }
125}
126
127/* XXXX Really want option to not create */
128/* Over the range, ORs the tag into existing entries, else creates entries with the tag */
129static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
130{
131 u64 i;
132
133 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
134 for (i = normalize(s, tree->mtt_step_size); i < s + length;
135 i += tree->mtt_step_size)
136 if (_add_entry(tree, i, tag, NULL))
137 return -ENOMEM;
138 return 0;
139}
140
141/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
143{
144 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage;
147
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size);
150 end = normalize_up(offset + length, tree->mtt_step_size);
151 count = (int)(end - start) / (int)tree->mtt_step_size;
152
153 /* Pre-malloc what memory we might need */
154 storage = kmalloc(sizeof(*storage) * count, GFP_NOFS);
155 if (!storage)
156 return -ENOMEM;
157 for (i = 0; i < count; i++) {
158 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
159 GFP_NOFS);
160 if (!storage[i])
161 goto out_cleanup;
162 }
163
164 /* Now need lock - HOW??? */
165
166 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
168
169 /* Unlock - HOW??? */
170 status = 0;
171
172 out_cleanup:
173 for (i = used; i < count; i++) {
174 if (!storage[i])
175 break;
176 kfree(storage[i]);
177 }
178 kfree(storage);
179 return status;
180}
181
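/* 'array' is kept sorted and terminated by ~0. Insert 'offset' in order,
 * shifting the tail (terminator included) up one slot; the caller must
 * have sized the array with room for the insertion.
 */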
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on page lock to serialize this */
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{
212 int rv;
213
214 spin_lock(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock);
217 return rv;
218}
219
220/* Assume start, end already sector aligned */
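/* Walk the stub list from high sectors down: the entry at end - step must
 * carry the tag, and every step-sized slot below it down to start must
 * follow contiguously with the tag set; otherwise the range is not fully
 * tagged.
 */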
221static int
222_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
223{
224 struct pnfs_inval_tracking *pos;
225 u64 expect = 0;
226
227 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
228 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
229 if (pos->it_sector >= end)
230 continue;
231 if (!expect) {
232 if ((pos->it_sector == end - tree->mtt_step_size) &&
233 (pos->it_tags & (1 << tag))) {
234 expect = pos->it_sector - tree->mtt_step_size;
235 if (pos->it_sector < tree->mtt_step_size || expect < start)
236 return 1;
237 continue;
238 } else {
239 return 0;
240 }
241 }
242 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
243 return 0;
244 expect -= tree->mtt_step_size;
245 if (expect < start)
246 return 1;
247 }
248 return 0;
249}
250
251static int is_range_written(struct pnfs_inval_markings *marks,
252 sector_t start, sector_t end)
253{
254 int rv;
255
256 spin_lock(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock);
259 return rv;
260}
261
262/* Marks sectors in [offset, offset+length) as having been initialized.
263 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where a partial block is initialized, and helps prepare it for
265 * complete initialization later.
266 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length,
270 sector_t **pages)
271{
272 sector_t s, start, end;
273 sector_t *array = NULL; /* Pages to mark */
274
275 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286
287 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start))
290 goto outerr;
291
292 spin_lock(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0;
320
321 out_unlock:
322 spin_unlock(&marks->im_lock);
323 outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM;
329}
330
331/* Marks sectors in [offset, offset+length) as having been written to disk.
332 * All lengths should be block-aligned.
333 */
334static int mark_written_sectors(struct pnfs_inval_markings *marks,
335 sector_t offset, sector_t length)
336{
337 int status;
338
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock);
344 return status;
345}
346
347static void print_short_extent(struct pnfs_block_short_extent *be)
348{
349 dprintk("PRINT SHORT EXTENT extent %p\n", be);
350 if (be) {
351 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
352 dprintk(" be_length %llu\n", (u64)be->bse_length);
353 }
354}
355
356static void print_clist(struct list_head *list, unsigned int count)
357{
358 struct pnfs_block_short_extent *be;
359 unsigned int i = 0;
360
361 ifdebug(FACILITY) {
362 printk(KERN_DEBUG "****************\n");
363 printk(KERN_DEBUG "Extent list looks like:\n");
364 list_for_each_entry(be, list, bse_node) {
365 i++;
366 print_short_extent(be);
367 }
368 if (i != count)
369 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
370 printk(KERN_DEBUG "****************\n");
371 }
372}
373
374/* Note: In theory, we should do more checking that devids match between
375 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
376 */
377/* Note this is very similar to bl_add_merge_extent */
378static void add_to_commitlist(struct pnfs_block_layout *bl,
379 struct pnfs_block_short_extent *new)
380{
381 struct list_head *clist = &bl->bl_commit;
382 struct pnfs_block_short_extent *old, *save;
383 sector_t end = new->bse_f_offset + new->bse_length;
384
385 dprintk("%s enter\n", __func__);
386 print_short_extent(new);
387 print_clist(clist, bl->bl_count);
388 bl->bl_count++;
389 /* Scan for proper place to insert, extending new to the left
390 * as much as possible.
391 */
392 list_for_each_entry_safe(old, save, clist, bse_node) {
393 if (new->bse_f_offset < old->bse_f_offset)
394 break;
395 if (end <= old->bse_f_offset + old->bse_length) {
396 /* Range is already in list */
397 bl->bl_count--;
398 kfree(new);
399 return;
400 } else if (new->bse_f_offset <=
401 old->bse_f_offset + old->bse_length) {
402 /* new overlaps or abuts existing be */
403 if (new->bse_mdev == old->bse_mdev) {
404 /* extend new to fully replace old */
405 new->bse_length += new->bse_f_offset -
406 old->bse_f_offset;
407 new->bse_f_offset = old->bse_f_offset;
408 list_del(&old->bse_node);
409 bl->bl_count--;
410 kfree(old);
411 }
412 }
413 }
414 /* Note that if we never hit the above break, old will not point to a
415 * valid extent. However, in that case &old->bse_node==list.
416 */
417 list_add_tail(&new->bse_node, &old->bse_node);
418 /* Scan forward for overlaps. If we find any, extend new and
419 * remove the overlapped extent.
420 */
421 old = list_prepare_entry(new, clist, bse_node);
422 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
423 if (end < old->bse_f_offset)
424 break;
425 /* new overlaps or abuts old */
426 if (new->bse_mdev == old->bse_mdev) {
427 if (end < old->bse_f_offset + old->bse_length) {
428 /* extend new to fully cover old */
429 end = old->bse_f_offset + old->bse_length;
430 new->bse_length = end - new->bse_f_offset;
431 }
432 list_del(&old->bse_node);
433 bl->bl_count--;
434 kfree(old);
435 }
436 }
437 dprintk("%s: after merging\n", __func__);
438 print_clist(clist, bl->bl_count);
439}
440
441/* Note the range described by offset, length is guaranteed to be contained
442 * within be.
443 */
444int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length)
446{
447 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout,
451 bl_inval);
452
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has
460 * been entirely written to disk.
461 */
462 new->bse_f_offset = offset;
463 offset = normalize(offset, bl->bl_blocksize);
464 if (offset < new->bse_f_offset) {
465 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
466 new->bse_f_offset = offset;
467 else
468 new->bse_f_offset = offset + bl->bl_blocksize;
469 }
470 new_end = normalize_up(end, bl->bl_blocksize);
471 if (end < new_end) {
472 if (is_range_written(be->be_inval, end, new_end))
473 end = new_end;
474 else
475 end = new_end - bl->bl_blocksize;
476 }
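	/* A worked example, assuming bl_blocksize == 8 sectors: for a written
	 * range [10, 20), each end moves out to its block boundary ([8, 24))
	 * only if the flanking sectors [8, 10) / [20, 24) were already
	 * written, and is otherwise pulled in to the next boundary; with
	 * neither flank written the range collapses to [16, 16), which the
	 * check below discards.
	 */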
477 if (end <= new->bse_f_offset) {
478 kfree(new);
479 return 0;
480 }
481 new->bse_length = end - new->bse_f_offset;
482 new->bse_devid = be->be_devid;
483 new->bse_mdev = be->be_mdev;
484
485 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock);
491 return 0;
492}
493
494static void print_bl_extent(struct pnfs_block_extent *be)
495{
496 dprintk("PRINT EXTENT extent %p\n", be);
497 if (be) {
498 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
499 dprintk(" be_length %llu\n", (u64)be->be_length);
500 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
501 dprintk(" be_state %d\n", be->be_state);
502 }
503}
504
505static void
506destroy_extent(struct kref *kref)
507{
508 struct pnfs_block_extent *be;
509
510 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
511 dprintk("%s be=%p\n", __func__, be);
512 kfree(be);
513}
514
515void
516bl_put_extent(struct pnfs_block_extent *be)
517{
518 if (be) {
519 dprintk("%s enter %p (%i)\n", __func__, be,
520 atomic_read(&be->be_refcnt.refcount));
521 kref_put(&be->be_refcnt, destroy_extent);
522 }
523}
524
525struct pnfs_block_extent *bl_alloc_extent(void)
526{
527 struct pnfs_block_extent *be;
528
529 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
530 if (!be)
531 return NULL;
532 INIT_LIST_HEAD(&be->be_node);
533 kref_init(&be->be_refcnt);
534 be->be_inval = NULL;
535 return be;
536}
537
538static void print_elist(struct list_head *list)
539{
540 struct pnfs_block_extent *be;
541 dprintk("****************\n");
542 dprintk("Extent list looks like:\n");
543 list_for_each_entry(be, list, be_node) {
544 print_bl_extent(be);
545 }
546 dprintk("****************\n");
547}
548
549static inline int
550extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
551{
552 /* Note this assumes new->be_f_offset >= old->be_f_offset */
553 return (new->be_state == old->be_state) &&
554 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
555 ((new->be_v_offset - old->be_v_offset ==
556 new->be_f_offset - old->be_f_offset) &&
557 new->be_mdev == old->be_mdev));
558}
559
/* Adds new to appropriate list in bl, modifying new and removing existing
 * extents as appropriate to deal with overlaps.
 *
 * See bl_find_get_extent for list constraints.
 *
 * Refcount on new is already set. If we end up not using it, or if we
 * error out, we need to put the reference.
 *
 * bl->bl_ext_lock is held by caller.
 */
int
bl_add_merge_extent(struct pnfs_block_layout *bl,
		    struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be, *tmp;
	sector_t end = new->be_f_offset + new->be_length;
	struct list_head *list;

	dprintk("%s enter with be=%p\n", __func__, new);
	print_bl_extent(new);
	list = &bl->bl_extents[bl_choose_list(new->be_state)];
	print_elist(list);

	/* Scan for proper place to insert, extending new to the left
	 * as much as possible.
	 */
	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
		if (new->be_f_offset >= be->be_f_offset + be->be_length)
			break;
		if (new->be_f_offset >= be->be_f_offset) {
			if (end <= be->be_f_offset + be->be_length) {
				/* new is a subset of existing be */
				if (extents_consistent(be, new)) {
					dprintk("%s: new is subset, ignoring\n",
						__func__);
					bl_put_extent(new);
					return 0;
				} else {
					goto out_err;
				}
			} else {
				/* |<--    be    -->|
				 *        |<--    new    -->| */
				if (extents_consistent(be, new)) {
					/* extend new to fully replace be */
					new->be_length += new->be_f_offset -
						be->be_f_offset;
					new->be_f_offset = be->be_f_offset;
					new->be_v_offset = be->be_v_offset;
					dprintk("%s: removing %p\n", __func__, be);
					list_del(&be->be_node);
					bl_put_extent(be);
				} else {
					goto out_err;
				}
			}
		} else if (end >= be->be_f_offset + be->be_length) {
			/* new extent overlaps all of existing be */
			if (extents_consistent(be, new)) {
				/* extend new to fully replace be */
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		} else if (end > be->be_f_offset) {
			/*        |<--    be    -->|
			 * |<--    new    -->| */
			if (extents_consistent(new, be)) {
				/* extend new to fully replace be */
				new->be_length += be->be_f_offset + be->be_length -
					new->be_f_offset - new->be_length;
				dprintk("%s: removing %p\n", __func__, be);
				list_del(&be->be_node);
				bl_put_extent(be);
			} else {
				goto out_err;
			}
		}
	}
	/* Note that if we never hit the above break, be will not point to a
	 * valid extent. However, in that case &be->be_node == list.
	 */
	list_add(&new->be_node, &be->be_node);
	dprintk("%s: inserting new\n", __func__);
	print_elist(list);
	/* FIXME - The per-list consistency checks have all been done,
	 * should now check cross-list consistency.
	 */
	return 0;

 out_err:
	bl_put_extent(new);
	return -EIO;
}

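/* Editorial note: the loop above distinguishes four overlap cases per
 * existing extent. This standalone sketch (plain C, illustrative
 * ranges only) classifies a new range [new_f, new_end) against an
 * existing one the same way the kernel loop does.
 */
#include <stdio.h>
#include <stdint.h>

static const char *classify(uint64_t be_f, uint64_t be_end,
			    uint64_t new_f, uint64_t new_end)
{
	if (new_f >= be_end)
		return "disjoint, right of be: stop scanning";
	if (new_f >= be_f && new_end <= be_end)
		return "new is a subset of be: drop new";
	if (new_f >= be_f)
		return "new overhangs right: extend new left over be";
	if (new_end >= be_end)
		return "new covers be: remove be";
	if (new_end > be_f)
		return "new overhangs left: extend new right over be";
	return "disjoint, left of be: keep scanning";
}

int main(void)
{
	/* existing extent [8, 24) vs. a few candidate inserts */
	printf("%s\n", classify(8, 24, 24, 32));	/* disjoint right */
	printf("%s\n", classify(8, 24, 10, 20));	/* subset */
	printf("%s\n", classify(8, 24, 16, 32));	/* overhangs right */
	printf("%s\n", classify(8, 24,  0, 32));	/* covers be */
	printf("%s\n", classify(8, 24,  0, 16));	/* overhangs left */
	return 0;
}
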
/* Returns extent, or NULL. If a second READ extent exists, it is returned
 * in cow_read, if given.
 *
 * The extents are kept in two separate ordered lists, one for READ and NONE,
 * one for READWRITE and INVALID. Within each list, we assume:
 * 1. Extents are ordered by file offset.
 * 2. For any given isect, there is at most one extent that matches.
 */
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
		   struct pnfs_block_extent **cow_read)
{
	struct pnfs_block_extent *be, *cow, *ret;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	cow = ret = NULL;
	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				if (!ret)
					ret = be;
				else if (be->be_state != PNFS_BLOCK_READ_DATA)
					bl_put_extent(be);
				else
					cow = be;
				break;
			}
		}
		if (ret &&
		    (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
			break;
	}
	spin_unlock(&bl->bl_ext_lock);
	if (cow_read)
		*cow_read = cow;
	print_bl_extent(ret);
	return ret;
}

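/* Editorial note: because each list is sorted by file offset, a
 * reverse scan can stop as soon as the current extent ends at or
 * before the target sector; nothing earlier in the list can contain
 * it. A standalone sketch of that early break (an array standing in
 * for the kernel list, all values invented):
 */
#include <stdio.h>
#include <stdint.h>

struct sext { uint64_t f_offset, length; };

static const struct sext *find(const struct sext *v, int n, uint64_t isect)
{
	for (int i = n - 1; i >= 0; i--) {	/* reverse, like the kernel */
		if (isect >= v[i].f_offset + v[i].length)
			return NULL;		/* all remaining end earlier */
		if (isect >= v[i].f_offset)
			return &v[i];		/* at most one can match */
	}
	return NULL;
}

int main(void)
{
	const struct sext list[] = { {0, 8}, {8, 16}, {32, 8} };
	const struct sext *hit = find(list, 3, 12);

	if (hit)
		printf("sector 12 in extent [%llu, %llu)\n",
		       (unsigned long long)hit->f_offset,
		       (unsigned long long)(hit->f_offset + hit->length));
	return 0;
}
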
/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
static struct pnfs_block_extent *
bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
{
	struct pnfs_block_extent *be, *ret = NULL;
	int i;

	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
	for (i = 0; i < EXTENT_LISTS; i++) {
		if (ret)
			break;
		list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
			if (isect >= be->be_f_offset + be->be_length)
				break;
			if (isect >= be->be_f_offset) {
				/* We have found an extent */
				dprintk("%s Get %p (%i)\n", __func__, be,
					atomic_read(&be->be_refcnt.refcount));
				kref_get(&be->be_refcnt);
				ret = be;
				break;
			}
		}
	}
	print_bl_extent(ret);
	return ret;
}

int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			       struct xdr_stream *xdr,
			       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_short_extent *lce, *save;
	unsigned int count = 0;
	__be32 *p, *xdr_start;

	dprintk("%s enter\n", __func__);
	/* BUG - creation of bl_commit is buggy - need to wait for
	 * entire block to be marked WRITTEN before it can be added.
	 */
	spin_lock(&bl->bl_ext_lock);
	/* Want to adjust for possible truncate */
	/* We now want to adjust argument range */

	/* XDR encode the ranges found */
	xdr_start = xdr_reserve_space(xdr, 8);
	if (!xdr_start)
		goto out;
	list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
		p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
		if (!p)
			break;
		p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
		p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
		p = xdr_encode_hyper(p, 0LL);
		*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
		list_del(&lce->bse_node);
		list_add_tail(&lce->bse_node, &bl->bl_committing);
		bl->bl_count--;
		count++;
	}
	xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
	xdr_start[1] = cpu_to_be32(count);
out:
	spin_unlock(&bl->bl_ext_lock);
	dprintk("%s found %i ranges\n", __func__, count);
	return 0;
}

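/* Editorial note: each committed range above is a fixed-size XDR
 * record: the 16-byte deviceid followed by three 64-bit hypers (file
 * offset, length, storage offset) and one 32-bit state word, i.e.
 * exactly the 7 * 4 + sizeof(...devid.data) bytes reserved per entry.
 * A userspace sketch of that wire layout (hypothetical deviceid and
 * state value, not the kernel xdr_stream API):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define DEVICEID4_SIZE 16
#define SECTOR_SHIFT 9

static unsigned char *encode_hyper(unsigned char *p, uint64_t v)
{
	uint32_t hi = htonl((uint32_t)(v >> 32)), lo = htonl((uint32_t)v);
	memcpy(p, &hi, 4);
	memcpy(p + 4, &lo, 4);
	return p + 8;
}

int main(void)
{
	unsigned char buf[DEVICEID4_SIZE + 7 * 4];
	unsigned char id[DEVICEID4_SIZE] = { 0xab };	/* hypothetical deviceid */
	unsigned char *p = buf;
	uint64_t f_offset = 8, length = 16;		/* extent, in sectors */
	uint32_t state = htonl(0);			/* illustrative state word */

	memcpy(p, id, DEVICEID4_SIZE);			/* opaque fixed deviceid */
	p += DEVICEID4_SIZE;
	p = encode_hyper(p, f_offset << SECTOR_SHIFT);	/* file offset, bytes */
	p = encode_hyper(p, length << SECTOR_SHIFT);	/* length, bytes */
	p = encode_hyper(p, 0);				/* storage offset */
	memcpy(p, &state, sizeof(state));		/* extent state */
	printf("one extent entry = %zu bytes\n", sizeof(buf));
	return 0;
}
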
/* Helper function for set_to_rw that initializes a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
		 struct pnfs_block_extent *orig,
		 sector_t offset, sector_t length, int state)
{
	kref_init(&new->be_refcnt);
	/* don't need to INIT_LIST_HEAD(&new->be_node) */
	memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
	new->be_mdev = orig->be_mdev;
	new->be_f_offset = offset;
	new->be_length = length;
	new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
	new->be_state = state;
	new->be_inval = orig->be_inval;
}

/* Tries to merge be with the extent in front of it in the list.
 * Frees storage if not used.
 */
static struct pnfs_block_extent *
_front_merge(struct pnfs_block_extent *be, struct list_head *head,
	     struct pnfs_block_extent *storage)
{
	struct pnfs_block_extent *prev;

	if (!storage)
		goto no_merge;
	if (&be->be_node == head || be->be_node.prev == head)
		goto no_merge;
	prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
	if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
	    !extents_consistent(prev, be))
		goto no_merge;
	_prep_new_extent(storage, prev, prev->be_f_offset,
			 prev->be_length + be->be_length, prev->be_state);
	list_replace(&prev->be_node, &storage->be_node);
	bl_put_extent(prev);
	list_del(&be->be_node);
	bl_put_extent(be);
	return storage;

 no_merge:
	kfree(storage);
	return be;
}

static u64
set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
{
	u64 rv = offset + length;
	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
	struct pnfs_block_extent *children[3];
	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
	int i = 0, j;

	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
	/* Create storage for up to three new extents e1, e2, e3 */
	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
	/* BUG - we are ignoring any failure */
	if (!e1 || !e2 || !e3)
		goto out_nosplit;

	spin_lock(&bl->bl_ext_lock);
	be = bl_find_get_extent_locked(bl, offset);
	rv = be->be_f_offset + be->be_length;
	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
		spin_unlock(&bl->bl_ext_lock);
		goto out_nosplit;
	}
	/* Add e* to children, bumping e*'s krefs */
	if (be->be_f_offset != offset) {
		_prep_new_extent(e1, be, be->be_f_offset,
				 offset - be->be_f_offset,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e1;
		print_bl_extent(e1);
	} else
		merge1 = e1;
	_prep_new_extent(e2, be, offset,
			 min(length, be->be_f_offset + be->be_length - offset),
			 PNFS_BLOCK_READWRITE_DATA);
	children[i++] = e2;
	print_bl_extent(e2);
	if (offset + length < be->be_f_offset + be->be_length) {
		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
				 be->be_f_offset + be->be_length -
				 offset - length,
				 PNFS_BLOCK_INVALID_DATA);
		children[i++] = e3;
		print_bl_extent(e3);
	} else
		merge2 = e3;

	/* Remove be from list, and insert the e* */
	/* We don't get refs on e*, since this list is the base reference
	 * set when init'ed.
	 */
	if (i < 3)
		children[i] = NULL;
	new = children[0];
	list_replace(&be->be_node, &new->be_node);
	bl_put_extent(be);
	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
	for (j = 1; j < i; j++) {
		old = new;
		new = children[j];
		list_add(&new->be_node, &old->be_node);
	}
	if (merge2) {
		/* This is a HACK, should just create a _back_merge function */
		new = list_entry(new->be_node.next,
				 struct pnfs_block_extent, be_node);
		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
	}
	spin_unlock(&bl->bl_ext_lock);

	/* Since we removed the base reference above, be is now scheduled for
	 * destruction.
	 */
	bl_put_extent(be);
	dprintk("%s returns %llu after split\n", __func__, rv);
	return rv;

 out_nosplit:
	kfree(e1);
	kfree(e2);
	kfree(e3);
	dprintk("%s returns %llu without splitting\n", __func__, rv);
	return rv;
}

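/* Editorial note: set_to_rw() carves the committed range out of a
 * surrounding INVALID extent, yielding at most three pieces: an
 * INVALID head (e1), a READWRITE middle (e2), and an INVALID tail
 * (e3); e1/e3 are elided and handed to _front_merge as spare storage
 * when the range touches an extent boundary. A quick standalone
 * sketch of the boundary arithmetic, with invented numbers:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ext_start = 0, ext_end = 64;	/* existing INVALID extent */
	uint64_t off = 16, len = 24;		/* range just committed */
	uint64_t mid_end = off + len < ext_end ? off + len : ext_end;

	if (ext_start < off)			/* e1: still INVALID */
		printf("e1 INVALID   [%llu, %llu)\n",
		       (unsigned long long)ext_start, (unsigned long long)off);
	printf("e2 READWRITE [%llu, %llu)\n",	/* e2: now RW */
	       (unsigned long long)off, (unsigned long long)mid_end);
	if (mid_end < ext_end)			/* e3: still INVALID */
		printf("e3 INVALID   [%llu, %llu)\n",
		       (unsigned long long)mid_end, (unsigned long long)ext_end);
	return 0;
}
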
void
clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
			      const struct nfs4_layoutcommit_args *arg,
			      int status)
{
	struct pnfs_block_short_extent *lce, *save;

	dprintk("%s status %d\n", __func__, status);
	list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
		if (likely(!status)) {
			u64 offset = lce->bse_f_offset;
			u64 end = offset + lce->bse_length;

			do {
				offset = set_to_rw(bl, offset, end - offset);
			} while (offset < end);
			list_del(&lce->bse_node);

			kfree(lce);
		} else {
			list_del(&lce->bse_node);
			spin_lock(&bl->bl_ext_lock);
			add_to_commitlist(bl, lce);
			spin_unlock(&bl->bl_ext_lock);
		}
	}
}
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 76f856e284e4..7cf6cafcc007 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -6,7 +6,7 @@
 
 #include <linux/completion.h>
 #include <linux/sunrpc/cache.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 /*
  * Deferred request handling
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b257383bb565..07df5f1d85e5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
 	__be32 drc_status;
 	struct nfs_client *clp;
+	int slotid;
 };
 
 struct cb_compound_hdr_arg {
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(
 	void *dummy, struct cb_process_state *cps);
 
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
-extern void nfs4_cb_take_slot(struct nfs_client *clp);
 
 struct cb_devicenotifyitem {
 	uint32_t cbd_notify_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index d4d1954e9bb9..43926add945b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
 static u32 initiate_file_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
+	struct nfs_server *server;
 	struct pnfs_layout_hdr *lo;
 	struct inode *ino;
 	bool found = false;
@@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	LIST_HEAD(free_me_list);
 
 	spin_lock(&clp->cl_lock);
-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
-		if (nfs_compare_fh(&args->cbl_fh,
-				   &NFS_I(lo->plh_inode)->fh))
-			continue;
-		ino = igrab(lo->plh_inode);
-		if (!ino)
-			continue;
-		found = true;
-		/* Without this, layout can be freed as soon
-		 * as we release cl_lock.
-		 */
-		get_layout_hdr(lo);
-		break;
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (nfs_compare_fh(&args->cbl_fh,
+					   &NFS_I(lo->plh_inode)->fh))
+				continue;
+			ino = igrab(lo->plh_inode);
+			if (!ino)
+				continue;
+			found = true;
+			/* Without this, layout can be freed as soon
+			 * as we release cl_lock.
+			 */
+			get_layout_hdr(lo);
+			break;
+		}
+		if (found)
+			break;
 	}
+	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
+
 	if (!found)
 		return NFS4ERR_NOMATCHING_LAYOUT;
 
@@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 static u32 initiate_bulk_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
+	struct nfs_server *server;
 	struct pnfs_layout_hdr *lo;
 	struct inode *ino;
 	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 	};
 
 	spin_lock(&clp->cl_lock);
-	list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		if ((args->cbl_recall_type == RETURN_FSID) &&
-		    memcmp(&NFS_SERVER(lo->plh_inode)->fsid,
-			   &args->cbl_fsid, sizeof(struct nfs_fsid)))
-			continue;
-		if (!igrab(lo->plh_inode))
+		    memcmp(&server->fsid, &args->cbl_fsid,
+			   sizeof(struct nfs_fsid)))
 			continue;
-		get_layout_hdr(lo);
-		BUG_ON(!list_empty(&lo->plh_bulk_recall));
-		list_add(&lo->plh_bulk_recall, &recall_list);
+
+		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (!igrab(lo->plh_inode))
+				continue;
+			get_layout_hdr(lo);
+			BUG_ON(!list_empty(&lo->plh_bulk_recall));
+			list_add(&lo->plh_bulk_recall, &recall_list);
+		}
 	}
+	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
+
 	list_for_each_entry_safe(lo, tmp,
 				 &recall_list, plh_bulk_recall) {
 		ino = lo->plh_inode;
@@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Normal */
 	if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
 		slot->seq_nr++;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}
 
 	/* Replay */
@@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	/* Wraparound */
 	if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
 		slot->seq_nr = 1;
-		return htonl(NFS4_OK);
+		goto out_ok;
 	}
 
 	/* Misordered request */
 	return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+	tbl->highest_used_slotid = args->csa_slotid;
+	return htonl(NFS4_OK);
 }
 
 /*
@@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 			      struct cb_sequenceres *res,
 			      struct cb_process_state *cps)
 {
+	struct nfs4_slot_table *tbl;
 	struct nfs_client *clp;
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);
 
-	cps->clp = NULL;
-
 	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;
 
+	tbl = &clp->cl_session->bc_slot_table;
+
+	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
 	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
-		status = NFS4ERR_DELAY;
+		spin_unlock(&tbl->slot_tbl_lock);
+		status = htonl(NFS4ERR_DELAY);
+		/* Return NFS4ERR_BADSESSION if we're draining the session
+		 * in order to reset it.
+		 */
+		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+			status = htonl(NFS4ERR_BADSESSION);
 		goto out;
 	}
 
 	status = validate_seqid(&clp->cl_session->bc_slot_table, args);
+	spin_unlock(&tbl->slot_tbl_lock);
 	if (status)
 		goto out;
 
+	cps->slotid = args->csa_slotid;
+
 	/*
 	 * Check for pending referring calls. If a match is found, a
 	 * related callback was received before the response to the original
@@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_slotid = args->csa_slotid;
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-	nfs4_cb_take_slot(clp);
 
 out:
 	cps->clp = clp; /* put in nfs4_callback_compound */
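
The validate_seqid()/nfs4_callback_sequence() changes above move slot accounting under slot_tbl_lock and funnel both success paths through out_ok. The sequencing rule itself is small enough to state standalone; the sketch below is plain C with invented slot state, not the kernel API, and the replay branch is reduced to a distinct return value:

/* Sketch of the backchannel sequence-id rules seen in validate_seqid():
 * seq+1 is the next request, seq is a replay, 1 after a 32-bit wrap is
 * also accepted; anything else is misordered.
 */
#include <stdio.h>
#include <stdint.h>

enum { OK, REPLAY, MISORDERED };

static int check_seqid(uint32_t slot_seq, uint32_t csa_seq)
{
	if (csa_seq == slot_seq + 1)		/* normal progression */
		return OK;
	if (csa_seq == slot_seq)		/* retransmitted request */
		return REPLAY;
	if (csa_seq == 1 && slot_seq + 1 == 0)	/* wraparound of u32 */
		return OK;
	return MISORDERED;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       check_seqid(5, 6),		/* 0: OK */
	       check_seqid(5, 5),		/* 1: REPLAY */
	       check_seqid(0xffffffffu, 1),	/* 0: OK, wrapped */
	       check_seqid(5, 9));		/* 2: MISORDERED */
	return 0;
}
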
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c6c86a77e043..918ad647afea 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
 	 * Let the state manager know callback processing done.
 	 * A single slot, so highest used slotid is either 0 or -1
 	 */
-	tbl->highest_used_slotid--;
+	tbl->highest_used_slotid = -1;
 	nfs4_check_drain_bc_complete(session);
 	spin_unlock(&tbl->slot_tbl_lock);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
-	if (clp && clp->cl_session)
-		nfs4_callback_free_slot(clp->cl_session);
-}
-
-/* A single slot, so highest used slotid is either 0 or -1 */
-void nfs4_cb_take_slot(struct nfs_client *clp)
-{
-	struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
-
-	spin_lock(&tbl->slot_tbl_lock);
-	tbl->highest_used_slotid++;
-	BUG_ON(tbl->highest_used_slotid != 0);
-	spin_unlock(&tbl->slot_tbl_lock);
+	if (cps->slotid != -1)
+		nfs4_callback_free_slot(cps->clp->cl_session);
 }
 
 #else /* CONFIG_NFS_V4_1 */
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 	struct cb_process_state cps = {
 		.drc_status = 0,
 		.clp = NULL,
+		.slotid = -1,
 	};
 	unsigned int nops = 0;
 
@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 
 	*hdr_res.status = status;
 	*hdr_res.nops = htonl(nops);
-	nfs4_cb_free_slot(cps.clp);
+	nfs4_cb_free_slot(&cps);
 	nfs_put_client(cps.clp);
 	dprintk("%s: done, status = %u\n", __func__, ntohl(status));
 	return rpc_success;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b3dc2b88b65b..5833fbbf59b0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
 	.nrvers			= ARRAY_SIZE(nfs_version),
 	.version		= nfs_version,
 	.stats			= &nfs_rpcstat,
-	.pipe_dir_name		= "/nfs",
+	.pipe_dir_name		= NFS_PIPE_DIRNAME,
 };
 
 struct rpc_stat nfs_rpcstat = {
@@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
-#if defined(CONFIG_NFS_V4_1)
-	INIT_LIST_HEAD(&clp->cl_layouts);
-#endif
 	nfs_fscache_get_client_cookie(clp);
 
 	return clp;
@@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp)
 	nfs4_deviceid_purge_client(clp);
 
 	kfree(clp->cl_hostname);
+	kfree(clp->server_scope);
 	kfree(clp);
 
 	dprintk("<-- nfs_free_client()\n");
@@ -906,7 +904,9 @@ error:
 /*
  * Load up the server record from information gained in an fsinfo record
  */
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+				  struct nfs_fh *mntfh,
+				  struct nfs_fsinfo *fsinfo)
 {
 	unsigned long max_rpc_payload;
 
@@ -936,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
 	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	set_pnfs_layoutdriver(server, fsinfo->layouttype);
+	server->pnfs_blksize = fsinfo->blksize;
+	set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
@@ -982,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	if (error < 0)
 		goto out_error;
 
-	nfs_server_set_fsinfo(server, &fsinfo);
+	nfs_server_set_fsinfo(server, mntfh, &fsinfo);
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -1062,6 +1063,7 @@ static struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->client_link);
 	INIT_LIST_HEAD(&server->master_link);
 	INIT_LIST_HEAD(&server->delegations);
+	INIT_LIST_HEAD(&server->layouts);
 
 	atomic_set(&server->active, 0);
 
@@ -1464,7 +1466,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 	dprintk("<-- %s %p\n", __func__, clp);
 	return clp;
 }
-EXPORT_SYMBOL(nfs4_set_ds_client);
+EXPORT_SYMBOL_GPL(nfs4_set_ds_client);
 
 /*
  * Session has been established, and the client marked ready.
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index dd25c2aec375..321a66bc3846 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode)
 	return err;
 }
 
-static void nfs_mark_return_delegation(struct nfs_delegation *delegation)
+static void nfs_mark_return_delegation(struct nfs_server *server,
+				       struct nfs_delegation *delegation)
 {
-	struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client;
-
 	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
 }
 
 /**
@@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
 		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
 			continue;
 		if (delegation->type & flags)
-			nfs_mark_return_delegation(delegation);
+			nfs_mark_return_delegation(server, delegation);
 	}
 }
 
@@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
 		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		nfs_mark_return_delegation(delegation);
+		nfs_mark_return_delegation(server, delegation);
 	}
 }
 
@@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 int nfs_async_inode_return_delegation(struct inode *inode,
 				      const nfs4_stateid *stateid)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
@@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 		rcu_read_unlock();
 		return -ENOENT;
 	}
-	nfs_mark_return_delegation(delegation);
+	nfs_mark_return_delegation(server, delegation);
 	rcu_read_unlock();
 
 	nfs_delegation_run_state_manager(clp);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57f578e2560a..b238d95ac48c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = {
 
 #endif /* CONFIG_NFS_V4 */
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
 	struct nfs_open_dir_context *ctx;
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		ctx->duped = 0;
+		ctx->attr_gencount = NFS_I(dir)->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
 		ctx->cred = get_rpccred(cred);
-	} else
-		ctx = ERR_PTR(-ENOMEM);
-	return ctx;
+		return ctx;
+	}
+	return ERR_PTR(-ENOMEM);
 }
 
 static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
@@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_dir_context(cred);
+	ctx = alloc_nfs_open_dir_context(inode, cred);
 	if (IS_ERR(ctx)) {
 		res = PTR_ERR(ctx);
 		goto out;
@@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 {
 	loff_t diff = desc->file->f_pos - desc->current_index;
 	unsigned int index;
-	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	if (diff < 0)
 		goto out_eof;
@@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 	index = (unsigned int)diff;
 	*desc->dir_cookie = array->array[index].cookie;
 	desc->cache_entry_index = index;
-	ctx->duped = 0;
 	return 0;
 out_eof:
 	desc->eof = 1;
@@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 	int i;
 	loff_t new_pos;
 	int status = -EAGAIN;
-	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == *desc->dir_cookie) {
+			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+			struct nfs_open_dir_context *ctx = desc->file->private_data;
+
 			new_pos = desc->current_index + i;
-			if (new_pos < desc->file->f_pos) {
+			if (ctx->attr_gencount != nfsi->attr_gencount
+			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+				ctx->duped = 0;
+				ctx->attr_gencount = nfsi->attr_gencount;
+			} else if (new_pos < desc->file->f_pos) {
+				if (ctx->duped > 0
+				    && ctx->dup_cookie == *desc->dir_cookie) {
+					if (printk_ratelimit()) {
+						pr_notice("NFS: directory %s/%s contains a readdir loop."
+								"Please contact your server vendor.  "
+								"The file: %s has duplicate cookie %llu\n",
+							desc->file->f_dentry->d_parent->d_name.name,
+							desc->file->f_dentry->d_name.name,
+							array->array[i].string.name,
+							*desc->dir_cookie);
+					}
+					status = -ELOOP;
+					goto out;
+				}
 				ctx->dup_cookie = *desc->dir_cookie;
-				ctx->duped = 1;
+				ctx->duped = -1;
 			}
 			desc->file->f_pos = new_pos;
 			desc->cache_entry_index = i;
@@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 		if (*desc->dir_cookie == array->last_cookie)
 			desc->eof = 1;
 	}
+out:
 	return status;
 }
 
@@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct nfs_cache_array *array = NULL;
 	struct nfs_open_dir_context *ctx = file->private_data;
 
-	if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
-		if (printk_ratelimit()) {
-			pr_notice("NFS: directory %s/%s contains a readdir loop. "
-				"Please contact your server vendor.  "
-				"Offending cookie: %llu\n",
-				file->f_dentry->d_parent->d_name.name,
-				file->f_dentry->d_name.name,
-				*desc->dir_cookie);
-		}
-		res = -ELOOP;
-		goto out;
-	}
-
 	array = nfs_readdir_get_array(desc->page);
 	if (IS_ERR(array)) {
 		res = PTR_ERR(array);
@@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 			*desc->dir_cookie = array->array[i+1].cookie;
 		else
 			*desc->dir_cookie = array->last_cookie;
+		if (ctx->duped != 0)
+			ctx->duped = 1;
 	}
 	if (array->eof_index >= 0)
 		desc->eof = 1;
@@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct page *page = NULL;
 	int status;
 	struct inode *inode = desc->file->f_path.dentry->d_inode;
+	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->page_index = 0;
 	desc->last_cookie = *desc->dir_cookie;
 	desc->page = page;
+	ctx->duped = 0;
 
 	status = nfs_readdir_xdr_to_array(desc, page, inode);
 	if (status < 0)
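
The dir.c changes above relocate readdir loop detection into the cookie search: a cookie that reappears while f_pos moves backwards, with the directory's attr_gencount unchanged, is reported as a server-side cookie loop (-ELOOP). A much-simplified standalone sketch of that idea (plain C, invented cookie stream, none of the nfs_readdir machinery):

/* Sketch of duplicate-cookie loop detection as in nfs/dir.c: remember
 * the first repeated cookie; seeing it repeat again while moving
 * backwards means the server's cookie sequence loops. Values invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cookies[] = { 3, 7, 11, 7, 11, 7 };	/* looping stream */
	uint64_t dup_cookie = 0;
	int duped = 0;		/* simplified: 0 clean, 1 rewind seen */
	uint64_t last = 0;

	for (int i = 0; i < 6; i++) {
		if (cookies[i] <= last) {		/* moved backwards */
			if (duped && dup_cookie == cookies[i]) {
				printf("readdir loop on cookie %llu: ELOOP\n",
				       (unsigned long long)cookies[i]);
				return 1;
			}
			dup_cookie = cookies[i];
			duped = 1;
		}
		last = cookies[i];
	}
	return 0;
}
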
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b35d25b98da6..1940f1a56a5f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -53,7 +53,7 @@
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 #include "internal.h"
 #include "iostat.h"
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a55347a2daa..ab12913dd473 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb);
 extern char *nfs_path(char **p, struct dentry *dentry,
		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
+#ifdef CONFIG_NFS_V4
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
+#endif
 
 /* getroot.c */
 extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
@@ -288,12 +291,22 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
+struct nfs_pageio_descriptor;
 /* read.c */
 extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
+extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
+			      struct list_head *head);
+
+extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_readdata_release(struct nfs_read_data *rdata);
 
 /* write.c */
+extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
+			     struct list_head *head);
+extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
+extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_write_data *p);
 extern int nfs_initiate_write(struct nfs_write_data *data,
			      struct rpc_clnt *clnt,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 1f063bacd285..8102391bb374 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -119,7 +119,7 @@ Elong:
 }
 
 #ifdef CONFIG_NFS_V4
-static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
 	struct gss_api_mech *mech;
 	struct xdr_netobj oid;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index e49e73107e62..7ef23979896d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -415,7 +415,7 @@ fail:
 }
 
 int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
-		mode_t mode)
+		umode_t mode)
 {
 	struct posix_acl *dfacl, *acl;
 	int error = 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 38053d823eb0..85f1690ca08c 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
		 int flags, struct nfs_open_context *ctx)
 {
 	struct nfs3_createdata *data;
-	mode_t mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  create %s\n", dentry->d_name.name);
@@ -562,7 +562,7 @@ static int
 nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
 	struct nfs3_createdata *data;
-	int mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
@@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
		dev_t rdev)
 {
 	struct nfs3_createdata *data;
-	mode_t mode = sattr->ia_mode;
+	umode_t mode = sattr->ia_mode;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index b788f2eb1ba0..1ec1a85fa71c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -48,6 +48,7 @@ enum nfs4_client_state {
 	NFS4CLNT_SESSION_RESET,
 	NFS4CLNT_RECALL_SLOT,
 	NFS4CLNT_LEASE_CONFIRM,
+	NFS4CLNT_SERVER_SCOPE_MISMATCH,
 };
 
 enum nfs4_session_state {
@@ -66,6 +67,8 @@ struct nfs4_minor_version_ops {
			int cache_reply);
 	int	(*validate_stateid)(struct nfs_delegation *,
			const nfs4_stateid *);
+	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,
+			struct nfs_fsinfo *);
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -315,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
 extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
@@ -349,6 +352,8 @@ extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
+extern void nfs41_handle_server_scope(struct nfs_client *,
+				      struct server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index f9d03abcd04c..e8915d4840ad 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
 
 	pnfs_set_layoutcommit(wdata);
 	dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
-		(unsigned long) wdata->lseg->pls_end_pos);
+		(unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
 }
 
 /*
@@ -334,6 +334,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)
		__func__, data->inode->i_ino,
		data->args.pgbase, (size_t)data->args.count, offset);
 
+	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
+		return PNFS_NOT_ATTEMPTED;
+
 	/* Retrieve the correct rpc_client for the byte range */
 	j = nfs4_fl_calc_j_index(lseg, offset);
 	idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -344,8 +347,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
		return PNFS_NOT_ATTEMPTED;
	}
-	dprintk("%s USE DS:ip %x %hu\n", __func__,
-		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+	dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr);
 
	/* No multipath support. Use first DS */
	data->ds_clp = ds->ds_clp;
@@ -374,6 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
	struct nfs_fh *fh;
	int status;
 
+	if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags))
+		return PNFS_NOT_ATTEMPTED;
+
	/* Retrieve the correct rpc_client for the byte range */
	j = nfs4_fl_calc_j_index(lseg, offset);
	idx = nfs4_fl_calc_ds_index(lseg, j);
@@ -384,9 +389,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
		return PNFS_NOT_ATTEMPTED;
	}
-	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__,
		data->inode->i_ino, sync, (size_t) data->args.count, offset,
-		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+		ds->ds_remotestr);
 
	data->write_done_cb = filelayout_write_done_cb;
	data->ds_clp = ds->ds_clp;
@@ -428,6 +433,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 
	dprintk("--> %s\n", __func__);
 
+	/* FIXME: remove this check when layout segment support is added */
+	if (lgr->range.offset != 0 ||
+	    lgr->range.length != NFS4_MAX_UINT64) {
+		dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+			__func__);
+		goto out;
+	}
+
	if (fl->pattern_offset > lgr->range.offset) {
		dprintk("%s pattern_offset %lld too large\n",
			__func__, fl->pattern_offset);
@@ -449,6 +462,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
		goto out;
	} else
		dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+	/* Found deviceid is being reaped */
+	if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags))
+		goto out_put;
+
	fl->dsaddr = dsaddr;
 
	if (fl->first_stripe_index < 0 ||
@@ -659,7 +676,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 * return true  : coalesce page
 * return false : don't coalesce page
 */
-bool
+static bool
 filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		   struct nfs_page *req)
 {
@@ -670,8 +687,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
	    !nfs_generic_pg_test(pgio, prev, req))
		return false;
 
-	if (!pgio->pg_lseg)
-		return 1;
	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
@@ -682,6 +697,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
	return (p_stripe == r_stripe);
 }
 
+void
+filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+			struct nfs_page *req)
+{
+	BUG_ON(pgio->pg_lseg != NULL);
+
+	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_READ,
+					   GFP_KERNEL);
+	/* If no lseg, fall back to read through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_read_mds(pgio);
+}
+
+void
+filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+			 struct nfs_page *req)
+{
+	BUG_ON(pgio->pg_lseg != NULL);
+
+	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+					   req->wb_context,
+					   0,
+					   NFS4_MAX_UINT64,
+					   IOMODE_RW,
+					   GFP_NOFS);
+	/* If no lseg, fall back to write through mds */
+	if (pgio->pg_lseg == NULL)
+		nfs_pageio_reset_write_mds(pgio);
+}
+
+static const struct nfs_pageio_ops filelayout_pg_read_ops = {
+	.pg_init = filelayout_pg_init_read,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_readpages,
+};
+
+static const struct nfs_pageio_ops filelayout_pg_write_ops = {
+	.pg_init = filelayout_pg_init_write,
+	.pg_test = filelayout_pg_test,
+	.pg_doio = pnfs_generic_pg_writepages,
+};
+
 static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
 {
	return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
@@ -879,7 +940,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
	.owner			= THIS_MODULE,
	.alloc_lseg		= filelayout_alloc_lseg,
	.free_lseg		= filelayout_free_lseg,
-	.pg_test		= filelayout_pg_test,
+	.pg_read_ops		= &filelayout_pg_read_ops,
+	.pg_write_ops		= &filelayout_pg_write_ops,
	.mark_pnfs_commit	= filelayout_mark_pnfs_commit,
	.choose_commit_list	= filelayout_choose_commit_list,
	.commit_pagelist	= filelayout_commit_pagelist,
@@ -902,5 +964,7 @@ static void __exit nfs4filelayout_exit(void)
	pnfs_unregister_layoutdriver(&filelayout_type);
 }
 
+MODULE_ALIAS("nfs-layouttype4-1");
+
 module_init(nfs4filelayout_init);
 module_exit(nfs4filelayout_exit);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index cebe01e3795e..2e42284253fa 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -47,10 +47,17 @@ enum stripetype4 {
 };
 
 /* Individual ip address */
+struct nfs4_pnfs_ds_addr {
+	struct sockaddr_storage	da_addr;
+	size_t			da_addrlen;
+	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
+	char			*da_remotestr;	/* human readable addr+port */
+};
+
 struct nfs4_pnfs_ds {
 	struct list_head	ds_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
-	u32			ds_ip_addr;
-	u32			ds_port;
+	char			*ds_remotestr;	/* comma sep list of addrs */
+	struct list_head	ds_addrs;
 	struct nfs_client	*ds_clp;
 	atomic_t		ds_count;
 };
@@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
			    generic_hdr);
 }
 
+static inline struct nfs4_deviceid_node *
+FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
+{
+	return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
+}
+
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 3b7bf1377264..ed388aae9689 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds)
56 printk("%s NULL device\n", __func__); 56 printk("%s NULL device\n", __func__);
57 return; 57 return;
58 } 58 }
59 printk(" ip_addr %x port %hu\n" 59 printk(" ds %s\n"
60 " ref count %d\n" 60 " ref count %d\n"
61 " client %p\n" 61 " client %p\n"
62 " cl_exchange_flags %x\n", 62 " cl_exchange_flags %x\n",
63 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), 63 ds->ds_remotestr,
64 atomic_read(&ds->ds_count), ds->ds_clp, 64 atomic_read(&ds->ds_count), ds->ds_clp,
65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
66} 66}
67 67
68/* nfs4_ds_cache_lock is held */ 68static bool
69static struct nfs4_pnfs_ds * 69same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
70_data_server_lookup_locked(u32 ip_addr, u32 port)
71{ 70{
72 struct nfs4_pnfs_ds *ds; 71 struct sockaddr_in *a, *b;
72 struct sockaddr_in6 *a6, *b6;
73
74 if (addr1->sa_family != addr2->sa_family)
75 return false;
76
77 switch (addr1->sa_family) {
78 case AF_INET:
79 a = (struct sockaddr_in *)addr1;
80 b = (struct sockaddr_in *)addr2;
81
82 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
83 a->sin_port == b->sin_port)
84 return true;
85 break;
86
87 case AF_INET6:
88 a6 = (struct sockaddr_in6 *)addr1;
89 b6 = (struct sockaddr_in6 *)addr2;
90
91 /* LINKLOCAL addresses must have matching scope_id */
92 if (ipv6_addr_scope(&a6->sin6_addr) ==
93 IPV6_ADDR_SCOPE_LINKLOCAL &&
94 a6->sin6_scope_id != b6->sin6_scope_id)
95 return false;
96
97 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
98 a6->sin6_port == b6->sin6_port)
99 return true;
100 break;
101
102 default:
103 dprintk("%s: unhandled address family: %u\n",
104 __func__, addr1->sa_family);
105 return false;
106 }
73 107
74 dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", 108 return false;
75 ntohl(ip_addr), ntohs(port)); 109}
76 110
77 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { 111/*
78 if (ds->ds_ip_addr == ip_addr && 112 * Lookup DS by addresses. The first matching address returns true.
79 ds->ds_port == port) { 113 * nfs4_ds_cache_lock is held
80 return ds; 114 */
115static struct nfs4_pnfs_ds *
116_data_server_lookup_locked(struct list_head *dsaddrs)
117{
118 struct nfs4_pnfs_ds *ds;
119 struct nfs4_pnfs_ds_addr *da1, *da2;
120
121 list_for_each_entry(da1, dsaddrs, da_node) {
122 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
123 list_for_each_entry(da2, &ds->ds_addrs, da_node) {
124 if (same_sockaddr(
125 (struct sockaddr *)&da1->da_addr,
126 (struct sockaddr *)&da2->da_addr))
127 return ds;
128 }
81 } 129 }
82 } 130 }
83 return NULL; 131 return NULL;
84} 132}
85 133
86/* 134/*
135 * Compare two lists of addresses.
136 */
137static bool
138_data_server_match_all_addrs_locked(struct list_head *dsaddrs1,
139 struct list_head *dsaddrs2)
140{
141 struct nfs4_pnfs_ds_addr *da1, *da2;
142 size_t count1 = 0,
143 count2 = 0;
144
145 list_for_each_entry(da1, dsaddrs1, da_node)
146 count1++;
147
148 list_for_each_entry(da2, dsaddrs2, da_node) {
149 bool found = false;
150 count2++;
151 list_for_each_entry(da1, dsaddrs1, da_node) {
152 if (same_sockaddr((struct sockaddr *)&da1->da_addr,
153 (struct sockaddr *)&da2->da_addr)) {
154 found = true;
155 break;
156 }
157 }
158 if (!found)
159 return false;
160 }
161
162 return (count1 == count2);
163}
164
165/*
87 * Create an rpc connection to the nfs4_pnfs_ds data server 166 * Create an rpc connection to the nfs4_pnfs_ds data server
88 * Currently only support IPv4 167 * Currently only supports IPv4 and IPv6 addresses
89 */ 168 */
90static int 169static int
91nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) 170nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
92{ 171{
93 struct nfs_client *clp; 172 struct nfs_client *clp = ERR_PTR(-EIO);
94 struct sockaddr_in sin; 173 struct nfs4_pnfs_ds_addr *da;
95 int status = 0; 174 int status = 0;
96 175
97 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, 176 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
98 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
99 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); 177 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
100 178
101 sin.sin_family = AF_INET; 179 BUG_ON(list_empty(&ds->ds_addrs));
102 sin.sin_addr.s_addr = ds->ds_ip_addr; 180
103 sin.sin_port = ds->ds_port; 181 list_for_each_entry(da, &ds->ds_addrs, da_node) {
182 dprintk("%s: DS %s: trying address %s\n",
183 __func__, ds->ds_remotestr, da->da_remotestr);
184
185 clp = nfs4_set_ds_client(mds_srv->nfs_client,
186 (struct sockaddr *)&da->da_addr,
187 da->da_addrlen, IPPROTO_TCP);
188 if (!IS_ERR(clp))
189 break;
190 }
104 191
105 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
106 sizeof(sin), IPPROTO_TCP);
107 if (IS_ERR(clp)) { 192 if (IS_ERR(clp)) {
108 status = PTR_ERR(clp); 193 status = PTR_ERR(clp);
109 goto out; 194 goto out;
@@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
115 goto out_put; 200 goto out_put;
116 } 201 }
117 ds->ds_clp = clp; 202 ds->ds_clp = clp;
118 dprintk("%s [existing] ip=%x, port=%hu\n", __func__, 203 dprintk("%s [existing] server=%s\n", __func__,
119 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); 204 ds->ds_remotestr);
120 goto out; 205 goto out;
121 } 206 }
122 207
@@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
135 goto out_put; 220 goto out_put;
136 221
137 ds->ds_clp = clp; 222 ds->ds_clp = clp;
138 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), 223 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
139 ntohs(ds->ds_port));
140out: 224out:
141 return status; 225 return status;
142out_put: 226out_put:
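nfs4_ds_connect() now walks the DS's address list and keeps the first address for which nfs4_set_ds_client() succeeds, which is why clp is pre-seeded with ERR_PTR(-EIO): if the loop falls through without a single success, the error path still reports a sensible status. The first-success pattern in a self-contained form (all names below are illustrative stand-ins):

#include <errno.h>
#include <stddef.h>

struct ds_addr { int id; };
struct ds_client { int id; };

/* Stand-in for nfs4_set_ds_client(); returns NULL and sets *err on
 * failure. Illustrative prototype only. */
struct ds_client *try_connect(const struct ds_addr *a, int *err);

int connect_first_success(const struct ds_addr *addrs, size_t n,
                          struct ds_client **out)
{
        int err = -EIO; /* reported if every candidate fails */

        for (size_t i = 0; i < n; i++) {
                struct ds_client *clp = try_connect(&addrs[i], &err);
                if (clp) {              /* first success wins */
                        *out = clp;
                        return 0;
                }
        }
        return err;
}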
@@ -147,12 +231,25 @@ out_put:
147static void 231static void
148destroy_ds(struct nfs4_pnfs_ds *ds) 232destroy_ds(struct nfs4_pnfs_ds *ds)
149{ 233{
234 struct nfs4_pnfs_ds_addr *da;
235
150 dprintk("--> %s\n", __func__); 236 dprintk("--> %s\n", __func__);
151 ifdebug(FACILITY) 237 ifdebug(FACILITY)
152 print_ds(ds); 238 print_ds(ds);
153 239
154 if (ds->ds_clp) 240 if (ds->ds_clp)
155 nfs_put_client(ds->ds_clp); 241 nfs_put_client(ds->ds_clp);
242
243 while (!list_empty(&ds->ds_addrs)) {
244 da = list_first_entry(&ds->ds_addrs,
245 struct nfs4_pnfs_ds_addr,
246 da_node);
247 list_del_init(&da->da_node);
248 kfree(da->da_remotestr);
249 kfree(da);
250 }
251
252 kfree(ds->ds_remotestr);
156 kfree(ds); 253 kfree(ds);
157} 254}
158 255
@@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
179 kfree(dsaddr); 276 kfree(dsaddr);
180} 277}
181 278
279/*
280 * Create a string with a human readable address and port to avoid
 281 * complicated setup around many dprintks.
282 */
283static char *
284nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
285{
286 struct nfs4_pnfs_ds_addr *da;
287 char *remotestr;
288 size_t len;
289 char *p;
290
 291 len = 3; /* '{', '}' and terminating NUL */
292 list_for_each_entry(da, dsaddrs, da_node) {
293 len += strlen(da->da_remotestr) + 1; /* string plus comma */
294 }
295
296 remotestr = kzalloc(len, gfp_flags);
297 if (!remotestr)
298 return NULL;
299
300 p = remotestr;
301 *(p++) = '{';
302 len--;
303 list_for_each_entry(da, dsaddrs, da_node) {
304 size_t ll = strlen(da->da_remotestr);
305
306 if (ll > len)
307 goto out_err;
308
309 memcpy(p, da->da_remotestr, ll);
310 p += ll;
311 len -= ll;
312
313 if (len < 1)
314 goto out_err;
315 (*p++) = ',';
316 len--;
317 }
318 if (len < 2)
319 goto out_err;
320 *(p++) = '}';
321 *p = '\0';
322 return remotestr;
323out_err:
324 kfree(remotestr);
325 return NULL;
326}
327
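A worked example of what nfs4_pnfs_remotestr() produces, with hypothetical addresses: for "203.0.113.5:2049" (16 chars) and "[2001:db8::1]:2049" (18 chars), len = 3 + (16 + 1) + (18 + 1) = 39, and the result is "{203.0.113.5:2049,[2001:db8::1]:2049,}"; note that every entry, including the last, is followed by a comma. A compilable userspace rendition:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace rendition of nfs4_pnfs_remotestr(); the addresses are
 * hypothetical examples. */
int main(void)
{
        const char *addrs[] = { "203.0.113.5:2049", "[2001:db8::1]:2049" };
        size_t n = sizeof(addrs) / sizeof(addrs[0]);
        size_t len = 3;                 /* '{', '}' and the trailing NUL */
        char *s, *p;

        for (size_t i = 0; i < n; i++)
                len += strlen(addrs[i]) + 1;    /* entry plus ',' */

        s = p = malloc(len);
        if (!s)
                return 1;
        *p++ = '{';
        for (size_t i = 0; i < n; i++)
                p += sprintf(p, "%s,", addrs[i]);
        *p++ = '}';
        *p = '\0';
        puts(s);        /* {203.0.113.5:2049,[2001:db8::1]:2049,} */
        free(s);
        return 0;
}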
182static struct nfs4_pnfs_ds * 328static struct nfs4_pnfs_ds *
183nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) 329nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
184{ 330{
185 struct nfs4_pnfs_ds *tmp_ds, *ds; 331 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
332 char *remotestr;
186 333
187 ds = kzalloc(sizeof(*tmp_ds), gfp_flags); 334 if (list_empty(dsaddrs)) {
335 dprintk("%s: no addresses defined\n", __func__);
336 goto out;
337 }
338
339 ds = kzalloc(sizeof(*ds), gfp_flags);
188 if (!ds) 340 if (!ds)
189 goto out; 341 goto out;
190 342
 343 /* this is only used for debugging, so it's ok if it's NULL */
344 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
345
191 spin_lock(&nfs4_ds_cache_lock); 346 spin_lock(&nfs4_ds_cache_lock);
192 tmp_ds = _data_server_lookup_locked(ip_addr, port); 347 tmp_ds = _data_server_lookup_locked(dsaddrs);
193 if (tmp_ds == NULL) { 348 if (tmp_ds == NULL) {
194 ds->ds_ip_addr = ip_addr; 349 INIT_LIST_HEAD(&ds->ds_addrs);
195 ds->ds_port = port; 350 list_splice_init(dsaddrs, &ds->ds_addrs);
351 ds->ds_remotestr = remotestr;
196 atomic_set(&ds->ds_count, 1); 352 atomic_set(&ds->ds_count, 1);
197 INIT_LIST_HEAD(&ds->ds_node); 353 INIT_LIST_HEAD(&ds->ds_node);
198 ds->ds_clp = NULL; 354 ds->ds_clp = NULL;
199 list_add(&ds->ds_node, &nfs4_data_server_cache); 355 list_add(&ds->ds_node, &nfs4_data_server_cache);
200 dprintk("%s add new data server ip 0x%x\n", __func__, 356 dprintk("%s add new data server %s\n", __func__,
201 ds->ds_ip_addr); 357 ds->ds_remotestr);
202 } else { 358 } else {
359 if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs,
360 dsaddrs)) {
361 dprintk("%s: multipath address mismatch: %s != %s",
362 __func__, tmp_ds->ds_remotestr, remotestr);
363 }
364 kfree(remotestr);
203 kfree(ds); 365 kfree(ds);
204 atomic_inc(&tmp_ds->ds_count); 366 atomic_inc(&tmp_ds->ds_count);
205 dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", 367 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
206 __func__, tmp_ds->ds_ip_addr, 368 __func__, tmp_ds->ds_remotestr,
207 atomic_read(&tmp_ds->ds_count)); 369 atomic_read(&tmp_ds->ds_count));
208 ds = tmp_ds; 370 ds = tmp_ds;
209 } 371 }
@@ -213,18 +375,22 @@ out:
213} 375}
214 376
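nfs4_pnfs_ds_add() follows the usual alloc-outside-the-lock idiom: the new entry is allocated speculatively, and under nfs4_ds_cache_lock it is either published into the cache or discarded in favor of the existing entry, whose refcount is bumped instead. A self-contained sketch of that idiom, with a pthread mutex standing in for the kernel spinlock and all names illustrative:

#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
        int key;
        int refcount;
};

static struct entry *cache;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Insert newe, or reuse a cached entry with the same key. */
struct entry *cache_get_or_add(struct entry *newe)
{
        struct entry *e;

        pthread_mutex_lock(&cache_lock);
        for (e = cache; e; e = e->next)
                if (e->key == newe->key)
                        break;
        if (!e) {
                newe->next = cache;     /* publish the new entry */
                cache = newe;
                e = newe;
        } else {
                e->refcount++;          /* reuse the cached entry */
                free(newe);             /* drop the speculative alloc */
        }
        pthread_mutex_unlock(&cache_lock);
        return e;
}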
215/* 377/*
 216 * Currently only support ipv4, and one multi-path address. 378 * Currently only supports IPv4, IPv6, and one multi-path address.
217 */ 379 */
218static struct nfs4_pnfs_ds * 380static struct nfs4_pnfs_ds_addr *
219decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) 381decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
220{ 382{
221 struct nfs4_pnfs_ds *ds = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
222 char *buf; 384 char *buf, *portstr;
223 const char *ipend, *pstr; 385 u32 port;
224 u32 ip_addr, port; 386 int nlen, rlen;
225 int nlen, rlen, i;
226 int tmp[2]; 387 int tmp[2];
227 __be32 *p; 388 __be32 *p;
389 char *netid, *match_netid;
390 size_t len, match_netid_len;
391 char *startsep = "";
392 char *endsep = "";
393
228 394
229 /* r_netid */ 395 /* r_netid */
230 p = xdr_inline_decode(streamp, 4); 396 p = xdr_inline_decode(streamp, 4);
@@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla
236 if (unlikely(!p)) 402 if (unlikely(!p))
237 goto out_err; 403 goto out_err;
238 404
239 /* Check that netid is "tcp" */ 405 netid = kmalloc(nlen+1, gfp_flags);
240 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { 406 if (unlikely(!netid))
241 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
242 goto out_err; 407 goto out_err;
243 }
244 408
245 /* r_addr */ 409 netid[nlen] = '\0';
410 memcpy(netid, p, nlen);
411
412 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
246 p = xdr_inline_decode(streamp, 4); 413 p = xdr_inline_decode(streamp, 4);
247 if (unlikely(!p)) 414 if (unlikely(!p))
248 goto out_err; 415 goto out_free_netid;
249 rlen = be32_to_cpup(p); 416 rlen = be32_to_cpup(p);
250 417
251 p = xdr_inline_decode(streamp, rlen); 418 p = xdr_inline_decode(streamp, rlen);
252 if (unlikely(!p)) 419 if (unlikely(!p))
253 goto out_err; 420 goto out_free_netid;
254 421
255 /* ipv6 length plus port is legal */ 422 /* port is ".ABC.DEF", 8 chars max */
256 if (rlen > INET6_ADDRSTRLEN + 8) { 423 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
257 dprintk("%s: Invalid address, length %d\n", __func__, 424 dprintk("%s: Invalid address, length %d\n", __func__,
258 rlen); 425 rlen);
259 goto out_err; 426 goto out_free_netid;
260 } 427 }
261 buf = kmalloc(rlen + 1, gfp_flags); 428 buf = kmalloc(rlen + 1, gfp_flags);
262 if (!buf) { 429 if (!buf) {
263 dprintk("%s: Not enough memory\n", __func__); 430 dprintk("%s: Not enough memory\n", __func__);
264 goto out_err; 431 goto out_free_netid;
265 } 432 }
266 buf[rlen] = '\0'; 433 buf[rlen] = '\0';
267 memcpy(buf, p, rlen); 434 memcpy(buf, p, rlen);
268 435
269 /* replace the port dots with dashes for the in4_pton() delimiter*/ 436 /* replace port '.' with '-' */
270 for (i = 0; i < 2; i++) { 437 portstr = strrchr(buf, '.');
271 char *res = strrchr(buf, '.'); 438 if (!portstr) {
272 if (!res) { 439 dprintk("%s: Failed finding expected dot in port\n",
273 dprintk("%s: Failed finding expected dots in port\n", 440 __func__);
274 __func__); 441 goto out_free_buf;
275 goto out_free; 442 }
276 } 443 *portstr = '-';
277 *res = '-'; 444
445 /* find '.' between address and port */
446 portstr = strrchr(buf, '.');
447 if (!portstr) {
448 dprintk("%s: Failed finding expected dot between address and "
449 "port\n", __func__);
450 goto out_free_buf;
278 } 451 }
452 *portstr = '\0';
279 453
280 /* Currently only support ipv4 address */ 454 da = kzalloc(sizeof(*da), gfp_flags);
281 if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { 455 if (unlikely(!da))
282 dprintk("%s: Only ipv4 addresses supported\n", __func__); 456 goto out_free_buf;
283 goto out_free; 457
458 INIT_LIST_HEAD(&da->da_node);
459
460 if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr,
461 sizeof(da->da_addr))) {
462 dprintk("%s: error parsing address %s\n", __func__, buf);
463 goto out_free_da;
284 } 464 }
285 465
286 /* port */ 466 portstr++;
287 pstr = ipend; 467 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
288 sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]);
289 port = htons((tmp[0] << 8) | (tmp[1])); 468 port = htons((tmp[0] << 8) | (tmp[1]));
290 469
291 ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); 470 switch (da->da_addr.ss_family) {
292 dprintk("%s: Decoded address and port %s\n", __func__, buf); 471 case AF_INET:
293out_free: 472 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
473 da->da_addrlen = sizeof(struct sockaddr_in);
474 match_netid = "tcp";
475 match_netid_len = 3;
476 break;
477
478 case AF_INET6:
479 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
480 da->da_addrlen = sizeof(struct sockaddr_in6);
481 match_netid = "tcp6";
482 match_netid_len = 4;
483 startsep = "[";
484 endsep = "]";
485 break;
486
487 default:
488 dprintk("%s: unsupported address family: %u\n",
489 __func__, da->da_addr.ss_family);
490 goto out_free_da;
491 }
492
493 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
494 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
495 __func__, netid, match_netid);
496 goto out_free_da;
497 }
498
499 /* save human readable address */
500 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
501 da->da_remotestr = kzalloc(len, gfp_flags);
502
503 /* NULL is ok, only used for dprintk */
504 if (da->da_remotestr)
505 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
506 buf, endsep, ntohs(port));
507
508 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
294 kfree(buf); 509 kfree(buf);
510 kfree(netid);
511 return da;
512
513out_free_da:
514 kfree(da);
515out_free_buf:
516 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
517 kfree(buf);
518out_free_netid:
519 kfree(netid);
295out_err: 520out_err:
296 return ds; 521 return NULL;
297} 522}
298 523
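decode_ds_addr() parses the RFC 5665 universal address form "address.p1.p2", where the last two dot-separated decimal octets encode the port as p1 * 256 + p2; for example, the hypothetical entry "10.1.2.3.8.1" means 10.1.2.3 port 8 * 256 + 1 = 2049. The dot juggling above, as a runnable sketch:

#include <stdio.h>
#include <string.h>

/* Split the trailing ".p1.p2" port octets off an RFC 5665 universal
 * address, as decode_ds_addr() does. The example value is hypothetical. */
int main(void)
{
        char buf[] = "10.1.2.3.8.1";    /* uaddr for 10.1.2.3:2049 */
        int hi, lo;
        char *dot;

        dot = strrchr(buf, '.');        /* last dot: low port octet */
        *dot = '-';
        dot = strrchr(buf, '.');        /* next dot: address/port split */
        *dot = '\0';
        sscanf(dot + 1, "%d-%d", &hi, &lo);
        printf("addr=%s port=%d\n", buf, (hi << 8) | lo);  /* 2049 */
        return 0;
}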
299/* Decode opaque device data and return the result */ 524/* Decode opaque device data and return the result */
@@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
310 struct xdr_stream stream; 535 struct xdr_stream stream;
311 struct xdr_buf buf; 536 struct xdr_buf buf;
312 struct page *scratch; 537 struct page *scratch;
538 struct list_head dsaddrs;
539 struct nfs4_pnfs_ds_addr *da;
313 540
314 /* set up xdr stream */ 541 /* set up xdr stream */
315 scratch = alloc_page(gfp_flags); 542 scratch = alloc_page(gfp_flags);
@@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
386 NFS_SERVER(ino)->nfs_client, 613 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id); 614 &pdev->dev_id);
388 615
616 INIT_LIST_HEAD(&dsaddrs);
617
389 for (i = 0; i < dsaddr->ds_num; i++) { 618 for (i = 0; i < dsaddr->ds_num; i++) {
390 int j; 619 int j;
391 u32 mp_count; 620 u32 mp_count;
@@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
395 goto out_err_free_deviceid; 624 goto out_err_free_deviceid;
396 625
397 mp_count = be32_to_cpup(p); /* multipath count */ 626 mp_count = be32_to_cpup(p); /* multipath count */
398 if (mp_count > 1) {
399 printk(KERN_WARNING
400 "%s: Multipath count %d not supported, "
401 "skipping all greater than 1\n", __func__,
402 mp_count);
403 }
404 for (j = 0; j < mp_count; j++) { 627 for (j = 0; j < mp_count; j++) {
405 if (j == 0) { 628 da = decode_ds_addr(&stream, gfp_flags);
406 dsaddr->ds_list[i] = decode_and_add_ds(&stream, 629 if (da)
407 ino, gfp_flags); 630 list_add_tail(&da->da_node, &dsaddrs);
408 if (dsaddr->ds_list[i] == NULL) 631 }
409 goto out_err_free_deviceid; 632 if (list_empty(&dsaddrs)) {
410 } else { 633 dprintk("%s: no suitable DS addresses found\n",
411 u32 len; 634 __func__);
412 /* skip extra multipath */ 635 goto out_err_free_deviceid;
413 636 }
414 /* read len, skip */ 637
415 p = xdr_inline_decode(&stream, 4); 638 dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
416 if (unlikely(!p)) 639 if (!dsaddr->ds_list[i])
417 goto out_err_free_deviceid; 640 goto out_err_drain_dsaddrs;
418 len = be32_to_cpup(p); 641
419 642 /* If DS was already in cache, free ds addrs */
420 p = xdr_inline_decode(&stream, len); 643 while (!list_empty(&dsaddrs)) {
421 if (unlikely(!p)) 644 da = list_first_entry(&dsaddrs,
422 goto out_err_free_deviceid; 645 struct nfs4_pnfs_ds_addr,
423 646 da_node);
424 /* read len, skip */ 647 list_del_init(&da->da_node);
425 p = xdr_inline_decode(&stream, 4); 648 kfree(da->da_remotestr);
426 if (unlikely(!p)) 649 kfree(da);
427 goto out_err_free_deviceid;
428 len = be32_to_cpup(p);
429
430 p = xdr_inline_decode(&stream, len);
431 if (unlikely(!p))
432 goto out_err_free_deviceid;
433 }
434 } 650 }
435 } 651 }
436 652
437 __free_page(scratch); 653 __free_page(scratch);
438 return dsaddr; 654 return dsaddr;
439 655
656out_err_drain_dsaddrs:
657 while (!list_empty(&dsaddrs)) {
658 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
659 da_node);
660 list_del_init(&da->da_node);
661 kfree(da->da_remotestr);
662 kfree(da);
663 }
440out_err_free_deviceid: 664out_err_free_deviceid:
441 nfs4_fl_free_deviceid(dsaddr); 665 nfs4_fl_free_deviceid(dsaddr);
 442 /* stripe_indices was part of dsaddr */ 666
@@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
591 815
592static void 816static void
593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 817filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
594 int err, u32 ds_addr) 818 int err, const char *ds_remotestr)
595{ 819{
596 u32 *p = (u32 *)&dsaddr->id_node.deviceid; 820 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
597 821
598 printk(KERN_ERR "NFS: data server %x connection error %d." 822 printk(KERN_ERR "NFS: data server %s connection error %d."
599 " Deviceid [%x%x%x%x] marked out of use.\n", 823 " Deviceid [%x%x%x%x] marked out of use.\n",
600 ds_addr, err, p[0], p[1], p[2], p[3]); 824 ds_remotestr, err, p[0], p[1], p[2], p[3]);
601 825
602 spin_lock(&nfs4_ds_cache_lock); 826 spin_lock(&nfs4_ds_cache_lock);
603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 827 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
@@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
628 err = nfs4_ds_connect(s, ds); 852 err = nfs4_ds_connect(s, ds);
629 if (err) { 853 if (err) {
630 filelayout_mark_devid_negative(dsaddr, err, 854 filelayout_mark_devid_negative(dsaddr, err,
631 ntohl(ds->ds_ip_addr)); 855 ds->ds_remotestr);
632 return NULL; 856 return NULL;
633 } 857 }
634 } 858 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 26bece8f3083..8c77039e7a81 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 80static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
81 struct nfs_fattr *fattr, struct iattr *sattr, 81 struct nfs_fattr *fattr, struct iattr *sattr,
82 struct nfs4_state *state); 82 struct nfs4_state *state);
83 83#ifdef CONFIG_NFS_V4_1
84static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *);
85static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *);
86#endif
84/* Prevent leaks of NFSv4 errors into userland */ 87/* Prevent leaks of NFSv4 errors into userland */
85static int nfs4_map_errors(int err) 88static int nfs4_map_errors(int err)
86{ 89{
@@ -137,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
137 0 140 0
138}; 141};
139 142
140const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 143const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
141 | FATTR4_WORD0_MAXREAD 144 | FATTR4_WORD0_MAXREAD
142 | FATTR4_WORD0_MAXWRITE 145 | FATTR4_WORD0_MAXWRITE
143 | FATTR4_WORD0_LEASE_TIME, 146 | FATTR4_WORD0_LEASE_TIME,
144 FATTR4_WORD1_TIME_DELTA 147 FATTR4_WORD1_TIME_DELTA
145 | FATTR4_WORD1_FS_LAYOUT_TYPES 148 | FATTR4_WORD1_FS_LAYOUT_TYPES,
149 FATTR4_WORD2_LAYOUT_BLKSIZE
146}; 150};
147 151
148const u32 nfs4_fs_locations_bitmap[2] = { 152const u32 nfs4_fs_locations_bitmap[2] = {
@@ -1689,6 +1693,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
1689 return ret; 1693 return ret;
1690} 1694}
1691 1695
1696#if defined(CONFIG_NFS_V4_1)
1697static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
1698{
1699 int status;
1700 struct nfs_server *server = NFS_SERVER(state->inode);
1701
1702 status = nfs41_test_stateid(server, state);
1703 if (status == NFS_OK)
1704 return 0;
1705 nfs41_free_stateid(server, state);
1706 return nfs4_open_expired(sp, state);
1707}
1708#endif
1709
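The v4.1 recovery shortcut above asks the server whether the open stateid is still valid before doing anything drastic: a successful TEST_STATEID means no recovery is needed at all, and only on failure is the stateid freed and the full open recovery path run. The control flow as a schematic (all names below are illustrative stand-ins, not the kernel's):

struct server;
struct state;

int test_stateid(struct server *srv, struct state *st);
int free_stateid(struct server *srv, struct state *st);
int full_open_recovery(struct state *st);

/* Probe first, recover only if the server no longer knows the state. */
int recover_open_v41(struct server *srv, struct state *st)
{
        if (test_stateid(srv, st) == 0)   /* stateid still valid: done */
                return 0;
        free_stateid(srv, st);            /* discard the stale stateid */
        return full_open_recovery(st);    /* fall back to full recovery */
}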
1692/* 1710/*
1693 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* 1711 * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-*
1694 * fields corresponding to attributes that were used to store the verifier. 1712 * fields corresponding to attributes that were used to store the verifier.
@@ -2252,13 +2270,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2252static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2270static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2253 struct nfs_fsinfo *info) 2271 struct nfs_fsinfo *info)
2254{ 2272{
2273 int minor_version = server->nfs_client->cl_minorversion;
2255 int status = nfs4_lookup_root(server, fhandle, info); 2274 int status = nfs4_lookup_root(server, fhandle, info);
2256 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) 2275 if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
2257 /* 2276 /*
2258 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM 2277 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
2259 * by nfs4_map_errors() as this function exits. 2278 * by nfs4_map_errors() as this function exits.
2260 */ 2279 */
2261 status = nfs4_find_root_sec(server, fhandle, info); 2280 status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);
2262 if (status == 0) 2281 if (status == 0)
2263 status = nfs4_server_capabilities(server, fhandle); 2282 status = nfs4_server_capabilities(server, fhandle);
2264 if (status == 0) 2283 if (status == 0)
@@ -4441,6 +4460,20 @@ out:
4441 return err; 4460 return err;
4442} 4461}
4443 4462
4463#if defined(CONFIG_NFS_V4_1)
4464static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
4465{
4466 int status;
4467 struct nfs_server *server = NFS_SERVER(state->inode);
4468
4469 status = nfs41_test_stateid(server, state);
4470 if (status == NFS_OK)
4471 return 0;
4472 nfs41_free_stateid(server, state);
4473 return nfs4_lock_expired(state, request);
4474}
4475#endif
4476
4444static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 4477static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
4445{ 4478{
4446 struct nfs_inode *nfsi = NFS_I(state->inode); 4479 struct nfs_inode *nfsi = NFS_I(state->inode);
@@ -4779,6 +4812,16 @@ out_inval:
4779 return -NFS4ERR_INVAL; 4812 return -NFS4ERR_INVAL;
4780} 4813}
4781 4814
4815static bool
4816nfs41_same_server_scope(struct server_scope *a, struct server_scope *b)
4817{
4818 if (a->server_scope_sz == b->server_scope_sz &&
4819 memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0)
4820 return true;
4821
4822 return false;
4823}
4824
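Two server scopes compare equal only when both the length and the bytes match; per RFC 5661, a changed eir_server_scope after a reboot means the server cannot be treated as the same instance for state purposes, which is why a mismatch flags the client for nograce recovery (see the nfs4state.c hunk below). A self-contained version of the comparison (the fixed buffer bound is an assumption for the sketch):

#include <stdbool.h>
#include <string.h>

struct server_scope {
        size_t server_scope_sz;
        char server_scope[64];  /* illustrative fixed bound */
};

/* Length first, then bytewise compare, as above. */
static bool scope_equal(const struct server_scope *a,
                        const struct server_scope *b)
{
        return a->server_scope_sz == b->server_scope_sz &&
               memcmp(a->server_scope, b->server_scope,
                      a->server_scope_sz) == 0;
}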
4782/* 4825/*
4783 * nfs4_proc_exchange_id() 4826 * nfs4_proc_exchange_id()
4784 * 4827 *
@@ -4821,9 +4864,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
4821 init_utsname()->domainname, 4864 init_utsname()->domainname,
4822 clp->cl_rpcclient->cl_auth->au_flavor); 4865 clp->cl_rpcclient->cl_auth->au_flavor);
4823 4866
4867 res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
4868 if (unlikely(!res.server_scope))
4869 return -ENOMEM;
4870
4824 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 4871 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
4825 if (!status) 4872 if (!status)
4826 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); 4873 status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
4874
4875 if (!status) {
4876 if (clp->server_scope &&
4877 !nfs41_same_server_scope(clp->server_scope,
4878 res.server_scope)) {
4879 dprintk("%s: server_scope mismatch detected\n",
4880 __func__);
4881 set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
4882 kfree(clp->server_scope);
4883 clp->server_scope = NULL;
4884 }
4885
4886 if (!clp->server_scope)
4887 clp->server_scope = res.server_scope;
4888 else
4889 kfree(res.server_scope);
4890 }
4891
4827 dprintk("<-- %s status= %d\n", __func__, status); 4892 dprintk("<-- %s status= %d\n", __func__, status);
4828 return status; 4893 return status;
4829} 4894}
@@ -5704,7 +5769,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5704{ 5769{
5705 struct nfs4_layoutreturn *lrp = calldata; 5770 struct nfs4_layoutreturn *lrp = calldata;
5706 struct nfs_server *server; 5771 struct nfs_server *server;
5707 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; 5772 struct pnfs_layout_hdr *lo = lrp->args.layout;
5708 5773
5709 dprintk("--> %s\n", __func__); 5774 dprintk("--> %s\n", __func__);
5710 5775
@@ -5733,7 +5798,7 @@ static void nfs4_layoutreturn_release(void *calldata)
5733 struct nfs4_layoutreturn *lrp = calldata; 5798 struct nfs4_layoutreturn *lrp = calldata;
5734 5799
5735 dprintk("--> %s\n", __func__); 5800 dprintk("--> %s\n", __func__);
5736 put_layout_hdr(NFS_I(lrp->args.inode)->layout); 5801 put_layout_hdr(lrp->args.layout);
5737 kfree(calldata); 5802 kfree(calldata);
5738 dprintk("<-- %s\n", __func__); 5803 dprintk("<-- %s\n", __func__);
5739} 5804}
@@ -5770,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5770 return status; 5835 return status;
5771} 5836}
5772 5837
5838/*
5839 * Retrieve the list of Data Server devices from the MDS.
5840 */
5841static int _nfs4_getdevicelist(struct nfs_server *server,
5842 const struct nfs_fh *fh,
5843 struct pnfs_devicelist *devlist)
5844{
5845 struct nfs4_getdevicelist_args args = {
5846 .fh = fh,
5847 .layoutclass = server->pnfs_curr_ld->id,
5848 };
5849 struct nfs4_getdevicelist_res res = {
5850 .devlist = devlist,
5851 };
5852 struct rpc_message msg = {
5853 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
5854 .rpc_argp = &args,
5855 .rpc_resp = &res,
5856 };
5857 int status;
5858
5859 dprintk("--> %s\n", __func__);
5860 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
5861 &res.seq_res, 0);
5862 dprintk("<-- %s status=%d\n", __func__, status);
5863 return status;
5864}
5865
5866int nfs4_proc_getdevicelist(struct nfs_server *server,
5867 const struct nfs_fh *fh,
5868 struct pnfs_devicelist *devlist)
5869{
5870 struct nfs4_exception exception = { };
5871 int err;
5872
5873 do {
5874 err = nfs4_handle_exception(server,
5875 _nfs4_getdevicelist(server, fh, devlist),
5876 &exception);
5877 } while (exception.retry);
5878
5879 dprintk("%s: err=%d, num_devs=%u\n", __func__,
5880 err, devlist->num_devs);
5881
5882 return err;
5883}
5884EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
5885
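nfs4_proc_getdevicelist() uses the file's standard retry idiom: the one-shot worker runs inside nfs4_handle_exception(), which absorbs transient errors such as NFS4ERR_DELAY and keeps exception.retry set until the failure is permanent. The skeleton of that loop, as a self-contained sketch with illustrative names:

struct server;
struct exception { int retry; };

int do_op_once(struct server *srv);
int handle_exception(struct server *srv, int err, struct exception *exc);

/* Retry while the exception handler marks the failure transient. */
int op_with_retry(struct server *srv)
{
        struct exception exc = { 0 };
        int err;

        do {
                err = handle_exception(srv, do_op_once(srv), &exc);
        } while (exc.retry);
        return err;
}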
5773static int 5886static int
5774_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5887_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5775{ 5888{
@@ -5848,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5848static void nfs4_layoutcommit_release(void *calldata) 5961static void nfs4_layoutcommit_release(void *calldata)
5849{ 5962{
5850 struct nfs4_layoutcommit_data *data = calldata; 5963 struct nfs4_layoutcommit_data *data = calldata;
5964 struct pnfs_layout_segment *lseg, *tmp;
5851 5965
5966 pnfs_cleanup_layoutcommit(data);
5852 /* Matched by references in pnfs_set_layoutcommit */ 5967 /* Matched by references in pnfs_set_layoutcommit */
5853 put_lseg(data->lseg); 5968 list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
5969 list_del_init(&lseg->pls_lc_list);
5970 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
5971 &lseg->pls_flags))
5972 put_lseg(lseg);
5973 }
5854 put_rpccred(data->cred); 5974 put_rpccred(data->cred);
5855 kfree(data); 5975 kfree(data);
5856} 5976}
@@ -5901,6 +6021,143 @@ out:
5901 rpc_put_task(task); 6021 rpc_put_task(task);
5902 return status; 6022 return status;
5903} 6023}
6024
6025static int
6026_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6027 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6028{
6029 struct nfs41_secinfo_no_name_args args = {
6030 .style = SECINFO_STYLE_CURRENT_FH,
6031 };
6032 struct nfs4_secinfo_res res = {
6033 .flavors = flavors,
6034 };
6035 struct rpc_message msg = {
6036 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME],
6037 .rpc_argp = &args,
6038 .rpc_resp = &res,
6039 };
6040 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
6041}
6042
6043static int
6044nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
6045 struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
6046{
6047 struct nfs4_exception exception = { };
6048 int err;
6049 do {
6050 err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6051 switch (err) {
6052 case 0:
6053 case -NFS4ERR_WRONGSEC:
6054 case -NFS4ERR_NOTSUPP:
6055 break;
6056 default:
6057 err = nfs4_handle_exception(server, err, &exception);
6058 }
6059 } while (exception.retry);
6060 return err;
6061}
6062
6063static int
6064nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
6065 struct nfs_fsinfo *info)
6066{
6067 int err;
6068 struct page *page;
6069 rpc_authflavor_t flavor;
6070 struct nfs4_secinfo_flavors *flavors;
6071
6072 page = alloc_page(GFP_KERNEL);
6073 if (!page) {
6074 err = -ENOMEM;
6075 goto out;
6076 }
6077
6078 flavors = page_address(page);
6079 err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
6080
6081 /*
6082 * Fall back on "guess and check" method if
6083 * the server doesn't support SECINFO_NO_NAME
6084 */
6085 if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
6086 err = nfs4_find_root_sec(server, fhandle, info);
6087 goto out_freepage;
6088 }
6089 if (err)
6090 goto out_freepage;
6091
6092 flavor = nfs_find_best_sec(flavors);
6093 if (err == 0)
6094 err = nfs4_lookup_root_sec(server, fhandle, info, flavor);
6095
6096out_freepage:
6097 put_page(page);
6098 if (err == -EACCES)
6099 return -EPERM;
6100out:
6101 return err;
6102}
6103static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6104{
6105 int status;
6106 struct nfs41_test_stateid_args args = {
6107 .stateid = &state->stateid,
6108 };
6109 struct nfs41_test_stateid_res res;
6110 struct rpc_message msg = {
6111 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6112 .rpc_argp = &args,
6113 .rpc_resp = &res,
6114 };
6115 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6116 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6117 return status;
6118}
6119
6120static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state)
6121{
6122 struct nfs4_exception exception = { };
6123 int err;
6124 do {
6125 err = nfs4_handle_exception(server,
6126 _nfs41_test_stateid(server, state),
6127 &exception);
6128 } while (exception.retry);
6129 return err;
6130}
6131
6132static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6133{
6134 int status;
6135 struct nfs41_free_stateid_args args = {
6136 .stateid = &state->stateid,
6137 };
6138 struct nfs41_free_stateid_res res;
6139 struct rpc_message msg = {
6140 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
6141 .rpc_argp = &args,
6142 .rpc_resp = &res,
6143 };
6144
6145 args.seq_args.sa_session = res.seq_res.sr_session = NULL;
6146 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1);
6147 return status;
6148}
6149
6150static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state)
6151{
6152 struct nfs4_exception exception = { };
6153 int err;
6154 do {
6155 err = nfs4_handle_exception(server,
6156 _nfs4_free_stateid(server, state),
6157 &exception);
6158 } while (exception.retry);
6159 return err;
6160}
5904#endif /* CONFIG_NFS_V4_1 */ 6161#endif /* CONFIG_NFS_V4_1 */
5905 6162
5906struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 6163struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5937,8 +6194,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
5937struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { 6194struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {
5938 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, 6195 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
5939 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, 6196 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
5940 .recover_open = nfs4_open_expired, 6197 .recover_open = nfs41_open_expired,
5941 .recover_lock = nfs4_lock_expired, 6198 .recover_lock = nfs41_lock_expired,
5942 .establish_clid = nfs41_init_clientid, 6199 .establish_clid = nfs41_init_clientid,
5943 .get_clid_cred = nfs4_get_exchange_id_cred, 6200 .get_clid_cred = nfs4_get_exchange_id_cred,
5944}; 6201};
@@ -5962,6 +6219,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
5962 .minor_version = 0, 6219 .minor_version = 0,
5963 .call_sync = _nfs4_call_sync, 6220 .call_sync = _nfs4_call_sync,
5964 .validate_stateid = nfs4_validate_delegation_stateid, 6221 .validate_stateid = nfs4_validate_delegation_stateid,
6222 .find_root_sec = nfs4_find_root_sec,
5965 .reboot_recovery_ops = &nfs40_reboot_recovery_ops, 6223 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
5966 .nograce_recovery_ops = &nfs40_nograce_recovery_ops, 6224 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
5967 .state_renewal_ops = &nfs40_state_renewal_ops, 6225 .state_renewal_ops = &nfs40_state_renewal_ops,
@@ -5972,6 +6230,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
5972 .minor_version = 1, 6230 .minor_version = 1,
5973 .call_sync = _nfs4_call_sync_session, 6231 .call_sync = _nfs4_call_sync_session,
5974 .validate_stateid = nfs41_validate_delegation_stateid, 6232 .validate_stateid = nfs41_validate_delegation_stateid,
6233 .find_root_sec = nfs41_find_root_sec,
5975 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6234 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
5976 .nograce_recovery_ops = &nfs41_nograce_recovery_ops, 6235 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
5977 .state_renewal_ops = &nfs41_state_renewal_ops, 6236 .state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 7acfe8843626..72ab97ef3d61 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1643,7 +1643,14 @@ static void nfs4_state_manager(struct nfs_client *clp)
1643 goto out_error; 1643 goto out_error;
1644 } 1644 }
1645 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); 1645 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1646 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); 1646
1647 if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH,
1648 &clp->cl_state))
1649 nfs4_state_start_reclaim_nograce(clp);
1650 else
1651 set_bit(NFS4CLNT_RECLAIM_REBOOT,
1652 &clp->cl_state);
1653
1647 pnfs_destroy_all_layouts(clp); 1654 pnfs_destroy_all_layouts(clp);
1648 } 1655 }
1649 1656
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6e8f3b9a1de..1dce12f41a4f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
113#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
114#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
115#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) 116/* The 5 accounts for the PNFS attributes, and assumes that at most three
117 * layout types will be returned.
118 */
119#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
120 nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 121#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
118#define decode_renew_maxsz (op_decode_hdr_maxsz) 122#define decode_renew_maxsz (op_decode_hdr_maxsz)
119#define encode_setclientid_maxsz \ 123#define encode_setclientid_maxsz \
@@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int);
314 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 318 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
315#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 319#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
316#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 320#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
321#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
322 encode_verifier_maxsz)
323#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
324 2 /* nfs_cookie4 gdlr_cookie */ + \
325 decode_verifier_maxsz \
326 /* verifier4 gdlr_verifier */ + \
327 1 /* gdlr_deviceid_list count */ + \
328 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
329 NFS4_DEVICEID4_SIZE) \
330 /* gdlr_deviceid_list */ + \
331 1 /* bool gdlr_eof */)
317#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ 332#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
318 XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) 333 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
319#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 334#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int);
343 1 /* FIXME: opaque lrf_body always empty at the moment */) 358 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 359#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz) 360 1 + decode_stateid_maxsz)
361#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
362#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz
363#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \
364 XDR_QUADLEN(NFS4_STATEID_SIZE))
365#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1)
366#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \
367 XDR_QUADLEN(NFS4_STATEID_SIZE))
368#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1)
346#else /* CONFIG_NFS_V4_1 */ 369#else /* CONFIG_NFS_V4_1 */
347#define encode_sequence_maxsz 0 370#define encode_sequence_maxsz 0
348#define decode_sequence_maxsz 0 371#define decode_sequence_maxsz 0
@@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int);
740#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 763#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
741 decode_sequence_maxsz + \ 764 decode_sequence_maxsz + \
742 decode_reclaim_complete_maxsz) 765 decode_reclaim_complete_maxsz)
766#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
767 encode_sequence_maxsz + \
768 encode_putfh_maxsz + \
769 encode_getdevicelist_maxsz)
770#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
771 decode_sequence_maxsz + \
772 decode_putfh_maxsz + \
773 decode_getdevicelist_maxsz)
743#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 774#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
744 encode_sequence_maxsz +\ 775 encode_sequence_maxsz +\
745 encode_getdeviceinfo_maxsz) 776 encode_getdeviceinfo_maxsz)
@@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int);
772 decode_sequence_maxsz + \ 803 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \ 804 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz) 805 decode_layoutreturn_maxsz)
806#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \
807 encode_sequence_maxsz + \
808 encode_putrootfh_maxsz +\
809 encode_secinfo_no_name_maxsz)
810#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \
811 decode_sequence_maxsz + \
812 decode_putrootfh_maxsz + \
813 decode_secinfo_no_name_maxsz)
814#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \
815 encode_sequence_maxsz + \
816 encode_test_stateid_maxsz)
817#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \
818 decode_sequence_maxsz + \
819 decode_test_stateid_maxsz)
820#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz + \
822 encode_free_stateid_maxsz)
823#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \
824 decode_sequence_maxsz + \
825 decode_free_stateid_maxsz)
775 826
776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 827const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
777 compound_encode_hdr_maxsz + 828 compound_encode_hdr_maxsz +
@@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
1076 hdr->replen += decode_getattr_maxsz; 1127 hdr->replen += decode_getattr_maxsz;
1077} 1128}
1078 1129
1130static void
1131encode_getattr_three(struct xdr_stream *xdr,
1132 uint32_t bm0, uint32_t bm1, uint32_t bm2,
1133 struct compound_hdr *hdr)
1134{
1135 __be32 *p;
1136
1137 p = reserve_space(xdr, 4);
1138 *p = cpu_to_be32(OP_GETATTR);
1139 if (bm2) {
1140 p = reserve_space(xdr, 16);
1141 *p++ = cpu_to_be32(3);
1142 *p++ = cpu_to_be32(bm0);
1143 *p++ = cpu_to_be32(bm1);
1144 *p = cpu_to_be32(bm2);
1145 } else if (bm1) {
1146 p = reserve_space(xdr, 12);
1147 *p++ = cpu_to_be32(2);
1148 *p++ = cpu_to_be32(bm0);
1149 *p = cpu_to_be32(bm1);
1150 } else {
1151 p = reserve_space(xdr, 8);
1152 *p++ = cpu_to_be32(1);
1153 *p = cpu_to_be32(bm0);
1154 }
1155 hdr->nops++;
1156 hdr->replen += decode_getattr_maxsz;
1157}
1158
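encode_getattr_three() emits the shortest bitmap4 that still covers the highest populated word: a length word (1, 2, or 3) followed by that many 32-bit masks, so requesting a word-2 attribute such as LAYOUT_BLKSIZE forces a length-3 bitmap even when words 0 and 1 are also set. The on-the-wire shape, sketched in userspace with illustrative mask values:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* bitmap4 = length word + that many mask words (values illustrative). */
int main(void)
{
        uint32_t bm0 = 0x00100000;  /* e.g. a word-0 attribute */
        uint32_t bm1 = 0x00000800;  /* e.g. a word-1 attribute */
        uint32_t bm2 = 0x00000002;  /* e.g. a word-2 attribute */
        uint32_t wire[4];
        int n = bm2 ? 3 : (bm1 ? 2 : 1);

        wire[0] = htonl(n);         /* bitmap length in words */
        wire[1] = htonl(bm0);
        wire[2] = htonl(bm1);
        wire[3] = htonl(bm2);
        printf("bitmap4 length=%d\n", (int)ntohl(wire[0]));
        return 0;
}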
1079static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1159static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1080{ 1160{
1081 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], 1161 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
1084 1164
1085static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1165static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
1086{ 1166{
1087 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 1167 encode_getattr_three(xdr,
1088 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); 1168 bitmask[0] & nfs4_fsinfo_bitmap[0],
1169 bitmask[1] & nfs4_fsinfo_bitmap[1],
1170 bitmask[2] & nfs4_fsinfo_bitmap[2],
1171 hdr);
1089} 1172}
1090 1173
1091static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) 1174static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr,
1827 1910
1828#ifdef CONFIG_NFS_V4_1 1911#ifdef CONFIG_NFS_V4_1
1829static void 1912static void
1913encode_getdevicelist(struct xdr_stream *xdr,
1914 const struct nfs4_getdevicelist_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918 nfs4_verifier dummy = {
1919 .data = "dummmmmy",
1920 };
1921
1922 p = reserve_space(xdr, 20);
1923 *p++ = cpu_to_be32(OP_GETDEVICELIST);
1924 *p++ = cpu_to_be32(args->layoutclass);
1925 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1926 xdr_encode_hyper(p, 0ULL); /* cookie */
1927 encode_nfs4_verifier(xdr, &dummy);
1928 hdr->nops++;
1929 hdr->replen += decode_getdevicelist_maxsz;
1930}
1931
1932static void
1830encode_getdeviceinfo(struct xdr_stream *xdr, 1933encode_getdeviceinfo(struct xdr_stream *xdr,
1831 const struct nfs4_getdeviceinfo_args *args, 1934 const struct nfs4_getdeviceinfo_args *args,
1832 struct compound_hdr *hdr) 1935 struct compound_hdr *hdr)
@@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1991 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1889 /* Only whole file layouts */ 1992 /* Only whole file layouts */
1890 p = xdr_encode_hyper(p, 0); /* offset */ 1993 p = xdr_encode_hyper(p, 0); /* offset */
1891 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ 1994 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
1892 *p++ = cpu_to_be32(0); /* reclaim */ 1995 *p++ = cpu_to_be32(0); /* reclaim */
1893 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); 1996 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1894 *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1997 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr,
1938 hdr->nops++; 2041 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz; 2042 hdr->replen += decode_layoutreturn_maxsz;
1940} 2043}
2044
2045static int
2046encode_secinfo_no_name(struct xdr_stream *xdr,
2047 const struct nfs41_secinfo_no_name_args *args,
2048 struct compound_hdr *hdr)
2049{
2050 __be32 *p;
2051 p = reserve_space(xdr, 8);
2052 *p++ = cpu_to_be32(OP_SECINFO_NO_NAME);
2053 *p++ = cpu_to_be32(args->style);
2054 hdr->nops++;
2055 hdr->replen += decode_secinfo_no_name_maxsz;
2056 return 0;
2057}
2058
2059static void encode_test_stateid(struct xdr_stream *xdr,
2060 struct nfs41_test_stateid_args *args,
2061 struct compound_hdr *hdr)
2062{
2063 __be32 *p;
2064
2065 p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE);
2066 *p++ = cpu_to_be32(OP_TEST_STATEID);
2067 *p++ = cpu_to_be32(1);
2068 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2069 hdr->nops++;
2070 hdr->replen += decode_test_stateid_maxsz;
2071}
2072
2073static void encode_free_stateid(struct xdr_stream *xdr,
2074 struct nfs41_free_stateid_args *args,
2075 struct compound_hdr *hdr)
2076{
2077 __be32 *p;
2078 p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE);
2079 *p++ = cpu_to_be32(OP_FREE_STATEID);
2080 xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
2081 hdr->nops++;
2082 hdr->replen += decode_free_stateid_maxsz;
2083}
1941#endif /* CONFIG_NFS_V4_1 */ 2084#endif /* CONFIG_NFS_V4_1 */
1942 2085
1943/* 2086/*
@@ -2536,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
2536 struct compound_hdr hdr = { 2679 struct compound_hdr hdr = {
2537 .nops = 0, 2680 .nops = 0,
2538 }; 2681 };
2539 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2682 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2540 2683
2541 encode_compound_hdr(xdr, req, &hdr); 2684 encode_compound_hdr(xdr, req, &hdr);
2542 encode_setclientid_confirm(xdr, arg, &hdr); 2685 encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2680,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
2680 struct compound_hdr hdr = { 2823 struct compound_hdr hdr = {
2681 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), 2824 .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
2682 }; 2825 };
2683 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 2826 const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
2684 2827
2685 encode_compound_hdr(xdr, req, &hdr); 2828 encode_compound_hdr(xdr, req, &hdr);
2686 encode_sequence(xdr, &args->la_seq_args, &hdr); 2829 encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -2707,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2707} 2850}
2708 2851
2709/* 2852/*
2853 * Encode GETDEVICELIST request
2854 */
2855static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2856 struct xdr_stream *xdr,
2857 struct nfs4_getdevicelist_args *args)
2858{
2859 struct compound_hdr hdr = {
2860 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2861 };
2862
2863 encode_compound_hdr(xdr, req, &hdr);
2864 encode_sequence(xdr, &args->seq_args, &hdr);
2865 encode_putfh(xdr, args->fh, &hdr);
2866 encode_getdevicelist(xdr, args, &hdr);
2867 encode_nops(&hdr);
2868}
2869
2870/*
2710 * Encode GETDEVICEINFO request 2871 * Encode GETDEVICEINFO request
2711 */ 2872 */
2712static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2873static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -2790,6 +2951,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2790 encode_layoutreturn(xdr, args, &hdr); 2951 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr); 2952 encode_nops(&hdr);
2792} 2953}
2954
2955/*
2956 * Encode SECINFO_NO_NAME request
2957 */
2958static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req,
2959 struct xdr_stream *xdr,
2960 struct nfs41_secinfo_no_name_args *args)
2961{
2962 struct compound_hdr hdr = {
2963 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2964 };
2965
2966 encode_compound_hdr(xdr, req, &hdr);
2967 encode_sequence(xdr, &args->seq_args, &hdr);
2968 encode_putrootfh(xdr, &hdr);
2969 encode_secinfo_no_name(xdr, args, &hdr);
2970 encode_nops(&hdr);
2971 return 0;
2972}
2973
2974/*
2975 * Encode TEST_STATEID request
2976 */
2977static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req,
2978 struct xdr_stream *xdr,
2979 struct nfs41_test_stateid_args *args)
2980{
2981 struct compound_hdr hdr = {
2982 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2983 };
2984
2985 encode_compound_hdr(xdr, req, &hdr);
2986 encode_sequence(xdr, &args->seq_args, &hdr);
2987 encode_test_stateid(xdr, args, &hdr);
2988 encode_nops(&hdr);
2989}
2990
2991/*
2992 * Encode FREE_STATEID request
2993 */
2994static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req,
2995 struct xdr_stream *xdr,
2996 struct nfs41_free_stateid_args *args)
2997{
2998 struct compound_hdr hdr = {
2999 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
3000 };
3001
3002 encode_compound_hdr(xdr, req, &hdr);
3003 encode_sequence(xdr, &args->seq_args, &hdr);
3004 encode_free_stateid(xdr, args, &hdr);
3005 encode_nops(&hdr);
3006}
2793#endif /* CONFIG_NFS_V4_1 */ 3007#endif /* CONFIG_NFS_V4_1 */
2794 3008
2795static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 3009static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2890,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2890 goto out_overflow; 3104 goto out_overflow;
2891 bmlen = be32_to_cpup(p); 3105 bmlen = be32_to_cpup(p);
2892 3106
2893 bitmap[0] = bitmap[1] = 0; 3107 bitmap[0] = bitmap[1] = bitmap[2] = 0;
2894 p = xdr_inline_decode(xdr, (bmlen << 2)); 3108 p = xdr_inline_decode(xdr, (bmlen << 2));
2895 if (unlikely(!p)) 3109 if (unlikely(!p))
2896 goto out_overflow; 3110 goto out_overflow;
2897 if (bmlen > 0) { 3111 if (bmlen > 0) {
2898 bitmap[0] = be32_to_cpup(p++); 3112 bitmap[0] = be32_to_cpup(p++);
2899 if (bmlen > 1) 3113 if (bmlen > 1) {
2900 bitmap[1] = be32_to_cpup(p); 3114 bitmap[1] = be32_to_cpup(p++);
3115 if (bmlen > 2)
3116 bitmap[2] = be32_to_cpup(p);
3117 }
2901 } 3118 }
2902 return 0; 3119 return 0;
2903out_overflow: 3120out_overflow:
@@ -2929,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2929 return ret; 3146 return ret;
2930 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; 3147 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
2931 } else 3148 } else
2932 bitmask[0] = bitmask[1] = 0; 3149 bitmask[0] = bitmask[1] = bitmask[2] = 0;
2933 dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); 3150 dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
3151 bitmask[0], bitmask[1], bitmask[2]);
2934 return 0; 3152 return 0;
2935} 3153}
2936 3154
@@ -3984,7 +4202,7 @@ out_overflow:
3984static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 4202static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3985{ 4203{
3986 __be32 *savep; 4204 __be32 *savep;
3987 uint32_t attrlen, bitmap[2] = {0}; 4205 uint32_t attrlen, bitmap[3] = {0};
3988 int status; 4206 int status;
3989 4207
3990 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4208 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4010,7 +4228,7 @@ xdr_error:
4010static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 4228static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
4011{ 4229{
4012 __be32 *savep; 4230 __be32 *savep;
4013 uint32_t attrlen, bitmap[2] = {0}; 4231 uint32_t attrlen, bitmap[3] = {0};
4014 int status; 4232 int status;
4015 4233
4016 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4234 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4042,7 +4260,7 @@ xdr_error:
4042static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 4260static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
4043{ 4261{
4044 __be32 *savep; 4262 __be32 *savep;
4045 uint32_t attrlen, bitmap[2] = {0}; 4263 uint32_t attrlen, bitmap[3] = {0};
4046 int status; 4264 int status;
4047 4265
4048 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4266 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4182,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4182{ 4400{
4183 __be32 *savep; 4401 __be32 *savep;
4184 uint32_t attrlen, 4402 uint32_t attrlen,
4185 bitmap[2] = {0}; 4403 bitmap[3] = {0};
4186 int status; 4404 int status;
4187 4405
4188 status = decode_op_hdr(xdr, OP_GETATTR); 4406 status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4268,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
4268 return status; 4486 return status;
4269} 4487}
4270 4488
4489/*
 4490 * The preferred block size for layout-directed I/O
4491 */
4492static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
4493 uint32_t *res)
4494{
4495 __be32 *p;
4496
4497 dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
4498 *res = 0;
4499 if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
4500 p = xdr_inline_decode(xdr, 4);
4501 if (unlikely(!p)) {
4502 print_overflow_msg(__func__, xdr);
4503 return -EIO;
4504 }
4505 *res = be32_to_cpup(p);
4506 bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
4507 }
4508 return 0;
4509}
4510
4271static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) 4511static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4272{ 4512{
4273 __be32 *savep; 4513 __be32 *savep;
4274 uint32_t attrlen, bitmap[2]; 4514 uint32_t attrlen, bitmap[3];
4275 int status; 4515 int status;
4276 4516
4277 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4517 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4299,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
4299 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); 4539 status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
4300 if (status != 0) 4540 if (status != 0)
4301 goto xdr_error; 4541 goto xdr_error;
4542 status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
4543 if (status)
4544 goto xdr_error;
4302 4545
4303 status = verify_attr_len(xdr, savep, attrlen); 4546 status = verify_attr_len(xdr, savep, attrlen);
4304xdr_error: 4547xdr_error:
@@ -4718,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4718{ 4961{
4719 __be32 *savep; 4962 __be32 *savep;
4720 uint32_t attrlen, 4963 uint32_t attrlen,
4721 bitmap[2] = {0}; 4964 bitmap[3] = {0};
4722 struct kvec *iov = req->rq_rcv_buf.head; 4965 struct kvec *iov = req->rq_rcv_buf.head;
4723 int status; 4966 int status;
4724 4967
@@ -4977,11 +5220,17 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4977 if (unlikely(status)) 5220 if (unlikely(status))
4978 return status; 5221 return status;
4979 5222
4980 /* Throw away server_scope */ 5223 /* Save server_scope */
4981 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5224 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4982 if (unlikely(status)) 5225 if (unlikely(status))
4983 return status; 5226 return status;
4984 5227
5228 if (unlikely(dummy > NFS4_OPAQUE_LIMIT))
5229 return -EIO;
5230
5231 memcpy(res->server_scope->server_scope, dummy_str, dummy);
5232 res->server_scope->server_scope_sz = dummy;
5233
4985 /* Throw away Implementation id array */ 5234 /* Throw away Implementation id array */
4986 status = decode_opaque_inline(xdr, &dummy, &dummy_str); 5235 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4987 if (unlikely(status)) 5236 if (unlikely(status))
@@ -5141,6 +5390,53 @@ out_overflow:
5141} 5390}
5142 5391
5143#if defined(CONFIG_NFS_V4_1) 5392#if defined(CONFIG_NFS_V4_1)
5393/*
5394 * TODO: Need to handle case when EOF != true;
5395 */
5396static int decode_getdevicelist(struct xdr_stream *xdr,
5397 struct pnfs_devicelist *res)
5398{
5399 __be32 *p;
5400 int status, i;
5401 struct nfs_writeverf verftemp;
5402
5403 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5404 if (status)
5405 return status;
5406
5407 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5408 if (unlikely(!p))
5409 goto out_overflow;
5410
5411 /* TODO: Skip cookie for now */
5412 p += 2;
5413
5414 /* Read verifier */
5415 p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
5416
5417 res->num_devs = be32_to_cpup(p);
5418
5419 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5420
5421 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
 5422 printk(KERN_ERR "%s: too many devices returned: %u\n",
5423 __func__, res->num_devs);
5424 return -EIO;
5425 }
5426
5427 p = xdr_inline_decode(xdr,
5428 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5429 if (unlikely(!p))
5430 goto out_overflow;
5431 for (i = 0; i < res->num_devs; i++)
5432 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5433 NFS4_DEVICEID4_SIZE);
5434 res->eof = be32_to_cpup(p);
5435 return 0;
5436out_overflow:
5437 print_overflow_msg(__func__, xdr);
5438 return -EIO;
5439}
5144 5440
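decode_getdevicelist() bounds the server-supplied count against NFS4_PNFS_GETDEVLIST_MAXNUM before sizing the bulk xdr_inline_decode(), so a misbehaving server cannot make the client read past the end of the reply. The clamp-then-bulk-read shape as a standalone sketch (the MAXNUM bound is an illustrative stand-in; a deviceid4 is 16 bytes):

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define MAXNUM 32   /* stand-in for NFS4_PNFS_GETDEVLIST_MAXNUM */
#define ID_SZ  16   /* deviceid4 size, cf. NFS4_DEVICEID4_SIZE */

/* Validate an untrusted count before the bulk copy. */
int read_device_ids(const uint8_t *buf, size_t buflen, uint32_t num,
                    uint8_t ids[][ID_SZ])
{
        if (num > MAXNUM)
                return -EIO;    /* reject an absurd count */
        if (buflen < (size_t)num * ID_SZ)
                return -EIO;    /* reply shorter than it claims */
        memcpy(ids, buf, (size_t)num * ID_SZ);
        return 0;
}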
5145static int decode_getdeviceinfo(struct xdr_stream *xdr, 5441static int decode_getdeviceinfo(struct xdr_stream *xdr,
5146 struct pnfs_device *pdev) 5442 struct pnfs_device *pdev)
@@ -5303,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
5303 int status; 5599 int status;
5304 5600
5305 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); 5601 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5602 res->status = status;
5306 if (status) 5603 if (status)
5307 return status; 5604 return status;
5308 5605
@@ -5322,6 +5619,55 @@ out_overflow:
5322 print_overflow_msg(__func__, xdr); 5619 print_overflow_msg(__func__, xdr);
5323 return -EIO; 5620 return -EIO;
5324} 5621}
5622
5623static int decode_test_stateid(struct xdr_stream *xdr,
5624 struct nfs41_test_stateid_res *res)
5625{
5626 __be32 *p;
5627 int status;
5628 int num_res;
5629
5630 status = decode_op_hdr(xdr, OP_TEST_STATEID);
5631 if (status)
5632 return status;
5633
5634 p = xdr_inline_decode(xdr, 4);
5635 if (unlikely(!p))
5636 goto out_overflow;
5637 num_res = be32_to_cpup(p++);
5638 if (num_res != 1)
5639 goto out;
5640
5641 p = xdr_inline_decode(xdr, 4);
5642 if (unlikely(!p))
5643 goto out_overflow;
5644 res->status = be32_to_cpup(p++);
5645 return res->status;
5646out_overflow:
5647 print_overflow_msg(__func__, xdr);
5648out:
5649 return -EIO;
5650}
5651
5652static int decode_free_stateid(struct xdr_stream *xdr,
5653 struct nfs41_free_stateid_res *res)
5654{
5655 __be32 *p;
5656 int status;
5657
5658 status = decode_op_hdr(xdr, OP_FREE_STATEID);
5659 if (status)
5660 return status;
5661
5662 p = xdr_inline_decode(xdr, 4);
5663 if (unlikely(!p))
5664 goto out_overflow;
5665 res->status = be32_to_cpup(p++);
5666 return res->status;
5667out_overflow:
5668 print_overflow_msg(__func__, xdr);
5669 return -EIO;
5670}
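
decode_free_stateid() is exactly this shape: an operation header followed by one status word that is stored in the result and also propagated as the return value. decode_test_stateid() merely adds a one-element result count in front. If more stateid operations arrive, the common tail could be factored out; the helper below is a hypothetical sketch, not part of this patch:

/* Hypothetical refactoring sketch, not part of this patch. */
static int decode_stateid_op_status(struct xdr_stream *xdr,
				    enum nfs_opnum4 op, u32 *op_status)
{
	__be32 *p;
	int status;

	status = decode_op_hdr(xdr, op);
	if (status)
		return status;
	p = xdr_inline_decode(xdr, 4);
	if (unlikely(!p)) {
		print_overflow_msg(__func__, xdr);
		return -EIO;
	}
	*op_status = be32_to_cpup(p);
	return *op_status;
}
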
5325#endif /* CONFIG_NFS_V4_1 */ 5671#endif /* CONFIG_NFS_V4_1 */
5326 5672
5327/* 5673/*
@@ -6366,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
6366} 6712}
6367 6713
6368/* 6714/*
6715 * Decode GETDEVICELIST response
6716 */
6717static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
6718 struct xdr_stream *xdr,
6719 struct nfs4_getdevicelist_res *res)
6720{
6721 struct compound_hdr hdr;
6722 int status;
6723
6724 dprintk("decoding getdevicelist!\n");
6725
6726 status = decode_compound_hdr(xdr, &hdr);
6727 if (status != 0)
6728 goto out;
6729 status = decode_sequence(xdr, &res->seq_res, rqstp);
6730 if (status != 0)
6731 goto out;
6732 status = decode_putfh(xdr);
6733 if (status != 0)
6734 goto out;
6735 status = decode_getdevicelist(xdr, res->devlist);
6736out:
6737 return status;
6738}
6739
6740/*
6369 * Decode GETDEVINFO response 6741 * Decode GETDEVINFO response
6370 */ 6742 */
6371static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 6743static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6461,6 +6833,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6461out: 6833out:
6462 return status; 6834 return status;
6463} 6835}
6836
6837/*
6838 * Decode SECINFO_NO_NAME response
6839 */
6840static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp,
6841 struct xdr_stream *xdr,
6842 struct nfs4_secinfo_res *res)
6843{
6844 struct compound_hdr hdr;
6845 int status;
6846
6847 status = decode_compound_hdr(xdr, &hdr);
6848 if (status)
6849 goto out;
6850 status = decode_sequence(xdr, &res->seq_res, rqstp);
6851 if (status)
6852 goto out;
6853 status = decode_putrootfh(xdr);
6854 if (status)
6855 goto out;
6856 status = decode_secinfo(xdr, res);
6857out:
6858 return status;
6859}
6860
6861/*
6862 * Decode TEST_STATEID response
6863 */
6864static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp,
6865 struct xdr_stream *xdr,
6866 struct nfs41_test_stateid_res *res)
6867{
6868 struct compound_hdr hdr;
6869 int status;
6870
6871 status = decode_compound_hdr(xdr, &hdr);
6872 if (status)
6873 goto out;
6874 status = decode_sequence(xdr, &res->seq_res, rqstp);
6875 if (status)
6876 goto out;
6877 status = decode_test_stateid(xdr, res);
6878out:
6879 return status;
6880}
6881
6882/*
6883 * Decode FREE_STATEID response
6884 */
6885static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp,
6886 struct xdr_stream *xdr,
6887 struct nfs41_free_stateid_res *res)
6888{
6889 struct compound_hdr hdr;
6890 int status;
6891
6892 status = decode_compound_hdr(xdr, &hdr);
6893 if (status)
6894 goto out;
6895 status = decode_sequence(xdr, &res->seq_res, rqstp);
6896 if (status)
6897 goto out;
6898 status = decode_free_stateid(xdr, res);
6899out:
6900 return status;
6901}
6464#endif /* CONFIG_NFS_V4_1 */ 6902#endif /* CONFIG_NFS_V4_1 */
6465 6903
6466/** 6904/**
@@ -6480,7 +6918,7 @@ out:
6480int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, 6918int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6481 int plus) 6919 int plus)
6482{ 6920{
6483 uint32_t bitmap[2] = {0}; 6921 uint32_t bitmap[3] = {0};
6484 uint32_t len; 6922 uint32_t len;
6485 __be32 *p = xdr_inline_decode(xdr, 4); 6923 __be32 *p = xdr_inline_decode(xdr, 4);
6486 if (unlikely(!p)) 6924 if (unlikely(!p))
@@ -6663,6 +7101,10 @@ struct rpc_procinfo nfs4_procedures[] = {
6663 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 7101 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6664 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 7102 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6665 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), 7103 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
7104 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7105 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7106 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7107 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
6666#endif /* CONFIG_NFS_V4_1 */ 7108#endif /* CONFIG_NFS_V4_1 */
6667}; 7109};
6668 7110
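
Taken together with the pnfs_devicelist definition this patch adds to fs/nfs/pnfs.h, the GETDEVICELIST path returns a bounded result: at most NFS4_PNFS_GETDEVLIST_MAXNUM deviceids plus an eof flag per call. A hedged usage sketch against those declared prototypes (function name illustrative, error handling trimmed):

/* Illustrative only: walk one GETDEVICELIST reply. */
static int example_walk_devicelist(struct nfs_server *server,
				   const struct nfs_fh *fh)
{
	struct pnfs_devicelist devlist = { .eof = 0, .num_devs = 0 };
	unsigned int i;
	int err;

	err = nfs4_proc_getdevicelist(server, fh, &devlist);
	if (err)
		return err;
	for (i = 0; i < devlist.num_devs; i++)
		nfs4_print_deviceid(&devlist.dev_id[i]);
	/* Per the decoder's TODO, replies with eof != true (and the
	 * returned cookie) are not handled yet, so enumeration cannot
	 * be resumed from here. */
	return 0;
}
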
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 8ff2ea3f10ef..d0cda12fddc3 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)
479 for (i = 0; i < ios->numdevs; i++) { 479 for (i = 0; i < ios->numdevs; i++) {
480 struct osd_sense_info osi; 480 struct osd_sense_info osi;
481 struct osd_request *or = ios->per_dev[i].or; 481 struct osd_request *or = ios->per_dev[i].or;
482 unsigned dev;
483 int ret; 482 int ret;
484 483
485 if (!or) 484 if (!or)
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)
500 499
501 continue; /* we recovered */ 500 continue; /* we recovered */
502 } 501 }
503 dev = ios->per_dev[i].dev; 502 objlayout_io_set_result(&ios->ol_state, i,
504 objlayout_io_set_result(&ios->ol_state, dev, 503 &ios->layout->comps[i].oc_object_id,
505 &ios->layout->comps[dev].oc_object_id,
506 osd_pri_2_pnfs_err(osi.osd_err_pri), 504 osd_pri_2_pnfs_err(osi.osd_err_pri),
507 ios->per_dev[i].offset, 505 ios->per_dev[i].offset,
508 ios->per_dev[i].length, 506 ios->per_dev[i].length,
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
589} 587}
590 588
591static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, 589static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
592 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, 590 unsigned pgbase, struct _objio_per_comp *per_dev, int len,
593 gfp_t gfp_flags) 591 gfp_t gfp_flags)
594{ 592{
595 unsigned pg = *cur_pg; 593 unsigned pg = *cur_pg;
594 int cur_len = len;
596 struct request_queue *q = 595 struct request_queue *q =
597 osd_request_queue(_io_od(ios, per_dev->dev)); 596 osd_request_queue(_io_od(ios, per_dev->dev));
598 597
599 per_dev->length += cur_len;
600
601 if (per_dev->bio == NULL) { 598 if (per_dev->bio == NULL) {
602 unsigned stripes = ios->layout->num_comps / 599 unsigned pages_in_stripe = ios->layout->group_width *
603 ios->layout->mirrors_p1;
604 unsigned pages_in_stripe = stripes *
605 (ios->layout->stripe_unit / PAGE_SIZE); 600 (ios->layout->stripe_unit / PAGE_SIZE);
606 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / 601 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
607 stripes; 602 ios->layout->group_width;
608 603
609 if (BIO_MAX_PAGES_KMALLOC < bio_size) 604 if (BIO_MAX_PAGES_KMALLOC < bio_size)
610 bio_size = BIO_MAX_PAGES_KMALLOC; 605 bio_size = BIO_MAX_PAGES_KMALLOC;
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
632 } 627 }
633 BUG_ON(cur_len); 628 BUG_ON(cur_len);
634 629
630 per_dev->length += len;
635 *cur_pg = pg; 631 *cur_pg = pg;
636 return 0; 632 return 0;
637} 633}
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
650 int ret = 0; 646 int ret = 0;
651 647
652 while (length) { 648 while (length) {
653 struct _objio_per_comp *per_dev = &ios->per_dev[dev]; 649 struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
654 unsigned cur_len, page_off = 0; 650 unsigned cur_len, page_off = 0;
655 651
656 if (!per_dev->length) { 652 if (!per_dev->length) {
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
670 cur_len = stripe_unit; 666 cur_len = stripe_unit;
671 } 667 }
672 668
673 if (max_comp < dev) 669 if (max_comp < dev - first_dev)
674 max_comp = dev; 670 max_comp = dev - first_dev;
675 } else { 671 } else {
676 cur_len = stripe_unit; 672 cur_len = stripe_unit;
677 } 673 }
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
806 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; 802 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
807 unsigned dev = per_dev->dev; 803 unsigned dev = per_dev->dev;
808 struct pnfs_osd_object_cred *cred = 804 struct pnfs_osd_object_cred *cred =
809 &ios->layout->comps[dev]; 805 &ios->layout->comps[cur_comp];
810 struct osd_obj_id obj = { 806 struct osd_obj_id obj = {
811 .partition = cred->oc_object_id.oid_partition_id, 807 .partition = cred->oc_object_id.oid_partition_id,
812 .id = cred->oc_object_id.oid_object_id, 808 .id = cred->oc_object_id.oid_object_id,
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
904 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 900 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
905 struct osd_request *or = NULL; 901 struct osd_request *or = NULL;
906 struct pnfs_osd_object_cred *cred = 902 struct pnfs_osd_object_cred *cred =
907 &ios->layout->comps[dev]; 903 &ios->layout->comps[cur_comp];
908 struct osd_obj_id obj = { 904 struct osd_obj_id obj = {
909 .partition = cred->oc_object_id.oid_partition_id, 905 .partition = cred->oc_object_id.oid_partition_id,
910 .id = cred->oc_object_id.oid_object_id, 906 .id = cred->oc_object_id.oid_object_id,
@@ -1000,13 +996,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
1000 if (!pnfs_generic_pg_test(pgio, prev, req)) 996 if (!pnfs_generic_pg_test(pgio, prev, req))
1001 return false; 997 return false;
1002 998
1003 if (pgio->pg_lseg == NULL)
1004 return true;
1005
1006 return pgio->pg_count + req->wb_bytes <= 999 return pgio->pg_count + req->wb_bytes <=
1007 OBJIO_LSEG(pgio->pg_lseg)->max_io_size; 1000 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1008} 1001}
1009 1002
1003static const struct nfs_pageio_ops objio_pg_read_ops = {
1004 .pg_init = pnfs_generic_pg_init_read,
1005 .pg_test = objio_pg_test,
1006 .pg_doio = pnfs_generic_pg_readpages,
1007};
1008
1009static const struct nfs_pageio_ops objio_pg_write_ops = {
1010 .pg_init = pnfs_generic_pg_init_write,
1011 .pg_test = objio_pg_test,
1012 .pg_doio = pnfs_generic_pg_writepages,
1013};
1014
1010static struct pnfs_layoutdriver_type objlayout_type = { 1015static struct pnfs_layoutdriver_type objlayout_type = {
1011 .id = LAYOUT_OSD2_OBJECTS, 1016 .id = LAYOUT_OSD2_OBJECTS,
1012 .name = "LAYOUT_OSD2_OBJECTS", 1017 .name = "LAYOUT_OSD2_OBJECTS",
@@ -1020,7 +1025,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
1020 1025
1021 .read_pagelist = objlayout_read_pagelist, 1026 .read_pagelist = objlayout_read_pagelist,
1022 .write_pagelist = objlayout_write_pagelist, 1027 .write_pagelist = objlayout_write_pagelist,
1023 .pg_test = objio_pg_test, 1028 .pg_read_ops = &objio_pg_read_ops,
1029 .pg_write_ops = &objio_pg_write_ops,
1024 1030
1025 .free_deviceid_node = objio_free_deviceid_node, 1031 .free_deviceid_node = objio_free_deviceid_node,
1026 1032
@@ -1055,5 +1061,7 @@ objlayout_exit(void)
1055 __func__); 1061 __func__);
1056} 1062}
1057 1063
1064MODULE_ALIAS("nfs-layouttype4-2");
1065
1058module_init(objlayout_init); 1066module_init(objlayout_init);
1059module_exit(objlayout_exit); 1067module_exit(objlayout_exit);
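
The new MODULE_ALIAS ties the module to pNFS layout type 2 (LAYOUT_OSD2_OBJECTS), which lets the client core demand-load it when a server advertises that layout type. A hedged sketch of the loading side; the real call site lives in the pnfs core, and the function here is illustrative:

#include <linux/kmod.h>

/* Illustrative only: demand-load a layout driver by layout type id. */
static int example_load_layoutdriver(u32 layout_id)
{
	/* layout_id == 2 yields "nfs-layouttype4-2", which matches the
	 * MODULE_ALIAS added above. */
	return request_module("nfs-layouttype4-%u", layout_id);
}
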
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index 16fc758e9123..b3918f7ac34d 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map); 170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++); 171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++); 172 layout->olo_num_comps = be32_to_cpup(p++);
173 dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
174 layout->olo_comps_index, layout->olo_num_comps);
175
173 iter->total_comps = layout->olo_num_comps; 176 iter->total_comps = layout->olo_num_comps;
174 return 0; 177 return 0;
175} 178}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 18449f43c568..b60970cc7f1f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
230 */ 230 */
231void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 231void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
232 struct inode *inode, 232 struct inode *inode,
233 int (*doio)(struct nfs_pageio_descriptor *), 233 const struct nfs_pageio_ops *pg_ops,
234 size_t bsize, 234 size_t bsize,
235 int io_flags) 235 int io_flags)
236{ 236{
@@ -240,13 +240,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 desc->pg_bsize = bsize; 240 desc->pg_bsize = bsize;
241 desc->pg_base = 0; 241 desc->pg_base = 0;
242 desc->pg_moreio = 0; 242 desc->pg_moreio = 0;
243 desc->pg_recoalesce = 0;
243 desc->pg_inode = inode; 244 desc->pg_inode = inode;
244 desc->pg_doio = doio; 245 desc->pg_ops = pg_ops;
245 desc->pg_ioflags = io_flags; 246 desc->pg_ioflags = io_flags;
246 desc->pg_error = 0; 247 desc->pg_error = 0;
247 desc->pg_lseg = NULL; 248 desc->pg_lseg = NULL;
248 desc->pg_test = nfs_generic_pg_test;
249 pnfs_pageio_init(desc, inode);
250} 249}
251 250
252/** 251/**
@@ -276,7 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
276 return false; 275 return false;
277 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
278 return false; 277 return false;
279 return pgio->pg_test(pgio, prev, req); 278 return pgio->pg_ops->pg_test(pgio, prev, req);
280} 279}
281 280
282/** 281/**
@@ -297,6 +296,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
297 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
298 return 0; 297 return 0;
299 } else { 298 } else {
299 if (desc->pg_ops->pg_init)
300 desc->pg_ops->pg_init(desc, req);
300 desc->pg_base = req->wb_pgbase; 301 desc->pg_base = req->wb_pgbase;
301 } 302 }
302 nfs_list_remove_request(req); 303 nfs_list_remove_request(req);
@@ -311,7 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
311static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 312static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
312{ 313{
313 if (!list_empty(&desc->pg_list)) { 314 if (!list_empty(&desc->pg_list)) {
314 int error = desc->pg_doio(desc); 315 int error = desc->pg_ops->pg_doio(desc);
315 if (error < 0) 316 if (error < 0)
316 desc->pg_error = error; 317 desc->pg_error = error;
317 else 318 else
@@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
331 * Returns true if the request 'req' was successfully coalesced into the 332 * Returns true if the request 'req' was successfully coalesced into the
332 * existing list of pages 'desc'. 333 * existing list of pages 'desc'.
333 */ 334 */
334int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 335static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
335 struct nfs_page *req) 336 struct nfs_page *req)
336{ 337{
337 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
@@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
340 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
341 return 0; 342 return 0;
342 desc->pg_moreio = 0; 343 desc->pg_moreio = 0;
344 if (desc->pg_recoalesce)
345 return 0;
343 } 346 }
344 return 1; 347 return 1;
345} 348}
346 349
350static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
351{
352 LIST_HEAD(head);
353
354 do {
355 list_splice_init(&desc->pg_list, &head);
356 desc->pg_bytes_written -= desc->pg_count;
357 desc->pg_count = 0;
358 desc->pg_base = 0;
359 desc->pg_recoalesce = 0;
360
361 while (!list_empty(&head)) {
362 struct nfs_page *req;
363
364 req = list_first_entry(&head, struct nfs_page, wb_list);
365 nfs_list_remove_request(req);
366 if (__nfs_pageio_add_request(desc, req))
367 continue;
368 if (desc->pg_error < 0)
369 return 0;
370 break;
371 }
372 } while (desc->pg_recoalesce);
373 return 1;
374}
375
376int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
377 struct nfs_page *req)
378{
379 int ret;
380
381 do {
382 ret = __nfs_pageio_add_request(desc, req);
383 if (ret)
384 break;
385 if (desc->pg_error < 0)
386 break;
387 ret = nfs_do_recoalesce(desc);
388 } while (ret);
389 return ret;
390}
391
347/** 392/**
348 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 393 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
349 * @desc: pointer to io descriptor 394 * @desc: pointer to io descriptor
350 */ 395 */
351void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 396void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
352{ 397{
353 nfs_pageio_doio(desc); 398 for (;;) {
399 nfs_pageio_doio(desc);
400 if (!desc->pg_recoalesce)
401 break;
402 if (!nfs_do_recoalesce(desc))
403 break;
404 }
354} 405}
355 406
356/** 407/**
@@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
369 if (!list_empty(&desc->pg_list)) { 420 if (!list_empty(&desc->pg_list)) {
370 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 421 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
371 if (index != prev->wb_index + 1) 422 if (index != prev->wb_index + 1)
372 nfs_pageio_doio(desc); 423 nfs_pageio_complete(desc);
373 } 424 }
374} 425}
375 426
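
The recoalesce machinery stays behind the existing interface: callers still add requests and flush the descriptor, while nfs_pageio_add_request() and nfs_pageio_complete() internally drain and replay pg_list whenever pg_recoalesce is raised (for example after a pNFS fallback to the MDS). A hedged caller-side sketch, kernel context assumed and the function name illustrative:

/* Illustrative only: push a list of requests through a descriptor. */
static void example_submit(struct nfs_pageio_descriptor *desc,
			   struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct nfs_page *req =
			list_first_entry(pages, struct nfs_page, wb_list);

		nfs_list_remove_request(req);
		if (!nfs_pageio_add_request(desc, req))
			break;		/* desc->pg_error holds the error */
	}
	nfs_pageio_complete(desc);	/* flush; re-runs any recoalesce */
}
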
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 29c0ca7fc347..e550e8836c37 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -28,6 +28,7 @@
28 */ 28 */
29 29
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include <linux/nfs_page.h>
31#include "internal.h" 32#include "internal.h"
32#include "pnfs.h" 33#include "pnfs.h"
33#include "iostat.h" 34#include "iostat.h"
@@ -75,8 +76,11 @@ find_pnfs_driver(u32 id)
75void 76void
76unset_pnfs_layoutdriver(struct nfs_server *nfss) 77unset_pnfs_layoutdriver(struct nfs_server *nfss)
77{ 78{
78 if (nfss->pnfs_curr_ld) 79 if (nfss->pnfs_curr_ld) {
80 if (nfss->pnfs_curr_ld->clear_layoutdriver)
81 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 82 module_put(nfss->pnfs_curr_ld->owner);
83 }
80 nfss->pnfs_curr_ld = NULL; 84 nfss->pnfs_curr_ld = NULL;
81} 85}
82 86
@@ -87,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
87 * @id layout type. Zero (illegal layout type) indicates pNFS not in use. 91 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
88 */ 92 */
89void 93void
90set_pnfs_layoutdriver(struct nfs_server *server, u32 id) 94set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
95 u32 id)
91{ 96{
92 struct pnfs_layoutdriver_type *ld_type = NULL; 97 struct pnfs_layoutdriver_type *ld_type = NULL;
93 98
@@ -114,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
114 goto out_no_driver; 119 goto out_no_driver;
115 } 120 }
116 server->pnfs_curr_ld = ld_type; 121 server->pnfs_curr_ld = ld_type;
122 if (ld_type->set_layoutdriver
123 && ld_type->set_layoutdriver(server, mntfh)) {
124 printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n",
125 __func__, id);
126 module_put(ld_type->owner);
127 goto out_no_driver;
128 }
117 129
118 dprintk("%s: pNFS module for %u set\n", __func__, id); 130 dprintk("%s: pNFS module for %u set\n", __func__, id);
119 return; 131 return;
@@ -189,6 +201,7 @@ static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 201pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{ 202{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; 203 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
204 put_rpccred(lo->plh_lc_cred);
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); 205 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193} 206}
194 207
@@ -223,6 +236,7 @@ static void
223init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 236init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
224{ 237{
225 INIT_LIST_HEAD(&lseg->pls_list); 238 INIT_LIST_HEAD(&lseg->pls_list);
239 INIT_LIST_HEAD(&lseg->pls_lc_list);
226 atomic_set(&lseg->pls_refcount, 1); 240 atomic_set(&lseg->pls_refcount, 1);
227 smp_mb(); 241 smp_mb();
228 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 242 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -448,11 +462,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
448void 462void
449pnfs_destroy_all_layouts(struct nfs_client *clp) 463pnfs_destroy_all_layouts(struct nfs_client *clp)
450{ 464{
465 struct nfs_server *server;
451 struct pnfs_layout_hdr *lo; 466 struct pnfs_layout_hdr *lo;
452 LIST_HEAD(tmp_list); 467 LIST_HEAD(tmp_list);
453 468
469 nfs4_deviceid_mark_client_invalid(clp);
470 nfs4_deviceid_purge_client(clp);
471
454 spin_lock(&clp->cl_lock); 472 spin_lock(&clp->cl_lock);
455 list_splice_init(&clp->cl_layouts, &tmp_list); 473 rcu_read_lock();
474 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
475 if (!list_empty(&server->layouts))
476 list_splice_init(&server->layouts, &tmp_list);
477 }
478 rcu_read_unlock();
456 spin_unlock(&clp->cl_lock); 479 spin_unlock(&clp->cl_lock);
457 480
458 while (!list_empty(&tmp_list)) { 481 while (!list_empty(&tmp_list)) {
@@ -661,6 +684,7 @@ _pnfs_return_layout(struct inode *ino)
661 lrp->args.stateid = stateid; 684 lrp->args.stateid = stateid;
662 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; 685 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
663 lrp->args.inode = ino; 686 lrp->args.inode = ino;
687 lrp->args.layout = lo;
664 lrp->clp = NFS_SERVER(ino)->nfs_client; 688 lrp->clp = NFS_SERVER(ino)->nfs_client;
665 689
666 status = nfs4_proc_layoutreturn(lrp); 690 status = nfs4_proc_layoutreturn(lrp);
@@ -805,7 +829,9 @@ out:
805} 829}
806 830
807static struct pnfs_layout_hdr * 831static struct pnfs_layout_hdr *
808alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) 832alloc_init_layout_hdr(struct inode *ino,
833 struct nfs_open_context *ctx,
834 gfp_t gfp_flags)
809{ 835{
810 struct pnfs_layout_hdr *lo; 836 struct pnfs_layout_hdr *lo;
811 837
@@ -817,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
817 INIT_LIST_HEAD(&lo->plh_segs); 843 INIT_LIST_HEAD(&lo->plh_segs);
818 INIT_LIST_HEAD(&lo->plh_bulk_recall); 844 INIT_LIST_HEAD(&lo->plh_bulk_recall);
819 lo->plh_inode = ino; 845 lo->plh_inode = ino;
846 lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
820 return lo; 847 return lo;
821} 848}
822 849
823static struct pnfs_layout_hdr * 850static struct pnfs_layout_hdr *
824pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) 851pnfs_find_alloc_layout(struct inode *ino,
852 struct nfs_open_context *ctx,
853 gfp_t gfp_flags)
825{ 854{
826 struct nfs_inode *nfsi = NFS_I(ino); 855 struct nfs_inode *nfsi = NFS_I(ino);
827 struct pnfs_layout_hdr *new = NULL; 856 struct pnfs_layout_hdr *new = NULL;
@@ -836,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
836 return nfsi->layout; 865 return nfsi->layout;
837 } 866 }
838 spin_unlock(&ino->i_lock); 867 spin_unlock(&ino->i_lock);
839 new = alloc_init_layout_hdr(ino, gfp_flags); 868 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
840 spin_lock(&ino->i_lock); 869 spin_lock(&ino->i_lock);
841 870
842 if (likely(nfsi->layout == NULL)) /* Won the race? */ 871 if (likely(nfsi->layout == NULL)) /* Won the race? */
@@ -920,7 +949,8 @@ pnfs_update_layout(struct inode *ino,
920 }; 949 };
921 unsigned pg_offset; 950 unsigned pg_offset;
922 struct nfs_inode *nfsi = NFS_I(ino); 951 struct nfs_inode *nfsi = NFS_I(ino);
923 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 952 struct nfs_server *server = NFS_SERVER(ino);
953 struct nfs_client *clp = server->nfs_client;
924 struct pnfs_layout_hdr *lo; 954 struct pnfs_layout_hdr *lo;
925 struct pnfs_layout_segment *lseg = NULL; 955 struct pnfs_layout_segment *lseg = NULL;
926 bool first = false; 956 bool first = false;
@@ -928,7 +958,7 @@ pnfs_update_layout(struct inode *ino,
928 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 958 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
929 return NULL; 959 return NULL;
930 spin_lock(&ino->i_lock); 960 spin_lock(&ino->i_lock);
931 lo = pnfs_find_alloc_layout(ino, gfp_flags); 961 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
932 if (lo == NULL) { 962 if (lo == NULL) {
933 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); 963 dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
934 goto out_unlock; 964 goto out_unlock;
@@ -964,7 +994,7 @@ pnfs_update_layout(struct inode *ino,
964 */ 994 */
965 spin_lock(&clp->cl_lock); 995 spin_lock(&clp->cl_lock);
966 BUG_ON(!list_empty(&lo->plh_layouts)); 996 BUG_ON(!list_empty(&lo->plh_layouts));
967 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 997 list_add_tail(&lo->plh_layouts, &server->layouts);
968 spin_unlock(&clp->cl_lock); 998 spin_unlock(&clp->cl_lock);
969 } 999 }
970 1000
@@ -973,7 +1003,8 @@ pnfs_update_layout(struct inode *ino,
973 arg.offset -= pg_offset; 1003 arg.offset -= pg_offset;
974 arg.length += pg_offset; 1004 arg.length += pg_offset;
975 } 1005 }
976 arg.length = PAGE_CACHE_ALIGN(arg.length); 1006 if (arg.length != NFS4_MAX_UINT64)
1007 arg.length = PAGE_CACHE_ALIGN(arg.length);
977 1008
978 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1009 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
979 if (!lseg && first) { 1010 if (!lseg && first) {
@@ -991,6 +1022,7 @@ out_unlock:
991 spin_unlock(&ino->i_lock); 1022 spin_unlock(&ino->i_lock);
992 goto out; 1023 goto out;
993} 1024}
1025EXPORT_SYMBOL_GPL(pnfs_update_layout);
994 1026
995int 1027int
996pnfs_layout_process(struct nfs4_layoutget *lgp) 1028pnfs_layout_process(struct nfs4_layoutget *lgp)
@@ -1048,35 +1080,71 @@ out_forget_reply:
1048 goto out; 1080 goto out;
1049} 1081}
1050 1082
1083void
1084pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1085{
1086 BUG_ON(pgio->pg_lseg != NULL);
1087
1088 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1089 req->wb_context,
1090 req_offset(req),
1091 req->wb_bytes,
1092 IOMODE_READ,
1093 GFP_KERNEL);
1094 /* If no lseg, fall back to read through mds */
1095 if (pgio->pg_lseg == NULL)
1096 nfs_pageio_reset_read_mds(pgio);
1097
1098}
1099EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1100
1101void
1102pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1103{
1104 BUG_ON(pgio->pg_lseg != NULL);
1105
1106 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1107 req->wb_context,
1108 req_offset(req),
1109 req->wb_bytes,
1110 IOMODE_RW,
1111 GFP_NOFS);
1112 /* If no lseg, fall back to write through mds */
1113 if (pgio->pg_lseg == NULL)
1114 nfs_pageio_reset_write_mds(pgio);
1115}
1116EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1117
1051bool 1118bool
1052pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1119pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
1053 struct nfs_page *req)
1054{ 1120{
1055 enum pnfs_iomode access_type; 1121 struct nfs_server *server = NFS_SERVER(inode);
1056 gfp_t gfp_flags; 1122 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1057 1123
1058 /* We assume that pg_ioflags == 0 iff we're reading a page */ 1124 if (ld == NULL)
1059 if (pgio->pg_ioflags == 0) { 1125 return false;
1060 access_type = IOMODE_READ; 1126 nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0);
1061 gfp_flags = GFP_KERNEL; 1127 return true;
1062 } else { 1128}
1063 access_type = IOMODE_RW;
1064 gfp_flags = GFP_NOFS;
1065 }
1066 1129
1067 if (pgio->pg_lseg == NULL) { 1130bool
1068 if (pgio->pg_count != prev->wb_bytes) 1131pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
1069 return true; 1132{
1070 /* This is first coelesce call for a series of nfs_pages */ 1133 struct nfs_server *server = NFS_SERVER(inode);
1071 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1134 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
1072 prev->wb_context, 1135
1073 req_offset(prev), 1136 if (ld == NULL)
1074 pgio->pg_count, 1137 return false;
1075 access_type, 1138 nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags);
1076 gfp_flags); 1139 return true;
1077 if (pgio->pg_lseg == NULL) 1140}
1078 return true; 1141
1079 } 1142bool
1143pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1144 struct nfs_page *req)
1145{
1146 if (pgio->pg_lseg == NULL)
1147 return nfs_generic_pg_test(pgio, prev, req);
1080 1148
1081 /* 1149 /*
1082 * Test if a nfs_page is fully contained in the pnfs_layout_range. 1150 * Test if a nfs_page is fully contained in the pnfs_layout_range.
@@ -1120,15 +1188,30 @@ pnfs_ld_write_done(struct nfs_write_data *data)
1120} 1188}
1121EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1189EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1122 1190
1123enum pnfs_try_status 1191static void
1192pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1193 struct nfs_write_data *data)
1194{
1195 list_splice_tail_init(&data->pages, &desc->pg_list);
1196 if (data->req && list_empty(&data->req->wb_list))
1197 nfs_list_add_request(data->req, &desc->pg_list);
1198 nfs_pageio_reset_write_mds(desc);
1199 desc->pg_recoalesce = 1;
1200 nfs_writedata_release(data);
1201}
1202
1203static enum pnfs_try_status
1124pnfs_try_to_write_data(struct nfs_write_data *wdata, 1204pnfs_try_to_write_data(struct nfs_write_data *wdata,
1125 const struct rpc_call_ops *call_ops, int how) 1205 const struct rpc_call_ops *call_ops,
1206 struct pnfs_layout_segment *lseg,
1207 int how)
1126{ 1208{
1127 struct inode *inode = wdata->inode; 1209 struct inode *inode = wdata->inode;
1128 enum pnfs_try_status trypnfs; 1210 enum pnfs_try_status trypnfs;
1129 struct nfs_server *nfss = NFS_SERVER(inode); 1211 struct nfs_server *nfss = NFS_SERVER(inode);
1130 1212
1131 wdata->mds_ops = call_ops; 1213 wdata->mds_ops = call_ops;
1214 wdata->lseg = get_lseg(lseg);
1132 1215
1133 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1216 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1134 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1217 inode->i_ino, wdata->args.count, wdata->args.offset, how);
@@ -1144,6 +1227,44 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
1144 return trypnfs; 1227 return trypnfs;
1145} 1228}
1146 1229
1230static void
1231pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
1232{
1233 struct nfs_write_data *data;
1234 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1235 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1236
1237 desc->pg_lseg = NULL;
1238 while (!list_empty(head)) {
1239 enum pnfs_try_status trypnfs;
1240
1241 data = list_entry(head->next, struct nfs_write_data, list);
1242 list_del_init(&data->list);
1243
1244 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
1245 if (trypnfs == PNFS_NOT_ATTEMPTED)
1246 pnfs_write_through_mds(desc, data);
1247 }
1248 put_lseg(lseg);
1249}
1250
1251int
1252pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1253{
1254 LIST_HEAD(head);
1255 int ret;
1256
1257 ret = nfs_generic_flush(desc, &head);
1258 if (ret != 0) {
1259 put_lseg(desc->pg_lseg);
1260 desc->pg_lseg = NULL;
1261 return ret;
1262 }
1263 pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags);
1264 return 0;
1265}
1266EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1267
1147/* 1268/*
1148 * Called by non rpc-based layout drivers 1269 * Called by non rpc-based layout drivers
1149 */ 1270 */
@@ -1167,18 +1288,32 @@ pnfs_ld_read_done(struct nfs_read_data *data)
1167} 1288}
1168EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1289EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1169 1290
1291static void
1292pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1293 struct nfs_read_data *data)
1294{
1295 list_splice_tail_init(&data->pages, &desc->pg_list);
1296 if (data->req && list_empty(&data->req->wb_list))
1297 nfs_list_add_request(data->req, &desc->pg_list);
1298 nfs_pageio_reset_read_mds(desc);
1299 desc->pg_recoalesce = 1;
1300 nfs_readdata_release(data);
1301}
1302
1170/* 1303/*
1171 * Call the appropriate parallel I/O subsystem read function. 1304 * Call the appropriate parallel I/O subsystem read function.
1172 */ 1305 */
1173enum pnfs_try_status 1306static enum pnfs_try_status
1174pnfs_try_to_read_data(struct nfs_read_data *rdata, 1307pnfs_try_to_read_data(struct nfs_read_data *rdata,
1175 const struct rpc_call_ops *call_ops) 1308 const struct rpc_call_ops *call_ops,
1309 struct pnfs_layout_segment *lseg)
1176{ 1310{
1177 struct inode *inode = rdata->inode; 1311 struct inode *inode = rdata->inode;
1178 struct nfs_server *nfss = NFS_SERVER(inode); 1312 struct nfs_server *nfss = NFS_SERVER(inode);
1179 enum pnfs_try_status trypnfs; 1313 enum pnfs_try_status trypnfs;
1180 1314
1181 rdata->mds_ops = call_ops; 1315 rdata->mds_ops = call_ops;
1316 rdata->lseg = get_lseg(lseg);
1182 1317
1183 dprintk("%s: Reading ino:%lu %u@%llu\n", 1318 dprintk("%s: Reading ino:%lu %u@%llu\n",
1184 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1319 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
@@ -1194,17 +1329,56 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
1194 return trypnfs; 1329 return trypnfs;
1195} 1330}
1196 1331
1332static void
1333pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
1334{
1335 struct nfs_read_data *data;
1336 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1337 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1338
1339 desc->pg_lseg = NULL;
1340 while (!list_empty(head)) {
1341 enum pnfs_try_status trypnfs;
1342
1343 data = list_entry(head->next, struct nfs_read_data, list);
1344 list_del_init(&data->list);
1345
1346 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
1347 if (trypnfs == PNFS_NOT_ATTEMPTED)
1348 pnfs_read_through_mds(desc, data);
1349 }
1350 put_lseg(lseg);
1351}
1352
1353int
1354pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1355{
1356 LIST_HEAD(head);
1357 int ret;
1358
1359 ret = nfs_generic_pagein(desc, &head);
1360 if (ret != 0) {
1361 put_lseg(desc->pg_lseg);
1362 desc->pg_lseg = NULL;
1363 return ret;
1364 }
1365 pnfs_do_multiple_reads(desc, &head);
1366 return 0;
1367}
1368EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1369
1197/* 1370/*
1198 * Currently there is only one (whole file) write lseg. 1371 * There can be multiple RW segments.
1199 */ 1372 */
1200static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) 1373static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
1201{ 1374{
1202 struct pnfs_layout_segment *lseg, *rv = NULL; 1375 struct pnfs_layout_segment *lseg;
1203 1376
1204 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) 1377 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1205 if (lseg->pls_range.iomode == IOMODE_RW) 1378 if (lseg->pls_range.iomode == IOMODE_RW &&
1206 rv = lseg; 1379 test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
1207 return rv; 1380 list_add(&lseg->pls_lc_list, listp);
1381 }
1208} 1382}
1209 1383
1210void 1384void
@@ -1216,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1216 1390
1217 spin_lock(&nfsi->vfs_inode.i_lock); 1391 spin_lock(&nfsi->vfs_inode.i_lock);
1218 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1392 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1219 /* references matched in nfs4_layoutcommit_release */
1220 get_lseg(wdata->lseg);
1221 wdata->lseg->pls_lc_cred =
1222 get_rpccred(wdata->args.context->state->owner->so_cred);
1223 mark_as_dirty = true; 1393 mark_as_dirty = true;
1224 dprintk("%s: Set layoutcommit for inode %lu ", 1394 dprintk("%s: Set layoutcommit for inode %lu ",
1225 __func__, wdata->inode->i_ino); 1395 __func__, wdata->inode->i_ino);
1226 } 1396 }
1227 if (end_pos > wdata->lseg->pls_end_pos) 1397 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
1228 wdata->lseg->pls_end_pos = end_pos; 1398 /* references matched in nfs4_layoutcommit_release */
1399 get_lseg(wdata->lseg);
1400 }
1401 if (end_pos > nfsi->layout->plh_lwb)
1402 nfsi->layout->plh_lwb = end_pos;
1229 spin_unlock(&nfsi->vfs_inode.i_lock); 1403 spin_unlock(&nfsi->vfs_inode.i_lock);
1404 dprintk("%s: lseg %p end_pos %llu\n",
1405 __func__, wdata->lseg, nfsi->layout->plh_lwb);
1230 1406
1231 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 1407 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1232 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 1408 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1235,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1235} 1411}
1236EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1412EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1237 1413
1414void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1415{
1416 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1417
1418 if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1419 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
1420}
1421
1238/* 1422/*
1239 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 1423 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1240 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 1424 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
@@ -1248,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1248{ 1432{
1249 struct nfs4_layoutcommit_data *data; 1433 struct nfs4_layoutcommit_data *data;
1250 struct nfs_inode *nfsi = NFS_I(inode); 1434 struct nfs_inode *nfsi = NFS_I(inode);
1251 struct pnfs_layout_segment *lseg;
1252 struct rpc_cred *cred;
1253 loff_t end_pos; 1435 loff_t end_pos;
1254 int status = 0; 1436 int status = 0;
1255 1437
@@ -1266,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1266 goto out; 1448 goto out;
1267 } 1449 }
1268 1450
1451 INIT_LIST_HEAD(&data->lseg_list);
1269 spin_lock(&inode->i_lock); 1452 spin_lock(&inode->i_lock);
1270 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 1453 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1271 spin_unlock(&inode->i_lock); 1454 spin_unlock(&inode->i_lock);
1272 kfree(data); 1455 kfree(data);
1273 goto out; 1456 goto out;
1274 } 1457 }
1275 /*
1276 * Currently only one (whole file) write lseg which is referenced
1277 * in pnfs_set_layoutcommit and will be found.
1278 */
1279 lseg = pnfs_list_write_lseg(inode);
1280 1458
1281 end_pos = lseg->pls_end_pos; 1459 pnfs_list_write_lseg(inode, &data->lseg_list);
1282 cred = lseg->pls_lc_cred; 1460
1283 lseg->pls_end_pos = 0; 1461 end_pos = nfsi->layout->plh_lwb;
1284 lseg->pls_lc_cred = NULL; 1462 nfsi->layout->plh_lwb = 0;
1285 1463
1286 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, 1464 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1287 sizeof(nfsi->layout->plh_stateid.data)); 1465 sizeof(nfsi->layout->plh_stateid.data));
1288 spin_unlock(&inode->i_lock); 1466 spin_unlock(&inode->i_lock);
1289 1467
1290 data->args.inode = inode; 1468 data->args.inode = inode;
1291 data->lseg = lseg; 1469 data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
1292 data->cred = cred;
1293 nfs_fattr_init(&data->fattr); 1470 nfs_fattr_init(&data->fattr);
1294 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 1471 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1295 data->res.fattr = &data->fattr; 1472 data->res.fattr = &data->fattr;
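
With the per-lseg pls_lc_cred and pls_end_pos fields gone, LAYOUTCOMMIT bookkeeping is layout-wide: every successful pNFS write flags its lseg with NFS_LSEG_LAYOUTCOMMIT and advances plh_lwb, and pnfs_layoutcommit_inode() later gathers all flagged RW segments onto lseg_list. For a layout driver the contract reduces to a single call; a hedged sketch:

/* Illustrative only: a layout driver's write-completion hook. */
static void example_write_done(struct nfs_write_data *wdata)
{
	/* Sets NFS_INO_LAYOUTCOMMIT on the inode, flags wdata->lseg with
	 * NFS_LSEG_LAYOUTCOMMIT, and advances plh_lwb past this write. */
	pnfs_set_layoutcommit(wdata);
}
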
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 96bf4e6f45be..01cbfd54f3cb 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,16 @@
36enum { 36enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
38 NFS_LSEG_ROC, /* roc bit received from server */ 38 NFS_LSEG_ROC, /* roc bit received from server */
39 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
39}; 40};
40 41
41struct pnfs_layout_segment { 42struct pnfs_layout_segment {
42 struct list_head pls_list; 43 struct list_head pls_list;
44 struct list_head pls_lc_list;
43 struct pnfs_layout_range pls_range; 45 struct pnfs_layout_range pls_range;
44 atomic_t pls_refcount; 46 atomic_t pls_refcount;
45 unsigned long pls_flags; 47 unsigned long pls_flags;
46 struct pnfs_layout_hdr *pls_layout; 48 struct pnfs_layout_hdr *pls_layout;
47 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
48 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
49}; 49};
50 50
51enum pnfs_try_status { 51enum pnfs_try_status {
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
80 struct module *owner; 80 struct module *owner;
81 unsigned flags; 81 unsigned flags;
82 82
83 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
84 int (*clear_layoutdriver) (struct nfs_server *);
85
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); 86 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *); 87 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85 88
@@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type {
87 void (*free_lseg) (struct pnfs_layout_segment *lseg); 90 void (*free_lseg) (struct pnfs_layout_segment *lseg);
88 91
89 /* test for nfs page cache coalescing */ 92 /* test for nfs page cache coalescing */
90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 93 const struct nfs_pageio_ops *pg_read_ops;
94 const struct nfs_pageio_ops *pg_write_ops;
91 95
92 /* Returns true if layoutdriver wants to divert this request to 96 /* Returns true if layoutdriver wants to divert this request to
93 * driver's commit routine. 97 * driver's commit routine.
@@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type {
109 struct xdr_stream *xdr, 113 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args); 114 const struct nfs4_layoutreturn_args *args);
111 115
116 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
117
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 118 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr, 119 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args); 120 const struct nfs4_layoutcommit_args *args);
@@ -124,6 +130,8 @@ struct pnfs_layout_hdr {
124 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ 130 unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
125 u32 plh_barrier; /* ignore lower seqids */ 131 u32 plh_barrier; /* ignore lower seqids */
126 unsigned long plh_flags; 132 unsigned long plh_flags;
133 loff_t plh_lwb; /* last write byte for layoutcommit */
134 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
127 struct inode *plh_inode; 135 struct inode *plh_inode;
128}; 136};
129 137
@@ -136,10 +144,21 @@ struct pnfs_device {
136 unsigned int pglen; 144 unsigned int pglen;
137}; 145};
138 146
147#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
148
149struct pnfs_devicelist {
150 unsigned int eof;
151 unsigned int num_devs;
152 struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
153};
154
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 155extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 156extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 157
142/* nfs4proc.c */ 158/* nfs4proc.c */
159extern int nfs4_proc_getdevicelist(struct nfs_server *server,
160 const struct nfs_fh *fh,
161 struct pnfs_devicelist *devlist);
143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 162extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
144 struct pnfs_device *dev); 163 struct pnfs_device *dev);
145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 164extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
@@ -148,16 +167,16 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
148/* pnfs.c */ 167/* pnfs.c */
149void get_layout_hdr(struct pnfs_layout_hdr *lo); 168void get_layout_hdr(struct pnfs_layout_hdr *lo);
150void put_lseg(struct pnfs_layout_segment *lseg); 169void put_lseg(struct pnfs_layout_segment *lseg);
151struct pnfs_layout_segment * 170
152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 171bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
153 loff_t pos, u64 count, enum pnfs_iomode access_type, 172bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
154 gfp_t gfp_flags); 173
155void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 174void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
156void unset_pnfs_layoutdriver(struct nfs_server *); 175void unset_pnfs_layoutdriver(struct nfs_server *);
157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 176void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
158 const struct rpc_call_ops *, int); 177int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 178void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *);
160 const struct rpc_call_ops *); 179int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); 180bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
162int pnfs_layout_process(struct nfs4_layoutget *lgp); 181int pnfs_layout_process(struct nfs4_layoutget *lgp);
163void pnfs_free_lseg_list(struct list_head *tmp_list); 182void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -178,10 +197,24 @@ void pnfs_roc_release(struct inode *ino);
178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 197void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
179bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 198bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
180void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 199void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
200void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
181int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 201int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *); 202int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *); 203int pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *); 204int pnfs_ld_read_done(struct nfs_read_data *);
205struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
206 struct nfs_open_context *ctx,
207 loff_t pos,
208 u64 count,
209 enum pnfs_iomode iomode,
210 gfp_t gfp_flags);
211
212void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
213
214/* nfs4_deviceid_flags */
215enum {
216 NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
217};
185 218
186/* pnfs_dev.c */ 219/* pnfs_dev.c */
187struct nfs4_deviceid_node { 220struct nfs4_deviceid_node {
@@ -189,13 +222,13 @@ struct nfs4_deviceid_node {
189 struct hlist_node tmpnode; 222 struct hlist_node tmpnode;
190 const struct pnfs_layoutdriver_type *ld; 223 const struct pnfs_layoutdriver_type *ld;
191 const struct nfs_client *nfs_client; 224 const struct nfs_client *nfs_client;
225 unsigned long flags;
192 struct nfs4_deviceid deviceid; 226 struct nfs4_deviceid deviceid;
193 atomic_t ref; 227 atomic_t ref;
194}; 228};
195 229
196void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); 230void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
197struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 231struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 232void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
200void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 233void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
201 const struct pnfs_layoutdriver_type *, 234 const struct pnfs_layoutdriver_type *,
@@ -293,15 +326,6 @@ static inline int pnfs_return_layout(struct inode *ino)
293 return 0; 326 return 0;
294} 327}
295 328
296static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
297 struct inode *inode)
298{
299 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
300
301 if (ld)
302 pgio->pg_test = ld->pg_test;
303}
304
305#else /* CONFIG_NFS_V4_1 */ 329#else /* CONFIG_NFS_V4_1 */
306 330
307static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 331static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -322,28 +346,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
322{ 346{
323} 347}
324 348
325static inline struct pnfs_layout_segment *
326pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
327 loff_t pos, u64 count, enum pnfs_iomode access_type,
328 gfp_t gfp_flags)
329{
330 return NULL;
331}
332
333static inline enum pnfs_try_status
334pnfs_try_to_read_data(struct nfs_read_data *data,
335 const struct rpc_call_ops *call_ops)
336{
337 return PNFS_NOT_ATTEMPTED;
338}
339
340static inline enum pnfs_try_status
341pnfs_try_to_write_data(struct nfs_write_data *data,
342 const struct rpc_call_ops *call_ops, int how)
343{
344 return PNFS_NOT_ATTEMPTED;
345}
346
347static inline int pnfs_return_layout(struct inode *ino) 349static inline int pnfs_return_layout(struct inode *ino)
348{ 350{
349 return 0; 351 return 0;
@@ -377,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
377 return false; 379 return false;
378} 380}
379 381
380static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) 382static inline void set_pnfs_layoutdriver(struct nfs_server *s,
383 const struct nfs_fh *mntfh, u32 id)
381{ 384{
382} 385}
383 386
@@ -385,9 +388,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
385{ 388{
386} 389}
387 390
388static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, 391static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
389 struct inode *inode) 392{
393 return false;
394}
395
396static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags)
390{ 397{
398 return false;
391} 399}
392 400
393static inline void 401static inline void
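
For a layout driver, migrating to this interface means publishing complete nfs_pageio_ops tables instead of a single pg_test hook. A hedged template assembled purely from the generic helpers this header exports; the table names are illustrative:

/* Illustrative template for a layout driver. */
static const struct nfs_pageio_ops example_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,	/* attach a read lseg */
	.pg_test = pnfs_generic_pg_test,	/* coalesce within the lseg */
	.pg_doio = pnfs_generic_pg_readpages,	/* issue through the layout */
};

static const struct nfs_pageio_ops example_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};
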
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index f0f8e1e22f6c..6fda5228ef56 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
100 100
101 rcu_read_lock(); 101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash); 102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref)) 103 if (d != NULL)
104 d = NULL; 104 atomic_inc(&d->ref);
105 rcu_read_unlock(); 105 rcu_read_unlock();
106 return d; 106 return d;
107} 107}
@@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116 116
117/* 117/*
118 * Unhash and put deviceid 118 * Remove a deviceid from cache
119 * 119 *
120 * @clp nfs_client associated with deviceid 120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash 121 * @id the deviceid to unhash
122 * 122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. 123 * The node is unhashed; it is freed once its refcount drops to zero.
124 */ 124 */
125struct nfs4_deviceid_node * 125void
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, 126nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id) 127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{ 128{
129 struct nfs4_deviceid_node *d; 129 struct nfs4_deviceid_node *d;
@@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
134 rcu_read_unlock(); 134 rcu_read_unlock();
135 if (!d) { 135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock); 136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL; 137 return;
138 } 138 }
139 hlist_del_init_rcu(&d->node); 139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock); 140 spin_unlock(&nfs4_deviceid_lock);
@@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
142 142
143 /* balance the initial ref set in pnfs_insert_deviceid */ 143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref)) 144 if (atomic_dec_and_test(&d->ref))
145 return d; 145 d->ld->free_deviceid_node(d);
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167} 146}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 147EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169 148
@@ -177,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
177 INIT_HLIST_NODE(&d->tmpnode); 156 INIT_HLIST_NODE(&d->tmpnode);
178 d->ld = ld; 157 d->ld = ld;
179 d->nfs_client = nfs_client; 158 d->nfs_client = nfs_client;
159 d->flags = 0;
180 d->deviceid = *id; 160 d->deviceid = *id;
181 atomic_set(&d->ref, 1); 161 atomic_set(&d->ref, 1);
182} 162}
@@ -221,16 +201,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
221 * 201 *
222 * @d deviceid node to put 202 * @d deviceid node to put
223 * 203 *
224 * @ret true iff the node was deleted 204 * return true iff the node was deleted
205 * Note that the test for d->ref == 0 is sufficient to establish
206 * that the node is no longer hashed in the global device id cache.
225 */ 207 */
226bool 208bool
227nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) 209nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
228{ 210{
229 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) 211 if (!atomic_dec_and_test(&d->ref))
230 return false; 212 return false;
231 hlist_del_init_rcu(&d->node);
232 spin_unlock(&nfs4_deviceid_lock);
233 synchronize_rcu();
234 d->ld->free_deviceid_node(d); 213 d->ld->free_deviceid_node(d);
235 return true; 214 return true;
236} 215}
@@ -275,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp)
275 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) 254 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
276 _deviceid_purge_client(clp, h); 255 _deviceid_purge_client(clp, h);
277} 256}
257
258/*
259 * Stop use of all deviceids associated with an nfs_client
260 */
261void
262nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
263{
264 struct nfs4_deviceid_node *d;
265 struct hlist_node *n;
266 int i;
267
268 rcu_read_lock();
269 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
270 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node)
271 if (d->nfs_client == clp)
272 set_bit(NFS_DEVICEID_INVALID, &d->flags);
273 }
274 rcu_read_unlock();
275}
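
nfs4_deviceid_mark_client_invalid() only sets a flag; nothing is unhashed or freed on that path, so deviceid consumers are expected to test the bit before trusting a cached entry. A hedged sketch of that check; the function name is illustrative:

/* Illustrative only: skip deviceids invalidated by a clientid recall. */
static bool example_deviceid_usable(const struct nfs4_deviceid_node *d)
{
	return d && !test_bit(NFS_DEVICEID_INVALID, &d->flags);
}
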
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a68679f538fc..2171c043ab08 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,8 +30,7 @@
30 30
31#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
32 32
33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); 33static const struct nfs_pageio_ops nfs_pageio_read_ops;
34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
35static const struct rpc_call_ops nfs_read_partial_ops; 34static const struct rpc_call_ops nfs_read_partial_ops;
36static const struct rpc_call_ops nfs_read_full_ops; 35static const struct rpc_call_ops nfs_read_full_ops;
37 36
@@ -68,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
68 mempool_free(p, nfs_rdata_mempool); 67 mempool_free(p, nfs_rdata_mempool);
69} 68}
70 69
71static void nfs_readdata_release(struct nfs_read_data *rdata) 70void nfs_readdata_release(struct nfs_read_data *rdata)
72{ 71{
73 put_lseg(rdata->lseg); 72 put_lseg(rdata->lseg);
74 put_nfs_open_context(rdata->args.context); 73 put_nfs_open_context(rdata->args.context);
@@ -113,6 +112,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
113 } 112 }
114} 113}
115 114
115static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
116 struct inode *inode)
117{
118 nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
119 NFS_SERVER(inode)->rsize, 0);
120}
121
122void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
123{
124 pgio->pg_ops = &nfs_pageio_read_ops;
125 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
126}
127EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
128
129static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
130 struct inode *inode)
131{
132 if (!pnfs_pageio_init_read(pgio, inode))
133 nfs_pageio_init_read_mds(pgio, inode);
134}
135
116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 136int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
117 struct page *page) 137 struct page *page)
118{ 138{
@@ -131,14 +151,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
131 if (len < PAGE_CACHE_SIZE) 151 if (len < PAGE_CACHE_SIZE)
132 zero_user_segment(page, len, PAGE_CACHE_SIZE); 152 zero_user_segment(page, len, PAGE_CACHE_SIZE);
133 153
134 nfs_pageio_init(&pgio, inode, NULL, 0, 0); 154 nfs_pageio_init_read(&pgio, inode);
135 nfs_list_add_request(new, &pgio.pg_list); 155 nfs_pageio_add_request(&pgio, new);
136 pgio.pg_count = len; 156 nfs_pageio_complete(&pgio);
137
138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
139 nfs_pagein_multi(&pgio);
140 else
141 nfs_pagein_one(&pgio);
142 return 0; 157 return 0;
143} 158}
144 159
@@ -202,17 +217,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read);
202/* 217/*
203 * Set up the NFS read request struct 218 * Set up the NFS read request struct
204 */ 219 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, 220static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops, 221 unsigned int count, unsigned int offset)
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{ 222{
210 struct inode *inode = req->wb_context->dentry->d_inode; 223 struct inode *inode = req->wb_context->dentry->d_inode;
211 224
212 data->req = req; 225 data->req = req;
213 data->inode = inode; 226 data->inode = inode;
214 data->cred = req->wb_context->cred; 227 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
216 228
217 data->args.fh = NFS_FH(inode); 229 data->args.fh = NFS_FH(inode);
218 data->args.offset = req_offset(req) + offset; 230 data->args.offset = req_offset(req) + offset;
@@ -226,14 +238,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
226 data->res.count = count; 238 data->res.count = count;
227 data->res.eof = 0; 239 data->res.eof = 0;
228 nfs_fattr_init(&data->fattr); 240 nfs_fattr_init(&data->fattr);
241}
229 242
230 if (data->lseg && 243static int nfs_do_read(struct nfs_read_data *data,
231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) 244 const struct rpc_call_ops *call_ops)
232 return 0; 245{
246 struct inode *inode = data->args.context->dentry->d_inode;
233 247
234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); 248 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
235} 249}
236 250
251static int
252nfs_do_multiple_reads(struct list_head *head,
253 const struct rpc_call_ops *call_ops)
254{
255 struct nfs_read_data *data;
256 int ret = 0;
257
258 while (!list_empty(head)) {
259 int ret2;
260
261 data = list_entry(head->next, struct nfs_read_data, list);
262 list_del_init(&data->list);
263
264 ret2 = nfs_do_read(data, call_ops);
265 if (ret == 0)
266 ret = ret2;
267 }
268 return ret;
269}
270
237static void 271static void
238nfs_async_read_error(struct list_head *head) 272nfs_async_read_error(struct list_head *head)
239{ 273{
@@ -260,20 +294,19 @@ nfs_async_read_error(struct list_head *head)
260 * won't see the new data until our attribute cache is updated. This is more 294 * won't see the new data until our attribute cache is updated. This is more
261 * or less conventional NFS client behavior. 295 * or less conventional NFS client behavior.
262 */ 296 */
263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) 297static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
264{ 298{
265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 299 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
266 struct page *page = req->wb_page; 300 struct page *page = req->wb_page;
267 struct nfs_read_data *data; 301 struct nfs_read_data *data;
268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; 302 size_t rsize = desc->pg_bsize, nbytes;
269 unsigned int offset; 303 unsigned int offset;
270 int requests = 0; 304 int requests = 0;
271 int ret = 0; 305 int ret = 0;
272 struct pnfs_layout_segment *lseg;
273 LIST_HEAD(list);
274 306
275 nfs_list_remove_request(req); 307 nfs_list_remove_request(req);
276 308
309 offset = 0;
277 nbytes = desc->pg_count; 310 nbytes = desc->pg_count;
278 do { 311 do {
279 size_t len = min(nbytes,rsize); 312 size_t len = min(nbytes,rsize);
@@ -281,45 +314,21 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
281 data = nfs_readdata_alloc(1); 314 data = nfs_readdata_alloc(1);
282 if (!data) 315 if (!data)
283 goto out_bad; 316 goto out_bad;
284 list_add(&data->pages, &list); 317 data->pagevec[0] = page;
318 nfs_read_rpcsetup(req, data, len, offset);
319 list_add(&data->list, res);
285 requests++; 320 requests++;
286 nbytes -= len; 321 nbytes -= len;
322 offset += len;
287 } while(nbytes != 0); 323 } while(nbytes != 0);
288 atomic_set(&req->wb_complete, requests); 324 atomic_set(&req->wb_complete, requests);
289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
294 ClearPageError(page); 325 ClearPageError(page);
295 offset = 0; 326 desc->pg_rpc_callops = &nfs_read_partial_ops;
296 nbytes = desc->pg_count;
297 do {
298 int ret2;
299
300 data = list_entry(list.next, struct nfs_read_data, pages);
301 list_del_init(&data->pages);
302
303 data->pagevec[0] = page;
304
305 if (nbytes < rsize)
306 rsize = nbytes;
307 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
308 rsize, offset, lseg);
309 if (ret == 0)
310 ret = ret2;
311 offset += rsize;
312 nbytes -= rsize;
313 } while (nbytes != 0);
314 put_lseg(lseg);
315 desc->pg_lseg = NULL;
316
317 return ret; 327 return ret;
318
319out_bad: 328out_bad:
320 while (!list_empty(&list)) { 329 while (!list_empty(res)) {
321 data = list_entry(list.next, struct nfs_read_data, pages); 330 data = list_entry(res->next, struct nfs_read_data, list);
322 list_del(&data->pages); 331 list_del(&data->list);
323 nfs_readdata_free(data); 332 nfs_readdata_free(data);
324 } 333 }
325 SetPageError(page); 334 SetPageError(page);
@@ -327,19 +336,19 @@ out_bad:
327 return -ENOMEM; 336 return -ENOMEM;
328} 337}
329 338
330static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) 339static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
331{ 340{
332 struct nfs_page *req; 341 struct nfs_page *req;
333 struct page **pages; 342 struct page **pages;
334 struct nfs_read_data *data; 343 struct nfs_read_data *data;
335 struct list_head *head = &desc->pg_list; 344 struct list_head *head = &desc->pg_list;
336 struct pnfs_layout_segment *lseg = desc->pg_lseg; 345 int ret = 0;
337 int ret = -ENOMEM;
338 346
339 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, 347 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
340 desc->pg_count)); 348 desc->pg_count));
341 if (!data) { 349 if (!data) {
342 nfs_async_read_error(head); 350 nfs_async_read_error(head);
351 ret = -ENOMEM;
343 goto out; 352 goto out;
344 } 353 }
345 354
@@ -352,19 +361,37 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
352 *pages++ = req->wb_page; 361 *pages++ = req->wb_page;
353 } 362 }
354 req = nfs_list_entry(data->pages.next); 363 req = nfs_list_entry(data->pages.next);
355 if ((!lseg) && list_is_singular(&data->pages))
356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
359 364
360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 365 nfs_read_rpcsetup(req, data, desc->pg_count, 0);
361 0, lseg); 366 list_add(&data->list, res);
367 desc->pg_rpc_callops = &nfs_read_full_ops;
362out: 368out:
363 put_lseg(lseg);
364 desc->pg_lseg = NULL;
365 return ret; 369 return ret;
366} 370}
367 371
372int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head)
373{
374 if (desc->pg_bsize < PAGE_CACHE_SIZE)
375 return nfs_pagein_multi(desc, head);
376 return nfs_pagein_one(desc, head);
377}
378
379static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
380{
381 LIST_HEAD(head);
382 int ret;
383
384 ret = nfs_generic_pagein(desc, &head);
385 if (ret == 0)
386 ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops);
387 return ret;
388}
389
390static const struct nfs_pageio_ops nfs_pageio_read_ops = {
391 .pg_test = nfs_generic_pg_test,
392 .pg_doio = nfs_generic_pg_readpages,
393};
394
368/* 395/*
369 * This is the callback from RPC telling us whether a reply was 396 * This is the callback from RPC telling us whether a reply was
370 * received or some error occurred (timeout or socket shutdown). 397 * received or some error occurred (timeout or socket shutdown).
@@ -635,8 +662,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
635 .pgio = &pgio, 662 .pgio = &pgio,
636 }; 663 };
637 struct inode *inode = mapping->host; 664 struct inode *inode = mapping->host;
638 struct nfs_server *server = NFS_SERVER(inode);
639 size_t rsize = server->rsize;
640 unsigned long npages; 665 unsigned long npages;
641 int ret = -ESTALE; 666 int ret = -ESTALE;
642 667
@@ -664,10 +689,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
664 if (ret == 0) 689 if (ret == 0)
665 goto read_complete; /* all pages were read */ 690 goto read_complete; /* all pages were read */
666 691
667 if (rsize < PAGE_CACHE_SIZE) 692 nfs_pageio_init_read(&pgio, inode);
668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
669 else
670 nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0);
671 693
672 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 694 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
673 695
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8d6864c2a5fa..b2fbbde58e44 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
147 147
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret;
151 void *devname_garbage = NULL; 151 void *devname_garbage = NULL;
152 152
153 /* 153 /*
@@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
155 * the sillyrename information to the aliased dentry. 155 * the sillyrename information to the aliased dentry.
156 */ 156 */
157 nfs_free_dname(data); 157 nfs_free_dname(data);
158 ret = nfs_copy_dname(alias, data);
158 spin_lock(&alias->d_lock); 159 spin_lock(&alias->d_lock);
159 if (alias->d_inode != NULL && 160 if (ret == 0 && alias->d_inode != NULL &&
160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 161 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata; 162 devname_garbage = alias->d_fsdata;
162 alias->d_fsdata = data; 163 alias->d_fsdata = data;
163 alias->d_flags |= DCACHE_NFSFS_RENAMED; 164 alias->d_flags |= DCACHE_NFSFS_RENAMED;
164 ret = 1; 165 ret = 1;
165 } 166 } else
167 ret = 0;
166 spin_unlock(&alias->d_lock); 168 spin_unlock(&alias->d_lock);
167 nfs_dec_sillycount(dir); 169 nfs_dec_sillycount(dir);
168 dput(alias); 170 dput(alias);
@@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
171 * point dentry is definitely not a root, so we won't need 173 * point dentry is definitely not a root, so we won't need
172 * that anymore. 174 * that anymore.
173 */ 175 */
174 if (devname_garbage) 176 kfree(devname_garbage);
175 kfree(devname_garbage);
176 return ret; 177 return ret;
177 } 178 }
178 data->dir = igrab(dir); 179 data->dir = igrab(dir);
@@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
204 if (parent == NULL) 205 if (parent == NULL)
205 goto out_free; 206 goto out_free;
206 dir = parent->d_inode; 207 dir = parent->d_inode;
207 if (nfs_copy_dname(dentry, data) != 0)
208 goto out_dput;
209 /* Non-exclusive lock protects against concurrent lookup() calls */ 208 /* Non-exclusive lock protects against concurrent lookup() calls */
210 spin_lock(&dir->i_lock); 209 spin_lock(&dir->i_lock);
211 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { 210 if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
@@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
366 struct nfs_renamedata *data = calldata; 365 struct nfs_renamedata *data = calldata;
367 struct inode *old_dir = data->old_dir; 366 struct inode *old_dir = data->old_dir;
368 struct inode *new_dir = data->new_dir; 367 struct inode *new_dir = data->new_dir;
368 struct dentry *old_dentry = data->old_dentry;
369 struct dentry *new_dentry = data->new_dentry;
369 370
370 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { 371 if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
371 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); 372 nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client);
@@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
373 } 374 }
374 375
375 if (task->tk_status != 0) { 376 if (task->tk_status != 0) {
376 nfs_cancel_async_unlink(data->old_dentry); 377 nfs_cancel_async_unlink(old_dentry);
377 return; 378 return;
378 } 379 }
379 380
380 nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); 381 d_drop(old_dentry);
381 d_move(data->old_dentry, data->new_dentry); 382 d_drop(new_dentry);
382} 383}
383 384
384/** 385/**
@@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
501 * and only performs the unlink once the last reference to it is put. 502 * and only performs the unlink once the last reference to it is put.
502 * 503 *
503 * The final cleanup is done during dentry_iput. 504 * The final cleanup is done during dentry_iput.
505 *
506 * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server
507 * could take responsibility for keeping open files referenced. The server
508 * would also need to ensure that opened-but-deleted files were kept over
509 * reboots. However, we may not assume a server does so. (RFC 5661
510 * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can
511 * use to advertise that it does this; some day we may take advantage of
512 * it.))
504 */ 513 */
505int 514int
506nfs_sillyrename(struct inode *dir, struct dentry *dentry) 515nfs_sillyrename(struct inode *dir, struct dentry *dentry)
@@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
560 if (error) 569 if (error)
561 goto out_dput; 570 goto out_dput;
562 571
572 /* populate unlinkdata with the right dname */
573 error = nfs_copy_dname(sdentry,
574 (struct nfs_unlinkdata *)dentry->d_fsdata);
575 if (error) {
576 nfs_cancel_async_unlink(dentry);
577 goto out_dput;
578 }
579
563 /* run the rename task, undo unlink if it fails */ 580 /* run the rename task, undo unlink if it fails */
564 task = nfs_async_rename(dir, dir, dentry, sdentry); 581 task = nfs_async_rename(dir, dir, dentry, sdentry);
565 if (IS_ERR(task)) { 582 if (IS_ERR(task)) {
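
Net effect of the unlink.c changes on ordering: the silly name is copied into the
unlinkdata before the rename task is started (and the queued unlink is cancelled if the
copy fails) instead of inside nfs_call_unlink(), and a failed async rename now just
unhashes both dentries rather than d_move()ing them. Condensed, the nfs_sillyrename()
flow becomes roughly the following (a sketch; do_rename_and_wait() stands in for the
task setup and wait code):

	static int sillyrename_flow_sketch(struct inode *dir,
					   struct dentry *dentry,
					   struct dentry *sdentry)
	{
		int error;

		error = nfs_async_unlink(dir, dentry);	/* allocate unlinkdata */
		if (error)
			return error;

		/* attach the right dname up front; undo the queued
		 * unlink if that fails */
		error = nfs_copy_dname(sdentry, dentry->d_fsdata);
		if (error) {
			nfs_cancel_async_unlink(dentry);
			return error;
		}

		return do_rename_and_wait(dir, dentry, sdentry); /* hypothetical */
	}
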
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08579312c57b..b39b37f80913 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
97 mempool_free(p, nfs_wdata_mempool); 97 mempool_free(p, nfs_wdata_mempool);
98} 98}
99 99
100static void nfs_writedata_release(struct nfs_write_data *wdata) 100void nfs_writedata_release(struct nfs_write_data *wdata)
101{ 101{
102 put_lseg(wdata->lseg); 102 put_lseg(wdata->lseg);
103 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
@@ -845,11 +845,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write);
845/* 845/*
846 * Set up the argument/result storage required for the RPC call. 846 * Set up the argument/result storage required for the RPC call.
847 */ 847 */
848static int nfs_write_rpcsetup(struct nfs_page *req, 848static void nfs_write_rpcsetup(struct nfs_page *req,
849 struct nfs_write_data *data, 849 struct nfs_write_data *data,
850 const struct rpc_call_ops *call_ops,
851 unsigned int count, unsigned int offset, 850 unsigned int count, unsigned int offset,
852 struct pnfs_layout_segment *lseg,
853 int how) 851 int how)
854{ 852{
855 struct inode *inode = req->wb_context->dentry->d_inode; 853 struct inode *inode = req->wb_context->dentry->d_inode;
@@ -860,7 +858,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
860 data->req = req; 858 data->req = req;
861 data->inode = inode = req->wb_context->dentry->d_inode; 859 data->inode = inode = req->wb_context->dentry->d_inode;
862 data->cred = req->wb_context->cred; 860 data->cred = req->wb_context->cred;
863 data->lseg = get_lseg(lseg);
864 861
865 data->args.fh = NFS_FH(inode); 862 data->args.fh = NFS_FH(inode);
866 data->args.offset = req_offset(req) + offset; 863 data->args.offset = req_offset(req) + offset;
@@ -872,24 +869,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
872 data->args.context = get_nfs_open_context(req->wb_context); 869 data->args.context = get_nfs_open_context(req->wb_context);
873 data->args.lock_context = req->wb_lock_context; 870 data->args.lock_context = req->wb_lock_context;
874 data->args.stable = NFS_UNSTABLE; 871 data->args.stable = NFS_UNSTABLE;
875 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 872 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
876 data->args.stable = NFS_DATA_SYNC; 873 case 0:
877 if (!nfs_need_commit(NFS_I(inode))) 874 break;
878 data->args.stable = NFS_FILE_SYNC; 875 case FLUSH_COND_STABLE:
876 if (nfs_need_commit(NFS_I(inode)))
877 break;
878 default:
879 data->args.stable = NFS_FILE_SYNC;
879 } 880 }
880 881
881 data->res.fattr = &data->fattr; 882 data->res.fattr = &data->fattr;
882 data->res.count = count; 883 data->res.count = count;
883 data->res.verf = &data->verf; 884 data->res.verf = &data->verf;
884 nfs_fattr_init(&data->fattr); 885 nfs_fattr_init(&data->fattr);
886}
885 887
886 if (data->lseg && 888static int nfs_do_write(struct nfs_write_data *data,
887 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) 889 const struct rpc_call_ops *call_ops,
888 return 0; 890 int how)
891{
892 struct inode *inode = data->args.context->dentry->d_inode;
889 893
890 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); 894 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
891} 895}
892 896
897static int nfs_do_multiple_writes(struct list_head *head,
898 const struct rpc_call_ops *call_ops,
899 int how)
900{
901 struct nfs_write_data *data;
902 int ret = 0;
903
904 while (!list_empty(head)) {
905 int ret2;
906
907 data = list_entry(head->next, struct nfs_write_data, list);
908 list_del_init(&data->list);
909
910 ret2 = nfs_do_write(data, call_ops, how);
911 if (ret == 0)
912 ret = ret2;
913 }
914 return ret;
915}
916
893/* If a nfs_flush_* function fails, it should remove reqs from @head and 917/* If a nfs_flush_* function fails, it should remove reqs from @head and
894 * call this on each, which will prepare them to be retried on next 918 * call this on each, which will prepare them to be retried on next
895 * writeback using standard nfs. 919 * writeback using standard nfs.
@@ -907,17 +931,15 @@ static void nfs_redirty_request(struct nfs_page *req)
907 * Generate multiple small requests to write out a single 931 * Generate multiple small requests to write out a single
908 * contiguous dirty area on one page. 932 * contiguous dirty area on one page.
909 */ 933 */
910static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) 934static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res)
911{ 935{
912 struct nfs_page *req = nfs_list_entry(desc->pg_list.next); 936 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
913 struct page *page = req->wb_page; 937 struct page *page = req->wb_page;
914 struct nfs_write_data *data; 938 struct nfs_write_data *data;
915 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; 939 size_t wsize = desc->pg_bsize, nbytes;
916 unsigned int offset; 940 unsigned int offset;
917 int requests = 0; 941 int requests = 0;
918 int ret = 0; 942 int ret = 0;
919 struct pnfs_layout_segment *lseg;
920 LIST_HEAD(list);
921 943
922 nfs_list_remove_request(req); 944 nfs_list_remove_request(req);
923 945
@@ -927,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
927 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 949 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
928 950
929 951
952 offset = 0;
930 nbytes = desc->pg_count; 953 nbytes = desc->pg_count;
931 do { 954 do {
932 size_t len = min(nbytes, wsize); 955 size_t len = min(nbytes, wsize);
@@ -934,45 +957,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
934 data = nfs_writedata_alloc(1); 957 data = nfs_writedata_alloc(1);
935 if (!data) 958 if (!data)
936 goto out_bad; 959 goto out_bad;
937 list_add(&data->pages, &list); 960 data->pagevec[0] = page;
 961 nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
962 list_add(&data->list, res);
938 requests++; 963 requests++;
939 nbytes -= len; 964 nbytes -= len;
965 offset += len;
940 } while (nbytes != 0); 966 } while (nbytes != 0);
941 atomic_set(&req->wb_complete, requests); 967 atomic_set(&req->wb_complete, requests);
942 968 desc->pg_rpc_callops = &nfs_write_partial_ops;
943 BUG_ON(desc->pg_lseg);
944 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
945 req_offset(req), desc->pg_count,
946 IOMODE_RW, GFP_NOFS);
947 ClearPageError(page);
948 offset = 0;
949 nbytes = desc->pg_count;
950 do {
951 int ret2;
952
953 data = list_entry(list.next, struct nfs_write_data, pages);
954 list_del_init(&data->pages);
955
956 data->pagevec[0] = page;
957
958 if (nbytes < wsize)
959 wsize = nbytes;
960 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
961 wsize, offset, lseg, desc->pg_ioflags);
962 if (ret == 0)
963 ret = ret2;
964 offset += wsize;
965 nbytes -= wsize;
966 } while (nbytes != 0);
967
968 put_lseg(lseg);
969 desc->pg_lseg = NULL;
970 return ret; 969 return ret;
971 970
972out_bad: 971out_bad:
973 while (!list_empty(&list)) { 972 while (!list_empty(res)) {
974 data = list_entry(list.next, struct nfs_write_data, pages); 973 data = list_entry(res->next, struct nfs_write_data, list);
975 list_del(&data->pages); 974 list_del(&data->list);
976 nfs_writedata_free(data); 975 nfs_writedata_free(data);
977 } 976 }
978 nfs_redirty_request(req); 977 nfs_redirty_request(req);
@@ -987,14 +986,13 @@ out_bad:
987 * This is the case if nfs_updatepage detects a conflicting request 986 * This is the case if nfs_updatepage detects a conflicting request
988 * that has been written but not committed. 987 * that has been written but not committed.
989 */ 988 */
990static int nfs_flush_one(struct nfs_pageio_descriptor *desc) 989static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res)
991{ 990{
992 struct nfs_page *req; 991 struct nfs_page *req;
993 struct page **pages; 992 struct page **pages;
994 struct nfs_write_data *data; 993 struct nfs_write_data *data;
995 struct list_head *head = &desc->pg_list; 994 struct list_head *head = &desc->pg_list;
996 struct pnfs_layout_segment *lseg = desc->pg_lseg; 995 int ret = 0;
997 int ret;
998 996
999 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, 997 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
1000 desc->pg_count)); 998 desc->pg_count));
@@ -1016,32 +1014,62 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1016 *pages++ = req->wb_page; 1014 *pages++ = req->wb_page;
1017 } 1015 }
1018 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
1019 if ((!lseg) && list_is_singular(&data->pages))
1020 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1021 req_offset(req), desc->pg_count,
1022 IOMODE_RW, GFP_NOFS);
1023 1017
1024 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1018 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1025 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1019 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1026 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 1020 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
1027 1021
1028 /* Set up the argument struct */ 1022 /* Set up the argument struct */
1029 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); 1023 nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags);
1024 list_add(&data->list, res);
1025 desc->pg_rpc_callops = &nfs_write_full_ops;
1030out: 1026out:
1031 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
1032 desc->pg_lseg = NULL;
1033 return ret; 1027 return ret;
1034} 1028}
1035 1029
1036static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1030int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head)
1031{
1032 if (desc->pg_bsize < PAGE_CACHE_SIZE)
1033 return nfs_flush_multi(desc, head);
1034 return nfs_flush_one(desc, head);
1035}
1036
1037static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1038{
1039 LIST_HEAD(head);
1040 int ret;
1041
1042 ret = nfs_generic_flush(desc, &head);
1043 if (ret == 0)
1044 ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops,
1045 desc->pg_ioflags);
1046 return ret;
1047}
1048
1049static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1050 .pg_test = nfs_generic_pg_test,
1051 .pg_doio = nfs_generic_pg_writepages,
1052};
1053
1054static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
1037 struct inode *inode, int ioflags) 1055 struct inode *inode, int ioflags)
1038{ 1056{
1039 size_t wsize = NFS_SERVER(inode)->wsize; 1057 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
1058 NFS_SERVER(inode)->wsize, ioflags);
1059}
1060
1061void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1062{
1063 pgio->pg_ops = &nfs_pageio_write_ops;
1064 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1065}
1066EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1040 1067
1041 if (wsize < PAGE_CACHE_SIZE) 1068static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1042 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1069 struct inode *inode, int ioflags)
1043 else 1070{
1044 nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); 1071 if (!pnfs_pageio_init_write(pgio, inode, ioflags))
1072 nfs_pageio_init_write_mds(pgio, inode, ioflags);
1045} 1073}
1046 1074
1047/* 1075/*
@@ -1566,8 +1594,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1566 int status; 1594 int status;
1567 bool sync = true; 1595 bool sync = true;
1568 1596
1569 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || 1597 if (wbc->sync_mode == WB_SYNC_NONE)
1570 wbc->for_background)
1571 sync = false; 1598 sync = false;
1572 1599
1573 status = pnfs_layoutcommit_inode(inode, sync); 1600 status = pnfs_layoutcommit_inode(inode, sync);
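
The if/else that chose the stable-write level is now a switch with an intentional
fall-through: FLUSH_COND_STABLE stays NFS_UNSTABLE only while a commit is pending, and
every other stable request falls through to NFS_FILE_SYNC (note the old code used
NFS_DATA_SYNC in the pending-commit case). As a decision table, using the kernel's
enum nfs3_stable_how values (sketch):

	/* Equivalent of the new switch, pulled out for clarity. */
	static enum nfs3_stable_how pick_stable_sketch(int how, bool need_commit)
	{
		switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
		case 0:				/* plain async write */
			return NFS_UNSTABLE;
		case FLUSH_COND_STABLE:		/* stable only if no commit queued */
			if (need_commit)
				return NFS_UNSTABLE;
			/* fall through */
		default:			/* FLUSH_STABLE */
			return NFS_FILE_SYNC;
		}
	}
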
diff --git a/fs/notify/group.c b/fs/notify/group.c
index d309f38449cb..63fc294a4692 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -26,7 +26,7 @@
26#include <linux/fsnotify_backend.h> 26#include <linux/fsnotify_backend.h>
27#include "fsnotify.h" 27#include "fsnotify.h"
28 28
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30 30
31/* 31/*
32 * Final freeing of a group 32 * Final freeing of a group
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 07ea8d3e6ea2..b13c00ac48eb 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -23,7 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25 25
26#include <asm/atomic.h> 26#include <linux/atomic.h>
27 27
28#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
29#include "fsnotify.h" 29#include "fsnotify.h"
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 252ab1f6452b..e14587d55689 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,7 +92,7 @@
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94 94
95#include <asm/atomic.h> 95#include <linux/atomic.h>
96 96
97#include <linux/fsnotify_backend.h> 97#include <linux/fsnotify_backend.h>
98#include "fsnotify.h" 98#include "fsnotify.h"
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index f39260f8f865..ee188158a224 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -43,7 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45 45
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47 47
48#include <linux/fsnotify_backend.h> 48#include <linux/fsnotify_backend.h>
49#include "fsnotify.h" 49#include "fsnotify.h"
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index e86577d6c5c3..778fe6cae3b0 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -24,7 +24,7 @@
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26 26
27#include <asm/atomic.h> 27#include <linux/atomic.h>
28 28
29#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 30#include "fsnotify.h"
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 2dabf813456c..fe8e7e928889 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -24,7 +24,7 @@
24#ifndef _LINUX_NTFS_INODE_H 24#ifndef _LINUX_NTFS_INODE_H
25#define _LINUX_NTFS_INODE_H 25#define _LINUX_NTFS_INODE_H
26 26
27#include <asm/atomic.h> 27#include <linux/atomic.h>
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/list.h> 30#include <linux/list.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 783c58d9daf1..a7219075b4de 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle,
247 case ACL_TYPE_ACCESS: 247 case ACL_TYPE_ACCESS:
248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
249 if (acl) { 249 if (acl) {
250 mode_t mode = inode->i_mode; 250 umode_t mode = inode->i_mode;
251 ret = posix_acl_equiv_mode(acl, &mode); 251 ret = posix_acl_equiv_mode(acl, &mode);
252 if (ret < 0) 252 if (ret < 0)
253 return ret; 253 return ret;
@@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle,
351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
352 struct posix_acl *acl = NULL; 352 struct posix_acl *acl = NULL;
353 int ret = 0, ret2; 353 int ret = 0, ret2;
354 mode_t mode; 354 umode_t mode;
355 355
356 if (!S_ISLNK(inode->i_mode)) { 356 if (!S_ISLNK(inode->i_mode)) {
357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3b8d3979e03b..98e544274390 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb)
93 93
94 memset(bh->b_data, 0, sizeof(struct omfs_inode)); 94 memset(bh->b_data, 0, sizeof(struct omfs_inode));
95 95
96 if (inode->i_mode & S_IFDIR) { 96 if (S_ISDIR(inode->i_mode)) {
97 memset(&bh->b_data[OMFS_DIR_START], 0xff, 97 memset(&bh->b_data[OMFS_DIR_START], 0xff,
98 sbi->s_sys_blocksize - OMFS_DIR_START); 98 sbi->s_sys_blocksize - OMFS_DIR_START);
99 } else 99 } else
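
The omfs change fixes a classic mode-test bug: the file-type bits in i_mode are a
field, not independent flags, so masking with S_IFDIR also matches block devices
(S_IFBLK = 0060000 contains the S_IFDIR bit 0040000), while S_ISDIR() compares the
whole S_IFMT field. A runnable userspace demonstration:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		mode_t blkdev = S_IFBLK | 0644;	/* 0060644: a block device */

		printf("mask test: %d\n", (blkdev & S_IFDIR) != 0); /* 1 -- wrong */
		printf("S_ISDIR(): %d\n", S_ISDIR(blkdev) != 0);    /* 0 -- right */
		return 0;
	}
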
diff --git a/fs/open.c b/fs/open.c
index 739b751aa73e..f71192109457 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -446,74 +446,52 @@ out:
446 return error; 446 return error;
447} 447}
448 448
449SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) 449static int chmod_common(struct path *path, umode_t mode)
450{ 450{
451 struct inode * inode; 451 struct inode *inode = path->dentry->d_inode;
452 struct dentry * dentry;
453 struct file * file;
454 int err = -EBADF;
455 struct iattr newattrs; 452 struct iattr newattrs;
453 int error;
456 454
457 file = fget(fd); 455 error = mnt_want_write(path->mnt);
458 if (!file) 456 if (error)
459 goto out; 457 return error;
460
461 dentry = file->f_path.dentry;
462 inode = dentry->d_inode;
463
464 audit_inode(NULL, dentry);
465
466 err = mnt_want_write_file(file);
467 if (err)
468 goto out_putf;
469 mutex_lock(&inode->i_mutex); 458 mutex_lock(&inode->i_mutex);
470 err = security_path_chmod(dentry, file->f_vfsmnt, mode); 459 error = security_path_chmod(path->dentry, path->mnt, mode);
471 if (err) 460 if (error)
472 goto out_unlock; 461 goto out_unlock;
473 if (mode == (mode_t) -1)
474 mode = inode->i_mode;
475 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); 462 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
476 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; 463 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
477 err = notify_change(dentry, &newattrs); 464 error = notify_change(path->dentry, &newattrs);
478out_unlock: 465out_unlock:
479 mutex_unlock(&inode->i_mutex); 466 mutex_unlock(&inode->i_mutex);
480 mnt_drop_write(file->f_path.mnt); 467 mnt_drop_write(path->mnt);
481out_putf: 468 return error;
482 fput(file); 469}
483out: 470
471SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
472{
473 struct file * file;
474 int err = -EBADF;
475
476 file = fget(fd);
477 if (file) {
478 audit_inode(NULL, file->f_path.dentry);
479 err = chmod_common(&file->f_path, mode);
480 fput(file);
481 }
484 return err; 482 return err;
485} 483}
486 484
487SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) 485SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
488{ 486{
489 struct path path; 487 struct path path;
490 struct inode *inode;
491 int error; 488 int error;
492 struct iattr newattrs;
493 489
494 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 490 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
495 if (error) 491 if (!error) {
496 goto out; 492 error = chmod_common(&path, mode);
497 inode = path.dentry->d_inode; 493 path_put(&path);
498 494 }
499 error = mnt_want_write(path.mnt);
500 if (error)
501 goto dput_and_out;
502 mutex_lock(&inode->i_mutex);
503 error = security_path_chmod(path.dentry, path.mnt, mode);
504 if (error)
505 goto out_unlock;
506 if (mode == (mode_t) -1)
507 mode = inode->i_mode;
508 newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
509 newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
510 error = notify_change(path.dentry, &newattrs);
511out_unlock:
512 mutex_unlock(&inode->i_mutex);
513 mnt_drop_write(path.mnt);
514dput_and_out:
515 path_put(&path);
516out:
517 return error; 495 return error;
518} 496}
519 497
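
After the open.c refactor both syscalls are thin wrappers: fchmod() resolves the fd and
audits it, fchmodat() resolves the path, and chmod_common() owns the whole
mnt_want_write/i_mutex/notify_change sequence. Note the old mode == (mode_t)-1 special
case is dropped along the way. The call flow, condensed as a sketch:

	/*
	 * sys_fchmod(fd, mode)
	 *     fget(fd) -> audit_inode() -> chmod_common(&file->f_path, mode)
	 * sys_fchmodat(dfd, name, mode)
	 *     user_path_at() -> chmod_common(&path, mode) -> path_put()
	 *
	 * chmod_common() (condensed from the hunk above):
	 *     mnt_want_write(path->mnt)
	 *     mutex_lock(&inode->i_mutex)
	 *     security_path_chmod() -> notify_change(path->dentry, &newattrs)
	 *     mutex_unlock(&inode->i_mutex)
	 *     mnt_drop_write(path->mnt)
	 */
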
diff --git a/fs/pipe.c b/fs/pipe.c
index 1b7f9af67ccf..0e0be1dc0f8e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = {
948 948
949static struct inode * get_pipe_inode(void) 949static struct inode * get_pipe_inode(void)
950{ 950{
951 struct inode *inode = new_inode(pipe_mnt->mnt_sb); 951 struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
952 struct pipe_inode_info *pipe; 952 struct pipe_inode_info *pipe;
953 953
954 if (!inode) 954 if (!inode)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index a6227d219e93..10027b42b7e2 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -14,7 +14,7 @@
14 14
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <asm/atomic.h> 17#include <linux/atomic.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/posix_acl.h> 20#include <linux/posix_acl.h>
@@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl)
149 * file mode permission bits, or else 1. Returns -E... on error. 149 * file mode permission bits, or else 1. Returns -E... on error.
150 */ 150 */
151int 151int
152posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) 152posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
153{ 153{
154 const struct posix_acl_entry *pa, *pe; 154 const struct posix_acl_entry *pa, *pe;
155 mode_t mode = 0; 155 umode_t mode = 0;
156 int not_equiv = 0; 156 int not_equiv = 0;
157 157
158 FOREACH_ACL_ENTRY(pa, acl, pe) { 158 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p)
188 * Create an ACL representing the file mode permission bits of an inode. 188 * Create an ACL representing the file mode permission bits of an inode.
189 */ 189 */
190struct posix_acl * 190struct posix_acl *
191posix_acl_from_mode(mode_t mode, gfp_t flags) 191posix_acl_from_mode(umode_t mode, gfp_t flags)
192{ 192{
193 struct posix_acl *acl = posix_acl_alloc(3, flags); 193 struct posix_acl *acl = posix_acl_alloc(3, flags);
194 if (!acl) 194 if (!acl)
@@ -279,11 +279,11 @@ check_perm:
279 * system calls. All permissions that are not granted by the acl are removed. 279 * system calls. All permissions that are not granted by the acl are removed.
280 * The permissions in the acl are changed to reflect the mode_p parameter. 280 * The permissions in the acl are changed to reflect the mode_p parameter.
281 */ 281 */
282static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) 282static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
283{ 283{
284 struct posix_acl_entry *pa, *pe; 284 struct posix_acl_entry *pa, *pe;
285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
286 mode_t mode = *mode_p; 286 umode_t mode = *mode_p;
287 int not_equiv = 0; 287 int not_equiv = 0;
288 288
289 /* assert(atomic_read(acl->a_refcount) == 1); */ 289 /* assert(atomic_read(acl->a_refcount) == 1); */
@@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p)
336/* 336/*
337 * Modify the ACL for the chmod syscall. 337 * Modify the ACL for the chmod syscall.
338 */ 338 */
339static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) 339static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
340{ 340{
341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
342 struct posix_acl_entry *pa, *pe; 342 struct posix_acl_entry *pa, *pe;
@@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode)
382} 382}
383 383
384int 384int
385posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) 385posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
386{ 386{
387 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 387 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
388 int err = -ENOMEM; 388 int err = -ENOMEM;
@@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p)
400EXPORT_SYMBOL(posix_acl_create); 400EXPORT_SYMBOL(posix_acl_create);
401 401
402int 402int
403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) 403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
404{ 404{
405 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 405 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
406 int err = -ENOMEM; 406 int err = -ENOMEM;
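
The practical payoff of switching these helpers from mode_t to umode_t is on the caller
side: inode->i_mode is umode_t, so callers can now pass &inode->i_mode directly instead
of round-tripping through a local mode_t (the reiserfs hunk further down does exactly
this conversion). Sketched:

	/* Caller-side effect of the umode_t conversion (sketch). */
	static int set_access_acl_sketch(struct inode *inode,
					 struct posix_acl *acl)
	{
		int error;

		/* before: a temporary was needed, since the API took mode_t *
		 *
		 *	mode_t mode = inode->i_mode;
		 *	error = posix_acl_equiv_mode(acl, &mode);
		 *	if (error >= 0)
		 *		inode->i_mode = mode;
		 *
		 * after: the pointer types line up */
		error = posix_acl_equiv_mode(acl, &inode->i_mode);
		return error < 0 ? error : 0;
	}
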
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c9e3f650f23c..5eb02069e1b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1118,7 +1118,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1118 * Warn that /proc/pid/oom_adj is deprecated, see 1118 * Warn that /proc/pid/oom_adj is deprecated, see
1119 * Documentation/feature-removal-schedule.txt. 1119 * Documentation/feature-removal-schedule.txt.
1120 */ 1120 */
1121 WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", 1121 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1122 current->comm, task_pid_nr(current), task_pid_nr(task), 1122 current->comm, task_pid_nr(current), task_pid_nr(task),
1123 task_pid_nr(task)); 1123 task_pid_nr(task));
1124 task->signal->oom_adj = oom_adjust; 1124 task->signal->oom_adj = oom_adjust;
@@ -1919,6 +1919,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1919 spin_lock(&files->file_lock); 1919 spin_lock(&files->file_lock);
1920 file = fcheck_files(files, fd); 1920 file = fcheck_files(files, fd);
1921 if (file) { 1921 if (file) {
1922 unsigned int f_flags;
1923 struct fdtable *fdt;
1924
1925 fdt = files_fdtable(files);
1926 f_flags = file->f_flags & ~O_CLOEXEC;
1927 if (FD_ISSET(fd, fdt->close_on_exec))
1928 f_flags |= O_CLOEXEC;
1929
1922 if (path) { 1930 if (path) {
1923 *path = file->f_path; 1931 *path = file->f_path;
1924 path_get(&file->f_path); 1932 path_get(&file->f_path);
@@ -1928,7 +1936,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1928 "pos:\t%lli\n" 1936 "pos:\t%lli\n"
1929 "flags:\t0%o\n", 1937 "flags:\t0%o\n",
1930 (long long) file->f_pos, 1938 (long long) file->f_pos,
1931 file->f_flags); 1939 f_flags);
1932 spin_unlock(&files->file_lock); 1940 spin_unlock(&files->file_lock);
1933 put_files_struct(files); 1941 put_files_struct(files);
1934 return 0; 1942 return 0;
@@ -2706,9 +2714,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2706{ 2714{
2707 struct task_io_accounting acct = task->ioac; 2715 struct task_io_accounting acct = task->ioac;
2708 unsigned long flags; 2716 unsigned long flags;
2717 int result;
2709 2718
2710 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2719 result = mutex_lock_killable(&task->signal->cred_guard_mutex);
2711 return -EACCES; 2720 if (result)
2721 return result;
2722
2723 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
2724 result = -EACCES;
2725 goto out_unlock;
2726 }
2712 2727
2713 if (whole && lock_task_sighand(task, &flags)) { 2728 if (whole && lock_task_sighand(task, &flags)) {
2714 struct task_struct *t = task; 2729 struct task_struct *t = task;
@@ -2719,7 +2734,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2719 2734
2720 unlock_task_sighand(task, &flags); 2735 unlock_task_sighand(task, &flags);
2721 } 2736 }
2722 return sprintf(buffer, 2737 result = sprintf(buffer,
2723 "rchar: %llu\n" 2738 "rchar: %llu\n"
2724 "wchar: %llu\n" 2739 "wchar: %llu\n"
2725 "syscr: %llu\n" 2740 "syscr: %llu\n"
@@ -2734,6 +2749,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2734 (unsigned long long)acct.read_bytes, 2749 (unsigned long long)acct.read_bytes,
2735 (unsigned long long)acct.write_bytes, 2750 (unsigned long long)acct.write_bytes,
2736 (unsigned long long)acct.cancelled_write_bytes); 2751 (unsigned long long)acct.cancelled_write_bytes);
2752out_unlock:
2753 mutex_unlock(&task->signal->cred_guard_mutex);
2754 return result;
2737} 2755}
2738 2756
2739static int proc_tid_io_accounting(struct task_struct *task, char *buffer) 2757static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
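
The do_io_accounting() change is a lock/check/unlock conversion: ptrace_may_access() is
only meaningful while the target's credentials cannot change underneath it, so both the
check and the read now happen under cred_guard_mutex (taken killably, because exec
holds that mutex for a while). The general shape (sketch; fill_buffer() is
illustrative):

	static int guarded_read_sketch(struct task_struct *task, char *buffer)
	{
		int result;

		result = mutex_lock_killable(&task->signal->cred_guard_mutex);
		if (result)
			return result;		/* fatal signal while waiting */

		if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
			result = -EACCES;	/* checked under the mutex, so
						 * an exec cannot race it */
			goto out_unlock;
		}

		result = fill_buffer(task, buffer);	/* hypothetical */
	out_unlock:
		mutex_unlock(&task->signal->cred_guard_mutex);
		return result;
	}
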
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1637f17c37c..9d99131d0d65 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -620,8 +620,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
620 if (!ent) goto out; 620 if (!ent) goto out;
621 621
622 memset(ent, 0, sizeof(struct proc_dir_entry)); 622 memset(ent, 0, sizeof(struct proc_dir_entry));
623 memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); 623 memcpy(ent->name, fn, len + 1);
624 ent->name = ((char *) ent) + sizeof(*ent);
625 ent->namelen = len; 624 ent->namelen = len;
626 ent->mode = mode; 625 ent->mode = mode;
627 ent->nlink = nlink; 626 ent->nlink = nlink;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 74b48cfa1bb2..7ed72d6c1c6f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
319 if (!pde->proc_fops) { 319 if (!pde->proc_fops) {
320 spin_unlock(&pde->pde_unload_lock); 320 spin_unlock(&pde->pde_unload_lock);
321 kfree(pdeo); 321 kfree(pdeo);
322 return -EINVAL; 322 return -ENOENT;
323 } 323 }
324 pde->pde_users++; 324 pde->pde_users++;
325 open = pde->proc_fops->open; 325 open = pde->proc_fops->open;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ed257d141568..586174168e2a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -10,7 +10,7 @@
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/vmstat.h> 12#include <linux/vmstat.h>
13#include <asm/atomic.h> 13#include <linux/atomic.h>
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include "internal.h" 16#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 9020ac15baaa..f738024ccc8e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -197,15 +197,15 @@ static __net_init int proc_net_ns_init(struct net *net)
197 int err; 197 int err;
198 198
199 err = -ENOMEM; 199 err = -ENOMEM;
200 netd = kzalloc(sizeof(*netd), GFP_KERNEL); 200 netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL);
201 if (!netd) 201 if (!netd)
202 goto out; 202 goto out;
203 203
204 netd->data = net; 204 netd->data = net;
205 netd->nlink = 2; 205 netd->nlink = 2;
206 netd->name = "net";
207 netd->namelen = 3; 206 netd->namelen = 3;
208 netd->parent = &proc_root; 207 netd->parent = &proc_root;
208 memcpy(netd->name, "net", 4);
209 209
210 err = -EEXIST; 210 err = -EEXIST;
211 net_statd = proc_net_mkdir(net, "stat", netd); 211 net_statd = proc_net_mkdir(net, "stat", netd);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index d6c3b416529b..9a8a2b77b874 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -186,13 +186,13 @@ static const struct inode_operations proc_root_inode_operations = {
186struct proc_dir_entry proc_root = { 186struct proc_dir_entry proc_root = {
187 .low_ino = PROC_ROOT_INO, 187 .low_ino = PROC_ROOT_INO,
188 .namelen = 5, 188 .namelen = 5,
189 .name = "/proc",
190 .mode = S_IFDIR | S_IRUGO | S_IXUGO, 189 .mode = S_IFDIR | S_IRUGO | S_IXUGO,
191 .nlink = 2, 190 .nlink = 2,
192 .count = ATOMIC_INIT(1), 191 .count = ATOMIC_INIT(1),
193 .proc_iops = &proc_root_inode_operations, 192 .proc_iops = &proc_root_inode_operations,
194 .proc_fops = &proc_root_operations, 193 .proc_fops = &proc_root_operations,
195 .parent = &proc_root, 194 .parent = &proc_root,
195 .name = "/proc",
196}; 196};
197 197
198int pid_ns_prepare_proc(struct pid_namespace *ns) 198int pid_ns_prepare_proc(struct pid_namespace *ns)
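
The generic.c, proc_net.c and root.c hunks all follow from one struct change (assumed,
not shown in this diff): proc_dir_entry's name becoming a flexible array member,
char name[], instead of a char * into trailing storage. That is why the allocation must
grow by the name length, memcpy() now targets ent->name, proc_net_ns_init() adds 4
bytes for "net" plus its NUL, and the static proc_root initializer is reordered so
.name comes last. The allocation pattern, sketched:

	/* Flexible-array-member allocation pattern implied by these hunks
	 * (sketch; the real definition lives in include/linux/proc_fs.h). */
	static struct proc_dir_entry *alloc_pde_sketch(const char *fn, int len)
	{
		struct proc_dir_entry *ent;

		/* one allocation covers the struct and the NUL-terminated name */
		ent = kzalloc(sizeof(*ent) + len + 1, GFP_KERNEL);
		if (!ent)
			return NULL;
		memcpy(ent->name, fn, len + 1);
		ent->namelen = len;
		return ent;
	}
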
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 977ed2723845..893b961dcfd8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -39,8 +39,9 @@
39#define PSTORE_NAMELEN 64 39#define PSTORE_NAMELEN 64
40 40
41struct pstore_private { 41struct pstore_private {
42 struct pstore_info *psi;
43 enum pstore_type_id type;
42 u64 id; 44 u64 id;
43 int (*erase)(u64);
44 ssize_t size; 45 ssize_t size;
45 char data[]; 46 char data[];
46}; 47};
@@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{ 74{
74 struct pstore_private *p = dentry->d_inode->i_private; 75 struct pstore_private *p = dentry->d_inode->i_private;
75 76
76 p->erase(p->id); 77 p->psi->erase(p->type, p->id, p->psi);
77 78
78 return simple_unlink(dir, dentry); 79 return simple_unlink(dir, dentry);
79} 80}
@@ -175,8 +176,8 @@ int pstore_is_mounted(void)
175 * Set the mtime & ctime to the date that this record was originally stored. 176 * Set the mtime & ctime to the date that this record was originally stored.
176 */ 177 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, 178int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size, 179 char *data, size_t size, struct timespec time,
179 struct timespec time, int (*erase)(u64)) 180 struct pstore_info *psi)
180{ 181{
181 struct dentry *root = pstore_sb->s_root; 182 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry; 183 struct dentry *dentry;
@@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
192 private = kmalloc(sizeof *private + size, GFP_KERNEL); 193 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private) 194 if (!private)
194 goto fail_alloc; 195 goto fail_alloc;
196 private->type = type;
195 private->id = id; 197 private->id = id;
196 private->erase = erase; 198 private->psi = psi;
197 199
198 switch (type) { 200 switch (type) {
199 case PSTORE_TYPE_DMESG: 201 case PSTORE_TYPE_DMESG:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 8c9f23eb1645..611c1b3c46fa 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void); 2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size, 4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64)); 5 struct timespec time, struct pstore_info *psi);
6extern int pstore_is_mounted(void); 6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f2c3ff20ea68..c5300ec31696 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -37,6 +37,8 @@
37static DEFINE_SPINLOCK(pstore_lock); 37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo; 38static struct pstore_info *psinfo;
39 39
40static char *backend;
41
40/* How much of the console log to snapshot */ 42/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240; 43static unsigned long kmsg_bytes = 10240;
42 44
@@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
67 unsigned long size, total = 0; 69 unsigned long size, total = 0;
68 char *dst, *why; 70 char *dst, *why;
69 u64 id; 71 u64 id;
70 int hsize, part = 1; 72 int hsize;
73 unsigned int part = 1;
71 74
72 if (reason < ARRAY_SIZE(reason_str)) 75 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason]; 76 why = reason_str[reason];
@@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
78 oopscount++; 81 oopscount++;
79 while (total < kmsg_bytes) { 82 while (total < kmsg_bytes) {
80 dst = psinfo->buf; 83 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); 84 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part);
82 size = psinfo->bufsize - hsize; 85 size = psinfo->bufsize - hsize;
83 dst += hsize; 86 dst += hsize;
84 87
@@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper,
94 memcpy(dst, s1 + s1_start, l1_cpy); 97 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); 98 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96 99
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); 100 id = psinfo->write(PSTORE_TYPE_DMESG, part,
101 hsize + l1_cpy + l2_cpy, psinfo);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 102 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, 103 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy, 104 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase); 105 CURRENT_TIME, psinfo);
102 l1 -= l1_cpy; 106 l1 -= l1_cpy;
103 l2 -= l2_cpy; 107 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy; 108 total += l1_cpy + l2_cpy;
109 part++;
105 } 110 }
106 mutex_unlock(&psinfo->buf_mutex); 111 mutex_unlock(&psinfo->buf_mutex);
107} 112}
@@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi)
128 spin_unlock(&pstore_lock); 133 spin_unlock(&pstore_lock);
129 return -EBUSY; 134 return -EBUSY;
130 } 135 }
136
137 if (backend && strcmp(backend, psi->name)) {
138 spin_unlock(&pstore_lock);
139 return -EINVAL;
140 }
141
131 psinfo = psi; 142 psinfo = psi;
132 spin_unlock(&pstore_lock); 143 spin_unlock(&pstore_lock);
133 144
@@ -166,9 +177,9 @@ void pstore_get_records(void)
166 if (rc) 177 if (rc)
167 goto out; 178 goto out;
168 179
169 while ((size = psi->read(&id, &type, &time)) > 0) { 180 while ((size = psi->read(&id, &type, &time, psi)) > 0) {
170 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, 181 if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
171 time, psi->erase)) 182 time, psi))
172 failed++; 183 failed++;
173 } 184 }
174 psi->close(psi); 185 psi->close(psi);
@@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
196 207
197 mutex_lock(&psinfo->buf_mutex); 208 mutex_lock(&psinfo->buf_mutex);
198 memcpy(psinfo->buf, buf, size); 209 memcpy(psinfo->buf, buf, size);
199 id = psinfo->write(type, size); 210 id = psinfo->write(type, 0, size, psinfo);
200 if (pstore_is_mounted()) 211 if (pstore_is_mounted())
201 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, 212 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
202 size, CURRENT_TIME, psinfo->erase); 213 size, CURRENT_TIME, psinfo);
203 mutex_unlock(&psinfo->buf_mutex); 214 mutex_unlock(&psinfo->buf_mutex);
204 215
205 return 0; 216 return 0;
206} 217}
207EXPORT_SYMBOL_GPL(pstore_write); 218EXPORT_SYMBOL_GPL(pstore_write);
219
220module_param(backend, charp, 0444);
221MODULE_PARM_DESC(backend, "Pstore backend to use");
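
Pulling the pstore signature changes together: read/write/erase now carry the
struct pstore_info back-pointer (so a backend callback can reach its own state without
globals), and write additionally takes the part number that platform.c used to fold
into the header text only. Reconstructed from the call sites in this diff, not quoted
from include/linux/pstore.h, the callback block looks roughly like:

	/* Reconstructed from the call sites above -- a sketch. */
	struct pstore_info_sketch {
		char		*name;
		char		*buf;
		size_t		bufsize;
		int		(*open)(struct pstore_info_sketch *psi);
		int		(*close)(struct pstore_info_sketch *psi);
		ssize_t		(*read)(u64 *id, enum pstore_type_id *type,
					struct timespec *time,
					struct pstore_info_sketch *psi);
		u64		(*write)(enum pstore_type_id type,
					 unsigned int part, size_t size,
					 struct pstore_info_sketch *psi);
		int		(*erase)(enum pstore_type_id type, u64 id,
					 struct pstore_info_sketch *psi);
	};
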
diff --git a/fs/read_write.c b/fs/read_write.c
index 5907b49e4d7e..179f1c33ea57 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -166,8 +166,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
166 * long as offset isn't at the end of the file then the 166 * long as offset isn't at the end of the file then the
167 * offset is data. 167 * offset is data.
168 */ 168 */
169 if (offset >= inode->i_size) 169 if (offset >= inode->i_size) {
170 return -ENXIO; 170 retval = -ENXIO;
171 goto out;
172 }
171 break; 173 break;
172 case SEEK_HOLE: 174 case SEEK_HOLE:
173 /* 175 /*
@@ -175,8 +177,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
175 * as long as offset isn't i_size or larger, return 177 * as long as offset isn't i_size or larger, return
176 * i_size. 178 * i_size.
177 */ 179 */
178 if (offset >= inode->i_size) 180 if (offset >= inode->i_size) {
179 return -ENXIO; 181 retval = -ENXIO;
182 goto out;
183 }
180 offset = inode->i_size; 184 offset = inode->i_size;
181 break; 185 break;
182 } 186 }
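
default_llseek() takes i_mutex and releases it at the out: label; the SEEK_DATA and
SEEK_HOLE branches previously returned -ENXIO directly and so leaked the mutex. The fix
routes the error through the common exit. In context (condensed sketch):

	/* Condensed shape of default_llseek() around the fix (sketch). */
	static loff_t default_llseek_sketch(struct file *file, loff_t offset,
					    int origin)
	{
		struct inode *inode = file->f_path.dentry->d_inode;
		loff_t retval;

		mutex_lock(&inode->i_mutex);
		switch (origin) {
		case SEEK_DATA:
			if (offset >= inode->i_size) {
				retval = -ENXIO;
				goto out;	/* was "return -ENXIO":
						 * left i_mutex held */
			}
			break;
		/* SEEK_SET/SEEK_CUR/SEEK_END/SEEK_HOLE elided */
		}
		retval = offset;		/* illustrative tail */
	out:
		mutex_unlock(&inode->i_mutex);
		return retval;
	}
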
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 7362cf4c946a..6da0396e5052 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
272 case ACL_TYPE_ACCESS: 272 case ACL_TYPE_ACCESS:
273 name = POSIX_ACL_XATTR_ACCESS; 273 name = POSIX_ACL_XATTR_ACCESS;
274 if (acl) { 274 if (acl) {
275 mode_t mode = inode->i_mode; 275 error = posix_acl_equiv_mode(acl, &inode->i_mode);
276 error = posix_acl_equiv_mode(acl, &mode);
277 if (error < 0) 276 if (error < 0)
278 return error; 277 return error;
279 else { 278 else {
280 inode->i_mode = mode;
281 if (error == 0) 279 if (error == 0)
282 acl = NULL; 280 acl = NULL;
283 } 281 }
@@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
354 return PTR_ERR(acl); 352 return PTR_ERR(acl);
355 353
356 if (acl) { 354 if (acl) {
357 mode_t mode = inode->i_mode;
358
359 /* Copy the default ACL to the default ACL of a new directory */ 355 /* Copy the default ACL to the default ACL of a new directory */
360 if (S_ISDIR(inode->i_mode)) { 356 if (S_ISDIR(inode->i_mode)) {
361 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, 357 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
@@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
366 362
367 /* Now we reconcile the new ACL and the mode, 363 /* Now we reconcile the new ACL and the mode,
368 potentially modifying both */ 364 potentially modifying both */
369 err = posix_acl_create(&acl, GFP_NOFS, &mode); 365 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
370 if (err < 0) 366 if (err < 0)
371 return err; 367 return err;
372 368
373 inode->i_mode = mode;
374
375 /* If we need an ACL.. */ 369 /* If we need an ACL.. */
376 if (err > 0) 370 if (err > 0)
377 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); 371 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
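Both reiserfs hunks drop the mode_t temporary: posix_acl_equiv_mode() and posix_acl_create() now take a pointer to the caller's mode and update it in place, so the (now umode_t) i_mode can be passed directly. The resulting calling convention, condensed from the hunks above with the return-value semantics they show:

/* Condensed from the hunks above. */
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
	return error;		/* hard failure */
if (error == 0)
	acl = NULL;		/* ACL fully expressible in i_mode */

err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
if (err < 0)
	return err;		/* hard failure */
if (err > 0)			/* an access ACL is still required */
	err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);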
diff --git a/fs/stack.c b/fs/stack.c
index 4a6f7f440658..b4f2ab48a61f 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
29 * 29 *
30 * We don't actually know what locking is used at the lower level; 30 * We don't actually know what locking is used at the lower level;
31 * but if it's a filesystem that supports quotas, it will be using 31 * but if it's a filesystem that supports quotas, it will be using
32 * i_lock as in inode_add_bytes(). tmpfs uses other locking, and 32 * i_lock as in inode_add_bytes().
33 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
34 * holes; but its i_blocks cannot carry into the upper long without
35 * almost 2TB swap - let's ignore that case.
36 */ 33 */
37 if (sizeof(i_blocks) > sizeof(long)) 34 if (sizeof(i_blocks) > sizeof(long))
38 spin_lock(&src->i_lock); 35 spin_lock(&src->i_lock);
diff --git a/fs/stat.c b/fs/stat.c
index 961039121cb8..ba5316ffac61 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
27 stat->uid = inode->i_uid; 27 stat->uid = inode->i_uid;
28 stat->gid = inode->i_gid; 28 stat->gid = inode->i_gid;
29 stat->rdev = inode->i_rdev; 29 stat->rdev = inode->i_rdev;
30 stat->size = i_size_read(inode);
30 stat->atime = inode->i_atime; 31 stat->atime = inode->i_atime;
31 stat->mtime = inode->i_mtime; 32 stat->mtime = inode->i_mtime;
32 stat->ctime = inode->i_ctime; 33 stat->ctime = inode->i_ctime;
33 stat->size = i_size_read(inode);
34 stat->blocks = inode->i_blocks;
35 stat->blksize = (1 << inode->i_blkbits); 34 stat->blksize = (1 << inode->i_blkbits);
35 stat->blocks = inode->i_blocks;
36} 36}
37 37
38EXPORT_SYMBOL(generic_fillattr); 38EXPORT_SYMBOL(generic_fillattr);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 45174b534377..feb361e252ac 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -335,9 +335,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
335#define DBGKEY(key) ((char *)(key)) 335#define DBGKEY(key) ((char *)(key))
336#define DBGKEY1(key) ((char *)(key)) 336#define DBGKEY1(key) ((char *)(key))
337 337
338#define ubifs_dbg_msg(fmt, ...) do { \ 338#define ubifs_dbg_msg(fmt, ...) do { \
339 if (0) \ 339 if (0) \
340 pr_debug(fmt "\n", ##__VA_ARGS__); \ 340 printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
341} while (0) 341} while (0)
342 342
343#define dbg_dump_stack() 343#define dbg_dump_stack()
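With debugging disabled, ubifs_dbg_msg() above compiles down to nothing, but the "if (0)" wrapper keeps the printk()/pr_debug() call visible to the compiler so the format string and arguments are still type-checked. A small standalone illustration of the idiom:

#include <stdio.h>

/* The "if (0)" idiom: the call is dead code and is optimized away,
 * yet the compiler still checks the format string against the args. */
#define dbg_msg(fmt, ...) do {				\
	if (0)						\
		printf(fmt "\n", ##__VA_ARGS__);	\
} while (0)

int main(void)
{
	dbg_msg("value=%d", 42);	/* compiles to nothing */
	/* dbg_msg("value=%d", "oops"); would warn at compile time */
	return 0;
}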
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 75bb316529dd..427a4e82a588 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,44 +16,53 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19ccflags-y := -I$(src) -I$(src)/linux-2.6 19ccflags-y += -I$(src) # needed for trace events
20ccflags-$(CONFIG_XFS_DEBUG) += -g
21 20
22XFS_LINUX := linux-2.6 21ccflags-$(CONFIG_XFS_DEBUG) += -g
23 22
24obj-$(CONFIG_XFS_FS) += xfs.o 23obj-$(CONFIG_XFS_FS) += xfs.o
25 24
26xfs-y += linux-2.6/xfs_trace.o 25# this one should be compiled first, as the tracing macros can easily blow up
27 26xfs-y += xfs_trace.o
28xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
29 xfs_dquot.o \
30 xfs_dquot_item.o \
31 xfs_trans_dquot.o \
32 xfs_qm_syscalls.o \
33 xfs_qm_bhv.o \
34 xfs_qm.o)
35xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
36
37ifeq ($(CONFIG_XFS_QUOTA),y)
38xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
39endif
40
41xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
42xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
43xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
44xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
45xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
46 27
28# highlevel code
29xfs-y += xfs_aops.o \
30 xfs_bit.o \
31 xfs_buf.o \
32 xfs_dfrag.o \
33 xfs_discard.o \
34 xfs_error.o \
35 xfs_export.o \
36 xfs_file.o \
37 xfs_filestream.o \
38 xfs_fsops.o \
39 xfs_fs_subr.o \
40 xfs_globals.o \
41 xfs_iget.o \
42 xfs_ioctl.o \
43 xfs_iomap.o \
44 xfs_iops.o \
45 xfs_itable.o \
46 xfs_message.o \
47 xfs_mru_cache.o \
48 xfs_super.o \
49 xfs_sync.o \
50 xfs_xattr.o \
51 xfs_rename.o \
52 xfs_rw.o \
53 xfs_utils.o \
54 xfs_vnodeops.o \
55 kmem.o \
56 uuid.o
47 57
58# code shared with libxfs
48xfs-y += xfs_alloc.o \ 59xfs-y += xfs_alloc.o \
49 xfs_alloc_btree.o \ 60 xfs_alloc_btree.o \
50 xfs_attr.o \ 61 xfs_attr.o \
51 xfs_attr_leaf.o \ 62 xfs_attr_leaf.o \
52 xfs_bit.o \
53 xfs_bmap.o \ 63 xfs_bmap.o \
54 xfs_bmap_btree.o \ 64 xfs_bmap_btree.o \
55 xfs_btree.o \ 65 xfs_btree.o \
56 xfs_buf_item.o \
57 xfs_da_btree.o \ 66 xfs_da_btree.o \
58 xfs_dir2.o \ 67 xfs_dir2.o \
59 xfs_dir2_block.o \ 68 xfs_dir2_block.o \
@@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \
61 xfs_dir2_leaf.o \ 70 xfs_dir2_leaf.o \
62 xfs_dir2_node.o \ 71 xfs_dir2_node.o \
63 xfs_dir2_sf.o \ 72 xfs_dir2_sf.o \
64 xfs_error.o \
65 xfs_extfree_item.o \
66 xfs_filestream.o \
67 xfs_fsops.o \
68 xfs_ialloc.o \ 73 xfs_ialloc.o \
69 xfs_ialloc_btree.o \ 74 xfs_ialloc_btree.o \
70 xfs_iget.o \
71 xfs_inode.o \ 75 xfs_inode.o \
72 xfs_inode_item.o \
73 xfs_iomap.o \
74 xfs_itable.o \
75 xfs_dfrag.o \
76 xfs_log.o \
77 xfs_log_cil.o \
78 xfs_log_recover.o \ 76 xfs_log_recover.o \
79 xfs_mount.o \ 77 xfs_mount.o \
80 xfs_mru_cache.o \ 78 xfs_trans.o
81 xfs_rename.o \ 79
82 xfs_trans.o \ 80# low-level transaction/log code
81xfs-y += xfs_log.o \
82 xfs_log_cil.o \
83 xfs_buf_item.o \
84 xfs_extfree_item.o \
85 xfs_inode_item.o \
83 xfs_trans_ail.o \ 86 xfs_trans_ail.o \
84 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
85 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
86 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
87 xfs_utils.o \
88 xfs_vnodeops.o \
89 xfs_rw.o
90
91# Objects in linux/
92xfs-y += $(addprefix $(XFS_LINUX)/, \
93 kmem.o \
94 xfs_aops.o \
95 xfs_buf.o \
96 xfs_discard.o \
97 xfs_export.o \
98 xfs_file.o \
99 xfs_fs_subr.o \
100 xfs_globals.o \
101 xfs_ioctl.o \
102 xfs_iops.o \
103 xfs_message.o \
104 xfs_super.o \
105 xfs_sync.o \
106 xfs_xattr.o)
107 90
108# Objects in support/ 91# optional features
109xfs-y += support/uuid.o 92xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
93 xfs_dquot_item.o \
94 xfs_trans_dquot.o \
95 xfs_qm_syscalls.o \
96 xfs_qm_bhv.o \
97 xfs_qm.o \
98 xfs_quotaops.o
99ifeq ($(CONFIG_XFS_QUOTA),y)
100xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o
101endif
102xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
103xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
104xfs-$(CONFIG_PROC_FS) += xfs_stats.o
105xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
106xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c
index a907de565db3..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/kmem.c
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h
index 292eff198030..292eff198030 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/kmem.h
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5c..ff6a19873e5c 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184c..387e695a184c 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index b83f76b6d410..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 53ec3ea9a625..d8b11b7f94aa 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,5 +24,6 @@
24#define XFS_BUF_LOCK_TRACKING 1 24#define XFS_BUF_LOCK_TRACKING 1
25#endif 25#endif
26 26
27#include <linux-2.6/xfs_linux.h> 27#include "xfs_linux.h"
28
28#endif /* __XFS_H__ */ 29#endif /* __XFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c
index 44ce51656804..b6c4b3795c4a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
221} 221}
222 222
223static int 223static int
224xfs_set_mode(struct inode *inode, mode_t mode) 224xfs_set_mode(struct inode *inode, umode_t mode)
225{ 225{
226 int error = 0; 226 int error = 0;
227 227
@@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode)
267int 267int
268xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) 268xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
269{ 269{
270 mode_t mode = inode->i_mode; 270 umode_t mode = inode->i_mode;
271 int error = 0, inherit = 0; 271 int error = 0, inherit = 0;
272 272
273 if (S_ISDIR(inode->i_mode)) { 273 if (S_ISDIR(inode->i_mode)) {
@@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
381 goto out_release; 381 goto out_release;
382 382
383 if (type == ACL_TYPE_ACCESS) { 383 if (type == ACL_TYPE_ACCESS) {
384 mode_t mode = inode->i_mode; 384 umode_t mode = inode->i_mode;
385 error = posix_acl_equiv_mode(acl, &mode); 385 error = posix_acl_equiv_mode(acl, &mode);
386 386
387 if (error <= 0) { 387 if (error <= 0) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 2c656ef49473..39632d941354 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -51,7 +51,10 @@ extern int posix_acl_default_exists(struct inode *inode);
51extern const struct xattr_handler xfs_xattr_acl_access_handler; 51extern const struct xattr_handler xfs_xattr_acl_access_handler;
52extern const struct xattr_handler xfs_xattr_acl_default_handler; 52extern const struct xattr_handler xfs_xattr_acl_default_handler;
53#else 53#else
54# define xfs_get_acl(inode, type) NULL 54static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
55{
56 return NULL;
57}
55# define xfs_inherit_acl(inode, default_acl) 0 58# define xfs_inherit_acl(inode, default_acl) 0
56# define xfs_acl_chmod(inode) 0 59# define xfs_acl_chmod(inode) 0
57# define posix_acl_access_exists(inode) 0 60# define posix_acl_access_exists(inode) 0
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6530769a999b..4805f009f923 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -103,7 +103,7 @@ typedef struct xfs_agf {
103/* disk block (xfs_daddr_t) in the AG */ 103/* disk block (xfs_daddr_t) in the AG */
104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 104#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 105#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 106#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
107 107
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
@@ -156,7 +156,7 @@ typedef struct xfs_agi {
156/* disk block (xfs_daddr_t) in the AG */ 156/* disk block (xfs_daddr_t) in the AG */
157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) 157#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 158#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 159#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
160 160
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 162 xfs_agnumber_t agno, struct xfs_buf **bpp);
@@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) 168#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) 169#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) 170#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) 171#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
172 172
173typedef struct xfs_agfl { 173typedef struct xfs_agfl {
174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ 174 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1e00b3ef6274..bdd9cb54d63b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -451,8 +451,7 @@ xfs_alloc_read_agfl(
451 XFS_FSS_TO_BB(mp, 1), 0, &bp); 451 XFS_FSS_TO_BB(mp, 1), 0, &bp);
452 if (error) 452 if (error)
453 return error; 453 return error;
454 ASSERT(bp); 454 ASSERT(!xfs_buf_geterror(bp));
455 ASSERT(!XFS_BUF_GETERROR(bp));
456 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); 455 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
457 *bpp = bp; 456 *bpp = bp;
458 return 0; 457 return 0;
@@ -2116,7 +2115,7 @@ xfs_read_agf(
2116 if (!*bpp) 2115 if (!*bpp)
2117 return 0; 2116 return 0;
2118 2117
2119 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2118 ASSERT(!(*bpp)->b_error);
2120 agf = XFS_BUF_TO_AGF(*bpp); 2119 agf = XFS_BUF_TO_AGF(*bpp);
2121 2120
2122 /* 2121 /*
@@ -2168,7 +2167,7 @@ xfs_alloc_read_agf(
2168 return error; 2167 return error;
2169 if (!*bpp) 2168 if (!*bpp)
2170 return 0; 2169 return 0;
2171 ASSERT(!XFS_BUF_GETERROR(*bpp)); 2170 ASSERT(!(*bpp)->b_error);
2172 2171
2173 agf = XFS_BUF_TO_AGF(*bpp); 2172 agf = XFS_BUF_TO_AGF(*bpp);
2174 pag = xfs_perag_get(mp, agno); 2173 pag = xfs_perag_get(mp, agno);
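Each ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); pair in these xfs hunks collapses into a single ASSERT(!xfs_buf_geterror(bp)), which only works if the new helper tolerates a NULL buffer. A sketch of such a NULL-safe accessor; treating a missing buffer as ENOMEM is an assumption, chosen so the combined assertion still trips on NULL:

/*
 * NULL-safe error accessor (sketch). A missing buffer reads as a
 * nonzero error, so one assertion covers both "bp was returned"
 * and "bp carries no I/O error".
 */
static inline int xfs_buf_geterror(struct xfs_buf *bp)
{
	return bp ? bp->b_error : ENOMEM;
}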
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..8c37dde4c521 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1300,6 +1300,7 @@ xfs_end_io_direct_write(
1300 bool is_async) 1300 bool is_async)
1301{ 1301{
1302 struct xfs_ioend *ioend = iocb->private; 1302 struct xfs_ioend *ioend = iocb->private;
1303 struct inode *inode = ioend->io_inode;
1303 1304
1304 /* 1305 /*
1305 * blockdev_direct_IO can return an error even after the I/O 1306 * blockdev_direct_IO can return an error even after the I/O
@@ -1331,7 +1332,7 @@ xfs_end_io_direct_write(
1331 } 1332 }
1332 1333
1333 /* XXX: probably should move into the real I/O completion handler */ 1334 /* XXX: probably should move into the real I/O completion handler */
1334 inode_dio_done(ioend->io_inode); 1335 inode_dio_done(inode);
1335} 1336}
1336 1337
1337STATIC ssize_t 1338STATIC ssize_t
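The xfs_aops.c hunk copies ioend->io_inode into a local before the ioend completes, because completion can free the ioend; the old inode_dio_done(ioend->io_inode) at the end of the function would then dereference freed memory. The general shape of the fix, with a hypothetical stand-in for the completion call elided from this hunk:

struct inode *inode = ioend->io_inode;	/* cache before completion */

complete_ioend(ioend);		/* hypothetical stand-in; may free ioend */
inode_dio_done(inode);		/* safe: uses only the cached pointer */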
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71f..71f721e1a71f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cbae424fe1ba..160bcdc34a6e 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2121,8 +2121,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2121 2121
2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 2122 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
2123 XBF_LOCK | XBF_DONT_BLOCK); 2123 XBF_LOCK | XBF_DONT_BLOCK);
2124 ASSERT(bp); 2124 ASSERT(!xfs_buf_geterror(bp));
2125 ASSERT(!XFS_BUF_GETERROR(bp));
2126 2125
2127 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : 2126 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2128 XFS_BUF_SIZE(bp); 2127 XFS_BUF_SIZE(bp);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index c51a3f903633..452a291383ab 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -414,7 +414,7 @@ xfs_bmap_add_attrfork_local(
414 414
415 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) 415 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
416 return 0; 416 return 0;
417 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 417 if (S_ISDIR(ip->i_d.di_mode)) {
418 mp = ip->i_mount; 418 mp = ip->i_mount;
419 memset(&dargs, 0, sizeof(dargs)); 419 memset(&dargs, 0, sizeof(dargs));
420 dargs.dp = ip; 420 dargs.dp = ip;
@@ -3344,8 +3344,7 @@ xfs_bmap_local_to_extents(
3344 * We don't want to deal with the case of keeping inode data inline yet. 3344 * We don't want to deal with the case of keeping inode data inline yet.
3345 * So sending the data fork of a regular inode is invalid. 3345 * So sending the data fork of a regular inode is invalid.
3346 */ 3346 */
3347 ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && 3347 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
3348 whichfork == XFS_DATA_FORK));
3349 ifp = XFS_IFORK_PTR(ip, whichfork); 3348 ifp = XFS_IFORK_PTR(ip, whichfork);
3350 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 3349 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
3351 flags = 0; 3350 flags = 0;
@@ -3384,8 +3383,7 @@ xfs_bmap_local_to_extents(
3384 ASSERT(args.len == 1); 3383 ASSERT(args.len == 1);
3385 *firstblock = args.fsbno; 3384 *firstblock = args.fsbno;
3386 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3385 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3387 memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, 3386 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3388 ifp->if_bytes);
3389 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3387 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3390 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3388 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
3391 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); 3389 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
@@ -4052,7 +4050,7 @@ xfs_bmap_one_block(
4052 4050
4053#ifndef DEBUG 4051#ifndef DEBUG
4054 if (whichfork == XFS_DATA_FORK) { 4052 if (whichfork == XFS_DATA_FORK) {
4055 return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? 4053 return S_ISREG(ip->i_d.di_mode) ?
4056 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : 4054 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4057 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); 4055 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4058 } 4056 }
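This xfs_bmap.c hunk begins a conversion that repeats through the rest of the xfs diffs: open-coded mode tests become the standard S_ISREG()/S_ISDIR() predicates. The two spellings are exactly equivalent given the classic definitions, and the predicates also correct sloppier bitwise tests:

/* Classic definitions, per the kernel's stat headers: */
#define S_ISREG(m)	(((m) & S_IFMT) == S_IFREG)
#define S_ISDIR(m)	(((m) & S_IFMT) == S_IFDIR)

/*
 * So (mode & S_IFMT) == S_IFDIR rewrites 1:1 to S_ISDIR(mode).
 * Bitwise tests such as (mode & S_IFDIR), seen in the xfs_filestream.c
 * hunks below, were subtly wrong: S_IFBLK (0060000) shares bits with
 * S_IFDIR (0040000), so a block device would have passed that test too.
 */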
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cabf4b5604aa..2b9fd385e27d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -275,8 +275,7 @@ xfs_btree_dup_cursor(
275 return error; 275 return error;
276 } 276 }
277 new->bc_bufs[i] = bp; 277 new->bc_bufs[i] = bp;
278 ASSERT(bp); 278 ASSERT(!xfs_buf_geterror(bp));
279 ASSERT(!XFS_BUF_GETERROR(bp));
280 } else 279 } else
281 new->bc_bufs[i] = NULL; 280 new->bc_bufs[i] = NULL;
282 } 281 }
@@ -467,8 +466,7 @@ xfs_btree_get_bufl(
467 ASSERT(fsbno != NULLFSBLOCK); 466 ASSERT(fsbno != NULLFSBLOCK);
468 d = XFS_FSB_TO_DADDR(mp, fsbno); 467 d = XFS_FSB_TO_DADDR(mp, fsbno);
469 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 468 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
470 ASSERT(bp); 469 ASSERT(!xfs_buf_geterror(bp));
471 ASSERT(!XFS_BUF_GETERROR(bp));
472 return bp; 470 return bp;
473} 471}
474 472
@@ -491,8 +489,7 @@ xfs_btree_get_bufs(
491 ASSERT(agbno != NULLAGBLOCK); 489 ASSERT(agbno != NULLAGBLOCK);
492 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 490 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
493 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); 491 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
494 ASSERT(bp); 492 ASSERT(!xfs_buf_geterror(bp));
495 ASSERT(!XFS_BUF_GETERROR(bp));
496 return bp; 493 return bp;
497} 494}
498 495
@@ -632,7 +629,7 @@ xfs_btree_read_bufl(
632 mp->m_bsize, lock, &bp))) { 629 mp->m_bsize, lock, &bp))) {
633 return error; 630 return error;
634 } 631 }
635 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 632 ASSERT(!xfs_buf_geterror(bp));
636 if (bp) 633 if (bp)
637 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); 634 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
638 *bpp = bp; 635 *bpp = bp;
@@ -973,8 +970,7 @@ xfs_btree_get_buf_block(
973 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, 970 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
974 mp->m_bsize, flags); 971 mp->m_bsize, flags);
975 972
976 ASSERT(*bpp); 973 ASSERT(!xfs_buf_geterror(*bpp));
977 ASSERT(!XFS_BUF_GETERROR(*bpp));
978 974
979 *block = XFS_BUF_TO_BLOCK(*bpp); 975 *block = XFS_BUF_TO_BLOCK(*bpp);
980 return 0; 976 return 0;
@@ -1006,8 +1002,7 @@ xfs_btree_read_buf_block(
1006 if (error) 1002 if (error)
1007 return error; 1003 return error;
1008 1004
1009 ASSERT(*bpp != NULL); 1005 ASSERT(!xfs_buf_geterror(*bpp));
1010 ASSERT(!XFS_BUF_GETERROR(*bpp));
1011 1006
1012 xfs_btree_set_refs(cur, *bpp); 1007 xfs_btree_set_refs(cur, *bpp);
1013 *block = XFS_BUF_TO_BLOCK(*bpp); 1008 *block = XFS_BUF_TO_BLOCK(*bpp);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 8d05a6a46ce3..5b240de104c0 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -262,7 +262,7 @@ typedef struct xfs_btree_cur
262/* 262/*
263 * Convert from buffer to btree block header. 263 * Convert from buffer to btree block header.
264 */ 264 */
265#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) 265#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr))
266 266
267 267
268/* 268/*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c
index b2b411985591..c57836dc778f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -596,7 +596,7 @@ _xfs_buf_read(
596 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 596 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
597 597
598 status = xfs_buf_iorequest(bp); 598 status = xfs_buf_iorequest(bp);
599 if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) 599 if (status || bp->b_error || (flags & XBF_ASYNC))
600 return status; 600 return status;
601 return xfs_buf_iowait(bp); 601 return xfs_buf_iowait(bp);
602} 602}
@@ -679,7 +679,6 @@ xfs_buf_read_uncached(
679 /* set up the buffer for a read IO */ 679 /* set up the buffer for a read IO */
680 XFS_BUF_SET_ADDR(bp, daddr); 680 XFS_BUF_SET_ADDR(bp, daddr);
681 XFS_BUF_READ(bp); 681 XFS_BUF_READ(bp);
682 XFS_BUF_BUSY(bp);
683 682
684 xfsbdstrat(mp, bp); 683 xfsbdstrat(mp, bp);
685 error = xfs_buf_iowait(bp); 684 error = xfs_buf_iowait(bp);
@@ -1069,7 +1068,7 @@ xfs_bioerror(
1069 /* 1068 /*
1070 * No need to wait until the buffer is unpinned, we aren't flushing it. 1069 * No need to wait until the buffer is unpinned, we aren't flushing it.
1071 */ 1070 */
1072 XFS_BUF_ERROR(bp, EIO); 1071 xfs_buf_ioerror(bp, EIO);
1073 1072
1074 /* 1073 /*
1075 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. 1074 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
@@ -1094,7 +1093,7 @@ STATIC int
1094xfs_bioerror_relse( 1093xfs_bioerror_relse(
1095 struct xfs_buf *bp) 1094 struct xfs_buf *bp)
1096{ 1095{
1097 int64_t fl = XFS_BUF_BFLAGS(bp); 1096 int64_t fl = bp->b_flags;
1098 /* 1097 /*
1099 * No need to wait until the buffer is unpinned. 1098 * No need to wait until the buffer is unpinned.
1100 * We aren't flushing it. 1099 * We aren't flushing it.
@@ -1115,7 +1114,7 @@ xfs_bioerror_relse(
1115 * There's no reason to mark error for 1114 * There's no reason to mark error for
1116 * ASYNC buffers. 1115 * ASYNC buffers.
1117 */ 1116 */
1118 XFS_BUF_ERROR(bp, EIO); 1117 xfs_buf_ioerror(bp, EIO);
1119 XFS_BUF_FINISH_IOWAIT(bp); 1118 XFS_BUF_FINISH_IOWAIT(bp);
1120 } else { 1119 } else {
1121 xfs_buf_relse(bp); 1120 xfs_buf_relse(bp);
@@ -1224,6 +1223,9 @@ _xfs_buf_ioapply(
1224 rw = READ; 1223 rw = READ;
1225 } 1224 }
1226 1225
1226 /* we only use the buffer cache for meta-data */
1227 rw |= REQ_META;
1228
1227next_chunk: 1229next_chunk:
1228 atomic_inc(&bp->b_io_remaining); 1230 atomic_inc(&bp->b_io_remaining);
1229 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1231 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
@@ -1321,7 +1323,7 @@ xfs_buf_offset(
1321 struct page *page; 1323 struct page *page;
1322 1324
1323 if (bp->b_flags & XBF_MAPPED) 1325 if (bp->b_flags & XBF_MAPPED)
1324 return XFS_BUF_PTR(bp) + offset; 1326 return bp->b_addr + offset;
1325 1327
1326 offset += bp->b_offset; 1328 offset += bp->b_offset;
1327 page = bp->b_pages[offset >> PAGE_SHIFT]; 1329 page = bp->b_pages[offset >> PAGE_SHIFT];
@@ -1481,7 +1483,7 @@ xfs_setsize_buftarg_flags(
1481 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1483 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1482 xfs_warn(btp->bt_mount, 1484 xfs_warn(btp->bt_mount,
1483 "Cannot set_blocksize to %u on device %s\n", 1485 "Cannot set_blocksize to %u on device %s\n",
1484 sectorsize, XFS_BUFTARG_NAME(btp)); 1486 sectorsize, xfs_buf_target_name(btp));
1485 return EINVAL; 1487 return EINVAL;
1486 } 1488 }
1487 1489
@@ -1678,7 +1680,7 @@ xfs_buf_delwri_split(
1678 list_for_each_entry_safe(bp, n, dwq, b_list) { 1680 list_for_each_entry_safe(bp, n, dwq, b_list) {
1679 ASSERT(bp->b_flags & XBF_DELWRI); 1681 ASSERT(bp->b_flags & XBF_DELWRI);
1680 1682
1681 if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { 1683 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
1682 if (!force && 1684 if (!force &&
1683 time_before(jiffies, bp->b_queuetime + age)) { 1685 time_before(jiffies, bp->b_queuetime + age)) {
1684 xfs_buf_unlock(bp); 1686 xfs_buf_unlock(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6a83b46b4bcf..620972b8094d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -228,11 +228,15 @@ extern void xfs_buf_delwri_promote(xfs_buf_t *);
228extern int xfs_buf_init(void); 228extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 229extern void xfs_buf_terminate(void);
230 230
231#define xfs_buf_target_name(target) \ 231static inline const char *
232 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 232xfs_buf_target_name(struct xfs_buftarg *target)
233{
234 static char __b[BDEVNAME_SIZE];
235
236 return bdevname(target->bt_bdev, __b);
237}
233 238
234 239
235#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
236#define XFS_BUF_ZEROFLAGS(bp) \ 240#define XFS_BUF_ZEROFLAGS(bp) \
237 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 241 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
238 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 242 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
@@ -251,23 +255,14 @@ void xfs_buf_stale(struct xfs_buf *bp);
251#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 255#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
252#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 256#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
253 257
254#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
255#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
256#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
257
258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 258#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
259#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 259#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
260#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 260#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
261 261
262#define XFS_BUF_BUSY(bp) do { } while (0)
263#define XFS_BUF_UNBUSY(bp) do { } while (0)
264#define XFS_BUF_ISBUSY(bp) (1)
265
266#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) 262#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
267#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) 263#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
268#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) 264#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
269 265
270#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
271#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 266#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
272#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 267#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
273#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) 268#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
@@ -276,10 +271,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
276#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 271#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
277#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 272#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
278 273
279#define XFS_BUF_SET_START(bp) do { } while (0)
280
281#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
282#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
283#define XFS_BUF_ADDR(bp) ((bp)->b_bn) 274#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
284#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 275#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
285#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) 276#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
@@ -299,14 +290,13 @@ xfs_buf_set_ref(
299#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) 290#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
300#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 291#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
301 292
302#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) 293static inline int xfs_buf_ispinned(struct xfs_buf *bp)
294{
295 return atomic_read(&bp->b_pin_count);
296}
303 297
304#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); 298#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
305 299
306#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
307#define XFS_BUF_TARGET(bp) ((bp)->b_target)
308#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
309
310static inline void xfs_buf_relse(xfs_buf_t *bp) 300static inline void xfs_buf_relse(xfs_buf_t *bp)
311{ 301{
312 xfs_buf_unlock(bp); 302 xfs_buf_unlock(bp);
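Several xfs_buf.h accessors turn from macros into static inline functions here (xfs_buf_target_name, xfs_buf_ispinned): inlines get real parameter types, evaluate their argument exactly once, and cost nothing at runtime. The pinned-count accessor makes a compact before/after:

/* Macro form: no type checking; bp is substituted textually. */
#define XFS_BUF_ISPINNED(bp)	atomic_read(&((bp)->b_pin_count))

/* Inline form, as in the hunk above: bp must really be a
 * struct xfs_buf *, and it is evaluated exactly once. */
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
	return atomic_read(&bp->b_pin_count);
}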
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 88492916c3dc..cac2ecfa6746 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -124,9 +124,9 @@ xfs_buf_item_log_check(
124 124
125 bp = bip->bli_buf; 125 bp = bip->bli_buf;
126 ASSERT(XFS_BUF_COUNT(bp) > 0); 126 ASSERT(XFS_BUF_COUNT(bp) > 0);
127 ASSERT(XFS_BUF_PTR(bp) != NULL); 127 ASSERT(bp->b_addr != NULL);
128 orig = bip->bli_orig; 128 orig = bip->bli_orig;
129 buffer = XFS_BUF_PTR(bp); 129 buffer = bp->b_addr;
130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 130 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { 131 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
132 xfs_emerg(bp->b_mount, 132 xfs_emerg(bp->b_mount,
@@ -371,7 +371,6 @@ xfs_buf_item_pin(
371{ 371{
372 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 372 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
373 373
374 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
375 ASSERT(atomic_read(&bip->bli_refcount) > 0); 374 ASSERT(atomic_read(&bip->bli_refcount) > 0);
376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 375 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
377 (bip->bli_flags & XFS_BLI_STALE)); 376 (bip->bli_flags & XFS_BLI_STALE));
@@ -479,13 +478,13 @@ xfs_buf_item_trylock(
479 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
480 struct xfs_buf *bp = bip->bli_buf; 479 struct xfs_buf *bp = bip->bli_buf;
481 480
482 if (XFS_BUF_ISPINNED(bp)) 481 if (xfs_buf_ispinned(bp))
483 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
484 if (!xfs_buf_trylock(bp)) 483 if (!xfs_buf_trylock(bp))
485 return XFS_ITEM_LOCKED; 484 return XFS_ITEM_LOCKED;
486 485
487 /* take a reference to the buffer. */ 486 /* take a reference to the buffer. */
488 XFS_BUF_HOLD(bp); 487 xfs_buf_hold(bp);
489 488
490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
491 trace_xfs_buf_item_trylock(bip); 490 trace_xfs_buf_item_trylock(bip);
@@ -726,7 +725,7 @@ xfs_buf_item_init(
726 * to have logged. 725 * to have logged.
727 */ 726 */
728 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 727 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
729 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); 728 memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp));
730 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 729 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
731#endif 730#endif
732 731
@@ -895,7 +894,6 @@ xfs_buf_attach_iodone(
895{ 894{
896 xfs_log_item_t *head_lip; 895 xfs_log_item_t *head_lip;
897 896
898 ASSERT(XFS_BUF_ISBUSY(bp));
899 ASSERT(xfs_buf_islocked(bp)); 897 ASSERT(xfs_buf_islocked(bp));
900 898
901 lip->li_cb = cb; 899 lip->li_cb = cb;
@@ -960,7 +958,7 @@ xfs_buf_iodone_callbacks(
960 static ulong lasttime; 958 static ulong lasttime;
961 static xfs_buftarg_t *lasttarg; 959 static xfs_buftarg_t *lasttarg;
962 960
963 if (likely(!XFS_BUF_GETERROR(bp))) 961 if (likely(!xfs_buf_geterror(bp)))
964 goto do_callbacks; 962 goto do_callbacks;
965 963
966 /* 964 /*
@@ -973,14 +971,14 @@ xfs_buf_iodone_callbacks(
973 goto do_callbacks; 971 goto do_callbacks;
974 } 972 }
975 973
976 if (XFS_BUF_TARGET(bp) != lasttarg || 974 if (bp->b_target != lasttarg ||
977 time_after(jiffies, (lasttime + 5*HZ))) { 975 time_after(jiffies, (lasttime + 5*HZ))) {
978 lasttime = jiffies; 976 lasttime = jiffies;
979 xfs_alert(mp, "Device %s: metadata write error block 0x%llx", 977 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
980 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 978 xfs_buf_target_name(bp->b_target),
981 (__uint64_t)XFS_BUF_ADDR(bp)); 979 (__uint64_t)XFS_BUF_ADDR(bp));
982 } 980 }
983 lasttarg = XFS_BUF_TARGET(bp); 981 lasttarg = bp->b_target;
984 982
985 /* 983 /*
986 * If the write was asynchronous then no one will be looking for the 984 * If the write was asynchronous then no one will be looking for the
@@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks(
991 * around. 989 * around.
992 */ 990 */
993 if (XFS_BUF_ISASYNC(bp)) { 991 if (XFS_BUF_ISASYNC(bp)) {
994 XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ 992 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
995 993
996 if (!XFS_BUF_ISSTALE(bp)) { 994 if (!XFS_BUF_ISSTALE(bp)) {
997 XFS_BUF_DELAYWRITE(bp); 995 XFS_BUF_DELAYWRITE(bp);
998 XFS_BUF_DONE(bp); 996 XFS_BUF_DONE(bp);
999 XFS_BUF_SET_START(bp);
1000 } 997 }
1001 ASSERT(bp->b_iodone != NULL); 998 ASSERT(bp->b_iodone != NULL);
1002 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 999 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1013,7 +1010,6 @@ xfs_buf_iodone_callbacks(
1013 XFS_BUF_UNDELAYWRITE(bp); 1010 XFS_BUF_UNDELAYWRITE(bp);
1014 1011
1015 trace_xfs_buf_error_relse(bp, _RET_IP_); 1012 trace_xfs_buf_error_relse(bp, _RET_IP_);
1016 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1017 1013
1018do_callbacks: 1014do_callbacks:
1019 xfs_buf_do_callbacks(bp); 1015 xfs_buf_do_callbacks(bp);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 2925726529f8..ee9d5427fcd4 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -692,6 +692,24 @@ xfs_da_join(xfs_da_state_t *state)
692 return(error); 692 return(error);
693} 693}
694 694
695#ifdef DEBUG
696static void
697xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
698{
699 __be16 magic = blkinfo->magic;
700
701 if (level == 1) {
702 ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
703 magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
704 } else
705 ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
706 ASSERT(!blkinfo->forw);
707 ASSERT(!blkinfo->back);
708}
709#else /* !DEBUG */
710#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
711#endif /* !DEBUG */
712
695/* 713/*
696 * We have only one entry in the root. Copy the only remaining child of 714 * We have only one entry in the root. Copy the only remaining child of
697 * the old root to block 0 as the new root node. 715 * the old root to block 0 as the new root node.
@@ -700,8 +718,6 @@ STATIC int
700xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) 718xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
701{ 719{
702 xfs_da_intnode_t *oldroot; 720 xfs_da_intnode_t *oldroot;
703 /* REFERENCED */
704 xfs_da_blkinfo_t *blkinfo;
705 xfs_da_args_t *args; 721 xfs_da_args_t *args;
706 xfs_dablk_t child; 722 xfs_dablk_t child;
707 xfs_dabuf_t *bp; 723 xfs_dabuf_t *bp;
@@ -732,15 +748,9 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
732 if (error) 748 if (error)
733 return(error); 749 return(error);
734 ASSERT(bp != NULL); 750 ASSERT(bp != NULL);
735 blkinfo = bp->data; 751 xfs_da_blkinfo_onlychild_validate(bp->data,
736 if (be16_to_cpu(oldroot->hdr.level) == 1) { 752 be16_to_cpu(oldroot->hdr.level));
737 ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || 753
738 blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
739 } else {
740 ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
741 }
742 ASSERT(!blkinfo->forw);
743 ASSERT(!blkinfo->back);
744 memcpy(root_blk->bp->data, bp->data, state->blocksize); 754 memcpy(root_blk->bp->data, bp->data, state->blocksize);
745 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 755 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
746 error = xfs_da_shrink_inode(args, child, bp); 756 error = xfs_da_shrink_inode(args, child, bp);
@@ -2040,7 +2050,7 @@ xfs_da_do_buf(
2040 case 0: 2050 case 0:
2041 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, 2051 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
2042 mappedbno, nmapped, 0); 2052 mappedbno, nmapped, 0);
2043 error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); 2053 error = bp ? bp->b_error : XFS_ERROR(EIO);
2044 break; 2054 break;
2045 case 1: 2055 case 1:
2046 case 2: 2056 case 2:
@@ -2258,7 +2268,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2258 dabuf->nbuf = 1; 2268 dabuf->nbuf = 1;
2259 bp = bps[0]; 2269 bp = bps[0];
2260 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); 2270 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
2261 dabuf->data = XFS_BUF_PTR(bp); 2271 dabuf->data = bp->b_addr;
2262 dabuf->bps[0] = bp; 2272 dabuf->bps[0] = bp;
2263 } else { 2273 } else {
2264 dabuf->nbuf = nbuf; 2274 dabuf->nbuf = nbuf;
@@ -2269,7 +2279,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps)
2269 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); 2279 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2270 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { 2280 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2271 bp = bps[i]; 2281 bp = bps[i];
2272 memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), 2282 memcpy((char *)dabuf->data + off, bp->b_addr,
2273 XFS_BUF_COUNT(bp)); 2283 XFS_BUF_COUNT(bp));
2274 } 2284 }
2275 } 2285 }
@@ -2292,8 +2302,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2292 for (i = off = 0; i < dabuf->nbuf; 2302 for (i = off = 0; i < dabuf->nbuf;
2293 i++, off += XFS_BUF_COUNT(bp)) { 2303 i++, off += XFS_BUF_COUNT(bp)) {
2294 bp = dabuf->bps[i]; 2304 bp = dabuf->bps[i];
2295 memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, 2305 memcpy(bp->b_addr, dabuf->data + off,
2296 XFS_BUF_COUNT(bp)); 2306 XFS_BUF_COUNT(bp));
2297 } 2307 }
2298 } 2308 }
2299} 2309}
@@ -2330,7 +2340,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2330 2340
2331 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); 2341 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2332 if (dabuf->nbuf == 1) { 2342 if (dabuf->nbuf == 1) {
2333 ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); 2343 ASSERT(dabuf->data == dabuf->bps[0]->b_addr);
2334 xfs_trans_log_buf(tp, dabuf->bps[0], first, last); 2344 xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
2335 return; 2345 return;
2336 } 2346 }
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index dffba9ba0db6..a3721633abc8 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt {
148 be32_to_cpu((dip)->di_nextents) : \ 148 be32_to_cpu((dip)->di_nextents) : \
149 be16_to_cpu((dip)->di_anextents)) 149 be16_to_cpu((dip)->di_anextents))
150 150
151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 151#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr))
152 152
153/* 153/*
154 * For block and character special files the 32bit dev_t is stored at the 154 * For block and character special files the 32bit dev_t is stored at the
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 4580ce00aeb4..a2e27010c7fb 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -121,7 +121,7 @@ xfs_dir_isempty(
121{ 121{
122 xfs_dir2_sf_hdr_t *sfp; 122 xfs_dir2_sf_hdr_t *sfp;
123 123
124 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 124 ASSERT(S_ISDIR(dp->i_d.di_mode));
125 if (dp->i_d.di_size == 0) /* might happen during shutdown. */ 125 if (dp->i_d.di_size == 0) /* might happen during shutdown. */
126 return 1; 126 return 1;
127 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) 127 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -179,7 +179,7 @@ xfs_dir_init(
179 memset((char *)&args, 0, sizeof(args)); 179 memset((char *)&args, 0, sizeof(args));
180 args.dp = dp; 180 args.dp = dp;
181 args.trans = tp; 181 args.trans = tp;
182 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 182 ASSERT(S_ISDIR(dp->i_d.di_mode));
183 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 183 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
184 return error; 184 return error;
185 return xfs_dir2_sf_create(&args, pdp->i_ino); 185 return xfs_dir2_sf_create(&args, pdp->i_ino);
@@ -202,7 +202,7 @@ xfs_dir_createname(
202 int rval; 202 int rval;
203 int v; /* type-checking value */ 203 int v; /* type-checking value */
204 204
205 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 205 ASSERT(S_ISDIR(dp->i_d.di_mode));
206 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 206 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
207 return rval; 207 return rval;
208 XFS_STATS_INC(xs_dir_create); 208 XFS_STATS_INC(xs_dir_create);
@@ -278,7 +278,7 @@ xfs_dir_lookup(
278 int rval; 278 int rval;
279 int v; /* type-checking value */ 279 int v; /* type-checking value */
280 280
281 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 281 ASSERT(S_ISDIR(dp->i_d.di_mode));
282 XFS_STATS_INC(xs_dir_lookup); 282 XFS_STATS_INC(xs_dir_lookup);
283 283
284 memset(&args, 0, sizeof(xfs_da_args_t)); 284 memset(&args, 0, sizeof(xfs_da_args_t));
@@ -333,7 +333,7 @@ xfs_dir_removename(
333 int rval; 333 int rval;
334 int v; /* type-checking value */ 334 int v; /* type-checking value */
335 335
336 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 336 ASSERT(S_ISDIR(dp->i_d.di_mode));
337 XFS_STATS_INC(xs_dir_remove); 337 XFS_STATS_INC(xs_dir_remove);
338 338
339 memset(&args, 0, sizeof(xfs_da_args_t)); 339 memset(&args, 0, sizeof(xfs_da_args_t));
@@ -382,7 +382,7 @@ xfs_readdir(
382 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 382 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
383 return XFS_ERROR(EIO); 383 return XFS_ERROR(EIO);
384 384
385 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 385 ASSERT(S_ISDIR(dp->i_d.di_mode));
386 XFS_STATS_INC(xs_dir_getdents); 386 XFS_STATS_INC(xs_dir_getdents);
387 387
388 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 388 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -414,7 +414,7 @@ xfs_dir_replace(
414 int rval; 414 int rval;
415 int v; /* type-checking value */ 415 int v; /* type-checking value */
416 416
417 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 417 ASSERT(S_ISDIR(dp->i_d.di_mode));
418 418
419 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 419 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
420 return rval; 420 return rval;
@@ -464,7 +464,7 @@ xfs_dir_canenter(
464 if (resblks) 464 if (resblks)
465 return 0; 465 return 0;
466 466
467 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 467 ASSERT(S_ISDIR(dp->i_d.di_mode));
468 468
469 memset(&args, 0, sizeof(xfs_da_args_t)); 469 memset(&args, 0, sizeof(xfs_da_args_t));
470 args.name = name->name; 470 args.name = name->name;
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c
index 244e797dae32..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h
index 344879aea646..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 837f31158d43..db62959bed13 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk(
318 int curid, i; 318 int curid, i;
319 319
320 ASSERT(tp); 320 ASSERT(tp);
321 ASSERT(XFS_BUF_ISBUSY(bp));
322 ASSERT(xfs_buf_islocked(bp)); 321 ASSERT(xfs_buf_islocked(bp));
323 322
324 d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); 323 d = bp->b_addr;
325 324
326 /* 325 /*
327 * ID of the first dquot in the block - id's are zero based. 326 * ID of the first dquot in the block - id's are zero based.
@@ -403,7 +402,7 @@ xfs_qm_dqalloc(
403 dqp->q_blkno, 402 dqp->q_blkno,
404 mp->m_quotainfo->qi_dqchunklen, 403 mp->m_quotainfo->qi_dqchunklen,
405 0); 404 0);
406 if (!bp || (error = XFS_BUF_GETERROR(bp))) 405 if (!bp || (error = xfs_buf_geterror(bp)))
407 goto error1; 406 goto error1;
408 /* 407 /*
409 * Make a chunk of dquots out of this buffer and log 408 * Make a chunk of dquots out of this buffer and log
@@ -534,13 +533,12 @@ xfs_qm_dqtobp(
534 return XFS_ERROR(error); 533 return XFS_ERROR(error);
535 } 534 }
536 535
537 ASSERT(XFS_BUF_ISBUSY(bp));
538 ASSERT(xfs_buf_islocked(bp)); 536 ASSERT(xfs_buf_islocked(bp));
539 537
540 /* 538 /*
541 * calculate the location of the dquot inside the buffer. 539 * calculate the location of the dquot inside the buffer.
542 */ 540 */
543 ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 541 ddq = bp->b_addr + dqp->q_bufoffset;
544 542
545 /* 543 /*
546 * A simple sanity check in case we got a corrupted dquot... 544 * A simple sanity check in case we got a corrupted dquot...
@@ -553,7 +551,6 @@ xfs_qm_dqtobp(
553 xfs_trans_brelse(tp, bp); 551 xfs_trans_brelse(tp, bp);
554 return XFS_ERROR(EIO); 552 return XFS_ERROR(EIO);
555 } 553 }
556 XFS_BUF_BUSY(bp); /* We dirtied this */
557 } 554 }
558 555
559 *O_bpp = bp; 556 *O_bpp = bp;
@@ -622,7 +619,6 @@ xfs_qm_dqread(
622 * this particular dquot was repaired. We still aren't afraid to 619 * this particular dquot was repaired. We still aren't afraid to
623 * brelse it because we have the changes incore. 620 * brelse it because we have the changes incore.
624 */ 621 */
625 ASSERT(XFS_BUF_ISBUSY(bp));
626 ASSERT(xfs_buf_islocked(bp)); 622 ASSERT(xfs_buf_islocked(bp));
627 xfs_trans_brelse(tp, bp); 623 xfs_trans_brelse(tp, bp);
628 624
@@ -1204,7 +1200,7 @@ xfs_qm_dqflush(
1204 /* 1200 /*
1205 * Calculate the location of the dquot inside the buffer. 1201 * Calculate the location of the dquot inside the buffer.
1206 */ 1202 */
1207 ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); 1203 ddqp = bp->b_addr + dqp->q_bufoffset;
1208 1204
1209 /* 1205 /*
1210 * A simple sanity check in case we got a corrupted dquot.. 1206 * A simple sanity check in case we got a corrupted dquot..
@@ -1240,7 +1236,7 @@ xfs_qm_dqflush(
1240 * If the buffer is pinned then push on the log so we won't 1236 * If the buffer is pinned then push on the log so we won't
1241 * get stuck waiting in the write for too long. 1237 * get stuck waiting in the write for too long.
1242 */ 1238 */
1243 if (XFS_BUF_ISPINNED(bp)) { 1239 if (xfs_buf_ispinned(bp)) {
1244 trace_xfs_dqflush_force(dqp); 1240 trace_xfs_dqflush_force(dqp);
1245 xfs_log_force(mp, 0); 1241 xfs_log_force(mp, 0);
1246 } 1242 }
@@ -1447,7 +1443,7 @@ xfs_qm_dqflock_pushbuf_wait(
1447 goto out_lock; 1443 goto out_lock;
1448 1444
1449 if (XFS_BUF_ISDELAYWRITE(bp)) { 1445 if (XFS_BUF_ISDELAYWRITE(bp)) {
1450 if (XFS_BUF_ISPINNED(bp)) 1446 if (xfs_buf_ispinned(bp))
1451 xfs_log_force(mp, 0); 1447 xfs_log_force(mp, 0);
1452 xfs_buf_delwri_promote(bp); 1448 xfs_buf_delwri_promote(bp);
1453 wake_up_process(bp->b_target->bt_task); 1449 wake_up_process(bp->b_target->bt_task);
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 34b7e945dbfa..34b7e945dbfa 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c
index 75e5d322e48f..75e5d322e48f 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/xfs_export.c
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h
index 3272b6ae7a35..3272b6ae7a35 100644
--- a/fs/xfs/linux-2.6/xfs_export.h
+++ b/fs/xfs/xfs_export.h
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c
index 825390e1c138..7f7b42469ea7 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -149,7 +149,9 @@ xfs_file_fsync(
149 149
150 xfs_iflags_clear(ip, XFS_ITRUNCATED); 150 xfs_iflags_clear(ip, XFS_ITRUNCATED);
151 151
152 xfs_ilock(ip, XFS_IOLOCK_SHARED);
152 xfs_ioend_wait(ip); 153 xfs_ioend_wait(ip);
154 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
153 155
154 if (mp->m_flags & XFS_MOUNT_BARRIER) { 156 if (mp->m_flags & XFS_MOUNT_BARRIER) {
155 /* 157 /*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 9124425b7f2f..3ff3d9e23ded 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -344,9 +344,9 @@ _xfs_filestream_update_ag(
344 * Either ip is a regular file and pip is a directory, or ip is a 344 * Either ip is a regular file and pip is a directory, or ip is a
345 * directory and pip is NULL. 345 * directory and pip is NULL.
346 */ 346 */
347 ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && 347 ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
348 (pip->i_d.di_mode & S_IFDIR)) || 348 S_ISDIR(pip->i_d.di_mode)) ||
349 ((ip->i_d.di_mode & S_IFDIR) && !pip))); 349 (S_ISDIR(ip->i_d.di_mode) && !pip)));
350 350
351 mp = ip->i_mount; 351 mp = ip->i_mount;
352 cache = mp->m_filestream; 352 cache = mp->m_filestream;
@@ -537,7 +537,7 @@ xfs_filestream_lookup_ag(
537 xfs_agnumber_t ag; 537 xfs_agnumber_t ag;
538 int ref; 538 int ref;
539 539
540 if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { 540 if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
541 ASSERT(0); 541 ASSERT(0);
542 return NULLAGNUMBER; 542 return NULLAGNUMBER;
543 } 543 }
@@ -579,9 +579,9 @@ xfs_filestream_associate(
579 xfs_agnumber_t ag, rotorstep, startag; 579 xfs_agnumber_t ag, rotorstep, startag;
580 int err = 0; 580 int err = 0;
581 581
582 ASSERT(pip->i_d.di_mode & S_IFDIR); 582 ASSERT(S_ISDIR(pip->i_d.di_mode));
583 ASSERT(ip->i_d.di_mode & S_IFREG); 583 ASSERT(S_ISREG(ip->i_d.di_mode));
584 if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) 584 if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
585 return -EINVAL; 585 return -EINVAL;
586 586
587 mp = pip->i_mount; 587 mp = pip->i_mount;
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..ed88ed16811c 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..76e81cff70b9 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index dd5628bd8d0b..9f24ec28283b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -202,8 +202,7 @@ xfs_ialloc_inode_init(
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 203 mp->m_bsize * blks_per_cluster,
204 XBF_LOCK); 204 XBF_LOCK);
205 ASSERT(fbuf); 205 ASSERT(!xfs_buf_geterror(fbuf));
206 ASSERT(!XFS_BUF_GETERROR(fbuf));
207 206
208 /* 207 /*
209 * Initialize all inodes in this buffer and then log them. 208 * Initialize all inodes in this buffer and then log them.
@@ -1486,7 +1485,7 @@ xfs_read_agi(
1486 if (error) 1485 if (error)
1487 return error; 1486 return error;
1488 1487
1489 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); 1488 ASSERT(!xfs_buf_geterror(*bpp));
1490 agi = XFS_BUF_TO_AGI(*bpp); 1489 agi = XFS_BUF_TO_AGI(*bpp);
1491 1490
1492 /* 1491 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cc21ddf9f7e..0239a7c7c886 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -368,7 +368,7 @@ xfs_iformat(
368 /* 368 /*
369 * no local regular files yet 369 * no local regular files yet
370 */ 370 */
371 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 371 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
372 xfs_warn(ip->i_mount, 372 xfs_warn(ip->i_mount,
373 "corrupt inode %Lu (local format for regular file).", 373 "corrupt inode %Lu (local format for regular file).",
374 (unsigned long long) ip->i_ino); 374 (unsigned long long) ip->i_ino);
@@ -1040,7 +1040,7 @@ xfs_ialloc(
1040 1040
1041 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
1042 ip->i_d.di_gid = pip->i_d.di_gid; 1042 ip->i_d.di_gid = pip->i_d.di_gid;
1043 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1043 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
1044 ip->i_d.di_mode |= S_ISGID; 1044 ip->i_d.di_mode |= S_ISGID;
1045 } 1045 }
1046 } 1046 }
@@ -1097,14 +1097,14 @@ xfs_ialloc(
1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1098 uint di_flags = 0; 1098 uint di_flags = 0;
1099 1099
1100 if ((mode & S_IFMT) == S_IFDIR) { 1100 if (S_ISDIR(mode)) {
1101 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1101 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1102 di_flags |= XFS_DIFLAG_RTINHERIT; 1102 di_flags |= XFS_DIFLAG_RTINHERIT;
1103 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1103 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1104 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 1104 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1105 ip->i_d.di_extsize = pip->i_d.di_extsize; 1105 ip->i_d.di_extsize = pip->i_d.di_extsize;
1106 } 1106 }
1107 } else if ((mode & S_IFMT) == S_IFREG) { 1107 } else if (S_ISREG(mode)) {
1108 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1108 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1109 di_flags |= XFS_DIFLAG_REALTIME; 1109 di_flags |= XFS_DIFLAG_REALTIME;
1110 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1110 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
@@ -1188,7 +1188,7 @@ xfs_isize_check(
1188 int nimaps; 1188 int nimaps;
1189 xfs_bmbt_irec_t imaps[2]; 1189 xfs_bmbt_irec_t imaps[2];
1190 1190
1191 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1191 if (!S_ISREG(ip->i_d.di_mode))
1192 return; 1192 return;
1193 1193
1194 if (XFS_IS_REALTIME_INODE(ip)) 1194 if (XFS_IS_REALTIME_INODE(ip))
@@ -1828,7 +1828,7 @@ xfs_ifree(
1828 ASSERT(ip->i_d.di_nextents == 0); 1828 ASSERT(ip->i_d.di_nextents == 0);
1829 ASSERT(ip->i_d.di_anextents == 0); 1829 ASSERT(ip->i_d.di_anextents == 0);
1830 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1830 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
1831 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 1831 (!S_ISREG(ip->i_d.di_mode)));
1832 ASSERT(ip->i_d.di_nblocks == 0); 1832 ASSERT(ip->i_d.di_nblocks == 0);
1833 1833
1834 /* 1834 /*
@@ -2473,7 +2473,7 @@ cluster_corrupt_out:
2473 if (bp->b_iodone) { 2473 if (bp->b_iodone) {
2474 XFS_BUF_UNDONE(bp); 2474 XFS_BUF_UNDONE(bp);
2475 XFS_BUF_STALE(bp); 2475 XFS_BUF_STALE(bp);
2476 XFS_BUF_ERROR(bp,EIO); 2476 xfs_buf_ioerror(bp, EIO);
2477 xfs_buf_ioend(bp, 0); 2477 xfs_buf_ioend(bp, 0);
2478 } else { 2478 } else {
2479 XFS_BUF_STALE(bp); 2479 XFS_BUF_STALE(bp);
@@ -2585,7 +2585,7 @@ xfs_iflush(
 	 * If the buffer is pinned then push on the log now so we won't
 	 * get stuck waiting in the write for too long.
 	 */
-	if (XFS_BUF_ISPINNED(bp))
+	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
 
 	/*
@@ -2671,7 +2671,7 @@ xfs_iflush_int(
 			__func__, ip->i_ino, ip, ip->i_d.di_magic);
 		goto corrupt_out;
 	}
-	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+	if (S_ISREG(ip->i_d.di_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -2681,7 +2681,7 @@ xfs_iflush_int(
 				__func__, ip->i_ino, ip);
 			goto corrupt_out;
 		}
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+	} else if (S_ISDIR(ip->i_d.di_mode)) {
 		if (XFS_TEST_ERROR(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
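All of the xfs_inode.c hunks above are the same mechanical substitution: open-coded (mode & S_IFMT) == S_IFREG style tests become the standard S_ISREG()/S_ISDIR()/S_ISLNK() predicates, and the rest of this diff repeats the pattern across the tree. The predicates are defined in the kernel's stat headers in terms of the very same S_IFMT mask, so the conversion cannot change behaviour; a minimal sketch of the equivalence (mode_checks_match is a hypothetical helper, not part of the patch):

#include <linux/stat.h>
#include <linux/types.h>

/* S_ISREG(m) expands to (((m) & S_IFMT) == S_IFREG), and likewise for
 * S_ISDIR()/S_ISLNK(), so both spellings always agree. */
static int mode_checks_match(umode_t mode)
{
	return S_ISREG(mode) == (((mode) & S_IFMT) == S_IFREG) &&
	       S_ISDIR(mode) == (((mode) & S_IFMT) == S_IFDIR) &&
	       S_ISLNK(mode) == (((mode) & S_IFMT) == S_IFLNK);
}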
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a97644ab945a..2380a4bcbece 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -263,7 +263,7 @@ typedef struct xfs_inode {
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
 
-#define XFS_ISIZE(ip)	(((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
+#define XFS_ISIZE(ip)	S_ISREG((ip)->i_d.di_mode) ? \
 				(ip)->i_size : (ip)->i_d.di_size;
 
 /* Convert from vfs inode to xfs inode */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index acca2c5ca3fa..f7ce7debe14c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -265,7 +265,7 @@ xfs_open_by_handle(
 		return PTR_ERR(filp);
 	}
 
-	if (inode->i_mode & S_IFREG) {
+	if (S_ISREG(inode->i_mode)) {
 		filp->f_flags |= O_NOATIME;
 		filp->f_mode |= FMODE_NOCMTIME;
 	}
@@ -850,14 +850,14 @@ xfs_set_diflags(
 		di_flags |= XFS_DIFLAG_NODEFRAG;
 	if (xflags & XFS_XFLAG_FILESTREAM)
 		di_flags |= XFS_DIFLAG_FILESTREAM;
-	if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+	if (S_ISDIR(ip->i_d.di_mode)) {
 		if (xflags & XFS_XFLAG_RTINHERIT)
 			di_flags |= XFS_DIFLAG_RTINHERIT;
 		if (xflags & XFS_XFLAG_NOSYMLINKS)
 			di_flags |= XFS_DIFLAG_NOSYMLINKS;
 		if (xflags & XFS_XFLAG_EXTSZINHERIT)
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+	} else if (S_ISREG(ip->i_d.di_mode)) {
 		if (xflags & XFS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
 		if (xflags & XFS_XFLAG_EXTSIZE)
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2a..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 54e623bfbb85..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c
index 6544c3236bc8..673704fab748 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -70,9 +70,8 @@ xfs_synchronize_times(
 }
 
 /*
- * If the linux inode is valid, mark it dirty.
- * Used when committing a dirty inode into a transaction so that
- * the inode will get written back by the linux code
+ * If the linux inode is valid, mark it dirty, else mark the dirty state
+ * in the XFS inode to make sure we pick it up when reclaiming the inode.
  */
 void
 xfs_mark_inode_dirty_sync(
@@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync(
 
 	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty_sync(inode);
+	else {
+		barrier();
+		ip->i_update_core = 1;
+	}
 }
 
 void
@@ -92,6 +95,11 @@ xfs_mark_inode_dirty(
 
 	if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
 		mark_inode_dirty(inode);
+	else {
+		barrier();
+		ip->i_update_core = 1;
+	}
+
 }
 
 /*
97/* 105/*
@@ -1194,9 +1202,14 @@ xfs_setup_inode(
 		break;
 	}
 
-	/* if there is no attribute fork no ACL can exist on this inode */
-	if (!XFS_IFORK_Q(ip))
+	/*
+	 * If there is no attribute fork no ACL can exist on this inode,
+	 * and it can't have any file capabilities attached to it either.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		inode_has_no_xattr(inode);
 		cache_no_acl(inode);
+	}
 
 	xfs_iflags_clear(ip, XFS_INEW);
 	barrier();
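The new else branches above cover inodes that are already in I_WILL_FREE/I_FREEING state, for which mark_inode_dirty*() would be a no-op: the dirty state is instead recorded in the XFS-private i_update_core flag so the inode core still gets written back when the inode is reclaimed. Condensed, the pattern both functions now share looks like this (xfs_dirty_inode_sketch is a hypothetical name for illustration only):

static void xfs_dirty_inode_sketch(struct inode *inode, struct xfs_inode *ip)
{
	if (!(inode->i_state & (I_WILL_FREE | I_FREEING)))
		mark_inode_dirty(inode);	/* normal VFS writeback path */
	else {
		barrier();			/* compiler barrier, as in the patch */
		ip->i_update_core = 1;		/* picked up at inode reclaim */
	}
}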
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66e..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h
index d42f814e4d35..1e8a45e74c3e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,13 +32,12 @@
 # define XFS_BIG_INUMS	0
 #endif
 
-#include <xfs_types.h>
+#include "xfs_types.h"
 
-#include <kmem.h>
-#include <mrlock.h>
-#include <time.h>
-
-#include <support/uuid.h>
+#include "kmem.h"
+#include "mrlock.h"
+#include "time.h"
+#include "uuid.h"
 
 #include <linux/semaphore.h>
 #include <linux/mm.h>
@@ -78,14 +77,14 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_vnode.h>
-#include <xfs_stats.h>
-#include <xfs_sysctl.h>
-#include <xfs_iops.h>
-#include <xfs_aops.h>
-#include <xfs_super.h>
-#include <xfs_buf.h>
-#include <xfs_message.h>
+#include "xfs_vnode.h"
+#include "xfs_stats.h"
+#include "xfs_sysctl.h"
+#include "xfs_iops.h"
+#include "xfs_aops.h"
+#include "xfs_super.h"
+#include "xfs_buf.h"
+#include "xfs_message.h"
 
 #ifdef __BIG_ENDIAN
 #define XFS_NATIVE_HOST 1
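The angle-bracket to quoted-include conversion goes hand in hand with the directory flattening visible throughout this diff (the fs/xfs/linux-2.6/ and fs/xfs/quota/ files moving into fs/xfs/): quoted includes are searched relative to the including file first, so once everything lives in one directory no extra -I search paths are needed from the build. Illustratively (not part of the patch):

#include "xfs_types.h"	/* quoted form: found next to the including file */
/* #include <xfs_types.h>   bracket form: relies on -I paths from the build */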
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 06ff8437ed8e..3a8d4f66d702 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -878,7 +878,7 @@ xlog_iodone(xfs_buf_t *bp)
 	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
-	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
+	if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
 			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
 		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
 		XFS_BUF_STALE(bp);
@@ -1051,7 +1051,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	if (!bp)
 		goto out_free_log;
 	bp->b_iodone = xlog_iodone;
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(xfs_buf_islocked(bp));
 	log->l_xbuf = bp;
 
@@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 		iclog->ic_callback_tail = &(iclog->ic_callback);
 		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
 
-		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
 		ASSERT(xfs_buf_islocked(iclog->ic_bp));
 		init_waitqueue_head(&iclog->ic_force_wait);
 		init_waitqueue_head(&iclog->ic_write_wait);
@@ -1248,7 +1246,7 @@ xlog_bdstrat(
 	struct xlog_in_core	*iclog = bp->b_fspriv;
 
 	if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		XFS_BUF_ERROR(bp, EIO);
+		xfs_buf_ioerror(bp, EIO);
 		XFS_BUF_STALE(bp);
 		xfs_buf_ioend(bp, 0);
 		/*
@@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log,
 	XFS_BUF_SET_COUNT(bp, count);
 	bp->b_fspriv = iclog;
 	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_SYNCIO;
 
@@ -1398,16 +1395,15 @@ xlog_sync(xlog_t *log,
 	if (split) {
 		bp = iclog->ic_log->l_xbuf;
 		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
-		XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
-					(__psint_t)count), split);
+		xfs_buf_associate_memory(bp,
+				(char *)&iclog->ic_header + count, split);
 		bp->b_fspriv = iclog;
 		XFS_BUF_ZEROFLAGS(bp);
-		XFS_BUF_BUSY(bp);
 		XFS_BUF_ASYNC(bp);
 		bp->b_flags |= XBF_SYNCIO;
 		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 			bp->b_flags |= XBF_FUA;
-		dptr = XFS_BUF_PTR(bp);
+		dptr = bp->b_addr;
 		/*
 		 * Bump the cycle numbers at the start of each block
 		 * since this part of the buffer is at the start of
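Several hunks in this file swap the old uppercase buffer macros for typed helpers or direct field access: XFS_BUF_GETERROR(bp) becomes xfs_buf_geterror(bp) or a plain read of bp->b_error, XFS_BUF_ERROR(bp, EIO) becomes xfs_buf_ioerror(bp, EIO), XFS_BUF_ISPINNED(bp) becomes xfs_buf_ispinned(bp), and XFS_BUF_PTR(bp) becomes bp->b_addr. A minimal sketch of the post-conversion idiom, using only helpers that appear in these hunks (fail_log_buffer itself is hypothetical):

/* Sketch: fail an in-flight log buffer the way xlog_bdstrat() now does. */
static void fail_log_buffer(struct xfs_buf *bp)
{
	xfs_buf_ioerror(bp, EIO);	/* record the error in bp->b_error */
	XFS_BUF_STALE(bp);		/* don't let the contents be reused */
	xfs_buf_ioend(bp, 0);		/* run I/O completion immediately */
}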
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8fe4206de057..a199dbcee7d8 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_align(
 	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 
 	ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
-	return XFS_BUF_PTR(bp) + BBTOB(offset);
+	return bp->b_addr + BBTOB(offset);
 }
 
 
@@ -178,9 +178,7 @@ xlog_bread_noalign(
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 	XFS_BUF_READ(bp);
-	XFS_BUF_BUSY(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
-	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
@@ -220,18 +218,18 @@ xlog_bread_offset(
 	xfs_buf_t	*bp,
 	xfs_caddr_t	offset)
 {
-	xfs_caddr_t	orig_offset = XFS_BUF_PTR(bp);
+	xfs_caddr_t	orig_offset = bp->b_addr;
 	int		orig_len = bp->b_buffer_length;
 	int		error, error2;
 
-	error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 	if (error)
 		return error;
 
 	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 
 	/* must reset buffer pointer even on error */
-	error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 	if (error)
 		return error;
 	return error2;
@@ -266,11 +264,9 @@ xlog_bwrite(
 
 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 	XFS_BUF_ZEROFLAGS(bp);
-	XFS_BUF_BUSY(bp);
-	XFS_BUF_HOLD(bp);
+	xfs_buf_hold(bp);
 	xfs_buf_lock(bp);
 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
-	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
 
 	if ((error = xfs_bwrite(log->l_mp, bp)))
 		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
@@ -360,7 +356,7 @@ STATIC void
 xlog_recover_iodone(
 	struct xfs_buf	*bp)
 {
-	if (XFS_BUF_GETERROR(bp)) {
+	if (bp->b_error) {
 		/*
 		 * We're not going to bother about retrying
 		 * this during recovery. One strike!
@@ -1262,7 +1258,7 @@ xlog_write_log_records(
 	 */
 	ealign = round_down(end_block, sectbb);
 	if (j == 0 && (start_block + endcount > ealign)) {
-		offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+		offset = bp->b_addr + BBTOB(ealign - start_block);
 		error = xlog_bread_offset(log, ealign, sectbb,
 					bp, offset);
 		if (error)
@@ -2135,15 +2131,16 @@ xlog_recover_buffer_pass2(
 
 	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
 			  buf_flags);
-	if (XFS_BUF_ISERROR(bp)) {
+	if (!bp)
+		return XFS_ERROR(ENOMEM);
+	error = bp->b_error;
+	if (error) {
 		xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
 				  bp, buf_f->blf_blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		return error;
 	}
 
-	error = 0;
 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
 	} else if (buf_f->blf_flags &
@@ -2227,14 +2224,17 @@ xlog_recover_inode_pass2(
 
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
 			  XBF_LOCK);
-	if (XFS_BUF_ISERROR(bp)) {
+	if (!bp) {
+		error = ENOMEM;
+		goto error;
+	}
+	error = bp->b_error;
+	if (error) {
 		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
 				  bp, in_f->ilf_blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		goto error;
 	}
-	error = 0;
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
 
@@ -2283,7 +2283,7 @@ xlog_recover_inode_pass2(
 	/* Take the opportunity to reset the flush iteration count */
 	dicp->di_flushiter = 0;
 
-	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
+	if (unlikely(S_ISREG(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
@@ -2296,7 +2296,7 @@ xlog_recover_inode_pass2(
 			error = EFSCORRUPTED;
 			goto error;
 		}
-	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
+	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
 		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
@@ -3437,7 +3437,7 @@ xlog_do_recovery_pass(
 			/*
 			 * Check for header wrapping around physical end-of-log
 			 */
-			offset = XFS_BUF_PTR(hbp);
+			offset = hbp->b_addr;
 			split_hblks = 0;
 			wrapped_hblks = 0;
 			if (blk_no + hblks <= log->l_logBBsize) {
@@ -3497,7 +3497,7 @@ xlog_do_recovery_pass(
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
-				offset = XFS_BUF_PTR(dbp);
+				offset = dbp->b_addr;
 				split_bblks = 0;
 				if (blk_no != log->l_logBBsize) {
 					/* some data is before the physical
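The two pass2 hunks above also tighten error handling around xfs_buf_read(): a NULL return now maps to ENOMEM before b_error is consulted, where the old code would have passed a NULL buffer to XFS_BUF_ISERROR() on allocation failure. The resulting idiom, wrapped in a hypothetical helper for illustration (xlog_read_and_check is not part of the patch):

static int
xlog_read_and_check(struct xfs_mount *mp, xfs_daddr_t blkno, int len,
		    xfs_buf_flags_t flags, struct xfs_buf **bpp)
{
	struct xfs_buf	*bp;
	int		error;

	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, flags);
	if (!bp)
		return XFS_ERROR(ENOMEM);	/* allocation failed, no buffer */
	error = bp->b_error;			/* recorded at I/O completion */
	if (error) {
		xfs_ioerror_alert("xlog_read_and_check", mp, bp, blkno);
		xfs_buf_relse(bp);		/* drop our reference first */
		return error;
	}
	*bpp = bp;
	return 0;
}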
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c
index bd672def95ac..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/xfs_message.c
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb7ea007672..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/xfs_message.h
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7f25245da289..0081657ad985 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1331,7 +1331,7 @@ xfs_mountfs(
 
 	ASSERT(rip != NULL);
 
-	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
+	if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
 		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1615,7 +1615,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 		XFS_BUF_UNDELAYWRITE(sbp);
 		XFS_BUF_WRITE(sbp);
 		XFS_BUF_UNASYNC(sbp);
-		ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
+		ASSERT(sbp->b_target == mp->m_ddev_targp);
 		xfsbdstrat(mp, sbp);
 		error = xfs_buf_iowait(sbp);
 		if (error)
@@ -1938,7 +1938,7 @@ xfs_getsb(
 		xfs_buf_lock(bp);
 	}
 
-	XFS_BUF_HOLD(bp);
+	xfs_buf_hold(bp);
 	ASSERT(XFS_BUF_ISDONE(bp));
 	return bp;
 }
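The xfs_buf_hold() call above takes an extra reference so that xfs_getsb() can hand its caller a locked, referenced superblock buffer; callers pair that with xfs_buf_relse() when done. A hedged sketch of the calling convention (use_sb is hypothetical):

static void use_sb(struct xfs_mount *mp)
{
	struct xfs_buf	*bp = xfs_getsb(mp, 0);	/* returned locked and held */

	/* ... inspect or log the superblock via bp->b_addr ... */
	xfs_buf_relse(bp);	/* unlock and drop the hold taken above */
}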
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c
index 46e54ad9a2dc..9a0aa76facdf 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts(
 		do_div(j, sizeof(xfs_dqblk_t));
 		ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
-	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
+	ddq = bp->b_addr;
 	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h
index 43b9abe1052c..43b9abe1052c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a0a829addca9..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b32644..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
index 5b964fc0dc09..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/xfs_qm_stats.h
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 609246f42e6c..609246f42e6c 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
index 94a3d927d716..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/xfs_quota_priv.h
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 29b9d642e93d..7e76f537abb7 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -25,7 +25,7 @@
 #include "xfs_trans.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "quota/xfs_qm.h"
+#include "xfs_qm.h"
 #include <linux/quota.h>
 
 
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 77a59891734e..df78c297d1a1 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -116,7 +116,7 @@ xfs_rename(
 	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
 
 	new_parent = (src_dp != target_dp);
-	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
+	src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
 
 	if (src_is_directory) {
 		/*
@@ -226,7 +226,7 @@ xfs_rename(
 		 * target and source are directories and that target can be
 		 * destroyed, or that neither is a directory.
 		 */
-		if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+		if (S_ISDIR(target_ip->i_d.di_mode)) {
 			/*
 			 * Make sure target dir is empty.
 			 */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8f76fdff4f46..35561a511b57 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -168,7 +168,7 @@ error_cancel:
 			xfs_trans_cancel(tp, cancelflags);
 			goto error;
 		}
-		memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
+		memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
 		xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
 		/*
 		 * Commit the transaction.
@@ -883,7 +883,7 @@ xfs_rtbuf_get(
 	if (error) {
 		return error;
 	}
-	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	ASSERT(!xfs_buf_geterror(bp));
 	*bpp = bp;
 	return 0;
 }
@@ -943,7 +943,7 @@ xfs_rtcheck_range(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Compute the starting word's address, and starting bit.
 	 */
@@ -994,7 +994,7 @@ xfs_rtcheck_range(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1040,7 +1040,7 @@ xfs_rtcheck_range(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1158,7 +1158,7 @@ xfs_rtfind_back(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Get the first word's index & point to it.
 	 */
@@ -1210,7 +1210,7 @@ xfs_rtfind_back(
 		if (error) {
 			return error;
 		}
-		bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		bufp = bp->b_addr;
 		word = XFS_BLOCKWMASK(mp);
 		b = &bufp[word];
 	} else {
@@ -1256,7 +1256,7 @@ xfs_rtfind_back(
 		if (error) {
 			return error;
 		}
-		bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		bufp = bp->b_addr;
 		word = XFS_BLOCKWMASK(mp);
 		b = &bufp[word];
 	} else {
@@ -1333,7 +1333,7 @@ xfs_rtfind_forw(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Get the first word's index & point to it.
 	 */
@@ -1384,7 +1384,7 @@ xfs_rtfind_forw(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1429,7 +1429,7 @@ xfs_rtfind_forw(
 		if (error) {
 			return error;
 		}
-		b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1649,7 +1649,7 @@ xfs_rtmodify_range(
 	if (error) {
 		return error;
 	}
-	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	bufp = bp->b_addr;
 	/*
 	 * Compute the starting word's address, and starting bit.
 	 */
@@ -1694,7 +1694,7 @@ xfs_rtmodify_range(
 		if (error) {
 			return error;
 		}
-		first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		first = b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1734,7 +1734,7 @@ xfs_rtmodify_range(
 		if (error) {
 			return error;
 		}
-		first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+		first = b = bufp = bp->b_addr;
 		word = 0;
 	} else {
 		/*
@@ -1832,8 +1832,8 @@ xfs_rtmodify_summary(
 	 */
 	sp = XFS_SUMPTR(mp, bp, so);
 	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
-		(uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
 	return 0;
 }
 
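All of the xfs_rtalloc.c hunks above are the same substitution: XFS_BUF_PTR(bp) with an explicit cast becomes a plain read of bp->b_addr. Given the castless assignments in the new code, b_addr is evidently a void pointer, so no cast is needed at the assignment in C; a one-line sketch (rtbuf_words is hypothetical):

static xfs_rtword_t *rtbuf_words(struct xfs_buf *bp)
{
	return bp->b_addr;	/* implicit conversion from void *, no cast */
}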
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 09e1f4f35e97..f7f3a359c1c5 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -47,7 +47,7 @@ struct xfs_trans;
 #define	XFS_SUMOFFSTOBLOCK(mp,s)	\
 	(((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
 #define	XFS_SUMPTR(mp,bp,so)	\
-	((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
+	((xfs_suminfo_t *)((bp)->b_addr + \
 	 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
 
 #define	XFS_BITTOBLOCK(mp,bi)	((bi) >> (mp)->m_blkbit_log)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index d6d6fdfe9422..c96a8a05ac03 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -104,9 +104,9 @@ xfs_ioerror_alert(
 	xfs_alert(mp,
 		 "I/O error occurred: meta-data dev %s block 0x%llx"
 		 " (\"%s\") error %d buf count %zd",
-		XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+		xfs_buf_target_name(bp->b_target),
 		(__uint64_t)blkno, func,
-		XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
+		bp->b_error, XFS_BUF_COUNT(bp));
 }
 
 /*
@@ -137,8 +137,8 @@ xfs_read_buf(
 	bp = xfs_buf_read(target, blkno, len, flags);
 	if (!bp)
 		return XFS_ERROR(EIO);
-	error = XFS_BUF_GETERROR(bp);
-	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
+	error = bp->b_error;
+	if (!error && !XFS_FORCED_SHUTDOWN(mp)) {
 		*bpp = bp;
 	} else {
 		*bpp = NULL;
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1eb2ba586814..cb6ae715814a 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
 
 #define XFS_SB_DADDR		((xfs_daddr_t)0) /* daddr in filesystem/ag */
 #define	XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_SBP(bp)	((xfs_dsb_t *)((bp)->b_addr))
 
 #define	XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
 #define	XFS_DADDR_TO_FSB(mp,d)	XFS_AGB_TO_FSB(mp, \
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c
index 76fdc5861932..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h
index 736854b1ca1a..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c
index 9a72dda58bd0..2366c54cc4fa 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -356,6 +356,8 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
 			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+			xfs_warn(mp,
+	"nodelaylog is deprecated and will be removed in Linux 3.3");
 		} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
@@ -877,33 +879,17 @@ xfs_log_inode(
 	struct xfs_trans	*tp;
 	int			error;
 
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		/* we need to return with the lock hold shared */
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * Note - it's possible that we might have pushed ourselves out of the
-	 * way during trans_reserve which would flush the inode. But there's
-	 * no guarantee that the inode buffer has actually gone out yet (it's
-	 * delwri). Plus the buffer could be pinned anyway if it's part of
-	 * an inode in another recent transaction. So we play it safe and
-	 * fire off the transaction anyway.
-	 */
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
-	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
-
-	return error;
+	return xfs_trans_commit(tp, 0);
 }
 
 STATIC int
@@ -918,7 +904,9 @@ xfs_fs_write_inode(
 	trace_xfs_write_inode(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -XFS_ERROR(EIO);
+	if (!ip->i_update_core)
+		return 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		/*
@@ -929,12 +917,10 @@ xfs_fs_write_inode(
 		 * of synchronous log forces dramatically.
 		 */
 		xfs_ioend_wait(ip);
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		if (ip->i_update_core) {
-			error = xfs_log_inode(ip);
-			if (error)
-				goto out_unlock;
-		}
+		error = xfs_log_inode(ip);
+		if (error)
+			goto out;
+		return 0;
 	} else {
 		/*
 		 * We make this non-blocking if the inode is contended, return
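The rewritten xfs_log_inode() above reduces to the canonical XFS transaction sequence: allocate, reserve log space (cancelling on failure), lock and join the inode, log the core, commit. The old shared-lock juggling disappears because, as the xfs_fs_write_inode() hunks above show, the caller no longer takes the inode lock before the call, and joining with xfs_trans_ijoin_ref() lets the commit release XFS_ILOCK_EXCL automatically. A labelled sketch of that lifecycle (xfs_log_inode_sketch is a restatement for illustration, not new code):

static int xfs_log_inode_sketch(struct xfs_mount *mp, struct xfs_inode *ip)
{
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);		/* 1: allocate */
	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);			/* 2: back out */
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);				/* 3: lock */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);		/* 4: join; commit unlocks */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);		/* 5: log the inode core */
	return xfs_trans_commit(tp, 0);				/* 6: commit */
}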
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h
index 50a3266c999e..50a3266c999e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/xfs_super.h
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c
index e4c938afb910..4604f90f86a3 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -332,7 +332,7 @@ xfs_sync_fsdata(
 	 * between there and here.
 	 */
 	bp = xfs_getsb(mp, 0);
-	if (XFS_BUF_ISPINNED(bp))
+	if (xfs_buf_ispinned(bp))
 		xfs_log_force(mp, 0);
 
 	return xfs_bwrite(mp, bp);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..941202e7ac6e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c
index 88d25d4aa56e..9010ce885e6a 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -43,8 +43,8 @@
 #include "xfs_quota.h"
 #include "xfs_iomap.h"
 #include "xfs_aops.h"
-#include "quota/xfs_dquot_item.h"
-#include "quota/xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 #include "xfs_log_recover.h"
 #include "xfs_inode_item.h"
 
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h
index 690fc7a7bd72..690fc7a7bd72 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 43233e92f0f6..c15aa29fa169 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -299,7 +299,7 @@ xfs_trans_ail_cursor_last(
  * Splice the log item list into the AIL at the given LSN. We splice to the
  * tail of the given LSN to maintain insert order for push traversals. The
  * cursor is optional, allowing repeated updates to the same LSN to avoid
- * repeated traversals.
+ * repeated traversals.  This should not be called with an empty list.
  */
 static void
 xfs_ail_splice(
@@ -308,50 +308,39 @@ xfs_ail_splice(
 	struct list_head	*list,
 	xfs_lsn_t		lsn)
 {
-	struct xfs_log_item	*lip = cur ? cur->item : NULL;
-	struct xfs_log_item	*next_lip;
+	struct xfs_log_item	*lip;
+
+	ASSERT(!list_empty(list));
 
 	/*
-	 * Get a new cursor if we don't have a placeholder or the existing one
-	 * has been invalidated.
+	 * Use the cursor to determine the insertion point if one is
+	 * provided.  If not, or if the one we got is not valid,
+	 * find the place in the AIL where the items belong.
 	 */
-	if (!lip || (__psint_t)lip & 1) {
+	lip = cur ? cur->item : NULL;
+	if (!lip || (__psint_t) lip & 1)
 		lip = __xfs_trans_ail_cursor_last(ailp, lsn);
 
-		if (!lip) {
-			/* The list is empty, so just splice and return. */
-			if (cur)
-				cur->item = NULL;
-			list_splice(list, &ailp->xa_ail);
-			return;
-		}
-	}
+	/*
+	 * If a cursor is provided, we know we're processing the AIL
+	 * in lsn order, and future items to be spliced in will
+	 * follow the last one being inserted now.  Update the
+	 * cursor to point to that last item, now while we have a
+	 * reliable pointer to it.
+	 */
+	if (cur)
+		cur->item = list_entry(list->prev, struct xfs_log_item, li_ail);
 
 	/*
-	 * Our cursor points to the item we want to insert _after_, so we have
-	 * to update the cursor to point to the end of the list we are splicing
-	 * in so that it points to the correct location for the next splice.
-	 * i.e. before the splice
-	 *
-	 *  lsn -> lsn -> lsn + x -> lsn + x ...
-	 *          ^
-	 *          | cursor points here
-	 *
-	 * After the splice we have:
-	 *
-	 *  lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ...
-	 *          ^                            ^
-	 *          | cursor points here         | needs to move here
-	 *
-	 * So we set the cursor to the last item in the list to be spliced
-	 * before we execute the splice, resulting in the cursor pointing to
-	 * the correct item after the splice occurs.
+	 * Finally perform the splice.  Unless the AIL was empty,
+	 * lip points to the item in the AIL _after_ which the new
+	 * items should go.  If lip is null the AIL was empty, so
+	 * the new items go at the head of the AIL.
 	 */
-	if (cur) {
-		next_lip = list_entry(list->prev, struct xfs_log_item, li_ail);
-		cur->item = next_lip;
-	}
-	list_splice(list, &lip->li_ail);
+	if (lip)
+		list_splice(list, &lip->li_ail);
+	else
+		list_splice(list, &ailp->xa_ail);
 }
 
 /*
@@ -682,6 +671,7 @@ xfs_trans_ail_update_bulk(
 	int			i;
 	LIST_HEAD(tmp);
 
+	ASSERT(nr_items > 0);		/* Not required, but true. */
 	mlip = xfs_ail_min(ailp);
 
 	for (i = 0; i < nr_items; i++) {
@@ -701,7 +691,8 @@ xfs_trans_ail_update_bulk(
 		list_add(&lip->li_ail, &tmp);
 	}
 
-	xfs_ail_splice(ailp, cur, &tmp, lsn);
+	if (!list_empty(&tmp))
+		xfs_ail_splice(ailp, cur, &tmp, lsn);
 
 	if (!mlip_changed) {
 		spin_unlock(&ailp->xa_lock);
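The xfs_ail_splice() rewrite above centralizes the cursor update and drops the early-return special case: the caller now guarantees a non-empty item list (hence the ASSERT, with xfs_trans_ail_update_bulk() skipping the call when tmp is empty). The key primitive is list_splice(list, head), which inserts every entry of list immediately after head; a minimal self-contained illustration with plain integers, mirroring the if (lip) ... else ... tail of the new function (struct num and splice_after are hypothetical):

#include <linux/list.h>

struct num { int val; struct list_head entry; };

/* Splice all of 'pending' into 'ail' right after node 'after', or at
 * the head of 'ail' when no insertion point exists (empty AIL). */
static void splice_after(struct list_head *pending, struct list_head *ail,
			 struct list_head *after)
{
	if (after)
		list_splice(pending, after);	/* insert after 'after' */
	else
		list_splice(pending, ail);	/* empty AIL: insert at head */
}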
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 15584fc3ed7d..137e2b9e2948 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -54,7 +54,7 @@ xfs_trans_buf_item_match(
 	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		blip = (struct xfs_buf_log_item *)lidp->lid_item;
 		if (blip->bli_item.li_type == XFS_LI_BUF &&
-		    XFS_BUF_TARGET(blip->bli_buf) == target &&
+		    blip->bli_buf->b_target == target &&
 		    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
 		    XFS_BUF_COUNT(blip->bli_buf) == len)
 			return blip->bli_buf;
@@ -80,7 +80,6 @@ _xfs_trans_bjoin(
 {
 	struct xfs_buf_log_item	*bip;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == NULL);
 
 	/*
@@ -194,7 +193,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 		return NULL;
 	}
 
-	ASSERT(!XFS_BUF_GETERROR(bp));
+	ASSERT(!bp->b_error);
 
 	_xfs_trans_bjoin(tp, bp, 1);
 	trace_xfs_trans_get_buf(bp->b_fspriv);
@@ -293,10 +292,10 @@ xfs_trans_read_buf(
 		return (flags & XBF_TRYLOCK) ?
 			EAGAIN : XFS_ERROR(ENOMEM);
 
-	if (XFS_BUF_GETERROR(bp) != 0) {
+	if (bp->b_error) {
+		error = bp->b_error;
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
-		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -330,7 +329,7 @@ xfs_trans_read_buf(
 		ASSERT(xfs_buf_islocked(bp));
 		ASSERT(bp->b_transp == tp);
 		ASSERT(bp->b_fspriv != NULL);
-		ASSERT((XFS_BUF_ISERROR(bp)) == 0);
+		ASSERT(!bp->b_error);
 		if (!(XFS_BUF_ISDONE(bp))) {
 			trace_xfs_trans_read_buf_io(bp, _RET_IP_);
 			ASSERT(!XFS_BUF_ISASYNC(bp));
@@ -386,10 +385,9 @@ xfs_trans_read_buf(
 		return (flags & XBF_TRYLOCK) ?
 			0 : XFS_ERROR(ENOMEM);
 	}
-	if (XFS_BUF_GETERROR(bp) != 0) {
-		XFS_BUF_SUPER_STALE(bp);
-		error = XFS_BUF_GETERROR(bp);
-
+	if (bp->b_error) {
+		error = bp->b_error;
+		XFS_BUF_SUPER_STALE(bp);
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		if (tp->t_flags & XFS_TRANS_DIRTY)
@@ -430,7 +428,7 @@ shutdown_abort:
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
 		xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
 #endif
-	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+	ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
 				     (XBF_STALE|XBF_DELWRI));
 
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
@@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
@@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
@@ -702,7 +697,6 @@ xfs_trans_binval(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -774,7 +768,6 @@ xfs_trans_inode_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -851,7 +842,6 @@ xfs_trans_dquot_buf(
 {
 	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
-	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(bp->b_transp == tp);
 	ASSERT(bip != NULL);
 	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792d..4d00ee67792d 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 7c220b4227bc..7c220b4227bc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 88d121486c52..51fc429527bc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -83,7 +83,9 @@ xfs_readlink_bmap(
 
 		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
 				  XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
-		error = XFS_BUF_GETERROR(bp);
+		if (!bp)
+			return XFS_ERROR(ENOMEM);
+		error = bp->b_error;
 		if (error) {
 			xfs_ioerror_alert("xfs_readlink",
 					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
@@ -94,7 +96,7 @@ xfs_readlink_bmap(
 			byte_cnt = pathlen;
 		pathlen -= byte_cnt;
 
-		memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
+		memcpy(link, bp->b_addr, byte_cnt);
 		xfs_buf_relse(bp);
 	}
 
@@ -121,7 +123,7 @@ xfs_readlink(
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
+	ASSERT(S_ISLNK(ip->i_d.di_mode));
 	ASSERT(ip->i_d.di_size <= MAXPATHLEN);
 
 	pathlen = ip->i_d.di_size;
@@ -529,7 +531,7 @@ xfs_release(
 	if (ip->i_d.di_nlink == 0)
 		return 0;
 
-	if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+	if ((S_ISREG(ip->i_d.di_mode) &&
 	     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 	       ip->i_delayed_blks > 0)) &&
 	     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
@@ -610,7 +612,7 @@ xfs_inactive(
 	truncate = ((ip->i_d.di_nlink == 0) &&
 	    ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
 	     (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
-	    ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
+	    S_ISREG(ip->i_d.di_mode));
 
 	mp = ip->i_mount;
 
@@ -621,7 +623,7 @@ xfs_inactive(
 		goto out;
 
 	if (ip->i_d.di_nlink != 0) {
-		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+		if ((S_ISREG(ip->i_d.di_mode) &&
 		    ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 		      ip->i_delayed_blks > 0)) &&
 		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
@@ -669,7 +671,7 @@ xfs_inactive(
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 			return VN_INACTIVE_CACHE;
 		}
-	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
+	} else if (S_ISLNK(ip->i_d.di_mode)) {
 
 		/*
 		 * If we get an error while cleaning up a
@@ -1648,13 +1650,13 @@ xfs_symlink(
 			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
-			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			ASSERT(!xfs_buf_geterror(bp));
 			if (pathlen < byte_cnt) {
 				byte_cnt = pathlen;
 			}
 			pathlen -= byte_cnt;
 
-			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
+			memcpy(bp->b_addr, cur_chunk, byte_cnt);
 			cur_chunk += byte_cnt;
 
 			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
@@ -1999,7 +2001,7 @@ xfs_zero_remaining_bytes(
 				mp, bp, XFS_BUF_ADDR(bp));
 			break;
 		}
-		memset(XFS_BUF_PTR(bp) +
+		memset(bp->b_addr +
 		      (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 		      0, lastoffset - offset + 1);
 		XFS_BUF_UNDONE(bp);
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c