Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 33
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_dir.c | 14
-rw-r--r--  fs/9p/vfs_file.c | 36
-rw-r--r--  fs/9p/vfs_inode.c | 141
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 86
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 26
-rw-r--r--  fs/autofs4/waitq.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 23
-rw-r--r--  fs/block_dev.c | 7
-rw-r--r--  fs/btrfs/btrfs_inode.h | 6
-rw-r--r--  fs/btrfs/ctree.h | 10
-rw-r--r--  fs/btrfs/extent-tree.c | 77
-rw-r--r--  fs/btrfs/file-item.c | 4
-rw-r--r--  fs/btrfs/file.c | 75
-rw-r--r--  fs/btrfs/free-space-cache.c | 20
-rw-r--r--  fs/btrfs/inode.c | 52
-rw-r--r--  fs/btrfs/ioctl.c | 47
-rw-r--r--  fs/btrfs/transaction.c | 4
-rw-r--r--  fs/btrfs/tree-log.c | 28
-rw-r--r--  fs/btrfs/volumes.c | 51
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/btrfs/xattr.c | 59
-rw-r--r--  fs/buffer.c | 4
-rw-r--r--  fs/ceph/mds_client.c | 2
-rw-r--r--  fs/ceph/super.c | 4
-rw-r--r--  fs/cifs/README | 14
-rw-r--r--  fs/cifs/cifs_debug.c | 11
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 4
-rw-r--r--  fs/cifs/cifsacl.c | 373
-rw-r--r--  fs/cifs/cifsencrypt.c | 159
-rw-r--r--  fs/cifs/cifsfs.c | 36
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 111
-rw-r--r--  fs/cifs/cifspdu.h | 48
-rw-r--r--  fs/cifs/cifsproto.h | 48
-rw-r--r--  fs/cifs/cifssmb.c | 457
-rw-r--r--  fs/cifs/connect.c | 706
-rw-r--r--  fs/cifs/dir.c | 26
-rw-r--r--  fs/cifs/export.c | 4
-rw-r--r--  fs/cifs/file.c | 1126
-rw-r--r--  fs/cifs/inode.c | 54
-rw-r--r--  fs/cifs/link.c | 17
-rw-r--r--  fs/cifs/misc.c | 66
-rw-r--r--  fs/cifs/sess.c | 4
-rw-r--r--  fs/cifs/smbencrypt.c | 121
-rw-r--r--  fs/cifs/transport.c | 70
-rw-r--r--  fs/cifs/xattr.c | 42
-rw-r--r--  fs/coda/coda_linux.h | 5
-rw-r--r--  fs/compat.c | 13
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/configfs/inode.c | 3
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/debugfs/inode.c | 2
-rw-r--r--  fs/direct-io.c | 646
-rw-r--r--  fs/ecryptfs/Kconfig | 2
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/main.c | 23
-rw-r--r--  fs/ecryptfs/read_write.c | 18
-rw-r--r--  fs/eventpoll.c | 2
-rw-r--r--  fs/exec.c | 17
-rw-r--r--  fs/exofs/Kbuild | 3
-rw-r--r--  fs/exofs/Kconfig | 9
-rw-r--r--  fs/exofs/exofs.h | 26
-rw-r--r--  fs/exofs/inode.c | 233
-rw-r--r--  fs/exofs/ore.c | 656
-rw-r--r--  fs/exofs/ore_raid.c | 660
-rw-r--r--  fs/exofs/ore_raid.h | 79
-rw-r--r--  fs/exofs/super.c | 205
-rw-r--r--  fs/ext2/xattr_security.c | 34
-rw-r--r--  fs/ext3/inode.c | 4
-rw-r--r--  fs/ext3/namei.c | 9
-rw-r--r--  fs/ext3/xattr_security.c | 36
-rw-r--r--  fs/ext4/ext4.h | 1
-rw-r--r--  fs/ext4/ext4_jbd2.h | 4
-rw-r--r--  fs/ext4/file.c | 47
-rw-r--r--  fs/ext4/indirect.c | 9
-rw-r--r--  fs/ext4/inode.c | 27
-rw-r--r--  fs/ext4/namei.c | 9
-rw-r--r--  fs/ext4/page-io.c | 24
-rw-r--r--  fs/ext4/super.c | 1
-rw-r--r--  fs/ext4/xattr_security.c | 36
-rw-r--r--  fs/fat/dir.c | 2
-rw-r--r--  fs/fat/inode.c | 7
-rw-r--r--  fs/fuse/dev.c | 16
-rw-r--r--  fs/fuse/file.c | 84
-rw-r--r--  fs/fuse/fuse_i.h | 8
-rw-r--r--  fs/fuse/inode.c | 13
-rw-r--r--  fs/gfs2/acl.c | 5
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 199
-rw-r--r--  fs/gfs2/dir.c | 50
-rw-r--r--  fs/gfs2/file.c | 299
-rw-r--r--  fs/gfs2/glops.c | 89
-rw-r--r--  fs/gfs2/glops.h | 2
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 150
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/log.c | 4
-rw-r--r--  fs/gfs2/lops.c | 66
-rw-r--r--  fs/gfs2/meta_io.c | 6
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 30
-rw-r--r--  fs/gfs2/rgrp.c | 573
-rw-r--r--  fs/gfs2/rgrp.h | 31
-rw-r--r--  fs/gfs2/super.c | 134
-rw-r--r--  fs/gfs2/trans.c | 5
-rw-r--r--  fs/gfs2/trans.h | 22
-rw-r--r--  fs/gfs2/xattr.c | 28
-rw-r--r--  fs/hfsplus/super.c | 15
-rw-r--r--  fs/hfsplus/wrapper.c | 4
-rw-r--r--  fs/hugetlbfs/inode.c | 1
-rw-r--r--  fs/inode.c | 26
-rw-r--r--  fs/jffs2/security.c | 35
-rw-r--r--  fs/jfs/jfs_umount.c | 4
-rw-r--r--  fs/jfs/xattr.c | 57
-rw-r--r--  fs/lockd/host.c | 25
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/locks.c | 225
-rw-r--r--  fs/namei.c | 58
-rw-r--r--  fs/namespace.c | 3
-rw-r--r--  fs/nfs/Kconfig | 16
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 59
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 4
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 35
-rw-r--r--  fs/nfs/callback.c | 4
-rw-r--r--  fs/nfs/callback.h | 2
-rw-r--r--  fs/nfs/callback_proc.c | 25
-rw-r--r--  fs/nfs/callback_xdr.c | 24
-rw-r--r--  fs/nfs/client.c | 11
-rw-r--r--  fs/nfs/delegation.c | 2
-rw-r--r--  fs/nfs/file.c | 10
-rw-r--r--  fs/nfs/fscache-index.c | 4
-rw-r--r--  fs/nfs/idmap.c | 25
-rw-r--r--  fs/nfs/inode.c | 16
-rw-r--r--  fs/nfs/internal.h | 10
-rw-r--r--  fs/nfs/nfs4_fs.h | 32
-rw-r--r--  fs/nfs/nfs4filelayout.c | 33
-rw-r--r--  fs/nfs/nfs4proc.c | 113
-rw-r--r--  fs/nfs/nfs4renewd.c | 12
-rw-r--r--  fs/nfs/nfs4state.c | 6
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 28
-rw-r--r--  fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 52
-rw-r--r--  fs/nfs/pnfs.h | 5
-rw-r--r--  fs/nfs/read.c | 40
-rw-r--r--  fs/nfs/super.c | 42
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 75
-rw-r--r--  fs/nfsd/export.c | 16
-rw-r--r--  fs/nfsd/nfs4callback.c | 20
-rw-r--r--  fs/nfsd/nfs4proc.c | 374
-rw-r--r--  fs/nfsd/nfs4recover.c | 53
-rw-r--r--  fs/nfsd/nfs4state.c | 1794
-rw-r--r--  fs/nfsd/nfs4xdr.c | 380
-rw-r--r--  fs/nfsd/nfsctl.c | 1
-rw-r--r--  fs/nfsd/nfsd.h | 33
-rw-r--r--  fs/nfsd/nfsfh.c | 39
-rw-r--r--  fs/nfsd/state.h | 174
-rw-r--r--  fs/nfsd/vfs.c | 31
-rw-r--r--  fs/nfsd/vfs.h | 29
-rw-r--r--  fs/nfsd/xdr4.h | 28
-rw-r--r--  fs/ocfs2/xattr.c | 38
-rw-r--r--  fs/open.c | 4
-rw-r--r--  fs/posix_acl.c | 2
-rw-r--r--  fs/proc/stat.c | 41
-rw-r--r--  fs/proc/task_mmu.c | 80
-rw-r--r--  fs/quota/quota.c | 2
-rw-r--r--  fs/read_write.c | 74
-rw-r--r--  fs/reiserfs/journal.c | 9
-rw-r--r--  fs/reiserfs/resize.c | 4
-rw-r--r--  fs/reiserfs/xattr_security.c | 4
-rw-r--r--  fs/squashfs/Kconfig | 6
-rw-r--r--  fs/stat.c | 2
-rw-r--r--  fs/sysfs/dir.c | 182
-rw-r--r--  fs/sysfs/file.c | 56
-rw-r--r--  fs/sysfs/inode.c | 16
-rw-r--r--  fs/sysfs/sysfs.h | 17
-rw-r--r--  fs/ubifs/debug.h | 6
-rw-r--r--  fs/xattr.c | 63
-rw-r--r--  fs/xfs/Makefile | 119
-rw-r--r--  fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c) | 0
-rw-r--r--  fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h) | 7
-rw-r--r--  fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h) | 0
-rw-r--r--  fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h) | 0
-rw-r--r--  fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c) | 0
-rw-r--r--  fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h) | 0
-rw-r--r--  fs/xfs/xfs.h | 3
-rw-r--r--  fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c) | 0
-rw-r--r--  fs/xfs/xfs_ag.h | 6
-rw-r--r--  fs/xfs/xfs_alloc.c | 11
-rw-r--r--  fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c) | 118
-rw-r--r--  fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h) | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 90
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 7
-rw-r--r--  fs/xfs/xfs_bmap.c | 2534
-rw-r--r--  fs/xfs/xfs_bmap.h | 318
-rw-r--r--  fs/xfs/xfs_btree.c | 26
-rw-r--r--  fs/xfs/xfs_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf.c (renamed from fs/xfs/linux-2.6/xfs_buf.c) | 257
-rw-r--r--  fs/xfs/xfs_buf.h (renamed from fs/xfs/linux-2.6/xfs_buf.h) | 69
-rw-r--r--  fs/xfs/xfs_buf_item.c | 37
-rw-r--r--  fs/xfs/xfs_da_btree.c | 66
-rw-r--r--  fs/xfs/xfs_dfrag.c | 6
-rw-r--r--  fs/xfs/xfs_dinode.h | 2
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 6
-rw-r--r--  fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c) | 20
-rw-r--r--  fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot.c (renamed from fs/xfs/quota/xfs_dquot.c) | 46
-rw-r--r--  fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h) | 0
-rw-r--r--  fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c) | 10
-rw-r--r--  fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h) | 0
-rw-r--r--  fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c) | 12
-rw-r--r--  fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h) | 0
-rw-r--r--  fs/xfs/xfs_file.c (renamed from fs/xfs/linux-2.6/xfs_file.c) | 168
-rw-r--r--  fs/xfs/xfs_filestream.c | 4
-rw-r--r--  fs/xfs/xfs_fs_subr.c (renamed from fs/xfs/linux-2.6/xfs_fs_subr.c) | 0
-rw-r--r--  fs/xfs/xfs_fsops.c | 60
-rw-r--r--  fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c) | 0
-rw-r--r--  fs/xfs/xfs_ialloc.c | 18
-rw-r--r--  fs/xfs/xfs_iget.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 47
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_inode_item.c | 14
-rw-r--r--  fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c) | 2
-rw-r--r--  fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c) | 0
-rw-r--r--  fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h) | 0
-rw-r--r--  fs/xfs/xfs_iomap.c | 39
-rw-r--r--  fs/xfs/xfs_iops.c (renamed from fs/xfs/linux-2.6/xfs_iops.c) | 67
-rw-r--r--  fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h) | 0
-rw-r--r--  fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h) | 29
-rw-r--r--  fs/xfs/xfs_log.c | 34
-rw-r--r--  fs/xfs/xfs_log_recover.c | 81
-rw-r--r--  fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c) | 0
-rw-r--r--  fs/xfs/xfs_message.h (renamed from fs/xfs/linux-2.6/xfs_message.h) | 0
-rw-r--r--  fs/xfs/xfs_mount.c | 40
-rw-r--r--  fs/xfs/xfs_qm.c (renamed from fs/xfs/quota/xfs_qm.c) | 14
-rw-r--r--  fs/xfs/xfs_qm.h (renamed from fs/xfs/quota/xfs_qm.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.c (renamed from fs/xfs/quota/xfs_qm_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_qm_stats.h (renamed from fs/xfs/quota/xfs_qm_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c) | 2
-rw-r--r--  fs/xfs/xfs_quota_priv.h (renamed from fs/xfs/quota/xfs_quota_priv.h) | 0
-rw-r--r--  fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c) | 2
-rw-r--r--  fs/xfs/xfs_rename.c | 8
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 80
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 2
-rw-r--r--  fs/xfs/xfs_rw.c | 27
-rw-r--r--  fs/xfs/xfs_rw.h | 2
-rw-r--r--  fs/xfs/xfs_sb.h | 2
-rw-r--r--  fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c) | 0
-rw-r--r--  fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h) | 0
-rw-r--r--  fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c) | 60
-rw-r--r--  fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h) | 0
-rw-r--r--  fs/xfs/xfs_sync.c (renamed from fs/xfs/linux-2.6/xfs_sync.c) | 18
-rw-r--r--  fs/xfs/xfs_sync.h (renamed from fs/xfs/linux-2.6/xfs_sync.h) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c) | 0
-rw-r--r--  fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h) | 0
-rw-r--r--  fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c) | 4
-rw-r--r--  fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h) | 39
-rw-r--r--  fs/xfs/xfs_trans.c | 13
-rw-r--r--  fs/xfs/xfs_trans.h | 10
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 191
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 50
-rw-r--r--  fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c) | 0
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 25
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 9
-rw-r--r--  fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h) | 0
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 119
-rw-r--r--  fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c) | 0
274 files changed, 10699 insertions(+), 8128 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index ef9661886112..2b78014a124a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -132,21 +132,19 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	options = tmp_options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+		int token, r;
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		if (token < Opt_uname) {
-			int r = match_int(&args[0], &option);
+		switch (token) {
+		case Opt_debug:
+			r = match_int(&args[0], &option);
 			if (r < 0) {
 				P9_DPRINTK(P9_DEBUG_ERROR,
 					   "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
-		}
-		switch (token) {
-		case Opt_debug:
 			v9ses->debug = option;
 #ifdef CONFIG_NET_9P_DEBUG
 			p9_debug_level = option;
@@ -154,12 +152,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			break;
 
 		case Opt_dfltuid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltuid = option;
 			break;
 		case Opt_dfltgid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->dfltgid = option;
 			break;
 		case Opt_afid:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					   "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
 			v9ses->afid = option;
 			break;
 		case Opt_uname:
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 46ce357ca1ab..410ffd6ceb5f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode);
+		    struct inode *inode, int mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
 	return;
 }
+
+int v9fs_open_to_dotl_flags(int flags);
 #endif
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 9c2bdda5cd9d..598fff1a54e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -165,9 +165,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		while (rdir->head < rdir->tail) {
 			p9stat_init(&st);
-			err = p9stat_read(rdir->buf + rdir->head,
-					  rdir->tail - rdir->head, &st,
-					  fid->clnt->proto_version);
+			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
+					  rdir->tail - rdir->head, &st);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
@@ -231,7 +230,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+					filp->f_pos);
 			if (err <= 0)
 				goto unlock_and_exit;
 
@@ -241,10 +240,9 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 
 		while (rdir->head < rdir->tail) {
 
-			err = p9dirent_read(rdir->buf + rdir->head,
-					    rdir->tail - rdir->head,
-					    &curdirent,
-					    fid->clnt->proto_version);
+			err = p9dirent_read(fid->clnt, rdir->buf + rdir->head,
+					    rdir->tail - rdir->head,
+					    &curdirent);
 			if (err < 0) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3c173fcc2c5a..62857a810a79 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
-		omode = file->f_flags;
+		omode = v9fs_open_to_dotl_flags(file->f_flags);
 	else
 		omode = v9fs_uflags2omode(file->f_flags,
 					v9fs_proto_dotu(v9ses));
@@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
-	flock.type = fl->fl_type;
+	/* map the lock type */
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		flock.type = P9_LOCK_TYPE_RDLCK;
+		break;
+	case F_WRLCK:
+		flock.type = P9_LOCK_TYPE_WRLCK;
+		break;
+	case F_UNLCK:
+		flock.type = P9_LOCK_TYPE_UNLCK;
+		break;
+	}
 	flock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		flock.length = 0;
@@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
 	/* convert posix lock to p9 tgetlock args */
 	memset(&glock, 0, sizeof(glock));
-	glock.type = fl->fl_type;
+	glock.type = P9_LOCK_TYPE_UNLCK;
 	glock.start = fl->fl_start;
 	if (fl->fl_end == OFFSET_MAX)
 		glock.length = 0;
@@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	res = p9_client_getlock_dotl(fid, &glock);
 	if (res < 0)
 		return res;
-	if (glock.type != F_UNLCK) {
-		fl->fl_type = glock.type;
+	/* map 9p lock type to os lock type */
+	switch (glock.type) {
+	case P9_LOCK_TYPE_RDLCK:
+		fl->fl_type = F_RDLCK;
+		break;
+	case P9_LOCK_TYPE_WRLCK:
+		fl->fl_type = F_WRLCK;
+		break;
+	case P9_LOCK_TYPE_UNLCK:
+		fl->fl_type = F_UNLCK;
+		break;
+	}
+	if (glock.type != P9_LOCK_TYPE_UNLCK) {
 		fl->fl_start = glock.start;
 		if (glock.length == 0)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
 		fl->fl_pid = glock.proc_id;
-	} else
-		fl->fl_type = F_UNLCK;
-
+	}
 	return res;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8bb5507e822f..b5a1076aaa6c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode)
 /**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
- * @mode: mode to convert
+ * @stat: p9_wstat from which mode need to be derived
+ * @rdev: major number, minor number in case of device files.
  *
  */
-
-static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
+static int p9mode2unixmode(struct v9fs_session_info *v9ses,
+			   struct p9_wstat *stat, dev_t *rdev)
 {
 	int res;
+	int mode = stat->mode;
 
-	res = mode & 0777;
+	res = mode & S_IALLUGO;
+	*rdev = 0;
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		 && (v9ses->nodev == 0))
 		res |= S_IFIFO;
 	else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses))
-		 && (v9ses->nodev == 0))
-		res |= S_IFBLK;
-	else
+		 && (v9ses->nodev == 0)) {
+		char type = 0, ext[32];
+		int major = -1, minor = -1;
+
+		strncpy(ext, stat->extension, sizeof(ext));
+		sscanf(ext, "%c %u %u", &type, &major, &minor);
+		switch (type) {
+		case 'c':
+			res |= S_IFCHR;
+			break;
+		case 'b':
+			res |= S_IFBLK;
+			break;
+		default:
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "Unknown special type %c %s\n", type,
+				   stat->extension);
+		};
+		*rdev = MKDEV(major, minor);
+	} else
 		res |= S_IFREG;
 
 	if (v9fs_proto_dotu(v9ses)) {
@@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
 		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
 			res |= S_ISVTX;
 	}
-
 	return res;
 }
 
@@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, int mode)
+		    struct inode *inode, int mode, dev_t rdev)
 {
 	int err = 0;
 
 	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
-	inode->i_rdev = 0;
+	inode->i_rdev = rdev;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->a_ops = &v9fs_addr_operations;
 
@@ -259,10 +278,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFSOCK:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			inode->i_fop = &v9fs_file_operations_dotl;
 		} else if (v9fs_proto_dotu(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
 		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
@@ -335,7 +352,7 @@ error:
  *
  */
 
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev)
 {
 	int err;
 	struct inode *inode;
@@ -348,7 +365,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	err = v9fs_init_inode(v9ses, inode, mode);
+	err = v9fs_init_inode(v9ses, inode, mode, rdev);
 	if (err) {
 		iput(inode);
 		return ERR_PTR(err);
@@ -435,11 +452,12 @@ void v9fs_evict_inode(struct inode *inode)
 static int v9fs_test_inode(struct inode *inode, void *data)
 {
 	int umode;
+	dev_t rdev;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_wstat *st = (struct p9_wstat *)data;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
 
-	umode = p9mode2unixmode(v9ses, st->mode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
 	/* don't match inode of different type */
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		return 0;
@@ -473,6 +491,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 				   struct p9_wstat *st,
 				   int new)
 {
+	dev_t rdev;
 	int retval, umode;
 	unsigned long i_ino;
 	struct inode *inode;
@@ -496,8 +515,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	umode = p9mode2unixmode(v9ses, st->mode);
-	retval = v9fs_init_inode(v9ses, inode, umode);
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
 	if (retval)
 		goto error;
 
@@ -532,6 +551,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 }
 
 /**
+ * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
+ * plan 9 AT flag.
+ * @flags: flags to convert
+ */
+static int v9fs_at_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+	if (flags & AT_REMOVEDIR)
+		rflags |= P9_DOTL_AT_REMOVEDIR;
+	return rflags;
+}
+
+/**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
  * @dentry:  dentry that is being deleted
@@ -558,7 +590,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
-		retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags);
+		retval = p9_client_unlinkat(dfid, dentry->d_name.name,
+					    v9fs_at_to_dotl_flags(flags));
 	if (retval == -EOPNOTSUPP) {
 		/* Try the one based on path */
 		v9fid = v9fs_fid_clone(dentry);
@@ -645,13 +678,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
-
+	d_instantiate(dentry, inode);
 	return ofid;
-
 error:
 	if (ofid)
 		p9_client_clunk(ofid);
@@ -792,6 +823,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 				      struct nameidata *nameidata)
 {
+	struct dentry *res;
 	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
@@ -823,22 +855,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 		return ERR_PTR(result);
 	}
-
-	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	/*
+	 * Make sure we don't use a wrong inode due to parallel
+	 * unlink. For cached mode create calls request for new
+	 * inode. But with cache disabled, lookup should do this.
+	 */
+	if (v9ses->cache)
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
+	else
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		result = PTR_ERR(inode);
 		inode = NULL;
 		goto error;
 	}
-
 	result = v9fs_fid_add(dentry, fid);
 	if (result < 0)
 		goto error_iput;
-
 inst_out:
-	d_add(dentry, inode);
-	return NULL;
-
+	/*
+	 * If we had a rename on the server and a parallel lookup
+	 * for the new name, then make sure we instantiate with
+	 * the new name. ie look up for a/b, while on server somebody
+	 * moved b under k and client parallely did a lookup for
+	 * k/b.
+	 */
+	res = d_materialise_unique(dentry, inode);
+	if (!IS_ERR(res))
+		return res;
+	result = PTR_ERR(res);
 error_iput:
 	iput(inode);
 error:
@@ -1002,7 +1047,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		return PTR_ERR(st);
 
 	v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
-		generic_fillattr(dentry->d_inode, stat);
+	generic_fillattr(dentry->d_inode, stat);
 
 	p9stat_free(st);
 	kfree(st);
@@ -1086,6 +1131,7 @@ void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
+	mode_t mode;
 	char ext[32];
 	char tag_name[14];
 	unsigned int i_nlink;
@@ -1121,31 +1167,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			inode->i_nlink = i_nlink;
 		}
 	}
-	inode->i_mode = p9mode2unixmode(v9ses, stat->mode);
-	if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) {
-		char type = 0;
-		int major = -1;
-		int minor = -1;
-
-		strncpy(ext, stat->extension, sizeof(ext));
-		sscanf(ext, "%c %u %u", &type, &major, &minor);
-		switch (type) {
-		case 'c':
-			inode->i_mode &= ~S_IFBLK;
-			inode->i_mode |= S_IFCHR;
-			break;
-		case 'b':
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
-		};
-		inode->i_rdev = MKDEV(major, minor);
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-	} else
-		inode->i_rdev = 0;
-
+	mode = stat->mode & S_IALLUGO;
+	mode |= inode->i_mode & ~S_IALLUGO;
+	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
@@ -1411,6 +1435,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 
 int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 {
+	int umode;
+	dev_t rdev;
 	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
@@ -1419,6 +1445,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	umode = p9mode2unixmode(v9ses, st, &rdev);
+	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -1430,6 +1462,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	p9stat_free(st);
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index b6c8ed205192..aded79fcd5cf 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 	 * later.
 	 */
 	inode->i_ino = i_ino;
-	retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+	retval = v9fs_init_inode(v9ses, inode,
+				 st->st_mode, new_decode_dev(st->st_rdev));
 	if (retval)
 		goto error;
 
@@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	return inode;
 }
 
+struct dotl_openflag_map {
+	int open_flag;
+	int dotl_flag;
+};
+
+static int v9fs_mapped_dotl_flags(int flags)
+{
+	int i;
+	int rflags = 0;
+	struct dotl_openflag_map dotl_oflag_map[] = {
+		{ O_CREAT,	P9_DOTL_CREATE },
+		{ O_EXCL,	P9_DOTL_EXCL },
+		{ O_NOCTTY,	P9_DOTL_NOCTTY },
+		{ O_TRUNC,	P9_DOTL_TRUNC },
+		{ O_APPEND,	P9_DOTL_APPEND },
+		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
+		{ O_DSYNC,	P9_DOTL_DSYNC },
+		{ FASYNC,	P9_DOTL_FASYNC },
+		{ O_DIRECT,	P9_DOTL_DIRECT },
+		{ O_LARGEFILE,	P9_DOTL_LARGEFILE },
+		{ O_DIRECTORY,	P9_DOTL_DIRECTORY },
+		{ O_NOFOLLOW,	P9_DOTL_NOFOLLOW },
+		{ O_NOATIME,	P9_DOTL_NOATIME },
+		{ O_CLOEXEC,	P9_DOTL_CLOEXEC },
+		{ O_SYNC,	P9_DOTL_SYNC},
+	};
+	for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) {
+		if (flags & dotl_oflag_map[i].open_flag)
+			rflags |= dotl_oflag_map[i].dotl_flag;
+	}
+	return rflags;
+}
+
+/**
+ * v9fs_open_to_dotl_flags- convert Linux specific open flags to
+ * plan 9 open flag.
+ * @flags: flags to convert
+ */
+int v9fs_open_to_dotl_flags(int flags)
+{
+	int rflags = 0;
+
+	/*
+	 * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY
+	 * and P9_DOTL_NOACCESS
+	 */
+	rflags |= flags & O_ACCMODE;
+	rflags |= v9fs_mapped_dotl_flags(flags);
+
+	return rflags;
+}
+
 /**
  * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
  * @dir: directory inode that is being created
@@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   "Failed to get acl values in creat %d\n", err);
 		goto error;
 	}
-	err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+	err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
+				    mode, gid, &qid);
 	if (err < 0) {
 		P9_DPRINTK(P9_DEBUG_VFS,
 			   "p9_client_open_dotl failed in creat %d\n",
@@ -281,10 +335,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	d_instantiate(dentry, inode);
 	err = v9fs_fid_add(dentry, fid);
 	if (err < 0)
 		goto error;
+	d_instantiate(dentry, inode);
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, &dacl, &pacl);
@@ -403,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			   err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
@@ -414,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 		 * inode with stat. We need to get an inode
 		 * so that we can set the acl with dentry
 		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -540,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+	mode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
@@ -552,11 +607,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		inode->i_uid = stat->st_uid;
 		inode->i_gid = stat->st_gid;
 		inode->i_nlink = stat->st_nlink;
-		inode->i_mode = stat->st_mode;
-		inode->i_rdev = new_decode_dev(stat->st_rdev);
 
-		if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
-			init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		mode = stat->st_mode & S_IALLUGO;
+		mode |= inode->i_mode & ~S_IALLUGO;
+		inode->i_mode = mode;
 
 		i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
@@ -657,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 			   err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -810,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			   err);
 			goto error;
 		}
-		d_instantiate(dentry, inode);
 		err = v9fs_fid_add(dentry, fid);
 		if (err < 0)
 			goto error;
+		d_instantiate(dentry, inode);
 		fid = NULL;
 	} else {
 		/*
		 * Not in cached mode. No need to populate inode with stat.
		 * socket syscall returns a fd, so we need instantiate
		 */
-		inode = v9fs_get_inode(dir->i_sb, mode);
+		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			goto error;
@@ -886,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
 	if (IS_ERR(st))
 		return PTR_ERR(st);
+	/*
+	 * Don't update inode if the file type is different
+	 */
+	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+		goto out;
 
 	spin_lock(&inode->i_lock);
 	/*
@@ -897,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	if (v9ses->cache)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
+out:
 	kfree(st);
 	return 0;
 }
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index feef6cdc1fd2..c70251d47ed1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
 
-	inode = v9fs_get_inode(sb, S_IFDIR | mode);
+	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
diff --git a/fs/Makefile b/fs/Makefile
index afc109691a9b..d2c3353d5477 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
-obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/attr.c b/fs/attr.c
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -13,6 +13,7 @@
 #include <linux/fsnotify.h>
 #include <linux/fcntl.h>
 #include <linux/security.h>
+#include <linux/evm.h>
 
 /**
  * inode_change_ok - check if attribute changes to an inode are allowed
@@ -237,8 +238,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	else
 		error = simple_setattr(dentry, attr);
 
-	if (!error)
+	if (!error) {
 		fsnotify_change(dentry, ia_valid);
+		evm_inode_post_setattr(dentry, ia_valid);
+	}
 
 	return error;
 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 475f9c597cb7..326dc08d3e3f 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -39,27 +39,17 @@
 
 /* #define DEBUG */
 
-#ifdef DEBUG
-#define DPRINTK(fmt, args...)				\
-do {							\
-	printk(KERN_DEBUG "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##args);	\
-} while (0)
-#else
-#define DPRINTK(fmt, args...) do {} while (0)
-#endif
-
-#define AUTOFS_WARN(fmt, args...)			\
-do {							\
+#define DPRINTK(fmt, ...)				\
+	pr_debug("pid %d: %s: " fmt "\n",		\
+		current->pid, __func__, ##__VA_ARGS__)
+
+#define AUTOFS_WARN(fmt, ...)				\
 	printk(KERN_WARNING "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##args);	\
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
-#define AUTOFS_ERROR(fmt, args...)			\
-do {							\
+#define AUTOFS_ERROR(fmt, ...)				\
 	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
-		current->pid, __func__, ##args);	\
-} while (0)
+		current->pid, __func__, ##__VA_ARGS__)
 
 /* Unified info structure. This is pointed to by both the dentry and
    inode structures. Each file in the filesystem has an instance of this
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 25435987d6ae..e1fbdeef85db 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		wq->wait_queue_token, wq->name.len, wq->name.name, type);
+		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
 
 	memset(&pkt,0,sizeof pkt); /* For security reasons */
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 54b8c28bebc8..720d885e8dca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		befs_data_stream *data = &befs_ino->i_data.ds;
 		befs_off_t len = data->size;
 
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
+		if (len == 0) {
+			befs_error(sb, "Long symlink with illegal length");
 			link = ERR_PTR(-EIO);
 		} else {
-			link[len - 1] = '\0';
+			befs_debug(sb, "Follow long symlink");
+
+			link = kmalloc(len, GFP_NOFS);
+			if (!link) {
+				link = ERR_PTR(-ENOMEM);
+			} else if (befs_read_lsymlink(sb, data, link, len) != len) {
+				kfree(link);
+				befs_error(sb, "Failed to read entire long symlink");
+				link = ERR_PTR(-EIO);
+			} else {
+				link[len - 1] = '\0';
+			}
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ff77262e887c..95f786ec7f08 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1429,6 +1429,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
+		/* ->release can cause the old bdi to disappear,
+		 * so must switch it out first
+		 */
+		bdev_inode_switch_bdi(bdev->bd_inode,
+					&default_backing_dev_info);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
@@ -1442,8 +1447,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
-		bdev_inode_switch_bdi(bdev->bd_inode,
-					&default_backing_dev_info);
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 502b9e988679..d9f99a16edd6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -176,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode) | |||
176 | { | 176 | { |
177 | u64 ino = BTRFS_I(inode)->location.objectid; | 177 | u64 ino = BTRFS_I(inode)->location.objectid; |
178 | 178 | ||
179 | if (ino <= BTRFS_FIRST_FREE_OBJECTID) | 179 | /* |
180 | * !ino: btree_inode | ||
181 | * type == BTRFS_ROOT_ITEM_KEY: subvol dir | ||
182 | */ | ||
183 | if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) | ||
180 | ino = inode->i_ino; | 184 | ino = inode->i_ino; |
181 | return ino; | 185 | return ino; |
182 | } | 186 | } |
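The btrfs_inode.h change replaces a magnitude test with exactly the two cases the new comment names. A compilable sketch of the new predicate; the stub structs stand in for the kernel's, and 132 is the on-disk value of BTRFS_ROOT_ITEM_KEY:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_ROOT_ITEM_KEY 132         /* on-disk key type for root items */

struct btrfs_key  { uint64_t objectid; uint8_t type; };
struct inode_stub { uint64_t i_ino; struct btrfs_key location; };

/* Sketch of the fixed helper: only the btree inode (objectid 0) and
 * subvolume dirs (ROOT_ITEM location keys) fall back to i_ino; any
 * other objectid, however small, is trusted as-is. */
static uint64_t btrfs_ino_sketch(const struct inode_stub *inode)
{
        uint64_t ino = inode->location.objectid;

        if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
                ino = inode->i_ino;
        return ino;
}

int main(void)
{
        struct inode_stub subvol = { 99, { 5, BTRFS_ROOT_ITEM_KEY } };

        printf("%llu\n", (unsigned long long)btrfs_ino_sketch(&subvol));
        return 0;
}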
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0469263e327e..03912c5c6f49 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -1415,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); | |||
1415 | #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ | 1415 | #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ |
1416 | static inline u##bits btrfs_##name(struct extent_buffer *eb) \ | 1416 | static inline u##bits btrfs_##name(struct extent_buffer *eb) \ |
1417 | { \ | 1417 | { \ |
1418 | type *p = kmap_atomic(eb->first_page, KM_USER0); \ | 1418 | type *p = page_address(eb->first_page); \ |
1419 | u##bits res = le##bits##_to_cpu(p->member); \ | 1419 | u##bits res = le##bits##_to_cpu(p->member); \ |
1420 | kunmap_atomic(p, KM_USER0); \ | ||
1421 | return res; \ | 1420 | return res; \ |
1422 | } \ | 1421 | } \ |
1423 | static inline void btrfs_set_##name(struct extent_buffer *eb, \ | 1422 | static inline void btrfs_set_##name(struct extent_buffer *eb, \ |
1424 | u##bits val) \ | 1423 | u##bits val) \ |
1425 | { \ | 1424 | { \ |
1426 | type *p = kmap_atomic(eb->first_page, KM_USER0); \ | 1425 | type *p = page_address(eb->first_page); \ |
1427 | p->member = cpu_to_le##bits(val); \ | 1426 | p->member = cpu_to_le##bits(val); \ |
1428 | kunmap_atomic(p, KM_USER0); \ | ||
1429 | } | 1427 | } |
1430 | 1428 | ||
1431 | #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ | 1429 | #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ |
@@ -2367,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, | |||
2367 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); | 2365 | int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); |
2368 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); | 2366 | int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); |
2369 | int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); | 2367 | int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); |
2370 | int btrfs_drop_snapshot(struct btrfs_root *root, | 2368 | void btrfs_drop_snapshot(struct btrfs_root *root, |
2371 | struct btrfs_block_rsv *block_rsv, int update_ref); | 2369 | struct btrfs_block_rsv *block_rsv, int update_ref); |
2372 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | 2370 | int btrfs_drop_subtree(struct btrfs_trans_handle *trans, |
2373 | struct btrfs_root *root, | 2371 | struct btrfs_root *root, |
2374 | struct extent_buffer *node, | 2372 | struct extent_buffer *node, |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 66bac226944e..f5be06a2462f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -1782,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1782 | 1782 | ||
1783 | 1783 | ||
1784 | for (i = 0; i < multi->num_stripes; i++, stripe++) { | 1784 | for (i = 0; i < multi->num_stripes; i++, stripe++) { |
1785 | if (!stripe->dev->can_discard) | ||
1786 | continue; | ||
1787 | |||
1785 | ret = btrfs_issue_discard(stripe->dev->bdev, | 1788 | ret = btrfs_issue_discard(stripe->dev->bdev, |
1786 | stripe->physical, | 1789 | stripe->physical, |
1787 | stripe->length); | 1790 | stripe->length); |
@@ -1789,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1789 | discarded_bytes += stripe->length; | 1792 | discarded_bytes += stripe->length; |
1790 | else if (ret != -EOPNOTSUPP) | 1793 | else if (ret != -EOPNOTSUPP) |
1791 | break; | 1794 | break; |
1795 | |||
1796 | /* | ||
1797 | * Just in case we get back EOPNOTSUPP for some reason, | ||
1798 | * just ignore the return value so we don't screw up | ||
1799 | * people calling discard_extent. | ||
1800 | */ | ||
1801 | ret = 0; | ||
1792 | } | 1802 | } |
1793 | kfree(multi); | 1803 | kfree(multi); |
1794 | } | 1804 | } |
1795 | if (discarded_bytes && ret == -EOPNOTSUPP) | ||
1796 | ret = 0; | ||
1797 | 1805 | ||
1798 | if (actual_bytes) | 1806 | if (actual_bytes) |
1799 | *actual_bytes = discarded_bytes; | 1807 | *actual_bytes = discarded_bytes; |
@@ -6269,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
6269 | * also make sure backrefs for the shared block and all lower level | 6277 | * also make sure backrefs for the shared block and all lower level |
6270 | * blocks are properly updated. | 6278 | * blocks are properly updated. |
6271 | */ | 6279 | */ |
6272 | int btrfs_drop_snapshot(struct btrfs_root *root, | 6280 | void btrfs_drop_snapshot(struct btrfs_root *root, |
6273 | struct btrfs_block_rsv *block_rsv, int update_ref) | 6281 | struct btrfs_block_rsv *block_rsv, int update_ref) |
6274 | { | 6282 | { |
6275 | struct btrfs_path *path; | 6283 | struct btrfs_path *path; |
6276 | struct btrfs_trans_handle *trans; | 6284 | struct btrfs_trans_handle *trans; |
@@ -6283,13 +6291,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, | |||
6283 | int level; | 6291 | int level; |
6284 | 6292 | ||
6285 | path = btrfs_alloc_path(); | 6293 | path = btrfs_alloc_path(); |
6286 | if (!path) | 6294 | if (!path) { |
6287 | return -ENOMEM; | 6295 | err = -ENOMEM; |
6296 | goto out; | ||
6297 | } | ||
6288 | 6298 | ||
6289 | wc = kzalloc(sizeof(*wc), GFP_NOFS); | 6299 | wc = kzalloc(sizeof(*wc), GFP_NOFS); |
6290 | if (!wc) { | 6300 | if (!wc) { |
6291 | btrfs_free_path(path); | 6301 | btrfs_free_path(path); |
6292 | return -ENOMEM; | 6302 | err = -ENOMEM; |
6303 | goto out; | ||
6293 | } | 6304 | } |
6294 | 6305 | ||
6295 | trans = btrfs_start_transaction(tree_root, 0); | 6306 | trans = btrfs_start_transaction(tree_root, 0); |
@@ -6318,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, | |||
6318 | path->lowest_level = 0; | 6329 | path->lowest_level = 0; |
6319 | if (ret < 0) { | 6330 | if (ret < 0) { |
6320 | err = ret; | 6331 | err = ret; |
6321 | goto out; | 6332 | goto out_free; |
6322 | } | 6333 | } |
6323 | WARN_ON(ret > 0); | 6334 | WARN_ON(ret > 0); |
6324 | 6335 | ||
@@ -6425,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, | |||
6425 | free_extent_buffer(root->commit_root); | 6436 | free_extent_buffer(root->commit_root); |
6426 | kfree(root); | 6437 | kfree(root); |
6427 | } | 6438 | } |
6428 | out: | 6439 | out_free: |
6429 | btrfs_end_transaction_throttle(trans, tree_root); | 6440 | btrfs_end_transaction_throttle(trans, tree_root); |
6430 | kfree(wc); | 6441 | kfree(wc); |
6431 | btrfs_free_path(path); | 6442 | btrfs_free_path(path); |
6432 | return err; | 6443 | out: |
6444 | if (err) | ||
6445 | btrfs_std_error(root->fs_info, err); | ||
6446 | return; | ||
6433 | } | 6447 | } |
6434 | 6448 | ||
6435 | /* | 6449 | /* |
@@ -6720,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
6720 | struct btrfs_space_info *space_info; | 6734 | struct btrfs_space_info *space_info; |
6721 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | 6735 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; |
6722 | struct btrfs_device *device; | 6736 | struct btrfs_device *device; |
6737 | u64 min_free; | ||
6738 | u64 dev_min = 1; | ||
6739 | u64 dev_nr = 0; | ||
6740 | int index; | ||
6723 | int full = 0; | 6741 | int full = 0; |
6724 | int ret = 0; | 6742 | int ret = 0; |
6725 | 6743 | ||
@@ -6729,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
6729 | if (!block_group) | 6747 | if (!block_group) |
6730 | return -1; | 6748 | return -1; |
6731 | 6749 | ||
6750 | min_free = btrfs_block_group_used(&block_group->item); | ||
6751 | |||
6732 | /* no bytes used, we're good */ | 6752 | /* no bytes used, we're good */ |
6733 | if (!btrfs_block_group_used(&block_group->item)) | 6753 | if (!min_free) |
6734 | goto out; | 6754 | goto out; |
6735 | 6755 | ||
6736 | space_info = block_group->space_info; | 6756 | space_info = block_group->space_info; |
@@ -6746,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
6746 | * all of the extents from this block group. If we can, we're good | 6766 | * all of the extents from this block group. If we can, we're good |
6747 | */ | 6767 | */ |
6748 | if ((space_info->total_bytes != block_group->key.offset) && | 6768 | if ((space_info->total_bytes != block_group->key.offset) && |
6749 | (space_info->bytes_used + space_info->bytes_reserved + | 6769 | (space_info->bytes_used + space_info->bytes_reserved + |
6750 | space_info->bytes_pinned + space_info->bytes_readonly + | 6770 | space_info->bytes_pinned + space_info->bytes_readonly + |
6751 | btrfs_block_group_used(&block_group->item) < | 6771 | min_free < space_info->total_bytes)) { |
6752 | space_info->total_bytes)) { | ||
6753 | spin_unlock(&space_info->lock); | 6772 | spin_unlock(&space_info->lock); |
6754 | goto out; | 6773 | goto out; |
6755 | } | 6774 | } |
@@ -6766,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
6766 | if (full) | 6785 | if (full) |
6767 | goto out; | 6786 | goto out; |
6768 | 6787 | ||
6788 | /* | ||
6789 | * index: | ||
6790 | * 0: raid10 | ||
6791 | * 1: raid1 | ||
6792 | * 2: dup | ||
6793 | * 3: raid0 | ||
6794 | * 4: single | ||
6795 | */ | ||
6796 | index = get_block_group_index(block_group); | ||
6797 | if (index == 0) { | ||
6798 | dev_min = 4; | ||
6799 | /* Divide by 2 */ | ||
6800 | min_free >>= 1; | ||
6801 | } else if (index == 1) { | ||
6802 | dev_min = 2; | ||
6803 | } else if (index == 2) { | ||
6804 | /* Multiply by 2 */ | ||
6805 | min_free <<= 1; | ||
6806 | } else if (index == 3) { | ||
6807 | dev_min = fs_devices->rw_devices; | ||
6808 | do_div(min_free, dev_min); | ||
6809 | } | ||
6810 | |||
6769 | mutex_lock(&root->fs_info->chunk_mutex); | 6811 | mutex_lock(&root->fs_info->chunk_mutex); |
6770 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { | 6812 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { |
6771 | u64 min_free = btrfs_block_group_used(&block_group->item); | ||
6772 | u64 dev_offset; | 6813 | u64 dev_offset; |
6773 | 6814 | ||
6774 | /* | 6815 | /* |
@@ -6779,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
6779 | ret = find_free_dev_extent(NULL, device, min_free, | 6820 | ret = find_free_dev_extent(NULL, device, min_free, |
6780 | &dev_offset, NULL); | 6821 | &dev_offset, NULL); |
6781 | if (!ret) | 6822 | if (!ret) |
6823 | dev_nr++; | ||
6824 | |||
6825 | if (dev_nr >= dev_min) | ||
6782 | break; | 6826 | break; |
6827 | |||
6783 | ret = -1; | 6828 | ret = -1; |
6784 | } | 6829 | } |
6785 | } | 6830 | } |
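The index comment in the btrfs_can_relocate() hunk maps block-group profiles to numbers; the branches then scale both the free space each device must offer and the number of devices that must offer it. A sketch of that mapping under the same ordering (raid10 needs four devices holding half the used bytes each, raid1 two full copies, dup twice the bytes on one device, raid0 an even split across every rw device):

#include <stdint.h>
#include <stdio.h>

/* Profile ordering taken from the comment in btrfs_can_relocate():
 * 0 raid10, 1 raid1, 2 dup, 3 raid0, 4 single. */
enum raid_index { RAID10, RAID1, DUP, RAID0, SINGLE };

/* How many devices must each offer how much free space before a block
 * group holding 'used' bytes can be relocated. */
static void relocate_requirements(enum raid_index index, uint64_t used,
                                  uint64_t rw_devices,
                                  uint64_t *min_free, uint64_t *dev_min)
{
        *min_free = used;
        *dev_min = 1;

        switch (index) {
        case RAID10:                    /* 4 devices, half the bytes each */
                *dev_min = 4;
                *min_free >>= 1;
                break;
        case RAID1:                     /* two full copies */
                *dev_min = 2;
                break;
        case DUP:                       /* both copies on one device */
                *min_free <<= 1;
                break;
        case RAID0:                     /* spread over every rw device */
                *dev_min = rw_devices;
                *min_free /= rw_devices;
                break;
        case SINGLE:
                break;
        }
}

int main(void)
{
        uint64_t min_free, dev_min;

        /* e.g. a 1 GiB raid10 block group on a 4-device fs */
        relocate_requirements(RAID10, 1ULL << 30, 4, &min_free, &dev_min);
        printf("%llu devices with %llu bytes free each\n",
               (unsigned long long)dev_min, (unsigned long long)min_free);
        return 0;
}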
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b910694f61ed..a1cb7821becd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -183,8 +183,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, | |||
183 | * read from the commit root and sidestep a nasty deadlock | 183 | * read from the commit root and sidestep a nasty deadlock |
184 | * between reading the free space cache and updating the csum tree. | 184 | * between reading the free space cache and updating the csum tree. |
185 | */ | 185 | */ |
186 | if (btrfs_is_free_space_inode(root, inode)) | 186 | if (btrfs_is_free_space_inode(root, inode)) { |
187 | path->search_commit_root = 1; | 187 | path->search_commit_root = 1; |
188 | path->skip_locking = 1; | ||
189 | } | ||
188 | 190 | ||
189 | disk_bytenr = (u64)bio->bi_sector << 9; | 191 | disk_bytenr = (u64)bio->bi_sector << 9; |
190 | if (dio) | 192 | if (dio) |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 658d66959abe..1266f6e9cdb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -150,6 +150,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, | |||
150 | spin_lock(&root->fs_info->defrag_inodes_lock); | 150 | spin_lock(&root->fs_info->defrag_inodes_lock); |
151 | if (!BTRFS_I(inode)->in_defrag) | 151 | if (!BTRFS_I(inode)->in_defrag) |
152 | __btrfs_add_inode_defrag(inode, defrag); | 152 | __btrfs_add_inode_defrag(inode, defrag); |
153 | else | ||
154 | kfree(defrag); | ||
153 | spin_unlock(&root->fs_info->defrag_inodes_lock); | 155 | spin_unlock(&root->fs_info->defrag_inodes_lock); |
154 | return 0; | 156 | return 0; |
155 | } | 157 | } |
@@ -1034,11 +1036,13 @@ out: | |||
1034 | * on error we return an unlocked page and the error value | 1036 | * on error we return an unlocked page and the error value |
1035 | * on success we return a locked page and 0 | 1037 | * on success we return a locked page and 0 |
1036 | */ | 1038 | */ |
1037 | static int prepare_uptodate_page(struct page *page, u64 pos) | 1039 | static int prepare_uptodate_page(struct page *page, u64 pos, |
1040 | bool force_uptodate) | ||
1038 | { | 1041 | { |
1039 | int ret = 0; | 1042 | int ret = 0; |
1040 | 1043 | ||
1041 | if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { | 1044 | if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && |
1045 | !PageUptodate(page)) { | ||
1042 | ret = btrfs_readpage(NULL, page); | 1046 | ret = btrfs_readpage(NULL, page); |
1043 | if (ret) | 1047 | if (ret) |
1044 | return ret; | 1048 | return ret; |
@@ -1059,7 +1063,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos) | |||
1059 | static noinline int prepare_pages(struct btrfs_root *root, struct file *file, | 1063 | static noinline int prepare_pages(struct btrfs_root *root, struct file *file, |
1060 | struct page **pages, size_t num_pages, | 1064 | struct page **pages, size_t num_pages, |
1061 | loff_t pos, unsigned long first_index, | 1065 | loff_t pos, unsigned long first_index, |
1062 | size_t write_bytes) | 1066 | size_t write_bytes, bool force_uptodate) |
1063 | { | 1067 | { |
1064 | struct extent_state *cached_state = NULL; | 1068 | struct extent_state *cached_state = NULL; |
1065 | int i; | 1069 | int i; |
@@ -1073,12 +1077,6 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, | |||
1073 | start_pos = pos & ~((u64)root->sectorsize - 1); | 1077 | start_pos = pos & ~((u64)root->sectorsize - 1); |
1074 | last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; | 1078 | last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; |
1075 | 1079 | ||
1076 | if (start_pos > inode->i_size) { | ||
1077 | err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); | ||
1078 | if (err) | ||
1079 | return err; | ||
1080 | } | ||
1081 | |||
1082 | again: | 1080 | again: |
1083 | for (i = 0; i < num_pages; i++) { | 1081 | for (i = 0; i < num_pages; i++) { |
1084 | pages[i] = find_or_create_page(inode->i_mapping, index + i, | 1082 | pages[i] = find_or_create_page(inode->i_mapping, index + i, |
@@ -1090,10 +1088,11 @@ again: | |||
1090 | } | 1088 | } |
1091 | 1089 | ||
1092 | if (i == 0) | 1090 | if (i == 0) |
1093 | err = prepare_uptodate_page(pages[i], pos); | 1091 | err = prepare_uptodate_page(pages[i], pos, |
1092 | force_uptodate); | ||
1094 | if (i == num_pages - 1) | 1093 | if (i == num_pages - 1) |
1095 | err = prepare_uptodate_page(pages[i], | 1094 | err = prepare_uptodate_page(pages[i], |
1096 | pos + write_bytes); | 1095 | pos + write_bytes, false); |
1097 | if (err) { | 1096 | if (err) { |
1098 | page_cache_release(pages[i]); | 1097 | page_cache_release(pages[i]); |
1099 | faili = i - 1; | 1098 | faili = i - 1; |
@@ -1162,6 +1161,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1162 | size_t num_written = 0; | 1161 | size_t num_written = 0; |
1163 | int nrptrs; | 1162 | int nrptrs; |
1164 | int ret = 0; | 1163 | int ret = 0; |
1164 | bool force_page_uptodate = false; | ||
1165 | 1165 | ||
1166 | nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / | 1166 | nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / |
1167 | PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / | 1167 | PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / |
@@ -1204,7 +1204,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1204 | * contents of pages from loop to loop | 1204 | * contents of pages from loop to loop |
1205 | */ | 1205 | */ |
1206 | ret = prepare_pages(root, file, pages, num_pages, | 1206 | ret = prepare_pages(root, file, pages, num_pages, |
1207 | pos, first_index, write_bytes); | 1207 | pos, first_index, write_bytes, |
1208 | force_page_uptodate); | ||
1208 | if (ret) { | 1209 | if (ret) { |
1209 | btrfs_delalloc_release_space(inode, | 1210 | btrfs_delalloc_release_space(inode, |
1210 | num_pages << PAGE_CACHE_SHIFT); | 1211 | num_pages << PAGE_CACHE_SHIFT); |
@@ -1221,12 +1222,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1221 | if (copied < write_bytes) | 1222 | if (copied < write_bytes) |
1222 | nrptrs = 1; | 1223 | nrptrs = 1; |
1223 | 1224 | ||
1224 | if (copied == 0) | 1225 | if (copied == 0) { |
1226 | force_page_uptodate = true; | ||
1225 | dirty_pages = 0; | 1227 | dirty_pages = 0; |
1226 | else | 1228 | } else { |
1229 | force_page_uptodate = false; | ||
1227 | dirty_pages = (copied + offset + | 1230 | dirty_pages = (copied + offset + |
1228 | PAGE_CACHE_SIZE - 1) >> | 1231 | PAGE_CACHE_SIZE - 1) >> |
1229 | PAGE_CACHE_SHIFT; | 1232 | PAGE_CACHE_SHIFT; |
1233 | } | ||
1230 | 1234 | ||
1231 | /* | 1235 | /* |
1232 | * If we had a short copy we need to release the excess delalloc | 1236 | * If we had a short copy we need to release the excess delalloc |
@@ -1336,6 +1340,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1336 | struct inode *inode = fdentry(file)->d_inode; | 1340 | struct inode *inode = fdentry(file)->d_inode; |
1337 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1341 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1338 | loff_t *ppos = &iocb->ki_pos; | 1342 | loff_t *ppos = &iocb->ki_pos; |
1343 | u64 start_pos; | ||
1339 | ssize_t num_written = 0; | 1344 | ssize_t num_written = 0; |
1340 | ssize_t err = 0; | 1345 | ssize_t err = 0; |
1341 | size_t count, ocount; | 1346 | size_t count, ocount; |
@@ -1384,6 +1389,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
1384 | file_update_time(file); | 1389 | file_update_time(file); |
1385 | BTRFS_I(inode)->sequence++; | 1390 | BTRFS_I(inode)->sequence++; |
1386 | 1391 | ||
1392 | start_pos = round_down(pos, root->sectorsize); | ||
1393 | if (start_pos > i_size_read(inode)) { | ||
1394 | err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); | ||
1395 | if (err) { | ||
1396 | mutex_unlock(&inode->i_mutex); | ||
1397 | goto out; | ||
1398 | } | ||
1399 | } | ||
1400 | |||
1387 | if (unlikely(file->f_flags & O_DIRECT)) { | 1401 | if (unlikely(file->f_flags & O_DIRECT)) { |
1388 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, | 1402 | num_written = __btrfs_direct_write(iocb, iov, nr_segs, |
1389 | pos, ppos, count, ocount); | 1403 | pos, ppos, count, ocount); |
@@ -1638,11 +1652,15 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1638 | 1652 | ||
1639 | cur_offset = alloc_start; | 1653 | cur_offset = alloc_start; |
1640 | while (1) { | 1654 | while (1) { |
1655 | u64 actual_end; | ||
1656 | |||
1641 | em = btrfs_get_extent(inode, NULL, 0, cur_offset, | 1657 | em = btrfs_get_extent(inode, NULL, 0, cur_offset, |
1642 | alloc_end - cur_offset, 0); | 1658 | alloc_end - cur_offset, 0); |
1643 | BUG_ON(IS_ERR_OR_NULL(em)); | 1659 | BUG_ON(IS_ERR_OR_NULL(em)); |
1644 | last_byte = min(extent_map_end(em), alloc_end); | 1660 | last_byte = min(extent_map_end(em), alloc_end); |
1661 | actual_end = min_t(u64, extent_map_end(em), offset + len); | ||
1645 | last_byte = (last_byte + mask) & ~mask; | 1662 | last_byte = (last_byte + mask) & ~mask; |
1663 | |||
1646 | if (em->block_start == EXTENT_MAP_HOLE || | 1664 | if (em->block_start == EXTENT_MAP_HOLE || |
1647 | (cur_offset >= inode->i_size && | 1665 | (cur_offset >= inode->i_size && |
1648 | !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { | 1666 | !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { |
@@ -1655,6 +1673,16 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1655 | free_extent_map(em); | 1673 | free_extent_map(em); |
1656 | break; | 1674 | break; |
1657 | } | 1675 | } |
1676 | } else if (actual_end > inode->i_size && | ||
1677 | !(mode & FALLOC_FL_KEEP_SIZE)) { | ||
1678 | /* | ||
1679 | * We didn't need to allocate any more space, but we | ||
1680 | * still extended the size of the file so we need to | ||
1681 | * update i_size. | ||
1682 | */ | ||
1683 | inode->i_ctime = CURRENT_TIME; | ||
1684 | i_size_write(inode, actual_end); | ||
1685 | btrfs_ordered_update_i_size(inode, actual_end, NULL); | ||
1658 | } | 1686 | } |
1659 | free_extent_map(em); | 1687 | free_extent_map(em); |
1660 | 1688 | ||
@@ -1793,10 +1821,15 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) | |||
1793 | switch (origin) { | 1821 | switch (origin) { |
1794 | case SEEK_END: | 1822 | case SEEK_END: |
1795 | case SEEK_CUR: | 1823 | case SEEK_CUR: |
1796 | offset = generic_file_llseek_unlocked(file, offset, origin); | 1824 | offset = generic_file_llseek(file, offset, origin); |
1797 | goto out; | 1825 | goto out; |
1798 | case SEEK_DATA: | 1826 | case SEEK_DATA: |
1799 | case SEEK_HOLE: | 1827 | case SEEK_HOLE: |
1828 | if (offset >= i_size_read(inode)) { | ||
1829 | mutex_unlock(&inode->i_mutex); | ||
1830 | return -ENXIO; | ||
1831 | } | ||
1832 | |||
1800 | ret = find_desired_extent(inode, &offset, origin); | 1833 | ret = find_desired_extent(inode, &offset, origin); |
1801 | if (ret) { | 1834 | if (ret) { |
1802 | mutex_unlock(&inode->i_mutex); | 1835 | mutex_unlock(&inode->i_mutex); |
@@ -1804,10 +1837,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) | |||
1804 | } | 1837 | } |
1805 | } | 1838 | } |
1806 | 1839 | ||
1807 | if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) | 1840 | if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { |
1808 | return -EINVAL; | 1841 | offset = -EINVAL; |
1809 | if (offset > inode->i_sb->s_maxbytes) | 1842 | goto out; |
1810 | return -EINVAL; | 1843 | } |
1844 | if (offset > inode->i_sb->s_maxbytes) { | ||
1845 | offset = -EINVAL; | ||
1846 | goto out; | ||
1847 | } | ||
1811 | 1848 | ||
1812 | /* Special lock needed here? */ | 1849 | /* Special lock needed here? */ |
1813 | if (offset != file->f_pos) { | 1850 | if (offset != file->f_pos) { |
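Among the file.c changes, prepare_uptodate_page() gains a force_uptodate flag: after a copy attempt moves zero bytes, the next pass forces the first page fully uptodate so the retry cannot fault on the same partially populated page again. A sketch of the resulting read-first decision, assuming a 4 KiB page size:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096ULL         /* assumed page size */

/* Sketch of the prepare_uptodate_page() test: read the page in first
 * when the write starts mid-page, or when the caller insists. */
static bool must_read_page_first(uint64_t pos, bool page_uptodate,
                                 bool force_uptodate)
{
        bool partial_start = (pos & (PAGE_CACHE_SIZE - 1)) != 0;

        return (partial_start || force_uptodate) && !page_uptodate;
}

int main(void)
{
        /* page-aligned write, page not uptodate: no read needed ... */
        printf("%d\n", must_read_page_first(8192, false, false));
        /* ... unless the previous copy attempt moved zero bytes */
        printf("%d\n", must_read_page_first(8192, false, true));
        return 0;
}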
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6377713f639c..41ac927401d0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -190,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
190 | struct btrfs_path *path, | 190 | struct btrfs_path *path, |
191 | struct inode *inode) | 191 | struct inode *inode) |
192 | { | 192 | { |
193 | struct btrfs_block_rsv *rsv; | ||
193 | loff_t oldsize; | 194 | loff_t oldsize; |
194 | int ret = 0; | 195 | int ret = 0; |
195 | 196 | ||
197 | rsv = trans->block_rsv; | ||
196 | trans->block_rsv = root->orphan_block_rsv; | 198 | trans->block_rsv = root->orphan_block_rsv; |
197 | ret = btrfs_block_rsv_check(trans, root, | 199 | ret = btrfs_block_rsv_check(trans, root, |
198 | root->orphan_block_rsv, | 200 | root->orphan_block_rsv, |
@@ -210,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
210 | */ | 212 | */ |
211 | ret = btrfs_truncate_inode_items(trans, root, inode, | 213 | ret = btrfs_truncate_inode_items(trans, root, inode, |
212 | 0, BTRFS_EXTENT_DATA_KEY); | 214 | 0, BTRFS_EXTENT_DATA_KEY); |
215 | |||
216 | trans->block_rsv = rsv; | ||
213 | if (ret) { | 217 | if (ret) { |
214 | WARN_ON(1); | 218 | WARN_ON(1); |
215 | return ret; | 219 | return ret; |
@@ -1168,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) | |||
1168 | div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); | 1172 | div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); |
1169 | } | 1173 | } |
1170 | 1174 | ||
1171 | static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, | 1175 | static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, |
1172 | struct btrfs_free_space *info, u64 offset, | 1176 | struct btrfs_free_space *info, |
1173 | u64 bytes) | 1177 | u64 offset, u64 bytes) |
1174 | { | 1178 | { |
1175 | unsigned long start, count; | 1179 | unsigned long start, count; |
1176 | 1180 | ||
@@ -1181,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, | |||
1181 | bitmap_clear(info->bitmap, start, count); | 1185 | bitmap_clear(info->bitmap, start, count); |
1182 | 1186 | ||
1183 | info->bytes -= bytes; | 1187 | info->bytes -= bytes; |
1188 | } | ||
1189 | |||
1190 | static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, | ||
1191 | struct btrfs_free_space *info, u64 offset, | ||
1192 | u64 bytes) | ||
1193 | { | ||
1194 | __bitmap_clear_bits(ctl, info, offset, bytes); | ||
1184 | ctl->free_space -= bytes; | 1195 | ctl->free_space -= bytes; |
1185 | } | 1196 | } |
1186 | 1197 | ||
@@ -1984,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, | |||
1984 | return 0; | 1995 | return 0; |
1985 | 1996 | ||
1986 | ret = search_start; | 1997 | ret = search_start; |
1987 | bitmap_clear_bits(ctl, entry, ret, bytes); | 1998 | __bitmap_clear_bits(ctl, entry, ret, bytes); |
1988 | 1999 | ||
1989 | return ret; | 2000 | return ret; |
1990 | } | 2001 | } |
@@ -2039,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, | |||
2039 | continue; | 2050 | continue; |
2040 | } | 2051 | } |
2041 | } else { | 2052 | } else { |
2042 | |||
2043 | ret = entry->offset; | 2053 | ret = entry->offset; |
2044 | 2054 | ||
2045 | entry->offset += bytes; | 2055 | entry->offset += bytes; |
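The free-space-cache hunks split bitmap clearing in two: __bitmap_clear_bits() touches only the one entry, while bitmap_clear_bits() also charges ctl->free_space. btrfs_alloc_from_bitmap() switches to the inner variant because cluster setup already deducted those bytes from the total. A sketch of the split with stub structs for the kernel's:

#include <stdint.h>
#include <stdio.h>

struct free_space_ctl  { uint64_t free_space; };  /* whole-ctl total */
struct free_space_info { uint64_t bytes; };       /* one bitmap entry */

/* Inner helper, as __bitmap_clear_bits(): adjust only the entry. */
static void clear_bits_entry_only(struct free_space_info *info,
                                  uint64_t bytes)
{
        info->bytes -= bytes;
}

/* Outer wrapper, as bitmap_clear_bits(): entry plus global total. */
static void clear_bits(struct free_space_ctl *ctl,
                       struct free_space_info *info, uint64_t bytes)
{
        clear_bits_entry_only(info, bytes);
        ctl->free_space -= bytes;
}

int main(void)
{
        struct free_space_ctl ctl = { 8192 };
        struct free_space_info info = { 4096 };

        clear_bits(&ctl, &info, 1024);
        printf("entry=%llu total=%llu\n",
               (unsigned long long)info.bytes,
               (unsigned long long)ctl.free_space);
        return 0;
}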
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15fceefbca0a..b2d004ad66a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1786,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1786 | &ordered_extent->list); | 1786 | &ordered_extent->list); |
1787 | 1787 | ||
1788 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); | 1788 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); |
1789 | if (!ret) { | 1789 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { |
1790 | ret = btrfs_update_inode(trans, root, inode); | 1790 | ret = btrfs_update_inode(trans, root, inode); |
1791 | BUG_ON(ret); | 1791 | BUG_ON(ret); |
1792 | } | 1792 | } |
@@ -3510,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
3510 | err = btrfs_drop_extents(trans, inode, cur_offset, | 3510 | err = btrfs_drop_extents(trans, inode, cur_offset, |
3511 | cur_offset + hole_size, | 3511 | cur_offset + hole_size, |
3512 | &hint_byte, 1); | 3512 | &hint_byte, 1); |
3513 | if (err) | 3513 | if (err) { |
3514 | btrfs_end_transaction(trans, root); | ||
3514 | break; | 3515 | break; |
3516 | } | ||
3515 | 3517 | ||
3516 | err = btrfs_insert_file_extent(trans, root, | 3518 | err = btrfs_insert_file_extent(trans, root, |
3517 | btrfs_ino(inode), cur_offset, 0, | 3519 | btrfs_ino(inode), cur_offset, 0, |
3518 | 0, hole_size, 0, hole_size, | 3520 | 0, hole_size, 0, hole_size, |
3519 | 0, 0, 0); | 3521 | 0, 0, 0); |
3520 | if (err) | 3522 | if (err) { |
3523 | btrfs_end_transaction(trans, root); | ||
3521 | break; | 3524 | break; |
3525 | } | ||
3522 | 3526 | ||
3523 | btrfs_drop_extent_cache(inode, hole_start, | 3527 | btrfs_drop_extent_cache(inode, hole_start, |
3524 | last_byte - 1, 0); | 3528 | last_byte - 1, 0); |
@@ -3952,7 +3956,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | |||
3952 | struct btrfs_root *root, int *new) | 3956 | struct btrfs_root *root, int *new) |
3953 | { | 3957 | { |
3954 | struct inode *inode; | 3958 | struct inode *inode; |
3955 | int bad_inode = 0; | ||
3956 | 3959 | ||
3957 | inode = btrfs_iget_locked(s, location->objectid, root); | 3960 | inode = btrfs_iget_locked(s, location->objectid, root); |
3958 | if (!inode) | 3961 | if (!inode) |
@@ -3968,15 +3971,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | |||
3968 | if (new) | 3971 | if (new) |
3969 | *new = 1; | 3972 | *new = 1; |
3970 | } else { | 3973 | } else { |
3971 | bad_inode = 1; | 3974 | unlock_new_inode(inode); |
3975 | iput(inode); | ||
3976 | inode = ERR_PTR(-ESTALE); | ||
3972 | } | 3977 | } |
3973 | } | 3978 | } |
3974 | 3979 | ||
3975 | if (bad_inode) { | ||
3976 | iput(inode); | ||
3977 | inode = ERR_PTR(-ESTALE); | ||
3978 | } | ||
3979 | |||
3980 | return inode; | 3980 | return inode; |
3981 | } | 3981 | } |
3982 | 3982 | ||
@@ -4018,7 +4018,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
4018 | memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); | 4018 | memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); |
4019 | kfree(dentry->d_fsdata); | 4019 | kfree(dentry->d_fsdata); |
4020 | dentry->d_fsdata = NULL; | 4020 | dentry->d_fsdata = NULL; |
4021 | d_clear_need_lookup(dentry); | 4021 | /* This thing is hashed, drop it for now */ |
4022 | d_drop(dentry); | ||
4022 | } else { | 4023 | } else { |
4023 | ret = btrfs_inode_by_name(dir, dentry, &location); | 4024 | ret = btrfs_inode_by_name(dir, dentry, &location); |
4024 | } | 4025 | } |
@@ -4085,7 +4086,15 @@ static void btrfs_dentry_release(struct dentry *dentry) | |||
4085 | static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, | 4086 | static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, |
4086 | struct nameidata *nd) | 4087 | struct nameidata *nd) |
4087 | { | 4088 | { |
4088 | return d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); | 4089 | struct dentry *ret; |
4090 | |||
4091 | ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); | ||
4092 | if (unlikely(d_need_lookup(dentry))) { | ||
4093 | spin_lock(&dentry->d_lock); | ||
4094 | dentry->d_flags &= ~DCACHE_NEED_LOOKUP; | ||
4095 | spin_unlock(&dentry->d_lock); | ||
4096 | } | ||
4097 | return ret; | ||
4089 | } | 4098 | } |
4090 | 4099 | ||
4091 | unsigned char btrfs_filetype_table[] = { | 4100 | unsigned char btrfs_filetype_table[] = { |
@@ -4125,7 +4134,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
4125 | 4134 | ||
4126 | /* special case for "." */ | 4135 | /* special case for "." */ |
4127 | if (filp->f_pos == 0) { | 4136 | if (filp->f_pos == 0) { |
4128 | over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); | 4137 | over = filldir(dirent, ".", 1, |
4138 | filp->f_pos, btrfs_ino(inode), DT_DIR); | ||
4129 | if (over) | 4139 | if (over) |
4130 | return 0; | 4140 | return 0; |
4131 | filp->f_pos = 1; | 4141 | filp->f_pos = 1; |
@@ -4134,7 +4144,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
4134 | if (filp->f_pos == 1) { | 4144 | if (filp->f_pos == 1) { |
4135 | u64 pino = parent_ino(filp->f_path.dentry); | 4145 | u64 pino = parent_ino(filp->f_path.dentry); |
4136 | over = filldir(dirent, "..", 2, | 4146 | over = filldir(dirent, "..", 2, |
4137 | 2, pino, DT_DIR); | 4147 | filp->f_pos, pino, DT_DIR); |
4138 | if (over) | 4148 | if (over) |
4139 | return 0; | 4149 | return 0; |
4140 | filp->f_pos = 2; | 4150 | filp->f_pos = 2; |
@@ -5823,7 +5833,7 @@ again: | |||
5823 | 5833 | ||
5824 | add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); | 5834 | add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); |
5825 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); | 5835 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); |
5826 | if (!ret) | 5836 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) |
5827 | btrfs_update_inode(trans, root, inode); | 5837 | btrfs_update_inode(trans, root, inode); |
5828 | ret = 0; | 5838 | ret = 0; |
5829 | out_unlock: | 5839 | out_unlock: |
@@ -7354,11 +7364,15 @@ static int btrfs_set_page_dirty(struct page *page) | |||
7354 | static int btrfs_permission(struct inode *inode, int mask) | 7364 | static int btrfs_permission(struct inode *inode, int mask) |
7355 | { | 7365 | { |
7356 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7366 | struct btrfs_root *root = BTRFS_I(inode)->root; |
7367 | umode_t mode = inode->i_mode; | ||
7357 | 7368 | ||
7358 | if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) | 7369 | if (mask & MAY_WRITE && |
7359 | return -EROFS; | 7370 | (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { |
7360 | if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) | 7371 | if (btrfs_root_readonly(root)) |
7361 | return -EACCES; | 7372 | return -EROFS; |
7373 | if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) | ||
7374 | return -EACCES; | ||
7375 | } | ||
7362 | return generic_permission(inode, mask); | 7376 | return generic_permission(inode, mask); |
7363 | } | 7377 | } |
7364 | 7378 | ||
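The btrfs_permission() rewrite applies the read-only checks only to write access on regular files, directories and symlinks, so for example a device node inside a read-only subvolume can still be written through its driver. A compilable sketch of the narrowed test; 0x2 is the kernel's MAY_WRITE bit:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

#define MAY_WRITE 0x2

/* Only writes to reg/dir/symlink inodes hit the read-only tests;
 * 0 means fall through to generic_permission(). */
static int permission_sketch(mode_t mode, int mask,
                             bool root_readonly, bool inode_readonly)
{
        if ((mask & MAY_WRITE) &&
            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
                if (root_readonly)
                        return -EROFS;
                if (inode_readonly)
                        return -EACCES;
        }
        return 0;
}

int main(void)
{
        /* writing a device node on a read-only root is still allowed */
        printf("%d\n", permission_sketch(S_IFCHR | 0600, MAY_WRITE,
                                         true, false));
        return 0;
}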
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7cf013349941..dae5dfe41ba5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -1047,7 +1047,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1047 | if (!max_to_defrag) | 1047 | if (!max_to_defrag) |
1048 | max_to_defrag = last_index - 1; | 1048 | max_to_defrag = last_index - 1; |
1049 | 1049 | ||
1050 | while (i <= last_index && defrag_count < max_to_defrag) { | 1050 | /* |
1051 | * make writeback start from i, so the defrag range can be | ||
1052 | * written sequentially. | ||
1053 | */ | ||
1054 | if (i < inode->i_mapping->writeback_index) | ||
1055 | inode->i_mapping->writeback_index = i; | ||
1056 | |||
1057 | while (i <= last_index && defrag_count < max_to_defrag && | ||
1058 | (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> | ||
1059 | PAGE_CACHE_SHIFT)) { | ||
1051 | /* | 1060 | /* |
1052 | * make sure we stop running if someone unmounts | 1061 | * make sure we stop running if someone unmounts |
1053 | * the FS | 1062 | * the FS |
@@ -2177,6 +2186,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2177 | if (!(src_file->f_mode & FMODE_READ)) | 2186 | if (!(src_file->f_mode & FMODE_READ)) |
2178 | goto out_fput; | 2187 | goto out_fput; |
2179 | 2188 | ||
2189 | /* don't make the dst file partly checksummed */ | ||
2190 | if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != | ||
2191 | (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) | ||
2192 | goto out_fput; | ||
2193 | |||
2180 | ret = -EISDIR; | 2194 | ret = -EISDIR; |
2181 | if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) | 2195 | if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) |
2182 | goto out_fput; | 2196 | goto out_fput; |
@@ -2220,6 +2234,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2220 | !IS_ALIGNED(destoff, bs)) | 2234 | !IS_ALIGNED(destoff, bs)) |
2221 | goto out_unlock; | 2235 | goto out_unlock; |
2222 | 2236 | ||
2237 | if (destoff > inode->i_size) { | ||
2238 | ret = btrfs_cont_expand(inode, inode->i_size, destoff); | ||
2239 | if (ret) | ||
2240 | goto out_unlock; | ||
2241 | } | ||
2242 | |||
2243 | /* truncate page cache pages from target inode range */ | ||
2244 | truncate_inode_pages_range(&inode->i_data, destoff, | ||
2245 | PAGE_CACHE_ALIGN(destoff + len) - 1); | ||
2246 | |||
2223 | /* do any pending delalloc/csum calc on src, one way or | 2247 | /* do any pending delalloc/csum calc on src, one way or |
2224 | another, and lock file content */ | 2248 | another, and lock file content */ |
2225 | while (1) { | 2249 | while (1) { |
@@ -2313,7 +2337,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2313 | else | 2337 | else |
2314 | new_key.offset = destoff; | 2338 | new_key.offset = destoff; |
2315 | 2339 | ||
2316 | trans = btrfs_start_transaction(root, 1); | 2340 | /* |
2341 | * 1 - adjusting old extent (we may have to split it) | ||
2342 | * 1 - add new extent | ||
2343 | * 1 - inode update | ||
2344 | */ | ||
2345 | trans = btrfs_start_transaction(root, 3); | ||
2317 | if (IS_ERR(trans)) { | 2346 | if (IS_ERR(trans)) { |
2318 | ret = PTR_ERR(trans); | 2347 | ret = PTR_ERR(trans); |
2319 | goto out; | 2348 | goto out; |
@@ -2321,14 +2350,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2321 | 2350 | ||
2322 | if (type == BTRFS_FILE_EXTENT_REG || | 2351 | if (type == BTRFS_FILE_EXTENT_REG || |
2323 | type == BTRFS_FILE_EXTENT_PREALLOC) { | 2352 | type == BTRFS_FILE_EXTENT_PREALLOC) { |
2353 | /* | ||
2354 | * a | --- range to clone ---| b | ||
2355 | * | ------------- extent ------------- | | ||
2356 | */ | ||
2357 | |||
2358 | /* subtract range b */ | ||
2359 | if (key.offset + datal > off + len) | ||
2360 | datal = off + len - key.offset; | ||
2361 | |||
2362 | /* subtract range a */ | ||
2324 | if (off > key.offset) { | 2363 | if (off > key.offset) { |
2325 | datao += off - key.offset; | 2364 | datao += off - key.offset; |
2326 | datal -= off - key.offset; | 2365 | datal -= off - key.offset; |
2327 | } | 2366 | } |
2328 | 2367 | ||
2329 | if (key.offset + datal > off + len) | ||
2330 | datal = off + len - key.offset; | ||
2331 | |||
2332 | ret = btrfs_drop_extents(trans, inode, | 2368 | ret = btrfs_drop_extents(trans, inode, |
2333 | new_key.offset, | 2369 | new_key.offset, |
2334 | new_key.offset + datal, | 2370 | new_key.offset + datal, |
@@ -2425,7 +2461,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
2425 | if (endoff > inode->i_size) | 2461 | if (endoff > inode->i_size) |
2426 | btrfs_i_size_write(inode, endoff); | 2462 | btrfs_i_size_write(inode, endoff); |
2427 | 2463 | ||
2428 | BTRFS_I(inode)->flags = BTRFS_I(src)->flags; | ||
2429 | ret = btrfs_update_inode(trans, root, inode); | 2464 | ret = btrfs_update_inode(trans, root, inode); |
2430 | BUG_ON(ret); | 2465 | BUG_ON(ret); |
2431 | btrfs_end_transaction(trans, root); | 2466 | btrfs_end_transaction(trans, root); |
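The "range a / range b" reorder in the clone ioctl matters because the tail clamp is computed from key.offset and is only valid while datal still describes the whole extent. A worked example: for an extent spanning [0, 100) and a clone request [10, 60), clipping the tail first gives datal = 60 and the head subtraction then gives the correct 50; the old order produced 60.

#include <stdint.h>
#include <stdio.h>

/* The extent spans [key_offset, key_offset + datal); the request
 * spans [off, off + len).  Returns the length actually cloned. */
static uint64_t cloned_length(uint64_t key_offset, uint64_t datal,
                              uint64_t off, uint64_t len)
{
        /* subtract range b: the tail beyond the request */
        if (key_offset + datal > off + len)
                datal = off + len - key_offset;

        /* subtract range a: the head before the request */
        if (off > key_offset)
                datal -= off - key_offset;

        return datal;
}

int main(void)
{
        /* extent [0,100), request [10,60): exactly 50 bytes clone over */
        printf("%llu\n",
               (unsigned long long)cloned_length(0, 100, 10, 50));
        return 0;
}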
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7dc36fab4afc..e24b7964a155 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -884,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
884 | struct btrfs_root *tree_root = fs_info->tree_root; | 884 | struct btrfs_root *tree_root = fs_info->tree_root; |
885 | struct btrfs_root *root = pending->root; | 885 | struct btrfs_root *root = pending->root; |
886 | struct btrfs_root *parent_root; | 886 | struct btrfs_root *parent_root; |
887 | struct btrfs_block_rsv *rsv; | ||
887 | struct inode *parent_inode; | 888 | struct inode *parent_inode; |
888 | struct dentry *parent; | 889 | struct dentry *parent; |
889 | struct dentry *dentry; | 890 | struct dentry *dentry; |
@@ -895,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
895 | u64 objectid; | 896 | u64 objectid; |
896 | u64 root_flags; | 897 | u64 root_flags; |
897 | 898 | ||
899 | rsv = trans->block_rsv; | ||
900 | |||
898 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); | 901 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); |
899 | if (!new_root_item) { | 902 | if (!new_root_item) { |
900 | pending->error = -ENOMEM; | 903 | pending->error = -ENOMEM; |
@@ -1002,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1002 | btrfs_orphan_post_snapshot(trans, pending); | 1005 | btrfs_orphan_post_snapshot(trans, pending); |
1003 | fail: | 1006 | fail: |
1004 | kfree(new_root_item); | 1007 | kfree(new_root_item); |
1008 | trans->block_rsv = rsv; | ||
1005 | btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); | 1009 | btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); |
1006 | return 0; | 1010 | return 0; |
1007 | } | 1011 | } |
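This snapshot hunk and the free-space-cache truncate hunk apply the same discipline: stash trans->block_rsv, point it at the reserve that should pay for the operation, and restore it before the handle is reused. A sketch of the pattern with stand-in types:

static int noop(struct trans_handle *trans);

struct block_rsv    { int id; };
struct trans_handle { struct block_rsv *block_rsv; };

static int do_op_with_rsv(struct trans_handle *trans,
                          struct block_rsv *special,
                          int (*op)(struct trans_handle *))
{
        struct block_rsv *saved = trans->block_rsv; /* stash caller's */
        int ret;

        trans->block_rsv = special;     /* charge this op to 'special' */
        ret = op(trans);
        trans->block_rsv = saved;       /* restore before reuse */
        return ret;
}

static int noop(struct trans_handle *trans) { (void)trans; return 0; }

int main(void)
{
        struct block_rsv caller = { 1 }, orphan = { 2 };
        struct trans_handle trans = { &caller };

        return do_op_with_rsv(&trans, &orphan, noop);
}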
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index babee65f8eda..786639fca067 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, | |||
799 | struct extent_buffer *eb, int slot, | 799 | struct extent_buffer *eb, int slot, |
800 | struct btrfs_key *key) | 800 | struct btrfs_key *key) |
801 | { | 801 | { |
802 | struct inode *dir; | ||
803 | int ret; | ||
804 | struct btrfs_inode_ref *ref; | 802 | struct btrfs_inode_ref *ref; |
803 | struct btrfs_dir_item *di; | ||
804 | struct inode *dir; | ||
805 | struct inode *inode; | 805 | struct inode *inode; |
806 | char *name; | ||
807 | int namelen; | ||
808 | unsigned long ref_ptr; | 806 | unsigned long ref_ptr; |
809 | unsigned long ref_end; | 807 | unsigned long ref_end; |
808 | char *name; | ||
809 | int namelen; | ||
810 | int ret; | ||
810 | int search_done = 0; | 811 | int search_done = 0; |
811 | 812 | ||
812 | /* | 813 | /* |
@@ -909,6 +910,25 @@ again: | |||
909 | } | 910 | } |
910 | btrfs_release_path(path); | 911 | btrfs_release_path(path); |
911 | 912 | ||
913 | /* look for a conflicting sequence number */ | ||
914 | di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), | ||
915 | btrfs_inode_ref_index(eb, ref), | ||
916 | name, namelen, 0); | ||
917 | if (di && !IS_ERR(di)) { | ||
918 | ret = drop_one_dir_item(trans, root, path, dir, di); | ||
919 | BUG_ON(ret); | ||
920 | } | ||
921 | btrfs_release_path(path); | ||
922 | |||
923 | /* look for a conflicting name */ | ||
924 | di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), | ||
925 | name, namelen, 0); | ||
926 | if (di && !IS_ERR(di)) { | ||
927 | ret = drop_one_dir_item(trans, root, path, dir, di); | ||
928 | BUG_ON(ret); | ||
929 | } | ||
930 | btrfs_release_path(path); | ||
931 | |||
912 | insert: | 932 | insert: |
913 | /* insert our name */ | 933 | /* insert our name */ |
914 | ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, | 934 | ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 53875ae73ad4..f2a4cc79da61 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
142 | unsigned long limit; | 142 | unsigned long limit; |
143 | unsigned long last_waited = 0; | 143 | unsigned long last_waited = 0; |
144 | int force_reg = 0; | 144 | int force_reg = 0; |
145 | int sync_pending = 0; | ||
145 | struct blk_plug plug; | 146 | struct blk_plug plug; |
146 | 147 | ||
147 | /* | 148 | /* |
@@ -229,6 +230,22 @@ loop_lock: | |||
229 | 230 | ||
230 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); | 231 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); |
231 | 232 | ||
233 | /* | ||
234 | * if we're doing the sync list, record that our | ||
235 | * plug has some sync requests on it | ||
236 | * | ||
237 | * If we're doing the regular list and there are | ||
238 | * sync requests sitting around, unplug before | ||
239 | * we add more | ||
240 | */ | ||
241 | if (pending_bios == &device->pending_sync_bios) { | ||
242 | sync_pending = 1; | ||
243 | } else if (sync_pending) { | ||
244 | blk_finish_plug(&plug); | ||
245 | blk_start_plug(&plug); | ||
246 | sync_pending = 0; | ||
247 | } | ||
248 | |||
232 | submit_bio(cur->bi_rw, cur); | 249 | submit_bio(cur->bi_rw, cur); |
233 | num_run++; | 250 | num_run++; |
234 | batch_run++; | 251 | batch_run++; |
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
500 | fs_devices->rw_devices--; | 517 | fs_devices->rw_devices--; |
501 | } | 518 | } |
502 | 519 | ||
520 | if (device->can_discard) | ||
521 | fs_devices->num_can_discard--; | ||
522 | |||
503 | new_device = kmalloc(sizeof(*new_device), GFP_NOFS); | 523 | new_device = kmalloc(sizeof(*new_device), GFP_NOFS); |
504 | BUG_ON(!new_device); | 524 | BUG_ON(!new_device); |
505 | memcpy(new_device, device, sizeof(*new_device)); | 525 | memcpy(new_device, device, sizeof(*new_device)); |
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
508 | new_device->bdev = NULL; | 528 | new_device->bdev = NULL; |
509 | new_device->writeable = 0; | 529 | new_device->writeable = 0; |
510 | new_device->in_fs_metadata = 0; | 530 | new_device->in_fs_metadata = 0; |
531 | new_device->can_discard = 0; | ||
511 | list_replace_rcu(&device->dev_list, &new_device->dev_list); | 532 | list_replace_rcu(&device->dev_list, &new_device->dev_list); |
512 | 533 | ||
513 | call_rcu(&device->rcu, free_device); | 534 | call_rcu(&device->rcu, free_device); |
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
547 | static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | 568 | static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, |
548 | fmode_t flags, void *holder) | 569 | fmode_t flags, void *holder) |
549 | { | 570 | { |
571 | struct request_queue *q; | ||
550 | struct block_device *bdev; | 572 | struct block_device *bdev; |
551 | struct list_head *head = &fs_devices->devices; | 573 | struct list_head *head = &fs_devices->devices; |
552 | struct btrfs_device *device; | 574 | struct btrfs_device *device; |
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
603 | seeding = 0; | 625 | seeding = 0; |
604 | } | 626 | } |
605 | 627 | ||
628 | q = bdev_get_queue(bdev); | ||
629 | if (blk_queue_discard(q)) { | ||
630 | device->can_discard = 1; | ||
631 | fs_devices->num_can_discard++; | ||
632 | } | ||
633 | |||
606 | device->bdev = bdev; | 634 | device->bdev = bdev; |
607 | device->in_fs_metadata = 0; | 635 | device->in_fs_metadata = 0; |
608 | device->mode = flags; | 636 | device->mode = flags; |
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, | |||
835 | 863 | ||
836 | max_hole_start = search_start; | 864 | max_hole_start = search_start; |
837 | max_hole_size = 0; | 865 | max_hole_size = 0; |
866 | hole_size = 0; | ||
838 | 867 | ||
839 | if (search_start >= search_end) { | 868 | if (search_start >= search_end) { |
840 | ret = -ENOSPC; | 869 | ret = -ENOSPC; |
@@ -917,7 +946,14 @@ next: | |||
917 | cond_resched(); | 946 | cond_resched(); |
918 | } | 947 | } |
919 | 948 | ||
920 | hole_size = search_end- search_start; | 949 | /* |
950 | * At this point, search_start should be the end of | ||
951 | * allocated dev extents, and when shrinking the device, | ||
952 | * search_end may be smaller than search_start. | ||
953 | */ | ||
954 | if (search_end > search_start) | ||
955 | hole_size = search_end - search_start; | ||
956 | |||
921 | if (hole_size > max_hole_size) { | 957 | if (hole_size > max_hole_size) { |
922 | max_hole_start = search_start; | 958 | max_hole_start = search_start; |
923 | max_hole_size = hole_size; | 959 | max_hole_size = hole_size; |
@@ -1543,6 +1579,7 @@ error: | |||
1543 | 1579 | ||
1544 | int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | 1580 | int btrfs_init_new_device(struct btrfs_root *root, char *device_path) |
1545 | { | 1581 | { |
1582 | struct request_queue *q; | ||
1546 | struct btrfs_trans_handle *trans; | 1583 | struct btrfs_trans_handle *trans; |
1547 | struct btrfs_device *device; | 1584 | struct btrfs_device *device; |
1548 | struct block_device *bdev; | 1585 | struct block_device *bdev; |
@@ -1612,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1612 | 1649 | ||
1613 | lock_chunks(root); | 1650 | lock_chunks(root); |
1614 | 1651 | ||
1652 | q = bdev_get_queue(bdev); | ||
1653 | if (blk_queue_discard(q)) | ||
1654 | device->can_discard = 1; | ||
1615 | device->writeable = 1; | 1655 | device->writeable = 1; |
1616 | device->work.func = pending_bios_fn; | 1656 | device->work.func = pending_bios_fn; |
1617 | generate_random_uuid(device->uuid); | 1657 | generate_random_uuid(device->uuid); |
@@ -1647,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1647 | root->fs_info->fs_devices->num_devices++; | 1687 | root->fs_info->fs_devices->num_devices++; |
1648 | root->fs_info->fs_devices->open_devices++; | 1688 | root->fs_info->fs_devices->open_devices++; |
1649 | root->fs_info->fs_devices->rw_devices++; | 1689 | root->fs_info->fs_devices->rw_devices++; |
1690 | if (device->can_discard) | ||
1691 | root->fs_info->fs_devices->num_can_discard++; | ||
1650 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | 1692 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; |
1651 | 1693 | ||
1652 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | 1694 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) |
@@ -2413,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
2413 | total_avail = device->total_bytes - device->bytes_used; | 2455 | total_avail = device->total_bytes - device->bytes_used; |
2414 | else | 2456 | else |
2415 | total_avail = 0; | 2457 | total_avail = 0; |
2416 | /* avail is off by max(alloc_start, 1MB), but that is the same | 2458 | |
2417 | * for all devices, so it doesn't hurt the sorting later on | 2459 | /* If there is no space on this device, skip it. */ |
2418 | */ | 2460 | if (total_avail == 0) |
2461 | continue; | ||
2419 | 2462 | ||
2420 | ret = find_free_dev_extent(trans, device, | 2463 | ret = find_free_dev_extent(trans, device, |
2421 | max_stripe_size * dev_stripes, | 2464 | max_stripe_size * dev_stripes, |
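The comment in the run_scheduled_bios() hunk describes the plug handling added there: once a sync bio sits in the current plug, the plug is flushed and restarted before any regular bio joins it, so queued sync I/O is not held behind bulk writes. A sketch of that state machine, with plug_finish()/plug_start() standing in for blk_finish_plug()/blk_start_plug():

#include <stdbool.h>
#include <stdio.h>

static void plug_finish(void) { puts("flush plug"); }
static void plug_start(void)  { puts("new plug"); }

static void queue_bio(bool from_sync_list, bool *sync_pending)
{
        if (from_sync_list) {
                *sync_pending = true;   /* plug now holds sync I/O */
        } else if (*sync_pending) {
                plug_finish();          /* push the sync I/O out first */
                plug_start();
                *sync_pending = false;
        }
        /* submit_bio(...) would follow here */
}

int main(void)
{
        bool sync_pending = false;

        queue_bio(true, &sync_pending);   /* sync bio plugged */
        queue_bio(false, &sync_pending);  /* regular bio: flush first */
        return 0;
}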
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61ae7ae..6d866db4e177 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -48,6 +48,7 @@ struct btrfs_device { | |||
48 | int writeable; | 48 | int writeable; |
49 | int in_fs_metadata; | 49 | int in_fs_metadata; |
50 | int missing; | 50 | int missing; |
51 | int can_discard; | ||
51 | 52 | ||
52 | spinlock_t io_lock; | 53 | spinlock_t io_lock; |
53 | 54 | ||
@@ -104,6 +105,7 @@ struct btrfs_fs_devices { | |||
104 | u64 rw_devices; | 105 | u64 rw_devices; |
105 | u64 missing_devices; | 106 | u64 missing_devices; |
106 | u64 total_rw_bytes; | 107 | u64 total_rw_bytes; |
108 | u64 num_can_discard; | ||
107 | struct block_device *latest_bdev; | 109 | struct block_device *latest_bdev; |
108 | 110 | ||
109 | /* all of the devices in the FS, protected by a mutex | 111 | /* all of the devices in the FS, protected by a mutex |
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d733b9cfea34..426aa464f1af 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c | |||
@@ -116,6 +116,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans, | |||
116 | if (ret) | 116 | if (ret) |
117 | goto out; | 117 | goto out; |
118 | btrfs_release_path(path); | 118 | btrfs_release_path(path); |
119 | |||
120 | /* | ||
121 | * remove the attribute | ||
122 | */ | ||
123 | if (!value) | ||
124 | goto out; | ||
119 | } | 125 | } |
120 | 126 | ||
121 | again: | 127 | again: |
@@ -158,6 +164,9 @@ out: | |||
158 | return ret; | 164 | return ret; |
159 | } | 165 | } |
160 | 166 | ||
167 | /* | ||
168 | * @value: "" makes the attribute empty, NULL removes it | ||
169 | */ | ||
161 | int __btrfs_setxattr(struct btrfs_trans_handle *trans, | 170 | int __btrfs_setxattr(struct btrfs_trans_handle *trans, |
162 | struct inode *inode, const char *name, | 171 | struct inode *inode, const char *name, |
163 | const void *value, size_t size, int flags) | 172 | const void *value, size_t size, int flags) |
@@ -374,36 +383,36 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) | |||
374 | XATTR_REPLACE); | 383 | XATTR_REPLACE); |
375 | } | 384 | } |
376 | 385 | ||
377 | int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, | 386 | int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
378 | struct inode *inode, struct inode *dir, | 387 | void *fs_info) |
379 | const struct qstr *qstr) | ||
380 | { | 388 | { |
381 | int err; | 389 | const struct xattr *xattr; |
382 | size_t len; | 390 | struct btrfs_trans_handle *trans = fs_info; |
383 | void *value; | ||
384 | char *suffix; | ||
385 | char *name; | 391 | char *name; |
392 | int err = 0; | ||
386 | 393 | ||
387 | err = security_inode_init_security(inode, dir, qstr, &suffix, &value, | 394 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
388 | &len); | 395 | name = kmalloc(XATTR_SECURITY_PREFIX_LEN + |
389 | if (err) { | 396 | strlen(xattr->name) + 1, GFP_NOFS); |
390 | if (err == -EOPNOTSUPP) | 397 | if (!name) { |
391 | return 0; | 398 | err = -ENOMEM; |
392 | return err; | 399 | break; |
393 | } | 400 | } |
394 | |||
395 | name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1, | ||
396 | GFP_NOFS); | ||
397 | if (!name) { | ||
398 | err = -ENOMEM; | ||
399 | } else { | ||
400 | strcpy(name, XATTR_SECURITY_PREFIX); | 401 | strcpy(name, XATTR_SECURITY_PREFIX); |
401 | strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); | 402 | strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); |
402 | err = __btrfs_setxattr(trans, inode, name, value, len, 0); | 403 | err = __btrfs_setxattr(trans, inode, name, |
404 | xattr->value, xattr->value_len, 0); | ||
403 | kfree(name); | 405 | kfree(name); |
406 | if (err < 0) | ||
407 | break; | ||
404 | } | 408 | } |
405 | |||
406 | kfree(suffix); | ||
407 | kfree(value); | ||
408 | return err; | 409 | return err; |
409 | } | 410 | } |
411 | |||
412 | int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, | ||
413 | struct inode *inode, struct inode *dir, | ||
414 | const struct qstr *qstr) | ||
415 | { | ||
416 | return security_inode_init_security(inode, dir, qstr, | ||
417 | &btrfs_initxattrs, trans); | ||
418 | } | ||
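The xattr.c rewrite moves btrfs to the callback form of security_inode_init_security(): the LSM hands over a NULL-terminated array of xattrs and the filesystem stores each one under the "security." prefix. A userspace sketch of the callback's name-building loop; struct xattr mirrors the kernel's and set_xattr() stands in for __btrfs_setxattr():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define XATTR_SECURITY_PREFIX     "security."
#define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1)

/* Mirrors the kernel's struct xattr handed to initxattrs callbacks. */
struct xattr {
        const char *name;       /* suffix only, e.g. "selinux" */
        const void *value;
        size_t value_len;
};

static int set_xattr(const char *name, const void *val, size_t len)
{
        (void)val;
        printf("set %s (%zu bytes)\n", name, len);
        return 0;
}

/* Walk the NULL-terminated array and build "security.<suffix>" for
 * each entry, as btrfs_initxattrs() does. */
static int initxattrs_sketch(const struct xattr *xattr_array)
{
        const struct xattr *xattr;
        int err = 0;

        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                char *name = malloc(XATTR_SECURITY_PREFIX_LEN +
                                    strlen(xattr->name) + 1);
                if (!name)
                        return -1;      /* -ENOMEM in the kernel */

                strcpy(name, XATTR_SECURITY_PREFIX);
                strcat(name, xattr->name);
                err = set_xattr(name, xattr->value, xattr->value_len);
                free(name);
                if (err < 0)
                        break;
        }
        return err;
}

int main(void)
{
        const struct xattr attrs[] = {
                { "selinux", "label", 6 },
                { NULL, NULL, 0 },
        };

        return initxattrs_sketch(attrs);
}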
diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..936d6035f6e2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -1470,13 +1470,13 @@ static void discard_buffer(struct buffer_head * bh) | |||
1470 | } | 1470 | } |
1471 | 1471 | ||
1472 | /** | 1472 | /** |
1473 | * block_invalidatepage - invalidate part of all of a buffer-backed page | 1473 | * block_invalidatepage - invalidate part or all of a buffer-backed page |
1474 | * | 1474 | * |
1475 | * @page: the page which is affected | 1475 | * @page: the page which is affected |
1476 | * @offset: the index of the truncation point | 1476 | * @offset: the index of the truncation point |
1477 | * | 1477 | * |
1478 | * block_invalidatepage() is called when all or part of the page has become | 1478 | * block_invalidatepage() is called when all or part of the page has become |
1479 | * invalidatedby a truncate operation. | 1479 | * invalidated by a truncate operation. |
1480 | * | 1480 | * |
1481 | * block_invalidatepage() does not have to release all buffers, but it must | 1481 | * block_invalidatepage() does not have to release all buffers, but it must |
1482 | * ensure that no dirty buffer is left outside @offset and that no I/O | 1482 | * ensure that no dirty buffer is left outside @offset and that no I/O |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fee028b5332e..86c59e16ba74 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -1595,7 +1595,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, | |||
1595 | r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); | 1595 | r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); |
1596 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, | 1596 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, |
1597 | *ppath); | 1597 | *ppath); |
1598 | } else if (rpath) { | 1598 | } else if (rpath || rino) { |
1599 | *ino = rino; | 1599 | *ino = rino; |
1600 | *ppath = rpath; | 1600 | *ppath = rpath; |
1601 | *pathlen = strlen(rpath); | 1601 | *pathlen = strlen(rpath); |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d47c5ec7fb1f..88bacaf385d9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -813,8 +813,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, | |||
813 | fsc = create_fs_client(fsopt, opt); | 813 | fsc = create_fs_client(fsopt, opt); |
814 | if (IS_ERR(fsc)) { | 814 | if (IS_ERR(fsc)) { |
815 | res = ERR_CAST(fsc); | 815 | res = ERR_CAST(fsc); |
816 | kfree(fsopt); | 816 | destroy_mount_options(fsopt); |
817 | kfree(opt); | 817 | ceph_destroy_options(opt); |
818 | goto out_final; | 818 | goto out_final; |
819 | } | 819 | } |
820 | 820 | ||
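The point of the ceph hunk above is that a bare kfree() on a structure that owns other allocations leaks everything it points at; the matching destructor must run on the error path instead. A hedged sketch of the general shape (the myopts names are illustrative, not the ceph API):

	struct myopts {
		char *server_path;	/* separately allocated string */
		char *snapdir_name;	/* separately allocated string */
	};

	static void destroy_myopts(struct myopts *opt)
	{
		if (!opt)
			return;
		kfree(opt->server_path);	/* nested allocations first */
		kfree(opt->snapdir_name);
		kfree(opt);			/* then the container */
	}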
diff --git a/fs/cifs/README b/fs/cifs/README index c5c2c5e5f0f2..895da1dc1550 100644 --- a/fs/cifs/README +++ b/fs/cifs/README | |||
@@ -745,4 +745,18 @@ installed and something like the following lines should be added to the | |||
745 | create cifs.spnego * * /usr/local/sbin/cifs.upcall %k | 745 | create cifs.spnego * * /usr/local/sbin/cifs.upcall %k |
746 | create dns_resolver * * /usr/local/sbin/cifs.upcall %k | 746 | create dns_resolver * * /usr/local/sbin/cifs.upcall %k |
747 | 747 | ||
748 | CIFS kernel module parameters | ||
749 | ============================= | ||
750 | These module parameters can be set when the module is loaded or changed | ||
751 | at runtime through the interface | ||
752 | /sys/module/cifs/parameters/<param> | ||
753 | |||
754 | e.g. echo "value" > /sys/module/cifs/parameters/<param> | ||
755 | |||
756 | 1. echo_retries - The number of echo attempts before giving up and | ||
757 | reconnecting to the server. The default is 5. The value 0 | ||
758 | means never reconnect. | ||
759 | |||
760 | 2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by | ||
761 | default [Y/y/1]. To disable, use any of [N/n/0]. | ||
748 | 762 | ||
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 2fe3cf13b2e9..84e8c0724704 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c | |||
@@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) | |||
176 | 176 | ||
177 | #ifdef CONFIG_CIFS_STATS2 | 177 | #ifdef CONFIG_CIFS_STATS2 |
178 | seq_printf(m, " In Send: %d In MaxReq Wait: %d", | 178 | seq_printf(m, " In Send: %d In MaxReq Wait: %d", |
179 | atomic_read(&server->inSend), | 179 | atomic_read(&server->in_send), |
180 | atomic_read(&server->num_waiters)); | 180 | atomic_read(&server->num_waiters)); |
181 | #endif | 181 | #endif |
182 | 182 | ||
@@ -511,7 +511,7 @@ static const struct file_operations cifsFYI_proc_fops = { | |||
511 | 511 | ||
512 | static int cifs_oplock_proc_show(struct seq_file *m, void *v) | 512 | static int cifs_oplock_proc_show(struct seq_file *m, void *v) |
513 | { | 513 | { |
514 | seq_printf(m, "%d\n", oplockEnabled); | 514 | seq_printf(m, "%d\n", enable_oplocks); |
515 | return 0; | 515 | return 0; |
516 | } | 516 | } |
517 | 517 | ||
@@ -526,13 +526,16 @@ static ssize_t cifs_oplock_proc_write(struct file *file, | |||
526 | char c; | 526 | char c; |
527 | int rc; | 527 | int rc; |
528 | 528 | ||
529 | printk(KERN_WARNING "CIFS: The /proc/fs/cifs/OplockEnabled interface " | ||
530 | "will be removed in kernel version 3.4. Please migrate to " | ||
531 | "using the 'enable_oplocks' module parameter in cifs.ko.\n"); | ||
529 | rc = get_user(c, buffer); | 532 | rc = get_user(c, buffer); |
530 | if (rc) | 533 | if (rc) |
531 | return rc; | 534 | return rc; |
532 | if (c == '0' || c == 'n' || c == 'N') | 535 | if (c == '0' || c == 'n' || c == 'N') |
533 | oplockEnabled = 0; | 536 | enable_oplocks = false; |
534 | else if (c == '1' || c == 'y' || c == 'Y') | 537 | else if (c == '1' || c == 'y' || c == 'Y') |
535 | oplockEnabled = 1; | 538 | enable_oplocks = true; |
536 | 539 | ||
537 | return count; | 540 | return count; |
538 | } | 541 | } |
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 7260e11e21f8..500d65859279 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h | |||
@@ -43,6 +43,8 @@ | |||
43 | #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ | 43 | #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ |
44 | #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ | 44 | #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ |
45 | #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ | 45 | #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ |
46 | #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ | ||
47 | #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ | ||
46 | 48 | ||
47 | struct cifs_sb_info { | 49 | struct cifs_sb_info { |
48 | struct rb_root tlink_tree; | 50 | struct rb_root tlink_tree; |
@@ -55,6 +57,8 @@ struct cifs_sb_info { | |||
55 | atomic_t active; | 57 | atomic_t active; |
56 | uid_t mnt_uid; | 58 | uid_t mnt_uid; |
57 | gid_t mnt_gid; | 59 | gid_t mnt_gid; |
60 | uid_t mnt_backupuid; | ||
61 | gid_t mnt_backupgid; | ||
58 | mode_t mnt_file_mode; | 62 | mode_t mnt_file_mode; |
59 | mode_t mnt_dir_mode; | 63 | mode_t mnt_dir_mode; |
60 | unsigned int mnt_cifs_flags; | 64 | unsigned int mnt_cifs_flags; |
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 21de1d6d5849..72ddf23ef6f7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
@@ -91,9 +91,76 @@ cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) | |||
91 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | 91 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); |
92 | spin_unlock(&sidgidlock); | 92 | spin_unlock(&sidgidlock); |
93 | 93 | ||
94 | root = &siduidtree; | ||
95 | spin_lock(&uidsidlock); | ||
96 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
97 | spin_unlock(&uidsidlock); | ||
98 | |||
99 | root = &sidgidtree; | ||
100 | spin_lock(&gidsidlock); | ||
101 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
102 | spin_unlock(&gidsidlock); | ||
103 | |||
94 | return nr_rem; | 104 | return nr_rem; |
95 | } | 105 | } |
96 | 106 | ||
107 | static void | ||
108 | sid_rb_insert(struct rb_root *root, unsigned long cid, | ||
109 | struct cifs_sid_id **psidid, char *typestr) | ||
110 | { | ||
111 | char *strptr; | ||
112 | struct rb_node *node = root->rb_node; | ||
113 | struct rb_node *parent = NULL; | ||
114 | struct rb_node **linkto = &(root->rb_node); | ||
115 | struct cifs_sid_id *lsidid; | ||
116 | |||
117 | while (node) { | ||
118 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | ||
119 | parent = node; | ||
120 | if (cid > lsidid->id) { | ||
121 | linkto = &(node->rb_left); | ||
122 | node = node->rb_left; | ||
123 | } | ||
124 | if (cid < lsidid->id) { | ||
125 | linkto = &(node->rb_right); | ||
126 | node = node->rb_right; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | (*psidid)->id = cid; | ||
131 | (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); | ||
132 | (*psidid)->refcount = 0; | ||
133 | |||
134 | sprintf((*psidid)->sidstr, "%s", typestr); | ||
135 | strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); | ||
136 | sprintf(strptr, "%lu", cid); | ||
137 | |||
138 | clear_bit(SID_ID_PENDING, &(*psidid)->state); | ||
139 | clear_bit(SID_ID_MAPPED, &(*psidid)->state); | ||
140 | |||
141 | rb_link_node(&(*psidid)->rbnode, parent, linkto); | ||
142 | rb_insert_color(&(*psidid)->rbnode, root); | ||
143 | } | ||
144 | |||
145 | static struct cifs_sid_id * | ||
146 | sid_rb_search(struct rb_root *root, unsigned long cid) | ||
147 | { | ||
148 | struct rb_node *node = root->rb_node; | ||
149 | struct cifs_sid_id *lsidid; | ||
150 | |||
151 | while (node) { | ||
152 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | ||
153 | if (cid > lsidid->id) | ||
154 | node = node->rb_left; | ||
155 | else if (cid < lsidid->id) | ||
156 | node = node->rb_right; | ||
157 | else /* node found */ | ||
158 | return lsidid; | ||
159 | } | ||
160 | |||
161 | return NULL; | ||
162 | } | ||
163 | |||
97 | static struct shrinker cifs_shrinker = { | 164 | static struct shrinker cifs_shrinker = { |
98 | .shrink = cifs_idmap_shrinker, | 165 | .shrink = cifs_idmap_shrinker, |
99 | .seeks = DEFAULT_SEEKS, | 166 | .seeks = DEFAULT_SEEKS, |
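sid_rb_insert() and sid_rb_search() above follow the usual kernel rbtree idiom, except that they send larger keys to the left; any ordering works as long as insert and lookup agree. A generic sketch of the idiom in its conventional ascending form (id_node is illustrative):

	#include <linux/rbtree.h>

	struct id_node {
		struct rb_node rbnode;
		unsigned long id;
	};

	/* lookup must walk with the same ordering the insert used */
	static struct id_node *id_search(struct rb_root *root, unsigned long id)
	{
		struct rb_node *node = root->rb_node;

		while (node) {
			struct id_node *e = rb_entry(node, struct id_node, rbnode);

			if (id < e->id)
				node = node->rb_left;
			else if (id > e->id)
				node = node->rb_right;
			else
				return e;	/* found */
		}
		return NULL;
	}

	static void id_insert(struct rb_root *root, struct id_node *new)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;

		while (*link) {
			struct id_node *e = rb_entry(*link, struct id_node, rbnode);

			parent = *link;
			if (new->id < e->id)
				link = &(*link)->rb_left;
			else
				link = &(*link)->rb_right;
		}
		rb_link_node(&new->rbnode, parent, link);	/* attach leaf */
		rb_insert_color(&new->rbnode, root);		/* rebalance */
	}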
@@ -110,6 +177,7 @@ cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) | |||
110 | 177 | ||
111 | memcpy(payload, data, datalen); | 178 | memcpy(payload, data, datalen); |
112 | key->payload.data = payload; | 179 | key->payload.data = payload; |
180 | key->datalen = datalen; | ||
113 | return 0; | 181 | return 0; |
114 | } | 182 | } |
115 | 183 | ||
@@ -224,6 +292,120 @@ sidid_pending_wait(void *unused) | |||
224 | } | 292 | } |
225 | 293 | ||
226 | static int | 294 | static int |
295 | id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) | ||
296 | { | ||
297 | int rc = 0; | ||
298 | struct key *sidkey; | ||
299 | const struct cred *saved_cred; | ||
300 | struct cifs_sid *lsid; | ||
301 | struct cifs_sid_id *psidid, *npsidid; | ||
302 | struct rb_root *cidtree; | ||
303 | spinlock_t *cidlock; | ||
304 | |||
305 | if (sidtype == SIDOWNER) { | ||
306 | cidlock = &siduidlock; | ||
307 | cidtree = &uidtree; | ||
308 | } else if (sidtype == SIDGROUP) { | ||
309 | cidlock = &sidgidlock; | ||
310 | cidtree = &gidtree; | ||
311 | } else | ||
312 | return -EINVAL; | ||
313 | |||
314 | spin_lock(cidlock); | ||
315 | psidid = sid_rb_search(cidtree, cid); | ||
316 | |||
317 | if (!psidid) { /* node does not exist, allocate one & attempt adding */ | ||
318 | spin_unlock(cidlock); | ||
319 | npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); | ||
320 | if (!npsidid) | ||
321 | return -ENOMEM; | ||
322 | |||
323 | npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); | ||
324 | if (!npsidid->sidstr) { | ||
325 | kfree(npsidid); | ||
326 | return -ENOMEM; | ||
327 | } | ||
328 | |||
329 | spin_lock(cidlock); | ||
330 | psidid = sid_rb_search(cidtree, cid); | ||
331 | if (psidid) { /* node happened to get inserted meanwhile */ | ||
332 | ++psidid->refcount; | ||
333 | spin_unlock(cidlock); | ||
334 | kfree(npsidid->sidstr); | ||
335 | kfree(npsidid); | ||
336 | } else { | ||
337 | psidid = npsidid; | ||
338 | sid_rb_insert(cidtree, cid, &psidid, | ||
339 | sidtype == SIDOWNER ? "oi:" : "gi:"); | ||
340 | ++psidid->refcount; | ||
341 | spin_unlock(cidlock); | ||
342 | } | ||
343 | } else { | ||
344 | ++psidid->refcount; | ||
345 | spin_unlock(cidlock); | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * If we are here, it is safe to access psidid and its fields | ||
350 | * since a reference was taken earlier while holding the spinlock. | ||
351 | * A reference on the node is put without holding the spinlock | ||
352 | * and that is OK in this case: the shrinker will not erase | ||
353 | * this node until all references are put, and we do not access | ||
354 | * any fields of the node after a reference is put. | ||
355 | */ | ||
356 | if (test_bit(SID_ID_MAPPED, &psidid->state)) { | ||
357 | memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); | ||
358 | psidid->time = jiffies; /* update ts for accessing */ | ||
359 | goto id_sid_out; | ||
360 | } | ||
361 | |||
362 | if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { | ||
363 | rc = -EINVAL; | ||
364 | goto id_sid_out; | ||
365 | } | ||
366 | |||
367 | if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { | ||
368 | saved_cred = override_creds(root_cred); | ||
369 | sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); | ||
370 | if (IS_ERR(sidkey)) { | ||
371 | rc = -EINVAL; | ||
372 | cFYI(1, "%s: Can't map an id to a SID", __func__); | ||
373 | } else { | ||
374 | lsid = (struct cifs_sid *)sidkey->payload.data; | ||
375 | memcpy(&psidid->sid, lsid, | ||
376 | sidkey->datalen < sizeof(struct cifs_sid) ? | ||
377 | sidkey->datalen : sizeof(struct cifs_sid)); | ||
378 | memcpy(ssid, &psidid->sid, | ||
379 | sidkey->datalen < sizeof(struct cifs_sid) ? | ||
380 | sidkey->datalen : sizeof(struct cifs_sid)); | ||
381 | set_bit(SID_ID_MAPPED, &psidid->state); | ||
382 | key_put(sidkey); | ||
383 | kfree(psidid->sidstr); | ||
384 | } | ||
385 | psidid->time = jiffies; /* update ts for accessing */ | ||
386 | revert_creds(saved_cred); | ||
387 | clear_bit(SID_ID_PENDING, &psidid->state); | ||
388 | wake_up_bit(&psidid->state, SID_ID_PENDING); | ||
389 | } else { | ||
390 | rc = wait_on_bit(&psidid->state, SID_ID_PENDING, | ||
391 | sidid_pending_wait, TASK_INTERRUPTIBLE); | ||
392 | if (rc) { | ||
393 | cFYI(1, "%s: sidid_pending_wait interrupted %d", | ||
394 | __func__, rc); | ||
395 | --psidid->refcount; | ||
396 | return rc; | ||
397 | } | ||
398 | if (test_bit(SID_ID_MAPPED, &psidid->state)) | ||
399 | memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid)); | ||
400 | else | ||
401 | rc = -EINVAL; | ||
402 | } | ||
403 | id_sid_out: | ||
404 | --psidid->refcount; | ||
405 | return rc; | ||
406 | } | ||
407 | |||
408 | static int | ||
227 | sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, | 409 | sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, |
228 | struct cifs_fattr *fattr, uint sidtype) | 410 | struct cifs_fattr *fattr, uint sidtype) |
229 | { | 411 | { |
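id_to_sid() above is built around a common one-shot upcall pattern: the caller that wins test_and_set_bit() performs the slow request_key() lookup exactly once, while later callers sleep in wait_on_bit() until the winner clears the bit and calls wake_up_bit(). Stripped to that coordination alone, using the action-callback wait_on_bit() signature of this kernel era, with illustrative names (cache_entry, do_slow_lookup):

	enum { ENTRY_PENDING, ENTRY_MAPPED };

	struct cache_entry {
		unsigned long state;
		/* ... cached payload ... */
	};

	static int do_slow_lookup(struct cache_entry *e);	/* illustrative */

	static int pending_wait(void *unused)
	{
		schedule();
		return signal_pending(current) ? -ERESTARTSYS : 0;
	}

	static int lookup_once(struct cache_entry *e)
	{
		int rc = 0;

		if (!test_and_set_bit(ENTRY_PENDING, &e->state)) {
			/* we won the race: do the expensive lookup exactly once */
			rc = do_slow_lookup(e);
			if (!rc)
				set_bit(ENTRY_MAPPED, &e->state);
			clear_bit(ENTRY_PENDING, &e->state);
			wake_up_bit(&e->state, ENTRY_PENDING);
		} else {
			/* lost the race: sleep until the winner finishes */
			rc = wait_on_bit(&e->state, ENTRY_PENDING,
					 pending_wait, TASK_INTERRUPTIBLE);
			if (!rc && !test_bit(ENTRY_MAPPED, &e->state))
				rc = -EINVAL;
		}
		return rc;
	}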
@@ -383,6 +565,10 @@ init_cifs_idmap(void) | |||
383 | spin_lock_init(&sidgidlock); | 565 | spin_lock_init(&sidgidlock); |
384 | gidtree = RB_ROOT; | 566 | gidtree = RB_ROOT; |
385 | 567 | ||
568 | spin_lock_init(&uidsidlock); | ||
569 | siduidtree = RB_ROOT; | ||
570 | spin_lock_init(&gidsidlock); | ||
571 | sidgidtree = RB_ROOT; | ||
386 | register_shrinker(&cifs_shrinker); | 572 | register_shrinker(&cifs_shrinker); |
387 | 573 | ||
388 | cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); | 574 | cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); |
@@ -422,6 +608,18 @@ cifs_destroy_idmaptrees(void) | |||
422 | while ((node = rb_first(root))) | 608 | while ((node = rb_first(root))) |
423 | rb_erase(node, root); | 609 | rb_erase(node, root); |
424 | spin_unlock(&sidgidlock); | 610 | spin_unlock(&sidgidlock); |
611 | |||
612 | root = &siduidtree; | ||
613 | spin_lock(&uidsidlock); | ||
614 | while ((node = rb_first(root))) | ||
615 | rb_erase(node, root); | ||
616 | spin_unlock(&uidsidlock); | ||
617 | |||
618 | root = &sidgidtree; | ||
619 | spin_lock(&gidsidlock); | ||
620 | while ((node = rb_first(root))) | ||
621 | rb_erase(node, root); | ||
622 | spin_unlock(&gidsidlock); | ||
425 | } | 623 | } |
426 | 624 | ||
427 | /* if the two SIDs (roughly equivalent to a UUID for a user or group) are | 625 | /* if the two SIDs (roughly equivalent to a UUID for a user or group) are |
@@ -706,7 +904,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, | |||
706 | acl_size = sizeof(struct cifs_acl); | 904 | acl_size = sizeof(struct cifs_acl); |
707 | 905 | ||
708 | num_aces = le32_to_cpu(pdacl->num_aces); | 906 | num_aces = le32_to_cpu(pdacl->num_aces); |
709 | if (num_aces > 0) { | 907 | if (num_aces > 0) { |
710 | umode_t user_mask = S_IRWXU; | 908 | umode_t user_mask = S_IRWXU; |
711 | umode_t group_mask = S_IRWXG; | 909 | umode_t group_mask = S_IRWXG; |
712 | umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; | 910 | umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; |
@@ -868,52 +1066,82 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, | |||
868 | else | 1066 | else |
869 | cFYI(1, "no ACL"); /* BB grant all or default perms? */ | 1067 | cFYI(1, "no ACL"); /* BB grant all or default perms? */ |
870 | 1068 | ||
871 | /* cifscred->uid = owner_sid_ptr->rid; | ||
872 | cifscred->gid = group_sid_ptr->rid; | ||
873 | memcpy((void *)(&(cifscred->osid)), (void *)owner_sid_ptr, | ||
874 | sizeof(struct cifs_sid)); | ||
875 | memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, | ||
876 | sizeof(struct cifs_sid)); */ | ||
877 | |||
878 | return rc; | 1069 | return rc; |
879 | } | 1070 | } |
880 | 1071 | ||
881 | |||
882 | /* Convert permission bits from mode to equivalent CIFS ACL */ | 1072 | /* Convert permission bits from mode to equivalent CIFS ACL */ |
883 | static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, | 1073 | static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, |
884 | struct inode *inode, __u64 nmode) | 1074 | __u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag) |
885 | { | 1075 | { |
886 | int rc = 0; | 1076 | int rc = 0; |
887 | __u32 dacloffset; | 1077 | __u32 dacloffset; |
888 | __u32 ndacloffset; | 1078 | __u32 ndacloffset; |
889 | __u32 sidsoffset; | 1079 | __u32 sidsoffset; |
890 | struct cifs_sid *owner_sid_ptr, *group_sid_ptr; | 1080 | struct cifs_sid *owner_sid_ptr, *group_sid_ptr; |
1081 | struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; | ||
891 | struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ | 1082 | struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ |
892 | struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ | 1083 | struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ |
893 | 1084 | ||
894 | if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) | 1085 | if (nmode != NO_CHANGE_64) { /* chmod */ |
895 | return -EIO; | 1086 | owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + |
896 | |||
897 | owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + | ||
898 | le32_to_cpu(pntsd->osidoffset)); | 1087 | le32_to_cpu(pntsd->osidoffset)); |
899 | group_sid_ptr = (struct cifs_sid *)((char *)pntsd + | 1088 | group_sid_ptr = (struct cifs_sid *)((char *)pntsd + |
900 | le32_to_cpu(pntsd->gsidoffset)); | 1089 | le32_to_cpu(pntsd->gsidoffset)); |
901 | 1090 | dacloffset = le32_to_cpu(pntsd->dacloffset); | |
902 | dacloffset = le32_to_cpu(pntsd->dacloffset); | 1091 | dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); |
903 | dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); | 1092 | ndacloffset = sizeof(struct cifs_ntsd); |
904 | 1093 | ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); | |
905 | ndacloffset = sizeof(struct cifs_ntsd); | 1094 | ndacl_ptr->revision = dacl_ptr->revision; |
906 | ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); | 1095 | ndacl_ptr->size = 0; |
907 | ndacl_ptr->revision = dacl_ptr->revision; | 1096 | ndacl_ptr->num_aces = 0; |
908 | ndacl_ptr->size = 0; | 1097 | |
909 | ndacl_ptr->num_aces = 0; | 1098 | rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, |
910 | 1099 | nmode); | |
911 | rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, nmode); | 1100 | sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); |
912 | 1101 | /* copy sec desc control portion & owner and group sids */ | |
913 | sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); | 1102 | copy_sec_desc(pntsd, pnntsd, sidsoffset); |
914 | 1103 | *aclflag = CIFS_ACL_DACL; | |
915 | /* copy security descriptor control portion and owner and group sid */ | 1104 | } else { |
916 | copy_sec_desc(pntsd, pnntsd, sidsoffset); | 1105 | memcpy(pnntsd, pntsd, secdesclen); |
1106 | if (uid != NO_CHANGE_32) { /* chown */ | ||
1107 | owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + | ||
1108 | le32_to_cpu(pnntsd->osidoffset)); | ||
1109 | nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), | ||
1110 | GFP_KERNEL); | ||
1111 | if (!nowner_sid_ptr) | ||
1112 | return -ENOMEM; | ||
1113 | rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr); | ||
1114 | if (rc) { | ||
1115 | cFYI(1, "%s: Mapping error %d for owner id %d", | ||
1116 | __func__, rc, uid); | ||
1117 | kfree(nowner_sid_ptr); | ||
1118 | return rc; | ||
1119 | } | ||
1120 | memcpy(owner_sid_ptr, nowner_sid_ptr, | ||
1121 | sizeof(struct cifs_sid)); | ||
1122 | kfree(nowner_sid_ptr); | ||
1123 | *aclflag = CIFS_ACL_OWNER; | ||
1124 | } | ||
1125 | if (gid != NO_CHANGE_32) { /* chgrp */ | ||
1126 | group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + | ||
1127 | le32_to_cpu(pnntsd->gsidoffset)); | ||
1128 | ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), | ||
1129 | GFP_KERNEL); | ||
1130 | if (!ngroup_sid_ptr) | ||
1131 | return -ENOMEM; | ||
1132 | rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr); | ||
1133 | if (rc) { | ||
1134 | cFYI(1, "%s: Mapping error %d for group id %d", | ||
1135 | __func__, rc, gid); | ||
1136 | kfree(ngroup_sid_ptr); | ||
1137 | return rc; | ||
1138 | } | ||
1139 | memcpy(group_sid_ptr, ngroup_sid_ptr, | ||
1140 | sizeof(struct cifs_sid)); | ||
1141 | kfree(ngroup_sid_ptr); | ||
1142 | *aclflag = CIFS_ACL_GROUP; | ||
1143 | } | ||
1144 | } | ||
917 | 1145 | ||
918 | return rc; | 1146 | return rc; |
919 | } | 1147 | } |
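After this change, build_sec_desc() serves two distinct operations: chmod synthesizes a fresh DACL from the mode bits, while chown/chgrp copies the existing descriptor wholesale and patches a single SID in place, reporting which part changed through *aclflag. Reduced to its control flow, with build_dacl_from_mode() and patch_sid() as illustrative stand-ins for the real helpers:

	if (nmode != NO_CHANGE_64) {
		/* chmod: build a new DACL from the mode bits */
		rc = build_dacl_from_mode(pnntsd, pntsd, nmode);
		*aclflag = CIFS_ACL_DACL;
	} else {
		/* chown/chgrp: keep the descriptor, overwrite one SID */
		memcpy(pnntsd, pntsd, secdesclen);
		if (uid != NO_CHANGE_32) {		/* chown */
			rc = patch_sid(pnntsd, SIDOWNER, uid);
			*aclflag = CIFS_ACL_OWNER;
		}
		if (!rc && gid != NO_CHANGE_32) {	/* chgrp */
			rc = patch_sid(pnntsd, SIDGROUP, gid);
			*aclflag = CIFS_ACL_GROUP;
		}
	}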
@@ -945,7 +1173,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, | |||
945 | { | 1173 | { |
946 | struct cifs_ntsd *pntsd = NULL; | 1174 | struct cifs_ntsd *pntsd = NULL; |
947 | int oplock = 0; | 1175 | int oplock = 0; |
948 | int xid, rc; | 1176 | int xid, rc, create_options = 0; |
949 | __u16 fid; | 1177 | __u16 fid; |
950 | struct cifs_tcon *tcon; | 1178 | struct cifs_tcon *tcon; |
951 | struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); | 1179 | struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); |
@@ -956,9 +1184,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, | |||
956 | tcon = tlink_tcon(tlink); | 1184 | tcon = tlink_tcon(tlink); |
957 | xid = GetXid(); | 1185 | xid = GetXid(); |
958 | 1186 | ||
959 | rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, | 1187 | if (backup_cred(cifs_sb)) |
960 | &fid, &oplock, NULL, cifs_sb->local_nls, | 1188 | create_options |= CREATE_OPEN_BACKUP_INTENT; |
961 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 1189 | |
1190 | rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, | ||
1191 | create_options, &fid, &oplock, NULL, cifs_sb->local_nls, | ||
1192 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
962 | if (!rc) { | 1193 | if (!rc) { |
963 | rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); | 1194 | rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); |
964 | CIFSSMBClose(xid, tcon, fid); | 1195 | CIFSSMBClose(xid, tcon, fid); |
@@ -991,31 +1222,15 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, | |||
991 | return pntsd; | 1222 | return pntsd; |
992 | } | 1223 | } |
993 | 1224 | ||
994 | static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, | 1225 | /* Set an ACL on the server */ |
995 | struct cifs_ntsd *pnntsd, u32 acllen) | 1226 | int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, |
996 | { | 1227 | struct inode *inode, const char *path, int aclflag) |
997 | int xid, rc; | ||
998 | struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); | ||
999 | |||
1000 | if (IS_ERR(tlink)) | ||
1001 | return PTR_ERR(tlink); | ||
1002 | |||
1003 | xid = GetXid(); | ||
1004 | rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen); | ||
1005 | FreeXid(xid); | ||
1006 | cifs_put_tlink(tlink); | ||
1007 | |||
1008 | cFYI(DBG2, "SetCIFSACL rc = %d", rc); | ||
1009 | return rc; | ||
1010 | } | ||
1011 | |||
1012 | static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, | ||
1013 | struct cifs_ntsd *pnntsd, u32 acllen) | ||
1014 | { | 1228 | { |
1015 | int oplock = 0; | 1229 | int oplock = 0; |
1016 | int xid, rc; | 1230 | int xid, rc, access_flags, create_options = 0; |
1017 | __u16 fid; | 1231 | __u16 fid; |
1018 | struct cifs_tcon *tcon; | 1232 | struct cifs_tcon *tcon; |
1233 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
1019 | struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); | 1234 | struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); |
1020 | 1235 | ||
1021 | if (IS_ERR(tlink)) | 1236 | if (IS_ERR(tlink)) |
@@ -1024,15 +1239,23 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, | |||
1024 | tcon = tlink_tcon(tlink); | 1239 | tcon = tlink_tcon(tlink); |
1025 | xid = GetXid(); | 1240 | xid = GetXid(); |
1026 | 1241 | ||
1027 | rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, WRITE_DAC, 0, | 1242 | if (backup_cred(cifs_sb)) |
1028 | &fid, &oplock, NULL, cifs_sb->local_nls, | 1243 | create_options |= CREATE_OPEN_BACKUP_INTENT; |
1029 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 1244 | |
1245 | if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) | ||
1246 | access_flags = WRITE_OWNER; | ||
1247 | else | ||
1248 | access_flags = WRITE_DAC; | ||
1249 | |||
1250 | rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags, | ||
1251 | create_options, &fid, &oplock, NULL, cifs_sb->local_nls, | ||
1252 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
1030 | if (rc) { | 1253 | if (rc) { |
1031 | cERROR(1, "Unable to open file to set ACL"); | 1254 | cERROR(1, "Unable to open file to set ACL"); |
1032 | goto out; | 1255 | goto out; |
1033 | } | 1256 | } |
1034 | 1257 | ||
1035 | rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen); | 1258 | rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); |
1036 | cFYI(DBG2, "SetCIFSACL rc = %d", rc); | 1259 | cFYI(DBG2, "SetCIFSACL rc = %d", rc); |
1037 | 1260 | ||
1038 | CIFSSMBClose(xid, tcon, fid); | 1261 | CIFSSMBClose(xid, tcon, fid); |
@@ -1042,25 +1265,6 @@ out: | |||
1042 | return rc; | 1265 | return rc; |
1043 | } | 1266 | } |
1044 | 1267 | ||
1045 | /* Set an ACL on the server */ | ||
1046 | int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | ||
1047 | struct inode *inode, const char *path) | ||
1048 | { | ||
1049 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
1050 | struct cifsFileInfo *open_file; | ||
1051 | int rc; | ||
1052 | |||
1053 | cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); | ||
1054 | |||
1055 | open_file = find_readable_file(CIFS_I(inode), true); | ||
1056 | if (!open_file) | ||
1057 | return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); | ||
1058 | |||
1059 | rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); | ||
1060 | cifsFileInfo_put(open_file); | ||
1061 | return rc; | ||
1062 | } | ||
1063 | |||
1064 | /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ | 1268 | /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ |
1065 | int | 1269 | int |
1066 | cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, | 1270 | cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, |
@@ -1092,9 +1296,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, | |||
1092 | } | 1296 | } |
1093 | 1297 | ||
1094 | /* Convert mode bits to an ACL so we can update the ACL on the server */ | 1298 | /* Convert mode bits to an ACL so we can update the ACL on the server */ |
1095 | int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) | 1299 | int |
1300 | id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, | ||
1301 | uid_t uid, gid_t gid) | ||
1096 | { | 1302 | { |
1097 | int rc = 0; | 1303 | int rc = 0; |
1304 | int aclflag = CIFS_ACL_DACL; /* default flag to set */ | ||
1098 | __u32 secdesclen = 0; | 1305 | __u32 secdesclen = 0; |
1099 | struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ | 1306 | struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ |
1100 | struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ | 1307 | struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ |
@@ -1124,13 +1331,15 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) | |||
1124 | return -ENOMEM; | 1331 | return -ENOMEM; |
1125 | } | 1332 | } |
1126 | 1333 | ||
1127 | rc = build_sec_desc(pntsd, pnntsd, inode, nmode); | 1334 | rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, |
1335 | &aclflag); | ||
1128 | 1336 | ||
1129 | cFYI(DBG2, "build_sec_desc rc: %d", rc); | 1337 | cFYI(DBG2, "build_sec_desc rc: %d", rc); |
1130 | 1338 | ||
1131 | if (!rc) { | 1339 | if (!rc) { |
1132 | /* Set the security descriptor */ | 1340 | /* Set the security descriptor */ |
1133 | rc = set_cifs_acl(pnntsd, secdesclen, inode, path); | 1341 | rc = set_cifs_acl(pnntsd, secdesclen, inode, |
1342 | path, aclflag); | ||
1134 | cFYI(DBG2, "set_cifs_acl rc: %d", rc); | 1343 | cFYI(DBG2, "set_cifs_acl rc: %d", rc); |
1135 | } | 1344 | } |
1136 | 1345 | ||
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index e76bfeb68267..2cfb695d1f89 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c | |||
@@ -37,83 +37,8 @@ | |||
37 | * the sequence number before this function is called. Also, this function | 37 | * the sequence number before this function is called. Also, this function |
38 | * should be called with the server->srv_mutex held. | 38 | * should be called with the server->srv_mutex held. |
39 | */ | 39 | */ |
40 | static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, | 40 | static int cifs_calc_signature(const struct kvec *iov, int n_vec, |
41 | struct TCP_Server_Info *server, char *signature) | 41 | struct TCP_Server_Info *server, char *signature) |
42 | { | ||
43 | int rc; | ||
44 | |||
45 | if (cifs_pdu == NULL || signature == NULL || server == NULL) | ||
46 | return -EINVAL; | ||
47 | |||
48 | if (!server->secmech.sdescmd5) { | ||
49 | cERROR(1, "%s: Can't generate signature\n", __func__); | ||
50 | return -1; | ||
51 | } | ||
52 | |||
53 | rc = crypto_shash_init(&server->secmech.sdescmd5->shash); | ||
54 | if (rc) { | ||
55 | cERROR(1, "%s: Could not init md5\n", __func__); | ||
56 | return rc; | ||
57 | } | ||
58 | |||
59 | rc = crypto_shash_update(&server->secmech.sdescmd5->shash, | ||
60 | server->session_key.response, server->session_key.len); | ||
61 | if (rc) { | ||
62 | cERROR(1, "%s: Could not update with response\n", __func__); | ||
63 | return rc; | ||
64 | } | ||
65 | |||
66 | rc = crypto_shash_update(&server->secmech.sdescmd5->shash, | ||
67 | cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); | ||
68 | if (rc) { | ||
69 | cERROR(1, "%s: Could not update with payload\n", __func__); | ||
70 | return rc; | ||
71 | } | ||
72 | |||
73 | rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); | ||
74 | if (rc) | ||
75 | cERROR(1, "%s: Could not generate md5 hash\n", __func__); | ||
76 | |||
77 | return rc; | ||
78 | } | ||
79 | |||
80 | /* must be called with server->srv_mutex held */ | ||
81 | int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, | ||
82 | __u32 *pexpected_response_sequence_number) | ||
83 | { | ||
84 | int rc = 0; | ||
85 | char smb_signature[20]; | ||
86 | |||
87 | if ((cifs_pdu == NULL) || (server == NULL)) | ||
88 | return -EINVAL; | ||
89 | |||
90 | if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || | ||
91 | server->tcpStatus == CifsNeedNegotiate) | ||
92 | return rc; | ||
93 | |||
94 | if (!server->session_estab) { | ||
95 | strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); | ||
96 | return rc; | ||
97 | } | ||
98 | |||
99 | cifs_pdu->Signature.Sequence.SequenceNumber = | ||
100 | cpu_to_le32(server->sequence_number); | ||
101 | cifs_pdu->Signature.Sequence.Reserved = 0; | ||
102 | |||
103 | *pexpected_response_sequence_number = server->sequence_number++; | ||
104 | server->sequence_number++; | ||
105 | |||
106 | rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); | ||
107 | if (rc) | ||
108 | memset(cifs_pdu->Signature.SecuritySignature, 0, 8); | ||
109 | else | ||
110 | memcpy(cifs_pdu->Signature.SecuritySignature, smb_signature, 8); | ||
111 | |||
112 | return rc; | ||
113 | } | ||
114 | |||
115 | static int cifs_calc_signature2(const struct kvec *iov, int n_vec, | ||
116 | struct TCP_Server_Info *server, char *signature) | ||
117 | { | 42 | { |
118 | int i; | 43 | int i; |
119 | int rc; | 44 | int rc; |
@@ -179,7 +104,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, | |||
179 | { | 104 | { |
180 | int rc = 0; | 105 | int rc = 0; |
181 | char smb_signature[20]; | 106 | char smb_signature[20]; |
182 | struct smb_hdr *cifs_pdu = iov[0].iov_base; | 107 | struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; |
183 | 108 | ||
184 | if ((cifs_pdu == NULL) || (server == NULL)) | 109 | if ((cifs_pdu == NULL) || (server == NULL)) |
185 | return -EINVAL; | 110 | return -EINVAL; |
@@ -189,7 +114,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, | |||
189 | return rc; | 114 | return rc; |
190 | 115 | ||
191 | if (!server->session_estab) { | 116 | if (!server->session_estab) { |
192 | strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); | 117 | memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); |
193 | return rc; | 118 | return rc; |
194 | } | 119 | } |
195 | 120 | ||
@@ -200,7 +125,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, | |||
200 | *pexpected_response_sequence_number = server->sequence_number++; | 125 | *pexpected_response_sequence_number = server->sequence_number++; |
201 | server->sequence_number++; | 126 | server->sequence_number++; |
202 | 127 | ||
203 | rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); | 128 | rc = cifs_calc_signature(iov, n_vec, server, smb_signature); |
204 | if (rc) | 129 | if (rc) |
205 | memset(cifs_pdu->Signature.SecuritySignature, 0, 8); | 130 | memset(cifs_pdu->Signature.SecuritySignature, 0, 8); |
206 | else | 131 | else |
@@ -209,13 +134,27 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, | |||
209 | return rc; | 134 | return rc; |
210 | } | 135 | } |
211 | 136 | ||
212 | int cifs_verify_signature(struct smb_hdr *cifs_pdu, | 137 | /* must be called with server->srv_mutex held */ |
138 | int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, | ||
139 | __u32 *pexpected_response_sequence_number) | ||
140 | { | ||
141 | struct kvec iov; | ||
142 | |||
143 | iov.iov_base = cifs_pdu; | ||
144 | iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4; | ||
145 | |||
146 | return cifs_sign_smb2(&iov, 1, server, | ||
147 | pexpected_response_sequence_number); | ||
148 | } | ||
149 | |||
150 | int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, | ||
213 | struct TCP_Server_Info *server, | 151 | struct TCP_Server_Info *server, |
214 | __u32 expected_sequence_number) | 152 | __u32 expected_sequence_number) |
215 | { | 153 | { |
216 | unsigned int rc; | 154 | unsigned int rc; |
217 | char server_response_sig[8]; | 155 | char server_response_sig[8]; |
218 | char what_we_think_sig_should_be[20]; | 156 | char what_we_think_sig_should_be[20]; |
157 | struct smb_hdr *cifs_pdu = (struct smb_hdr *)iov[0].iov_base; | ||
219 | 158 | ||
220 | if (cifs_pdu == NULL || server == NULL) | 159 | if (cifs_pdu == NULL || server == NULL) |
221 | return -EINVAL; | 160 | return -EINVAL; |
@@ -247,8 +186,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, | |||
247 | cifs_pdu->Signature.Sequence.Reserved = 0; | 186 | cifs_pdu->Signature.Sequence.Reserved = 0; |
248 | 187 | ||
249 | mutex_lock(&server->srv_mutex); | 188 | mutex_lock(&server->srv_mutex); |
250 | rc = cifs_calculate_signature(cifs_pdu, server, | 189 | rc = cifs_calc_signature(iov, nr_iov, server, |
251 | what_we_think_sig_should_be); | 190 | what_we_think_sig_should_be); |
252 | mutex_unlock(&server->srv_mutex); | 191 | mutex_unlock(&server->srv_mutex); |
253 | 192 | ||
254 | if (rc) | 193 | if (rc) |
@@ -351,9 +290,7 @@ static int | |||
351 | build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) | 290 | build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) |
352 | { | 291 | { |
353 | unsigned int dlen; | 292 | unsigned int dlen; |
354 | unsigned int wlen; | 293 | unsigned int size = 2 * sizeof(struct ntlmssp2_name); |
355 | unsigned int size = 6 * sizeof(struct ntlmssp2_name); | ||
356 | __le64 curtime; | ||
357 | char *defdmname = "WORKGROUP"; | 294 | char *defdmname = "WORKGROUP"; |
358 | unsigned char *blobptr; | 295 | unsigned char *blobptr; |
359 | struct ntlmssp2_name *attrptr; | 296 | struct ntlmssp2_name *attrptr; |
@@ -365,15 +302,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) | |||
365 | } | 302 | } |
366 | 303 | ||
367 | dlen = strlen(ses->domainName); | 304 | dlen = strlen(ses->domainName); |
368 | wlen = strlen(ses->server->hostname); | ||
369 | 305 | ||
370 | /* The length of this blob is a size which is | 306 | /* |
371 | * six times the size of a structure which holds name/size + | 307 | * The length of this blob is two times the size of a |
372 | * two times the unicode length of a domain name + | 308 | * structure (av pair) which holds name/size |
373 | * two times the unicode length of a server name + | 309 | * (for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL) + |
374 | * size of a timestamp (which is 8 bytes). | 310 | * unicode length of a netbios domain name |
375 | */ | 311 | */ |
376 | ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; | 312 | ses->auth_key.len = size + 2 * dlen; |
377 | ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); | 313 | ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); |
378 | if (!ses->auth_key.response) { | 314 | if (!ses->auth_key.response) { |
379 | ses->auth_key.len = 0; | 315 | ses->auth_key.len = 0; |
@@ -384,44 +320,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) | |||
384 | blobptr = ses->auth_key.response; | 320 | blobptr = ses->auth_key.response; |
385 | attrptr = (struct ntlmssp2_name *) blobptr; | 321 | attrptr = (struct ntlmssp2_name *) blobptr; |
386 | 322 | ||
323 | /* | ||
324 | * As defined in MS-NLMP 3.3.2, just this av pair field | ||
325 | * is sufficient as part of the temp | ||
326 | */ | ||
387 | attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); | 327 | attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); |
388 | attrptr->length = cpu_to_le16(2 * dlen); | 328 | attrptr->length = cpu_to_le16(2 * dlen); |
389 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); | 329 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); |
390 | cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); | 330 | cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); |
391 | 331 | ||
392 | blobptr += 2 * dlen; | ||
393 | attrptr = (struct ntlmssp2_name *) blobptr; | ||
394 | |||
395 | attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME); | ||
396 | attrptr->length = cpu_to_le16(2 * wlen); | ||
397 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); | ||
398 | cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); | ||
399 | |||
400 | blobptr += 2 * wlen; | ||
401 | attrptr = (struct ntlmssp2_name *) blobptr; | ||
402 | |||
403 | attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME); | ||
404 | attrptr->length = cpu_to_le16(2 * dlen); | ||
405 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); | ||
406 | cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); | ||
407 | |||
408 | blobptr += 2 * dlen; | ||
409 | attrptr = (struct ntlmssp2_name *) blobptr; | ||
410 | |||
411 | attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME); | ||
412 | attrptr->length = cpu_to_le16(2 * wlen); | ||
413 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); | ||
414 | cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp); | ||
415 | |||
416 | blobptr += 2 * wlen; | ||
417 | attrptr = (struct ntlmssp2_name *) blobptr; | ||
418 | |||
419 | attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP); | ||
420 | attrptr->length = cpu_to_le16(sizeof(__le64)); | ||
421 | blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); | ||
422 | curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); | ||
423 | memcpy(blobptr, &curtime, sizeof(__le64)); | ||
424 | |||
425 | return 0; | 332 | return 0; |
426 | } | 333 | } |
427 | 334 | ||
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f93eb948d071..8f1fe324162b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -53,7 +53,7 @@ | |||
53 | int cifsFYI = 0; | 53 | int cifsFYI = 0; |
54 | int cifsERROR = 1; | 54 | int cifsERROR = 1; |
55 | int traceSMB = 0; | 55 | int traceSMB = 0; |
56 | unsigned int oplockEnabled = 1; | 56 | bool enable_oplocks = true; |
57 | unsigned int linuxExtEnabled = 1; | 57 | unsigned int linuxExtEnabled = 1; |
58 | unsigned int lookupCacheEnabled = 1; | 58 | unsigned int lookupCacheEnabled = 1; |
59 | unsigned int multiuser_mount = 0; | 59 | unsigned int multiuser_mount = 0; |
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0); | |||
74 | MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " | 74 | MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " |
75 | "Range: 2 to 256"); | 75 | "Range: 2 to 256"); |
76 | unsigned int cifs_max_pending = CIFS_MAX_REQ; | 76 | unsigned int cifs_max_pending = CIFS_MAX_REQ; |
77 | module_param(cifs_max_pending, int, 0); | 77 | module_param(cifs_max_pending, int, 0444); |
78 | MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " | 78 | MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " |
79 | "Default: 50 Range: 2 to 256"); | 79 | "Default: 50 Range: 2 to 256"); |
80 | unsigned short echo_retries = 5; | 80 | unsigned short echo_retries = 5; |
@@ -82,6 +82,10 @@ module_param(echo_retries, ushort, 0644); | |||
82 | MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " | 82 | MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " |
83 | "reconnecting server. Default: 5. 0 means " | 83 | "reconnecting server. Default: 5. 0 means " |
84 | "never reconnect."); | 84 | "never reconnect."); |
85 | module_param(enable_oplocks, bool, 0644); | ||
86 | MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default: " | ||
87 | "y/Y/1"); | ||
88 | |||
85 | extern mempool_t *cifs_sm_req_poolp; | 89 | extern mempool_t *cifs_sm_req_poolp; |
86 | extern mempool_t *cifs_req_poolp; | 90 | extern mempool_t *cifs_req_poolp; |
87 | extern mempool_t *cifs_mid_poolp; | 91 | extern mempool_t *cifs_mid_poolp; |
@@ -132,12 +136,12 @@ cifs_read_super(struct super_block *sb) | |||
132 | else | 136 | else |
133 | sb->s_d_op = &cifs_dentry_ops; | 137 | sb->s_d_op = &cifs_dentry_ops; |
134 | 138 | ||
135 | #ifdef CIFS_NFSD_EXPORT | 139 | #ifdef CONFIG_CIFS_NFSD_EXPORT |
136 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { | 140 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { |
137 | cFYI(1, "export ops supported"); | 141 | cFYI(1, "export ops supported"); |
138 | sb->s_export_op = &cifs_export_ops; | 142 | sb->s_export_op = &cifs_export_ops; |
139 | } | 143 | } |
140 | #endif /* CIFS_NFSD_EXPORT */ | 144 | #endif /* CONFIG_CIFS_NFSD_EXPORT */ |
141 | 145 | ||
142 | return 0; | 146 | return 0; |
143 | 147 | ||
@@ -432,6 +436,12 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) | |||
432 | seq_printf(s, ",mfsymlinks"); | 436 | seq_printf(s, ",mfsymlinks"); |
433 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) | 437 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) |
434 | seq_printf(s, ",fsc"); | 438 | seq_printf(s, ",fsc"); |
439 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) | ||
440 | seq_printf(s, ",nostrictsync"); | ||
441 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) | ||
442 | seq_printf(s, ",noperm"); | ||
443 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) | ||
444 | seq_printf(s, ",strictcache"); | ||
435 | 445 | ||
436 | seq_printf(s, ",rsize=%d", cifs_sb->rsize); | 446 | seq_printf(s, ",rsize=%d", cifs_sb->rsize); |
437 | seq_printf(s, ",wsize=%d", cifs_sb->wsize); | 447 | seq_printf(s, ",wsize=%d", cifs_sb->wsize); |
@@ -530,7 +540,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
530 | char *full_path = NULL; | 540 | char *full_path = NULL; |
531 | char *s, *p; | 541 | char *s, *p; |
532 | char sep; | 542 | char sep; |
533 | int xid; | ||
534 | 543 | ||
535 | full_path = cifs_build_path_to_root(vol, cifs_sb, | 544 | full_path = cifs_build_path_to_root(vol, cifs_sb, |
536 | cifs_sb_master_tcon(cifs_sb)); | 545 | cifs_sb_master_tcon(cifs_sb)); |
@@ -539,7 +548,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
539 | 548 | ||
540 | cFYI(1, "Get root dentry for %s", full_path); | 549 | cFYI(1, "Get root dentry for %s", full_path); |
541 | 550 | ||
542 | xid = GetXid(); | ||
543 | sep = CIFS_DIR_SEP(cifs_sb); | 551 | sep = CIFS_DIR_SEP(cifs_sb); |
544 | dentry = dget(sb->s_root); | 552 | dentry = dget(sb->s_root); |
545 | p = s = full_path; | 553 | p = s = full_path; |
@@ -548,6 +556,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
548 | struct inode *dir = dentry->d_inode; | 556 | struct inode *dir = dentry->d_inode; |
549 | struct dentry *child; | 557 | struct dentry *child; |
550 | 558 | ||
559 | if (!dir) { | ||
560 | dput(dentry); | ||
561 | dentry = ERR_PTR(-ENOENT); | ||
562 | break; | ||
563 | } | ||
564 | |||
551 | /* skip separators */ | 565 | /* skip separators */ |
552 | while (*s == sep) | 566 | while (*s == sep) |
553 | s++; | 567 | s++; |
@@ -563,12 +577,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
563 | mutex_unlock(&dir->i_mutex); | 577 | mutex_unlock(&dir->i_mutex); |
564 | dput(dentry); | 578 | dput(dentry); |
565 | dentry = child; | 579 | dentry = child; |
566 | if (!dentry->d_inode) { | ||
567 | dput(dentry); | ||
568 | dentry = ERR_PTR(-ENOENT); | ||
569 | } | ||
570 | } while (!IS_ERR(dentry)); | 580 | } while (!IS_ERR(dentry)); |
571 | _FreeXid(xid); | ||
572 | kfree(full_path); | 581 | kfree(full_path); |
573 | return dentry; | 582 | return dentry; |
574 | } | 583 | } |
@@ -721,7 +730,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) | |||
721 | if (rc < 0) | 730 | if (rc < 0) |
722 | return (loff_t)rc; | 731 | return (loff_t)rc; |
723 | } | 732 | } |
724 | return generic_file_llseek_unlocked(file, offset, origin); | 733 | return generic_file_llseek(file, offset, origin); |
725 | } | 734 | } |
726 | 735 | ||
727 | static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) | 736 | static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) |
@@ -940,7 +949,8 @@ cifs_init_once(void *inode) | |||
940 | struct cifsInodeInfo *cifsi = inode; | 949 | struct cifsInodeInfo *cifsi = inode; |
941 | 950 | ||
942 | inode_init_once(&cifsi->vfs_inode); | 951 | inode_init_once(&cifsi->vfs_inode); |
943 | INIT_LIST_HEAD(&cifsi->lockList); | 952 | INIT_LIST_HEAD(&cifsi->llist); |
953 | mutex_init(&cifsi->lock_mutex); | ||
944 | } | 954 | } |
945 | 955 | ||
946 | static int | 956 | static int |
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index cb71dc1f94d1..d9dbaf869cd1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
@@ -121,9 +121,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); | |||
121 | extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); | 121 | extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); |
122 | extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); | 122 | extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); |
123 | 123 | ||
124 | #ifdef CIFS_NFSD_EXPORT | 124 | #ifdef CONFIG_CIFS_NFSD_EXPORT |
125 | extern const struct export_operations cifs_export_ops; | 125 | extern const struct export_operations cifs_export_ops; |
126 | #endif /* CIFS_NFSD_EXPORT */ | 126 | #endif /* CONFIG_CIFS_NFSD_EXPORT */ |
127 | 127 | ||
128 | #define CIFS_VERSION "1.74" | 128 | #define CIFS_VERSION "1.75" |
129 | #endif /* _CIFSFS_H */ | 129 | #endif /* _CIFSFS_H */ |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 38ce6d44b145..8238aa13e01c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
@@ -167,6 +167,8 @@ struct smb_vol { | |||
167 | uid_t cred_uid; | 167 | uid_t cred_uid; |
168 | uid_t linux_uid; | 168 | uid_t linux_uid; |
169 | gid_t linux_gid; | 169 | gid_t linux_gid; |
170 | uid_t backupuid; | ||
171 | gid_t backupgid; | ||
170 | mode_t file_mode; | 172 | mode_t file_mode; |
171 | mode_t dir_mode; | 173 | mode_t dir_mode; |
172 | unsigned secFlg; | 174 | unsigned secFlg; |
@@ -179,6 +181,8 @@ struct smb_vol { | |||
179 | bool noperm:1; | 181 | bool noperm:1; |
180 | bool no_psx_acl:1; /* set if posix acl support should be disabled */ | 182 | bool no_psx_acl:1; /* set if posix acl support should be disabled */ |
181 | bool cifs_acl:1; | 183 | bool cifs_acl:1; |
184 | bool backupuid_specified; /* mount option backupuid is specified */ | ||
185 | bool backupgid_specified; /* mount option backupgid is specified */ | ||
182 | bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ | 186 | bool no_xattr:1; /* set if xattr (EA) support should be disabled*/ |
183 | bool server_ino:1; /* use inode numbers from server ie UniqueId */ | 187 | bool server_ino:1; /* use inode numbers from server ie UniqueId */ |
184 | bool direct_io:1; | 188 | bool direct_io:1; |
@@ -219,7 +223,8 @@ struct smb_vol { | |||
219 | CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ | 223 | CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ |
220 | CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ | 224 | CIFS_MOUNT_NOPOSIXBRL | CIFS_MOUNT_NOSSYNC | \ |
221 | CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ | 225 | CIFS_MOUNT_FSCACHE | CIFS_MOUNT_MF_SYMLINKS | \ |
222 | CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO) | 226 | CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ |
227 | CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) | ||
223 | 228 | ||
224 | #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ | 229 | #define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ |
225 | MS_NODEV | MS_SYNCHRONOUS) | 230 | MS_NODEV | MS_SYNCHRONOUS) |
@@ -286,12 +291,18 @@ struct TCP_Server_Info { | |||
286 | bool sec_kerberosu2u; /* supports U2U Kerberos */ | 291 | bool sec_kerberosu2u; /* supports U2U Kerberos */ |
287 | bool sec_kerberos; /* supports plain Kerberos */ | 292 | bool sec_kerberos; /* supports plain Kerberos */ |
288 | bool sec_mskerberos; /* supports legacy MS Kerberos */ | 293 | bool sec_mskerberos; /* supports legacy MS Kerberos */ |
294 | bool large_buf; /* is current buffer large? */ | ||
289 | struct delayed_work echo; /* echo ping workqueue job */ | 295 | struct delayed_work echo; /* echo ping workqueue job */ |
296 | struct kvec *iov; /* reusable kvec array for receives */ | ||
297 | unsigned int nr_iov; /* number of kvecs in array */ | ||
298 | char *smallbuf; /* pointer to current "small" buffer */ | ||
299 | char *bigbuf; /* pointer to current "big" buffer */ | ||
300 | unsigned int total_read; /* total amount of data read in this pass */ | ||
290 | #ifdef CONFIG_CIFS_FSCACHE | 301 | #ifdef CONFIG_CIFS_FSCACHE |
291 | struct fscache_cookie *fscache; /* client index cache cookie */ | 302 | struct fscache_cookie *fscache; /* client index cache cookie */ |
292 | #endif | 303 | #endif |
293 | #ifdef CONFIG_CIFS_STATS2 | 304 | #ifdef CONFIG_CIFS_STATS2 |
294 | atomic_t inSend; /* requests trying to send */ | 305 | atomic_t in_send; /* requests trying to send */ |
295 | atomic_t num_waiters; /* blocked waiting to get in sendrecv */ | 306 | atomic_t num_waiters; /* blocked waiting to get in sendrecv */ |
296 | #endif | 307 | #endif |
297 | }; | 308 | }; |
@@ -485,9 +496,13 @@ extern struct cifs_tcon *cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb); | |||
485 | */ | 496 | */ |
486 | struct cifsLockInfo { | 497 | struct cifsLockInfo { |
487 | struct list_head llist; /* pointer to next cifsLockInfo */ | 498 | struct list_head llist; /* pointer to next cifsLockInfo */ |
499 | struct list_head blist; /* pointer to locks blocked on this */ | ||
500 | wait_queue_head_t block_q; | ||
488 | __u64 offset; | 501 | __u64 offset; |
489 | __u64 length; | 502 | __u64 length; |
503 | __u32 pid; | ||
490 | __u8 type; | 504 | __u8 type; |
505 | __u16 netfid; | ||
491 | }; | 506 | }; |
492 | 507 | ||
493 | /* | 508 | /* |
@@ -520,8 +535,6 @@ struct cifsFileInfo { | |||
520 | struct dentry *dentry; | 535 | struct dentry *dentry; |
521 | unsigned int f_flags; | 536 | unsigned int f_flags; |
522 | struct tcon_link *tlink; | 537 | struct tcon_link *tlink; |
523 | struct mutex lock_mutex; | ||
524 | struct list_head llist; /* list of byte range locks we have. */ | ||
525 | bool invalidHandle:1; /* file closed via session abend */ | 538 | bool invalidHandle:1; /* file closed via session abend */ |
526 | bool oplock_break_cancelled:1; | 539 | bool oplock_break_cancelled:1; |
527 | int count; /* refcount protected by cifs_file_list_lock */ | 540 | int count; /* refcount protected by cifs_file_list_lock */ |
@@ -554,7 +567,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); | |||
554 | */ | 567 | */ |
555 | 568 | ||
556 | struct cifsInodeInfo { | 569 | struct cifsInodeInfo { |
557 | struct list_head lockList; | 570 | struct list_head llist; /* brlocks for this inode */ |
571 | bool can_cache_brlcks; | ||
572 | struct mutex lock_mutex; /* protect two fields above */ | ||
558 | /* BB add in lists for dirty pages i.e. write caching info for oplock */ | 573 | /* BB add in lists for dirty pages i.e. write caching info for oplock */ |
559 | struct list_head openFileList; | 574 | struct list_head openFileList; |
560 | __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ | 575 | __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ |
@@ -643,8 +658,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon, | |||
643 | struct mid_q_entry; | 658 | struct mid_q_entry; |
644 | 659 | ||
645 | /* | 660 | /* |
646 | * This is the prototype for the mid callback function. When creating one, | 661 | * This is the prototype for the mid receive function. This function is for |
647 | * take special care to avoid deadlocks. Things to bear in mind: | 662 | * receiving the rest of the SMB frame, starting with the WordCount (which is |
663 | * just after the MID in struct smb_hdr). Note: | ||
664 | * | ||
665 | * - This will be called by cifsd, with no locks held. | ||
666 | * - The mid will still be on the pending_mid_q. | ||
667 | * - mid->resp_buf will point to the current buffer. | ||
668 | * | ||
669 | * Returns zero on a successful receive, or an error. The receive state in | ||
670 | * the TCP_Server_Info will also be updated. | ||
671 | */ | ||
672 | typedef int (mid_receive_t)(struct TCP_Server_Info *server, | ||
673 | struct mid_q_entry *mid); | ||
674 | |||
675 | /* | ||
676 | * This is the prototype for the mid callback function. This is called once the | ||
677 | * mid has been received off of the socket. When creating one, take special | ||
678 | * care to avoid deadlocks. Things to bear in mind: | ||
648 | * | 679 | * |
649 | * - it will be called by cifsd, with no locks held | 680 | * - it will be called by cifsd, with no locks held |
650 | * - the mid will be removed from any lists | 681 | * - the mid will be removed from any lists |
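The mid_receive_t hook documented above lets an in-flight command supply its own routine for pulling the rest of the frame off the socket, tracking progress in server->total_read. A hedged sketch of the shape such a routine takes (frame_length and read_from_socket are illustrative, not the cifs helpers):

	static unsigned int frame_length(struct mid_q_entry *mid);	/* illustrative */
	static int read_from_socket(struct TCP_Server_Info *server,
				    char *buf, unsigned int to_read);	/* illustrative */

	static int my_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
	{
		int length;
		unsigned int remaining = frame_length(mid) - server->total_read;

		/* read the remainder of the frame, starting at WordCount */
		length = read_from_socket(server,
					  server->bigbuf + server->total_read,
					  remaining);
		if (length < 0)
			return length;
		server->total_read += length;
		return 0;
	}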
@@ -662,9 +693,10 @@ struct mid_q_entry { | |||
662 | unsigned long when_sent; /* time when smb send finished */ | 693 | unsigned long when_sent; /* time when smb send finished */ |
663 | unsigned long when_received; /* when demux complete (taken off wire) */ | 694 | unsigned long when_received; /* when demux complete (taken off wire) */ |
664 | #endif | 695 | #endif |
696 | mid_receive_t *receive; /* call receive callback */ | ||
665 | mid_callback_t *callback; /* call completion callback */ | 697 | mid_callback_t *callback; /* call completion callback */ |
666 | void *callback_data; /* general purpose pointer for callback */ | 698 | void *callback_data; /* general purpose pointer for callback */ |
667 | struct smb_hdr *resp_buf; /* response buffer */ | 699 | struct smb_hdr *resp_buf; /* pointer to received SMB header */ |
668 | int midState; /* wish this were enum but can not pass to wait_event */ | 700 | int midState; /* wish this were enum but can not pass to wait_event */ |
669 | __u8 command; /* smb command code */ | 701 | __u8 command; /* smb command code */ |
670 | bool largeBuf:1; /* if valid response, is pointer to large buf */ | 702 | bool largeBuf:1; /* if valid response, is pointer to large buf */ |
@@ -672,12 +704,54 @@ struct mid_q_entry { | |||
672 | bool multiEnd:1; /* both received */ | 704 | bool multiEnd:1; /* both received */ |
673 | }; | 705 | }; |
674 | 706 | ||
675 | struct oplock_q_entry { | 707 | /* Make code in transport.c a little cleaner by moving |
676 | struct list_head qhead; | 708 | update of optional stats into function below */ |
677 | struct inode *pinode; | 709 | #ifdef CONFIG_CIFS_STATS2 |
678 | struct cifs_tcon *tcon; | 710 | |
679 | __u16 netfid; | 711 | static inline void cifs_in_send_inc(struct TCP_Server_Info *server) |
680 | }; | 712 | { |
713 | atomic_inc(&server->in_send); | ||
714 | } | ||
715 | |||
716 | static inline void cifs_in_send_dec(struct TCP_Server_Info *server) | ||
717 | { | ||
718 | atomic_dec(&server->in_send); | ||
719 | } | ||
720 | |||
721 | static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) | ||
722 | { | ||
723 | atomic_inc(&server->num_waiters); | ||
724 | } | ||
725 | |||
726 | static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) | ||
727 | { | ||
728 | atomic_dec(&server->num_waiters); | ||
729 | } | ||
730 | |||
731 | static inline void cifs_save_when_sent(struct mid_q_entry *mid) | ||
732 | { | ||
733 | mid->when_sent = jiffies; | ||
734 | } | ||
735 | #else | ||
736 | static inline void cifs_in_send_inc(struct TCP_Server_Info *server) | ||
737 | { | ||
738 | } | ||
739 | static inline void cifs_in_send_dec(struct TCP_Server_Info *server) | ||
740 | { | ||
741 | } | ||
742 | |||
743 | static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) | ||
744 | { | ||
745 | } | ||
746 | |||
747 | static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) | ||
748 | { | ||
749 | } | ||
750 | |||
751 | static inline void cifs_save_when_sent(struct mid_q_entry *mid) | ||
752 | { | ||
753 | } | ||
754 | #endif | ||
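A hedged sketch of how the send path in transport.c might use these helpers (the smb_sendv() send primitive is an assumption here; the helper calls compile away entirely when CONFIG_CIFS_STATS2 is off):

static int example_send(struct TCP_Server_Info *server, struct kvec *iov,
			unsigned int n_vec, struct mid_q_entry *mid)
{
	int rc;

	cifs_in_send_inc(server);	/* no-op without CONFIG_CIFS_STATS2 */
	rc = smb_sendv(server, iov, n_vec);
	cifs_in_send_dec(server);
	cifs_save_when_sent(mid);	/* stamps mid->when_sent with jiffies */

	return rc;
}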
681 | 755 | ||
682 | /* for pending dnotify requests */ | 756 | /* for pending dnotify requests */ |
683 | struct dir_notify_req { | 757 | struct dir_notify_req { |
@@ -922,7 +996,8 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions | |||
922 | to be established on existing mount if we | 996 | to be established on existing mount if we |
923 | have the uid/password or Kerberos credential | 997 | have the uid/password or Kerberos credential |
924 | or equivalent for current user */ | 998 | or equivalent for current user */ |
925 | GLOBAL_EXTERN unsigned int oplockEnabled; | 999 | /* enable or disable oplocks */ |
1000 | GLOBAL_EXTERN bool enable_oplocks; | ||
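Presumably this bool is wired up as a module parameter in cifsfs.c; a minimal sketch of that wiring, with the parameter name and default assumed:

bool enable_oplocks = true;	/* assumed default: oplocks on */

module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default: y/Y/1");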
926 | GLOBAL_EXTERN unsigned int lookupCacheEnabled; | 1001 | GLOBAL_EXTERN unsigned int lookupCacheEnabled; |
927 | GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent | 1002 | GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent |
928 | with more secure ntlmssp2 challenge/resp */ | 1003 | with more secure ntlmssp2 challenge/resp */ |
@@ -936,10 +1011,16 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ | |||
936 | /* reconnect after this many failed echo attempts */ | 1011 | /* reconnect after this many failed echo attempts */ |
937 | GLOBAL_EXTERN unsigned short echo_retries; | 1012 | GLOBAL_EXTERN unsigned short echo_retries; |
938 | 1013 | ||
1014 | #ifdef CONFIG_CIFS_ACL | ||
939 | GLOBAL_EXTERN struct rb_root uidtree; | 1015 | GLOBAL_EXTERN struct rb_root uidtree; |
940 | GLOBAL_EXTERN struct rb_root gidtree; | 1016 | GLOBAL_EXTERN struct rb_root gidtree; |
941 | GLOBAL_EXTERN spinlock_t siduidlock; | 1017 | GLOBAL_EXTERN spinlock_t siduidlock; |
942 | GLOBAL_EXTERN spinlock_t sidgidlock; | 1018 | GLOBAL_EXTERN spinlock_t sidgidlock; |
1019 | GLOBAL_EXTERN struct rb_root siduidtree; | ||
1020 | GLOBAL_EXTERN struct rb_root sidgidtree; | ||
1021 | GLOBAL_EXTERN spinlock_t uidsidlock; | ||
1022 | GLOBAL_EXTERN spinlock_t gidsidlock; | ||
1023 | #endif /* CONFIG_CIFS_ACL */ | ||
943 | 1024 | ||
944 | void cifs_oplock_break(struct work_struct *work); | 1025 | void cifs_oplock_break(struct work_struct *work); |
945 | 1026 | ||
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index de3aa285de03..3fb03e2c8e86 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h | |||
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp { | |||
1089 | __le16 DataLengthHigh; | 1089 | __le16 DataLengthHigh; |
1090 | __u64 Reserved2; | 1090 | __u64 Reserved2; |
1091 | __u16 ByteCount; | 1091 | __u16 ByteCount; |
1092 | __u8 Pad; /* BB check for whether padded to DWORD | 1092 | /* read response data immediately follows */ |
1093 | boundary and optimum performance here */ | ||
1094 | char Data[1]; | ||
1095 | } __attribute__((packed)) READ_RSP; | 1093 | } __attribute__((packed)) READ_RSP; |
1096 | 1094 | ||
1097 | typedef struct locking_andx_range { | 1095 | typedef struct locking_andx_range { |
@@ -1913,6 +1911,10 @@ typedef struct whoami_rsp_data { /* Query level 0x202 */ | |||
1913 | 1911 | ||
1914 | /* SETFSInfo Levels */ | 1912 | /* SETFSInfo Levels */ |
1915 | #define SMB_SET_CIFS_UNIX_INFO 0x200 | 1913 | #define SMB_SET_CIFS_UNIX_INFO 0x200 |
1914 | /* level 0x203 is defined above in list of QFS info levels */ | ||
1915 | /* #define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 */ | ||
1916 | |||
1917 | /* Level 0x200 request structure follows */ | ||
1916 | typedef struct smb_com_transaction2_setfsi_req { | 1918 | typedef struct smb_com_transaction2_setfsi_req { |
1917 | struct smb_hdr hdr; /* wct = 15 */ | 1919 | struct smb_hdr hdr; /* wct = 15 */ |
1918 | __le16 TotalParameterCount; | 1920 | __le16 TotalParameterCount; |
@@ -1940,13 +1942,39 @@ typedef struct smb_com_transaction2_setfsi_req { | |||
1940 | __le64 ClientUnixCap; /* Data end */ | 1942 | __le64 ClientUnixCap; /* Data end */ |
1941 | } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; | 1943 | } __attribute__((packed)) TRANSACTION2_SETFSI_REQ; |
1942 | 1944 | ||
1945 | /* level 0x203 request structure follows */ | ||
1946 | typedef struct smb_com_transaction2_setfs_enc_req { | ||
1947 | struct smb_hdr hdr; /* wct = 15 */ | ||
1948 | __le16 TotalParameterCount; | ||
1949 | __le16 TotalDataCount; | ||
1950 | __le16 MaxParameterCount; | ||
1951 | __le16 MaxDataCount; | ||
1952 | __u8 MaxSetupCount; | ||
1953 | __u8 Reserved; | ||
1954 | __le16 Flags; | ||
1955 | __le32 Timeout; | ||
1956 | __u16 Reserved2; | ||
1957 | __le16 ParameterCount; /* 4 */ | ||
1958 | __le16 ParameterOffset; | ||
1959 | __le16 DataCount; /* 12 */ | ||
1960 | __le16 DataOffset; | ||
1961 | __u8 SetupCount; /* one */ | ||
1962 | __u8 Reserved3; | ||
1963 | __le16 SubCommand; /* TRANS2_SET_FS_INFORMATION */ | ||
1964 | __le16 ByteCount; | ||
1965 | __u8 Pad; | ||
1966 | __u16 Reserved4; /* Parameters start. */ | ||
1967 | __le16 InformationLevel;/* Parameters end. */ | ||
1968 | /* NTLMSSP Blob, Data start. */ | ||
1969 | } __attribute__((packed)) TRANSACTION2_SETFSI_ENC_REQ; | ||
1970 | |||
1971 | /* response for setfsinfo levels 0x200 and 0x203 */ | ||
1943 | typedef struct smb_com_transaction2_setfsi_rsp { | 1972 | typedef struct smb_com_transaction2_setfsi_rsp { |
1944 | struct smb_hdr hdr; /* wct = 10 */ | 1973 | struct smb_hdr hdr; /* wct = 10 */ |
1945 | struct trans2_resp t2; | 1974 | struct trans2_resp t2; |
1946 | __u16 ByteCount; | 1975 | __u16 ByteCount; |
1947 | } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; | 1976 | } __attribute__((packed)) TRANSACTION2_SETFSI_RSP; |
1948 | 1977 | ||
1949 | |||
1950 | typedef struct smb_com_transaction2_get_dfs_refer_req { | 1978 | typedef struct smb_com_transaction2_get_dfs_refer_req { |
1951 | struct smb_hdr hdr; /* wct = 15 */ | 1979 | struct smb_hdr hdr; /* wct = 15 */ |
1952 | __le16 TotalParameterCount; | 1980 | __le16 TotalParameterCount; |
@@ -2098,13 +2126,13 @@ typedef struct { | |||
2098 | #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and | 2126 | #define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and |
2099 | QFS PROXY call */ | 2127 | QFS PROXY call */ |
2100 | #ifdef CONFIG_CIFS_POSIX | 2128 | #ifdef CONFIG_CIFS_POSIX |
2101 | /* Can not set pathnames cap yet until we send new posix create SMB since | 2129 | /* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
2102 | otherwise server can treat such handles opened with older ntcreatex | 2130 | LockingX instead of a posix locking call on a unix session (and we do not
2103 | (by a new client which knows how to send posix path ops) | 2131 | expect LockingX to use different (ie Windows) semantics than posix locking
2104 | as non-posix handles (can affect write behavior with byte range locks. | 2132 | on the same session); if WINE needs this later, we can add the cap back
2105 | We can add back in POSIX_PATH_OPS cap when Posix Create/Mkdir finished */ | 2133 | in */
2106 | /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ | 2134 | /* #define CIFS_UNIX_CAP_MASK 0x000000fb */ |
2107 | #define CIFS_UNIX_CAP_MASK 0x000000db | 2135 | #define CIFS_UNIX_CAP_MASK 0x000003db |
2108 | #else | 2136 | #else |
2109 | #define CIFS_UNIX_CAP_MASK 0x00000013 | 2137 | #define CIFS_UNIX_CAP_MASK 0x00000013 |
2110 | #endif /* CONFIG_CIFS_POSIX */ | 2138 | #endif /* CONFIG_CIFS_POSIX */ |
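The only arithmetic change in the mask is 0x3db - 0xdb = 0x300: presumably the two transport-encryption capability bits (0x100 and 0x200) are being picked up to pair with the new SET_FS_INFORMATION level 0x203 request structure added above, while the 0x20 POSIX_PATH_OPS_CAP bit stays out per the comment.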
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8df28e925e5b..ef4f631e4c01 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h | |||
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, | |||
69 | struct TCP_Server_Info *server); | 69 | struct TCP_Server_Info *server); |
70 | extern void DeleteMidQEntry(struct mid_q_entry *midEntry); | 70 | extern void DeleteMidQEntry(struct mid_q_entry *midEntry); |
71 | extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, | 71 | extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, |
72 | unsigned int nvec, mid_callback_t *callback, | 72 | unsigned int nvec, mid_receive_t *receive, |
73 | void *cbdata, bool ignore_pend); | 73 | mid_callback_t *callback, void *cbdata, |
74 | bool ignore_pend); | ||
74 | extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, | 75 | extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, |
75 | struct smb_hdr * /* input */ , | 76 | struct smb_hdr * /* input */ , |
76 | struct smb_hdr * /* out */ , | 77 | struct smb_hdr * /* out */ , |
@@ -90,6 +91,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, | |||
90 | extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); | 91 | extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length); |
91 | extern bool is_valid_oplock_break(struct smb_hdr *smb, | 92 | extern bool is_valid_oplock_break(struct smb_hdr *smb, |
92 | struct TCP_Server_Info *); | 93 | struct TCP_Server_Info *); |
94 | extern bool backup_cred(struct cifs_sb_info *); | ||
93 | extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); | 95 | extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); |
94 | extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, | 96 | extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, |
95 | unsigned int bytes_written); | 97 | unsigned int bytes_written); |
@@ -145,12 +147,19 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, | |||
145 | extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, | 147 | extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, |
146 | struct cifs_fattr *fattr, struct inode *inode, | 148 | struct cifs_fattr *fattr, struct inode *inode, |
147 | const char *path, const __u16 *pfid); | 149 | const char *path, const __u16 *pfid); |
148 | extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); | 150 | extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64, |
151 | uid_t, gid_t); | ||
149 | extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, | 152 | extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, |
150 | const char *, u32 *); | 153 | const char *, u32 *); |
151 | extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, | 154 | extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, |
152 | const char *); | 155 | const char *, int); |
153 | 156 | ||
157 | extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); | ||
158 | extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, | ||
159 | unsigned int to_read); | ||
160 | extern int cifs_readv_from_socket(struct TCP_Server_Info *server, | ||
161 | struct kvec *iov_orig, unsigned int nr_segs, | ||
162 | unsigned int to_read); | ||
154 | extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, | 163 | extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, |
155 | struct cifs_sb_info *cifs_sb); | 164 | struct cifs_sb_info *cifs_sb); |
156 | extern int cifs_match_super(struct super_block *, void *); | 165 | extern int cifs_match_super(struct super_block *, void *); |
@@ -359,14 +368,17 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifs_tcon *tcon, | |||
359 | const struct nls_table *nls_codepage, | 368 | const struct nls_table *nls_codepage, |
360 | int remap_special_chars); | 369 | int remap_special_chars); |
361 | 370 | ||
371 | extern int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, | ||
372 | const __u8 lock_type, const __u32 num_unlock, | ||
373 | const __u32 num_lock, LOCKING_ANDX_RANGE *buf); | ||
362 | extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, | 374 | extern int CIFSSMBLock(const int xid, struct cifs_tcon *tcon, |
363 | const __u16 netfid, const __u64 len, | 375 | const __u16 netfid, const __u32 netpid, const __u64 len, |
364 | const __u64 offset, const __u32 numUnlock, | 376 | const __u64 offset, const __u32 numUnlock, |
365 | const __u32 numLock, const __u8 lockType, | 377 | const __u32 numLock, const __u8 lockType, |
366 | const bool waitFlag, const __u8 oplock_level); | 378 | const bool waitFlag, const __u8 oplock_level); |
367 | extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, | 379 | extern int CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, |
368 | const __u16 smb_file_id, const int get_flag, | 380 | const __u16 smb_file_id, const __u32 netpid, |
369 | const __u64 len, struct file_lock *, | 381 | const int get_flag, const __u64 len, struct file_lock *, |
370 | const __u16 lock_type, const bool waitFlag); | 382 | const __u16 lock_type, const bool waitFlag); |
371 | extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); | 383 | extern int CIFSSMBTDis(const int xid, struct cifs_tcon *tcon); |
372 | extern int CIFSSMBEcho(struct TCP_Server_Info *server); | 384 | extern int CIFSSMBEcho(struct TCP_Server_Info *server); |
@@ -380,7 +392,7 @@ extern void tconInfoFree(struct cifs_tcon *); | |||
380 | extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); | 392 | extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); |
381 | extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, | 393 | extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, |
382 | __u32 *); | 394 | __u32 *); |
383 | extern int cifs_verify_signature(struct smb_hdr *, | 395 | extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, |
384 | struct TCP_Server_Info *server, | 396 | struct TCP_Server_Info *server, |
385 | __u32 expected_sequence_number); | 397 | __u32 expected_sequence_number); |
386 | extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); | 398 | extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); |
@@ -419,7 +431,7 @@ extern int CIFSSMBSetEA(const int xid, struct cifs_tcon *tcon, | |||
419 | extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, | 431 | extern int CIFSSMBGetCIFSACL(const int xid, struct cifs_tcon *tcon, |
420 | __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); | 432 | __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); |
421 | extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, | 433 | extern int CIFSSMBSetCIFSACL(const int, struct cifs_tcon *, __u16, |
422 | struct cifs_ntsd *, __u32); | 434 | struct cifs_ntsd *, __u32, int); |
423 | extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, | 435 | extern int CIFSSMBGetPosixACL(const int xid, struct cifs_tcon *tcon, |
424 | const unsigned char *searchName, | 436 | const unsigned char *searchName, |
425 | char *acl_inf, const int buflen, const int acl_type, | 437 | char *acl_inf, const int buflen, const int acl_type, |
@@ -440,6 +452,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); | |||
440 | extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, | 452 | extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, |
441 | unsigned char *p24); | 453 | unsigned char *p24); |
442 | 454 | ||
455 | /* asynchronous read support */ | ||
456 | struct cifs_readdata { | ||
457 | struct cifsFileInfo *cfile; | ||
458 | struct address_space *mapping; | ||
459 | __u64 offset; | ||
460 | unsigned int bytes; | ||
461 | pid_t pid; | ||
462 | int result; | ||
463 | struct list_head pages; | ||
464 | struct work_struct work; | ||
465 | unsigned int nr_iov; | ||
466 | struct kvec iov[1]; | ||
467 | }; | ||
468 | |||
469 | struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages); | ||
470 | void cifs_readdata_free(struct cifs_readdata *rdata); | ||
471 | int cifs_async_readv(struct cifs_readdata *rdata); | ||
472 | |||
443 | /* asynchronous write support */ | 473 | /* asynchronous write support */ |
444 | struct cifs_writedata { | 474 | struct cifs_writedata { |
445 | struct kref refcount; | 475 | struct kref refcount; |
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index aac37d99a487..6600aa2d2ef3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
@@ -33,6 +33,8 @@ | |||
33 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
34 | #include <linux/posix_acl_xattr.h> | 34 | #include <linux/posix_acl_xattr.h> |
35 | #include <linux/pagemap.h> | 35 | #include <linux/pagemap.h> |
36 | #include <linux/swap.h> | ||
37 | #include <linux/task_io_accounting_ops.h> | ||
36 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
37 | #include "cifspdu.h" | 39 | #include "cifspdu.h" |
38 | #include "cifsglob.h" | 40 | #include "cifsglob.h" |
@@ -40,6 +42,7 @@ | |||
40 | #include "cifsproto.h" | 42 | #include "cifsproto.h" |
41 | #include "cifs_unicode.h" | 43 | #include "cifs_unicode.h" |
42 | #include "cifs_debug.h" | 44 | #include "cifs_debug.h" |
45 | #include "fscache.h" | ||
43 | 46 | ||
44 | #ifdef CONFIG_CIFS_POSIX | 47 | #ifdef CONFIG_CIFS_POSIX |
45 | static struct { | 48 | static struct { |
@@ -83,6 +86,9 @@ static struct { | |||
83 | #endif /* CONFIG_CIFS_WEAK_PW_HASH */ | 86 | #endif /* CONFIG_CIFS_WEAK_PW_HASH */ |
84 | #endif /* CIFS_POSIX */ | 87 | #endif /* CIFS_POSIX */ |
85 | 88 | ||
89 | /* Forward declarations */ | ||
90 | static void cifs_readv_complete(struct work_struct *work); | ||
91 | |||
86 | /* Mark as invalid, all open files on tree connections since they | 92 | /* Mark as invalid, all open files on tree connections since they |
87 | were closed when session to server was lost */ | 93 | were closed when session to server was lost */ |
88 | static void mark_open_files_invalid(struct cifs_tcon *pTcon) | 94 | static void mark_open_files_invalid(struct cifs_tcon *pTcon) |
@@ -453,8 +459,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) | |||
453 | } | 459 | } |
454 | server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); | 460 | server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); |
455 | server->maxReq = le16_to_cpu(rsp->MaxMpxCount); | 461 | server->maxReq = le16_to_cpu(rsp->MaxMpxCount); |
456 | server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), | 462 | server->maxBuf = le16_to_cpu(rsp->MaxBufSize); |
457 | (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); | ||
458 | server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); | 463 | server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); |
459 | /* even though we do not use raw we might as well set this | 464 | /* even though we do not use raw we might as well set this |
460 | accurately, in case we ever find a need for it */ | 465 | accurately, in case we ever find a need for it */ |
@@ -561,8 +566,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) | |||
561 | little endian */ | 566 | little endian */ |
562 | server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); | 567 | server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); |
563 | /* probably no need to store and check maxvcs */ | 568 | /* probably no need to store and check maxvcs */ |
564 | server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), | 569 | server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); |
565 | (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); | ||
566 | server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); | 570 | server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); |
567 | cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); | 571 | cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); |
568 | server->capabilities = le32_to_cpu(pSMBr->Capabilities); | 572 | server->capabilities = le32_to_cpu(pSMBr->Capabilities); |
@@ -739,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server) | |||
739 | iov.iov_base = smb; | 743 | iov.iov_base = smb; |
740 | iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; | 744 | iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; |
741 | 745 | ||
742 | rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true); | 746 | rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback, |
747 | server, true); | ||
743 | if (rc) | 748 | if (rc) |
744 | cFYI(1, "Echo request failed: %d", rc); | 749 | cFYI(1, "Echo request failed: %d", rc); |
745 | 750 | ||
@@ -1376,6 +1381,359 @@ openRetry: | |||
1376 | return rc; | 1381 | return rc; |
1377 | } | 1382 | } |
1378 | 1383 | ||
1384 | struct cifs_readdata * | ||
1385 | cifs_readdata_alloc(unsigned int nr_pages) | ||
1386 | { | ||
1387 | struct cifs_readdata *rdata; | ||
1388 | |||
1389 | /* readdata + 1 kvec for each page */ | ||
1390 | rdata = kzalloc(sizeof(*rdata) + | ||
1391 | sizeof(struct kvec) * nr_pages, GFP_KERNEL); | ||
1392 | if (rdata != NULL) { | ||
1393 | INIT_WORK(&rdata->work, cifs_readv_complete); | ||
1394 | INIT_LIST_HEAD(&rdata->pages); | ||
1395 | } | ||
1396 | return rdata; | ||
1397 | } | ||
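Note the sizing arithmetic: the struct already declares iov[1], so sizeof(*rdata) + nr_pages * sizeof(struct kvec) yields nr_pages + 1 kvec slots in total: slot 0 is reserved for the response header and one slot serves each page, matching how cifs_readv_receive marshals the array below.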
1398 | |||
1399 | void | ||
1400 | cifs_readdata_free(struct cifs_readdata *rdata) | ||
1401 | { | ||
1402 | cifsFileInfo_put(rdata->cfile); | ||
1403 | kfree(rdata); | ||
1404 | } | ||
1405 | |||
1406 | /* | ||
1407 | * Discard any remaining data in the current SMB. To do this, we borrow the | ||
1408 | * current bigbuf. | ||
1409 | */ | ||
1410 | static int | ||
1411 | cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) | ||
1412 | { | ||
1413 | READ_RSP *rsp = (READ_RSP *)server->smallbuf; | ||
1414 | unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length); | ||
1415 | int remaining = rfclen + 4 - server->total_read; | ||
1416 | struct cifs_readdata *rdata = mid->callback_data; | ||
1417 | |||
1418 | while (remaining > 0) { | ||
1419 | int length; | ||
1420 | |||
1421 | length = cifs_read_from_socket(server, server->bigbuf, | ||
1422 | min_t(unsigned int, remaining, | ||
1423 | CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)); | ||
1424 | if (length < 0) | ||
1425 | return length; | ||
1426 | server->total_read += length; | ||
1427 | remaining -= length; | ||
1428 | } | ||
1429 | |||
1430 | dequeue_mid(mid, rdata->result); | ||
1431 | return 0; | ||
1432 | } | ||
1433 | |||
1434 | static int | ||
1435 | cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) | ||
1436 | { | ||
1437 | int length, len; | ||
1438 | unsigned int data_offset, remaining, data_len; | ||
1439 | struct cifs_readdata *rdata = mid->callback_data; | ||
1440 | READ_RSP *rsp = (READ_RSP *)server->smallbuf; | ||
1441 | unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4; | ||
1442 | u64 eof; | ||
1443 | pgoff_t eof_index; | ||
1444 | struct page *page, *tpage; | ||
1445 | |||
1446 | cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__, | ||
1447 | mid->mid, rdata->offset, rdata->bytes); | ||
1448 | |||
1449 | /* | ||
1450 | * read the rest of READ_RSP header (sans Data array), or whatever we | ||
1451 | * can if there's not enough data. At this point, we've read down to | ||
1452 | * the Mid. | ||
1453 | */ | ||
1454 | len = min_t(unsigned int, rfclen, sizeof(*rsp)) - | ||
1455 | sizeof(struct smb_hdr) + 1; | ||
1456 | |||
1457 | rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1; | ||
1458 | rdata->iov[0].iov_len = len; | ||
1459 | |||
1460 | length = cifs_readv_from_socket(server, rdata->iov, 1, len); | ||
1461 | if (length < 0) | ||
1462 | return length; | ||
1463 | server->total_read += length; | ||
1464 | |||
1465 | /* Was the SMB read successful? */ | ||
1466 | rdata->result = map_smb_to_linux_error(&rsp->hdr, false); | ||
1467 | if (rdata->result != 0) { | ||
1468 | cFYI(1, "%s: server returned error %d", __func__, | ||
1469 | rdata->result); | ||
1470 | return cifs_readv_discard(server, mid); | ||
1471 | } | ||
1472 | |||
1473 | /* Is there enough to get to the rest of the READ_RSP header? */ | ||
1474 | if (server->total_read < sizeof(READ_RSP)) { | ||
1475 | cFYI(1, "%s: server returned short header. got=%u expected=%zu", | ||
1476 | __func__, server->total_read, sizeof(READ_RSP)); | ||
1477 | rdata->result = -EIO; | ||
1478 | return cifs_readv_discard(server, mid); | ||
1479 | } | ||
1480 | |||
1481 | data_offset = le16_to_cpu(rsp->DataOffset) + 4; | ||
1482 | if (data_offset < server->total_read) { | ||
1483 | /* | ||
1484 | * win2k8 sometimes sends an offset of 0 when the read | ||
1485 | * is beyond the EOF. Treat it as if the data starts just after | ||
1486 | * the header. | ||
1487 | */ | ||
1488 | cFYI(1, "%s: data offset (%u) inside read response header", | ||
1489 | __func__, data_offset); | ||
1490 | data_offset = server->total_read; | ||
1491 | } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { | ||
1492 | /* data_offset is beyond the end of smallbuf */ | ||
1493 | cFYI(1, "%s: data offset (%u) beyond end of smallbuf", | ||
1494 | __func__, data_offset); | ||
1495 | rdata->result = -EIO; | ||
1496 | return cifs_readv_discard(server, mid); | ||
1497 | } | ||
1498 | |||
1499 | cFYI(1, "%s: total_read=%u data_offset=%u", __func__, | ||
1500 | server->total_read, data_offset); | ||
1501 | |||
1502 | len = data_offset - server->total_read; | ||
1503 | if (len > 0) { | ||
1504 | /* read any junk before data into the rest of smallbuf */ | ||
1505 | rdata->iov[0].iov_base = server->smallbuf + server->total_read; | ||
1506 | rdata->iov[0].iov_len = len; | ||
1507 | length = cifs_readv_from_socket(server, rdata->iov, 1, len); | ||
1508 | if (length < 0) | ||
1509 | return length; | ||
1510 | server->total_read += length; | ||
1511 | } | ||
1512 | |||
1513 | /* set up first iov for signature check */ | ||
1514 | rdata->iov[0].iov_base = server->smallbuf; | ||
1515 | rdata->iov[0].iov_len = server->total_read; | ||
1516 | cFYI(1, "0: iov_base=%p iov_len=%zu", | ||
1517 | rdata->iov[0].iov_base, rdata->iov[0].iov_len); | ||
1518 | |||
1519 | /* how much data is in the response? */ | ||
1520 | data_len = le16_to_cpu(rsp->DataLengthHigh) << 16; | ||
1521 | data_len += le16_to_cpu(rsp->DataLength); | ||
1522 | if (data_offset + data_len > rfclen) { | ||
1523 | /* data_len is corrupt -- discard frame */ | ||
1524 | rdata->result = -EIO; | ||
1525 | return cifs_readv_discard(server, mid); | ||
1526 | } | ||
1527 | |||
1528 | /* marshal up the page array */ | ||
1529 | len = 0; | ||
1530 | remaining = data_len; | ||
1531 | rdata->nr_iov = 1; | ||
1532 | |||
1533 | /* determine the eof that the server (probably) has */ | ||
1534 | eof = CIFS_I(rdata->mapping->host)->server_eof; | ||
1535 | eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; | ||
1536 | cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); | ||
1537 | |||
1538 | list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { | ||
1539 | if (remaining >= PAGE_CACHE_SIZE) { | ||
1540 | /* enough data to fill the page */ | ||
1541 | rdata->iov[rdata->nr_iov].iov_base = kmap(page); | ||
1542 | rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE; | ||
1543 | cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", | ||
1544 | rdata->nr_iov, page->index, | ||
1545 | rdata->iov[rdata->nr_iov].iov_base, | ||
1546 | rdata->iov[rdata->nr_iov].iov_len); | ||
1547 | ++rdata->nr_iov; | ||
1548 | len += PAGE_CACHE_SIZE; | ||
1549 | remaining -= PAGE_CACHE_SIZE; | ||
1550 | } else if (remaining > 0) { | ||
1551 | /* enough for partial page, fill and zero the rest */ | ||
1552 | rdata->iov[rdata->nr_iov].iov_base = kmap(page); | ||
1553 | rdata->iov[rdata->nr_iov].iov_len = remaining; | ||
1554 | cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", | ||
1555 | rdata->nr_iov, page->index, | ||
1556 | rdata->iov[rdata->nr_iov].iov_base, | ||
1557 | rdata->iov[rdata->nr_iov].iov_len); | ||
1558 | memset(rdata->iov[rdata->nr_iov].iov_base + remaining, | ||
1559 | '\0', PAGE_CACHE_SIZE - remaining); | ||
1560 | ++rdata->nr_iov; | ||
1561 | len += remaining; | ||
1562 | remaining = 0; | ||
1563 | } else if (page->index > eof_index) { | ||
1564 | /* | ||
1565 | * The VFS will not try to do readahead past the | ||
1566 | * i_size, but it's possible that we have outstanding | ||
1567 | * writes with gaps in the middle and the i_size hasn't | ||
1568 | * caught up yet. Populate those with zeroed out pages | ||
1569 | * to prevent the VFS from repeatedly attempting to | ||
1570 | * fill them until the writes are flushed. | ||
1571 | */ | ||
1572 | zero_user(page, 0, PAGE_CACHE_SIZE); | ||
1573 | list_del(&page->lru); | ||
1574 | lru_cache_add_file(page); | ||
1575 | flush_dcache_page(page); | ||
1576 | SetPageUptodate(page); | ||
1577 | unlock_page(page); | ||
1578 | page_cache_release(page); | ||
1579 | } else { | ||
1580 | /* no need to hold page hostage */ | ||
1581 | list_del(&page->lru); | ||
1582 | lru_cache_add_file(page); | ||
1583 | unlock_page(page); | ||
1584 | page_cache_release(page); | ||
1585 | } | ||
1586 | } | ||
1587 | |||
1588 | /* issue the read if we have any iovecs left to fill */ | ||
1589 | if (rdata->nr_iov > 1) { | ||
1590 | length = cifs_readv_from_socket(server, &rdata->iov[1], | ||
1591 | rdata->nr_iov - 1, len); | ||
1592 | if (length < 0) | ||
1593 | return length; | ||
1594 | server->total_read += length; | ||
1595 | } else { | ||
1596 | length = 0; | ||
1597 | } | ||
1598 | |||
1599 | rdata->bytes = length; | ||
1600 | |||
1601 | cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read, | ||
1602 | rfclen, remaining); | ||
1603 | |||
1604 | /* discard anything left over */ | ||
1605 | if (server->total_read < rfclen) | ||
1606 | return cifs_readv_discard(server, mid); | ||
1607 | |||
1608 | dequeue_mid(mid, false); | ||
1609 | return length; | ||
1610 | } | ||
1611 | |||
1612 | static void | ||
1613 | cifs_readv_complete(struct work_struct *work) | ||
1614 | { | ||
1615 | struct cifs_readdata *rdata = container_of(work, | ||
1616 | struct cifs_readdata, work); | ||
1617 | struct page *page, *tpage; | ||
1618 | |||
1619 | list_for_each_entry_safe(page, tpage, &rdata->pages, lru) { | ||
1620 | list_del(&page->lru); | ||
1621 | lru_cache_add_file(page); | ||
1622 | |||
1623 | if (rdata->result == 0) { | ||
1624 | kunmap(page); | ||
1625 | flush_dcache_page(page); | ||
1626 | SetPageUptodate(page); | ||
1627 | } | ||
1628 | |||
1629 | unlock_page(page); | ||
1630 | |||
1631 | if (rdata->result == 0) | ||
1632 | cifs_readpage_to_fscache(rdata->mapping->host, page); | ||
1633 | |||
1634 | page_cache_release(page); | ||
1635 | } | ||
1636 | cifs_readdata_free(rdata); | ||
1637 | } | ||
1638 | |||
1639 | static void | ||
1640 | cifs_readv_callback(struct mid_q_entry *mid) | ||
1641 | { | ||
1642 | struct cifs_readdata *rdata = mid->callback_data; | ||
1643 | struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); | ||
1644 | struct TCP_Server_Info *server = tcon->ses->server; | ||
1645 | |||
1646 | cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__, | ||
1647 | mid->mid, mid->midState, rdata->result, rdata->bytes); | ||
1648 | |||
1649 | switch (mid->midState) { | ||
1650 | case MID_RESPONSE_RECEIVED: | ||
1651 | /* result already set, check signature */ | ||
1652 | if (server->sec_mode & | ||
1653 | (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { | ||
1654 | if (cifs_verify_signature(rdata->iov, rdata->nr_iov, | ||
1655 | server, mid->sequence_number + 1)) | ||
1656 | cERROR(1, "Unexpected SMB signature"); | ||
1657 | } | ||
1658 | /* FIXME: should this be counted toward the initiating task? */ | ||
1659 | task_io_account_read(rdata->bytes); | ||
1660 | cifs_stats_bytes_read(tcon, rdata->bytes); | ||
1661 | break; | ||
1662 | case MID_REQUEST_SUBMITTED: | ||
1663 | case MID_RETRY_NEEDED: | ||
1664 | rdata->result = -EAGAIN; | ||
1665 | break; | ||
1666 | default: | ||
1667 | rdata->result = -EIO; | ||
1668 | } | ||
1669 | |||
1670 | queue_work(system_nrt_wq, &rdata->work); | ||
1671 | DeleteMidQEntry(mid); | ||
1672 | atomic_dec(&server->inFlight); | ||
1673 | wake_up(&server->request_q); | ||
1674 | } | ||
1675 | |||
1676 | /* cifs_async_readv - send an async read, and set up mid to handle result */ | ||
1677 | int | ||
1678 | cifs_async_readv(struct cifs_readdata *rdata) | ||
1679 | { | ||
1680 | int rc; | ||
1681 | READ_REQ *smb = NULL; | ||
1682 | int wct; | ||
1683 | struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); | ||
1684 | |||
1685 | cFYI(1, "%s: offset=%llu bytes=%u", __func__, | ||
1686 | rdata->offset, rdata->bytes); | ||
1687 | |||
1688 | if (tcon->ses->capabilities & CAP_LARGE_FILES) | ||
1689 | wct = 12; | ||
1690 | else { | ||
1691 | wct = 10; /* old style read */ | ||
1692 | if ((rdata->offset >> 32) > 0) { | ||
1693 | /* can not handle this big offset for old */ | ||
1694 | return -EIO; | ||
1695 | } | ||
1696 | } | ||
1697 | |||
1698 | rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&smb); | ||
1699 | if (rc) | ||
1700 | return rc; | ||
1701 | |||
1702 | smb->hdr.Pid = cpu_to_le16((__u16)rdata->pid); | ||
1703 | smb->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16)); | ||
1704 | |||
1705 | smb->AndXCommand = 0xFF; /* none */ | ||
1706 | smb->Fid = rdata->cfile->netfid; | ||
1707 | smb->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF); | ||
1708 | if (wct == 12) | ||
1709 | smb->OffsetHigh = cpu_to_le32(rdata->offset >> 32); | ||
1710 | smb->Remaining = 0; | ||
1711 | smb->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF); | ||
1712 | smb->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16); | ||
1713 | if (wct == 12) | ||
1714 | smb->ByteCount = 0; | ||
1715 | else { | ||
1716 | /* old style read */ | ||
1717 | struct smb_com_readx_req *smbr = | ||
1718 | (struct smb_com_readx_req *)smb; | ||
1719 | smbr->ByteCount = 0; | ||
1720 | } | ||
1721 | |||
1722 | /* 4 for RFC1001 length */ | ||
1723 | rdata->iov[0].iov_base = smb; | ||
1724 | rdata->iov[0].iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4; | ||
1725 | |||
1726 | rc = cifs_call_async(tcon->ses->server, rdata->iov, 1, | ||
1727 | cifs_readv_receive, cifs_readv_callback, | ||
1728 | rdata, false); | ||
1729 | |||
1730 | if (rc == 0) | ||
1731 | cifs_stats_inc(&tcon->num_reads); | ||
1732 | |||
1733 | cifs_small_buf_release(smb); | ||
1734 | return rc; | ||
1735 | } | ||
1736 | |||
1379 | int | 1737 | int |
1380 | CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, | 1738 | CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, |
1381 | char **buf, int *pbuf_type) | 1739 | char **buf, int *pbuf_type) |
@@ -1836,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata) | |||
1836 | 2194 | ||
1837 | kref_get(&wdata->refcount); | 2195 | kref_get(&wdata->refcount); |
1838 | rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, | 2196 | rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1, |
1839 | cifs_writev_callback, wdata, false); | 2197 | NULL, cifs_writev_callback, wdata, false); |
1840 | 2198 | ||
1841 | if (rc == 0) | 2199 | if (rc == 0) |
1842 | cifs_stats_inc(&tcon->num_writes); | 2200 | cifs_stats_inc(&tcon->num_writes); |
@@ -1962,10 +2320,50 @@ CIFSSMBWrite2(const int xid, struct cifs_io_parms *io_parms, | |||
1962 | return rc; | 2320 | return rc; |
1963 | } | 2321 | } |
1964 | 2322 | ||
2323 | int cifs_lockv(const int xid, struct cifs_tcon *tcon, const __u16 netfid, | ||
2324 | const __u8 lock_type, const __u32 num_unlock, | ||
2325 | const __u32 num_lock, LOCKING_ANDX_RANGE *buf) | ||
2326 | { | ||
2327 | int rc = 0; | ||
2328 | LOCK_REQ *pSMB = NULL; | ||
2329 | struct kvec iov[2]; | ||
2330 | int resp_buf_type; | ||
2331 | __u16 count; | ||
2332 | |||
2333 | cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); | ||
2334 | |||
2335 | rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); | ||
2336 | if (rc) | ||
2337 | return rc; | ||
2338 | |||
2339 | pSMB->Timeout = 0; | ||
2340 | pSMB->NumberOfLocks = cpu_to_le16(num_lock); | ||
2341 | pSMB->NumberOfUnlocks = cpu_to_le16(num_unlock); | ||
2342 | pSMB->LockType = lock_type; | ||
2343 | pSMB->AndXCommand = 0xFF; /* none */ | ||
2344 | pSMB->Fid = netfid; /* netfid stays le */ | ||
2345 | |||
2346 | count = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); | ||
2347 | inc_rfc1001_len(pSMB, count); | ||
2348 | pSMB->ByteCount = cpu_to_le16(count); | ||
2349 | |||
2350 | iov[0].iov_base = (char *)pSMB; | ||
2351 | iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4 - | ||
2352 | (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); | ||
2353 | iov[1].iov_base = (char *)buf; | ||
2354 | iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE); | ||
2355 | |||
2356 | cifs_stats_inc(&tcon->num_locks); | ||
2357 | rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); | ||
2358 | if (rc) | ||
2359 | cFYI(1, "Send error in cifs_lockv = %d", rc); | ||
2360 | |||
2361 | return rc; | ||
2362 | } | ||
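A hedged caller sketch (hypothetical; field names follow the LOCKING_ANDX_RANGE layout in cifspdu.h): release one held byte range and take another in a single LOCKING_ANDX round trip.

static int example_relock(const int xid, struct cifs_tcon *tcon,
			  __u16 netfid, __u32 netpid)
{
	LOCKING_ANDX_RANGE buf[2];

	/* buf[0] is consumed as the unlock, buf[1] as the new lock */
	buf[0].Pid = cpu_to_le16((__u16)netpid);
	buf[0].Pad = 0;
	buf[0].OffsetHigh = 0;
	buf[0].OffsetLow = cpu_to_le32(0);
	buf[0].LengthHigh = 0;
	buf[0].LengthLow = cpu_to_le32(4096);

	buf[1] = buf[0];
	buf[1].OffsetLow = cpu_to_le32(8192);	/* new range starts at 8k */

	return cifs_lockv(xid, tcon, netfid, LOCKING_ANDX_LARGE_FILES,
			  1 /* num_unlock */, 1 /* num_lock */, buf);
}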
1965 | 2363 | ||
1966 | int | 2364 | int |
1967 | CIFSSMBLock(const int xid, struct cifs_tcon *tcon, | 2365 | CIFSSMBLock(const int xid, struct cifs_tcon *tcon, |
1968 | const __u16 smb_file_id, const __u64 len, | 2366 | const __u16 smb_file_id, const __u32 netpid, const __u64 len, |
1969 | const __u64 offset, const __u32 numUnlock, | 2367 | const __u64 offset, const __u32 numUnlock, |
1970 | const __u32 numLock, const __u8 lockType, | 2368 | const __u32 numLock, const __u8 lockType, |
1971 | const bool waitFlag, const __u8 oplock_level) | 2369 | const bool waitFlag, const __u8 oplock_level) |
@@ -2001,7 +2399,7 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, | |||
2001 | pSMB->Fid = smb_file_id; /* netfid stays le */ | 2399 | pSMB->Fid = smb_file_id; /* netfid stays le */ |
2002 | 2400 | ||
2003 | if ((numLock != 0) || (numUnlock != 0)) { | 2401 | if ((numLock != 0) || (numUnlock != 0)) { |
2004 | pSMB->Locks[0].Pid = cpu_to_le16(current->tgid); | 2402 | pSMB->Locks[0].Pid = cpu_to_le16(netpid); |
2005 | /* BB where to store pid high? */ | 2403 | /* BB where to store pid high? */ |
2006 | pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); | 2404 | pSMB->Locks[0].LengthLow = cpu_to_le32((u32)len); |
2007 | pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); | 2405 | pSMB->Locks[0].LengthHigh = cpu_to_le32((u32)(len>>32)); |
@@ -2035,9 +2433,9 @@ CIFSSMBLock(const int xid, struct cifs_tcon *tcon, | |||
2035 | 2433 | ||
2036 | int | 2434 | int |
2037 | CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, | 2435 | CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, |
2038 | const __u16 smb_file_id, const int get_flag, const __u64 len, | 2436 | const __u16 smb_file_id, const __u32 netpid, const int get_flag, |
2039 | struct file_lock *pLockData, const __u16 lock_type, | 2437 | const __u64 len, struct file_lock *pLockData, |
2040 | const bool waitFlag) | 2438 | const __u16 lock_type, const bool waitFlag) |
2041 | { | 2439 | { |
2042 | struct smb_com_transaction2_sfi_req *pSMB = NULL; | 2440 | struct smb_com_transaction2_sfi_req *pSMB = NULL; |
2043 | struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; | 2441 | struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; |
@@ -2095,7 +2493,7 @@ CIFSSMBPosixLock(const int xid, struct cifs_tcon *tcon, | |||
2095 | } else | 2493 | } else |
2096 | pSMB->Timeout = 0; | 2494 | pSMB->Timeout = 0; |
2097 | 2495 | ||
2098 | parm_data->pid = cpu_to_le32(current->tgid); | 2496 | parm_data->pid = cpu_to_le32(netpid); |
2099 | parm_data->start = cpu_to_le64(pLockData->fl_start); | 2497 | parm_data->start = cpu_to_le64(pLockData->fl_start); |
2100 | parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ | 2498 | parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ |
2101 | 2499 | ||
@@ -2812,8 +3210,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifs_tcon *tcon, | |||
2812 | pSMB->TotalDataCount = 0; | 3210 | pSMB->TotalDataCount = 0; |
2813 | pSMB->MaxParameterCount = cpu_to_le32(2); | 3211 | pSMB->MaxParameterCount = cpu_to_le32(2); |
2814 | /* BB find exact data count max from sess structure BB */ | 3212 | /* BB find exact data count max from sess structure BB */ |
2815 | pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - | 3213 | pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); |
2816 | MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); | ||
2817 | pSMB->MaxSetupCount = 4; | 3214 | pSMB->MaxSetupCount = 4; |
2818 | pSMB->Reserved = 0; | 3215 | pSMB->Reserved = 0; |
2819 | pSMB->ParameterOffset = 0; | 3216 | pSMB->ParameterOffset = 0; |
@@ -3306,8 +3703,7 @@ smb_init_nttransact(const __u16 sub_command, const int setup_count, | |||
3306 | pSMB->Reserved = 0; | 3703 | pSMB->Reserved = 0; |
3307 | pSMB->TotalParameterCount = cpu_to_le32(parm_len); | 3704 | pSMB->TotalParameterCount = cpu_to_le32(parm_len); |
3308 | pSMB->TotalDataCount = 0; | 3705 | pSMB->TotalDataCount = 0; |
3309 | pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - | 3706 | pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); |
3310 | MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); | ||
3311 | pSMB->ParameterCount = pSMB->TotalParameterCount; | 3707 | pSMB->ParameterCount = pSMB->TotalParameterCount; |
3312 | pSMB->DataCount = pSMB->TotalDataCount; | 3708 | pSMB->DataCount = pSMB->TotalDataCount; |
3313 | temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + | 3709 | temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + |
@@ -3467,7 +3863,7 @@ qsec_out: | |||
3467 | 3863 | ||
3468 | int | 3864 | int |
3469 | CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, | 3865 | CIFSSMBSetCIFSACL(const int xid, struct cifs_tcon *tcon, __u16 fid, |
3470 | struct cifs_ntsd *pntsd, __u32 acllen) | 3866 | struct cifs_ntsd *pntsd, __u32 acllen, int aclflag) |
3471 | { | 3867 | { |
3472 | __u16 byte_count, param_count, data_count, param_offset, data_offset; | 3868 | __u16 byte_count, param_count, data_count, param_offset, data_offset; |
3473 | int rc = 0; | 3869 | int rc = 0; |
@@ -3504,7 +3900,7 @@ setCifsAclRetry: | |||
3504 | 3900 | ||
3505 | pSMB->Fid = fid; /* file handle always le */ | 3901 | pSMB->Fid = fid; /* file handle always le */ |
3506 | pSMB->Reserved2 = 0; | 3902 | pSMB->Reserved2 = 0; |
3507 | pSMB->AclFlags = cpu_to_le32(CIFS_ACL_DACL); | 3903 | pSMB->AclFlags = cpu_to_le32(aclflag); |
3508 | 3904 | ||
3509 | if (pntsd && acllen) { | 3905 | if (pntsd && acllen) { |
3510 | memcpy((char *) &pSMBr->hdr.Protocol + data_offset, | 3906 | memcpy((char *) &pSMBr->hdr.Protocol + data_offset, |
@@ -3977,8 +4373,7 @@ findFirstRetry: | |||
3977 | params = 12 + name_len /* includes null */ ; | 4373 | params = 12 + name_len /* includes null */ ; |
3978 | pSMB->TotalDataCount = 0; /* no EAs */ | 4374 | pSMB->TotalDataCount = 0; /* no EAs */ |
3979 | pSMB->MaxParameterCount = cpu_to_le16(10); | 4375 | pSMB->MaxParameterCount = cpu_to_le16(10); |
3980 | pSMB->MaxDataCount = cpu_to_le16((tcon->ses->server->maxBuf - | 4376 | pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); |
3981 | MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); | ||
3982 | pSMB->MaxSetupCount = 0; | 4377 | pSMB->MaxSetupCount = 0; |
3983 | pSMB->Reserved = 0; | 4378 | pSMB->Reserved = 0; |
3984 | pSMB->Flags = 0; | 4379 | pSMB->Flags = 0; |
@@ -4052,8 +4447,7 @@ findFirstRetry: | |||
4052 | psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + | 4447 | psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + |
4053 | psrch_inf->entries_in_buffer; | 4448 | psrch_inf->entries_in_buffer; |
4054 | lnoff = le16_to_cpu(parms->LastNameOffset); | 4449 | lnoff = le16_to_cpu(parms->LastNameOffset); |
4055 | if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < | 4450 | if (CIFSMaxBufSize < lnoff) { |
4056 | lnoff) { | ||
4057 | cERROR(1, "ignoring corrupt resume name"); | 4451 | cERROR(1, "ignoring corrupt resume name"); |
4058 | psrch_inf->last_entry = NULL; | 4452 | psrch_inf->last_entry = NULL; |
4059 | return rc; | 4453 | return rc; |
@@ -4079,7 +4473,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, | |||
4079 | T2_FNEXT_RSP_PARMS *parms; | 4473 | T2_FNEXT_RSP_PARMS *parms; |
4080 | char *response_data; | 4474 | char *response_data; |
4081 | int rc = 0; | 4475 | int rc = 0; |
4082 | int bytes_returned, name_len; | 4476 | int bytes_returned; |
4477 | unsigned int name_len; | ||
4083 | __u16 params, byte_count; | 4478 | __u16 params, byte_count; |
4084 | 4479 | ||
4085 | cFYI(1, "In FindNext"); | 4480 | cFYI(1, "In FindNext"); |
@@ -4096,9 +4491,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, | |||
4096 | byte_count = 0; | 4491 | byte_count = 0; |
4097 | pSMB->TotalDataCount = 0; /* no EAs */ | 4492 | pSMB->TotalDataCount = 0; /* no EAs */ |
4098 | pSMB->MaxParameterCount = cpu_to_le16(8); | 4493 | pSMB->MaxParameterCount = cpu_to_le16(8); |
4099 | pSMB->MaxDataCount = | 4494 | pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize & 0xFFFFFF00); |
4100 | cpu_to_le16((tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & | ||
4101 | 0xFFFFFF00); | ||
4102 | pSMB->MaxSetupCount = 0; | 4495 | pSMB->MaxSetupCount = 0; |
4103 | pSMB->Reserved = 0; | 4496 | pSMB->Reserved = 0; |
4104 | pSMB->Flags = 0; | 4497 | pSMB->Flags = 0; |
@@ -4180,8 +4573,7 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, | |||
4180 | psrch_inf->index_of_last_entry += | 4573 | psrch_inf->index_of_last_entry += |
4181 | psrch_inf->entries_in_buffer; | 4574 | psrch_inf->entries_in_buffer; |
4182 | lnoff = le16_to_cpu(parms->LastNameOffset); | 4575 | lnoff = le16_to_cpu(parms->LastNameOffset); |
4183 | if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < | 4576 | if (CIFSMaxBufSize < lnoff) { |
4184 | lnoff) { | ||
4185 | cERROR(1, "ignoring corrupt resume name"); | 4577 | cERROR(1, "ignoring corrupt resume name"); |
4186 | psrch_inf->last_entry = NULL; | 4578 | psrch_inf->last_entry = NULL; |
4187 | return rc; | 4579 | return rc; |
@@ -5839,7 +6231,7 @@ QAllEAsRetry: | |||
5839 | 6231 | ||
5840 | if (ea_name) { | 6232 | if (ea_name) { |
5841 | if (ea_name_len == name_len && | 6233 | if (ea_name_len == name_len && |
5842 | strncmp(ea_name, temp_ptr, name_len) == 0) { | 6234 | memcmp(ea_name, temp_ptr, name_len) == 0) { |
5843 | temp_ptr += name_len + 1; | 6235 | temp_ptr += name_len + 1; |
5844 | rc = value_len; | 6236 | rc = value_len; |
5845 | if (buf_size == 0) | 6237 | if (buf_size == 0) |
@@ -6034,12 +6426,7 @@ int CIFSSMBNotify(const int xid, struct cifs_tcon *tcon, | |||
6034 | pSMB->TotalParameterCount = 0 ; | 6426 | pSMB->TotalParameterCount = 0 ; |
6035 | pSMB->TotalDataCount = 0; | 6427 | pSMB->TotalDataCount = 0; |
6036 | pSMB->MaxParameterCount = cpu_to_le32(2); | 6428 | pSMB->MaxParameterCount = cpu_to_le32(2); |
6037 | /* BB find exact data count max from sess structure BB */ | 6429 | pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); |
6038 | pSMB->MaxDataCount = 0; /* same in little endian or be */ | ||
6039 | /* BB VERIFY verify which is correct for above BB */ | ||
6040 | pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - | ||
6041 | MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); | ||
6042 | |||
6043 | pSMB->MaxSetupCount = 4; | 6430 | pSMB->MaxSetupCount = 4; |
6044 | pSMB->Reserved = 0; | 6431 | pSMB->Reserved = 0; |
6045 | pSMB->ParameterOffset = 0; | 6432 | pSMB->ParameterOffset = 0; |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 80c2e3add3a2..d545a95c30ed 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -181,7 +181,7 @@ cifs_reconnect(struct TCP_Server_Info *server) | |||
181 | -EINVAL = invalid transact2 | 181 | -EINVAL = invalid transact2 |
182 | 182 | ||
183 | */ | 183 | */ |
184 | static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) | 184 | static int check2ndT2(struct smb_hdr *pSMB) |
185 | { | 185 | { |
186 | struct smb_t2_rsp *pSMBt; | 186 | struct smb_t2_rsp *pSMBt; |
187 | int remaining; | 187 | int remaining; |
@@ -214,9 +214,9 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize) | |||
214 | 214 | ||
215 | cFYI(1, "missing %d bytes from transact2, check next response", | 215 | cFYI(1, "missing %d bytes from transact2, check next response", |
216 | remaining); | 216 | remaining); |
217 | if (total_data_size > maxBufSize) { | 217 | if (total_data_size > CIFSMaxBufSize) { |
218 | cERROR(1, "TotalDataSize %d is over maximum buffer %d", | 218 | cERROR(1, "TotalDataSize %d is over maximum buffer %d", |
219 | total_data_size, maxBufSize); | 219 | total_data_size, CIFSMaxBufSize); |
220 | return -EINVAL; | 220 | return -EINVAL; |
221 | } | 221 | } |
222 | return remaining; | 222 | return remaining; |
@@ -320,27 +320,24 @@ requeue_echo: | |||
320 | } | 320 | } |
321 | 321 | ||
322 | static bool | 322 | static bool |
323 | allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, | 323 | allocate_buffers(struct TCP_Server_Info *server) |
324 | bool is_large_buf) | ||
325 | { | 324 | { |
326 | char *bbuf = *bigbuf, *sbuf = *smallbuf; | 325 | if (!server->bigbuf) { |
327 | 326 | server->bigbuf = (char *)cifs_buf_get(); | |
328 | if (bbuf == NULL) { | 327 | if (!server->bigbuf) { |
329 | bbuf = (char *)cifs_buf_get(); | ||
330 | if (!bbuf) { | ||
331 | cERROR(1, "No memory for large SMB response"); | 328 | cERROR(1, "No memory for large SMB response"); |
332 | msleep(3000); | 329 | msleep(3000); |
333 | /* retry will check if exiting */ | 330 | /* retry will check if exiting */ |
334 | return false; | 331 | return false; |
335 | } | 332 | } |
336 | } else if (is_large_buf) { | 333 | } else if (server->large_buf) { |
337 | /* we are reusing a dirty large buf, clear its start */ | 334 | /* we are reusing a dirty large buf, clear its start */ |
338 | memset(bbuf, 0, size); | 335 | memset(server->bigbuf, 0, sizeof(struct smb_hdr)); |
339 | } | 336 | } |
340 | 337 | ||
341 | if (sbuf == NULL) { | 338 | if (!server->smallbuf) { |
342 | sbuf = (char *)cifs_small_buf_get(); | 339 | server->smallbuf = (char *)cifs_small_buf_get(); |
343 | if (!sbuf) { | 340 | if (!server->smallbuf) { |
344 | cERROR(1, "No memory for SMB response"); | 341 | cERROR(1, "No memory for SMB response"); |
345 | msleep(1000); | 342 | msleep(1000); |
346 | /* retry will check if exiting */ | 343 | /* retry will check if exiting */ |
@@ -349,36 +346,116 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, | |||
349 | /* beginning of smb buffer is cleared in our buf_get */ | 346 | /* beginning of smb buffer is cleared in our buf_get */ |
350 | } else { | 347 | } else { |
351 | /* if existing small buf clear beginning */ | 348 | /* if existing small buf clear beginning */ |
352 | memset(sbuf, 0, size); | 349 | memset(server->smallbuf, 0, sizeof(struct smb_hdr)); |
353 | } | 350 | } |
354 | 351 | ||
355 | *bigbuf = bbuf; | ||
356 | *smallbuf = sbuf; | ||
357 | |||
358 | return true; | 352 | return true; |
359 | } | 353 | } |
360 | 354 | ||
361 | static int | 355 | static bool |
362 | read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, | 356 | server_unresponsive(struct TCP_Server_Info *server) |
363 | struct kvec *iov, unsigned int to_read, | 357 | { |
364 | unsigned int *ptotal_read, bool is_header_read) | 358 | if (echo_retries > 0 && server->tcpStatus == CifsGood && |
359 | time_after(jiffies, server->lstrp + | ||
360 | (echo_retries * SMB_ECHO_INTERVAL))) { | ||
361 | cERROR(1, "Server %s has not responded in %d seconds. " | ||
362 | "Reconnecting...", server->hostname, | ||
363 | (echo_retries * SMB_ECHO_INTERVAL / HZ)); | ||
364 | cifs_reconnect(server); | ||
365 | wake_up(&server->response_q); | ||
366 | return true; | ||
367 | } | ||
368 | |||
369 | return false; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * kvec_array_init - clone a kvec array, and advance into it | ||
374 | * @new: pointer to memory for cloned array | ||
375 | * @iov: pointer to original array | ||
376 | * @nr_segs: number of members in original array | ||
377 | * @bytes: number of bytes to advance into the cloned array | ||
378 | * | ||
379 | * This function will copy the array provided in iov to a section of memory | ||
380 | * and advance the specified number of bytes into the new array. It returns | ||
381 | * the number of segments in the new array. "new" must be at least as big as | ||
382 | * the original iov array. | ||
383 | */ | ||
384 | static unsigned int | ||
385 | kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs, | ||
386 | size_t bytes) | ||
387 | { | ||
388 | size_t base = 0; | ||
389 | |||
390 | while (bytes || !iov->iov_len) { | ||
391 | int copy = min(bytes, iov->iov_len); | ||
392 | |||
393 | bytes -= copy; | ||
394 | base += copy; | ||
395 | if (iov->iov_len == base) { | ||
396 | iov++; | ||
397 | nr_segs--; | ||
398 | base = 0; | ||
399 | } | ||
400 | } | ||
401 | memcpy(new, iov, sizeof(*iov) * nr_segs); | ||
402 | new->iov_base += base; | ||
403 | new->iov_len -= base; | ||
404 | return nr_segs; | ||
405 | } | ||
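A short worked example of the advance logic (illustrative):

/*
 * iov_orig = { { hdr, 4 }, { page, 4096 } }, bytes = 6:
 * the 4-byte header segment is fully consumed, then base advances
 * 2 bytes into the page segment, so:
 *
 *	struct kvec new[2];
 *	unsigned int segs = kvec_array_init(new, orig, 2, 6);
 *	// segs == 1, new[0].iov_base == page + 2, new[0].iov_len == 4094
 */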
406 | |||
407 | static struct kvec * | ||
408 | get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs) | ||
409 | { | ||
410 | struct kvec *new_iov; | ||
411 | |||
412 | if (server->iov && nr_segs <= server->nr_iov) | ||
413 | return server->iov; | ||
414 | |||
415 | /* not big enough -- allocate a new one and release the old */ | ||
416 | new_iov = kmalloc(sizeof(*new_iov) * nr_segs, GFP_NOFS); | ||
417 | if (new_iov) { | ||
418 | kfree(server->iov); | ||
419 | server->iov = new_iov; | ||
420 | server->nr_iov = nr_segs; | ||
421 | } | ||
422 | return new_iov; | ||
423 | } | ||
424 | |||
425 | int | ||
426 | cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, | ||
427 | unsigned int nr_segs, unsigned int to_read) | ||
365 | { | 428 | { |
366 | int length, rc = 0; | 429 | int length = 0; |
367 | unsigned int total_read; | 430 | int total_read; |
368 | char *buf = iov->iov_base; | 431 | unsigned int segs; |
432 | struct msghdr smb_msg; | ||
433 | struct kvec *iov; | ||
434 | |||
435 | iov = get_server_iovec(server, nr_segs); | ||
436 | if (!iov) | ||
437 | return -ENOMEM; | ||
438 | |||
439 | smb_msg.msg_control = NULL; | ||
440 | smb_msg.msg_controllen = 0; | ||
441 | |||
442 | for (total_read = 0; to_read; total_read += length, to_read -= length) { | ||
443 | if (server_unresponsive(server)) { | ||
444 | total_read = -EAGAIN; | ||
445 | break; | ||
446 | } | ||
447 | |||
448 | segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); | ||
449 | |||
450 | length = kernel_recvmsg(server->ssocket, &smb_msg, | ||
451 | iov, segs, to_read, 0); | ||
369 | 452 | ||
370 | for (total_read = 0; total_read < to_read; total_read += length) { | ||
371 | length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, | ||
372 | to_read - total_read, 0); | ||
373 | if (server->tcpStatus == CifsExiting) { | 453 | if (server->tcpStatus == CifsExiting) { |
374 | /* then will exit */ | 454 | total_read = -ESHUTDOWN; |
375 | rc = 2; | ||
376 | break; | 455 | break; |
377 | } else if (server->tcpStatus == CifsNeedReconnect) { | 456 | } else if (server->tcpStatus == CifsNeedReconnect) { |
378 | cifs_reconnect(server); | 457 | cifs_reconnect(server); |
379 | /* Reconnect wakes up rspns q */ | 458 | total_read = -EAGAIN; |
380 | /* Now we will reread sock */ | ||
381 | rc = 1; | ||
382 | break; | 459 | break; |
383 | } else if (length == -ERESTARTSYS || | 460 | } else if (length == -ERESTARTSYS || |
384 | length == -EAGAIN || | 461 | length == -EAGAIN || |
@@ -390,56 +467,54 @@ read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, | |||
390 | */ | 467 | */ |
391 | usleep_range(1000, 2000); | 468 | usleep_range(1000, 2000); |
392 | length = 0; | 469 | length = 0; |
393 | if (!is_header_read) | 470 | continue; |
394 | continue; | ||
395 | /* Special handling for header read */ | ||
396 | if (total_read) { | ||
397 | iov->iov_base = (to_read - total_read) + | ||
398 | buf; | ||
399 | iov->iov_len = to_read - total_read; | ||
400 | smb_msg->msg_control = NULL; | ||
401 | smb_msg->msg_controllen = 0; | ||
402 | rc = 3; | ||
403 | } else | ||
404 | rc = 1; | ||
405 | break; | ||
406 | } else if (length <= 0) { | 471 | } else if (length <= 0) { |
407 | cERROR(1, "Received no data, expecting %d", | 472 | cFYI(1, "Received no data or error: expecting %d " |
408 | to_read - total_read); | 473 | "got %d", to_read, length); |
409 | cifs_reconnect(server); | 474 | cifs_reconnect(server); |
410 | rc = 1; | 475 | total_read = -EAGAIN; |
411 | break; | 476 | break; |
412 | } | 477 | } |
413 | } | 478 | } |
479 | return total_read; | ||
480 | } | ||
414 | 481 | ||
415 | *ptotal_read = total_read; | 482 | int |
416 | return rc; | 483 | cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, |
484 | unsigned int to_read) | ||
485 | { | ||
486 | struct kvec iov; | ||
487 | |||
488 | iov.iov_base = buf; | ||
489 | iov.iov_len = to_read; | ||
490 | |||
491 | return cifs_readv_from_socket(server, &iov, 1, to_read); | ||
417 | } | 492 | } |
418 | 493 | ||
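For readers skimming the new receive path: kvec_array_init() rebases a scratch copy of the caller's kvec array past the bytes already received, so a short kernel_recvmsg() can simply be retried with the remainder. A minimal userspace sketch of that rebasing step (an assumption about its behavior, not the kernel implementation):

    #include <stddef.h>
    #include <string.h>

    struct kvec { void *iov_base; size_t iov_len; };

    /* Copy src[0..n) into dst, skipping `bytes` already-consumed bytes. */
    static unsigned int
    kvec_skip(struct kvec *dst, const struct kvec *src, unsigned int n,
              size_t bytes)
    {
            unsigned int i = 0;

            while (i < n && bytes >= src[i].iov_len)
                    bytes -= src[i++].iov_len;      /* fully consumed segs */

            memcpy(dst, src + i, (n - i) * sizeof(*dst));
            if (n - i) {
                    /* trim the partially consumed first segment */
                    dst[0].iov_base = (char *)dst[0].iov_base + bytes;
                    dst[0].iov_len -= bytes;
            }
            return n - i;                           /* segments left */
    }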
419 | static bool | 494 | static bool |
420 | check_rfc1002_header(struct TCP_Server_Info *server, char *buf) | 495 | is_smb_response(struct TCP_Server_Info *server, unsigned char type) |
421 | { | 496 | { |
422 | char temp = *buf; | ||
423 | unsigned int pdu_length = be32_to_cpu( | ||
424 | ((struct smb_hdr *)buf)->smb_buf_length); | ||
425 | |||
426 | /* | 497 | /* |
427 | * The first byte of the (big-endian) length field is | 498 | * The first byte of the (big-endian) length field is |
428 | * actually not part of the length but the frame type, | 499 | * actually not part of the length but the frame type, |
429 | * with the most common type, zero, marking regular data. | 500 | * with the most common type, zero, marking regular data. |
430 | */ | 501 | */ |
431 | if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { | 502 | switch (type) { |
432 | return false; | 503 | case RFC1002_SESSION_MESSAGE: |
433 | } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { | 504 | /* Regular SMB response */ |
434 | cFYI(1, "Good RFC 1002 session rsp"); | 505 | return true; |
435 | return false; | 506 | case RFC1002_SESSION_KEEP_ALIVE: |
436 | } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { | 507 | cFYI(1, "RFC 1002 session keep alive"); |
508 | break; | ||
509 | case RFC1002_POSITIVE_SESSION_RESPONSE: | ||
510 | cFYI(1, "RFC 1002 positive session response"); | ||
511 | break; | ||
512 | case RFC1002_NEGATIVE_SESSION_RESPONSE: | ||
437 | /* | 513 | /* |
438 | * We get this from Windows 98 instead of an error on | 514 | * We get this from Windows 98 instead of an error on |
439 | * SMB negprot response. | 515 | * SMB negprot response. |
440 | */ | 516 | */ |
441 | cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", | 517 | cFYI(1, "RFC 1002 negative session response"); |
442 | pdu_length); | ||
443 | /* give server a second to clean up */ | 518 | /* give server a second to clean up */ |
444 | msleep(1000); | 519 | msleep(1000); |
445 | /* | 520 | /* |
@@ -448,87 +523,89 @@ check_rfc1002_header(struct TCP_Server_Info *server, char *buf) | |||
448 | * is since we do not begin with RFC1001 session | 523 | * is since we do not begin with RFC1001 session |
449 | * initialize frame). | 524 | * initialize frame). |
450 | */ | 525 | */ |
451 | cifs_set_port((struct sockaddr *) | 526 | cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); |
452 | &server->dstaddr, CIFS_PORT); | ||
453 | cifs_reconnect(server); | 527 | cifs_reconnect(server); |
454 | wake_up(&server->response_q); | 528 | wake_up(&server->response_q); |
455 | return false; | 529 | break; |
456 | } else if (temp != (char) 0) { | 530 | default: |
457 | cERROR(1, "Unknown RFC 1002 frame"); | 531 | cERROR(1, "RFC 1002 unknown response type 0x%x", type); |
458 | cifs_dump_mem(" Received Data: ", buf, 4); | ||
459 | cifs_reconnect(server); | ||
460 | return false; | ||
461 | } | ||
462 | |||
463 | /* else we have an SMB response */ | ||
464 | if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || | ||
465 | (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { | ||
466 | cERROR(1, "Invalid size SMB length %d pdu_length %d", | ||
467 | 4, pdu_length+4); | ||
468 | cifs_reconnect(server); | 532 | cifs_reconnect(server); |
469 | wake_up(&server->response_q); | ||
470 | return false; | ||
471 | } | 533 | } |
472 | 534 | ||
473 | return true; | 535 | return false; |
474 | } | 536 | } |
475 | 537 | ||
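As the comment in is_smb_response() says, byte zero of the four-byte RFC1002 header is the frame type and only the remaining three bytes carry the payload length. A standalone sketch of splitting the header that way (illustrative; the driver itself reads the whole field with be32_to_cpu() and relies on the session-message type being zero):

    #include <stdint.h>

    /* type 0x00 == RFC1002_SESSION_MESSAGE, i.e. a regular SMB frame */
    static uint8_t rfc1002_type(const uint8_t hdr[4])
    {
            return hdr[0];
    }

    static uint32_t rfc1002_length(const uint8_t hdr[4])
    {
            /* big-endian 24-bit length in bytes 1..3 */
            return ((uint32_t)hdr[1] << 16) | ((uint32_t)hdr[2] << 8) | hdr[3];
    }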
476 | static struct mid_q_entry * | 538 | static struct mid_q_entry * |
477 | find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, | 539 | find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf) |
478 | int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) | ||
479 | { | 540 | { |
480 | struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; | 541 | struct mid_q_entry *mid; |
481 | 542 | ||
482 | spin_lock(&GlobalMid_Lock); | 543 | spin_lock(&GlobalMid_Lock); |
483 | list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { | 544 | list_for_each_entry(mid, &server->pending_mid_q, qhead) { |
484 | if (mid->mid != buf->Mid || | 545 | if (mid->mid == buf->Mid && |
485 | mid->midState != MID_REQUEST_SUBMITTED || | 546 | mid->midState == MID_REQUEST_SUBMITTED && |
486 | mid->command != buf->Command) | 547 | mid->command == buf->Command) { |
487 | continue; | 548 | spin_unlock(&GlobalMid_Lock); |
488 | 549 | return mid; | |
489 | if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { | ||
490 | /* We have a multipart transact2 resp */ | ||
491 | *is_multi_rsp = true; | ||
492 | if (mid->resp_buf) { | ||
493 | /* merge response - fix up 1st*/ | ||
494 | *length = coalesce_t2(buf, mid->resp_buf); | ||
495 | if (*length > 0) { | ||
496 | *length = 0; | ||
497 | mid->multiRsp = true; | ||
498 | break; | ||
499 | } | ||
500 | /* All parts received or packet is malformed. */ | ||
501 | mid->multiEnd = true; | ||
502 | goto multi_t2_fnd; | ||
503 | } | ||
504 | if (!is_large_buf) { | ||
505 | /*FIXME: switch to already allocated largebuf?*/ | ||
506 | cERROR(1, "1st trans2 resp needs bigbuf"); | ||
507 | } else { | ||
508 | /* Have first buffer */ | ||
509 | mid->resp_buf = buf; | ||
510 | mid->largeBuf = true; | ||
511 | *bigbuf = NULL; | ||
512 | } | ||
513 | break; | ||
514 | } | 550 | } |
515 | mid->resp_buf = buf; | 551 | } |
516 | mid->largeBuf = is_large_buf; | 552 | spin_unlock(&GlobalMid_Lock); |
517 | multi_t2_fnd: | 553 | return NULL; |
518 | if (*length == 0) | 554 | } |
519 | mid->midState = MID_RESPONSE_RECEIVED; | 555 | |
520 | else | 556 | void |
521 | mid->midState = MID_RESPONSE_MALFORMED; | 557 | dequeue_mid(struct mid_q_entry *mid, bool malformed) |
558 | { | ||
522 | #ifdef CONFIG_CIFS_STATS2 | 559 | #ifdef CONFIG_CIFS_STATS2 |
523 | mid->when_received = jiffies; | 560 | mid->when_received = jiffies; |
524 | #endif | 561 | #endif |
525 | list_del_init(&mid->qhead); | 562 | spin_lock(&GlobalMid_Lock); |
526 | ret = mid; | 563 | if (!malformed) |
527 | break; | 564 | mid->midState = MID_RESPONSE_RECEIVED; |
528 | } | 565 | else |
566 | mid->midState = MID_RESPONSE_MALFORMED; | ||
567 | list_del_init(&mid->qhead); | ||
529 | spin_unlock(&GlobalMid_Lock); | 568 | spin_unlock(&GlobalMid_Lock); |
569 | } | ||
530 | 570 | ||
531 | return ret; | 571 | static void |
572 | handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, | ||
573 | struct smb_hdr *buf, int malformed) | ||
574 | { | ||
575 | if (malformed == 0 && check2ndT2(buf) > 0) { | ||
576 | mid->multiRsp = true; | ||
577 | if (mid->resp_buf) { | ||
578 | /* merge response - fix up 1st*/ | ||
579 | malformed = coalesce_t2(buf, mid->resp_buf); | ||
580 | if (malformed > 0) | ||
581 | return; | ||
582 | |||
583 | /* All parts received or packet is malformed. */ | ||
584 | mid->multiEnd = true; | ||
585 | return dequeue_mid(mid, malformed); | ||
586 | } | ||
587 | if (!server->large_buf) { | ||
588 | /*FIXME: switch to already allocated largebuf?*/ | ||
589 | cERROR(1, "1st trans2 resp needs bigbuf"); | ||
590 | } else { | ||
591 | /* Have first buffer */ | ||
592 | mid->resp_buf = buf; | ||
593 | mid->largeBuf = true; | ||
594 | server->bigbuf = NULL; | ||
595 | } | ||
596 | return; | ||
597 | } | ||
598 | mid->resp_buf = buf; | ||
599 | mid->largeBuf = server->large_buf; | ||
600 | /* Was previous buf put in mpx struct for multi-rsp? */ | ||
601 | if (!mid->multiRsp) { | ||
602 | /* smb buffer will be freed by user thread */ | ||
603 | if (server->large_buf) | ||
604 | server->bigbuf = NULL; | ||
605 | else | ||
606 | server->smallbuf = NULL; | ||
607 | } | ||
608 | dequeue_mid(mid, malformed); | ||
532 | } | 609 | } |
533 | 610 | ||
534 | static void clean_demultiplex_info(struct TCP_Server_Info *server) | 611 | static void clean_demultiplex_info(struct TCP_Server_Info *server) |
@@ -618,6 +695,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) | |||
618 | } | 695 | } |
619 | 696 | ||
620 | kfree(server->hostname); | 697 | kfree(server->hostname); |
698 | kfree(server->iov); | ||
621 | kfree(server); | 699 | kfree(server); |
622 | 700 | ||
623 | length = atomic_dec_return(&tcpSesAllocCount); | 701 | length = atomic_dec_return(&tcpSesAllocCount); |
@@ -627,20 +705,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) | |||
627 | } | 705 | } |
628 | 706 | ||
629 | static int | 707 | static int |
708 | standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) | ||
709 | { | ||
710 | int length; | ||
711 | char *buf = server->smallbuf; | ||
712 | struct smb_hdr *smb_buffer = (struct smb_hdr *)buf; | ||
713 | unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); | ||
714 | |||
715 | /* make sure this will fit in a large buffer */ | ||
716 | if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { | ||
717 | cERROR(1, "SMB response too long (%u bytes)", | ||
718 | pdu_length); | ||
719 | cifs_reconnect(server); | ||
720 | wake_up(&server->response_q); | ||
721 | return -EAGAIN; | ||
722 | } | ||
723 | |||
724 | /* switch to large buffer if too big for a small one */ | ||
725 | if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { | ||
726 | server->large_buf = true; | ||
727 | memcpy(server->bigbuf, server->smallbuf, server->total_read); | ||
728 | buf = server->bigbuf; | ||
729 | smb_buffer = (struct smb_hdr *)buf; | ||
730 | } | ||
731 | |||
732 | /* now read the rest */ | ||
733 | length = cifs_read_from_socket(server, | ||
734 | buf + sizeof(struct smb_hdr) - 1, | ||
735 | pdu_length - sizeof(struct smb_hdr) + 1 + 4); | ||
736 | if (length < 0) | ||
737 | return length; | ||
738 | server->total_read += length; | ||
739 | |||
740 | dump_smb(smb_buffer, server->total_read); | ||
741 | |||
742 | /* | ||
743 | * We know that we received enough to get to the MID as we | ||
744 | * checked the pdu_length earlier. Now check to see | ||
745 | * if the rest of the header is OK. We borrow the length | ||
746 | * var for the rest of the function to avoid a new stack var. | ||
747 | * | ||
748 | * 48 bytes is enough to display the header and a little bit | ||
749 | * into the payload for debugging purposes. | ||
750 | */ | ||
751 | length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read); | ||
752 | if (length != 0) | ||
753 | cifs_dump_mem("Bad SMB: ", buf, | ||
754 | min_t(unsigned int, server->total_read, 48)); | ||
755 | |||
756 | if (mid) | ||
757 | handle_mid(mid, server, smb_buffer, length); | ||
758 | |||
759 | return length; | ||
760 | } | ||
761 | |||
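To place standard_receive3() in the overall flow: the demultiplex thread now receives each frame in three phases — the 4-byte RFC1002 header, then enough of the SMB header to reach the MID, then the remainder. An outline under stated assumptions (read_exactly() is a placeholder primitive, not a kernel API, and the MID offset is assumed):

    #include <stddef.h>
    #include <stdint.h>

    extern int read_exactly(int sock, void *buf, size_t len); /* placeholder */

    enum { TO_MID = 32 };   /* assumed bytes from byte 4 up to the MID */

    static int receive_one_frame(int sock, uint8_t *buf)
    {
            uint32_t pdu_len;

            if (read_exactly(sock, buf, 4))          /* phase 1: RFC1002 hdr */
                    return -1;
            pdu_len = ((uint32_t)buf[1] << 16) |
                      ((uint32_t)buf[2] << 8) | buf[3];

            if (read_exactly(sock, buf + 4, TO_MID)) /* phase 2: to the MID */
                    return -1;

            /* phase 3: the rest, then validate (checkSMB) and dispatch */
            return read_exactly(sock, buf + 4 + TO_MID, pdu_len - TO_MID);
    }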
762 | static int | ||
630 | cifs_demultiplex_thread(void *p) | 763 | cifs_demultiplex_thread(void *p) |
631 | { | 764 | { |
632 | int length; | 765 | int length; |
633 | struct TCP_Server_Info *server = p; | 766 | struct TCP_Server_Info *server = p; |
634 | unsigned int pdu_length, total_read; | 767 | unsigned int pdu_length; |
635 | char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; | 768 | char *buf = NULL; |
636 | struct smb_hdr *smb_buffer = NULL; | 769 | struct smb_hdr *smb_buffer = NULL; |
637 | struct msghdr smb_msg; | ||
638 | struct kvec iov; | ||
639 | struct task_struct *task_to_wake = NULL; | 770 | struct task_struct *task_to_wake = NULL; |
640 | struct mid_q_entry *mid_entry; | 771 | struct mid_q_entry *mid_entry; |
641 | bool isLargeBuf = false; | ||
642 | bool isMultiRsp = false; | ||
643 | int rc; | ||
644 | 772 | ||
645 | current->flags |= PF_MEMALLOC; | 773 | current->flags |= PF_MEMALLOC; |
646 | cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); | 774 | cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); |
@@ -655,111 +783,65 @@ cifs_demultiplex_thread(void *p) | |||
655 | if (try_to_freeze()) | 783 | if (try_to_freeze()) |
656 | continue; | 784 | continue; |
657 | 785 | ||
658 | if (!allocate_buffers(&bigbuf, &smallbuf, | 786 | if (!allocate_buffers(server)) |
659 | sizeof(struct smb_hdr), isLargeBuf)) | ||
660 | continue; | 787 | continue; |
661 | 788 | ||
662 | isLargeBuf = false; | 789 | server->large_buf = false; |
663 | isMultiRsp = false; | 790 | smb_buffer = (struct smb_hdr *)server->smallbuf; |
664 | smb_buffer = (struct smb_hdr *)smallbuf; | 791 | buf = server->smallbuf; |
665 | buf = smallbuf; | ||
666 | iov.iov_base = buf; | ||
667 | iov.iov_len = 4; | ||
668 | smb_msg.msg_control = NULL; | ||
669 | smb_msg.msg_controllen = 0; | ||
670 | pdu_length = 4; /* enough to get RFC1001 header */ | 792 | pdu_length = 4; /* enough to get RFC1001 header */ |
671 | 793 | ||
672 | incomplete_rcv: | 794 | length = cifs_read_from_socket(server, buf, pdu_length); |
673 | if (echo_retries > 0 && server->tcpStatus == CifsGood && | 795 | if (length < 0) |
674 | time_after(jiffies, server->lstrp + | ||
675 | (echo_retries * SMB_ECHO_INTERVAL))) { | ||
676 | cERROR(1, "Server %s has not responded in %d seconds. " | ||
677 | "Reconnecting...", server->hostname, | ||
678 | (echo_retries * SMB_ECHO_INTERVAL / HZ)); | ||
679 | cifs_reconnect(server); | ||
680 | wake_up(&server->response_q); | ||
681 | continue; | ||
682 | } | ||
683 | |||
684 | rc = read_from_socket(server, &smb_msg, &iov, pdu_length, | ||
685 | &total_read, true /* header read */); | ||
686 | if (rc == 3) | ||
687 | goto incomplete_rcv; | ||
688 | else if (rc == 2) | ||
689 | break; | ||
690 | else if (rc == 1) | ||
691 | continue; | 796 | continue; |
797 | server->total_read = length; | ||
692 | 798 | ||
693 | /* | 799 | /* |
694 | * The right amount was read from socket - 4 bytes, | 800 | * The right amount was read from socket - 4 bytes, |
695 | * so we can now interpret the length field. | 801 | * so we can now interpret the length field. |
696 | */ | 802 | */ |
697 | |||
698 | /* | ||
699 | * Note that RFC 1001 length is big endian on the wire, | ||
700 | * but we convert it here so it is always manipulated | ||
701 | * as host byte order. | ||
702 | */ | ||
703 | pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); | 803 | pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); |
704 | 804 | ||
705 | cFYI(1, "rfc1002 length 0x%x", pdu_length+4); | 805 | cFYI(1, "RFC1002 header 0x%x", pdu_length); |
706 | if (!check_rfc1002_header(server, buf)) | 806 | if (!is_smb_response(server, buf[0])) |
707 | continue; | 807 | continue; |
708 | 808 | ||
709 | /* else length ok */ | 809 | /* make sure we have enough to get to the MID */ |
710 | if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { | 810 | if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) { |
711 | isLargeBuf = true; | 811 | cERROR(1, "SMB response too short (%u bytes)", |
712 | memcpy(bigbuf, smallbuf, 4); | 812 | pdu_length); |
713 | smb_buffer = (struct smb_hdr *)bigbuf; | 813 | cifs_reconnect(server); |
714 | buf = bigbuf; | 814 | wake_up(&server->response_q); |
815 | continue; | ||
715 | } | 816 | } |
716 | 817 | ||
717 | iov.iov_base = 4 + buf; | 818 | /* read down to the MID */ |
718 | iov.iov_len = pdu_length; | 819 | length = cifs_read_from_socket(server, buf + 4, |
719 | rc = read_from_socket(server, &smb_msg, &iov, pdu_length, | 820 | sizeof(struct smb_hdr) - 1 - 4); |
720 | &total_read, false); | 821 | if (length < 0) |
721 | if (rc == 2) | ||
722 | break; | ||
723 | else if (rc == 1) | ||
724 | continue; | 822 | continue; |
823 | server->total_read += length; | ||
725 | 824 | ||
726 | total_read += 4; /* account for rfc1002 hdr */ | 825 | mid_entry = find_mid(server, smb_buffer); |
727 | 826 | ||
728 | dump_smb(smb_buffer, total_read); | 827 | if (!mid_entry || !mid_entry->receive) |
828 | length = standard_receive3(server, mid_entry); | ||
829 | else | ||
830 | length = mid_entry->receive(server, mid_entry); | ||
729 | 831 | ||
730 | /* | 832 | if (length < 0) |
731 | * We know that we received enough to get to the MID as we | 833 | continue; |
732 | * checked the pdu_length earlier. Now check to see | ||
733 | * if the rest of the header is OK. We borrow the length | ||
734 | * var for the rest of the loop to avoid a new stack var. | ||
735 | * | ||
736 | * 48 bytes is enough to display the header and a little bit | ||
737 | * into the payload for debugging purposes. | ||
738 | */ | ||
739 | length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); | ||
740 | if (length != 0) | ||
741 | cifs_dump_mem("Bad SMB: ", buf, | ||
742 | min_t(unsigned int, total_read, 48)); | ||
743 | 834 | ||
744 | server->lstrp = jiffies; | 835 | if (server->large_buf) { |
836 | buf = server->bigbuf; | ||
837 | smb_buffer = (struct smb_hdr *)buf; | ||
838 | } | ||
745 | 839 | ||
746 | mid_entry = find_cifs_mid(server, smb_buffer, &length, | 840 | server->lstrp = jiffies; |
747 | isLargeBuf, &isMultiRsp, &bigbuf); | ||
748 | if (mid_entry != NULL) { | 841 | if (mid_entry != NULL) { |
749 | mid_entry->callback(mid_entry); | 842 | if (!mid_entry->multiRsp || mid_entry->multiEnd) |
750 | /* Was previous buf put in mpx struct for multi-rsp? */ | 843 | mid_entry->callback(mid_entry); |
751 | if (!isMultiRsp) { | 844 | } else if (!is_valid_oplock_break(smb_buffer, server)) { |
752 | /* smb buffer will be freed by user thread */ | ||
753 | if (isLargeBuf) | ||
754 | bigbuf = NULL; | ||
755 | else | ||
756 | smallbuf = NULL; | ||
757 | } | ||
758 | } else if (length != 0) { | ||
759 | /* response sanity checks failed */ | ||
760 | continue; | ||
761 | } else if (!is_valid_oplock_break(smb_buffer, server) && | ||
762 | !isMultiRsp) { | ||
763 | cERROR(1, "No task to wake, unknown frame received! " | 845 | cERROR(1, "No task to wake, unknown frame received! " |
764 | "NumMids %d", atomic_read(&midCount)); | 846 | "NumMids %d", atomic_read(&midCount)); |
765 | cifs_dump_mem("Received Data is: ", buf, | 847 | cifs_dump_mem("Received Data is: ", buf, |
@@ -773,9 +855,9 @@ incomplete_rcv: | |||
773 | } /* end while !EXITING */ | 855 | } /* end while !EXITING */ |
774 | 856 | ||
775 | /* buffer usually freed in free_mid - need to free it here on exit */ | 857 | /* buffer usually freed in free_mid - need to free it here on exit */ |
776 | cifs_buf_release(bigbuf); | 858 | cifs_buf_release(server->bigbuf); |
777 | if (smallbuf) /* no sense logging a debug message if NULL */ | 859 | if (server->smallbuf) /* no sense logging a debug message if NULL */ |
778 | cifs_small_buf_release(smallbuf); | 860 | cifs_small_buf_release(server->smallbuf); |
779 | 861 | ||
780 | task_to_wake = xchg(&server->tsk, NULL); | 862 | task_to_wake = xchg(&server->tsk, NULL); |
781 | clean_demultiplex_info(server); | 863 | clean_demultiplex_info(server); |
@@ -827,6 +909,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
827 | { | 909 | { |
828 | char *value, *data, *end; | 910 | char *value, *data, *end; |
829 | char *mountdata_copy = NULL, *options; | 911 | char *mountdata_copy = NULL, *options; |
912 | int err; | ||
830 | unsigned int temp_len, i, j; | 913 | unsigned int temp_len, i, j; |
831 | char separator[2]; | 914 | char separator[2]; |
832 | short int override_uid = -1; | 915 | short int override_uid = -1; |
@@ -883,6 +966,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
883 | cFYI(1, "Null separator not allowed"); | 966 | cFYI(1, "Null separator not allowed"); |
884 | } | 967 | } |
885 | } | 968 | } |
969 | vol->backupuid_specified = false; /* no backup intent for a user */ | ||
970 | vol->backupgid_specified = false; /* no backup intent for a group */ | ||
886 | 971 | ||
887 | while ((data = strsep(&options, separator)) != NULL) { | 972 | while ((data = strsep(&options, separator)) != NULL) { |
888 | if (!*data) | 973 | if (!*data) |
@@ -1298,7 +1383,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1298 | /* ignore */ | 1383 | /* ignore */ |
1299 | } else if (strnicmp(data, "guest", 5) == 0) { | 1384 | } else if (strnicmp(data, "guest", 5) == 0) { |
1300 | /* ignore */ | 1385 | /* ignore */ |
1301 | } else if (strnicmp(data, "rw", 2) == 0) { | 1386 | } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { |
1302 | /* ignore */ | 1387 | /* ignore */ |
1303 | } else if (strnicmp(data, "ro", 2) == 0) { | 1388 | } else if (strnicmp(data, "ro", 2) == 0) { |
1304 | /* ignore */ | 1389 | /* ignore */ |
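The strlen() test added to the "rw" arm matters because strnicmp() compares only a prefix: without it this branch would also swallow "rwpidforward" (whose own comparison is widened from 4 to 12 characters below for the same reason). A userspace illustration with the equivalent strncasecmp():

    #include <assert.h>
    #include <string.h>
    #include <strings.h>

    int main(void)
    {
            const char *opt = "rwpidforward";

            /* a 2-byte prefix match cannot tell the two options apart */
            assert(strncasecmp(opt, "rw", 2) == 0);

            /* the extra length check restores exact matching */
            assert(!(strncasecmp(opt, "rw", 2) == 0 && strlen(opt) == 2));
            return 0;
    }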
@@ -1401,7 +1486,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1401 | vol->server_ino = 1; | 1486 | vol->server_ino = 1; |
1402 | } else if (strnicmp(data, "noserverino", 9) == 0) { | 1487 | } else if (strnicmp(data, "noserverino", 9) == 0) { |
1403 | vol->server_ino = 0; | 1488 | vol->server_ino = 0; |
1404 | } else if (strnicmp(data, "rwpidforward", 4) == 0) { | 1489 | } else if (strnicmp(data, "rwpidforward", 12) == 0) { |
1405 | vol->rwpidforward = 1; | 1490 | vol->rwpidforward = 1; |
1406 | } else if (strnicmp(data, "cifsacl", 7) == 0) { | 1491 | } else if (strnicmp(data, "cifsacl", 7) == 0) { |
1407 | vol->cifs_acl = 1; | 1492 | vol->cifs_acl = 1; |
@@ -1442,6 +1527,22 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1442 | vol->mfsymlinks = true; | 1527 | vol->mfsymlinks = true; |
1443 | } else if (strnicmp(data, "multiuser", 8) == 0) { | 1528 | } else if (strnicmp(data, "multiuser", 8) == 0) { |
1444 | vol->multiuser = true; | 1529 | vol->multiuser = true; |
1530 | } else if (!strnicmp(data, "backupuid", 9) && value && *value) { | ||
1531 | err = kstrtouint(value, 0, &vol->backupuid); | ||
1532 | if (err < 0) { | ||
1533 | cERROR(1, "%s: Invalid backupuid value", | ||
1534 | __func__); | ||
1535 | goto cifs_parse_mount_err; | ||
1536 | } | ||
1537 | vol->backupuid_specified = true; | ||
1538 | } else if (!strnicmp(data, "backupgid", 9) && value && *value) { | ||
1539 | err = kstrtouint(value, 0, &vol->backupgid); | ||
1540 | if (err < 0) { | ||
1541 | cERROR(1, "%s: Invalid backupgid value", | ||
1542 | __func__); | ||
1543 | goto cifs_parse_mount_err; | ||
1544 | } | ||
1545 | vol->backupgid_specified = true; | ||
1445 | } else | 1546 | } else |
1446 | printk(KERN_WARNING "CIFS: Unknown mount option %s\n", | 1547 | printk(KERN_WARNING "CIFS: Unknown mount option %s\n", |
1447 | data); | 1548 | data); |
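The new backupuid=/backupgid= options are parsed with kstrtouint(), which fails with -EINVAL on empty input or trailing garbage and -ERANGE on overflow instead of silently truncating. A rough userspace equivalent of that contract (a sketch, not the kernel implementation):

    #include <errno.h>
    #include <limits.h>
    #include <stdlib.h>

    static int parse_uint_strict(const char *s, unsigned int base,
                                 unsigned int *res)
    {
            char *end;
            unsigned long val;

            errno = 0;
            val = strtoul(s, &end, base);
            if (end == s || *end != '\0')
                    return -EINVAL;         /* empty or trailing garbage */
            if (errno == ERANGE || val > UINT_MAX)
                    return -ERANGE;         /* overflow */
            *res = (unsigned int)val;
            return 0;
    }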
@@ -2018,7 +2119,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) | |||
2018 | warned_on_ntlm = true; | 2119 | warned_on_ntlm = true; |
2019 | cERROR(1, "default security mechanism requested. The default " | 2120 | cERROR(1, "default security mechanism requested. The default " |
2020 | "security mechanism will be upgraded from ntlm to " | 2121 | "security mechanism will be upgraded from ntlm to " |
2021 | "ntlmv2 in kernel release 3.1"); | 2122 | "ntlmv2 in kernel release 3.2"); |
2022 | } | 2123 | } |
2023 | ses->overrideSecFlg = volume_info->secFlg; | 2124 | ses->overrideSecFlg = volume_info->secFlg; |
2024 | 2125 | ||
@@ -2209,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) | |||
2209 | (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) | 2310 | (new->mnt_cifs_flags & CIFS_MOUNT_MASK)) |
2210 | return 0; | 2311 | return 0; |
2211 | 2312 | ||
2212 | if (old->rsize != new->rsize) | ||
2213 | return 0; | ||
2214 | |||
2215 | /* | 2313 | /* |
2216 | * We want to share sb only if we don't specify wsize or specified wsize | 2314 | * We want to share sb only if we don't specify an r/wsize or |
2217 | * is greater or equal than existing one. | 2315 | * specified r/wsize is greater than or equal to existing one. |
2218 | */ | 2316 | */ |
2219 | if (new->wsize && new->wsize < old->wsize) | 2317 | if (new->wsize && new->wsize < old->wsize) |
2220 | return 0; | 2318 | return 0; |
2221 | 2319 | ||
2320 | if (new->rsize && new->rsize < old->rsize) | ||
2321 | return 0; | ||
2322 | |||
2222 | if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) | 2323 | if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid) |
2223 | return 0; | 2324 | return 0; |
2224 | 2325 | ||
@@ -2656,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, | |||
2656 | CIFS_MOUNT_POSIX_PATHS; | 2757 | CIFS_MOUNT_POSIX_PATHS; |
2657 | } | 2758 | } |
2658 | 2759 | ||
2659 | if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { | ||
2660 | if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { | ||
2661 | cifs_sb->rsize = 127 * 1024; | ||
2662 | cFYI(DBG2, "larger reads not supported by srv"); | ||
2663 | } | ||
2664 | } | ||
2665 | |||
2666 | |||
2667 | cFYI(1, "Negotiate caps 0x%x", (int)cap); | 2760 | cFYI(1, "Negotiate caps 0x%x", (int)cap); |
2668 | #ifdef CONFIG_CIFS_DEBUG2 | 2761 | #ifdef CONFIG_CIFS_DEBUG2 |
2669 | if (cap & CIFS_UNIX_FCNTL_CAP) | 2762 | if (cap & CIFS_UNIX_FCNTL_CAP) |
@@ -2708,31 +2801,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, | |||
2708 | spin_lock_init(&cifs_sb->tlink_tree_lock); | 2801 | spin_lock_init(&cifs_sb->tlink_tree_lock); |
2709 | cifs_sb->tlink_tree = RB_ROOT; | 2802 | cifs_sb->tlink_tree = RB_ROOT; |
2710 | 2803 | ||
2711 | if (pvolume_info->rsize > CIFSMaxBufSize) { | ||
2712 | cERROR(1, "rsize %d too large, using MaxBufSize", | ||
2713 | pvolume_info->rsize); | ||
2714 | cifs_sb->rsize = CIFSMaxBufSize; | ||
2715 | } else if ((pvolume_info->rsize) && | ||
2716 | (pvolume_info->rsize <= CIFSMaxBufSize)) | ||
2717 | cifs_sb->rsize = pvolume_info->rsize; | ||
2718 | else /* default */ | ||
2719 | cifs_sb->rsize = CIFSMaxBufSize; | ||
2720 | |||
2721 | if (cifs_sb->rsize < 2048) { | ||
2722 | cifs_sb->rsize = 2048; | ||
2723 | /* Windows ME may prefer this */ | ||
2724 | cFYI(1, "readsize set to minimum: 2048"); | ||
2725 | } | ||
2726 | |||
2727 | /* | 2804 | /* |
2728 | * Temporarily set wsize for matching superblock. If we end up using | 2805 | * Temporarily set r/wsize for matching superblock. If we end up using |
2729 | * new sb then cifs_negotiate_wsize will later negotiate it downward | 2806 | * new sb then client will later negotiate it downward if needed. |
2730 | * if needed. | ||
2731 | */ | 2807 | */ |
2808 | cifs_sb->rsize = pvolume_info->rsize; | ||
2732 | cifs_sb->wsize = pvolume_info->wsize; | 2809 | cifs_sb->wsize = pvolume_info->wsize; |
2733 | 2810 | ||
2734 | cifs_sb->mnt_uid = pvolume_info->linux_uid; | 2811 | cifs_sb->mnt_uid = pvolume_info->linux_uid; |
2735 | cifs_sb->mnt_gid = pvolume_info->linux_gid; | 2812 | cifs_sb->mnt_gid = pvolume_info->linux_gid; |
2813 | if (pvolume_info->backupuid_specified) | ||
2814 | cifs_sb->mnt_backupuid = pvolume_info->backupuid; | ||
2815 | if (pvolume_info->backupgid_specified) | ||
2816 | cifs_sb->mnt_backupgid = pvolume_info->backupgid; | ||
2736 | cifs_sb->mnt_file_mode = pvolume_info->file_mode; | 2817 | cifs_sb->mnt_file_mode = pvolume_info->file_mode; |
2737 | cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; | 2818 | cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; |
2738 | cFYI(1, "file mode: 0x%x dir mode: 0x%x", | 2819 | cFYI(1, "file mode: 0x%x dir mode: 0x%x", |
@@ -2763,6 +2844,10 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, | |||
2763 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; | 2844 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; |
2764 | if (pvolume_info->cifs_acl) | 2845 | if (pvolume_info->cifs_acl) |
2765 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; | 2846 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; |
2847 | if (pvolume_info->backupuid_specified) | ||
2848 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; | ||
2849 | if (pvolume_info->backupgid_specified) | ||
2850 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; | ||
2766 | if (pvolume_info->override_uid) | 2851 | if (pvolume_info->override_uid) |
2767 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; | 2852 | cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; |
2768 | if (pvolume_info->override_gid) | 2853 | if (pvolume_info->override_gid) |
@@ -2795,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, | |||
2795 | } | 2880 | } |
2796 | 2881 | ||
2797 | /* | 2882 | /* |
2798 | * When the server supports very large writes via POSIX extensions, we can | 2883 | * When the server supports very large reads and writes via POSIX extensions, |
2799 | * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including | 2884 | * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not |
2800 | * the RFC1001 length. | 2885 | * including the RFC1001 length. |
2801 | * | 2886 | * |
2802 | * Note that this might make for "interesting" allocation problems during | 2887 | * Note that this might make for "interesting" allocation problems during |
2803 | * writeback however as we have to allocate an array of pointers for the | 2888 | * writeback however as we have to allocate an array of pointers for the |
2804 | * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. | 2889 | * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. |
2890 | * | ||
2891 | * For reads, there is a similar problem as we need to allocate an array | ||
2892 | * of kvecs to handle the receive, though that should only need to be done | ||
2893 | * once. | ||
2805 | */ | 2894 | */ |
2806 | #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) | 2895 | #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) |
2896 | #define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4) | ||
2807 | 2897 | ||
2808 | /* | 2898 | /* |
2809 | * When the server doesn't allow large posix writes, only allow a wsize of | 2899 | * When the server doesn't allow large posix writes, only allow a rsize/wsize |
2810 | * 128k minus the size of the WRITE_AND_X header. That allows for a write up | 2900 | * of 2^17-1 minus the size of the call header. That allows for a read or |
2811 | * to the maximum size described by RFC1002. | 2901 | * write up to the maximum size described by RFC1002. |
2812 | */ | 2902 | */ |
2813 | #define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) | 2903 | #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) |
2904 | #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) | ||
2814 | 2905 | ||
2815 | /* | 2906 | /* |
2816 | * The default wsize is 1M. find_get_pages seems to return a maximum of 256 | 2907 | * The default wsize is 1M. find_get_pages seems to return a maximum of 256 |
2817 | * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill | 2908 | * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill |
2818 | * a single wsize request with a single call. | 2909 | * a single wsize request with a single call. |
2819 | */ | 2910 | */ |
2820 | #define CIFS_DEFAULT_WSIZE (1024 * 1024) | 2911 | #define CIFS_DEFAULT_IOSIZE (1024 * 1024) |
2912 | |||
2913 | /* | ||
2914 | * Windows only supports a max of 60k reads. Default to that when posix | ||
2915 | * extensions aren't in force. | ||
2916 | */ | ||
2917 | #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) | ||
2821 | 2918 | ||
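For a concrete feel for the limits these macros encode (the header sizes below are assumptions for illustration; the real values come from sizeof(WRITE_REQ) and sizeof(READ_RSP) in cifspdu.h):

    #include <stdio.h>

    int main(void)
    {
            unsigned int write_req = 66; /* assumed WRITE_AND_X request size */
            unsigned int read_rsp = 63;  /* assumed READ_AND_X response size */

            printf("posix max wsize   : %u\n", (1u << 24) - 1 - write_req + 4);
            printf("rfc1002 max wsize : %u\n", (1u << 17) - 1 - write_req + 4);
            printf("rfc1002 max rsize : %u\n", (1u << 17) - 1 - read_rsp + 4);
            return 0;
    }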
2822 | static unsigned int | 2919 | static unsigned int |
2823 | cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) | 2920 | cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) |
@@ -2825,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) | |||
2825 | __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); | 2922 | __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); |
2826 | struct TCP_Server_Info *server = tcon->ses->server; | 2923 | struct TCP_Server_Info *server = tcon->ses->server; |
2827 | unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : | 2924 | unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : |
2828 | CIFS_DEFAULT_WSIZE; | 2925 | CIFS_DEFAULT_IOSIZE; |
2829 | 2926 | ||
2830 | /* can server support 24-bit write sizes? (via UNIX extensions) */ | 2927 | /* can server support 24-bit write sizes? (via UNIX extensions) */ |
2831 | if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) | 2928 | if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) |
@@ -2848,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) | |||
2848 | return wsize; | 2945 | return wsize; |
2849 | } | 2946 | } |
2850 | 2947 | ||
2948 | static unsigned int | ||
2949 | cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) | ||
2950 | { | ||
2951 | __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); | ||
2952 | struct TCP_Server_Info *server = tcon->ses->server; | ||
2953 | unsigned int rsize, defsize; | ||
2954 | |||
2955 | /* | ||
2956 | * Set default value... | ||
2957 | * | ||
2958 | * HACK alert! Ancient servers have very small buffers. Even though | ||
2959 | * MS-CIFS indicates that servers are only limited by the client's | ||
2960 | * bufsize for reads, testing against win98se shows that it throws | ||
2961 | * INVALID_PARAMETER errors if you try to request too large a read. | ||
2962 | * | ||
2963 | * If the server advertises a MaxBufferSize of less than one page, | ||
2964 | * assume that it also can't satisfy reads larger than that either. | ||
2965 | * | ||
2966 | * FIXME: Is there a better heuristic for this? | ||
2967 | */ | ||
2968 | if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP)) | ||
2969 | defsize = CIFS_DEFAULT_IOSIZE; | ||
2970 | else if (server->capabilities & CAP_LARGE_READ_X) | ||
2971 | defsize = CIFS_DEFAULT_NON_POSIX_RSIZE; | ||
2972 | else if (server->maxBuf >= PAGE_CACHE_SIZE) | ||
2973 | defsize = CIFSMaxBufSize; | ||
2974 | else | ||
2975 | defsize = server->maxBuf - sizeof(READ_RSP); | ||
2976 | |||
2977 | rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize; | ||
2978 | |||
2979 | /* | ||
2980 | * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to | ||
2981 | * the client's MaxBufferSize. | ||
2982 | */ | ||
2983 | if (!(server->capabilities & CAP_LARGE_READ_X)) | ||
2984 | rsize = min_t(unsigned int, CIFSMaxBufSize, rsize); | ||
2985 | |||
2986 | /* hard limit of CIFS_MAX_RSIZE */ | ||
2987 | rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE); | ||
2988 | |||
2989 | return rsize; | ||
2990 | } | ||
2991 | |||
2851 | static int | 2992 | static int |
2852 | is_path_accessible(int xid, struct cifs_tcon *tcon, | 2993 | is_path_accessible(int xid, struct cifs_tcon *tcon, |
2853 | struct cifs_sb_info *cifs_sb, const char *full_path) | 2994 | struct cifs_sb_info *cifs_sb, const char *full_path) |
@@ -2877,8 +3018,9 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) | |||
2877 | { | 3018 | { |
2878 | kfree(volume_info->username); | 3019 | kfree(volume_info->username); |
2879 | kzfree(volume_info->password); | 3020 | kzfree(volume_info->password); |
3021 | if (volume_info->UNCip != volume_info->UNC + 2) | ||
3022 | kfree(volume_info->UNCip); | ||
2880 | kfree(volume_info->UNC); | 3023 | kfree(volume_info->UNC); |
2881 | kfree(volume_info->UNCip); | ||
2882 | kfree(volume_info->domainname); | 3024 | kfree(volume_info->domainname); |
2883 | kfree(volume_info->iocharset); | 3025 | kfree(volume_info->iocharset); |
2884 | kfree(volume_info->prepath); | 3026 | kfree(volume_info->prepath); |
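The UNCip guard above exists because the option parser can leave UNCip pointing into the middle of the UNC string rather than at a separate allocation; freeing an interior pointer would corrupt the heap. A reduced sketch of the aliasing being protected against (structure simplified to the two fields involved):

    #include <stdlib.h>
    #include <string.h>

    struct vol { char *UNC; char *UNCip; };

    static void cleanup(struct vol *v)
    {
            /* only free UNCip when it is not an alias into UNC */
            if (v->UNCip != v->UNC + 2)
                    free(v->UNCip);
            free(v->UNC);
    }

    int main(void)
    {
            struct vol v;

            v.UNC = strdup("//192.168.1.5/share");
            v.UNCip = v.UNC + 2;    /* aliased: points at the address text */
            cleanup(&v);            /* must not free(v.UNCip) separately */
            return 0;
    }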
@@ -3040,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname) | |||
3040 | return volume_info; | 3182 | return volume_info; |
3041 | } | 3183 | } |
3042 | 3184 | ||
3185 | /* make sure ra_pages is a multiple of rsize */ | ||
3186 | static inline unsigned int | ||
3187 | cifs_ra_pages(struct cifs_sb_info *cifs_sb) | ||
3188 | { | ||
3189 | unsigned int reads; | ||
3190 | unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE; | ||
3191 | |||
3192 | if (rsize_pages >= default_backing_dev_info.ra_pages) | ||
3193 | return default_backing_dev_info.ra_pages; | ||
3194 | else if (rsize_pages == 0) | ||
3195 | return rsize_pages; | ||
3196 | |||
3197 | reads = default_backing_dev_info.ra_pages / rsize_pages; | ||
3198 | return reads * rsize_pages; | ||
3199 | } | ||
3200 | |||
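A worked example of the rounding in cifs_ra_pages() (values assumed: 4 KiB pages, the 60 KiB non-POSIX default rsize from above, and a 32-page default readahead window):

    #include <stdio.h>

    int main(void)
    {
            unsigned int ra_default = 32;                  /* assumed ra_pages */
            unsigned int rsize_pages = (60 * 1024) / 4096; /* 15 pages/read */
            unsigned int reads = ra_default / rsize_pages; /* 2 whole reads */

            /* readahead trimmed to 30 pages: a whole number of rsize reads */
            printf("ra_pages = %u\n", reads * rsize_pages);
            return 0;
    }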
3043 | int | 3201 | int |
3044 | cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) | 3202 | cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) |
3045 | { | 3203 | { |
@@ -3058,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) | |||
3058 | if (rc) | 3216 | if (rc) |
3059 | return rc; | 3217 | return rc; |
3060 | 3218 | ||
3061 | cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; | ||
3062 | |||
3063 | #ifdef CONFIG_CIFS_DFS_UPCALL | 3219 | #ifdef CONFIG_CIFS_DFS_UPCALL |
3064 | try_mount_again: | 3220 | try_mount_again: |
3065 | /* cleanup activities if we're chasing a referral */ | 3221 | /* cleanup activities if we're chasing a referral */ |
@@ -3124,15 +3280,11 @@ try_mount_again: | |||
3124 | CIFSSMBQFSAttributeInfo(xid, tcon); | 3280 | CIFSSMBQFSAttributeInfo(xid, tcon); |
3125 | } | 3281 | } |
3126 | 3282 | ||
3127 | if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { | ||
3128 | cifs_sb->rsize = 1024 * 127; | ||
3129 | cFYI(DBG2, "no very large read support, rsize now 127K"); | ||
3130 | } | ||
3131 | if (!(tcon->ses->capabilities & CAP_LARGE_READ_X)) | ||
3132 | cifs_sb->rsize = min(cifs_sb->rsize, | ||
3133 | (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); | ||
3134 | |||
3135 | cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); | 3283 | cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info); |
3284 | cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info); | ||
3285 | |||
3286 | /* tune readahead according to rsize */ | ||
3287 | cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb); | ||
3136 | 3288 | ||
3137 | remote_path_check: | 3289 | remote_path_check: |
3138 | #ifdef CONFIG_CIFS_DFS_UPCALL | 3290 | #ifdef CONFIG_CIFS_DFS_UPCALL |
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ae576fbb5142..d7eeb9d3ed6f 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -105,8 +105,8 @@ cifs_bp_rename_retry: | |||
105 | } | 105 | } |
106 | rcu_read_unlock(); | 106 | rcu_read_unlock(); |
107 | if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { | 107 | if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { |
108 | cERROR(1, "did not end path lookup where expected namelen is %d", | 108 | cFYI(1, "did not end path lookup where expected. namelen=%d " |
109 | namelen); | 109 | "dfsplen=%d", namelen, dfsplen); |
110 | /* presumably this is only possible if racing with a rename | 110 | /* presumably this is only possible if racing with a rename |
111 | of one of the parent directories (we can not lock the dentries | 111 | of one of the parent directories (we can not lock the dentries |
112 | above us to prevent this, but retrying should be harmless) */ | 112 | above us to prevent this, but retrying should be harmless) */ |
@@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, | |||
171 | } | 171 | } |
172 | tcon = tlink_tcon(tlink); | 172 | tcon = tlink_tcon(tlink); |
173 | 173 | ||
174 | if (oplockEnabled) | 174 | if (enable_oplocks) |
175 | oplock = REQ_OPLOCK; | 175 | oplock = REQ_OPLOCK; |
176 | 176 | ||
177 | if (nd) | 177 | if (nd) |
@@ -244,6 +244,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, | |||
244 | if (!tcon->unix_ext && (mode & S_IWUGO) == 0) | 244 | if (!tcon->unix_ext && (mode & S_IWUGO) == 0) |
245 | create_options |= CREATE_OPTION_READONLY; | 245 | create_options |= CREATE_OPTION_READONLY; |
246 | 246 | ||
247 | if (backup_cred(cifs_sb)) | ||
248 | create_options |= CREATE_OPEN_BACKUP_INTENT; | ||
249 | |||
247 | if (tcon->ses->capabilities & CAP_NT_SMBS) | 250 | if (tcon->ses->capabilities & CAP_NT_SMBS) |
248 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, | 251 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, |
249 | desiredAccess, create_options, | 252 | desiredAccess, create_options, |
@@ -357,6 +360,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, | |||
357 | { | 360 | { |
358 | int rc = -EPERM; | 361 | int rc = -EPERM; |
359 | int xid; | 362 | int xid; |
363 | int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; | ||
360 | struct cifs_sb_info *cifs_sb; | 364 | struct cifs_sb_info *cifs_sb; |
361 | struct tcon_link *tlink; | 365 | struct tcon_link *tlink; |
362 | struct cifs_tcon *pTcon; | 366 | struct cifs_tcon *pTcon; |
@@ -431,9 +435,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, | |||
431 | return rc; | 435 | return rc; |
432 | } | 436 | } |
433 | 437 | ||
434 | /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ | 438 | if (backup_cred(cifs_sb)) |
439 | create_options |= CREATE_OPEN_BACKUP_INTENT; | ||
440 | |||
435 | rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, | 441 | rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, |
436 | GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, | 442 | GENERIC_WRITE, create_options, |
437 | &fileHandle, &oplock, buf, cifs_sb->local_nls, | 443 | &fileHandle, &oplock, buf, cifs_sb->local_nls, |
438 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 444 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
439 | if (rc) | 445 | if (rc) |
@@ -642,8 +648,16 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) | |||
642 | if (direntry->d_inode) { | 648 | if (direntry->d_inode) { |
643 | if (cifs_revalidate_dentry(direntry)) | 649 | if (cifs_revalidate_dentry(direntry)) |
644 | return 0; | 650 | return 0; |
645 | else | 651 | else { |
652 | /* | ||
653 | * Forcibly invalidate automounting directory inodes | ||
654 | * (remote DFS directories) so as to have them | ||
655 | * instantiated again for automount | ||
656 | */ | ||
657 | if (IS_AUTOMOUNT(direntry->d_inode)) | ||
658 | return 0; | ||
646 | return 1; | 659 | return 1; |
660 | } | ||
647 | } | 661 | } |
648 | 662 | ||
649 | /* | 663 | /* |
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 55d87ac52000..9c7ecdccf2f3 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@ | |||
45 | #include "cifs_debug.h" | 45 | #include "cifs_debug.h" |
46 | #include "cifsfs.h" | 46 | #include "cifsfs.h" |
47 | 47 | ||
48 | #ifdef CIFS_NFSD_EXPORT | 48 | #ifdef CONFIG_CIFS_NFSD_EXPORT |
49 | static struct dentry *cifs_get_parent(struct dentry *dentry) | 49 | static struct dentry *cifs_get_parent(struct dentry *dentry) |
50 | { | 50 | { |
51 | /* BB need to add code here eventually to enable export via NFSD */ | 51 | /* BB need to add code here eventually to enable export via NFSD */ |
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { | |||
63 | .encode_fs = */ | 63 | .encode_fs = */ |
64 | }; | 64 | }; |
65 | 65 | ||
66 | #endif /* CIFS_NFSD_EXPORT */ | 66 | #endif /* CONFIG_CIFS_NFSD_EXPORT */ |
67 | 67 | ||
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9f41a10523a1..ea096ce5d4f7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/delay.h> | 32 | #include <linux/delay.h> |
33 | #include <linux/mount.h> | 33 | #include <linux/mount.h> |
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | #include <linux/swap.h> | ||
35 | #include <asm/div64.h> | 36 | #include <asm/div64.h> |
36 | #include "cifsfs.h" | 37 | #include "cifsfs.h" |
37 | #include "cifspdu.h" | 38 | #include "cifspdu.h" |
@@ -174,6 +175,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, | |||
174 | int rc; | 175 | int rc; |
175 | int desiredAccess; | 176 | int desiredAccess; |
176 | int disposition; | 177 | int disposition; |
178 | int create_options = CREATE_NOT_DIR; | ||
177 | FILE_ALL_INFO *buf; | 179 | FILE_ALL_INFO *buf; |
178 | 180 | ||
179 | desiredAccess = cifs_convert_flags(f_flags); | 181 | desiredAccess = cifs_convert_flags(f_flags); |
@@ -210,9 +212,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, | |||
210 | if (!buf) | 212 | if (!buf) |
211 | return -ENOMEM; | 213 | return -ENOMEM; |
212 | 214 | ||
215 | if (backup_cred(cifs_sb)) | ||
216 | create_options |= CREATE_OPEN_BACKUP_INTENT; | ||
217 | |||
213 | if (tcon->ses->capabilities & CAP_NT_SMBS) | 218 | if (tcon->ses->capabilities & CAP_NT_SMBS) |
214 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, | 219 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, |
215 | desiredAccess, CREATE_NOT_DIR, pnetfid, poplock, buf, | 220 | desiredAccess, create_options, pnetfid, poplock, buf, |
216 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags | 221 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags |
217 | & CIFS_MOUNT_MAP_SPECIAL_CHR); | 222 | & CIFS_MOUNT_MAP_SPECIAL_CHR); |
218 | else | 223 | else |
@@ -258,8 +263,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, | |||
258 | pCifsFile->invalidHandle = false; | 263 | pCifsFile->invalidHandle = false; |
259 | pCifsFile->tlink = cifs_get_tlink(tlink); | 264 | pCifsFile->tlink = cifs_get_tlink(tlink); |
260 | mutex_init(&pCifsFile->fh_mutex); | 265 | mutex_init(&pCifsFile->fh_mutex); |
261 | mutex_init(&pCifsFile->lock_mutex); | ||
262 | INIT_LIST_HEAD(&pCifsFile->llist); | ||
263 | INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); | 266 | INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); |
264 | 267 | ||
265 | spin_lock(&cifs_file_list_lock); | 268 | spin_lock(&cifs_file_list_lock); |
@@ -272,11 +275,14 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, | |||
272 | spin_unlock(&cifs_file_list_lock); | 275 | spin_unlock(&cifs_file_list_lock); |
273 | 276 | ||
274 | cifs_set_oplock_level(pCifsInode, oplock); | 277 | cifs_set_oplock_level(pCifsInode, oplock); |
278 | pCifsInode->can_cache_brlcks = pCifsInode->clientCanCacheAll; | ||
275 | 279 | ||
276 | file->private_data = pCifsFile; | 280 | file->private_data = pCifsFile; |
277 | return pCifsFile; | 281 | return pCifsFile; |
278 | } | 282 | } |
279 | 283 | ||
284 | static void cifs_del_lock_waiters(struct cifsLockInfo *lock); | ||
285 | |||
280 | /* | 286 | /* |
281 | * Release a reference on the file private data. This may involve closing | 287 | * Release a reference on the file private data. This may involve closing |
282 | * the filehandle out on the server. Must be called without holding | 288 | * the filehandle out on the server. Must be called without holding |
@@ -327,12 +333,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) | |||
327 | /* Delete any outstanding lock records. We'll lose them when the file | 333 | /* Delete any outstanding lock records. We'll lose them when the file |
328 | * is closed anyway. | 334 | * is closed anyway. |
329 | */ | 335 | */ |
330 | mutex_lock(&cifs_file->lock_mutex); | 336 | mutex_lock(&cifsi->lock_mutex); |
331 | list_for_each_entry_safe(li, tmp, &cifs_file->llist, llist) { | 337 | list_for_each_entry_safe(li, tmp, &cifsi->llist, llist) { |
338 | if (li->netfid != cifs_file->netfid) | ||
339 | continue; | ||
332 | list_del(&li->llist); | 340 | list_del(&li->llist); |
341 | cifs_del_lock_waiters(li); | ||
333 | kfree(li); | 342 | kfree(li); |
334 | } | 343 | } |
335 | mutex_unlock(&cifs_file->lock_mutex); | 344 | mutex_unlock(&cifsi->lock_mutex); |
336 | 345 | ||
337 | cifs_put_tlink(cifs_file->tlink); | 346 | cifs_put_tlink(cifs_file->tlink); |
338 | dput(cifs_file->dentry); | 347 | dput(cifs_file->dentry); |
@@ -371,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file) | |||
371 | cFYI(1, "inode = 0x%p file flags are 0x%x for %s", | 380 | cFYI(1, "inode = 0x%p file flags are 0x%x for %s", |
372 | inode, file->f_flags, full_path); | 381 | inode, file->f_flags, full_path); |
373 | 382 | ||
374 | if (oplockEnabled) | 383 | if (enable_oplocks) |
375 | oplock = REQ_OPLOCK; | 384 | oplock = REQ_OPLOCK; |
376 | else | 385 | else |
377 | oplock = 0; | 386 | oplock = 0; |
@@ -465,6 +474,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) | |||
465 | char *full_path = NULL; | 474 | char *full_path = NULL; |
466 | int desiredAccess; | 475 | int desiredAccess; |
467 | int disposition = FILE_OPEN; | 476 | int disposition = FILE_OPEN; |
477 | int create_options = CREATE_NOT_DIR; | ||
468 | __u16 netfid; | 478 | __u16 netfid; |
469 | 479 | ||
470 | xid = GetXid(); | 480 | xid = GetXid(); |
@@ -495,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) | |||
495 | cFYI(1, "inode = 0x%p file flags 0x%x for %s", | 505 | cFYI(1, "inode = 0x%p file flags 0x%x for %s", |
496 | inode, pCifsFile->f_flags, full_path); | 506 | inode, pCifsFile->f_flags, full_path); |
497 | 507 | ||
498 | if (oplockEnabled) | 508 | if (enable_oplocks) |
499 | oplock = REQ_OPLOCK; | 509 | oplock = REQ_OPLOCK; |
500 | else | 510 | else |
501 | oplock = 0; | 511 | oplock = 0; |
@@ -524,6 +534,9 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) | |||
524 | 534 | ||
525 | desiredAccess = cifs_convert_flags(pCifsFile->f_flags); | 535 | desiredAccess = cifs_convert_flags(pCifsFile->f_flags); |
526 | 536 | ||
537 | if (backup_cred(cifs_sb)) | ||
538 | create_options |= CREATE_OPEN_BACKUP_INTENT; | ||
539 | |||
527 | /* Can not refresh inode by passing in file_info buf to be returned | 540 | /* Can not refresh inode by passing in file_info buf to be returned |
528 | by SMBOpen and then calling get_inode_info with returned buf | 541 | by SMBOpen and then calling get_inode_info with returned buf |
529 | since file might have write behind data that needs to be flushed | 542 | since file might have write behind data that needs to be flushed |
@@ -531,7 +544,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush) | |||
531 | that inode was not dirty locally we could do this */ | 544 | that inode was not dirty locally we could do this */ |
532 | 545 | ||
533 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, | 546 | rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess, |
534 | CREATE_NOT_DIR, &netfid, &oplock, NULL, | 547 | create_options, &netfid, &oplock, NULL, |
535 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & | 548 | cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & |
536 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 549 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
537 | if (rc) { | 550 | if (rc) { |
@@ -631,219 +644,687 @@ int cifs_closedir(struct inode *inode, struct file *file) | |||
631 | return rc; | 644 | return rc; |
632 | } | 645 | } |
633 | 646 | ||
634 | static int store_file_lock(struct cifsFileInfo *fid, __u64 len, | 647 | static struct cifsLockInfo * |
635 | __u64 offset, __u8 lockType) | 648 | cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) |
636 | { | 649 | { |
637 | struct cifsLockInfo *li = | 650 | struct cifsLockInfo *li = |
638 | kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); | 651 | kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); |
639 | if (li == NULL) | 652 | if (!li) |
640 | return -ENOMEM; | 653 | return li; |
654 | li->netfid = netfid; | ||
641 | li->offset = offset; | 655 | li->offset = offset; |
642 | li->length = len; | 656 | li->length = len; |
643 | li->type = lockType; | 657 | li->type = type; |
644 | mutex_lock(&fid->lock_mutex); | 658 | li->pid = current->tgid; |
645 | list_add(&li->llist, &fid->llist); | 659 | INIT_LIST_HEAD(&li->blist); |
646 | mutex_unlock(&fid->lock_mutex); | 660 | init_waitqueue_head(&li->block_q); |
661 | return li; | ||
662 | } | ||
663 | |||
664 | static void | ||
665 | cifs_del_lock_waiters(struct cifsLockInfo *lock) | ||
666 | { | ||
667 | struct cifsLockInfo *li, *tmp; | ||
668 | list_for_each_entry_safe(li, tmp, &lock->blist, blist) { | ||
669 | list_del_init(&li->blist); | ||
670 | wake_up(&li->block_q); | ||
671 | } | ||
672 | } | ||
673 | |||
674 | static bool | ||
675 | cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, | ||
676 | __u64 length, __u8 type, __u16 netfid, | ||
677 | struct cifsLockInfo **conf_lock) | ||
678 | { | ||
679 | struct cifsLockInfo *li, *tmp; | ||
680 | |||
681 | list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { | ||
682 | if (offset + length <= li->offset || | ||
683 | offset >= li->offset + li->length) | ||
684 | continue; | ||
685 | else if ((type & LOCKING_ANDX_SHARED_LOCK) && | ||
686 | ((netfid == li->netfid && current->tgid == li->pid) || | ||
687 | type == li->type)) | ||
688 | continue; | ||
689 | else { | ||
690 | *conf_lock = li; | ||
691 | return true; | ||
692 | } | ||
693 | } | ||
694 | return false; | ||
695 | } | ||
696 | |||
697 | static int | ||
698 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | ||
699 | __u8 type, __u16 netfid, struct file_lock *flock) | ||
700 | { | ||
701 | int rc = 0; | ||
702 | struct cifsLockInfo *conf_lock; | ||
703 | bool exist; | ||
704 | |||
705 | mutex_lock(&cinode->lock_mutex); | ||
706 | |||
707 | exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, | ||
708 | &conf_lock); | ||
709 | if (exist) { | ||
710 | flock->fl_start = conf_lock->offset; | ||
711 | flock->fl_end = conf_lock->offset + conf_lock->length - 1; | ||
712 | flock->fl_pid = conf_lock->pid; | ||
713 | if (conf_lock->type & LOCKING_ANDX_SHARED_LOCK) | ||
714 | flock->fl_type = F_RDLCK; | ||
715 | else | ||
716 | flock->fl_type = F_WRLCK; | ||
717 | } else if (!cinode->can_cache_brlcks) | ||
718 | rc = 1; | ||
719 | else | ||
720 | flock->fl_type = F_UNLCK; | ||
721 | |||
722 | mutex_unlock(&cinode->lock_mutex); | ||
723 | return rc; | ||
724 | } | ||
725 | |||
726 | static int | ||
727 | cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, | ||
728 | __u8 type, __u16 netfid) | ||
729 | { | ||
730 | struct cifsLockInfo *li; | ||
731 | |||
732 | li = cifs_lock_init(len, offset, type, netfid); | ||
733 | if (!li) | ||
734 | return -ENOMEM; | ||
735 | |||
736 | mutex_lock(&cinode->lock_mutex); | ||
737 | list_add_tail(&li->llist, &cinode->llist); | ||
738 | mutex_unlock(&cinode->lock_mutex); | ||
647 | return 0; | 739 | return 0; |
648 | } | 740 | } |
649 | 741 | ||
650 | int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) | 742 | static int |
743 | cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | ||
744 | __u8 type, __u16 netfid, bool wait) | ||
651 | { | 745 | { |
652 | int rc, xid; | 746 | struct cifsLockInfo *lock, *conf_lock; |
653 | __u32 numLock = 0; | 747 | bool exist; |
654 | __u32 numUnlock = 0; | 748 | int rc = 0; |
655 | __u64 length; | 749 | |
656 | bool wait_flag = false; | 750 | lock = cifs_lock_init(length, offset, type, netfid); |
657 | struct cifs_sb_info *cifs_sb; | 751 | if (!lock) |
752 | return -ENOMEM; | ||
753 | |||
754 | try_again: | ||
755 | exist = false; | ||
756 | mutex_lock(&cinode->lock_mutex); | ||
757 | |||
758 | exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, | ||
759 | &conf_lock); | ||
760 | if (!exist && cinode->can_cache_brlcks) { | ||
761 | list_add_tail(&lock->llist, &cinode->llist); | ||
762 | mutex_unlock(&cinode->lock_mutex); | ||
763 | return rc; | ||
764 | } | ||
765 | |||
766 | if (!exist) | ||
767 | rc = 1; | ||
768 | else if (!wait) | ||
769 | rc = -EACCES; | ||
770 | else { | ||
771 | list_add_tail(&lock->blist, &conf_lock->blist); | ||
772 | mutex_unlock(&cinode->lock_mutex); | ||
773 | rc = wait_event_interruptible(lock->block_q, | ||
774 | (lock->blist.prev == &lock->blist) && | ||
775 | (lock->blist.next == &lock->blist)); | ||
776 | if (!rc) | ||
777 | goto try_again; | ||
778 | else { | ||
779 | mutex_lock(&cinode->lock_mutex); | ||
780 | list_del_init(&lock->blist); | ||
782 | } | ||
783 | } | ||
784 | |||
785 | kfree(lock); | ||
786 | mutex_unlock(&cinode->lock_mutex); | ||
787 | return rc; | ||
788 | } | ||
789 | |||
790 | static int | ||
791 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) | ||
792 | { | ||
793 | int rc = 0; | ||
794 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); | ||
795 | unsigned char saved_type = flock->fl_type; | ||
796 | |||
797 | mutex_lock(&cinode->lock_mutex); | ||
798 | posix_test_lock(file, flock); | ||
799 | |||
800 | if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) { | ||
801 | flock->fl_type = saved_type; | ||
802 | rc = 1; | ||
803 | } | ||
804 | |||
805 | mutex_unlock(&cinode->lock_mutex); | ||
806 | return rc; | ||
807 | } | ||
808 | |||
809 | static int | ||
810 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) | ||
811 | { | ||
812 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); | ||
813 | int rc; | ||
814 | |||
815 | mutex_lock(&cinode->lock_mutex); | ||
816 | if (!cinode->can_cache_brlcks) { | ||
817 | mutex_unlock(&cinode->lock_mutex); | ||
818 | return 1; | ||
819 | } | ||
820 | rc = posix_lock_file_wait(file, flock); | ||
821 | mutex_unlock(&cinode->lock_mutex); | ||
822 | return rc; | ||
823 | } | ||
824 | |||
825 | static int | ||
826 | cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | ||
827 | { | ||
828 | int xid, rc = 0, stored_rc; | ||
829 | struct cifsLockInfo *li, *tmp; | ||
658 | struct cifs_tcon *tcon; | 830 | struct cifs_tcon *tcon; |
659 | __u16 netfid; | 831 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); |
660 | __u8 lockType = LOCKING_ANDX_LARGE_FILES; | 832 | unsigned int num, max_num; |
661 | bool posix_locking = 0; | 833 | LOCKING_ANDX_RANGE *buf, *cur; |
834 | int types[] = {LOCKING_ANDX_LARGE_FILES, | ||
835 | LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; | ||
836 | int i; | ||
837 | |||
838 | xid = GetXid(); | ||
839 | tcon = tlink_tcon(cfile->tlink); | ||
840 | |||
841 | mutex_lock(&cinode->lock_mutex); | ||
842 | if (!cinode->can_cache_brlcks) { | ||
843 | mutex_unlock(&cinode->lock_mutex); | ||
844 | FreeXid(xid); | ||
845 | return rc; | ||
846 | } | ||
847 | |||
848 | max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / | ||
849 | sizeof(LOCKING_ANDX_RANGE); | ||
850 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); | ||
851 | if (!buf) { | ||
852 | mutex_unlock(&cinode->lock_mutex); | ||
853 | FreeXid(xid); | ||
854 | return -ENOMEM; | ||
855 | } | ||
856 | |||
857 | for (i = 0; i < 2; i++) { | ||
858 | cur = buf; | ||
859 | num = 0; | ||
860 | list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { | ||
861 | if (li->type != types[i]) | ||
862 | continue; | ||
863 | cur->Pid = cpu_to_le16(li->pid); | ||
864 | cur->LengthLow = cpu_to_le32((u32)li->length); | ||
865 | cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); | ||
866 | cur->OffsetLow = cpu_to_le32((u32)li->offset); | ||
867 | cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); | ||
868 | if (++num == max_num) { | ||
869 | stored_rc = cifs_lockv(xid, tcon, cfile->netfid, | ||
870 | li->type, 0, num, buf); | ||
871 | if (stored_rc) | ||
872 | rc = stored_rc; | ||
873 | cur = buf; | ||
874 | num = 0; | ||
875 | } else | ||
876 | cur++; | ||
877 | } | ||
878 | |||
879 | if (num) { | ||
880 | stored_rc = cifs_lockv(xid, tcon, cfile->netfid, | ||
881 | types[i], 0, num, buf); | ||
882 | if (stored_rc) | ||
883 | rc = stored_rc; | ||
884 | } | ||
885 | } | ||
886 | |||
887 | cinode->can_cache_brlcks = false; | ||
888 | mutex_unlock(&cinode->lock_mutex); | ||
889 | |||
890 | kfree(buf); | ||
891 | FreeXid(xid); | ||
892 | return rc; | ||
893 | } | ||
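max_num above bounds how many LOCKING_ANDX_RANGE entries fit into one SMB, so the flush can batch the cached locks into as few round trips as possible. A standalone illustration of the arithmetic, with assumed sizes (a 16644-byte negotiated maxBuf, a 32-byte smb_hdr and a 10-byte range entry; none of these values is taken from the headers):

    #include <stdio.h>

    int main(void)
    {
            unsigned int maxBuf = 16644; /* assumed negotiated buffer size */
            unsigned int hdr    = 32;    /* assumed sizeof(struct smb_hdr) */
            unsigned int range  = 10;    /* assumed sizeof(LOCKING_ANDX_RANGE) */

            /* same formula as cifs_push_mandatory_locks() */
            printf("max_num = %u\n", (maxBuf - hdr) / range); /* -> 1661 */
            return 0;
    }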
894 | |||
895 | /* copied from fs/locks.c with a name change */ | ||
896 | #define cifs_for_each_lock(inode, lockp) \ | ||
897 | for (lockp = &inode->i_flock; *lockp != NULL; \ | ||
898 | lockp = &(*lockp)->fl_next) | ||
899 | |||
900 | static int | ||
901 | cifs_push_posix_locks(struct cifsFileInfo *cfile) | ||
902 | { | ||
903 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
904 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
905 | struct file_lock *flock, **before; | ||
906 | struct cifsLockInfo *lck, *tmp; | ||
907 | int rc = 0, xid, type; | ||
908 | __u64 length; | ||
909 | struct list_head locks_to_send; | ||
662 | 910 | ||
663 | length = 1 + pfLock->fl_end - pfLock->fl_start; | ||
664 | rc = -EACCES; | ||
665 | xid = GetXid(); | 911 | xid = GetXid(); |
666 | 912 | ||
667 | cFYI(1, "Lock parm: 0x%x flockflags: " | 913 | mutex_lock(&cinode->lock_mutex); |
668 | "0x%x flocktype: 0x%x start: %lld end: %lld", | 914 | if (!cinode->can_cache_brlcks) { |
669 | cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, | 915 | mutex_unlock(&cinode->lock_mutex); |
670 | pfLock->fl_end); | 916 | FreeXid(xid); |
917 | return rc; | ||
918 | } | ||
919 | |||
920 | INIT_LIST_HEAD(&locks_to_send); | ||
671 | 921 | ||
672 | if (pfLock->fl_flags & FL_POSIX) | 922 | lock_flocks(); |
923 | cifs_for_each_lock(cfile->dentry->d_inode, before) { | ||
924 | flock = *before; | ||
925 | length = 1 + flock->fl_end - flock->fl_start; | ||
926 | if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK) | ||
927 | type = CIFS_RDLCK; | ||
928 | else | ||
929 | type = CIFS_WRLCK; | ||
930 | |||
931 | lck = cifs_lock_init(length, flock->fl_start, type, | ||
932 | cfile->netfid); | ||
933 | if (!lck) { | ||
934 | rc = -ENOMEM; | ||
935 | goto send_locks; | ||
936 | } | ||
937 | lck->pid = flock->fl_pid; | ||
938 | |||
939 | list_add_tail(&lck->llist, &locks_to_send); | ||
940 | } | ||
941 | |||
942 | send_locks: | ||
943 | unlock_flocks(); | ||
944 | |||
945 | list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { | ||
946 | struct file_lock tmp_lock; | ||
947 | int stored_rc; | ||
948 | |||
949 | tmp_lock.fl_start = lck->offset; | ||
950 | stored_rc = CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, | ||
951 | 0, lck->length, &tmp_lock, | ||
952 | lck->type, 0); | ||
953 | if (stored_rc) | ||
954 | rc = stored_rc; | ||
955 | list_del(&lck->llist); | ||
956 | kfree(lck); | ||
957 | } | ||
958 | |||
959 | cinode->can_cache_brlcks = false; | ||
960 | mutex_unlock(&cinode->lock_mutex); | ||
961 | |||
962 | FreeXid(xid); | ||
963 | return rc; | ||
964 | } | ||
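cifs_push_posix_locks() is deliberately two-phase: the inode's flock list may only be walked under the lock_flocks() spinlock, where no network I/O can be issued, so the ranges are first copied into the private locks_to_send list and the CIFSSMBPosixLock() calls happen only after unlock_flocks(). A reduced skeleton of the pattern (illustrative; copy_of() is a hypothetical stand-in for the cifs_lock_init() plus field copy above):

    LIST_HEAD(locks_to_send);

    lock_flocks();                          /* spinlock held: no I/O */
    cifs_for_each_lock(inode, before)       /* phase 1: copy ranges only */
            list_add_tail(&copy_of(*before)->llist, &locks_to_send);
    unlock_flocks();

    list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
            /* phase 2: now safe to talk to the server */
            CIFSSMBPosixLock(xid, tcon, lck->netfid, lck->pid, 0,
                             lck->length, &tmp_lock, lck->type, 0);
            list_del(&lck->llist);
            kfree(lck);
    }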
965 | |||
966 | static int | ||
967 | cifs_push_locks(struct cifsFileInfo *cfile) | ||
968 | { | ||
969 | struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); | ||
970 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
971 | |||
972 | if ((tcon->ses->capabilities & CAP_UNIX) && | ||
973 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | ||
974 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | ||
975 | return cifs_push_posix_locks(cfile); | ||
976 | |||
977 | return cifs_push_mandatory_locks(cfile); | ||
978 | } | ||
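The three-way capability test above decides between the two push paths. Factored into a predicate for readability (an illustrative refactoring, not part of the patch; it assumes the usual cifs headers for the types and flags involved):

    static inline bool cifs_posix_brlocks_ok(struct cifs_tcon *tcon,
                                             struct cifs_sb_info *cifs_sb)
    {
            return (tcon->ses->capabilities & CAP_UNIX) &&
                   (CIFS_UNIX_FCNTL_CAP &
                    le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
                   !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL);
    }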
979 | |||
980 | static void | ||
981 | cifs_read_flock(struct file_lock *flock, __u8 *type, int *lock, int *unlock, | ||
982 | bool *wait_flag) | ||
983 | { | ||
984 | if (flock->fl_flags & FL_POSIX) | ||
673 | cFYI(1, "Posix"); | 985 | cFYI(1, "Posix"); |
674 | if (pfLock->fl_flags & FL_FLOCK) | 986 | if (flock->fl_flags & FL_FLOCK) |
675 | cFYI(1, "Flock"); | 987 | cFYI(1, "Flock"); |
676 | if (pfLock->fl_flags & FL_SLEEP) { | 988 | if (flock->fl_flags & FL_SLEEP) { |
677 | cFYI(1, "Blocking lock"); | 989 | cFYI(1, "Blocking lock"); |
678 | wait_flag = true; | 990 | *wait_flag = true; |
679 | } | 991 | } |
680 | if (pfLock->fl_flags & FL_ACCESS) | 992 | if (flock->fl_flags & FL_ACCESS) |
681 | cFYI(1, "Process suspended by mandatory locking - " | 993 | cFYI(1, "Process suspended by mandatory locking - " |
682 | "not implemented yet"); | 994 | "not implemented yet"); |
683 | if (pfLock->fl_flags & FL_LEASE) | 995 | if (flock->fl_flags & FL_LEASE) |
684 | cFYI(1, "Lease on file - not implemented yet"); | 996 | cFYI(1, "Lease on file - not implemented yet"); |
685 | if (pfLock->fl_flags & | 997 | if (flock->fl_flags & |
686 | (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) | 998 | (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) |
687 | cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags); | 999 | cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); |
688 | 1000 | ||
689 | if (pfLock->fl_type == F_WRLCK) { | 1001 | *type = LOCKING_ANDX_LARGE_FILES; |
1002 | if (flock->fl_type == F_WRLCK) { | ||
690 | cFYI(1, "F_WRLCK "); | 1003 | cFYI(1, "F_WRLCK "); |
691 | numLock = 1; | 1004 | *lock = 1; |
692 | } else if (pfLock->fl_type == F_UNLCK) { | 1005 | } else if (flock->fl_type == F_UNLCK) { |
693 | cFYI(1, "F_UNLCK"); | 1006 | cFYI(1, "F_UNLCK"); |
694 | numUnlock = 1; | 1007 | *unlock = 1; |
695 | /* Check if unlock includes more than | 1008 | /* Check if unlock includes more than one lock range */ |
696 | one lock range */ | 1009 | } else if (flock->fl_type == F_RDLCK) { |
697 | } else if (pfLock->fl_type == F_RDLCK) { | ||
698 | cFYI(1, "F_RDLCK"); | 1010 | cFYI(1, "F_RDLCK"); |
699 | lockType |= LOCKING_ANDX_SHARED_LOCK; | 1011 | *type |= LOCKING_ANDX_SHARED_LOCK; |
700 | numLock = 1; | 1012 | *lock = 1; |
701 | } else if (pfLock->fl_type == F_EXLCK) { | 1013 | } else if (flock->fl_type == F_EXLCK) { |
702 | cFYI(1, "F_EXLCK"); | 1014 | cFYI(1, "F_EXLCK"); |
703 | numLock = 1; | 1015 | *lock = 1; |
704 | } else if (pfLock->fl_type == F_SHLCK) { | 1016 | } else if (flock->fl_type == F_SHLCK) { |
705 | cFYI(1, "F_SHLCK"); | 1017 | cFYI(1, "F_SHLCK"); |
706 | lockType |= LOCKING_ANDX_SHARED_LOCK; | 1018 | *type |= LOCKING_ANDX_SHARED_LOCK; |
707 | numLock = 1; | 1019 | *lock = 1; |
708 | } else | 1020 | } else |
709 | cFYI(1, "Unknown type of lock"); | 1021 | cFYI(1, "Unknown type of lock"); |
1022 | } | ||
710 | 1023 | ||
711 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 1024 | static int |
712 | tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); | 1025 | cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, |
713 | netfid = ((struct cifsFileInfo *)file->private_data)->netfid; | 1026 | bool wait_flag, bool posix_lck, int xid) |
1027 | { | ||
1028 | int rc = 0; | ||
1029 | __u64 length = 1 + flock->fl_end - flock->fl_start; | ||
1030 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; | ||
1031 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
1032 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
1033 | __u16 netfid = cfile->netfid; | ||
714 | 1034 | ||
715 | if ((tcon->ses->capabilities & CAP_UNIX) && | 1035 | if (posix_lck) { |
716 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | 1036 | int posix_lock_type; |
717 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | 1037 | |
718 | posix_locking = 1; | 1038 | rc = cifs_posix_lock_test(file, flock); |
719 | /* BB add code here to normalize offset and length to | 1039 | if (!rc) |
720 | account for negative length which we can not accept over the | ||
721 | wire */ | ||
722 | if (IS_GETLK(cmd)) { | ||
723 | if (posix_locking) { | ||
724 | int posix_lock_type; | ||
725 | if (lockType & LOCKING_ANDX_SHARED_LOCK) | ||
726 | posix_lock_type = CIFS_RDLCK; | ||
727 | else | ||
728 | posix_lock_type = CIFS_WRLCK; | ||
729 | rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */, | ||
730 | length, pfLock, posix_lock_type, | ||
731 | wait_flag); | ||
732 | FreeXid(xid); | ||
733 | return rc; | 1040 | return rc; |
734 | } | ||
735 | 1041 | ||
736 | /* BB we could chain these into one lock request BB */ | 1042 | if (type & LOCKING_ANDX_SHARED_LOCK) |
737 | rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, | 1043 | posix_lock_type = CIFS_RDLCK; |
738 | 0, 1, lockType, 0 /* wait flag */, 0); | 1044 | else |
739 | if (rc == 0) { | 1045 | posix_lock_type = CIFS_WRLCK; |
740 | rc = CIFSSMBLock(xid, tcon, netfid, length, | 1046 | rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, |
741 | pfLock->fl_start, 1 /* numUnlock */ , | 1047 | 1 /* get */, length, flock, |
742 | 0 /* numLock */ , lockType, | 1048 | posix_lock_type, wait_flag); |
743 | 0 /* wait flag */, 0); | 1049 | return rc; |
744 | pfLock->fl_type = F_UNLCK; | 1050 | } |
745 | if (rc != 0) | ||
746 | cERROR(1, "Error unlocking previously locked " | ||
747 | "range %d during test of lock", rc); | ||
748 | rc = 0; | ||
749 | 1051 | ||
750 | } else { | 1052 | rc = cifs_lock_test(cinode, flock->fl_start, length, type, netfid, |
751 | /* if rc == ERR_SHARING_VIOLATION ? */ | 1053 | flock); |
752 | rc = 0; | 1054 | if (!rc) |
1055 | return rc; | ||
753 | 1056 | ||
754 | if (lockType & LOCKING_ANDX_SHARED_LOCK) { | 1057 | /* BB we could chain these into one lock request BB */ |
755 | pfLock->fl_type = F_WRLCK; | 1058 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, |
756 | } else { | 1059 | flock->fl_start, 0, 1, type, 0, 0); |
757 | rc = CIFSSMBLock(xid, tcon, netfid, length, | 1060 | if (rc == 0) { |
758 | pfLock->fl_start, 0, 1, | 1061 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, |
759 | lockType | LOCKING_ANDX_SHARED_LOCK, | 1062 | length, flock->fl_start, 1, 0, |
760 | 0 /* wait flag */, 0); | 1063 | type, 0, 0); |
761 | if (rc == 0) { | 1064 | flock->fl_type = F_UNLCK; |
762 | rc = CIFSSMBLock(xid, tcon, netfid, | 1065 | if (rc != 0) |
763 | length, pfLock->fl_start, 1, 0, | 1066 | cERROR(1, "Error unlocking previously locked " |
764 | lockType | | 1067 | "range %d during test of lock", rc); |
765 | LOCKING_ANDX_SHARED_LOCK, | 1068 | rc = 0; |
766 | 0 /* wait flag */, 0); | 1069 | return rc; |
767 | pfLock->fl_type = F_RDLCK; | 1070 | } |
768 | if (rc != 0) | ||
769 | cERROR(1, "Error unlocking " | ||
770 | "previously locked range %d " | ||
771 | "during test of lock", rc); | ||
772 | rc = 0; | ||
773 | } else { | ||
774 | pfLock->fl_type = F_WRLCK; | ||
775 | rc = 0; | ||
776 | } | ||
777 | } | ||
778 | } | ||
779 | 1071 | ||
780 | FreeXid(xid); | 1072 | if (type & LOCKING_ANDX_SHARED_LOCK) { |
1073 | flock->fl_type = F_WRLCK; | ||
1074 | rc = 0; | ||
781 | return rc; | 1075 | return rc; |
782 | } | 1076 | } |
783 | 1077 | ||
784 | if (!numLock && !numUnlock) { | 1078 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, |
785 | /* if no lock or unlock then nothing | 1079 | flock->fl_start, 0, 1, |
786 | to do since we do not know what it is */ | 1080 | type | LOCKING_ANDX_SHARED_LOCK, 0, 0); |
787 | FreeXid(xid); | 1081 | if (rc == 0) { |
788 | return -EOPNOTSUPP; | 1082 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, |
1083 | length, flock->fl_start, 1, 0, | ||
1084 | type | LOCKING_ANDX_SHARED_LOCK, | ||
1085 | 0, 0); | ||
1086 | flock->fl_type = F_RDLCK; | ||
1087 | if (rc != 0) | ||
1088 | cERROR(1, "Error unlocking previously locked " | ||
1089 | "range %d during test of lock", rc); | ||
1090 | } else | ||
1091 | flock->fl_type = F_WRLCK; | ||
1092 | |||
1093 | rc = 0; | ||
1094 | return rc; | ||
1095 | } | ||
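In the non-POSIX branch, cifs_getlk() probes by locking: it tries a non-blocking lock on the server, and if that succeeds it immediately unlocks and reports the range as free. From userspace this whole path is driven by fcntl(F_GETLK); a small hypothetical test program:

    #include <fcntl.h>
    #include <stdio.h>

    int probe_range(int fd)
    {
            struct flock fl = {
                    .l_type   = F_WRLCK,    /* could I write-lock this? */
                    .l_whence = SEEK_SET,
                    .l_start  = 0,
                    .l_len    = 100,
            };

            if (fcntl(fd, F_GETLK, &fl) == -1)
                    return -1;
            if (fl.l_type == F_UNLCK)
                    printf("range is free\n");
            else
                    printf("conflict reported: l_type=%d\n", fl.l_type);
            return 0;
    }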
1096 | |||
1097 | static void | ||
1098 | cifs_move_llist(struct list_head *source, struct list_head *dest) | ||
1099 | { | ||
1100 | struct list_head *li, *tmp; | ||
1101 | list_for_each_safe(li, tmp, source) | ||
1102 | list_move(li, dest); | ||
1103 | } | ||
1104 | |||
1105 | static void | ||
1106 | cifs_free_llist(struct list_head *llist) | ||
1107 | { | ||
1108 | struct cifsLockInfo *li, *tmp; | ||
1109 | list_for_each_entry_safe(li, tmp, llist, llist) { | ||
1110 | cifs_del_lock_waiters(li); | ||
1111 | list_del(&li->llist); | ||
1112 | kfree(li); | ||
789 | } | 1113 | } |
1114 | } | ||
1115 | |||
1116 | static int | ||
1117 | cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) | ||
1118 | { | ||
1119 | int rc = 0, stored_rc; | ||
1120 | int types[] = {LOCKING_ANDX_LARGE_FILES, | ||
1121 | LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; | ||
1122 | unsigned int i; | ||
1123 | unsigned int max_num, num; | ||
1124 | LOCKING_ANDX_RANGE *buf, *cur; | ||
1125 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
1126 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
1127 | struct cifsLockInfo *li, *tmp; | ||
1128 | __u64 length = 1 + flock->fl_end - flock->fl_start; | ||
1129 | struct list_head tmp_llist; | ||
1130 | |||
1131 | INIT_LIST_HEAD(&tmp_llist); | ||
1132 | |||
1133 | max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / | ||
1134 | sizeof(LOCKING_ANDX_RANGE); | ||
1135 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); | ||
1136 | if (!buf) | ||
1137 | return -ENOMEM; | ||
1138 | |||
1139 | mutex_lock(&cinode->lock_mutex); | ||
1140 | for (i = 0; i < 2; i++) { | ||
1141 | cur = buf; | ||
1142 | num = 0; | ||
1143 | list_for_each_entry_safe(li, tmp, &cinode->llist, llist) { | ||
1144 | if (flock->fl_start > li->offset || | ||
1145 | (flock->fl_start + length) < | ||
1146 | (li->offset + li->length)) | ||
1147 | continue; | ||
1148 | if (current->tgid != li->pid) | ||
1149 | continue; | ||
1150 | if (cfile->netfid != li->netfid) | ||
1151 | continue; | ||
1152 | if (types[i] != li->type) | ||
1153 | continue; | ||
1154 | if (!cinode->can_cache_brlcks) { | ||
1155 | cur->Pid = cpu_to_le16(li->pid); | ||
1156 | cur->LengthLow = cpu_to_le32((u32)li->length); | ||
1157 | cur->LengthHigh = | ||
1158 | cpu_to_le32((u32)(li->length>>32)); | ||
1159 | cur->OffsetLow = cpu_to_le32((u32)li->offset); | ||
1160 | cur->OffsetHigh = | ||
1161 | cpu_to_le32((u32)(li->offset>>32)); | ||
1162 | /* | ||
1163 | * Save this lock so that we can re-add it to the | ||
1164 | * inode list if the unlock range request fails on | ||
1165 | * the server. | ||
1166 | */ | ||
1167 | list_move(&li->llist, &tmp_llist); | ||
1168 | if (++num == max_num) { | ||
1169 | stored_rc = cifs_lockv(xid, tcon, | ||
1170 | cfile->netfid, | ||
1171 | li->type, num, | ||
1172 | 0, buf); | ||
1173 | if (stored_rc) { | ||
1174 | /* | ||
1175 | * We failed on the unlock range | ||
1176 | * request - add all locks from | ||
1177 | * the tmp list to the head of | ||
1178 | * the inode list. | ||
1179 | */ | ||
1180 | cifs_move_llist(&tmp_llist, | ||
1181 | &cinode->llist); | ||
1182 | rc = stored_rc; | ||
1183 | } else | ||
1184 | /* | ||
1185 | * The unlock range request | ||
1186 | * succeeded - free the tmp list. | ||
1187 | */ | ||
1188 | cifs_free_llist(&tmp_llist); | ||
1189 | cur = buf; | ||
1190 | num = 0; | ||
1191 | } else | ||
1192 | cur++; | ||
1193 | } else { | ||
1194 | /* | ||
1195 | * We can cache brlock requests - simply remove | ||
1196 | * a lock from the inode list. | ||
1197 | */ | ||
1198 | list_del(&li->llist); | ||
1199 | cifs_del_lock_waiters(li); | ||
1200 | kfree(li); | ||
1201 | } | ||
1202 | } | ||
1203 | if (num) { | ||
1204 | stored_rc = cifs_lockv(xid, tcon, cfile->netfid, | ||
1205 | types[i], num, 0, buf); | ||
1206 | if (stored_rc) { | ||
1207 | cifs_move_llist(&tmp_llist, &cinode->llist); | ||
1208 | rc = stored_rc; | ||
1209 | } else | ||
1210 | cifs_free_llist(&tmp_llist); | ||
1211 | } | ||
1212 | } | ||
1213 | |||
1214 | mutex_unlock(&cinode->lock_mutex); | ||
1215 | kfree(buf); | ||
1216 | return rc; | ||
1217 | } | ||
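The tmp_llist shuffle above is a save/restore idiom: each matching lock is tentatively moved off the inode list before the unlock batch is sent, and the batch then either commits (entries freed) or rolls back (entries spliced back). The idiom in isolation, with a hypothetical send_unlock_batch() standing in for the cifs_lockv() call:

    LIST_HEAD(tmp_llist);

    list_move(&li->llist, &tmp_llist);              /* tentative remove */
    if (send_unlock_batch() != 0)
            cifs_move_llist(&tmp_llist, &cinode->llist);    /* roll back */
    else
            cifs_free_llist(&tmp_llist);            /* commit: drop them */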
790 | 1218 | ||
791 | if (posix_locking) { | 1219 | static int |
1220 | cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, | ||
1221 | bool wait_flag, bool posix_lck, int lock, int unlock, int xid) | ||
1222 | { | ||
1223 | int rc = 0; | ||
1224 | __u64 length = 1 + flock->fl_end - flock->fl_start; | ||
1225 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; | ||
1226 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
1227 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); | ||
1228 | __u16 netfid = cfile->netfid; | ||
1229 | |||
1230 | if (posix_lck) { | ||
792 | int posix_lock_type; | 1231 | int posix_lock_type; |
793 | if (lockType & LOCKING_ANDX_SHARED_LOCK) | 1232 | |
1233 | rc = cifs_posix_lock_set(file, flock); | ||
1234 | if (!rc || rc < 0) | ||
1235 | return rc; | ||
1236 | |||
1237 | if (type & LOCKING_ANDX_SHARED_LOCK) | ||
794 | posix_lock_type = CIFS_RDLCK; | 1238 | posix_lock_type = CIFS_RDLCK; |
795 | else | 1239 | else |
796 | posix_lock_type = CIFS_WRLCK; | 1240 | posix_lock_type = CIFS_WRLCK; |
797 | 1241 | ||
798 | if (numUnlock == 1) | 1242 | if (unlock == 1) |
799 | posix_lock_type = CIFS_UNLCK; | 1243 | posix_lock_type = CIFS_UNLCK; |
800 | 1244 | ||
801 | rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */, | 1245 | rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, |
802 | length, pfLock, posix_lock_type, | 1246 | 0 /* set */, length, flock, |
803 | wait_flag); | 1247 | posix_lock_type, wait_flag); |
804 | } else { | 1248 | goto out; |
805 | struct cifsFileInfo *fid = file->private_data; | 1249 | } |
806 | |||
807 | if (numLock) { | ||
808 | rc = CIFSSMBLock(xid, tcon, netfid, length, | ||
809 | pfLock->fl_start, 0, numLock, lockType, | ||
810 | wait_flag, 0); | ||
811 | 1250 | ||
812 | if (rc == 0) { | 1251 | if (lock) { |
813 | /* For Windows locks we must store them. */ | 1252 | rc = cifs_lock_add_if(cinode, flock->fl_start, length, |
814 | rc = store_file_lock(fid, length, | 1253 | type, netfid, wait_flag); |
815 | pfLock->fl_start, lockType); | 1254 | if (rc < 0) |
816 | } | 1255 | return rc; |
817 | } else if (numUnlock) { | 1256 | else if (!rc) |
818 | /* For each stored lock that this unlock overlaps | 1257 | goto out; |
819 | completely, unlock it. */ | ||
820 | int stored_rc = 0; | ||
821 | struct cifsLockInfo *li, *tmp; | ||
822 | 1258 | ||
823 | rc = 0; | 1259 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, |
824 | mutex_lock(&fid->lock_mutex); | 1260 | flock->fl_start, 0, 1, type, wait_flag, 0); |
825 | list_for_each_entry_safe(li, tmp, &fid->llist, llist) { | 1261 | if (rc == 0) { |
826 | if (pfLock->fl_start <= li->offset && | 1262 | /* For Windows locks we must store them. */ |
827 | (pfLock->fl_start + length) >= | 1263 | rc = cifs_lock_add(cinode, length, flock->fl_start, |
828 | (li->offset + li->length)) { | 1264 | type, netfid); |
829 | stored_rc = CIFSSMBLock(xid, tcon, | ||
830 | netfid, li->length, | ||
831 | li->offset, 1, 0, | ||
832 | li->type, false, 0); | ||
833 | if (stored_rc) | ||
834 | rc = stored_rc; | ||
835 | else { | ||
836 | list_del(&li->llist); | ||
837 | kfree(li); | ||
838 | } | ||
839 | } | ||
840 | } | ||
841 | mutex_unlock(&fid->lock_mutex); | ||
842 | } | 1265 | } |
1266 | } else if (unlock) | ||
1267 | rc = cifs_unlock_range(cfile, flock, xid); | ||
1268 | |||
1269 | out: | ||
1270 | if (flock->fl_flags & FL_POSIX) | ||
1271 | posix_lock_file_wait(file, flock); | ||
1272 | return rc; | ||
1273 | } | ||
1274 | |||
1275 | int cifs_lock(struct file *file, int cmd, struct file_lock *flock) | ||
1276 | { | ||
1277 | int rc, xid; | ||
1278 | int lock = 0, unlock = 0; | ||
1279 | bool wait_flag = false; | ||
1280 | bool posix_lck = false; | ||
1281 | struct cifs_sb_info *cifs_sb; | ||
1282 | struct cifs_tcon *tcon; | ||
1283 | struct cifsInodeInfo *cinode; | ||
1284 | struct cifsFileInfo *cfile; | ||
1285 | __u16 netfid; | ||
1286 | __u8 type; | ||
1287 | |||
1288 | rc = -EACCES; | ||
1289 | xid = GetXid(); | ||
1290 | |||
1291 | cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " | ||
1292 | "end: %lld", cmd, flock->fl_flags, flock->fl_type, | ||
1293 | flock->fl_start, flock->fl_end); | ||
1294 | |||
1295 | cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag); | ||
1296 | |||
1297 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | ||
1298 | cfile = (struct cifsFileInfo *)file->private_data; | ||
1299 | tcon = tlink_tcon(cfile->tlink); | ||
1300 | netfid = cfile->netfid; | ||
1301 | cinode = CIFS_I(file->f_path.dentry->d_inode); | ||
1302 | |||
1303 | if ((tcon->ses->capabilities & CAP_UNIX) && | ||
1304 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | ||
1305 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | ||
1306 | posix_lck = true; | ||
1307 | /* | ||
1308 | * BB add code here to normalize offset and length to account for | ||
1309 | * negative length, which we cannot accept over the wire. | ||
1310 | */ | ||
1311 | if (IS_GETLK(cmd)) { | ||
1312 | rc = cifs_getlk(file, flock, type, wait_flag, posix_lck, xid); | ||
1313 | FreeXid(xid); | ||
1314 | return rc; | ||
843 | } | 1315 | } |
844 | 1316 | ||
845 | if (pfLock->fl_flags & FL_POSIX) | 1317 | if (!lock && !unlock) { |
846 | posix_lock_file_wait(file, pfLock); | 1318 | /* |
1319 | * if no lock or unlock then nothing to do since we do not | ||
1320 | * know what it is | ||
1321 | */ | ||
1322 | FreeXid(xid); | ||
1323 | return -EOPNOTSUPP; | ||
1324 | } | ||
1325 | |||
1326 | rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, | ||
1327 | xid); | ||
847 | FreeXid(xid); | 1328 | FreeXid(xid); |
848 | return rc; | 1329 | return rc; |
849 | } | 1330 | } |
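End to end, a blocking byte-range lock from userspace enters cifs_lock() with FL_POSIX and FL_SLEEP set, so cifs_read_flock() reports wait_flag = true and cifs_setlk() takes either the POSIX or the mandatory path depending on the server's capabilities. A small hypothetical caller:

    #include <fcntl.h>
    #include <unistd.h>

    int lock_prefix(int fd)
    {
            struct flock fl = {
                    .l_type   = F_WRLCK,
                    .l_whence = SEEK_SET,
                    .l_start  = 0,
                    .l_len    = 4096,
            };

            if (fcntl(fd, F_SETLKW, &fl) == -1)     /* blocking set */
                    return -1;
            fl.l_type = F_UNLCK;
            return fcntl(fd, F_SETLK, &fl);         /* release */
    }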
@@ -1714,6 +2195,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, | |||
1714 | struct smb_com_read_rsp *pSMBr; | 2195 | struct smb_com_read_rsp *pSMBr; |
1715 | struct cifs_io_parms io_parms; | 2196 | struct cifs_io_parms io_parms; |
1716 | char *read_data; | 2197 | char *read_data; |
2198 | unsigned int rsize; | ||
1717 | __u32 pid; | 2199 | __u32 pid; |
1718 | 2200 | ||
1719 | if (!nr_segs) | 2201 | if (!nr_segs) |
@@ -1726,6 +2208,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, | |||
1726 | xid = GetXid(); | 2208 | xid = GetXid(); |
1727 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 2209 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); |
1728 | 2210 | ||
2211 | /* FIXME: set up handlers for larger reads and/or convert to async */ | ||
2212 | rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); | ||
2213 | |||
1729 | open_file = file->private_data; | 2214 | open_file = file->private_data; |
1730 | pTcon = tlink_tcon(open_file->tlink); | 2215 | pTcon = tlink_tcon(open_file->tlink); |
1731 | 2216 | ||
@@ -1738,7 +2223,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, | |||
1738 | cFYI(1, "attempting read on write only file instance"); | 2223 | cFYI(1, "attempting read on write only file instance"); |
1739 | 2224 | ||
1740 | for (total_read = 0; total_read < len; total_read += bytes_read) { | 2225 | for (total_read = 0; total_read < len; total_read += bytes_read) { |
1741 | cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); | 2226 | cur_len = min_t(const size_t, len - total_read, rsize); |
1742 | rc = -EAGAIN; | 2227 | rc = -EAGAIN; |
1743 | read_data = NULL; | 2228 | read_data = NULL; |
1744 | 2229 | ||
@@ -1830,6 +2315,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, | |||
1830 | unsigned int bytes_read = 0; | 2315 | unsigned int bytes_read = 0; |
1831 | unsigned int total_read; | 2316 | unsigned int total_read; |
1832 | unsigned int current_read_size; | 2317 | unsigned int current_read_size; |
2318 | unsigned int rsize; | ||
1833 | struct cifs_sb_info *cifs_sb; | 2319 | struct cifs_sb_info *cifs_sb; |
1834 | struct cifs_tcon *pTcon; | 2320 | struct cifs_tcon *pTcon; |
1835 | int xid; | 2321 | int xid; |
@@ -1842,6 +2328,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, | |||
1842 | xid = GetXid(); | 2328 | xid = GetXid(); |
1843 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 2329 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); |
1844 | 2330 | ||
2331 | /* FIXME: set up handlers for larger reads and/or convert to async */ | ||
2332 | rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); | ||
2333 | |||
1845 | if (file->private_data == NULL) { | 2334 | if (file->private_data == NULL) { |
1846 | rc = -EBADF; | 2335 | rc = -EBADF; |
1847 | FreeXid(xid); | 2336 | FreeXid(xid); |
@@ -1861,14 +2350,14 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, | |||
1861 | for (total_read = 0, current_offset = read_data; | 2350 | for (total_read = 0, current_offset = read_data; |
1862 | read_size > total_read; | 2351 | read_size > total_read; |
1863 | total_read += bytes_read, current_offset += bytes_read) { | 2352 | total_read += bytes_read, current_offset += bytes_read) { |
1864 | current_read_size = min_t(const int, read_size - total_read, | 2353 | current_read_size = min_t(uint, read_size - total_read, rsize); |
1865 | cifs_sb->rsize); | 2354 | |
1866 | /* For Windows ME and 9x we do not want to request more | 2355 | /* For Windows ME and 9x we do not want to request more |
1867 | than was negotiated, since the server will refuse the read */ | 2356 | than was negotiated, since the server will refuse the read */ |
1868 | if ((pTcon->ses) && | 2357 | if ((pTcon->ses) && |
1869 | !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { | 2358 | !(pTcon->ses->capabilities & CAP_LARGE_FILES)) { |
1870 | current_read_size = min_t(const int, current_read_size, | 2359 | current_read_size = min_t(uint, current_read_size, |
1871 | pTcon->ses->server->maxBuf - 128); | 2360 | CIFSMaxBufSize); |
1872 | } | 2361 | } |
1873 | rc = -EAGAIN; | 2362 | rc = -EAGAIN; |
1874 | while (rc == -EAGAIN) { | 2363 | while (rc == -EAGAIN) { |
@@ -1957,82 +2446,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
1957 | return rc; | 2446 | return rc; |
1958 | } | 2447 | } |
1959 | 2448 | ||
1960 | |||
1961 | static void cifs_copy_cache_pages(struct address_space *mapping, | ||
1962 | struct list_head *pages, int bytes_read, char *data) | ||
1963 | { | ||
1964 | struct page *page; | ||
1965 | char *target; | ||
1966 | |||
1967 | while (bytes_read > 0) { | ||
1968 | if (list_empty(pages)) | ||
1969 | break; | ||
1970 | |||
1971 | page = list_entry(pages->prev, struct page, lru); | ||
1972 | list_del(&page->lru); | ||
1973 | |||
1974 | if (add_to_page_cache_lru(page, mapping, page->index, | ||
1975 | GFP_KERNEL)) { | ||
1976 | page_cache_release(page); | ||
1977 | cFYI(1, "Add page cache failed"); | ||
1978 | data += PAGE_CACHE_SIZE; | ||
1979 | bytes_read -= PAGE_CACHE_SIZE; | ||
1980 | continue; | ||
1981 | } | ||
1982 | page_cache_release(page); | ||
1983 | |||
1984 | target = kmap_atomic(page, KM_USER0); | ||
1985 | |||
1986 | if (PAGE_CACHE_SIZE > bytes_read) { | ||
1987 | memcpy(target, data, bytes_read); | ||
1988 | /* zero the tail end of this partial page */ | ||
1989 | memset(target + bytes_read, 0, | ||
1990 | PAGE_CACHE_SIZE - bytes_read); | ||
1991 | bytes_read = 0; | ||
1992 | } else { | ||
1993 | memcpy(target, data, PAGE_CACHE_SIZE); | ||
1994 | bytes_read -= PAGE_CACHE_SIZE; | ||
1995 | } | ||
1996 | kunmap_atomic(target, KM_USER0); | ||
1997 | |||
1998 | flush_dcache_page(page); | ||
1999 | SetPageUptodate(page); | ||
2000 | unlock_page(page); | ||
2001 | data += PAGE_CACHE_SIZE; | ||
2002 | |||
2003 | /* add page to FS-Cache */ | ||
2004 | cifs_readpage_to_fscache(mapping->host, page); | ||
2005 | } | ||
2006 | return; | ||
2007 | } | ||
2008 | |||
2009 | static int cifs_readpages(struct file *file, struct address_space *mapping, | 2449 | static int cifs_readpages(struct file *file, struct address_space *mapping, |
2010 | struct list_head *page_list, unsigned num_pages) | 2450 | struct list_head *page_list, unsigned num_pages) |
2011 | { | 2451 | { |
2012 | int rc = -EACCES; | 2452 | int rc; |
2013 | int xid; | 2453 | struct list_head tmplist; |
2014 | loff_t offset; | 2454 | struct cifsFileInfo *open_file = file->private_data; |
2015 | struct page *page; | 2455 | struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); |
2016 | struct cifs_sb_info *cifs_sb; | 2456 | unsigned int rsize = cifs_sb->rsize; |
2017 | struct cifs_tcon *pTcon; | 2457 | pid_t pid; |
2018 | unsigned int bytes_read = 0; | ||
2019 | unsigned int read_size, i; | ||
2020 | char *smb_read_data = NULL; | ||
2021 | struct smb_com_read_rsp *pSMBr; | ||
2022 | struct cifsFileInfo *open_file; | ||
2023 | struct cifs_io_parms io_parms; | ||
2024 | int buf_type = CIFS_NO_BUFFER; | ||
2025 | __u32 pid; | ||
2026 | 2458 | ||
2027 | xid = GetXid(); | 2459 | /* |
2028 | if (file->private_data == NULL) { | 2460 | * Give up immediately if rsize is too small to read an entire page. |
2029 | rc = -EBADF; | 2461 | * The VFS will fall back to readpage. We should never reach this |
2030 | FreeXid(xid); | 2462 | * point however since we set ra_pages to 0 when the rsize is smaller |
2031 | return rc; | 2463 | * than a cache page. |
2032 | } | 2464 | */ |
2033 | open_file = file->private_data; | 2465 | if (unlikely(rsize < PAGE_CACHE_SIZE)) |
2034 | cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); | 2466 | return 0; |
2035 | pTcon = tlink_tcon(open_file->tlink); | ||
2036 | 2467 | ||
2037 | /* | 2468 | /* |
2038 | * Reads as many pages as possible from fscache. Returns -ENOBUFS | 2469 | * Reads as many pages as possible from fscache. Returns -ENOBUFS |
@@ -2041,125 +2472,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, | |||
2041 | rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, | 2472 | rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, |
2042 | &num_pages); | 2473 | &num_pages); |
2043 | if (rc == 0) | 2474 | if (rc == 0) |
2044 | goto read_complete; | 2475 | return rc; |
2045 | 2476 | ||
2046 | cFYI(DBG2, "rpages: num pages %d", num_pages); | ||
2047 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) | 2477 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) |
2048 | pid = open_file->pid; | 2478 | pid = open_file->pid; |
2049 | else | 2479 | else |
2050 | pid = current->tgid; | 2480 | pid = current->tgid; |
2051 | 2481 | ||
2052 | for (i = 0; i < num_pages; ) { | 2482 | rc = 0; |
2053 | unsigned contig_pages; | 2483 | INIT_LIST_HEAD(&tmplist); |
2054 | struct page *tmp_page; | ||
2055 | unsigned long expected_index; | ||
2056 | 2484 | ||
2057 | if (list_empty(page_list)) | 2485 | cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, |
2058 | break; | 2486 | mapping, num_pages); |
2487 | |||
2488 | /* | ||
2489 | * Start with the page at the end of the list and move it to a | ||
2490 | * private list. Do the same with any following pages until we hit | ||
2491 | * the rsize limit, hit an index discontinuity, or run out of | ||
2492 | * pages. Issue the async read and then start the loop again | ||
2493 | * until the list is empty. | ||
2494 | * | ||
2495 | * Note that list order is important. The page_list is in | ||
2496 | * the order of declining indexes. When we put the pages on | ||
2497 | * rdata->pages, we want them in increasing order. | ||
2498 | */ | ||
2499 | while (!list_empty(page_list)) { | ||
2500 | unsigned int bytes = PAGE_CACHE_SIZE; | ||
2501 | unsigned int expected_index; | ||
2502 | unsigned int nr_pages = 1; | ||
2503 | loff_t offset; | ||
2504 | struct page *page, *tpage; | ||
2505 | struct cifs_readdata *rdata; | ||
2059 | 2506 | ||
2060 | page = list_entry(page_list->prev, struct page, lru); | 2507 | page = list_entry(page_list->prev, struct page, lru); |
2508 | |||
2509 | /* | ||
2510 | * Lock the page and put it in the cache. Since no one else | ||
2511 | * should have access to this page, we're safe to simply set | ||
2512 | * PG_locked without checking it first. | ||
2513 | */ | ||
2514 | __set_page_locked(page); | ||
2515 | rc = add_to_page_cache_locked(page, mapping, | ||
2516 | page->index, GFP_KERNEL); | ||
2517 | |||
2518 | /* give up if we can't stick it in the cache */ | ||
2519 | if (rc) { | ||
2520 | __clear_page_locked(page); | ||
2521 | break; | ||
2522 | } | ||
2523 | |||
2524 | /* move first page to the tmplist */ | ||
2061 | offset = (loff_t)page->index << PAGE_CACHE_SHIFT; | 2525 | offset = (loff_t)page->index << PAGE_CACHE_SHIFT; |
2526 | list_move_tail(&page->lru, &tmplist); | ||
2062 | 2527 | ||
2063 | /* count adjacent pages that we will read into */ | 2528 | /* now try and add more pages onto the request */ |
2064 | contig_pages = 0; | 2529 | expected_index = page->index + 1; |
2065 | expected_index = | 2530 | list_for_each_entry_safe_reverse(page, tpage, page_list, lru) { |
2066 | list_entry(page_list->prev, struct page, lru)->index; | 2531 | /* discontinuity ? */ |
2067 | list_for_each_entry_reverse(tmp_page, page_list, lru) { | 2532 | if (page->index != expected_index) |
2068 | if (tmp_page->index == expected_index) { | ||
2069 | contig_pages++; | ||
2070 | expected_index++; | ||
2071 | } else | ||
2072 | break; | 2533 | break; |
2534 | |||
2535 | /* would this page push the read over the rsize? */ | ||
2536 | if (bytes + PAGE_CACHE_SIZE > rsize) | ||
2537 | break; | ||
2538 | |||
2539 | __set_page_locked(page); | ||
2540 | if (add_to_page_cache_locked(page, mapping, | ||
2541 | page->index, GFP_KERNEL)) { | ||
2542 | __clear_page_locked(page); | ||
2543 | break; | ||
2544 | } | ||
2545 | list_move_tail(&page->lru, &tmplist); | ||
2546 | bytes += PAGE_CACHE_SIZE; | ||
2547 | expected_index++; | ||
2548 | nr_pages++; | ||
2073 | } | 2549 | } |
2074 | if (contig_pages + i > num_pages) | 2550 | |
2075 | contig_pages = num_pages - i; | 2551 | rdata = cifs_readdata_alloc(nr_pages); |
2076 | 2552 | if (!rdata) { | |
2077 | /* for reads over a certain size could initiate async | 2553 | /* best to give up if we're out of mem */ |
2078 | read ahead */ | 2554 | list_for_each_entry_safe(page, tpage, &tmplist, lru) { |
2079 | 2555 | list_del(&page->lru); | |
2080 | read_size = contig_pages * PAGE_CACHE_SIZE; | 2556 | lru_cache_add_file(page); |
2081 | /* Read size needs to be in multiples of one page */ | 2557 | unlock_page(page); |
2082 | read_size = min_t(const unsigned int, read_size, | 2558 | page_cache_release(page); |
2083 | cifs_sb->rsize & PAGE_CACHE_MASK); | 2559 | } |
2084 | cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d", | 2560 | rc = -ENOMEM; |
2085 | read_size, contig_pages); | 2561 | break; |
2086 | rc = -EAGAIN; | 2562 | } |
2087 | while (rc == -EAGAIN) { | 2563 | |
2564 | spin_lock(&cifs_file_list_lock); | ||
2565 | cifsFileInfo_get(open_file); | ||
2566 | spin_unlock(&cifs_file_list_lock); | ||
2567 | rdata->cfile = open_file; | ||
2568 | rdata->mapping = mapping; | ||
2569 | rdata->offset = offset; | ||
2570 | rdata->bytes = bytes; | ||
2571 | rdata->pid = pid; | ||
2572 | list_splice_init(&tmplist, &rdata->pages); | ||
2573 | |||
2574 | do { | ||
2088 | if (open_file->invalidHandle) { | 2575 | if (open_file->invalidHandle) { |
2089 | rc = cifs_reopen_file(open_file, true); | 2576 | rc = cifs_reopen_file(open_file, true); |
2090 | if (rc != 0) | 2577 | if (rc != 0) |
2091 | break; | 2578 | continue; |
2092 | } | 2579 | } |
2093 | io_parms.netfid = open_file->netfid; | 2580 | rc = cifs_async_readv(rdata); |
2094 | io_parms.pid = pid; | 2581 | } while (rc == -EAGAIN); |
2095 | io_parms.tcon = pTcon; | ||
2096 | io_parms.offset = offset; | ||
2097 | io_parms.length = read_size; | ||
2098 | rc = CIFSSMBRead(xid, &io_parms, &bytes_read, | ||
2099 | &smb_read_data, &buf_type); | ||
2100 | /* BB more RC checks ? */ | ||
2101 | if (rc == -EAGAIN) { | ||
2102 | if (smb_read_data) { | ||
2103 | if (buf_type == CIFS_SMALL_BUFFER) | ||
2104 | cifs_small_buf_release(smb_read_data); | ||
2105 | else if (buf_type == CIFS_LARGE_BUFFER) | ||
2106 | cifs_buf_release(smb_read_data); | ||
2107 | smb_read_data = NULL; | ||
2108 | } | ||
2109 | } | ||
2110 | } | ||
2111 | if ((rc < 0) || (smb_read_data == NULL)) { | ||
2112 | cFYI(1, "Read error in readpages: %d", rc); | ||
2113 | break; | ||
2114 | } else if (bytes_read > 0) { | ||
2115 | task_io_account_read(bytes_read); | ||
2116 | pSMBr = (struct smb_com_read_rsp *)smb_read_data; | ||
2117 | cifs_copy_cache_pages(mapping, page_list, bytes_read, | ||
2118 | smb_read_data + 4 /* RFC1001 hdr */ + | ||
2119 | le16_to_cpu(pSMBr->DataOffset)); | ||
2120 | |||
2121 | i += bytes_read >> PAGE_CACHE_SHIFT; | ||
2122 | cifs_stats_bytes_read(pTcon, bytes_read); | ||
2123 | if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) { | ||
2124 | i++; /* account for partial page */ | ||
2125 | |||
2126 | /* server copy of file can have smaller size | ||
2127 | than client */ | ||
2128 | /* BB do we need to verify this common case ? | ||
2129 | this case is ok - if we are at server EOF | ||
2130 | we will hit it on next read */ | ||
2131 | 2582 | ||
2132 | /* break; */ | 2583 | if (rc != 0) { |
2584 | list_for_each_entry_safe(page, tpage, &rdata->pages, | ||
2585 | lru) { | ||
2586 | list_del(&page->lru); | ||
2587 | lru_cache_add_file(page); | ||
2588 | unlock_page(page); | ||
2589 | page_cache_release(page); | ||
2133 | } | 2590 | } |
2134 | } else { | 2591 | cifs_readdata_free(rdata); |
2135 | cFYI(1, "No bytes read (%d) at offset %lld . " | ||
2136 | "Cleaning remaining pages from readahead list", | ||
2137 | bytes_read, offset); | ||
2138 | /* BB turn off caching and do new lookup on | ||
2139 | file size at server? */ | ||
2140 | break; | 2592 | break; |
2141 | } | 2593 | } |
2142 | if (smb_read_data) { | ||
2143 | if (buf_type == CIFS_SMALL_BUFFER) | ||
2144 | cifs_small_buf_release(smb_read_data); | ||
2145 | else if (buf_type == CIFS_LARGE_BUFFER) | ||
2146 | cifs_buf_release(smb_read_data); | ||
2147 | smb_read_data = NULL; | ||
2148 | } | ||
2149 | bytes_read = 0; | ||
2150 | } | 2594 | } |
2151 | 2595 | ||
2152 | /* need to free smb_read_data buf before exit */ | ||
2153 | if (smb_read_data) { | ||
2154 | if (buf_type == CIFS_SMALL_BUFFER) | ||
2155 | cifs_small_buf_release(smb_read_data); | ||
2156 | else if (buf_type == CIFS_LARGE_BUFFER) | ||
2157 | cifs_buf_release(smb_read_data); | ||
2158 | smb_read_data = NULL; | ||
2159 | } | ||
2160 | |||
2161 | read_complete: | ||
2162 | FreeXid(xid); | ||
2163 | return rc; | 2596 | return rc; |
2164 | } | 2597 | } |
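The rewritten cifs_readpages() batches contiguous pages until the next page would push the request past rsize, then issues one cifs_async_readv() per batch. The batching arithmetic on its own, with assumed values (4096-byte cache pages and an rsize of 16384; neither value comes from the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned int rsize = 16384, page_size = 4096;
            unsigned int bytes = page_size, nr_pages = 1;

            /* mirrors the "would this page push the read over the
             * rsize?" check in the loop above */
            while (bytes + page_size <= rsize) {
                    bytes += page_size;
                    nr_pages++;
            }
            printf("%u pages (%u bytes) per async read\n", nr_pages, bytes);
            return 0;       /* prints: 4 pages (16384 bytes) per async read */
    }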
2165 | 2598 | ||
@@ -2408,6 +2841,10 @@ void cifs_oplock_break(struct work_struct *work) | |||
2408 | cFYI(1, "Oplock flush inode %p rc %d", inode, rc); | 2841 | cFYI(1, "Oplock flush inode %p rc %d", inode, rc); |
2409 | } | 2842 | } |
2410 | 2843 | ||
2844 | rc = cifs_push_locks(cfile); | ||
2845 | if (rc) | ||
2846 | cERROR(1, "Push locks rc = %d", rc); | ||
2847 | |||
2411 | /* | 2848 | /* |
2412 | * releasing stale oplock after recent reconnect of smb session using | 2849 | * releasing stale oplock after recent reconnect of smb session using |
2413 | * a now incorrect file handle is not a data integrity issue but do | 2850 | * a now incorrect file handle is not a data integrity issue but do |
@@ -2415,8 +2852,9 @@ void cifs_oplock_break(struct work_struct *work) | |||
2415 | * disconnected since oplock already released by the server | 2852 | * disconnected since oplock already released by the server |
2416 | */ | 2853 | */ |
2417 | if (!cfile->oplock_break_cancelled) { | 2854 | if (!cfile->oplock_break_cancelled) { |
2418 | rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0, | 2855 | rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, |
2419 | 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, | 2856 | current->tgid, 0, 0, 0, 0, |
2857 | LOCKING_ANDX_OPLOCK_RELEASE, false, | ||
2420 | cinode->clientCanCacheRead ? 1 : 0); | 2858 | cinode->clientCanCacheRead ? 1 : 0); |
2421 | cFYI(1, "Oplock release rc = %d", rc); | 2859 | cFYI(1, "Oplock release rc = %d", rc); |
2422 | } | 2860 | } |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a7b2dcd4a53e..2c50bd2f65d1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -562,7 +562,16 @@ int cifs_get_file_info(struct file *filp) | |||
562 | 562 | ||
563 | xid = GetXid(); | 563 | xid = GetXid(); |
564 | rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); | 564 | rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); |
565 | if (rc == -EOPNOTSUPP || rc == -EINVAL) { | 565 | switch (rc) { |
566 | case 0: | ||
567 | cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); | ||
568 | break; | ||
569 | case -EREMOTE: | ||
570 | cifs_create_dfs_fattr(&fattr, inode->i_sb); | ||
571 | rc = 0; | ||
572 | break; | ||
573 | case -EOPNOTSUPP: | ||
574 | case -EINVAL: | ||
566 | /* | 575 | /* |
567 | * FIXME: legacy server -- fall back to path-based call? | 576 | * FIXME: legacy server -- fall back to path-based call? |
568 | * for now, just skip revalidating and mark inode for | 577 | * for now, just skip revalidating and mark inode for |
@@ -570,18 +579,14 @@ int cifs_get_file_info(struct file *filp) | |||
570 | */ | 579 | */ |
571 | rc = 0; | 580 | rc = 0; |
572 | CIFS_I(inode)->time = 0; | 581 | CIFS_I(inode)->time = 0; |
582 | default: | ||
573 | goto cgfi_exit; | 583 | goto cgfi_exit; |
574 | } else if (rc == -EREMOTE) { | 584 | } |
575 | cifs_create_dfs_fattr(&fattr, inode->i_sb); | ||
576 | rc = 0; | ||
577 | } else if (rc) | ||
578 | goto cgfi_exit; | ||
579 | 585 | ||
580 | /* | 586 | /* |
581 | * don't bother with SFU junk here -- just mark inode as needing | 587 | * don't bother with SFU junk here -- just mark inode as needing |
582 | * revalidation. | 588 | * revalidation. |
583 | */ | 589 | */ |
584 | cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false); | ||
585 | fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; | 590 | fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; |
586 | fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; | 591 | fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; |
587 | cifs_fattr_to_inode(inode, &fattr); | 592 | cifs_fattr_to_inode(inode, &fattr); |
@@ -2096,6 +2101,8 @@ static int | |||
2096 | cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) | 2101 | cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) |
2097 | { | 2102 | { |
2098 | int xid; | 2103 | int xid; |
2104 | uid_t uid = NO_CHANGE_32; | ||
2105 | gid_t gid = NO_CHANGE_32; | ||
2099 | struct inode *inode = direntry->d_inode; | 2106 | struct inode *inode = direntry->d_inode; |
2100 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | 2107 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); |
2101 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); | 2108 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); |
@@ -2146,13 +2153,25 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) | |||
2146 | goto cifs_setattr_exit; | 2153 | goto cifs_setattr_exit; |
2147 | } | 2154 | } |
2148 | 2155 | ||
2149 | /* | 2156 | if (attrs->ia_valid & ATTR_UID) |
2150 | * Without unix extensions we can't send ownership changes to the | 2157 | uid = attrs->ia_uid; |
2151 | * server, so silently ignore them. This is consistent with how | 2158 | |
2152 | * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With | 2159 | if (attrs->ia_valid & ATTR_GID) |
2153 | * CIFSACL support + proper Windows to Unix idmapping, we may be | 2160 | gid = attrs->ia_gid; |
2154 | * able to support this in the future. | 2161 | |
2155 | */ | 2162 | #ifdef CONFIG_CIFS_ACL |
2163 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { | ||
2164 | if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) { | ||
2165 | rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, | ||
2166 | uid, gid); | ||
2167 | if (rc) { | ||
2168 | cFYI(1, "%s: Setting id failed with error: %d", | ||
2169 | __func__, rc); | ||
2170 | goto cifs_setattr_exit; | ||
2171 | } | ||
2172 | } | ||
2173 | } else | ||
2174 | #endif /* CONFIG_CIFS_ACL */ | ||
2156 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) | 2175 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) |
2157 | attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); | 2176 | attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); |
2158 | 2177 | ||
@@ -2161,15 +2180,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) | |||
2161 | attrs->ia_valid &= ~ATTR_MODE; | 2180 | attrs->ia_valid &= ~ATTR_MODE; |
2162 | 2181 | ||
2163 | if (attrs->ia_valid & ATTR_MODE) { | 2182 | if (attrs->ia_valid & ATTR_MODE) { |
2164 | cFYI(1, "Mode changed to 0%o", attrs->ia_mode); | ||
2165 | mode = attrs->ia_mode; | 2183 | mode = attrs->ia_mode; |
2166 | } | ||
2167 | |||
2168 | if (attrs->ia_valid & ATTR_MODE) { | ||
2169 | rc = 0; | 2184 | rc = 0; |
2170 | #ifdef CONFIG_CIFS_ACL | 2185 | #ifdef CONFIG_CIFS_ACL |
2171 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { | 2186 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { |
2172 | rc = mode_to_cifs_acl(inode, full_path, mode); | 2187 | rc = id_mode_to_cifs_acl(inode, full_path, mode, |
2188 | NO_CHANGE_32, NO_CHANGE_32); | ||
2173 | if (rc) { | 2189 | if (rc) { |
2174 | cFYI(1, "%s: Setting ACL failed with error: %d", | 2190 | cFYI(1, "%s: Setting ACL failed with error: %d", |
2175 | __func__, rc); | 2191 | __func__, rc); |
diff --git a/fs/cifs/link.c b/fs/cifs/link.c index db3f18cdf024..8693b5d0e180 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c | |||
@@ -183,14 +183,20 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) | |||
183 | static int | 183 | static int |
184 | CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, | 184 | CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, |
185 | const char *fromName, const char *toName, | 185 | const char *fromName, const char *toName, |
186 | const struct nls_table *nls_codepage, int remap) | 186 | struct cifs_sb_info *cifs_sb) |
187 | { | 187 | { |
188 | int rc; | 188 | int rc; |
189 | int oplock = 0; | 189 | int oplock = 0; |
190 | int remap; | ||
191 | int create_options = CREATE_NOT_DIR; | ||
190 | __u16 netfid = 0; | 192 | __u16 netfid = 0; |
191 | u8 *buf; | 193 | u8 *buf; |
192 | unsigned int bytes_written = 0; | 194 | unsigned int bytes_written = 0; |
193 | struct cifs_io_parms io_parms; | 195 | struct cifs_io_parms io_parms; |
196 | struct nls_table *nls_codepage; | ||
197 | |||
198 | nls_codepage = cifs_sb->local_nls; | ||
199 | remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; | ||
194 | 200 | ||
195 | buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); | 201 | buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); |
196 | if (!buf) | 202 | if (!buf) |
@@ -202,8 +208,11 @@ CIFSCreateMFSymLink(const int xid, struct cifs_tcon *tcon, | |||
202 | return rc; | 208 | return rc; |
203 | } | 209 | } |
204 | 210 | ||
211 | if (backup_cred(cifs_sb)) | ||
212 | create_options |= CREATE_OPEN_BACKUP_INTENT; | ||
213 | |||
205 | rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, | 214 | rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE, |
206 | CREATE_NOT_DIR, &netfid, &oplock, NULL, | 215 | create_options, &netfid, &oplock, NULL, |
207 | nls_codepage, remap); | 216 | nls_codepage, remap); |
208 | if (rc != 0) { | 217 | if (rc != 0) { |
209 | kfree(buf); | 218 | kfree(buf); |
@@ -559,9 +568,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) | |||
559 | /* BB what if DFS and this volume is on different share? BB */ | 568 | /* BB what if DFS and this volume is on different share? BB */ |
560 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) | 569 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) |
561 | rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, | 570 | rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, |
562 | cifs_sb->local_nls, | 571 | cifs_sb); |
563 | cifs_sb->mnt_cifs_flags & | ||
564 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
565 | else if (pTcon->unix_ext) | 572 | else if (pTcon->unix_ext) |
566 | rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, | 573 | rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, |
567 | cifs_sb->local_nls); | 574 | cifs_sb->local_nls); |
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7c1693392598..703ef5c6fdb1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c | |||
@@ -420,19 +420,22 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) | |||
420 | } | 420 | } |
421 | 421 | ||
422 | int | 422 | int |
423 | checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) | 423 | checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int total_read) |
424 | { | 424 | { |
425 | __u32 len = be32_to_cpu(smb->smb_buf_length); | 425 | __u32 rfclen = be32_to_cpu(smb->smb_buf_length); |
426 | __u32 clc_len; /* calculated length */ | 426 | __u32 clc_len; /* calculated length */ |
427 | cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); | 427 | cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", |
428 | total_read, rfclen); | ||
428 | 429 | ||
429 | if (length < 2 + sizeof(struct smb_hdr)) { | 430 | /* is this frame too small to even get to a BCC? */ |
430 | if ((length >= sizeof(struct smb_hdr) - 1) | 431 | if (total_read < 2 + sizeof(struct smb_hdr)) { |
432 | if ((total_read >= sizeof(struct smb_hdr) - 1) | ||
431 | && (smb->Status.CifsError != 0)) { | 433 | && (smb->Status.CifsError != 0)) { |
434 | /* it's an error return */ | ||
432 | smb->WordCount = 0; | 435 | smb->WordCount = 0; |
433 | /* some error cases do not return wct and bcc */ | 436 | /* some error cases do not return wct and bcc */ |
434 | return 0; | 437 | return 0; |
435 | } else if ((length == sizeof(struct smb_hdr) + 1) && | 438 | } else if ((total_read == sizeof(struct smb_hdr) + 1) && |
436 | (smb->WordCount == 0)) { | 439 | (smb->WordCount == 0)) { |
437 | char *tmp = (char *)smb; | 440 | char *tmp = (char *)smb; |
438 | /* Need to work around a bug in two servers here */ | 441 | /* Need to work around a bug in two servers here */ |
@@ -452,39 +455,35 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) | |||
452 | } else { | 455 | } else { |
453 | cERROR(1, "Length less than smb header size"); | 456 | cERROR(1, "Length less than smb header size"); |
454 | } | 457 | } |
455 | return 1; | 458 | return -EIO; |
456 | } | ||
457 | if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { | ||
458 | cERROR(1, "smb length greater than MaxBufSize, mid=%d", | ||
459 | smb->Mid); | ||
460 | return 1; | ||
461 | } | 459 | } |
462 | 460 | ||
461 | /* otherwise, there is enough to get to the BCC */ | ||
463 | if (check_smb_hdr(smb, mid)) | 462 | if (check_smb_hdr(smb, mid)) |
464 | return 1; | 463 | return -EIO; |
465 | clc_len = smbCalcSize(smb); | 464 | clc_len = smbCalcSize(smb); |
466 | 465 | ||
467 | if (4 + len != length) { | 466 | if (4 + rfclen != total_read) { |
468 | cERROR(1, "Length read does not match RFC1001 length %d", | 467 | cERROR(1, "Length read does not match RFC1001 length %d", |
469 | len); | 468 | rfclen); |
470 | return 1; | 469 | return -EIO; |
471 | } | 470 | } |
472 | 471 | ||
473 | if (4 + len != clc_len) { | 472 | if (4 + rfclen != clc_len) { |
474 | /* check if bcc wrapped around for large read responses */ | 473 | /* check if bcc wrapped around for large read responses */ |
475 | if ((len > 64 * 1024) && (len > clc_len)) { | 474 | if ((rfclen > 64 * 1024) && (rfclen > clc_len)) { |
476 | /* check if lengths match mod 64K */ | 475 | /* check if lengths match mod 64K */ |
477 | if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) | 476 | if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) |
478 | return 0; /* bcc wrapped */ | 477 | return 0; /* bcc wrapped */ |
479 | } | 478 | } |
480 | cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", | 479 | cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", |
481 | clc_len, 4 + len, smb->Mid); | 480 | clc_len, 4 + rfclen, smb->Mid); |
482 | 481 | ||
483 | if (4 + len < clc_len) { | 482 | if (4 + rfclen < clc_len) { |
484 | cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", | 483 | cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", |
485 | len, smb->Mid); | 484 | rfclen, smb->Mid); |
486 | return 1; | 485 | return -EIO; |
487 | } else if (len > clc_len + 512) { | 486 | } else if (rfclen > clc_len + 512) { |
488 | /* | 487 | /* |
489 | * Some servers (Windows XP in particular) send more | 488 | * Some servers (Windows XP in particular) send more |
490 | * data than the lengths in the SMB packet would | 489 | * data than the lengths in the SMB packet would |
@@ -495,8 +494,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) | |||
495 | * data to 512 bytes. | 494 | * data to 512 bytes. |
496 | */ | 495 | */ |
497 | cERROR(1, "RFC1001 size %u more than 512 bytes larger " | 496 | cERROR(1, "RFC1001 size %u more than 512 bytes larger " |
498 | "than SMB for mid=%u", len, smb->Mid); | 497 | "than SMB for mid=%u", rfclen, smb->Mid); |
499 | return 1; | 498 | return -EIO; |
500 | } | 499 | } |
501 | } | 500 | } |
502 | return 0; | 501 | return 0; |
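The mod-64K comparison above accepts large read responses whose 16-bit byte-count (BCC) field wrapped. A standalone check with assumed example values (an RFC1001 length just past 64KiB):

    #include <stdio.h>

    int main(void)
    {
            unsigned int rfclen = 65600, clc_len = 68;      /* assumed */

            /* (4 + 65600) mod 65536 == 68 == clc_len mod 65536 */
            if (rfclen > 64 * 1024 && rfclen > clc_len &&
                ((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
                    printf("bcc wrapped, frame accepted\n");  /* taken */
            return 0;
    }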
@@ -676,3 +675,18 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) | |||
676 | cinode->clientCanCacheRead = false; | 675 | cinode->clientCanCacheRead = false; |
677 | } | 676 | } |
678 | } | 677 | } |
678 | |||
679 | bool | ||
680 | backup_cred(struct cifs_sb_info *cifs_sb) | ||
681 | { | ||
682 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) { | ||
683 | if (cifs_sb->mnt_backupuid == current_fsuid()) | ||
684 | return true; | ||
685 | } | ||
686 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) { | ||
687 | if (in_group_p(cifs_sb->mnt_backupgid)) | ||
688 | return true; | ||
689 | } | ||
690 | |||
691 | return false; | ||
692 | } | ||
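backup_cred() gates CREATE_OPEN_BACKUP_INTENT on the backupuid/backupgid mount options: only the configured uid (or a member of the configured gid) gets the flag. The typical call site looks like the one added to CIFSCreateMFSymLink() in the link.c hunk above:

    int create_options = CREATE_NOT_DIR;

    if (backup_cred(cifs_sb))
            create_options |= CREATE_OPEN_BACKUP_INTENT;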
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d3e619692ee0..c7d80e24f24e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c | |||
@@ -124,7 +124,9 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) | |||
124 | /* that we use in next few lines */ | 124 | /* that we use in next few lines */ |
125 | /* Note that header is initialized to zero in header_assemble */ | 125 | /* Note that header is initialized to zero in header_assemble */ |
126 | pSMB->req.AndXCommand = 0xFF; | 126 | pSMB->req.AndXCommand = 0xFF; |
127 | pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); | 127 | pSMB->req.MaxBufferSize = cpu_to_le16(min_t(u32, |
128 | CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, | ||
129 | USHRT_MAX)); | ||
128 | pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); | 130 | pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); |
129 | pSMB->req.VcNumber = get_next_vcnum(ses); | 131 | pSMB->req.VcNumber = get_next_vcnum(ses); |
130 | 132 | ||
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 42b9fff48751..ac1221d969d6 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c | |||
@@ -265,91 +265,6 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16) | |||
265 | return rc; | 265 | return rc; |
266 | } | 266 | } |
267 | 267 | ||
268 | #if 0 /* currently unused */ | ||
269 | /* Does both the NT and LM owfs of a user's password */ | ||
270 | static void | ||
271 | nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]) | ||
272 | { | ||
273 | char passwd[514]; | ||
274 | |||
275 | memset(passwd, '\0', 514); | ||
276 | if (strlen(pwd) < 513) | ||
277 | strcpy(passwd, pwd); | ||
278 | else | ||
279 | memcpy(passwd, pwd, 512); | ||
280 | /* Calculate the MD4 hash (NT compatible) of the password */ | ||
281 | memset(nt_p16, '\0', 16); | ||
282 | E_md4hash(passwd, nt_p16); | ||
283 | |||
284 | /* Mangle the passwords into Lanman format */ | ||
285 | passwd[14] = '\0'; | ||
286 | /* strupper(passwd); */ | ||
287 | |||
288 | /* Calculate the SMB (lanman) hash functions of the password */ | ||
289 | |||
290 | memset(p16, '\0', 16); | ||
291 | E_P16((unsigned char *) passwd, (unsigned char *) p16); | ||
292 | |||
293 | /* clear out local copy of user's password (just being paranoid). */ | ||
294 | memset(passwd, '\0', sizeof(passwd)); | ||
295 | } | ||
296 | #endif | ||
297 | |||
298 | /* Does the NTLMv2 owfs of a user's password */ | ||
299 | #if 0 /* function not needed yet - but will be soon */ | ||
300 | static void | ||
301 | ntv2_owf_gen(const unsigned char owf[16], const char *user_n, | ||
302 | const char *domain_n, unsigned char kr_buf[16], | ||
303 | const struct nls_table *nls_codepage) | ||
304 | { | ||
305 | wchar_t *user_u; | ||
306 | wchar_t *dom_u; | ||
307 | int user_l, domain_l; | ||
308 | struct HMACMD5Context ctx; | ||
309 | |||
310 | /* might as well do one alloc to hold both (user_u and dom_u) */ | ||
311 | user_u = kmalloc(2048 * sizeof(wchar_t), GFP_KERNEL); | ||
312 | if (user_u == NULL) | ||
313 | return; | ||
314 | dom_u = user_u + 1024; | ||
315 | |||
316 | /* push_ucs2(NULL, user_u, user_n, (user_l+1)*2, | ||
317 | STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); | ||
318 | push_ucs2(NULL, dom_u, domain_n, (domain_l+1)*2, | ||
319 | STR_UNICODE|STR_NOALIGN|STR_TERMINATE|STR_UPPER); */ | ||
320 | |||
321 | /* BB user and domain may need to be uppercased */ | ||
322 | user_l = cifs_strtoUCS(user_u, user_n, 511, nls_codepage); | ||
323 | domain_l = cifs_strtoUCS(dom_u, domain_n, 511, nls_codepage); | ||
324 | |||
325 | user_l++; /* trailing null */ | ||
326 | domain_l++; | ||
327 | |||
328 | hmac_md5_init_limK_to_64(owf, 16, &ctx); | ||
329 | hmac_md5_update((const unsigned char *) user_u, user_l * 2, &ctx); | ||
330 | hmac_md5_update((const unsigned char *) dom_u, domain_l * 2, &ctx); | ||
331 | hmac_md5_final(kr_buf, &ctx); | ||
332 | |||
333 | kfree(user_u); | ||
334 | } | ||
335 | #endif | ||
336 | |||
337 | /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */ | ||
338 | #if 0 /* currently unused */ | ||
339 | static void | ||
340 | NTLMSSPOWFencrypt(unsigned char passwd[8], | ||
341 | unsigned char *ntlmchalresp, unsigned char p24[24]) | ||
342 | { | ||
343 | unsigned char p21[21]; | ||
344 | |||
345 | memset(p21, '\0', 21); | ||
346 | memcpy(p21, passwd, 8); | ||
347 | memset(p21 + 8, 0xbd, 8); | ||
348 | |||
349 | E_P24(p21, ntlmchalresp, p24); | ||
350 | } | ||
351 | #endif | ||
352 | |||
353 | /* Does the NT MD4 hash then des encryption. */ | 268 | /* Does the NT MD4 hash then des encryption. */ |
354 | int | 269 | int |
355 | SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) | 270 | SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) |
@@ -369,39 +284,3 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) | |||
369 | rc = E_P24(p21, c8, p24); | 284 | rc = E_P24(p21, c8, p24); |
370 | return rc; | 285 | return rc; |
371 | } | 286 | } |
372 | |||
373 | |||
374 | /* Does the md5 encryption from the NT hash for NTLMv2. */ | ||
375 | /* These routines will be needed later */ | ||
376 | #if 0 | ||
377 | static void | ||
378 | SMBOWFencrypt_ntv2(const unsigned char kr[16], | ||
379 | const struct data_blob *srv_chal, | ||
380 | const struct data_blob *cli_chal, unsigned char resp_buf[16]) | ||
381 | { | ||
382 | struct HMACMD5Context ctx; | ||
383 | |||
384 | hmac_md5_init_limK_to_64(kr, 16, &ctx); | ||
385 | hmac_md5_update(srv_chal->data, srv_chal->length, &ctx); | ||
386 | hmac_md5_update(cli_chal->data, cli_chal->length, &ctx); | ||
387 | hmac_md5_final(resp_buf, &ctx); | ||
388 | } | ||
389 | |||
390 | static void | ||
391 | SMBsesskeygen_ntv2(const unsigned char kr[16], | ||
392 | const unsigned char *nt_resp, __u8 sess_key[16]) | ||
393 | { | ||
394 | struct HMACMD5Context ctx; | ||
395 | |||
396 | hmac_md5_init_limK_to_64(kr, 16, &ctx); | ||
397 | hmac_md5_update(nt_resp, 16, &ctx); | ||
398 | hmac_md5_final((unsigned char *) sess_key, &ctx); | ||
399 | } | ||
400 | |||
401 | static void | ||
402 | SMBsesskeygen_ntv1(const unsigned char kr[16], | ||
403 | const unsigned char *nt_resp, __u8 sess_key[16]) | ||
404 | { | ||
405 | mdfour((unsigned char *) sess_key, (unsigned char *) kr, 16); | ||
406 | } | ||
407 | #endif | ||
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1b9c4b10739..0cc9584f5889 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/wait.h> | 26 | #include <linux/wait.h> |
27 | #include <linux/net.h> | 27 | #include <linux/net.h> |
28 | #include <linux/delay.h> | 28 | #include <linux/delay.h> |
29 | #include <linux/freezer.h> | ||
29 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
30 | #include <asm/processor.h> | 31 | #include <asm/processor.h> |
31 | #include <linux/mempool.h> | 32 | #include <linux/mempool.h> |
@@ -266,15 +267,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server, | |||
266 | while (1) { | 267 | while (1) { |
267 | if (atomic_read(&server->inFlight) >= cifs_max_pending) { | 268 | if (atomic_read(&server->inFlight) >= cifs_max_pending) { |
268 | spin_unlock(&GlobalMid_Lock); | 269 | spin_unlock(&GlobalMid_Lock); |
269 | #ifdef CONFIG_CIFS_STATS2 | 270 | cifs_num_waiters_inc(server); |
270 | atomic_inc(&server->num_waiters); | ||
271 | #endif | ||
272 | wait_event(server->request_q, | 271 | wait_event(server->request_q, |
273 | atomic_read(&server->inFlight) | 272 | atomic_read(&server->inFlight) |
274 | < cifs_max_pending); | 273 | < cifs_max_pending); |
275 | #ifdef CONFIG_CIFS_STATS2 | 274 | cifs_num_waiters_dec(server); |
276 | atomic_dec(&server->num_waiters); | ||
277 | #endif | ||
278 | spin_lock(&GlobalMid_Lock); | 275 | spin_lock(&GlobalMid_Lock); |
279 | } else { | 276 | } else { |
280 | if (server->tcpStatus == CifsExiting) { | 277 | if (server->tcpStatus == CifsExiting) { |
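The wait_for_free_request() hunk replaces open-coded #ifdef CONFIG_CIFS_STATS2 blocks with cifs_num_waiters_inc()/cifs_num_waiters_dec(), keeping the preprocessor noise out of the caller. The helpers themselves are defined elsewhere in the series; a plausible definition, matching how such wrappers are usually written so they compile away when stats are off:

/* Hypothetical placement in cifsglob.h; names as used above. */
#ifdef CONFIG_CIFS_STATS2
static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
{
	atomic_inc(&server->num_waiters);
}

static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
{
	atomic_dec(&server->num_waiters);
}
#else
static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) {}
static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) {}
#endif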
@@ -328,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) | |||
328 | { | 325 | { |
329 | int error; | 326 | int error; |
330 | 327 | ||
331 | error = wait_event_killable(server->response_q, | 328 | error = wait_event_freezekillable(server->response_q, |
332 | midQ->midState != MID_REQUEST_SUBMITTED); | 329 | midQ->midState != MID_REQUEST_SUBMITTED); |
333 | if (error < 0) | 330 | if (error < 0) |
334 | return -ERESTARTSYS; | 331 | return -ERESTARTSYS; |
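Switching to wait_event_freezekillable() (hence the new linux/freezer.h include) lets a task blocked on an SMB response be frozen for suspend/hibernate instead of holding up the freezer. The macro is roughly equivalent to the following, sketched from its intent rather than quoted from freezer.h:

/* Approximation only: wake for the condition or a freeze request,
 * freeze if asked, then retry the wait afterwards. */
#define wait_event_freezekillable(wq, condition)			\
({									\
	int __ret;							\
	do {								\
		__ret = wait_event_killable(wq,				\
				(condition) || freezing(current));	\
		if (__ret || (condition))				\
			break;						\
	} while (try_to_freeze());					\
	__ret;								\
})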
@@ -343,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) | |||
343 | */ | 340 | */ |
344 | int | 341 | int |
345 | cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, | 342 | cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, |
346 | unsigned int nvec, mid_callback_t *callback, void *cbdata, | 343 | unsigned int nvec, mid_receive_t *receive, |
347 | bool ignore_pend) | 344 | mid_callback_t *callback, void *cbdata, bool ignore_pend) |
348 | { | 345 | { |
349 | int rc; | 346 | int rc; |
350 | struct mid_q_entry *mid; | 347 | struct mid_q_entry *mid; |
@@ -378,18 +375,17 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, | |||
378 | goto out_err; | 375 | goto out_err; |
379 | } | 376 | } |
380 | 377 | ||
378 | mid->receive = receive; | ||
381 | mid->callback = callback; | 379 | mid->callback = callback; |
382 | mid->callback_data = cbdata; | 380 | mid->callback_data = cbdata; |
383 | mid->midState = MID_REQUEST_SUBMITTED; | 381 | mid->midState = MID_REQUEST_SUBMITTED; |
384 | #ifdef CONFIG_CIFS_STATS2 | 382 | |
385 | atomic_inc(&server->inSend); | 383 | cifs_in_send_inc(server); |
386 | #endif | ||
387 | rc = smb_sendv(server, iov, nvec); | 384 | rc = smb_sendv(server, iov, nvec); |
388 | #ifdef CONFIG_CIFS_STATS2 | 385 | cifs_in_send_dec(server); |
389 | atomic_dec(&server->inSend); | 386 | cifs_save_when_sent(mid); |
390 | mid->when_sent = jiffies; | ||
391 | #endif | ||
392 | mutex_unlock(&server->srv_mutex); | 387 | mutex_unlock(&server->srv_mutex); |
388 | |||
393 | if (rc) | 389 | if (rc) |
394 | goto out_err; | 390 | goto out_err; |
395 | 391 | ||
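cifs_call_async() now takes a mid_receive_t alongside the completion callback, so a caller can supply its own routine for pulling a large response off the socket (for example, reading file data straight into the caller's pages) before the generic completion runs. The callback type is assumed to have this shape:

/* Assumed form of the receive hook; invoked from the demultiplex
 * thread once the header for this mid has been read. */
typedef int (mid_receive_t)(struct TCP_Server_Info *server,
			    struct mid_q_entry *mid);

A caller with no special receive path (the periodic echo, say) would simply pass NULL:

rc = cifs_call_async(server, iov, 1, NULL, cifs_echo_callback,
		     server, true);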
@@ -502,13 +498,18 @@ int | |||
502 | cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, | 498 | cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, |
503 | bool log_error) | 499 | bool log_error) |
504 | { | 500 | { |
505 | dump_smb(mid->resp_buf, | 501 | unsigned int len = be32_to_cpu(mid->resp_buf->smb_buf_length) + 4; |
506 | min_t(u32, 92, be32_to_cpu(mid->resp_buf->smb_buf_length))); | 502 | |
503 | dump_smb(mid->resp_buf, min_t(u32, 92, len)); | ||
507 | 504 | ||
508 | /* convert the length into a more usable form */ | 505 | /* convert the length into a more usable form */ |
509 | if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { | 506 | if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { |
507 | struct kvec iov; | ||
508 | |||
509 | iov.iov_base = mid->resp_buf; | ||
510 | iov.iov_len = len; | ||
510 | /* FIXME: add code to kill session */ | 511 | /* FIXME: add code to kill session */ |
511 | if (cifs_verify_signature(mid->resp_buf, server, | 512 | if (cifs_verify_signature(&iov, 1, server, |
512 | mid->sequence_number + 1) != 0) | 513 | mid->sequence_number + 1) != 0) |
513 | cERROR(1, "Unexpected SMB signature"); | 514 | cERROR(1, "Unexpected SMB signature"); |
514 | } | 515 | } |
@@ -575,14 +576,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, | |||
575 | } | 576 | } |
576 | 577 | ||
577 | midQ->midState = MID_REQUEST_SUBMITTED; | 578 | midQ->midState = MID_REQUEST_SUBMITTED; |
578 | #ifdef CONFIG_CIFS_STATS2 | 579 | cifs_in_send_inc(ses->server); |
579 | atomic_inc(&ses->server->inSend); | ||
580 | #endif | ||
581 | rc = smb_sendv(ses->server, iov, n_vec); | 580 | rc = smb_sendv(ses->server, iov, n_vec); |
582 | #ifdef CONFIG_CIFS_STATS2 | 581 | cifs_in_send_dec(ses->server); |
583 | atomic_dec(&ses->server->inSend); | 582 | cifs_save_when_sent(midQ); |
584 | midQ->when_sent = jiffies; | ||
585 | #endif | ||
586 | 583 | ||
587 | mutex_unlock(&ses->server->srv_mutex); | 584 | mutex_unlock(&ses->server->srv_mutex); |
588 | 585 | ||
@@ -703,14 +700,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, | |||
703 | } | 700 | } |
704 | 701 | ||
705 | midQ->midState = MID_REQUEST_SUBMITTED; | 702 | midQ->midState = MID_REQUEST_SUBMITTED; |
706 | #ifdef CONFIG_CIFS_STATS2 | 703 | |
707 | atomic_inc(&ses->server->inSend); | 704 | cifs_in_send_inc(ses->server); |
708 | #endif | ||
709 | rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); | 705 | rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); |
710 | #ifdef CONFIG_CIFS_STATS2 | 706 | cifs_in_send_dec(ses->server); |
711 | atomic_dec(&ses->server->inSend); | 707 | cifs_save_when_sent(midQ); |
712 | midQ->when_sent = jiffies; | ||
713 | #endif | ||
714 | mutex_unlock(&ses->server->srv_mutex); | 708 | mutex_unlock(&ses->server->srv_mutex); |
715 | 709 | ||
716 | if (rc < 0) | 710 | if (rc < 0) |
@@ -843,14 +837,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, | |||
843 | } | 837 | } |
844 | 838 | ||
845 | midQ->midState = MID_REQUEST_SUBMITTED; | 839 | midQ->midState = MID_REQUEST_SUBMITTED; |
846 | #ifdef CONFIG_CIFS_STATS2 | 840 | cifs_in_send_inc(ses->server); |
847 | atomic_inc(&ses->server->inSend); | ||
848 | #endif | ||
849 | rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); | 841 | rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); |
850 | #ifdef CONFIG_CIFS_STATS2 | 842 | cifs_in_send_dec(ses->server); |
851 | atomic_dec(&ses->server->inSend); | 843 | cifs_save_when_sent(midQ); |
852 | midQ->when_sent = jiffies; | ||
853 | #endif | ||
854 | mutex_unlock(&ses->server->srv_mutex); | 844 | mutex_unlock(&ses->server->srv_mutex); |
855 | 845 | ||
856 | if (rc < 0) { | 846 | if (rc < 0) { |
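The same wrapper pattern tidies the send-side accounting repeated in SendReceive2(), SendReceive() and SendReceiveBlockingLock(): bump an in-flight-send counter around smb_sendv()/smb_send() and stamp the mid with the send time. Plausible definitions, in the spirit of the cifs_num_waiters_*() helpers sketched earlier:

#ifdef CONFIG_CIFS_STATS2
static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
{
	atomic_inc(&server->inSend);
}

static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
{
	atomic_dec(&server->inSend);
}

static inline void cifs_save_when_sent(struct mid_q_entry *mid)
{
	mid->when_sent = jiffies;
}
#else
static inline void cifs_in_send_inc(struct TCP_Server_Info *server) {}
static inline void cifs_in_send_dec(struct TCP_Server_Info *server) {}
static inline void cifs_save_when_sent(struct mid_q_entry *mid) {}
#endif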
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 2a22fb2989e4..45f07c46f3ed 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
23 | #include <linux/posix_acl_xattr.h> | 23 | #include <linux/posix_acl_xattr.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/xattr.h> | ||
25 | #include "cifsfs.h" | 26 | #include "cifsfs.h" |
26 | #include "cifspdu.h" | 27 | #include "cifspdu.h" |
27 | #include "cifsglob.h" | 28 | #include "cifsglob.h" |
@@ -31,16 +32,8 @@ | |||
31 | #define MAX_EA_VALUE_SIZE 65535 | 32 | #define MAX_EA_VALUE_SIZE 65535 |
32 | #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" | 33 | #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" |
33 | #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" | 34 | #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" |
34 | #define CIFS_XATTR_USER_PREFIX "user." | ||
35 | #define CIFS_XATTR_SYSTEM_PREFIX "system." | ||
36 | #define CIFS_XATTR_OS2_PREFIX "os2." | ||
37 | #define CIFS_XATTR_SECURITY_PREFIX "security." | ||
38 | #define CIFS_XATTR_TRUSTED_PREFIX "trusted." | ||
39 | #define XATTR_TRUSTED_PREFIX_LEN 8 | ||
40 | #define XATTR_SECURITY_PREFIX_LEN 9 | ||
41 | /* BB need to add server (Samba e.g) support for security and trusted prefix */ | ||
42 | |||
43 | 35 | ||
36 | /* BB need to add server (Samba e.g) support for security and trusted prefix */ | ||
44 | 37 | ||
45 | int cifs_removexattr(struct dentry *direntry, const char *ea_name) | 38 | int cifs_removexattr(struct dentry *direntry, const char *ea_name) |
46 | { | 39 | { |
@@ -76,8 +69,8 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) | |||
76 | } | 69 | } |
77 | if (ea_name == NULL) { | 70 | if (ea_name == NULL) { |
78 | cFYI(1, "Null xattr names not supported"); | 71 | cFYI(1, "Null xattr names not supported"); |
79 | } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) | 72 | } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) |
80 | && (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) { | 73 | && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { |
81 | cFYI(1, | 74 | cFYI(1, |
82 | "illegal xattr request %s (only user namespace supported)", | 75 | "illegal xattr request %s (only user namespace supported)", |
83 | ea_name); | 76 | ea_name); |
@@ -88,7 +81,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) | |||
88 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) | 81 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) |
89 | goto remove_ea_exit; | 82 | goto remove_ea_exit; |
90 | 83 | ||
91 | ea_name += 5; /* skip past user. prefix */ | 84 | ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ |
92 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, | 85 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, |
93 | (__u16)0, cifs_sb->local_nls, | 86 | (__u16)0, cifs_sb->local_nls, |
94 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 87 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
@@ -149,21 +142,23 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, | |||
149 | 142 | ||
150 | if (ea_name == NULL) { | 143 | if (ea_name == NULL) { |
151 | cFYI(1, "Null xattr names not supported"); | 144 | cFYI(1, "Null xattr names not supported"); |
152 | } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { | 145 | } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) |
146 | == 0) { | ||
153 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) | 147 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) |
154 | goto set_ea_exit; | 148 | goto set_ea_exit; |
155 | if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) | 149 | if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) |
156 | cFYI(1, "attempt to set cifs inode metadata"); | 150 | cFYI(1, "attempt to set cifs inode metadata"); |
157 | 151 | ||
158 | ea_name += 5; /* skip past user. prefix */ | 152 | ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ |
159 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, | 153 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, |
160 | (__u16)value_size, cifs_sb->local_nls, | 154 | (__u16)value_size, cifs_sb->local_nls, |
161 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 155 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
162 | } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { | 156 | } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) |
157 | == 0) { | ||
163 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) | 158 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) |
164 | goto set_ea_exit; | 159 | goto set_ea_exit; |
165 | 160 | ||
166 | ea_name += 4; /* skip past os2. prefix */ | 161 | ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ |
167 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, | 162 | rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, |
168 | (__u16)value_size, cifs_sb->local_nls, | 163 | (__u16)value_size, cifs_sb->local_nls, |
169 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 164 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
@@ -178,7 +173,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, | |||
178 | #ifdef CONFIG_CIFS_ACL | 173 | #ifdef CONFIG_CIFS_ACL |
179 | memcpy(pacl, ea_value, value_size); | 174 | memcpy(pacl, ea_value, value_size); |
180 | rc = set_cifs_acl(pacl, value_size, | 175 | rc = set_cifs_acl(pacl, value_size, |
181 | direntry->d_inode, full_path); | 176 | direntry->d_inode, full_path, CIFS_ACL_DACL); |
182 | if (rc == 0) /* force revalidate of the inode */ | 177 | if (rc == 0) /* force revalidate of the inode */ |
183 | CIFS_I(direntry->d_inode)->time = 0; | 178 | CIFS_I(direntry->d_inode)->time = 0; |
184 | kfree(pacl); | 179 | kfree(pacl); |
@@ -269,7 +264,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, | |||
269 | /* return alt name if available as pseudo attr */ | 264 | /* return alt name if available as pseudo attr */ |
270 | if (ea_name == NULL) { | 265 | if (ea_name == NULL) { |
271 | cFYI(1, "Null xattr names not supported"); | 266 | cFYI(1, "Null xattr names not supported"); |
272 | } else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) { | 267 | } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) |
268 | == 0) { | ||
273 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) | 269 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) |
274 | goto get_ea_exit; | 270 | goto get_ea_exit; |
275 | 271 | ||
@@ -277,15 +273,15 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, | |||
277 | cFYI(1, "attempt to query cifs inode metadata"); | 273 | cFYI(1, "attempt to query cifs inode metadata"); |
278 | /* revalidate/getattr then populate from inode */ | 274 | /* revalidate/getattr then populate from inode */ |
279 | } /* BB add else when above is implemented */ | 275 | } /* BB add else when above is implemented */ |
280 | ea_name += 5; /* skip past user. prefix */ | 276 | ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ |
281 | rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, | 277 | rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, |
282 | buf_size, cifs_sb->local_nls, | 278 | buf_size, cifs_sb->local_nls, |
283 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 279 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
284 | } else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) { | 280 | } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { |
285 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) | 281 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) |
286 | goto get_ea_exit; | 282 | goto get_ea_exit; |
287 | 283 | ||
288 | ea_name += 4; /* skip past os2. prefix */ | 284 | ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ |
289 | rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, | 285 | rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, |
290 | buf_size, cifs_sb->local_nls, | 286 | buf_size, cifs_sb->local_nls, |
291 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); | 287 | cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); |
@@ -339,10 +335,10 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, | |||
339 | cFYI(1, "Query CIFS ACL not supported yet"); | 335 | cFYI(1, "Query CIFS ACL not supported yet"); |
340 | #endif /* CONFIG_CIFS_ACL */ | 336 | #endif /* CONFIG_CIFS_ACL */ |
341 | } else if (strncmp(ea_name, | 337 | } else if (strncmp(ea_name, |
342 | CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { | 338 | XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { |
343 | cFYI(1, "Trusted xattr namespace not supported yet"); | 339 | cFYI(1, "Trusted xattr namespace not supported yet"); |
344 | } else if (strncmp(ea_name, | 340 | } else if (strncmp(ea_name, |
345 | CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { | 341 | XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { |
346 | cFYI(1, "Security xattr namespace not supported yet"); | 342 | cFYI(1, "Security xattr namespace not supported yet"); |
347 | } else | 343 | } else |
348 | cFYI(1, | 344 | cFYI(1, |
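The xattr.c changes drop the private CIFS_XATTR_*_PREFIX copies, and their hand-counted lengths like the literal 5 for "user.", in favour of the shared XATTR_* constants from linux/xattr.h, which keeps each prefix string and its length defined in one place. The resulting dispatch pattern, condensed into a sketch (error handling and the actual CIFS calls elided):

#include <linux/xattr.h>	/* XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN, ... */

static int ea_dispatch(const char *ea_name)
{
	if (!strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
		ea_name += XATTR_USER_PREFIX_LEN;  /* skip past "user." */
		/* ... CIFSSMBSetEA/CIFSSMBQAllEAs with the bare name ... */
	} else if (!strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
		ea_name += XATTR_OS2_PREFIX_LEN;   /* skip past "os2." */
		/* ... */
	}
	return 0;
}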
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index 44e17e9c21ae..cc0ea9fe5ecf 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -59,12 +59,11 @@ void coda_sysctl_clean(void); | |||
59 | 59 | ||
60 | #define CODA_ALLOC(ptr, cast, size) do { \ | 60 | #define CODA_ALLOC(ptr, cast, size) do { \ |
61 | if (size < PAGE_SIZE) \ | 61 | if (size < PAGE_SIZE) \ |
62 | ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ | 62 | ptr = kzalloc((unsigned long) size, GFP_KERNEL); \ |
63 | else \ | 63 | else \ |
64 | ptr = (cast)vmalloc((unsigned long) size); \ | 64 | ptr = (cast)vzalloc((unsigned long) size); \ |
65 | if (!ptr) \ | 65 | if (!ptr) \ |
66 | printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ | 66 | printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ |
67 | else memset( ptr, 0, size ); \ | ||
68 | } while (0) | 67 | } while (0) |
69 | 68 | ||
70 | 69 | ||
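The CODA_ALLOC cleanup swaps kmalloc()/vmalloc() plus a trailing memset() for kzalloc()/vzalloc(), which hand back pre-zeroed memory and so eliminate both the memset() and the else branch guarding it. The same idiom written out as an ordinary allocation (size and GFP flags illustrative):

void *buf;

if (size < PAGE_SIZE)
	buf = kzalloc(size, GFP_KERNEL);	/* zeroed kmalloc */
else
	buf = vzalloc(size);			/* zeroed vmalloc */
if (!buf)
	return -ENOMEM;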
diff --git a/fs/compat.c b/fs/compat.c
index 0b48d018e38a..302e761bd0aa 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,6 @@ | |||
37 | #include <linux/dirent.h> | 37 | #include <linux/dirent.h> |
38 | #include <linux/fsnotify.h> | 38 | #include <linux/fsnotify.h> |
39 | #include <linux/highuid.h> | 39 | #include <linux/highuid.h> |
40 | #include <linux/nfsd/syscall.h> | ||
41 | #include <linux/personality.h> | 40 | #include <linux/personality.h> |
42 | #include <linux/rwsem.h> | 41 | #include <linux/rwsem.h> |
43 | #include <linux/tsacct_kern.h> | 42 | #include <linux/tsacct_kern.h> |
@@ -247,11 +246,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * | |||
247 | __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || | 246 | __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || |
248 | __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || | 247 | __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || |
249 | __put_user(kbuf->f_frsize, &ubuf->f_frsize) || | 248 | __put_user(kbuf->f_frsize, &ubuf->f_frsize) || |
250 | __put_user(0, &ubuf->f_spare[0]) || | 249 | __put_user(kbuf->f_flags, &ubuf->f_flags) || |
251 | __put_user(0, &ubuf->f_spare[1]) || | 250 | __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) |
252 | __put_user(0, &ubuf->f_spare[2]) || | ||
253 | __put_user(0, &ubuf->f_spare[3]) || | ||
254 | __put_user(0, &ubuf->f_spare[4])) | ||
255 | return -EFAULT; | 251 | return -EFAULT; |
256 | return 0; | 252 | return 0; |
257 | } | 253 | } |
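put_compat_statfs() now copies the real f_flags out to userspace (the field previously sat among the zeroed spares) and clears the remaining f_spare array with a single __clear_user() instead of five individual __put_user(0, ...) calls. The idiom in isolation:

/* Zero a user-space array member in one call; __clear_user() returns
 * the number of bytes NOT cleared, so nonzero means -EFAULT. */
if (__clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
	return -EFAULT;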
@@ -1675,11 +1671,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, | |||
1675 | } | 1671 | } |
1676 | #endif /* HAVE_SET_RESTORE_SIGMASK */ | 1672 | #endif /* HAVE_SET_RESTORE_SIGMASK */ |
1677 | 1673 | ||
1678 | long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) | ||
1679 | { | ||
1680 | return sys_ni_syscall(); | ||
1681 | } | ||
1682 | |||
1683 | #ifdef CONFIG_EPOLL | 1674 | #ifdef CONFIG_EPOLL |
1684 | 1675 | ||
1685 | #ifdef HAVE_SET_RESTORE_SIGMASK | 1676 | #ifdef HAVE_SET_RESTORE_SIGMASK |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8be086e9abe4..51352de88ef1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT) | |||
1003 | COMPATIBLE_IOCTL(PPPIOCDISCONN) | 1003 | COMPATIBLE_IOCTL(PPPIOCDISCONN) |
1004 | COMPATIBLE_IOCTL(PPPIOCATTCHAN) | 1004 | COMPATIBLE_IOCTL(PPPIOCATTCHAN) |
1005 | COMPATIBLE_IOCTL(PPPIOCGCHAN) | 1005 | COMPATIBLE_IOCTL(PPPIOCGCHAN) |
1006 | COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) | ||
1006 | /* PPPOX */ | 1007 | /* PPPOX */ |
1007 | COMPATIBLE_IOCTL(PPPOEIOCSFWD) | 1008 | COMPATIBLE_IOCTL(PPPOEIOCSFWD) |
1008 | COMPATIBLE_IOCTL(PPPOEIOCDFWD) | 1009 | COMPATIBLE_IOCTL(PPPOEIOCDFWD) |
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c83f4768eeaa..ca418aaf6352 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -23,7 +23,8 @@ | |||
23 | * | 23 | * |
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | 24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. |
25 | * | 25 | * |
26 | * Please see Documentation/filesystems/configfs.txt for more information. | 26 | * Please see Documentation/filesystems/configfs/configfs.txt for more |
27 | * information. | ||
27 | */ | 28 | */ |
28 | 29 | ||
29 | #undef DEBUG | 30 | #undef DEBUG |
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 76dc4c3e5d51..50cee7f9110b 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -23,7 +23,7 @@ | |||
23 | * | 23 | * |
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | 24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. |
25 | * | 25 | * |
26 | * Please see the file Documentation/filesystems/configfs.txt for | 26 | * Please see the file Documentation/filesystems/configfs/configfs.txt for |
27 | * critical information about using the config_item interface. | 27 | * critical information about using the config_item interface. |
28 | */ | 28 | */ |
29 | 29 | ||
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e7a7a2f07324..f3a257d7a985 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * file.c - part of debugfs, a tiny little debug file system | 2 | * inode.c - part of debugfs, a tiny little debug file system |
3 | * | 3 | * |
4 | * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> | 4 | * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> |
5 | * Copyright (C) 2004 IBM Inc. | 5 | * Copyright (C) 2004 IBM Inc. |
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360ca8046..d740ab67ff6e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@ | |||
39 | 39 | ||
40 | /* | 40 | /* |
41 | * How many user pages to map in one call to get_user_pages(). This determines | 41 | * How many user pages to map in one call to get_user_pages(). This determines |
42 | * the size of a structure on the stack. | 42 | * the size of a structure in the slab cache |
43 | */ | 43 | */ |
44 | #define DIO_PAGES 64 | 44 | #define DIO_PAGES 64 |
45 | 45 | ||
@@ -55,13 +55,10 @@ | |||
55 | * blocksize. | 55 | * blocksize. |
56 | */ | 56 | */ |
57 | 57 | ||
58 | struct dio { | 58 | /* dio_state only used in the submission path */ |
59 | /* BIO submission state */ | 59 | |
60 | struct dio_submit { | ||
60 | struct bio *bio; /* bio under assembly */ | 61 | struct bio *bio; /* bio under assembly */ |
61 | struct inode *inode; | ||
62 | int rw; | ||
63 | loff_t i_size; /* i_size when submitted */ | ||
64 | int flags; /* doesn't change */ | ||
65 | unsigned blkbits; /* doesn't change */ | 62 | unsigned blkbits; /* doesn't change */ |
66 | unsigned blkfactor; /* When we're using an alignment which | 63 | unsigned blkfactor; /* When we're using an alignment which |
67 | is finer than the filesystem's soft | 64 | is finer than the filesystem's soft |
@@ -76,18 +73,17 @@ struct dio { | |||
76 | sector_t block_in_file; /* Current offset into the underlying | 73 | sector_t block_in_file; /* Current offset into the underlying |
77 | file in dio_block units. */ | 74 | file in dio_block units. */ |
78 | unsigned blocks_available; /* At block_in_file. changes */ | 75 | unsigned blocks_available; /* At block_in_file. changes */ |
76 | int reap_counter; /* rate limit reaping */ | ||
79 | sector_t final_block_in_request;/* doesn't change */ | 77 | sector_t final_block_in_request;/* doesn't change */ |
80 | unsigned first_block_in_page; /* doesn't change, Used only once */ | 78 | unsigned first_block_in_page; /* doesn't change, Used only once */ |
81 | int boundary; /* prev block is at a boundary */ | 79 | int boundary; /* prev block is at a boundary */ |
82 | int reap_counter; /* rate limit reaping */ | ||
83 | get_block_t *get_block; /* block mapping function */ | 80 | get_block_t *get_block; /* block mapping function */ |
84 | dio_iodone_t *end_io; /* IO completion function */ | ||
85 | dio_submit_t *submit_io; /* IO submition function */ | 81 | dio_submit_t *submit_io; /* IO submition function */ |
82 | |||
86 | loff_t logical_offset_in_bio; /* current first logical block in bio */ | 83 | loff_t logical_offset_in_bio; /* current first logical block in bio */ |
87 | sector_t final_block_in_bio; /* current final block in bio + 1 */ | 84 | sector_t final_block_in_bio; /* current final block in bio + 1 */ |
88 | sector_t next_block_for_io; /* next block to be put under IO, | 85 | sector_t next_block_for_io; /* next block to be put under IO, |
89 | in dio_blocks units */ | 86 | in dio_blocks units */ |
90 | struct buffer_head map_bh; /* last get_block() result */ | ||
91 | 87 | ||
92 | /* | 88 | /* |
93 | * Deferred addition of a page to the dio. These variables are | 89 | * Deferred addition of a page to the dio. These variables are |
@@ -100,18 +96,6 @@ struct dio { | |||
100 | sector_t cur_page_block; /* Where it starts */ | 96 | sector_t cur_page_block; /* Where it starts */ |
101 | loff_t cur_page_fs_offset; /* Offset in file */ | 97 | loff_t cur_page_fs_offset; /* Offset in file */ |
102 | 98 | ||
103 | /* BIO completion state */ | ||
104 | spinlock_t bio_lock; /* protects BIO fields below */ | ||
105 | unsigned long refcount; /* direct_io_worker() and bios */ | ||
106 | struct bio *bio_list; /* singly linked via bi_private */ | ||
107 | struct task_struct *waiter; /* waiting task (NULL if none) */ | ||
108 | |||
109 | /* AIO related stuff */ | ||
110 | struct kiocb *iocb; /* kiocb */ | ||
111 | int is_async; /* is IO async ? */ | ||
112 | int io_error; /* IO error in completion path */ | ||
113 | ssize_t result; /* IO result */ | ||
114 | |||
115 | /* | 99 | /* |
116 | * Page fetching state. These variables belong to dio_refill_pages(). | 100 | * Page fetching state. These variables belong to dio_refill_pages(). |
117 | */ | 101 | */ |
@@ -125,7 +109,30 @@ struct dio { | |||
125 | */ | 109 | */ |
126 | unsigned head; /* next page to process */ | 110 | unsigned head; /* next page to process */ |
127 | unsigned tail; /* last valid page + 1 */ | 111 | unsigned tail; /* last valid page + 1 */ |
112 | }; | ||
113 | |||
114 | /* dio_state communicated between submission path and end_io */ | ||
115 | struct dio { | ||
116 | int flags; /* doesn't change */ | ||
117 | int rw; | ||
118 | struct inode *inode; | ||
119 | loff_t i_size; /* i_size when submitted */ | ||
120 | dio_iodone_t *end_io; /* IO completion function */ | ||
121 | |||
122 | void *private; /* copy from map_bh.b_private */ | ||
123 | |||
124 | /* BIO completion state */ | ||
125 | spinlock_t bio_lock; /* protects BIO fields below */ | ||
128 | int page_errors; /* errno from get_user_pages() */ | 126 | int page_errors; /* errno from get_user_pages() */ |
127 | int is_async; /* is IO async ? */ | ||
128 | int io_error; /* IO error in completion path */ | ||
129 | unsigned long refcount; /* direct_io_worker() and bios */ | ||
130 | struct bio *bio_list; /* singly linked via bi_private */ | ||
131 | struct task_struct *waiter; /* waiting task (NULL if none) */ | ||
132 | |||
133 | /* AIO related stuff */ | ||
134 | struct kiocb *iocb; /* kiocb */ | ||
135 | ssize_t result; /* IO result */ | ||
129 | 136 | ||
130 | /* | 137 | /* |
131 | * pages[] (and any fields placed after it) are not zeroed out at | 138 | * pages[] (and any fields placed after it) are not zeroed out at |
@@ -133,7 +140,9 @@ struct dio { | |||
133 | * wish that they not be zeroed. | 140 | * wish that they not be zeroed. |
134 | */ | 141 | */ |
135 | struct page *pages[DIO_PAGES]; /* page buffer */ | 142 | struct page *pages[DIO_PAGES]; /* page buffer */ |
136 | }; | 143 | } ____cacheline_aligned_in_smp; |
144 | |||
145 | static struct kmem_cache *dio_cache __read_mostly; | ||
137 | 146 | ||
138 | static void __inode_dio_wait(struct inode *inode) | 147 | static void __inode_dio_wait(struct inode *inode) |
139 | { | 148 | { |
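The hunks above split the old monolithic struct dio in two: struct dio_submit holds state only the submission path touches and can live on the stack, while the surviving struct dio carries what completion needs, is ____cacheline_aligned_in_smp so the completion-side fields stay off the submitter's hot cache lines, and is now allocated from a dedicated slab rather than kmalloc(). The dio_cache declared above would be registered once at init, presumably along these lines:

/* Assumed init path for the new slab cache; KMEM_CACHE() derives the
 * cache name, size and alignment from the struct definition itself. */
static int __init dio_init(void)
{
	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
	return 0;
}
module_init(dio_init);

Allocation sites would then pair kmem_cache_alloc(dio_cache, GFP_KERNEL) with the kmem_cache_free() visible in dio_bio_end_aio() below.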
@@ -182,27 +191,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done); | |||
182 | /* | 191 | /* |
183 | * How many pages are in the queue? | 192 | * How many pages are in the queue? |
184 | */ | 193 | */ |
185 | static inline unsigned dio_pages_present(struct dio *dio) | 194 | static inline unsigned dio_pages_present(struct dio_submit *sdio) |
186 | { | 195 | { |
187 | return dio->tail - dio->head; | 196 | return sdio->tail - sdio->head; |
188 | } | 197 | } |
189 | 198 | ||
190 | /* | 199 | /* |
191 | * Go grab and pin some userspace pages. Typically we'll get 64 at a time. | 200 | * Go grab and pin some userspace pages. Typically we'll get 64 at a time. |
192 | */ | 201 | */ |
193 | static int dio_refill_pages(struct dio *dio) | 202 | static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) |
194 | { | 203 | { |
195 | int ret; | 204 | int ret; |
196 | int nr_pages; | 205 | int nr_pages; |
197 | 206 | ||
198 | nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); | 207 | nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); |
199 | ret = get_user_pages_fast( | 208 | ret = get_user_pages_fast( |
200 | dio->curr_user_address, /* Where from? */ | 209 | sdio->curr_user_address, /* Where from? */ |
201 | nr_pages, /* How many pages? */ | 210 | nr_pages, /* How many pages? */ |
202 | dio->rw == READ, /* Write to memory? */ | 211 | dio->rw == READ, /* Write to memory? */ |
203 | &dio->pages[0]); /* Put results here */ | 212 | &dio->pages[0]); /* Put results here */ |
204 | 213 | ||
205 | if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { | 214 | if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { |
206 | struct page *page = ZERO_PAGE(0); | 215 | struct page *page = ZERO_PAGE(0); |
207 | /* | 216 | /* |
208 | * A memory fault, but the filesystem has some outstanding | 217 | * A memory fault, but the filesystem has some outstanding |
@@ -213,17 +222,17 @@ static int dio_refill_pages(struct dio *dio) | |||
213 | dio->page_errors = ret; | 222 | dio->page_errors = ret; |
214 | page_cache_get(page); | 223 | page_cache_get(page); |
215 | dio->pages[0] = page; | 224 | dio->pages[0] = page; |
216 | dio->head = 0; | 225 | sdio->head = 0; |
217 | dio->tail = 1; | 226 | sdio->tail = 1; |
218 | ret = 0; | 227 | ret = 0; |
219 | goto out; | 228 | goto out; |
220 | } | 229 | } |
221 | 230 | ||
222 | if (ret >= 0) { | 231 | if (ret >= 0) { |
223 | dio->curr_user_address += ret * PAGE_SIZE; | 232 | sdio->curr_user_address += ret * PAGE_SIZE; |
224 | dio->curr_page += ret; | 233 | sdio->curr_page += ret; |
225 | dio->head = 0; | 234 | sdio->head = 0; |
226 | dio->tail = ret; | 235 | sdio->tail = ret; |
227 | ret = 0; | 236 | ret = 0; |
228 | } | 237 | } |
229 | out: | 238 | out: |
@@ -236,17 +245,18 @@ out: | |||
236 | * decent number of pages, less frequently. To provide nicer use of the | 245 | * decent number of pages, less frequently. To provide nicer use of the |
237 | * L1 cache. | 246 | * L1 cache. |
238 | */ | 247 | */ |
239 | static struct page *dio_get_page(struct dio *dio) | 248 | static inline struct page *dio_get_page(struct dio *dio, |
249 | struct dio_submit *sdio) | ||
240 | { | 250 | { |
241 | if (dio_pages_present(dio) == 0) { | 251 | if (dio_pages_present(sdio) == 0) { |
242 | int ret; | 252 | int ret; |
243 | 253 | ||
244 | ret = dio_refill_pages(dio); | 254 | ret = dio_refill_pages(dio, sdio); |
245 | if (ret) | 255 | if (ret) |
246 | return ERR_PTR(ret); | 256 | return ERR_PTR(ret); |
247 | BUG_ON(dio_pages_present(dio) == 0); | 257 | BUG_ON(dio_pages_present(sdio) == 0); |
248 | } | 258 | } |
249 | return dio->pages[dio->head++]; | 259 | return dio->pages[sdio->head++]; |
250 | } | 260 | } |
251 | 261 | ||
252 | /** | 262 | /** |
@@ -292,7 +302,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is | |||
292 | 302 | ||
293 | if (dio->end_io && dio->result) { | 303 | if (dio->end_io && dio->result) { |
294 | dio->end_io(dio->iocb, offset, transferred, | 304 | dio->end_io(dio->iocb, offset, transferred, |
295 | dio->map_bh.b_private, ret, is_async); | 305 | dio->private, ret, is_async); |
296 | } else { | 306 | } else { |
297 | if (is_async) | 307 | if (is_async) |
298 | aio_complete(dio->iocb, ret, 0); | 308 | aio_complete(dio->iocb, ret, 0); |
@@ -323,7 +333,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) | |||
323 | 333 | ||
324 | if (remaining == 0) { | 334 | if (remaining == 0) { |
325 | dio_complete(dio, dio->iocb->ki_pos, 0, true); | 335 | dio_complete(dio, dio->iocb->ki_pos, 0, true); |
326 | kfree(dio); | 336 | kmem_cache_free(dio_cache, dio); |
327 | } | 337 | } |
328 | } | 338 | } |
329 | 339 | ||
@@ -367,9 +377,10 @@ void dio_end_io(struct bio *bio, int error) | |||
367 | } | 377 | } |
368 | EXPORT_SYMBOL_GPL(dio_end_io); | 378 | EXPORT_SYMBOL_GPL(dio_end_io); |
369 | 379 | ||
370 | static void | 380 | static inline void |
371 | dio_bio_alloc(struct dio *dio, struct block_device *bdev, | 381 | dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, |
372 | sector_t first_sector, int nr_vecs) | 382 | struct block_device *bdev, |
383 | sector_t first_sector, int nr_vecs) | ||
373 | { | 384 | { |
374 | struct bio *bio; | 385 | struct bio *bio; |
375 | 386 | ||
@@ -386,8 +397,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, | |||
386 | else | 397 | else |
387 | bio->bi_end_io = dio_bio_end_io; | 398 | bio->bi_end_io = dio_bio_end_io; |
388 | 399 | ||
389 | dio->bio = bio; | 400 | sdio->bio = bio; |
390 | dio->logical_offset_in_bio = dio->cur_page_fs_offset; | 401 | sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; |
391 | } | 402 | } |
392 | 403 | ||
393 | /* | 404 | /* |
@@ -397,9 +408,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, | |||
397 | * | 408 | * |
398 | * bios hold a dio reference between submit_bio and ->end_io. | 409 | * bios hold a dio reference between submit_bio and ->end_io. |
399 | */ | 410 | */ |
400 | static void dio_bio_submit(struct dio *dio) | 411 | static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) |
401 | { | 412 | { |
402 | struct bio *bio = dio->bio; | 413 | struct bio *bio = sdio->bio; |
403 | unsigned long flags; | 414 | unsigned long flags; |
404 | 415 | ||
405 | bio->bi_private = dio; | 416 | bio->bi_private = dio; |
@@ -411,24 +422,24 @@ static void dio_bio_submit(struct dio *dio) | |||
411 | if (dio->is_async && dio->rw == READ) | 422 | if (dio->is_async && dio->rw == READ) |
412 | bio_set_pages_dirty(bio); | 423 | bio_set_pages_dirty(bio); |
413 | 424 | ||
414 | if (dio->submit_io) | 425 | if (sdio->submit_io) |
415 | dio->submit_io(dio->rw, bio, dio->inode, | 426 | sdio->submit_io(dio->rw, bio, dio->inode, |
416 | dio->logical_offset_in_bio); | 427 | sdio->logical_offset_in_bio); |
417 | else | 428 | else |
418 | submit_bio(dio->rw, bio); | 429 | submit_bio(dio->rw, bio); |
419 | 430 | ||
420 | dio->bio = NULL; | 431 | sdio->bio = NULL; |
421 | dio->boundary = 0; | 432 | sdio->boundary = 0; |
422 | dio->logical_offset_in_bio = 0; | 433 | sdio->logical_offset_in_bio = 0; |
423 | } | 434 | } |
424 | 435 | ||
425 | /* | 436 | /* |
426 | * Release any resources in case of a failure | 437 | * Release any resources in case of a failure |
427 | */ | 438 | */ |
428 | static void dio_cleanup(struct dio *dio) | 439 | static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) |
429 | { | 440 | { |
430 | while (dio_pages_present(dio)) | 441 | while (dio_pages_present(sdio)) |
431 | page_cache_release(dio_get_page(dio)); | 442 | page_cache_release(dio_get_page(dio, sdio)); |
432 | } | 443 | } |
433 | 444 | ||
434 | /* | 445 | /* |
@@ -518,11 +529,11 @@ static void dio_await_completion(struct dio *dio) | |||
518 | * | 529 | * |
519 | * This also helps to limit the peak amount of pinned userspace memory. | 530 | * This also helps to limit the peak amount of pinned userspace memory. |
520 | */ | 531 | */ |
521 | static int dio_bio_reap(struct dio *dio) | 532 | static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) |
522 | { | 533 | { |
523 | int ret = 0; | 534 | int ret = 0; |
524 | 535 | ||
525 | if (dio->reap_counter++ >= 64) { | 536 | if (sdio->reap_counter++ >= 64) { |
526 | while (dio->bio_list) { | 537 | while (dio->bio_list) { |
527 | unsigned long flags; | 538 | unsigned long flags; |
528 | struct bio *bio; | 539 | struct bio *bio; |
@@ -536,14 +547,14 @@ static int dio_bio_reap(struct dio *dio) | |||
536 | if (ret == 0) | 547 | if (ret == 0) |
537 | ret = ret2; | 548 | ret = ret2; |
538 | } | 549 | } |
539 | dio->reap_counter = 0; | 550 | sdio->reap_counter = 0; |
540 | } | 551 | } |
541 | return ret; | 552 | return ret; |
542 | } | 553 | } |
543 | 554 | ||
544 | /* | 555 | /* |
545 | * Call into the fs to map some more disk blocks. We record the current number | 556 | * Call into the fs to map some more disk blocks. We record the current number |
546 | * of available blocks at dio->blocks_available. These are in units of the | 557 | * of available blocks at sdio->blocks_available. These are in units of the |
547 | * fs blocksize, (1 << inode->i_blkbits). | 558 | * fs blocksize, (1 << inode->i_blkbits). |
548 | * | 559 | * |
549 | * The fs is allowed to map lots of blocks at once. If it wants to do that, | 560 | * The fs is allowed to map lots of blocks at once. If it wants to do that, |
@@ -564,10 +575,10 @@ static int dio_bio_reap(struct dio *dio) | |||
564 | * buffer_mapped(). However the direct-io code will only process holes one | 575 | * buffer_mapped(). However the direct-io code will only process holes one |
565 | * block at a time - it will repeatedly call get_block() as it walks the hole. | 576 | * block at a time - it will repeatedly call get_block() as it walks the hole. |
566 | */ | 577 | */ |
567 | static int get_more_blocks(struct dio *dio) | 578 | static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, |
579 | struct buffer_head *map_bh) | ||
568 | { | 580 | { |
569 | int ret; | 581 | int ret; |
570 | struct buffer_head *map_bh = &dio->map_bh; | ||
571 | sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ | 582 | sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ |
572 | unsigned long fs_count; /* Number of filesystem-sized blocks */ | 583 | unsigned long fs_count; /* Number of filesystem-sized blocks */ |
573 | unsigned long dio_count;/* Number of dio_block-sized blocks */ | 584 | unsigned long dio_count;/* Number of dio_block-sized blocks */ |
@@ -580,11 +591,11 @@ static int get_more_blocks(struct dio *dio) | |||
580 | */ | 591 | */ |
581 | ret = dio->page_errors; | 592 | ret = dio->page_errors; |
582 | if (ret == 0) { | 593 | if (ret == 0) { |
583 | BUG_ON(dio->block_in_file >= dio->final_block_in_request); | 594 | BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); |
584 | fs_startblk = dio->block_in_file >> dio->blkfactor; | 595 | fs_startblk = sdio->block_in_file >> sdio->blkfactor; |
585 | dio_count = dio->final_block_in_request - dio->block_in_file; | 596 | dio_count = sdio->final_block_in_request - sdio->block_in_file; |
586 | fs_count = dio_count >> dio->blkfactor; | 597 | fs_count = dio_count >> sdio->blkfactor; |
587 | blkmask = (1 << dio->blkfactor) - 1; | 598 | blkmask = (1 << sdio->blkfactor) - 1; |
588 | if (dio_count & blkmask) | 599 | if (dio_count & blkmask) |
589 | fs_count++; | 600 | fs_count++; |
590 | 601 | ||
@@ -604,13 +615,16 @@ static int get_more_blocks(struct dio *dio) | |||
604 | */ | 615 | */ |
605 | create = dio->rw & WRITE; | 616 | create = dio->rw & WRITE; |
606 | if (dio->flags & DIO_SKIP_HOLES) { | 617 | if (dio->flags & DIO_SKIP_HOLES) { |
607 | if (dio->block_in_file < (i_size_read(dio->inode) >> | 618 | if (sdio->block_in_file < (i_size_read(dio->inode) >> |
608 | dio->blkbits)) | 619 | sdio->blkbits)) |
609 | create = 0; | 620 | create = 0; |
610 | } | 621 | } |
611 | 622 | ||
612 | ret = (*dio->get_block)(dio->inode, fs_startblk, | 623 | ret = (*sdio->get_block)(dio->inode, fs_startblk, |
613 | map_bh, create); | 624 | map_bh, create); |
625 | |||
626 | /* Store for completion */ | ||
627 | dio->private = map_bh->b_private; | ||
614 | } | 628 | } |
615 | return ret; | 629 | return ret; |
616 | } | 630 | } |
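Because the completion side no longer sees the submission-only map_bh, get_more_blocks() snapshots map_bh->b_private into dio->private immediately after the get_block() call, and dio_complete() hands that pointer to the filesystem's end_io hook. For reference, the hook's assumed shape, consistent with the dio_complete() call shown earlier:

/* dio_iodone_t as dio_complete() invokes it: 'private' is the value
 * saved from map_bh->b_private at get_block() time. */
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
			    ssize_t bytes, void *private, int ret,
			    bool is_async);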
@@ -618,20 +632,21 @@ static int get_more_blocks(struct dio *dio) | |||
618 | /* | 632 | /* |
619 | * There is no bio. Make one now. | 633 | * There is no bio. Make one now. |
620 | */ | 634 | */ |
621 | static int dio_new_bio(struct dio *dio, sector_t start_sector) | 635 | static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, |
636 | sector_t start_sector, struct buffer_head *map_bh) | ||
622 | { | 637 | { |
623 | sector_t sector; | 638 | sector_t sector; |
624 | int ret, nr_pages; | 639 | int ret, nr_pages; |
625 | 640 | ||
626 | ret = dio_bio_reap(dio); | 641 | ret = dio_bio_reap(dio, sdio); |
627 | if (ret) | 642 | if (ret) |
628 | goto out; | 643 | goto out; |
629 | sector = start_sector << (dio->blkbits - 9); | 644 | sector = start_sector << (sdio->blkbits - 9); |
630 | nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); | 645 | nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); |
631 | nr_pages = min(nr_pages, BIO_MAX_PAGES); | 646 | nr_pages = min(nr_pages, BIO_MAX_PAGES); |
632 | BUG_ON(nr_pages <= 0); | 647 | BUG_ON(nr_pages <= 0); |
633 | dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); | 648 | dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); |
634 | dio->boundary = 0; | 649 | sdio->boundary = 0; |
635 | out: | 650 | out: |
636 | return ret; | 651 | return ret; |
637 | } | 652 | } |
@@ -643,21 +658,21 @@ out: | |||
643 | * | 658 | * |
644 | * Return zero on success. Non-zero means the caller needs to start a new BIO. | 659 | * Return zero on success. Non-zero means the caller needs to start a new BIO. |
645 | */ | 660 | */ |
646 | static int dio_bio_add_page(struct dio *dio) | 661 | static inline int dio_bio_add_page(struct dio_submit *sdio) |
647 | { | 662 | { |
648 | int ret; | 663 | int ret; |
649 | 664 | ||
650 | ret = bio_add_page(dio->bio, dio->cur_page, | 665 | ret = bio_add_page(sdio->bio, sdio->cur_page, |
651 | dio->cur_page_len, dio->cur_page_offset); | 666 | sdio->cur_page_len, sdio->cur_page_offset); |
652 | if (ret == dio->cur_page_len) { | 667 | if (ret == sdio->cur_page_len) { |
653 | /* | 668 | /* |
654 | * Decrement count only, if we are done with this page | 669 | * Decrement count only, if we are done with this page |
655 | */ | 670 | */ |
656 | if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) | 671 | if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) |
657 | dio->pages_in_io--; | 672 | sdio->pages_in_io--; |
658 | page_cache_get(dio->cur_page); | 673 | page_cache_get(sdio->cur_page); |
659 | dio->final_block_in_bio = dio->cur_page_block + | 674 | sdio->final_block_in_bio = sdio->cur_page_block + |
660 | (dio->cur_page_len >> dio->blkbits); | 675 | (sdio->cur_page_len >> sdio->blkbits); |
661 | ret = 0; | 676 | ret = 0; |
662 | } else { | 677 | } else { |
663 | ret = 1; | 678 | ret = 1; |
@@ -675,14 +690,15 @@ static int dio_bio_add_page(struct dio *dio) | |||
675 | * The caller of this function is responsible for removing cur_page from the | 690 | * The caller of this function is responsible for removing cur_page from the |
676 | * dio, and for dropping the refcount which came from that presence. | 691 | * dio, and for dropping the refcount which came from that presence. |
677 | */ | 692 | */ |
678 | static int dio_send_cur_page(struct dio *dio) | 693 | static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, |
694 | struct buffer_head *map_bh) | ||
679 | { | 695 | { |
680 | int ret = 0; | 696 | int ret = 0; |
681 | 697 | ||
682 | if (dio->bio) { | 698 | if (sdio->bio) { |
683 | loff_t cur_offset = dio->cur_page_fs_offset; | 699 | loff_t cur_offset = sdio->cur_page_fs_offset; |
684 | loff_t bio_next_offset = dio->logical_offset_in_bio + | 700 | loff_t bio_next_offset = sdio->logical_offset_in_bio + |
685 | dio->bio->bi_size; | 701 | sdio->bio->bi_size; |
686 | 702 | ||
687 | /* | 703 | /* |
688 | * See whether this new request is contiguous with the old. | 704 | * See whether this new request is contiguous with the old. |
@@ -698,28 +714,28 @@ static int dio_send_cur_page(struct dio *dio) | |||
698 | * be the next logical offset in the bio, submit the bio we | 714 | * be the next logical offset in the bio, submit the bio we |
699 | * have. | 715 | * have. |
700 | */ | 716 | */ |
701 | if (dio->final_block_in_bio != dio->cur_page_block || | 717 | if (sdio->final_block_in_bio != sdio->cur_page_block || |
702 | cur_offset != bio_next_offset) | 718 | cur_offset != bio_next_offset) |
703 | dio_bio_submit(dio); | 719 | dio_bio_submit(dio, sdio); |
704 | /* | 720 | /* |
705 | * Submit now if the underlying fs is about to perform a | 721 | * Submit now if the underlying fs is about to perform a |
706 | * metadata read | 722 | * metadata read |
707 | */ | 723 | */ |
708 | else if (dio->boundary) | 724 | else if (sdio->boundary) |
709 | dio_bio_submit(dio); | 725 | dio_bio_submit(dio, sdio); |
710 | } | 726 | } |
711 | 727 | ||
712 | if (dio->bio == NULL) { | 728 | if (sdio->bio == NULL) { |
713 | ret = dio_new_bio(dio, dio->cur_page_block); | 729 | ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); |
714 | if (ret) | 730 | if (ret) |
715 | goto out; | 731 | goto out; |
716 | } | 732 | } |
717 | 733 | ||
718 | if (dio_bio_add_page(dio) != 0) { | 734 | if (dio_bio_add_page(sdio) != 0) { |
719 | dio_bio_submit(dio); | 735 | dio_bio_submit(dio, sdio); |
720 | ret = dio_new_bio(dio, dio->cur_page_block); | 736 | ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); |
721 | if (ret == 0) { | 737 | if (ret == 0) { |
722 | ret = dio_bio_add_page(dio); | 738 | ret = dio_bio_add_page(sdio); |
723 | BUG_ON(ret != 0); | 739 | BUG_ON(ret != 0); |
724 | } | 740 | } |
725 | } | 741 | } |
@@ -744,9 +760,10 @@ out: | |||
744 | * If that doesn't work out then we put the old page into the bio and add this | 760 | * If that doesn't work out then we put the old page into the bio and add this |
745 | * page to the dio instead. | 761 | * page to the dio instead. |
746 | */ | 762 | */ |
747 | static int | 763 | static inline int |
748 | submit_page_section(struct dio *dio, struct page *page, | 764 | submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, |
749 | unsigned offset, unsigned len, sector_t blocknr) | 765 | unsigned offset, unsigned len, sector_t blocknr, |
766 | struct buffer_head *map_bh) | ||
750 | { | 767 | { |
751 | int ret = 0; | 768 | int ret = 0; |
752 | 769 | ||
@@ -760,20 +777,20 @@ submit_page_section(struct dio *dio, struct page *page, | |||
760 | /* | 777 | /* |
761 | * Can we just grow the current page's presence in the dio? | 778 | * Can we just grow the current page's presence in the dio? |
762 | */ | 779 | */ |
763 | if ( (dio->cur_page == page) && | 780 | if (sdio->cur_page == page && |
764 | (dio->cur_page_offset + dio->cur_page_len == offset) && | 781 | sdio->cur_page_offset + sdio->cur_page_len == offset && |
765 | (dio->cur_page_block + | 782 | sdio->cur_page_block + |
766 | (dio->cur_page_len >> dio->blkbits) == blocknr)) { | 783 | (sdio->cur_page_len >> sdio->blkbits) == blocknr) { |
767 | dio->cur_page_len += len; | 784 | sdio->cur_page_len += len; |
768 | 785 | ||
769 | /* | 786 | /* |
770 | * If dio->boundary then we want to schedule the IO now to | 787 | * If sdio->boundary then we want to schedule the IO now to |
771 | * avoid metadata seeks. | 788 | * avoid metadata seeks. |
772 | */ | 789 | */ |
773 | if (dio->boundary) { | 790 | if (sdio->boundary) { |
774 | ret = dio_send_cur_page(dio); | 791 | ret = dio_send_cur_page(dio, sdio, map_bh); |
775 | page_cache_release(dio->cur_page); | 792 | page_cache_release(sdio->cur_page); |
776 | dio->cur_page = NULL; | 793 | sdio->cur_page = NULL; |
777 | } | 794 | } |
778 | goto out; | 795 | goto out; |
779 | } | 796 | } |
@@ -781,20 +798,20 @@ submit_page_section(struct dio *dio, struct page *page, | |||
781 | /* | 798 | /* |
782 | * If there's a deferred page already there then send it. | 799 | * If there's a deferred page already there then send it. |
783 | */ | 800 | */ |
784 | if (dio->cur_page) { | 801 | if (sdio->cur_page) { |
785 | ret = dio_send_cur_page(dio); | 802 | ret = dio_send_cur_page(dio, sdio, map_bh); |
786 | page_cache_release(dio->cur_page); | 803 | page_cache_release(sdio->cur_page); |
787 | dio->cur_page = NULL; | 804 | sdio->cur_page = NULL; |
788 | if (ret) | 805 | if (ret) |
789 | goto out; | 806 | goto out; |
790 | } | 807 | } |
791 | 808 | ||
792 | page_cache_get(page); /* It is in dio */ | 809 | page_cache_get(page); /* It is in dio */ |
793 | dio->cur_page = page; | 810 | sdio->cur_page = page; |
794 | dio->cur_page_offset = offset; | 811 | sdio->cur_page_offset = offset; |
795 | dio->cur_page_len = len; | 812 | sdio->cur_page_len = len; |
796 | dio->cur_page_block = blocknr; | 813 | sdio->cur_page_block = blocknr; |
797 | dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; | 814 | sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; |
798 | out: | 815 | out: |
799 | return ret; | 816 | return ret; |
800 | } | 817 | } |
@@ -804,16 +821,16 @@ out: | |||
804 | * file blocks. Only called for S_ISREG files - blockdevs do not set | 821 | * file blocks. Only called for S_ISREG files - blockdevs do not set |
805 | * buffer_new | 822 | * buffer_new |
806 | */ | 823 | */ |
807 | static void clean_blockdev_aliases(struct dio *dio) | 824 | static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) |
808 | { | 825 | { |
809 | unsigned i; | 826 | unsigned i; |
810 | unsigned nblocks; | 827 | unsigned nblocks; |
811 | 828 | ||
812 | nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; | 829 | nblocks = map_bh->b_size >> dio->inode->i_blkbits; |
813 | 830 | ||
814 | for (i = 0; i < nblocks; i++) { | 831 | for (i = 0; i < nblocks; i++) { |
815 | unmap_underlying_metadata(dio->map_bh.b_bdev, | 832 | unmap_underlying_metadata(map_bh->b_bdev, |
816 | dio->map_bh.b_blocknr + i); | 833 | map_bh->b_blocknr + i); |
817 | } | 834 | } |
818 | } | 835 | } |
819 | 836 | ||
@@ -826,19 +843,20 @@ static void clean_blockdev_aliases(struct dio *dio) | |||
826 | * `end' is zero if we're doing the start of the IO, 1 at the end of the | 843 | * `end' is zero if we're doing the start of the IO, 1 at the end of the |
827 | * IO. | 844 | * IO. |
828 | */ | 845 | */ |
829 | static void dio_zero_block(struct dio *dio, int end) | 846 | static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, |
847 | int end, struct buffer_head *map_bh) | ||
830 | { | 848 | { |
831 | unsigned dio_blocks_per_fs_block; | 849 | unsigned dio_blocks_per_fs_block; |
832 | unsigned this_chunk_blocks; /* In dio_blocks */ | 850 | unsigned this_chunk_blocks; /* In dio_blocks */ |
833 | unsigned this_chunk_bytes; | 851 | unsigned this_chunk_bytes; |
834 | struct page *page; | 852 | struct page *page; |
835 | 853 | ||
836 | dio->start_zero_done = 1; | 854 | sdio->start_zero_done = 1; |
837 | if (!dio->blkfactor || !buffer_new(&dio->map_bh)) | 855 | if (!sdio->blkfactor || !buffer_new(map_bh)) |
838 | return; | 856 | return; |
839 | 857 | ||
840 | dio_blocks_per_fs_block = 1 << dio->blkfactor; | 858 | dio_blocks_per_fs_block = 1 << sdio->blkfactor; |
841 | this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); | 859 | this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); |
842 | 860 | ||
843 | if (!this_chunk_blocks) | 861 | if (!this_chunk_blocks) |
844 | return; | 862 | return; |
@@ -850,14 +868,14 @@ static void dio_zero_block(struct dio *dio, int end) | |||
850 | if (end) | 868 | if (end) |
851 | this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; | 869 | this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; |
852 | 870 | ||
853 | this_chunk_bytes = this_chunk_blocks << dio->blkbits; | 871 | this_chunk_bytes = this_chunk_blocks << sdio->blkbits; |
854 | 872 | ||
855 | page = ZERO_PAGE(0); | 873 | page = ZERO_PAGE(0); |
856 | if (submit_page_section(dio, page, 0, this_chunk_bytes, | 874 | if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, |
857 | dio->next_block_for_io)) | 875 | sdio->next_block_for_io, map_bh)) |
858 | return; | 876 | return; |
859 | 877 | ||
860 | dio->next_block_for_io += this_chunk_blocks; | 878 | sdio->next_block_for_io += this_chunk_blocks; |
861 | } | 879 | } |
862 | 880 | ||
863 | /* | 881 | /* |
@@ -876,20 +894,20 @@ static void dio_zero_block(struct dio *dio, int end) | |||
876 | * it should set b_size to PAGE_SIZE or more inside get_block(). This gives | 894 | * it should set b_size to PAGE_SIZE or more inside get_block(). This gives |
877 | * fine alignment but still allows this function to work in PAGE_SIZE units. | 895 | * fine alignment but still allows this function to work in PAGE_SIZE units. |
878 | */ | 896 | */ |
879 | static int do_direct_IO(struct dio *dio) | 897 | static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, |
898 | struct buffer_head *map_bh) | ||
880 | { | 899 | { |
881 | const unsigned blkbits = dio->blkbits; | 900 | const unsigned blkbits = sdio->blkbits; |
882 | const unsigned blocks_per_page = PAGE_SIZE >> blkbits; | 901 | const unsigned blocks_per_page = PAGE_SIZE >> blkbits; |
883 | struct page *page; | 902 | struct page *page; |
884 | unsigned block_in_page; | 903 | unsigned block_in_page; |
885 | struct buffer_head *map_bh = &dio->map_bh; | ||
886 | int ret = 0; | 904 | int ret = 0; |
887 | 905 | ||
888 | /* The I/O can start at any block offset within the first page */ | 906 | /* The I/O can start at any block offset within the first page */ |
889 | block_in_page = dio->first_block_in_page; | 907 | block_in_page = sdio->first_block_in_page; |
890 | 908 | ||
891 | while (dio->block_in_file < dio->final_block_in_request) { | 909 | while (sdio->block_in_file < sdio->final_block_in_request) { |
892 | page = dio_get_page(dio); | 910 | page = dio_get_page(dio, sdio); |
893 | if (IS_ERR(page)) { | 911 | if (IS_ERR(page)) { |
894 | ret = PTR_ERR(page); | 912 | ret = PTR_ERR(page); |
895 | goto out; | 913 | goto out; |
@@ -901,14 +919,14 @@ static int do_direct_IO(struct dio *dio) | |||
901 | unsigned this_chunk_blocks; /* # of blocks */ | 919 | unsigned this_chunk_blocks; /* # of blocks */ |
902 | unsigned u; | 920 | unsigned u; |
903 | 921 | ||
904 | if (dio->blocks_available == 0) { | 922 | if (sdio->blocks_available == 0) { |
905 | /* | 923 | /* |
906 | * Need to go and map some more disk | 924 | * Need to go and map some more disk |
907 | */ | 925 | */ |
908 | unsigned long blkmask; | 926 | unsigned long blkmask; |
909 | unsigned long dio_remainder; | 927 | unsigned long dio_remainder; |
910 | 928 | ||
911 | ret = get_more_blocks(dio); | 929 | ret = get_more_blocks(dio, sdio, map_bh); |
912 | if (ret) { | 930 | if (ret) { |
913 | page_cache_release(page); | 931 | page_cache_release(page); |
914 | goto out; | 932 | goto out; |
@@ -916,18 +934,18 @@ static int do_direct_IO(struct dio *dio) | |||
916 | if (!buffer_mapped(map_bh)) | 934 | if (!buffer_mapped(map_bh)) |
917 | goto do_holes; | 935 | goto do_holes; |
918 | 936 | ||
919 | dio->blocks_available = | 937 | sdio->blocks_available = |
920 | map_bh->b_size >> dio->blkbits; | 938 | map_bh->b_size >> sdio->blkbits; |
921 | dio->next_block_for_io = | 939 | sdio->next_block_for_io = |
922 | map_bh->b_blocknr << dio->blkfactor; | 940 | map_bh->b_blocknr << sdio->blkfactor; |
923 | if (buffer_new(map_bh)) | 941 | if (buffer_new(map_bh)) |
924 | clean_blockdev_aliases(dio); | 942 | clean_blockdev_aliases(dio, map_bh); |
925 | 943 | ||
926 | if (!dio->blkfactor) | 944 | if (!sdio->blkfactor) |
927 | goto do_holes; | 945 | goto do_holes; |
928 | 946 | ||
929 | blkmask = (1 << dio->blkfactor) - 1; | 947 | blkmask = (1 << sdio->blkfactor) - 1; |
930 | dio_remainder = (dio->block_in_file & blkmask); | 948 | dio_remainder = (sdio->block_in_file & blkmask); |
931 | 949 | ||
932 | /* | 950 | /* |
933 | * If we are at the start of IO and that IO | 951 | * If we are at the start of IO and that IO |
@@ -941,8 +959,8 @@ static int do_direct_IO(struct dio *dio) | |||
941 | * on-disk | 959 | * on-disk |
942 | */ | 960 | */ |
943 | if (!buffer_new(map_bh)) | 961 | if (!buffer_new(map_bh)) |
944 | dio->next_block_for_io += dio_remainder; | 962 | sdio->next_block_for_io += dio_remainder; |
945 | dio->blocks_available -= dio_remainder; | 963 | sdio->blocks_available -= dio_remainder; |
946 | } | 964 | } |
947 | do_holes: | 965 | do_holes: |
948 | /* Handle holes */ | 966 | /* Handle holes */ |
@@ -961,7 +979,7 @@ do_holes: | |||
961 | */ | 979 | */ |
962 | i_size_aligned = ALIGN(i_size_read(dio->inode), | 980 | i_size_aligned = ALIGN(i_size_read(dio->inode), |
963 | 1 << blkbits); | 981 | 1 << blkbits); |
964 | if (dio->block_in_file >= | 982 | if (sdio->block_in_file >= |
965 | i_size_aligned >> blkbits) { | 983 | i_size_aligned >> blkbits) { |
966 | /* We hit eof */ | 984 | /* We hit eof */ |
967 | page_cache_release(page); | 985 | page_cache_release(page); |
@@ -969,7 +987,7 @@ do_holes: | |||
969 | } | 987 | } |
970 | zero_user(page, block_in_page << blkbits, | 988 | zero_user(page, block_in_page << blkbits, |
971 | 1 << blkbits); | 989 | 1 << blkbits); |
972 | dio->block_in_file++; | 990 | sdio->block_in_file++; |
973 | block_in_page++; | 991 | block_in_page++; |
974 | goto next_block; | 992 | goto next_block; |
975 | } | 993 | } |
@@ -979,38 +997,41 @@ do_holes: | |||
979 | * is finer than the underlying fs, go check to see if | 997 | * is finer than the underlying fs, go check to see if |
980 | * we must zero out the start of this block. | 998 | * we must zero out the start of this block. |
981 | */ | 999 | */ |
982 | if (unlikely(dio->blkfactor && !dio->start_zero_done)) | 1000 | if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) |
983 | dio_zero_block(dio, 0); | 1001 | dio_zero_block(dio, sdio, 0, map_bh); |
984 | 1002 | ||
985 | /* | 1003 | /* |
986 | * Work out, in this_chunk_blocks, how much disk we | 1004 | * Work out, in this_chunk_blocks, how much disk we |
987 | * can add to this page | 1005 | * can add to this page |
988 | */ | 1006 | */ |
989 | this_chunk_blocks = dio->blocks_available; | 1007 | this_chunk_blocks = sdio->blocks_available; |
990 | u = (PAGE_SIZE - offset_in_page) >> blkbits; | 1008 | u = (PAGE_SIZE - offset_in_page) >> blkbits; |
991 | if (this_chunk_blocks > u) | 1009 | if (this_chunk_blocks > u) |
992 | this_chunk_blocks = u; | 1010 | this_chunk_blocks = u; |
993 | u = dio->final_block_in_request - dio->block_in_file; | 1011 | u = sdio->final_block_in_request - sdio->block_in_file; |
994 | if (this_chunk_blocks > u) | 1012 | if (this_chunk_blocks > u) |
995 | this_chunk_blocks = u; | 1013 | this_chunk_blocks = u; |
996 | this_chunk_bytes = this_chunk_blocks << blkbits; | 1014 | this_chunk_bytes = this_chunk_blocks << blkbits; |
997 | BUG_ON(this_chunk_bytes == 0); | 1015 | BUG_ON(this_chunk_bytes == 0); |
998 | 1016 | ||
999 | dio->boundary = buffer_boundary(map_bh); | 1017 | sdio->boundary = buffer_boundary(map_bh); |
1000 | ret = submit_page_section(dio, page, offset_in_page, | 1018 | ret = submit_page_section(dio, sdio, page, |
1001 | this_chunk_bytes, dio->next_block_for_io); | 1019 | offset_in_page, |
1020 | this_chunk_bytes, | ||
1021 | sdio->next_block_for_io, | ||
1022 | map_bh); | ||
1002 | if (ret) { | 1023 | if (ret) { |
1003 | page_cache_release(page); | 1024 | page_cache_release(page); |
1004 | goto out; | 1025 | goto out; |
1005 | } | 1026 | } |
1006 | dio->next_block_for_io += this_chunk_blocks; | 1027 | sdio->next_block_for_io += this_chunk_blocks; |
1007 | 1028 | ||
1008 | dio->block_in_file += this_chunk_blocks; | 1029 | sdio->block_in_file += this_chunk_blocks; |
1009 | block_in_page += this_chunk_blocks; | 1030 | block_in_page += this_chunk_blocks; |
1010 | dio->blocks_available -= this_chunk_blocks; | 1031 | sdio->blocks_available -= this_chunk_blocks; |
1011 | next_block: | 1032 | next_block: |
1012 | BUG_ON(dio->block_in_file > dio->final_block_in_request); | 1033 | BUG_ON(sdio->block_in_file > sdio->final_block_in_request); |
1013 | if (dio->block_in_file == dio->final_block_in_request) | 1034 | if (sdio->block_in_file == sdio->final_block_in_request) |
1014 | break; | 1035 | break; |
1015 | } | 1036 | } |
1016 | 1037 | ||
@@ -1022,135 +1043,10 @@ out: | |||
1022 | return ret; | 1043 | return ret; |
1023 | } | 1044 | } |
1024 | 1045 | ||
1025 | static ssize_t | 1046 | static inline int drop_refcount(struct dio *dio) |
1026 | direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | ||
1027 | const struct iovec *iov, loff_t offset, unsigned long nr_segs, | ||
1028 | unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, | ||
1029 | dio_submit_t submit_io, struct dio *dio) | ||
1030 | { | 1047 | { |
1031 | unsigned long user_addr; | 1048 | int ret2; |
1032 | unsigned long flags; | 1049 | unsigned long flags; |
1033 | int seg; | ||
1034 | ssize_t ret = 0; | ||
1035 | ssize_t ret2; | ||
1036 | size_t bytes; | ||
1037 | |||
1038 | dio->inode = inode; | ||
1039 | dio->rw = rw; | ||
1040 | dio->blkbits = blkbits; | ||
1041 | dio->blkfactor = inode->i_blkbits - blkbits; | ||
1042 | dio->block_in_file = offset >> blkbits; | ||
1043 | |||
1044 | dio->get_block = get_block; | ||
1045 | dio->end_io = end_io; | ||
1046 | dio->submit_io = submit_io; | ||
1047 | dio->final_block_in_bio = -1; | ||
1048 | dio->next_block_for_io = -1; | ||
1049 | |||
1050 | dio->iocb = iocb; | ||
1051 | dio->i_size = i_size_read(inode); | ||
1052 | |||
1053 | spin_lock_init(&dio->bio_lock); | ||
1054 | dio->refcount = 1; | ||
1055 | |||
1056 | /* | ||
1057 | * In case of non-aligned buffers, we may need 2 more | ||
1058 | * pages since we need to zero out first and last block. | ||
1059 | */ | ||
1060 | if (unlikely(dio->blkfactor)) | ||
1061 | dio->pages_in_io = 2; | ||
1062 | |||
1063 | for (seg = 0; seg < nr_segs; seg++) { | ||
1064 | user_addr = (unsigned long)iov[seg].iov_base; | ||
1065 | dio->pages_in_io += | ||
1066 | ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE | ||
1067 | - user_addr/PAGE_SIZE); | ||
1068 | } | ||
1069 | |||
1070 | for (seg = 0; seg < nr_segs; seg++) { | ||
1071 | user_addr = (unsigned long)iov[seg].iov_base; | ||
1072 | dio->size += bytes = iov[seg].iov_len; | ||
1073 | |||
1074 | /* Index into the first page of the first block */ | ||
1075 | dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; | ||
1076 | dio->final_block_in_request = dio->block_in_file + | ||
1077 | (bytes >> blkbits); | ||
1078 | /* Page fetching state */ | ||
1079 | dio->head = 0; | ||
1080 | dio->tail = 0; | ||
1081 | dio->curr_page = 0; | ||
1082 | |||
1083 | dio->total_pages = 0; | ||
1084 | if (user_addr & (PAGE_SIZE-1)) { | ||
1085 | dio->total_pages++; | ||
1086 | bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); | ||
1087 | } | ||
1088 | dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | ||
1089 | dio->curr_user_address = user_addr; | ||
1090 | |||
1091 | ret = do_direct_IO(dio); | ||
1092 | |||
1093 | dio->result += iov[seg].iov_len - | ||
1094 | ((dio->final_block_in_request - dio->block_in_file) << | ||
1095 | blkbits); | ||
1096 | |||
1097 | if (ret) { | ||
1098 | dio_cleanup(dio); | ||
1099 | break; | ||
1100 | } | ||
1101 | } /* end iovec loop */ | ||
1102 | |||
1103 | if (ret == -ENOTBLK) { | ||
1104 | /* | ||
1105 | * The remaining part of the request will be | ||
1106 | * handled by buffered I/O when we return | ||
1107 | */ | ||
1108 | ret = 0; | ||
1109 | } | ||
1110 | /* | ||
1111 | * There may be some unwritten disk at the end of a part-written | ||
1112 | * fs-block-sized block. Go zero that now. | ||
1113 | */ | ||
1114 | dio_zero_block(dio, 1); | ||
1115 | |||
1116 | if (dio->cur_page) { | ||
1117 | ret2 = dio_send_cur_page(dio); | ||
1118 | if (ret == 0) | ||
1119 | ret = ret2; | ||
1120 | page_cache_release(dio->cur_page); | ||
1121 | dio->cur_page = NULL; | ||
1122 | } | ||
1123 | if (dio->bio) | ||
1124 | dio_bio_submit(dio); | ||
1125 | |||
1126 | /* | ||
1127 | * It is possible that we return short IO due to end of file. | ||
1128 | * In that case, we need to release all the pages we got hold on. | ||
1129 | */ | ||
1130 | dio_cleanup(dio); | ||
1131 | |||
1132 | /* | ||
1133 | * All block lookups have been performed. For READ requests | ||
1134 | * we can let i_mutex go now that it's achieved its purpose | ||
1135 | * of protecting us from looking up uninitialized blocks. | ||
1136 | */ | ||
1137 | if (rw == READ && (dio->flags & DIO_LOCKING)) | ||
1138 | mutex_unlock(&dio->inode->i_mutex); | ||
1139 | |||
1140 | /* | ||
1141 | * The only time we want to leave bios in flight is when a successful | ||
1142 | * partial aio read or full aio write has been set up. In that case | ||
1143 | * bio completion will call aio_complete. The only time it's safe to | ||
1144 | * call aio_complete is when we return -EIOCBQUEUED, so we key on that. | ||
1145 | * This had *better* be the only place that raises -EIOCBQUEUED. | ||
1146 | */ | ||
1147 | BUG_ON(ret == -EIOCBQUEUED); | ||
1148 | if (dio->is_async && ret == 0 && dio->result && | ||
1149 | ((rw & READ) || (dio->result == dio->size))) | ||
1150 | ret = -EIOCBQUEUED; | ||
1151 | |||
1152 | if (ret != -EIOCBQUEUED) | ||
1153 | dio_await_completion(dio); | ||
1154 | 1050 | ||
1155 | /* | 1051 | /* |
1156 | * Sync will always be dropping the final ref and completing the | 1052 | * Sync will always be dropping the final ref and completing the |
@@ -1166,14 +1062,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1166 | spin_lock_irqsave(&dio->bio_lock, flags); | 1062 | spin_lock_irqsave(&dio->bio_lock, flags); |
1167 | ret2 = --dio->refcount; | 1063 | ret2 = --dio->refcount; |
1168 | spin_unlock_irqrestore(&dio->bio_lock, flags); | 1064 | spin_unlock_irqrestore(&dio->bio_lock, flags); |
1169 | 1065 | return ret2; | |
1170 | if (ret2 == 0) { | ||
1171 | ret = dio_complete(dio, offset, ret, false); | ||
1172 | kfree(dio); | ||
1173 | } else | ||
1174 | BUG_ON(ret != -EIOCBQUEUED); | ||
1175 | |||
1176 | return ret; | ||
1177 | } | 1066 | } |
1178 | 1067 | ||
1179 | /* | 1068 | /* |
@@ -1195,6 +1084,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1195 | * expected that filesystems provide exclusion between new direct I/O | 1084 | * expected that filesystems provide exclusion between new direct I/O |
1196 | * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, | 1085 | * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, |
1197 | * but other filesystems need to take care of this on their own. | 1086 | * but other filesystems need to take care of this on their own. |
1087 | * | ||
1088 | * NOTE: if you pass "sdio" to anything by pointer make sure that function | ||
1089 | * is always inlined. Otherwise gcc is unable to split the structure into | ||
1090 | * individual fields and will generate much worse code. This is important | ||
1091 | * for the whole file. | ||
1198 | */ | 1092 | */ |
1199 | ssize_t | 1093 | ssize_t |
1200 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1094 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
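The NOTE added above is the load-bearing constraint of this refactor: every helper that takes the on-stack struct dio_submit by pointer is made static inline, so gcc can scalarize the structure into registers instead of reloading fields from memory at each call. A minimal sketch of the pattern, with a hypothetical helper name (not from the patch):

        /* Hypothetical helper, for illustration only: because it is inlined
         * into its caller, gcc can keep sdio's fields in registers. */
        static inline void sdio_advance(struct dio_submit *sdio, unsigned blocks)
        {
                sdio->block_in_file += blocks;
                sdio->blocks_available -= blocks;
        }

A plain out-of-line version of the same helper would force sdio back to the stack at every call site, which is exactly the code-quality regression the NOTE warns about.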
@@ -1211,6 +1105,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1211 | ssize_t retval = -EINVAL; | 1105 | ssize_t retval = -EINVAL; |
1212 | loff_t end = offset; | 1106 | loff_t end = offset; |
1213 | struct dio *dio; | 1107 | struct dio *dio; |
1108 | struct dio_submit sdio = { 0, }; | ||
1109 | unsigned long user_addr; | ||
1110 | size_t bytes; | ||
1111 | struct buffer_head map_bh = { 0, }; | ||
1214 | 1112 | ||
1215 | if (rw & WRITE) | 1113 | if (rw & WRITE) |
1216 | rw = WRITE_ODIRECT; | 1114 | rw = WRITE_ODIRECT; |
@@ -1244,7 +1142,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1244 | if (rw == READ && end == offset) | 1142 | if (rw == READ && end == offset) |
1245 | return 0; | 1143 | return 0; |
1246 | 1144 | ||
1247 | dio = kmalloc(sizeof(*dio), GFP_KERNEL); | 1145 | dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); |
1248 | retval = -ENOMEM; | 1146 | retval = -ENOMEM; |
1249 | if (!dio) | 1147 | if (!dio) |
1250 | goto out; | 1148 | goto out; |
@@ -1268,7 +1166,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1268 | end - 1); | 1166 | end - 1); |
1269 | if (retval) { | 1167 | if (retval) { |
1270 | mutex_unlock(&inode->i_mutex); | 1168 | mutex_unlock(&inode->i_mutex); |
1271 | kfree(dio); | 1169 | kmem_cache_free(dio_cache, dio); |
1272 | goto out; | 1170 | goto out; |
1273 | } | 1171 | } |
1274 | } | 1172 | } |
@@ -1288,11 +1186,141 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1288 | dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && | 1186 | dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && |
1289 | (end > i_size_read(inode))); | 1187 | (end > i_size_read(inode))); |
1290 | 1188 | ||
1291 | retval = direct_io_worker(rw, iocb, inode, iov, offset, | 1189 | retval = 0; |
1292 | nr_segs, blkbits, get_block, end_io, | 1190 | |
1293 | submit_io, dio); | 1191 | dio->inode = inode; |
1192 | dio->rw = rw; | ||
1193 | sdio.blkbits = blkbits; | ||
1194 | sdio.blkfactor = inode->i_blkbits - blkbits; | ||
1195 | sdio.block_in_file = offset >> blkbits; | ||
1196 | |||
1197 | sdio.get_block = get_block; | ||
1198 | dio->end_io = end_io; | ||
1199 | sdio.submit_io = submit_io; | ||
1200 | sdio.final_block_in_bio = -1; | ||
1201 | sdio.next_block_for_io = -1; | ||
1202 | |||
1203 | dio->iocb = iocb; | ||
1204 | dio->i_size = i_size_read(inode); | ||
1205 | |||
1206 | spin_lock_init(&dio->bio_lock); | ||
1207 | dio->refcount = 1; | ||
1208 | |||
1209 | /* | ||
1210 | * In case of non-aligned buffers, we may need 2 more | ||
1211 | * pages since we need to zero out first and last block. | ||
1212 | */ | ||
1213 | if (unlikely(sdio.blkfactor)) | ||
1214 | sdio.pages_in_io = 2; | ||
1215 | |||
1216 | for (seg = 0; seg < nr_segs; seg++) { | ||
1217 | user_addr = (unsigned long)iov[seg].iov_base; | ||
1218 | sdio.pages_in_io += | ||
1219 | ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / | ||
1220 | PAGE_SIZE - user_addr / PAGE_SIZE); | ||
1221 | } | ||
1222 | |||
1223 | for (seg = 0; seg < nr_segs; seg++) { | ||
1224 | user_addr = (unsigned long)iov[seg].iov_base; | ||
1225 | sdio.size += bytes = iov[seg].iov_len; | ||
1226 | |||
1227 | /* Index into the first page of the first block */ | ||
1228 | sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; | ||
1229 | sdio.final_block_in_request = sdio.block_in_file + | ||
1230 | (bytes >> blkbits); | ||
1231 | /* Page fetching state */ | ||
1232 | sdio.head = 0; | ||
1233 | sdio.tail = 0; | ||
1234 | sdio.curr_page = 0; | ||
1235 | |||
1236 | sdio.total_pages = 0; | ||
1237 | if (user_addr & (PAGE_SIZE-1)) { | ||
1238 | sdio.total_pages++; | ||
1239 | bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); | ||
1240 | } | ||
1241 | sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | ||
1242 | sdio.curr_user_address = user_addr; | ||
1243 | |||
1244 | retval = do_direct_IO(dio, &sdio, &map_bh); | ||
1245 | |||
1246 | dio->result += iov[seg].iov_len - | ||
1247 | ((sdio.final_block_in_request - sdio.block_in_file) << | ||
1248 | blkbits); | ||
1249 | |||
1250 | if (retval) { | ||
1251 | dio_cleanup(dio, &sdio); | ||
1252 | break; | ||
1253 | } | ||
1254 | } /* end iovec loop */ | ||
1255 | |||
1256 | if (retval == -ENOTBLK) { | ||
1257 | /* | ||
1258 | * The remaining part of the request will be | ||
1259 | * be handled by buffered I/O when we return | ||
1260 | */ | ||
1261 | retval = 0; | ||
1262 | } | ||
1263 | /* | ||
1264 | * There may be some unwritten disk at the end of a part-written | ||
1265 | * fs-block-sized block. Go zero that now. | ||
1266 | */ | ||
1267 | dio_zero_block(dio, &sdio, 1, &map_bh); | ||
1268 | |||
1269 | if (sdio.cur_page) { | ||
1270 | ssize_t ret2; | ||
1271 | |||
1272 | ret2 = dio_send_cur_page(dio, &sdio, &map_bh); | ||
1273 | if (retval == 0) | ||
1274 | retval = ret2; | ||
1275 | page_cache_release(sdio.cur_page); | ||
1276 | sdio.cur_page = NULL; | ||
1277 | } | ||
1278 | if (sdio.bio) | ||
1279 | dio_bio_submit(dio, &sdio); | ||
1280 | |||
1281 | /* | ||
1282 | * It is possible that we return short IO due to end of file. | ||
1283 | * In that case, we need to release all the pages we got hold on. | ||
1284 | */ | ||
1285 | dio_cleanup(dio, &sdio); | ||
1286 | |||
1287 | /* | ||
1288 | * All block lookups have been performed. For READ requests | ||
1289 | * we can let i_mutex go now that it's achieved its purpose | ||
1290 | * of protecting us from looking up uninitialized blocks. | ||
1291 | */ | ||
1292 | if (rw == READ && (dio->flags & DIO_LOCKING)) | ||
1293 | mutex_unlock(&dio->inode->i_mutex); | ||
1294 | |||
1295 | /* | ||
1296 | * The only time we want to leave bios in flight is when a successful | ||
1297 | * partial aio read or full aio write has been set up. In that case | ||
1298 | * bio completion will call aio_complete. The only time it's safe to | ||
1299 | * call aio_complete is when we return -EIOCBQUEUED, so we key on that. | ||
1300 | * This had *better* be the only place that raises -EIOCBQUEUED. | ||
1301 | */ | ||
1302 | BUG_ON(retval == -EIOCBQUEUED); | ||
1303 | if (dio->is_async && retval == 0 && dio->result && | ||
1304 | ((rw & READ) || (dio->result == sdio.size))) | ||
1305 | retval = -EIOCBQUEUED; | ||
1306 | |||
1307 | if (retval != -EIOCBQUEUED) | ||
1308 | dio_await_completion(dio); | ||
1309 | |||
1310 | if (drop_refcount(dio) == 0) { | ||
1311 | retval = dio_complete(dio, offset, retval, false); | ||
1312 | kmem_cache_free(dio_cache, dio); | ||
1313 | } else | ||
1314 | BUG_ON(retval != -EIOCBQUEUED); | ||
1294 | 1315 | ||
1295 | out: | 1316 | out: |
1296 | return retval; | 1317 | return retval; |
1297 | } | 1318 | } |
1298 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1319 | EXPORT_SYMBOL(__blockdev_direct_IO); |
1320 | |||
1321 | static __init int dio_init(void) | ||
1322 | { | ||
1323 | dio_cache = KMEM_CACHE(dio, SLAB_PANIC); | ||
1324 | return 0; | ||
1325 | } | ||
1326 | module_init(dio_init) | ||
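To make the allocation change above concrete: struct dio moves from kmalloc()/kfree() to a dedicated slab cache, the usual kernel pattern for a hot, fixed-size object. A minimal sketch of the same pattern (helper names are illustrative, not from the patch):

        #include <linux/slab.h>

        static struct kmem_cache *dio_cache;    /* created once, as in dio_init() above */

        static struct dio *dio_alloc(void)
        {
                /* a cache sized exactly for struct dio is cheaper and less
                 * fragmenting than kmalloc() for a frequent allocation;
                 * may still return NULL under memory pressure */
                return kmem_cache_alloc(dio_cache, GFP_KERNEL);
        }

        static void dio_free(struct dio *dio)
        {
                kmem_cache_free(dio_cache, dio);
        }

KMEM_CACHE(dio, SLAB_PANIC) in dio_init() derives the cache name and object size from the struct definition and panics at boot if the cache cannot be created.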
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1cd6d9d3e29a..cc16562654de 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@ | |||
1 | config ECRYPT_FS | 1 | config ECRYPT_FS |
2 | tristate "eCrypt filesystem layer support (EXPERIMENTAL)" | 2 | tristate "eCrypt filesystem layer support (EXPERIMENTAL)" |
3 | depends on EXPERIMENTAL && KEYS && CRYPTO | 3 | depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) |
4 | select CRYPTO_ECB | 4 | select CRYPTO_ECB |
5 | select CRYPTO_CBC | 5 | select CRYPTO_CBC |
6 | select CRYPTO_MD5 | 6 | select CRYPTO_MD5 |
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 08a2b52bf565..ac1ad48c2376 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1973,7 +1973,7 @@ pki_encrypt_session_key(struct key *auth_tok_key, | |||
1973 | { | 1973 | { |
1974 | struct ecryptfs_msg_ctx *msg_ctx = NULL; | 1974 | struct ecryptfs_msg_ctx *msg_ctx = NULL; |
1975 | char *payload = NULL; | 1975 | char *payload = NULL; |
1976 | size_t payload_len; | 1976 | size_t payload_len = 0; |
1977 | struct ecryptfs_message *msg; | 1977 | struct ecryptfs_message *msg; |
1978 | int rc; | 1978 | int rc; |
1979 | 1979 | ||
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f1bb747d77d..b4a6befb1216 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, | |||
175 | ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, | 175 | ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, |
176 | ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, | 176 | ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, |
177 | ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, | 177 | ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, |
178 | ecryptfs_opt_check_dev_ruid, | ||
178 | ecryptfs_opt_err }; | 179 | ecryptfs_opt_err }; |
179 | 180 | ||
180 | static const match_table_t tokens = { | 181 | static const match_table_t tokens = { |
@@ -191,6 +192,7 @@ static const match_table_t tokens = { | |||
191 | {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, | 192 | {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, |
192 | {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, | 193 | {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, |
193 | {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, | 194 | {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, |
195 | {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, | ||
194 | {ecryptfs_opt_err, NULL} | 196 | {ecryptfs_opt_err, NULL} |
195 | }; | 197 | }; |
196 | 198 | ||
@@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat( | |||
236 | * ecryptfs_parse_options | 238 | * ecryptfs_parse_options |
237 | * @sb: The ecryptfs super block | 239 | * @sb: The ecryptfs super block |
238 | * @options: The options passed to the kernel | 240 | * @options: The options passed to the kernel |
241 | * @check_ruid: set to 1 if device uid should be checked against the ruid | ||
239 | * | 242 | * |
240 | * Parse mount options: | 243 | * Parse mount options: |
241 | * debug=N - ecryptfs_verbosity level for debug output | 244 | * debug=N - ecryptfs_verbosity level for debug output |
@@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat( | |||
251 | * | 254 | * |
252 | * Returns zero on success; non-zero on error | 255 | * Returns zero on success; non-zero on error |
253 | */ | 256 | */ |
254 | static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) | 257 | static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, |
258 | uid_t *check_ruid) | ||
255 | { | 259 | { |
256 | char *p; | 260 | char *p; |
257 | int rc = 0; | 261 | int rc = 0; |
@@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) | |||
276 | char *cipher_key_bytes_src; | 280 | char *cipher_key_bytes_src; |
277 | char *fn_cipher_key_bytes_src; | 281 | char *fn_cipher_key_bytes_src; |
278 | 282 | ||
283 | *check_ruid = 0; | ||
284 | |||
279 | if (!options) { | 285 | if (!options) { |
280 | rc = -EINVAL; | 286 | rc = -EINVAL; |
281 | goto out; | 287 | goto out; |
@@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) | |||
380 | mount_crypt_stat->flags |= | 386 | mount_crypt_stat->flags |= |
381 | ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; | 387 | ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; |
382 | break; | 388 | break; |
389 | case ecryptfs_opt_check_dev_ruid: | ||
390 | *check_ruid = 1; | ||
391 | break; | ||
383 | case ecryptfs_opt_err: | 392 | case ecryptfs_opt_err: |
384 | default: | 393 | default: |
385 | printk(KERN_WARNING | 394 | printk(KERN_WARNING |
@@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags | |||
475 | const char *err = "Getting sb failed"; | 484 | const char *err = "Getting sb failed"; |
476 | struct inode *inode; | 485 | struct inode *inode; |
477 | struct path path; | 486 | struct path path; |
487 | uid_t check_ruid; | ||
478 | int rc; | 488 | int rc; |
479 | 489 | ||
480 | sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); | 490 | sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); |
@@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags | |||
483 | goto out; | 493 | goto out; |
484 | } | 494 | } |
485 | 495 | ||
486 | rc = ecryptfs_parse_options(sbi, raw_data); | 496 | rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); |
487 | if (rc) { | 497 | if (rc) { |
488 | err = "Error parsing options"; | 498 | err = "Error parsing options"; |
489 | goto out; | 499 | goto out; |
@@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags | |||
521 | "known incompatibilities\n"); | 531 | "known incompatibilities\n"); |
522 | goto out_free; | 532 | goto out_free; |
523 | } | 533 | } |
534 | |||
535 | if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { | ||
536 | rc = -EPERM; | ||
537 | printk(KERN_ERR "Mount of device (uid: %d) not owned by " | ||
538 | "requested user (uid: %d)\n", | ||
539 | path.dentry->d_inode->i_uid, current_uid()); | ||
540 | goto out_free; | ||
541 | } | ||
542 | |||
524 | ecryptfs_set_superblock_lower(s, path.dentry->d_sb); | 543 | ecryptfs_set_superblock_lower(s, path.dentry->d_sb); |
525 | s->s_maxbytes = path.dentry->d_sb->s_maxbytes; | 544 | s->s_maxbytes = path.dentry->d_sb->s_maxbytes; |
526 | s->s_blocksize = path.dentry->d_sb->s_blocksize; | 545 | s->s_blocksize = path.dentry->d_sb->s_blocksize; |
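The new ecryptfs_check_dev_ruid option plugs into the standard match_table_t mount-option machinery visible in the hunks above. A condensed sketch of how such a flag is parsed; the loop skeleton follows the usual kernel idiom and omits ecryptfs' other tokens:

        #include <linux/parser.h>

        /* condensed: the real ecryptfs_parse_options() handles many more
         * tokens and argument conversions; "tokens" is the match_table_t
         * extended in the hunk above */
        static void parse_check_ruid(char *options, uid_t *check_ruid)
        {
                substring_t args[MAX_OPT_ARGS];
                char *p;

                *check_ruid = 0;
                while ((p = strsep(&options, ",")) != NULL) {
                        if (!*p)
                                continue;
                        if (match_token(p, tokens, args) ==
                            ecryptfs_opt_check_dev_ruid)
                                *check_ruid = 1;
                }
        }

At mount time the flag then gates the i_uid vs. current_uid() comparison added to ecryptfs_mount() above.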
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 85d430963116..3745f7c2b9c2 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -39,15 +39,16 @@ | |||
39 | int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, | 39 | int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, |
40 | loff_t offset, size_t size) | 40 | loff_t offset, size_t size) |
41 | { | 41 | { |
42 | struct ecryptfs_inode_info *inode_info; | 42 | struct file *lower_file; |
43 | mm_segment_t fs_save; | 43 | mm_segment_t fs_save; |
44 | ssize_t rc; | 44 | ssize_t rc; |
45 | 45 | ||
46 | inode_info = ecryptfs_inode_to_private(ecryptfs_inode); | 46 | lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; |
47 | BUG_ON(!inode_info->lower_file); | 47 | if (!lower_file) |
48 | return -EIO; | ||
48 | fs_save = get_fs(); | 49 | fs_save = get_fs(); |
49 | set_fs(get_ds()); | 50 | set_fs(get_ds()); |
50 | rc = vfs_write(inode_info->lower_file, data, size, &offset); | 51 | rc = vfs_write(lower_file, data, size, &offset); |
51 | set_fs(fs_save); | 52 | set_fs(fs_save); |
52 | mark_inode_dirty_sync(ecryptfs_inode); | 53 | mark_inode_dirty_sync(ecryptfs_inode); |
53 | return rc; | 54 | return rc; |
@@ -225,15 +226,16 @@ out: | |||
225 | int ecryptfs_read_lower(char *data, loff_t offset, size_t size, | 226 | int ecryptfs_read_lower(char *data, loff_t offset, size_t size, |
226 | struct inode *ecryptfs_inode) | 227 | struct inode *ecryptfs_inode) |
227 | { | 228 | { |
228 | struct ecryptfs_inode_info *inode_info = | 229 | struct file *lower_file; |
229 | ecryptfs_inode_to_private(ecryptfs_inode); | ||
230 | mm_segment_t fs_save; | 230 | mm_segment_t fs_save; |
231 | ssize_t rc; | 231 | ssize_t rc; |
232 | 232 | ||
233 | BUG_ON(!inode_info->lower_file); | 233 | lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; |
234 | if (!lower_file) | ||
235 | return -EIO; | ||
234 | fs_save = get_fs(); | 236 | fs_save = get_fs(); |
235 | set_fs(get_ds()); | 237 | set_fs(get_ds()); |
236 | rc = vfs_read(inode_info->lower_file, data, size, &offset); | 238 | rc = vfs_read(lower_file, data, size, &offset); |
237 | set_fs(fs_save); | 239 | set_fs(fs_save); |
238 | return rc; | 240 | return rc; |
239 | } | 241 | } |
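Both hunks replace a BUG_ON() with a graceful -EIO when the lower file has gone away, and hoist the lookup into a local. The surrounding get_fs()/set_fs() pair is what makes it legal to hand a kernel buffer to vfs_read()/vfs_write(), whose prototypes expect user pointers. A minimal sketch of that idiom under the 3.x-era API:

        #include <linux/fs.h>
        #include <linux/uaccess.h>

        /* read into a kernel buffer through the VFS; KERNEL_DS (via get_ds())
         * widens the address-limit check so a kernel pointer passes */
        static ssize_t read_lower_sketch(struct file *lower_file, char *buf,
                                         size_t size, loff_t offset)
        {
                mm_segment_t fs_save;
                ssize_t rc;

                if (!lower_file)
                        return -EIO;    /* mirrors the hunks above */
                fs_save = get_fs();
                set_fs(get_ds());
                rc = vfs_read(lower_file, buf, size, &offset);
                set_fs(fs_save);
                return rc;
        }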
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index fe047d966dc5..9026fc91fe3b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -700,7 +700,7 @@ static const struct file_operations eventpoll_fops = { | |||
700 | .llseek = noop_llseek, | 700 | .llseek = noop_llseek, |
701 | }; | 701 | }; |
702 | 702 | ||
703 | /* Fast test to see if the file is an evenpoll file */ | 703 | /* Fast test to see if the file is an eventpoll file */ |
704 | static inline int is_file_epoll(struct file *f) | 704 | static inline int is_file_epoll(struct file *f) |
705 | { | 705 | { |
706 | return f->f_op == &eventpoll_fops; | 706 | return f->f_op == &eventpoll_fops; |
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1459,6 +1459,23 @@ static int do_execve_common(const char *filename, | |||
1459 | struct files_struct *displaced; | 1459 | struct files_struct *displaced; |
1460 | bool clear_in_exec; | 1460 | bool clear_in_exec; |
1461 | int retval; | 1461 | int retval; |
1462 | const struct cred *cred = current_cred(); | ||
1463 | |||
1464 | /* | ||
1465 | * We move the actual failure in case of RLIMIT_NPROC excess from | ||
1466 | * set*uid() to execve() because too many poorly written programs | ||
1467 | * don't check setuid() return code. Here we additionally recheck | ||
1468 | * whether NPROC limit is still exceeded. | ||
1469 | */ | ||
1470 | if ((current->flags & PF_NPROC_EXCEEDED) && | ||
1471 | atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { | ||
1472 | retval = -EAGAIN; | ||
1473 | goto out_ret; | ||
1474 | } | ||
1475 | |||
1476 | /* We're below the limit (still or again), so we don't want to make | ||
1477 | * further execve() calls fail. */ | ||
1478 | current->flags &= ~PF_NPROC_EXCEEDED; | ||
1462 | 1479 | ||
1463 | retval = unshare_files(&displaced); | 1480 | retval = unshare_files(&displaced); |
1464 | if (retval) | 1481 | if (retval) |
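The rationale in the new comment is easiest to see from userspace. A hypothetical, deliberately buggy privilege-dropping sequence — the bug class this hunk defends against:

        #include <unistd.h>

        /* BUG (illustrative, not from the kernel tree): if setuid() fails with
         * EAGAIN because the target uid is over RLIMIT_NPROC and the return
         * value is ignored, execve() would otherwise run the new program with
         * the old, privileged uid. */
        static void drop_and_exec(uid_t uid, char *const argv[], char *const envp[])
        {
                setuid(uid);                    /* return value ignored */
                execve(argv[0], argv, envp);    /* kernel now rechecks NPROC here */
        }

As the comment above describes, the set*uid() path only marks the task with PF_NPROC_EXCEEDED, and execve() itself fails with -EAGAIN while the limit is still exceeded.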
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855a6c44..352ba149d23e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@ | |||
13 | # | 13 | # |
14 | 14 | ||
15 | # ore module library | 15 | # ore module library |
16 | obj-$(CONFIG_ORE) += ore.o | 16 | libore-y := ore.o ore_raid.o |
17 | obj-$(CONFIG_ORE) += libore.o | ||
17 | 18 | ||
18 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o | 19 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o |
19 | obj-$(CONFIG_EXOFS_FS) += exofs.o | 20 | obj-$(CONFIG_EXOFS_FS) += exofs.o |
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae4149291..fa9a286c8771 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@ | |||
1 | # Note ORE needs to "select ASYNC_XOR". So as not to force multiple selects | ||
2 | # for every ORE user we do it like this. Any user should add itself here | ||
3 | # at the "depends on EXOFS_FS || ..." with an ||. The dependencies are | ||
4 | # selected here, and we default to "ON". So in effect it is like being | ||
5 | # selected by any of the users. | ||
1 | config ORE | 6 | config ORE |
2 | tristate | 7 | tristate |
8 | depends on EXOFS_FS | ||
9 | select ASYNC_XOR | ||
10 | default SCSI_OSD_ULD | ||
3 | 11 | ||
4 | config EXOFS_FS | 12 | config EXOFS_FS |
5 | tristate "exofs: OSD based file system support" | 13 | tristate "exofs: OSD based file system support" |
6 | depends on SCSI_OSD_ULD | 14 | depends on SCSI_OSD_ULD |
7 | select ORE | ||
8 | help | 15 | help |
9 | EXOFS is a file system that uses an OSD storage device, | 16 | EXOFS is a file system that uses an OSD storage device, |
10 | as its backing storage. | 17 | as its backing storage. |
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442ec7445..51f4b4c40f09 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@ | |||
53 | /* u64 has problems with printk this will cast it to unsigned long long */ | 53 | /* u64 has problems with printk this will cast it to unsigned long long */ |
54 | #define _LLU(x) (unsigned long long)(x) | 54 | #define _LLU(x) (unsigned long long)(x) |
55 | 55 | ||
56 | struct exofs_dev { | ||
57 | struct ore_dev ored; | ||
58 | unsigned did; | ||
59 | }; | ||
56 | /* | 60 | /* |
57 | * our extension to the in-memory superblock | 61 | * our extension to the in-memory superblock |
58 | */ | 62 | */ |
@@ -66,13 +70,9 @@ struct exofs_sb_info { | |||
66 | u32 s_next_generation; /* next gen # to use */ | 70 | u32 s_next_generation; /* next gen # to use */ |
67 | atomic_t s_curr_pending; /* number of pending commands */ | 71 | atomic_t s_curr_pending; /* number of pending commands */ |
68 | 72 | ||
69 | struct pnfs_osd_data_map data_map; /* Default raid to use | ||
70 | * FIXME: Needed ? | ||
71 | */ | ||
72 | struct ore_layout layout; /* Default files layout */ | 73 | struct ore_layout layout; /* Default files layout */ |
73 | struct ore_comp one_comp; /* id & cred of partition id=0*/ | 74 | struct ore_comp one_comp; /* id & cred of partition id=0*/ |
74 | struct ore_components comps; /* comps for the partition */ | 75 | struct ore_components oc; /* comps for the partition */ |
75 | struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ | ||
76 | }; | 76 | }; |
77 | 77 | ||
78 | /* | 78 | /* |
@@ -86,7 +86,7 @@ struct exofs_i_info { | |||
86 | uint32_t i_dir_start_lookup; /* which page to start lookup */ | 86 | uint32_t i_dir_start_lookup; /* which page to start lookup */ |
87 | uint64_t i_commit_size; /* the object's written length */ | 87 | uint64_t i_commit_size; /* the object's written length */ |
88 | struct ore_comp one_comp; /* same component for all devices */ | 88 | struct ore_comp one_comp; /* same component for all devices */ |
89 | struct ore_components comps; /* inode view of the device table */ | 89 | struct ore_components oc; /* inode view of the device table */ |
90 | }; | 90 | }; |
91 | 91 | ||
92 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) | 92 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) |
@@ -207,7 +207,7 @@ extern const struct inode_operations exofs_fast_symlink_inode_operations; | |||
207 | * bigger and that the device table repeats twice. | 207 | * bigger and that the device table repeats twice. |
208 | * See: exofs_read_lookup_dev_table() | 208 | * See: exofs_read_lookup_dev_table() |
209 | */ | 209 | */ |
210 | static inline void exofs_init_comps(struct ore_components *comps, | 210 | static inline void exofs_init_comps(struct ore_components *oc, |
211 | struct ore_comp *one_comp, | 211 | struct ore_comp *one_comp, |
212 | struct exofs_sb_info *sbi, osd_id oid) | 212 | struct exofs_sb_info *sbi, osd_id oid) |
213 | { | 213 | { |
@@ -217,13 +217,15 @@ static inline void exofs_init_comps(struct ore_components *comps, | |||
217 | one_comp->obj.id = oid; | 217 | one_comp->obj.id = oid; |
218 | exofs_make_credential(one_comp->cred, &one_comp->obj); | 218 | exofs_make_credential(one_comp->cred, &one_comp->obj); |
219 | 219 | ||
220 | comps->numdevs = sbi->comps.numdevs; | 220 | oc->first_dev = 0; |
221 | comps->single_comp = EC_SINGLE_COMP; | 221 | oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * |
222 | comps->comps = one_comp; | 222 | sbi->layout.group_count; |
223 | oc->single_comp = EC_SINGLE_COMP; | ||
224 | oc->comps = one_comp; | ||
223 | 225 | ||
224 | /* Round robin device view of the table */ | 226 | /* Round robin device view of the table */ |
225 | first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; | 227 | first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; |
226 | comps->ods = sbi->comps.ods + first_dev; | 228 | oc->ods = &sbi->oc.ods[first_dev]; |
227 | } | 229 | } |
228 | 230 | ||
229 | #endif | 231 | #endif |
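The round-robin device view computed at the end of exofs_init_comps() is easier to see with numbers. A worked userspace example under assumed values (6 devices, 2-way mirroring):

        #include <stdio.h>

        int main(void)
        {
                unsigned mirrors_p1 = 2, numdevs = 6, dev_mod;

                /* dev_mod 0,1,2 start at devices 0,2,4; dev_mod 3 wraps to 0 */
                for (dev_mod = 0; dev_mod < 4; dev_mod++)
                        printf("dev_mod=%u -> first_dev=%u\n", dev_mod,
                               (dev_mod * mirrors_p1) % numdevs);
                return 0;
        }

Each object's component table thus starts at a different mirror pair, spreading objects evenly across the device table.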
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38fc2349..3e5f3a6be90a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@ | |||
37 | 37 | ||
38 | #define EXOFS_DBGMSG2(M...) do {} while (0) | 38 | #define EXOFS_DBGMSG2(M...) do {} while (0) |
39 | 39 | ||
40 | enum { BIO_MAX_PAGES_KMALLOC = | 40 | enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; |
41 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | ||
42 | MAX_PAGES_KMALLOC = | ||
43 | PAGE_SIZE / sizeof(struct page *), | ||
44 | }; | ||
45 | 41 | ||
46 | unsigned exofs_max_io_pages(struct ore_layout *layout, | 42 | unsigned exofs_max_io_pages(struct ore_layout *layout, |
47 | unsigned expected_pages) | 43 | unsigned expected_pages) |
@@ -49,8 +45,7 @@ unsigned exofs_max_io_pages(struct ore_layout *layout, | |||
49 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); | 45 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); |
50 | 46 | ||
51 | /* TODO: easily support bio chaining */ | 47 | /* TODO: easily support bio chaining */ |
52 | pages = min_t(unsigned, pages, | 48 | pages = min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE); |
53 | layout->group_width * BIO_MAX_PAGES_KMALLOC); | ||
54 | return pages; | 49 | return pages; |
55 | } | 50 | } |
56 | 51 | ||
@@ -68,6 +63,7 @@ struct page_collect { | |||
68 | bool read_4_write; /* This means two things: that the read is sync | 63 | bool read_4_write; /* This means two things: that the read is sync |
69 | * And the pages should not be unlocked. | 64 | * And the pages should not be unlocked. |
70 | */ | 65 | */ |
66 | struct page *that_locked_page; | ||
71 | }; | 67 | }; |
72 | 68 | ||
73 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | 69 | static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, |
@@ -86,6 +82,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, | |||
86 | pcol->length = 0; | 82 | pcol->length = 0; |
87 | pcol->pg_first = -1; | 83 | pcol->pg_first = -1; |
88 | pcol->read_4_write = false; | 84 | pcol->read_4_write = false; |
85 | pcol->that_locked_page = NULL; | ||
89 | } | 86 | } |
90 | 87 | ||
91 | static void _pcol_reset(struct page_collect *pcol) | 88 | static void _pcol_reset(struct page_collect *pcol) |
@@ -98,6 +95,7 @@ static void _pcol_reset(struct page_collect *pcol) | |||
98 | pcol->length = 0; | 95 | pcol->length = 0; |
99 | pcol->pg_first = -1; | 96 | pcol->pg_first = -1; |
100 | pcol->ios = NULL; | 97 | pcol->ios = NULL; |
98 | pcol->that_locked_page = NULL; | ||
101 | 99 | ||
102 | /* this is probably the end of the loop but in writes | 100 | /* this is probably the end of the loop but in writes |
103 | * it might not end here. don't be left with nothing | 101 | * it might not end here. don't be left with nothing |
@@ -149,14 +147,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, | |||
149 | return 0; | 147 | return 0; |
150 | } | 148 | } |
151 | 149 | ||
150 | enum {PAGE_WAS_NOT_IN_IO = 17}; | ||
152 | static int update_read_page(struct page *page, int ret) | 151 | static int update_read_page(struct page *page, int ret) |
153 | { | 152 | { |
154 | if (ret == 0) { | 153 | switch (ret) { |
154 | case 0: | ||
155 | /* Everything is OK */ | 155 | /* Everything is OK */ |
156 | SetPageUptodate(page); | 156 | SetPageUptodate(page); |
157 | if (PageError(page)) | 157 | if (PageError(page)) |
158 | ClearPageError(page); | 158 | ClearPageError(page); |
159 | } else if (ret == -EFAULT) { | 159 | break; |
160 | case -EFAULT: | ||
160 | /* In this case we were trying to read something that wasn't on | 161 | /* In this case we were trying to read something that wasn't on |
161 | * disk yet - return a page full of zeroes. This should be OK, | 162 | * disk yet - return a page full of zeroes. This should be OK, |
162 | * because the object should be empty (if there was a write | 163 | * because the object should be empty (if there was a write |
@@ -167,16 +168,22 @@ static int update_read_page(struct page *page, int ret) | |||
167 | SetPageUptodate(page); | 168 | SetPageUptodate(page); |
168 | if (PageError(page)) | 169 | if (PageError(page)) |
169 | ClearPageError(page); | 170 | ClearPageError(page); |
170 | ret = 0; /* recovered error */ | ||
171 | EXOFS_DBGMSG("recovered read error\n"); | 171 | EXOFS_DBGMSG("recovered read error\n"); |
172 | } else /* Error */ | 172 | /* fall through */ |
173 | case PAGE_WAS_NOT_IN_IO: | ||
174 | ret = 0; /* recovered error */ | ||
175 | break; | ||
176 | default: | ||
173 | SetPageError(page); | 177 | SetPageError(page); |
174 | 178 | } | |
175 | return ret; | 179 | return ret; |
176 | } | 180 | } |
177 | 181 | ||
178 | static void update_write_page(struct page *page, int ret) | 182 | static void update_write_page(struct page *page, int ret) |
179 | { | 183 | { |
184 | if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) | ||
185 | return; /* don't pass start don't collect $200 */ | ||
186 | |||
180 | if (ret) { | 187 | if (ret) { |
181 | mapping_set_error(page->mapping, ret); | 188 | mapping_set_error(page->mapping, ret); |
182 | SetPageError(page); | 189 | SetPageError(page); |
@@ -190,15 +197,16 @@ static void update_write_page(struct page *page, int ret) | |||
190 | static int __readpages_done(struct page_collect *pcol) | 197 | static int __readpages_done(struct page_collect *pcol) |
191 | { | 198 | { |
192 | int i; | 199 | int i; |
193 | u64 resid; | ||
194 | u64 good_bytes; | 200 | u64 good_bytes; |
195 | u64 length = 0; | 201 | u64 length = 0; |
196 | int ret = ore_check_io(pcol->ios, &resid); | 202 | int ret = ore_check_io(pcol->ios, NULL); |
197 | 203 | ||
198 | if (likely(!ret)) | 204 | if (likely(!ret)) { |
199 | good_bytes = pcol->length; | 205 | good_bytes = pcol->length; |
200 | else | 206 | ret = PAGE_WAS_NOT_IN_IO; |
201 | good_bytes = pcol->length - resid; | 207 | } else { |
208 | good_bytes = 0; | ||
209 | } | ||
202 | 210 | ||
203 | EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" | 211 | EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" |
204 | " length=0x%lx nr_pages=%u\n", | 212 | " length=0x%lx nr_pages=%u\n", |
@@ -259,6 +267,46 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | |||
259 | } | 267 | } |
260 | } | 268 | } |
261 | 269 | ||
270 | static int _maybe_not_all_in_one_io(struct ore_io_state *ios, | ||
271 | struct page_collect *pcol_src, struct page_collect *pcol) | ||
272 | { | ||
273 | /* length was wrong or offset was not page aligned */ | ||
274 | BUG_ON(pcol_src->nr_pages < ios->nr_pages); | ||
275 | |||
276 | if (pcol_src->nr_pages > ios->nr_pages) { | ||
277 | struct page **src_page; | ||
278 | unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; | ||
279 | unsigned long len_less = pcol_src->length - ios->length; | ||
280 | unsigned i; | ||
281 | int ret; | ||
282 | |||
283 | /* This IO was trimmed */ | ||
284 | pcol_src->nr_pages = ios->nr_pages; | ||
285 | pcol_src->length = ios->length; | ||
286 | |||
287 | /* Left over pages are passed to the next io */ | ||
288 | pcol->expected_pages += pages_less; | ||
289 | pcol->nr_pages = pages_less; | ||
290 | pcol->length = len_less; | ||
291 | src_page = pcol_src->pages + pcol_src->nr_pages; | ||
292 | pcol->pg_first = (*src_page)->index; | ||
293 | |||
294 | ret = pcol_try_alloc(pcol); | ||
295 | if (unlikely(ret)) | ||
296 | return ret; | ||
297 | |||
298 | for (i = 0; i < pages_less; ++i) | ||
299 | pcol->pages[i] = *src_page++; | ||
300 | |||
301 | EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " | ||
302 | "pages_less=0x%x expected_pages=0x%x " | ||
303 | "next_offset=0x%llx next_len=0x%lx\n", | ||
304 | pcol_src->nr_pages, pages_less, pcol->expected_pages, | ||
305 | pcol->pg_first * PAGE_SIZE, pcol->length); | ||
306 | } | ||
307 | return 0; | ||
308 | } | ||
309 | |||
262 | static int read_exec(struct page_collect *pcol) | 310 | static int read_exec(struct page_collect *pcol) |
263 | { | 311 | { |
264 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 312 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
@@ -270,7 +318,7 @@ static int read_exec(struct page_collect *pcol) | |||
270 | return 0; | 318 | return 0; |
271 | 319 | ||
272 | if (!pcol->ios) { | 320 | if (!pcol->ios) { |
273 | int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, | 321 | int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, |
274 | pcol->pg_first << PAGE_CACHE_SHIFT, | 322 | pcol->pg_first << PAGE_CACHE_SHIFT, |
275 | pcol->length, &pcol->ios); | 323 | pcol->length, &pcol->ios); |
276 | 324 | ||
@@ -280,7 +328,6 @@ static int read_exec(struct page_collect *pcol) | |||
280 | 328 | ||
281 | ios = pcol->ios; | 329 | ios = pcol->ios; |
282 | ios->pages = pcol->pages; | 330 | ios->pages = pcol->pages; |
283 | ios->nr_pages = pcol->nr_pages; | ||
284 | 331 | ||
285 | if (pcol->read_4_write) { | 332 | if (pcol->read_4_write) { |
286 | ore_read(pcol->ios); | 333 | ore_read(pcol->ios); |
@@ -296,17 +343,23 @@ static int read_exec(struct page_collect *pcol) | |||
296 | *pcol_copy = *pcol; | 343 | *pcol_copy = *pcol; |
297 | ios->done = readpages_done; | 344 | ios->done = readpages_done; |
298 | ios->private = pcol_copy; | 345 | ios->private = pcol_copy; |
346 | |||
347 | /* pages ownership was passed to pcol_copy */ | ||
348 | _pcol_reset(pcol); | ||
349 | |||
350 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
351 | if (unlikely(ret)) | ||
352 | goto err; | ||
353 | |||
354 | EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
355 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
356 | |||
299 | ret = ore_read(ios); | 357 | ret = ore_read(ios); |
300 | if (unlikely(ret)) | 358 | if (unlikely(ret)) |
301 | goto err; | 359 | goto err; |
302 | 360 | ||
303 | atomic_inc(&pcol->sbi->s_curr_pending); | 361 | atomic_inc(&pcol->sbi->s_curr_pending); |
304 | 362 | ||
305 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", | ||
306 | oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); | ||
307 | |||
308 | /* pages ownership was passed to pcol_copy */ | ||
309 | _pcol_reset(pcol); | ||
310 | return 0; | 363 | return 0; |
311 | 364 | ||
312 | err: | 365 | err: |
@@ -341,6 +394,8 @@ static int readpage_strip(void *data, struct page *page) | |||
341 | EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, | 394 | EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, |
342 | page->index); | 395 | page->index); |
343 | 396 | ||
397 | pcol->that_locked_page = page; | ||
398 | |||
344 | if (page->index < end_index) | 399 | if (page->index < end_index) |
345 | len = PAGE_CACHE_SIZE; | 400 | len = PAGE_CACHE_SIZE; |
346 | else if (page->index == end_index) | 401 | else if (page->index == end_index) |
@@ -429,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, | |||
429 | return ret; | 484 | return ret; |
430 | } | 485 | } |
431 | 486 | ||
487 | ret = read_exec(&pcol); | ||
488 | if (unlikely(ret)) | ||
489 | return ret; | ||
490 | |||
432 | return read_exec(&pcol); | 491 | return read_exec(&pcol); |
433 | } | 492 | } |
434 | 493 | ||
@@ -462,17 +521,18 @@ static void writepages_done(struct ore_io_state *ios, void *p) | |||
462 | { | 521 | { |
463 | struct page_collect *pcol = p; | 522 | struct page_collect *pcol = p; |
464 | int i; | 523 | int i; |
465 | u64 resid; | ||
466 | u64 good_bytes; | 524 | u64 good_bytes; |
467 | u64 length = 0; | 525 | u64 length = 0; |
468 | int ret = ore_check_io(ios, &resid); | 526 | int ret = ore_check_io(ios, NULL); |
469 | 527 | ||
470 | atomic_dec(&pcol->sbi->s_curr_pending); | 528 | atomic_dec(&pcol->sbi->s_curr_pending); |
471 | 529 | ||
472 | if (likely(!ret)) | 530 | if (likely(!ret)) { |
473 | good_bytes = pcol->length; | 531 | good_bytes = pcol->length; |
474 | else | 532 | ret = PAGE_WAS_NOT_IN_IO; |
475 | good_bytes = pcol->length - resid; | 533 | } else { |
534 | good_bytes = 0; | ||
535 | } | ||
476 | 536 | ||
477 | EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" | 537 | EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" |
478 | " length=0x%lx nr_pages=%u\n", | 538 | " length=0x%lx nr_pages=%u\n", |
@@ -505,6 +565,56 @@ static void writepages_done(struct ore_io_state *ios, void *p) | |||
505 | EXOFS_DBGMSG2("writepages_done END\n"); | 565 | EXOFS_DBGMSG2("writepages_done END\n"); |
506 | } | 566 | } |
507 | 567 | ||
568 | static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) | ||
569 | { | ||
570 | struct page_collect *pcol = priv; | ||
571 | pgoff_t index = offset / PAGE_SIZE; | ||
572 | |||
573 | if (!pcol->that_locked_page || | ||
574 | (pcol->that_locked_page->index != index)) { | ||
575 | struct page *page = find_get_page(pcol->inode->i_mapping, index); | ||
576 | |||
577 | if (!page) { | ||
578 | page = find_or_create_page(pcol->inode->i_mapping, | ||
579 | index, GFP_NOFS); | ||
580 | if (unlikely(!page)) { | ||
581 | EXOFS_DBGMSG("grab_cache_page Failed " | ||
582 | "index=0x%llx\n", _LLU(index)); | ||
583 | return NULL; | ||
584 | } | ||
585 | unlock_page(page); | ||
586 | } | ||
587 | if (PageDirty(page) || PageWriteback(page)) | ||
588 | *uptodate = true; | ||
589 | else | ||
590 | *uptodate = PageUptodate(page); | ||
591 | EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); | ||
592 | return page; | ||
593 | } else { | ||
594 | EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", | ||
595 | pcol->that_locked_page->index); | ||
596 | *uptodate = true; | ||
597 | return pcol->that_locked_page; | ||
598 | } | ||
599 | } | ||
600 | |||
601 | static void __r4w_put_page(void *priv, struct page *page) | ||
602 | { | ||
603 | struct page_collect *pcol = priv; | ||
604 | |||
605 | if (pcol->that_locked_page != page) { | ||
606 | EXOFS_DBGMSG("index=0x%lx\n", page->index); | ||
607 | page_cache_release(page); | ||
608 | return; | ||
609 | } | ||
610 | EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index); | ||
611 | } | ||
612 | |||
613 | static const struct _ore_r4w_op _r4w_op = { | ||
614 | .get_page = &__r4w_get_page, | ||
615 | .put_page = &__r4w_put_page, | ||
616 | }; | ||
617 | |||
508 | static int write_exec(struct page_collect *pcol) | 618 | static int write_exec(struct page_collect *pcol) |
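The _r4w_op pair above lets ORE borrow page-cache pages while doing read-modify-write of partial stripes. How ORE invokes the pair is not part of this diff; a hedged sketch of the expected calling convention, derived only from the signatures above and the ios->r4w/ios->private assignments in write_exec():

        /* hypothetical consumer-side use; ORE internals are not shown here */
        static void r4w_peek(struct ore_io_state *ios, u64 offset)
        {
                bool uptodate;
                struct page *page;

                page = ios->r4w->get_page(ios->private, offset, &uptodate);
                if (!page)
                        return;
                /* if (!uptodate), the page must first be read from disk so the
                 * partial-stripe write does not clobber valid data */
                ios->r4w->put_page(ios->private, page);
        }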
509 | { | 619 | { |
510 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 620 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
@@ -516,10 +626,9 @@ static int write_exec(struct page_collect *pcol) | |||
516 | return 0; | 626 | return 0; |
517 | 627 | ||
518 | BUG_ON(pcol->ios); | 628 | BUG_ON(pcol->ios); |
519 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, | 629 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, |
520 | pcol->pg_first << PAGE_CACHE_SHIFT, | 630 | pcol->pg_first << PAGE_CACHE_SHIFT, |
521 | pcol->length, &pcol->ios); | 631 | pcol->length, &pcol->ios); |
522 | |||
523 | if (unlikely(ret)) | 632 | if (unlikely(ret)) |
524 | goto err; | 633 | goto err; |
525 | 634 | ||
@@ -534,10 +643,20 @@ static int write_exec(struct page_collect *pcol) | |||
534 | 643 | ||
535 | ios = pcol->ios; | 644 | ios = pcol->ios; |
536 | ios->pages = pcol_copy->pages; | 645 | ios->pages = pcol_copy->pages; |
537 | ios->nr_pages = pcol_copy->nr_pages; | ||
538 | ios->done = writepages_done; | 646 | ios->done = writepages_done; |
647 | ios->r4w = &_r4w_op; | ||
539 | ios->private = pcol_copy; | 648 | ios->private = pcol_copy; |
540 | 649 | ||
650 | /* pages ownership was passed to pcol_copy */ | ||
651 | _pcol_reset(pcol); | ||
652 | |||
653 | ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); | ||
654 | if (unlikely(ret)) | ||
655 | goto err; | ||
656 | |||
657 | EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", | ||
658 | pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); | ||
659 | |||
541 | ret = ore_write(ios); | 660 | ret = ore_write(ios); |
542 | if (unlikely(ret)) { | 661 | if (unlikely(ret)) { |
543 | EXOFS_ERR("write_exec: ore_write() Failed\n"); | 662 | EXOFS_ERR("write_exec: ore_write() Failed\n"); |
@@ -545,11 +664,6 @@ static int write_exec(struct page_collect *pcol) | |||
545 | } | 664 | } |
546 | 665 | ||
547 | atomic_inc(&pcol->sbi->s_curr_pending); | 666 | atomic_inc(&pcol->sbi->s_curr_pending); |
548 | EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", | ||
549 | pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), | ||
550 | pcol->length); | ||
551 | /* pages ownership was passed to pcol_copy */ | ||
552 | _pcol_reset(pcol); | ||
553 | return 0; | 667 | return 0; |
554 | 668 | ||
555 | err: | 669 | err: |
@@ -689,14 +803,33 @@ static int exofs_writepages(struct address_space *mapping, | |||
689 | _pcol_init(&pcol, expected_pages, mapping->host); | 803 | _pcol_init(&pcol, expected_pages, mapping->host); |
690 | 804 | ||
691 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); | 805 | ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); |
692 | if (ret) { | 806 | if (unlikely(ret)) { |
693 | EXOFS_ERR("write_cache_pages => %d\n", ret); | 807 | EXOFS_ERR("write_cache_pages => %d\n", ret); |
694 | return ret; | 808 | return ret; |
695 | } | 809 | } |
696 | 810 | ||
697 | return write_exec(&pcol); | 811 | ret = write_exec(&pcol); |
812 | if (unlikely(ret)) | ||
813 | return ret; | ||
814 | |||
815 | if (wbc->sync_mode == WB_SYNC_ALL) { | ||
816 | return write_exec(&pcol); /* pump the last remainder */ | ||
817 | } else if (pcol.nr_pages) { | ||
818 | /* not SYNC; let the remainder join the next writeout */ | ||
819 | unsigned i; | ||
820 | |||
821 | for (i = 0; i < pcol.nr_pages; i++) { | ||
822 | struct page *page = pcol.pages[i]; | ||
823 | |||
824 | end_page_writeback(page); | ||
825 | set_page_dirty(page); | ||
826 | unlock_page(page); | ||
827 | } | ||
828 | } | ||
829 | return 0; | ||
698 | } | 830 | } |
699 | 831 | ||
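A note on the hunks below (the rationale is inferred, not stated in this diff): with RAID5 parity a single dirty page can no longer be written out in isolation, because the rest of its stripe would have to be read and the parity re-XORed, so exofs_writepage is commented out and the aops table routes all writeback through exofs_writepages:

	.writepage  = NULL,              /* single-page writeout unsupported */
	.writepages = exofs_writepages,  /* all writeback funnels here */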
832 | /* | ||
700 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) | 833 | static int exofs_writepage(struct page *page, struct writeback_control *wbc) |
701 | { | 834 | { |
702 | struct page_collect pcol; | 835 | struct page_collect pcol; |
@@ -712,7 +845,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) | |||
712 | 845 | ||
713 | return write_exec(&pcol); | 846 | return write_exec(&pcol); |
714 | } | 847 | } |
715 | 848 | */ | |
716 | /* i_mutex held using inode->i_size directly */ | 849 | /* i_mutex held using inode->i_size directly */ |
717 | static void _write_failed(struct inode *inode, loff_t to) | 850 | static void _write_failed(struct inode *inode, loff_t to) |
718 | { | 851 | { |
@@ -818,7 +951,7 @@ static void exofs_invalidatepage(struct page *page, unsigned long offset) | |||
818 | const struct address_space_operations exofs_aops = { | 951 | const struct address_space_operations exofs_aops = { |
819 | .readpage = exofs_readpage, | 952 | .readpage = exofs_readpage, |
820 | .readpages = exofs_readpages, | 953 | .readpages = exofs_readpages, |
821 | .writepage = exofs_writepage, | 954 | .writepage = NULL, |
822 | .writepages = exofs_writepages, | 955 | .writepages = exofs_writepages, |
823 | .write_begin = exofs_write_begin_export, | 956 | .write_begin = exofs_write_begin_export, |
824 | .write_end = exofs_write_end, | 957 | .write_end = exofs_write_end, |
@@ -860,7 +993,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) | |||
860 | 993 | ||
861 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 994 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
862 | 995 | ||
863 | ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); | 996 | ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); |
864 | if (likely(!ret)) | 997 | if (likely(!ret)) |
865 | truncate_setsize(inode, newsize); | 998 | truncate_setsize(inode, newsize); |
866 | 999 | ||
@@ -927,14 +1060,14 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
927 | struct exofs_on_disk_inode_layout *layout; | 1060 | struct exofs_on_disk_inode_layout *layout; |
928 | int ret; | 1061 | int ret; |
929 | 1062 | ||
930 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1063 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
931 | if (unlikely(ret)) { | 1064 | if (unlikely(ret)) { |
932 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 1065 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
933 | return ret; | 1066 | return ret; |
934 | } | 1067 | } |
935 | 1068 | ||
936 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); | 1069 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); |
937 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); | 1070 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); |
938 | 1071 | ||
939 | ios->in_attr = attrs; | 1072 | ios->in_attr = attrs; |
940 | ios->in_attr_len = ARRAY_SIZE(attrs); | 1073 | ios->in_attr_len = ARRAY_SIZE(attrs); |
@@ -1018,7 +1151,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
1018 | return inode; | 1151 | return inode; |
1019 | oi = exofs_i(inode); | 1152 | oi = exofs_i(inode); |
1020 | __oi_init(oi); | 1153 | __oi_init(oi); |
1021 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | 1154 | exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, |
1022 | exofs_oi_objno(oi)); | 1155 | exofs_oi_objno(oi)); |
1023 | 1156 | ||
1024 | /* read the inode from the osd */ | 1157 | /* read the inode from the osd */ |
@@ -1172,13 +1305,13 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1172 | spin_unlock(&sbi->s_next_gen_lock); | 1305 | spin_unlock(&sbi->s_next_gen_lock); |
1173 | insert_inode_hash(inode); | 1306 | insert_inode_hash(inode); |
1174 | 1307 | ||
1175 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | 1308 | exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, |
1176 | exofs_oi_objno(oi)); | 1309 | exofs_oi_objno(oi)); |
1177 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ | 1310 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ |
1178 | 1311 | ||
1179 | mark_inode_dirty(inode); | 1312 | mark_inode_dirty(inode); |
1180 | 1313 | ||
1181 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1314 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1182 | if (unlikely(ret)) { | 1315 | if (unlikely(ret)) { |
1183 | EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); | 1316 | EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); |
1184 | return ERR_PTR(ret); | 1317 | return ERR_PTR(ret); |
@@ -1267,7 +1400,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1267 | } else | 1400 | } else |
1268 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); | 1401 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); |
1269 | 1402 | ||
1270 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1403 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1271 | if (unlikely(ret)) { | 1404 | if (unlikely(ret)) { |
1272 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 1405 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
1273 | goto free_args; | 1406 | goto free_args; |
@@ -1350,7 +1483,7 @@ void exofs_evict_inode(struct inode *inode) | |||
1350 | /* ignore the error, attempt a remove anyway */ | 1483 | /* ignore the error, attempt a remove anyway */ |
1351 | 1484 | ||
1352 | /* Now Remove the OSD objects */ | 1485 | /* Now Remove the OSD objects */ |
1353 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); | 1486 | ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); |
1354 | if (unlikely(ret)) { | 1487 | if (unlikely(ret)) { |
1355 | EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); | 1488 | EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); |
1356 | return; | 1489 | return; |
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 25305af88198..fcfa86ae6faf 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -24,76 +24,287 @@ | |||
24 | 24 | ||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
27 | #include <linux/lcm.h> | ||
27 | 28 | ||
28 | #include <scsi/osd_ore.h> | 29 | #include "ore_raid.h" |
29 | 30 | ||
30 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | 31 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); |
32 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | ||
33 | MODULE_LICENSE("GPL"); | ||
34 | |||
35 | /* ore_verify_layout does a couple of things: | ||
36 | * 1. Given a minimum number of needed parameters, fixes up the rest of the | ||
37 | * members to be operational for the ore. The needed parameters are those | ||
38 | * that are defined by the pnfs-objects layout STD. | ||
39 | * 2. Checks that the current ore code actually supports these parameters, | ||
40 | * for example stripe_unit must be a multiple of the system PAGE_SIZE, | ||
41 | * and so on. | ||
42 | * 3. Caches some heavily used calculations that will be needed by users. | ||
43 | */ | ||
44 | |||
45 | enum { BIO_MAX_PAGES_KMALLOC = | ||
46 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; | ||
31 | 47 | ||
32 | #ifdef CONFIG_EXOFS_DEBUG | 48 | int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) |
33 | #define ORE_DBGMSG(fmt, a...) \ | 49 | { |
34 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | 50 | u64 stripe_length; |
35 | #else | 51 | |
36 | #define ORE_DBGMSG(fmt, a...) \ | 52 | switch (layout->raid_algorithm) { |
37 | do { if (0) printk(fmt, ##a); } while (0) | 53 | case PNFS_OSD_RAID_0: |
38 | #endif | 54 | layout->parity = 0; |
55 | break; | ||
56 | case PNFS_OSD_RAID_5: | ||
57 | layout->parity = 1; | ||
58 | break; | ||
59 | case PNFS_OSD_RAID_PQ: | ||
60 | case PNFS_OSD_RAID_4: | ||
61 | default: | ||
62 | ORE_ERR("Only RAID_0/5 for now\n"); | ||
63 | return -EINVAL; | ||
64 | } | ||
65 | if (0 != (layout->stripe_unit & ~PAGE_MASK)) { | ||
66 | ORE_ERR("Stripe Unit(0x%llx)" | ||
67 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
68 | _LLU(layout->stripe_unit), PAGE_SIZE); | ||
69 | return -EINVAL; | ||
70 | } | ||
71 | if (layout->group_width) { | ||
72 | if (!layout->group_depth) { | ||
73 | ORE_ERR("group_depth == 0 && group_width != 0\n"); | ||
74 | return -EINVAL; | ||
75 | } | ||
76 | if (total_comps < (layout->group_width * layout->mirrors_p1)) { | ||
77 | ORE_ERR("Data Map wrong, " | ||
78 | "numdevs=%d < group_width=%d * mirrors=%d\n", | ||
79 | total_comps, layout->group_width, | ||
80 | layout->mirrors_p1); | ||
81 | return -EINVAL; | ||
82 | } | ||
83 | layout->group_count = total_comps / layout->mirrors_p1 / | ||
84 | layout->group_width; | ||
85 | } else { | ||
86 | if (layout->group_depth) { | ||
87 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
88 | "group_width == 0 && group_depth == %lld\n", | ||
89 | _LLU(layout->group_depth)); | ||
90 | } | ||
91 | layout->group_width = total_comps / layout->mirrors_p1; | ||
92 | layout->group_depth = -1; | ||
93 | layout->group_count = 1; | ||
94 | } | ||
39 | 95 | ||
40 | /* u64 has problems with printk this will cast it to unsigned long long */ | 96 | stripe_length = (u64)layout->group_width * layout->stripe_unit; |
41 | #define _LLU(x) (unsigned long long)(x) | 97 | if (stripe_length >= (1ULL << 32)) { |
98 | ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", | ||
99 | _LLU(stripe_length)); | ||
100 | return -EINVAL; | ||
101 | } | ||
42 | 102 | ||
43 | #define ORE_DBGMSG2(M...) do {} while (0) | 103 | layout->max_io_length = |
44 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | 104 | (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * |
105 | layout->group_width; | ||
106 | if (layout->parity) { | ||
107 | unsigned stripe_length = | ||
108 | (layout->group_width - layout->parity) * | ||
109 | layout->stripe_unit; | ||
45 | 110 | ||
46 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); | 111 | layout->max_io_length /= stripe_length; |
47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); | 112 | layout->max_io_length *= stripe_length; |
48 | MODULE_LICENSE("GPL"); | 113 | } |
114 | return 0; | ||
115 | } | ||
116 | EXPORT_SYMBOL(ore_verify_layout); | ||
49 | 117 | ||
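To make the fix-ups concrete, a worked example with illustrative numbers (not taken from this patch): 8 components, mirrors_p1 = 1, RAID5, stripe_unit = 64K, group_width = 0 on input. ore_verify_layout() would derive:

	parity        = 1                      /* PNFS_OSD_RAID_5 */
	group_width   = 8 / 1 = 8              /* all comps, one group */
	group_depth   = -1, group_count = 1
	stripe_length = 8 * 64K = 512K         /* < 2^32, so accepted */

max_io_length is first sized from BIO_MAX_PAGES_KMALLOC and then, because parity is set, rounded down to a whole multiple of the 448K data stripe (7 data devices * 64K), so that RAID5 writes always end on a stripe boundary.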
50 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) | 118 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) |
51 | { | 119 | { |
52 | return ios->comps->comps[index & ios->comps->single_comp].cred; | 120 | return ios->oc->comps[index & ios->oc->single_comp].cred; |
53 | } | 121 | } |
54 | 122 | ||
55 | static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) | 123 | static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) |
56 | { | 124 | { |
57 | return &ios->comps->comps[index & ios->comps->single_comp].obj; | 125 | return &ios->oc->comps[index & ios->oc->single_comp].obj; |
58 | } | 126 | } |
59 | 127 | ||
60 | static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) | 128 | static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) |
61 | { | 129 | { |
62 | return ios->comps->ods[index]; | 130 | ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", |
131 | ios->oc->first_dev, ios->oc->numdevs, index, | ||
132 | ios->oc->ods); | ||
133 | |||
134 | return ore_comp_dev(ios->oc, index); | ||
63 | } | 135 | } |
64 | 136 | ||
65 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, | 137 | int _ore_get_io_state(struct ore_layout *layout, |
138 | struct ore_components *oc, unsigned numdevs, | ||
139 | unsigned sgs_per_dev, unsigned num_par_pages, | ||
140 | struct ore_io_state **pios) | ||
141 | { | ||
142 | struct ore_io_state *ios; | ||
143 | struct page **pages; | ||
144 | struct osd_sg_entry *sgilist; | ||
145 | struct __alloc_all_io_state { | ||
146 | struct ore_io_state ios; | ||
147 | struct ore_per_dev_state per_dev[numdevs]; | ||
148 | union { | ||
149 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
150 | struct page *pages[num_par_pages]; | ||
151 | }; | ||
152 | } *_aios; | ||
153 | |||
154 | if (likely(sizeof(*_aios) <= PAGE_SIZE)) { | ||
155 | _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); | ||
156 | if (unlikely(!_aios)) { | ||
157 | ORE_DBGMSG("Failed kzalloc bytes=%zd\n", | ||
158 | sizeof(*_aios)); | ||
159 | *pios = NULL; | ||
160 | return -ENOMEM; | ||
161 | } | ||
162 | pages = num_par_pages ? _aios->pages : NULL; | ||
163 | sgilist = sgs_per_dev ? _aios->sglist : NULL; | ||
164 | ios = &_aios->ios; | ||
165 | } else { | ||
166 | struct __alloc_small_io_state { | ||
167 | struct ore_io_state ios; | ||
168 | struct ore_per_dev_state per_dev[numdevs]; | ||
169 | } *_aio_small; | ||
170 | union __extra_part { | ||
171 | struct osd_sg_entry sglist[sgs_per_dev * numdevs]; | ||
172 | struct page *pages[num_par_pages]; | ||
173 | } *extra_part; | ||
174 | |||
175 | _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); | ||
176 | if (unlikely(!_aio_small)) { | ||
177 | ORE_DBGMSG("Failed alloc first part bytes=%zd\n", | ||
178 | sizeof(*_aio_small)); | ||
179 | *pios = NULL; | ||
180 | return -ENOMEM; | ||
181 | } | ||
182 | extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); | ||
183 | if (unlikely(!extra_part)) { | ||
184 | ORE_DBGMSG("Failed alloc second part bytes=%zd\n", | ||
185 | sizeof(*extra_part)); | ||
186 | kfree(_aio_small); | ||
187 | *pios = NULL; | ||
188 | return -ENOMEM; | ||
189 | } | ||
190 | |||
191 | pages = num_par_pages ? extra_part->pages : NULL; | ||
192 | sgilist = sgs_per_dev ? extra_part->sglist : NULL; | ||
193 | /* In this case the per_dev[0].sglist holds the pointer to | ||
194 | * be freed | ||
195 | */ | ||
196 | ios = &_aio_small->ios; | ||
197 | ios->extra_part_alloc = true; | ||
198 | } | ||
199 | |||
200 | if (pages) { | ||
201 | ios->parity_pages = pages; | ||
202 | ios->max_par_pages = num_par_pages; | ||
203 | } | ||
204 | if (sgilist) { | ||
205 | unsigned d; | ||
206 | |||
207 | for (d = 0; d < numdevs; ++d) { | ||
208 | ios->per_dev[d].sglist = sgilist; | ||
209 | sgilist += sgs_per_dev; | ||
210 | } | ||
211 | ios->sgs_per_dev = sgs_per_dev; | ||
212 | } | ||
213 | |||
214 | ios->layout = layout; | ||
215 | ios->oc = oc; | ||
216 | *pios = ios; | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | /* Allocate an io_state for only a single group of devices | ||
221 | * | ||
222 | * If a user needs to call ore_read/write() this version must be used because it | ||
223 | * allocates extra stuff for striping and raid. | ||
224 | * The ore might decide to do IO on less than @length bytes due to alignment | ||
225 | * and constraints as follows: | ||
226 | * - The IO cannot cross a group boundary. | ||
227 | * - In raid5/6 the end of the IO must align at the end of a stripe, e.g. | ||
228 | * (@offset + @length) % stripe_size == 0, or the complete range is within a | ||
229 | * single stripe. | ||
230 | * - Memory conditions may only permit a shorter IO. (A user can use @length=~0 | ||
231 | * and check the returned ios->length for max_io_size.) | ||
232 | * | ||
233 | * The caller must check the returned ios->length (and/or ios->nr_pages) and | ||
234 | * re-issue those pages that fall outside of ios->length. | ||
235 | */ | ||
236 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, | ||
66 | bool is_reading, u64 offset, u64 length, | 237 | bool is_reading, u64 offset, u64 length, |
67 | struct ore_io_state **pios) | 238 | struct ore_io_state **pios) |
68 | { | 239 | { |
69 | struct ore_io_state *ios; | 240 | struct ore_io_state *ios; |
241 | unsigned numdevs = layout->group_width * layout->mirrors_p1; | ||
242 | unsigned sgs_per_dev = 0, max_par_pages = 0; | ||
243 | int ret; | ||
70 | 244 | ||
71 | /*TODO: Maybe use kmem_cach per sbi of size | 245 | if (layout->parity && length) { |
72 | * exofs_io_state_size(layout->s_numdevs) | 246 | unsigned data_devs = layout->group_width - layout->parity; |
73 | */ | 247 | unsigned stripe_size = layout->stripe_unit * data_devs; |
74 | ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); | 248 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; |
75 | if (unlikely(!ios)) { | 249 | u32 remainder; |
76 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", | 250 | u64 num_stripes; |
77 | ore_io_state_size(comps->numdevs)); | 251 | u64 num_raid_units; |
78 | *pios = NULL; | 252 | |
79 | return -ENOMEM; | 253 | num_stripes = div_u64_rem(length, stripe_size, &remainder); |
254 | if (remainder) | ||
255 | ++num_stripes; | ||
256 | |||
257 | num_raid_units = num_stripes * layout->parity; | ||
258 | |||
259 | if (is_reading) { | ||
260 | /* For reads add per_dev sglist array */ | ||
261 | /* TODO: Raid 6 we need twice more. Actually: | ||
262 | * num_stripes / LCMdP(W,P); | ||
263 | * if (W%P != 0) num_stripes *= parity; | ||
264 | */ | ||
265 | |||
266 | /* first/last seg is split */ | ||
267 | num_raid_units += layout->group_width; | ||
268 | sgs_per_dev = div_u64(num_raid_units, data_devs); | ||
269 | } else { | ||
270 | /* For Writes add parity pages array. */ | ||
271 | max_par_pages = num_raid_units * pages_in_unit * | ||
272 | sizeof(struct page *); | ||
273 | } | ||
80 | } | 274 | } |
81 | 275 | ||
82 | ios->layout = layout; | 276 | ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, |
83 | ios->comps = comps; | 277 | pios); |
84 | ios->offset = offset; | 278 | if (unlikely(ret)) |
85 | ios->length = length; | 279 | return ret; |
280 | |||
281 | ios = *pios; | ||
86 | ios->reading = is_reading; | 282 | ios->reading = is_reading; |
283 | ios->offset = offset; | ||
284 | |||
285 | if (length) { | ||
286 | ore_calc_stripe_info(layout, offset, length, &ios->si); | ||
287 | ios->length = ios->si.length; | ||
288 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; | ||
289 | if (layout->parity) | ||
290 | _ore_post_alloc_raid_stuff(ios); | ||
291 | } | ||
87 | 292 | ||
88 | *pios = ios; | ||
89 | return 0; | 293 | return 0; |
90 | } | 294 | } |
91 | EXPORT_SYMBOL(ore_get_rw_state); | 295 | EXPORT_SYMBOL(ore_get_rw_state); |
92 | 296 | ||
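A hedged, worked sketch of the sizing above (numbers are illustrative only): RAID5, group_width = 8 so data_devs = 7, parity = 1, stripe_unit = 64K (pages_in_unit = 16 with 4K pages), length = 1M:

	stripe_size    = 7 * 64K = 448K
	num_stripes    = ceil(1M / 448K) = 3
	num_raid_units = 3 * parity = 3

	read:  num_raid_units += group_width => 11; sgs_per_dev = 11 / 7 = 1
	write: max_par_pages = 3 * 16 * sizeof(struct page *)

so a read reserves one extra scatter-gather entry per device and a write reserves room to point at the parity pages of every stripe the IO touches.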
93 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, | 297 | /* Allocate an io_state for all the devices in the comps array |
94 | struct ore_io_state **ios) | 298 | * |
299 | * This version of io_state allocation is used mostly by create/remove | ||
300 | * and trunc where we currently need all the devices. The only wasteful | ||
301 | * bit is the read/write_attributes with no IO. Those sites should | ||
302 | * be converted to use ore_get_rw_state() with length=0. | ||
303 | */ | ||
304 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, | ||
305 | struct ore_io_state **pios) | ||
95 | { | 306 | { |
96 | return ore_get_rw_state(layout, comps, true, 0, 0, ios); | 307 | return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); |
97 | } | 308 | } |
98 | EXPORT_SYMBOL(ore_get_io_state); | 309 | EXPORT_SYMBOL(ore_get_io_state); |
99 | 310 | ||
@@ -111,6 +322,7 @@ void ore_put_io_state(struct ore_io_state *ios) | |||
111 | bio_put(per_dev->bio); | 322 | bio_put(per_dev->bio); |
112 | } | 323 | } |
113 | 324 | ||
325 | _ore_free_raid_stuff(ios); | ||
114 | kfree(ios); | 326 | kfree(ios); |
115 | } | 327 | } |
116 | } | 328 | } |
@@ -138,7 +350,7 @@ static void _done_io(struct osd_request *or, void *p) | |||
138 | kref_put(&ios->kref, _last_io); | 350 | kref_put(&ios->kref, _last_io); |
139 | } | 351 | } |
140 | 352 | ||
141 | static int ore_io_execute(struct ore_io_state *ios) | 353 | int ore_io_execute(struct ore_io_state *ios) |
142 | { | 354 | { |
143 | DECLARE_COMPLETION_ONSTACK(wait); | 355 | DECLARE_COMPLETION_ONSTACK(wait); |
144 | bool sync = (ios->done == NULL); | 356 | bool sync = (ios->done == NULL); |
@@ -198,7 +410,7 @@ static void _clear_bio(struct bio *bio) | |||
198 | } | 410 | } |
199 | } | 411 | } |
200 | 412 | ||
201 | int ore_check_io(struct ore_io_state *ios, u64 *resid) | 413 | int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) |
202 | { | 414 | { |
203 | enum osd_err_priority acumulated_osd_err = 0; | 415 | enum osd_err_priority acumulated_osd_err = 0; |
204 | int acumulated_lin_err = 0; | 416 | int acumulated_lin_err = 0; |
@@ -206,7 +418,8 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) | |||
206 | 418 | ||
207 | for (i = 0; i < ios->numdevs; i++) { | 419 | for (i = 0; i < ios->numdevs; i++) { |
208 | struct osd_sense_info osi; | 420 | struct osd_sense_info osi; |
209 | struct osd_request *or = ios->per_dev[i].or; | 421 | struct ore_per_dev_state *per_dev = &ios->per_dev[i]; |
422 | struct osd_request *or = per_dev->or; | ||
210 | int ret; | 423 | int ret; |
211 | 424 | ||
212 | if (unlikely(!or)) | 425 | if (unlikely(!or)) |
@@ -218,29 +431,31 @@ int ore_check_io(struct ore_io_state *ios, u64 *resid) | |||
218 | 431 | ||
219 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | 432 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { |
220 | /* start read offset passed end of file */ | 433 | /* start read offset passed end of file */
221 | _clear_bio(ios->per_dev[i].bio); | 434 | _clear_bio(per_dev->bio); |
222 | ORE_DBGMSG("start read offset passed end of file " | 435 | ORE_DBGMSG("start read offset passed end of file " |
223 | "offset=0x%llx, length=0x%llx\n", | 436 | "offset=0x%llx, length=0x%llx\n", |
224 | _LLU(ios->per_dev[i].offset), | 437 | _LLU(per_dev->offset), |
225 | _LLU(ios->per_dev[i].length)); | 438 | _LLU(per_dev->length)); |
226 | 439 | ||
227 | continue; /* we recovered */ | 440 | continue; /* we recovered */ |
228 | } | 441 | } |
229 | 442 | ||
443 | if (on_dev_error) { | ||
444 | u64 residual = ios->reading ? | ||
445 | or->in.residual : or->out.residual; | ||
446 | u64 offset = (ios->offset + ios->length) - residual; | ||
447 | struct ore_dev *od = ios->oc->ods[ | ||
448 | per_dev->dev - ios->oc->first_dev]; | ||
449 | |||
450 | on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, | ||
451 | offset, residual); | ||
452 | } | ||
230 | if (osi.osd_err_pri >= acumulated_osd_err) { | 453 | if (osi.osd_err_pri >= acumulated_osd_err) { |
231 | acumulated_osd_err = osi.osd_err_pri; | 454 | acumulated_osd_err = osi.osd_err_pri; |
232 | acumulated_lin_err = ret; | 455 | acumulated_lin_err = ret; |
233 | } | 456 | } |
234 | } | 457 | } |
235 | 458 | ||
236 | /* TODO: raid specific residual calculations */ | ||
237 | if (resid) { | ||
238 | if (likely(!acumulated_lin_err)) | ||
239 | *resid = 0; | ||
240 | else | ||
241 | *resid = ios->length; | ||
242 | } | ||
243 | |||
244 | return acumulated_lin_err; | 459 | return acumulated_lin_err; |
245 | } | 460 | } |
246 | EXPORT_SYMBOL(ore_check_io); | 461 | EXPORT_SYMBOL(ore_check_io); |
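The residual out-parameter is gone; callers that care about partial failures now pass a callback. Its signature can be read off the call site above; a sketch of a handler an ore user might register (the name is hypothetical):

	static void my_on_dev_error(struct ore_io_state *ios, struct ore_dev *od,
				    unsigned dev_index, enum osd_err_priority pri,
				    u64 dev_offset, u64 dev_len)
	{
		/* e.g. record the failed byte range or mark @od suspect */
	}

	ret = ore_check_io(ios, my_on_dev_error);	/* or NULL to skip */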
@@ -248,61 +463,65 @@ EXPORT_SYMBOL(ore_check_io); | |||
248 | /* | 463 | /* |
249 | * L - logical offset into the file | 464 | * L - logical offset into the file |
250 | * | 465 | * |
251 | * U - The number of bytes in a stripe within a group | 466 | * D - number of Data devices |
467 | * D = group_width - parity | ||
252 | * | 468 | * |
253 | * U = stripe_unit * group_width | 469 | * U - The number of bytes in a stripe within a group |
470 | * U = stripe_unit * D | ||
254 | * | 471 | * |
255 | * T - The number of bytes striped within a group of component objects | 472 | * T - The number of bytes striped within a group of component objects |
256 | * (before advancing to the next group) | 473 | * (before advancing to the next group) |
257 | * | 474 | * T = U * group_depth |
258 | * T = stripe_unit * group_width * group_depth | ||
259 | * | 475 | * |
260 | * S - The number of bytes striped across all component objects | 476 | * S - The number of bytes striped across all component objects |
261 | * before the pattern repeats | 477 | * before the pattern repeats |
478 | * S = T * group_count | ||
262 | * | 479 | * |
263 | * S = stripe_unit * group_width * group_depth * group_count | 480 | * M - The "major" (i.e., across all components) cycle number |
264 | * | ||
265 | * M - The "major" (i.e., across all components) stripe number | ||
266 | * | ||
267 | * M = L / S | 481 | * M = L / S |
268 | * | 482 | * |
269 | * G - Counts the groups from the beginning of the major stripe | 483 | * G - Counts the groups from the beginning of the major cycle |
270 | * | ||
271 | * G = (L - (M * S)) / T [or (L % S) / T] | 484 | * G = (L - (M * S)) / T [or (L % S) / T] |
272 | * | 485 | * |
273 | * H - The byte offset within the group | 486 | * H - The byte offset within the group |
274 | * | ||
275 | * H = (L - (M * S)) % T [or (L % S) % T] | 487 | * H = (L - (M * S)) % T [or (L % S) % T] |
276 | * | 488 | * |
277 | * N - The "minor" (i.e., across the group) stripe number | 489 | * N - The "minor" (i.e., across the group) stripe number |
278 | * | ||
279 | * N = H / U | 490 | * N = H / U |
280 | * | 491 | * |
281 | * C - The component index corresponding to L | 492 | * C - The component index corresponding to L |
282 | * | 493 | * |
283 | * C = (H - (N * U)) / stripe_unit + G * group_width | 494 | * C = (H - (N * U)) / stripe_unit + G * D |
284 | * [or (L % U) / stripe_unit + G * group_width] | 495 | * [or (L % U) / stripe_unit + G * D] |
285 | * | 496 | * |
286 | * O - The component offset corresponding to L | 497 | * O - The component offset corresponding to L |
287 | * | ||
288 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit | 498 | * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit |
499 | * | ||
500 | * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity | ||
501 | * divide by parity | ||
502 | * LCMdP = lcm(group_width, parity) / parity | ||
503 | * | ||
504 | * R - The parity Rotation stripe | ||
505 | * (Note parity cycle always starts at a group's boundary) | ||
506 | * R = N % LCMdP | ||
507 | * | ||
508 | * I = the first parity device index | ||
509 | * I = (group_width + group_width - R*parity - parity) % group_width | ||
510 | * | ||
511 | * Craid - The component index Rotated | ||
512 | * Craid = (group_width + C - R*parity) % group_width | ||
513 | * (We add the group_width to avoid negative numbers modulo math) | ||
289 | */ | 514 | */ |
290 | struct _striping_info { | 515 | void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, |
291 | u64 obj_offset; | 516 | u64 length, struct ore_striping_info *si) |
292 | u64 group_length; | ||
293 | u64 M; /* for truncate */ | ||
294 | unsigned dev; | ||
295 | unsigned unit_off; | ||
296 | }; | ||
297 | |||
298 | static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, | ||
299 | struct _striping_info *si) | ||
300 | { | 517 | { |
301 | u32 stripe_unit = layout->stripe_unit; | 518 | u32 stripe_unit = layout->stripe_unit; |
302 | u32 group_width = layout->group_width; | 519 | u32 group_width = layout->group_width; |
303 | u64 group_depth = layout->group_depth; | 520 | u64 group_depth = layout->group_depth; |
521 | u32 parity = layout->parity; | ||
304 | 522 | ||
305 | u32 U = stripe_unit * group_width; | 523 | u32 D = group_width - parity; |
524 | u32 U = D * stripe_unit; | ||
306 | u64 T = U * group_depth; | 525 | u64 T = U * group_depth; |
307 | u64 S = T * layout->group_count; | 526 | u64 S = T * layout->group_count; |
308 | u64 M = div64_u64(file_offset, S); | 527 | u64 M = div64_u64(file_offset, S); |
@@ -318,39 +537,65 @@ static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, | |||
318 | u32 N = div_u64(H, U); | 537 | u32 N = div_u64(H, U); |
319 | 538 | ||
320 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 539 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
321 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 540 | u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
322 | si->dev *= layout->mirrors_p1; | ||
323 | 541 | ||
324 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | 542 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); |
325 | 543 | ||
326 | si->obj_offset = si->unit_off + (N * stripe_unit) + | 544 | si->obj_offset = si->unit_off + (N * stripe_unit) + |
327 | (M * group_depth * stripe_unit); | 545 | (M * group_depth * stripe_unit); |
328 | 546 | ||
329 | si->group_length = T - H; | 547 | if (parity) { |
548 | u32 LCMdP = lcm(group_width, parity) / parity; | ||
549 | /* R = N % LCMdP; */ | ||
550 | u32 RxP = (N % LCMdP) * parity; | ||
551 | u32 first_dev = C - C % group_width; | ||
552 | |||
553 | si->par_dev = (group_width + group_width - parity - RxP) % | ||
554 | group_width + first_dev; | ||
555 | si->dev = (group_width + C - RxP) % group_width + first_dev; | ||
556 | si->bytes_in_stripe = U; | ||
557 | si->first_stripe_start = M * S + G * T + N * U; | ||
558 | } else { | ||
559 | /* Make the math correct; see _prepare_one_group */ | ||
560 | si->par_dev = group_width; | ||
561 | si->dev = C; | ||
562 | } | ||
563 | |||
564 | si->dev *= layout->mirrors_p1; | ||
565 | si->par_dev *= layout->mirrors_p1; | ||
566 | si->offset = file_offset; | ||
567 | si->length = T - H; | ||
568 | if (si->length > length) | ||
569 | si->length = length; | ||
330 | si->M = M; | 570 | si->M = M; |
331 | } | 571 | } |
572 | EXPORT_SYMBOL(ore_calc_stripe_info); | ||
332 | 573 | ||
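A worked pass through the math above (illustrative numbers only, not from the patch): stripe_unit = 64K, group_width = 4, parity = 1 (so D = 3), group_depth = 16, group_count = 2, mirrors_p1 = 1, which gives U = 192K, T = 3M, S = 6M. For L = 3M + 200K:

	M = L / S           = 0
	G = (L % S) / T     = 1        /* second group */
	H = (L % S) % T     = 200K
	N = H / U           = 1        /* second stripe in the group */
	C = (H - N*U) / 64K + G*4 = 0 + 4
	unit_off = 8K, obj_offset = 8K + 1*64K + 0 = 72K

With parity the devices are then rotated: LCMdP = lcm(4, 1) / 1 = 4, R = N % 4 = 1, so par_dev = (4 + 4 - 1 - 1) % 4 + 4 = 6 and dev = (4 + 4 - 1) % 4 + 4 = 7, with bytes_in_stripe = U = 192K. (Note the code computes C with G * group_width, as shown above, even though the comment writes G * D.)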
333 | static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | 574 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, |
334 | unsigned pgbase, struct ore_per_dev_state *per_dev, | 575 | unsigned pgbase, struct page **pages, |
335 | int cur_len) | 576 | struct ore_per_dev_state *per_dev, int cur_len) |
336 | { | 577 | { |
337 | unsigned pg = *cur_pg; | 578 | unsigned pg = *cur_pg; |
338 | struct request_queue *q = | 579 | struct request_queue *q = |
339 | osd_request_queue(_ios_od(ios, per_dev->dev)); | 580 | osd_request_queue(_ios_od(ios, per_dev->dev)); |
340 | 581 | unsigned len = cur_len; | |
341 | per_dev->length += cur_len; | 582 | int ret; |
342 | 583 | ||
343 | if (per_dev->bio == NULL) { | 584 | if (per_dev->bio == NULL) { |
344 | unsigned pages_in_stripe = ios->layout->group_width * | 585 | unsigned pages_in_stripe = ios->layout->group_width * |
345 | (ios->layout->stripe_unit / PAGE_SIZE); | 586 | (ios->layout->stripe_unit / PAGE_SIZE); |
346 | unsigned bio_size = (ios->nr_pages + pages_in_stripe) / | 587 | unsigned nr_pages = ios->nr_pages * ios->layout->group_width / |
347 | ios->layout->group_width; | 588 | (ios->layout->group_width - |
589 | ios->layout->parity); | ||
590 | unsigned bio_size = (nr_pages + pages_in_stripe) / | ||
591 | ios->layout->group_width; | ||
348 | 592 | ||
349 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 593 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
350 | if (unlikely(!per_dev->bio)) { | 594 | if (unlikely(!per_dev->bio)) { |
351 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | 595 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", |
352 | bio_size); | 596 | bio_size); |
353 | return -ENOMEM; | 597 | ret = -ENOMEM; |
598 | goto out; | ||
354 | } | 599 | } |
355 | } | 600 | } |
356 | 601 | ||
@@ -358,64 +603,90 @@ static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | |||
358 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | 603 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); |
359 | unsigned added_len; | 604 | unsigned added_len; |
360 | 605 | ||
361 | BUG_ON(ios->nr_pages <= pg); | ||
362 | cur_len -= pglen; | 606 | cur_len -= pglen; |
363 | 607 | ||
364 | added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], | 608 | added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], |
365 | pglen, pgbase); | 609 | pglen, pgbase); |
366 | if (unlikely(pglen != added_len)) | 610 | if (unlikely(pglen != added_len)) { |
367 | return -ENOMEM; | 611 | ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", |
612 | per_dev->bio->bi_vcnt); | ||
613 | ret = -ENOMEM; | ||
614 | goto out; | ||
615 | } | ||
616 | _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); | ||
617 | |||
368 | pgbase = 0; | 618 | pgbase = 0; |
369 | ++pg; | 619 | ++pg; |
370 | } | 620 | } |
371 | BUG_ON(cur_len); | 621 | BUG_ON(cur_len); |
372 | 622 | ||
623 | per_dev->length += len; | ||
373 | *cur_pg = pg; | 624 | *cur_pg = pg; |
374 | return 0; | 625 | ret = 0; |
626 | out: /* We fail the complete unit on an error, e.g. we don't advance | ||
627 | * per_dev->length and cur_pg. This means that we might have a bigger | ||
628 | * bio than the CDB requested length (per_dev->length). That's fine; | ||
629 | * only the opposite is fatal. | ||
630 | */ | ||
631 | return ret; | ||
375 | } | 632 | } |
376 | 633 | ||
377 | static int _prepare_one_group(struct ore_io_state *ios, u64 length, | 634 | static int _prepare_for_striping(struct ore_io_state *ios) |
378 | struct _striping_info *si) | ||
379 | { | 635 | { |
636 | struct ore_striping_info *si = &ios->si; | ||
380 | unsigned stripe_unit = ios->layout->stripe_unit; | 637 | unsigned stripe_unit = ios->layout->stripe_unit; |
381 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | 638 | unsigned mirrors_p1 = ios->layout->mirrors_p1; |
382 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | 639 | unsigned group_width = ios->layout->group_width; |
640 | unsigned devs_in_group = group_width * mirrors_p1; | ||
383 | unsigned dev = si->dev; | 641 | unsigned dev = si->dev; |
384 | unsigned first_dev = dev - (dev % devs_in_group); | 642 | unsigned first_dev = dev - (dev % devs_in_group); |
385 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | 643 | unsigned dev_order; |
386 | unsigned cur_pg = ios->pages_consumed; | 644 | unsigned cur_pg = ios->pages_consumed; |
645 | u64 length = ios->length; | ||
387 | int ret = 0; | 646 | int ret = 0; |
388 | 647 | ||
648 | if (!ios->pages) { | ||
649 | ios->numdevs = ios->layout->mirrors_p1; | ||
650 | return 0; | ||
651 | } | ||
652 | |||
653 | BUG_ON(length > si->length); | ||
654 | |||
655 | dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); | ||
656 | si->cur_comp = dev_order; | ||
657 | si->cur_pg = si->unit_off / PAGE_SIZE; | ||
658 | |||
389 | while (length) { | 659 | while (length) { |
390 | struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; | 660 | unsigned comp = dev - first_dev; |
661 | struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; | ||
391 | unsigned cur_len, page_off = 0; | 662 | unsigned cur_len, page_off = 0; |
392 | 663 | ||
393 | if (!per_dev->length) { | 664 | if (!per_dev->length) { |
394 | per_dev->dev = dev; | 665 | per_dev->dev = dev; |
395 | if (dev < si->dev) { | 666 | if (dev == si->dev) { |
396 | per_dev->offset = si->obj_offset + stripe_unit - | 667 | WARN_ON(dev == si->par_dev); |
397 | si->unit_off; | ||
398 | cur_len = stripe_unit; | ||
399 | } else if (dev == si->dev) { | ||
400 | per_dev->offset = si->obj_offset; | 668 | per_dev->offset = si->obj_offset; |
401 | cur_len = stripe_unit - si->unit_off; | 669 | cur_len = stripe_unit - si->unit_off; |
402 | page_off = si->unit_off & ~PAGE_MASK; | 670 | page_off = si->unit_off & ~PAGE_MASK; |
403 | BUG_ON(page_off && (page_off != ios->pgbase)); | 671 | BUG_ON(page_off && (page_off != ios->pgbase)); |
404 | } else { /* dev > si->dev */ | 672 | } else { |
405 | per_dev->offset = si->obj_offset - si->unit_off; | 673 | if (si->cur_comp > dev_order) |
674 | per_dev->offset = | ||
675 | si->obj_offset - si->unit_off; | ||
676 | else /* si->cur_comp < dev_order */ | ||
677 | per_dev->offset = | ||
678 | si->obj_offset + stripe_unit - | ||
679 | si->unit_off; | ||
406 | cur_len = stripe_unit; | 680 | cur_len = stripe_unit; |
407 | } | 681 | } |
408 | |||
409 | if (max_comp < dev) | ||
410 | max_comp = dev; | ||
411 | } else { | 682 | } else { |
412 | cur_len = stripe_unit; | 683 | cur_len = stripe_unit; |
413 | } | 684 | } |
414 | if (cur_len >= length) | 685 | if (cur_len >= length) |
415 | cur_len = length; | 686 | cur_len = length; |
416 | 687 | ||
417 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | 688 | ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, |
418 | cur_len); | 689 | per_dev, cur_len); |
419 | if (unlikely(ret)) | 690 | if (unlikely(ret)) |
420 | goto out; | 691 | goto out; |
421 | 692 | ||
@@ -423,60 +694,60 @@ static int _prepare_one_group(struct ore_io_state *ios, u64 length, | |||
423 | dev = (dev % devs_in_group) + first_dev; | 694 | dev = (dev % devs_in_group) + first_dev; |
424 | 695 | ||
425 | length -= cur_len; | 696 | length -= cur_len; |
426 | } | ||
427 | out: | ||
428 | ios->numdevs = max_comp + mirrors_p1; | ||
429 | ios->pages_consumed = cur_pg; | ||
430 | return ret; | ||
431 | } | ||
432 | |||
433 | static int _prepare_for_striping(struct ore_io_state *ios) | ||
434 | { | ||
435 | u64 length = ios->length; | ||
436 | u64 offset = ios->offset; | ||
437 | struct _striping_info si; | ||
438 | int ret = 0; | ||
439 | 697 | ||
440 | if (!ios->pages) { | 698 | si->cur_comp = (si->cur_comp + 1) % group_width; |
441 | if (ios->kern_buff) { | 699 | if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { |
442 | struct ore_per_dev_state *per_dev = &ios->per_dev[0]; | 700 | if (!length && ios->sp2d) { |
701 | /* If we are writing and this is the very last | ||
702 | * stripe, then operate on the parity dev. | ||
703 | */ | ||
704 | dev = si->par_dev; | ||
705 | } | ||
706 | if (ios->sp2d) | ||
707 | /* In writes cur_len only indicates whether this is | ||
708 | * the last unit. See _ore_add_parity_unit. | ||
709 | */ | ||
710 | cur_len = length; | ||
711 | per_dev = &ios->per_dev[dev - first_dev]; | ||
712 | if (!per_dev->length) { | ||
713 | /* Only/always the parity unit of the first | ||
714 | * stripe will be empty. So this is a chance to | ||
715 | * initialize the per_dev info. | ||
716 | */ | ||
717 | per_dev->dev = dev; | ||
718 | per_dev->offset = si->obj_offset - si->unit_off; | ||
719 | } | ||
443 | 720 | ||
444 | _calc_stripe_info(ios->layout, ios->offset, &si); | 721 | ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); |
445 | per_dev->offset = si.obj_offset; | 722 | if (unlikely(ret)) |
446 | per_dev->dev = si.dev; | 723 | goto out; |
447 | 724 | ||
448 | /* no cross device without page array */ | 725 | /* Rotate next par_dev backwards with wrapping */
449 | BUG_ON((ios->layout->group_width > 1) && | 726 | si->par_dev = (devs_in_group + si->par_dev - |
450 | (si.unit_off + ios->length > | 727 | ios->layout->parity * mirrors_p1) % |
451 | ios->layout->stripe_unit)); | 728 | devs_in_group + first_dev; |
729 | /* Next stripe, start fresh */ | ||
730 | si->cur_comp = 0; | ||
731 | si->cur_pg = 0; | ||
452 | } | 732 | } |
453 | ios->numdevs = ios->layout->mirrors_p1; | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | while (length) { | ||
458 | _calc_stripe_info(ios->layout, offset, &si); | ||
459 | |||
460 | if (length < si.group_length) | ||
461 | si.group_length = length; | ||
462 | |||
463 | ret = _prepare_one_group(ios, si.group_length, &si); | ||
464 | if (unlikely(ret)) | ||
465 | goto out; | ||
466 | |||
467 | offset += si.group_length; | ||
468 | length -= si.group_length; | ||
469 | } | 733 | } |
470 | |||
471 | out: | 734 | out: |
472 | return ret; | 735 | ios->numdevs = devs_in_group; |
736 | ios->pages_consumed = cur_pg; | ||
737 | if (unlikely(ret)) { | ||
738 | if (length == ios->length) | ||
739 | return ret; | ||
740 | else | ||
741 | ios->length -= length; | ||
742 | } | ||
743 | return 0; | ||
473 | } | 744 | } |
474 | 745 | ||
475 | int ore_create(struct ore_io_state *ios) | 746 | int ore_create(struct ore_io_state *ios) |
476 | { | 747 | { |
477 | int i, ret; | 748 | int i, ret; |
478 | 749 | ||
479 | for (i = 0; i < ios->comps->numdevs; i++) { | 750 | for (i = 0; i < ios->oc->numdevs; i++) { |
480 | struct osd_request *or; | 751 | struct osd_request *or; |
481 | 752 | ||
482 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 753 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
@@ -501,7 +772,7 @@ int ore_remove(struct ore_io_state *ios) | |||
501 | { | 772 | { |
502 | int i, ret; | 773 | int i, ret; |
503 | 774 | ||
504 | for (i = 0; i < ios->comps->numdevs; i++) { | 775 | for (i = 0; i < ios->oc->numdevs; i++) { |
505 | struct osd_request *or; | 776 | struct osd_request *or; |
506 | 777 | ||
507 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); | 778 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
@@ -543,7 +814,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
543 | goto out; | 814 | goto out; |
544 | } | 815 | } |
545 | per_dev->or = or; | 816 | per_dev->or = or; |
546 | per_dev->offset = master_dev->offset; | ||
547 | 817 | ||
548 | if (ios->pages) { | 818 | if (ios->pages) { |
549 | struct bio *bio; | 819 | struct bio *bio; |
@@ -562,6 +832,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
562 | __bio_clone(bio, master_dev->bio); | 832 | __bio_clone(bio, master_dev->bio); |
563 | bio->bi_bdev = NULL; | 833 | bio->bi_bdev = NULL; |
564 | bio->bi_next = NULL; | 834 | bio->bi_next = NULL; |
835 | per_dev->offset = master_dev->offset; | ||
565 | per_dev->length = master_dev->length; | 836 | per_dev->length = master_dev->length; |
566 | per_dev->bio = bio; | 837 | per_dev->bio = bio; |
567 | per_dev->dev = dev; | 838 | per_dev->dev = dev; |
@@ -579,7 +850,15 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
579 | _LLU(per_dev->offset), | 850 | _LLU(per_dev->offset), |
580 | _LLU(per_dev->length), dev); | 851 | _LLU(per_dev->length), dev); |
581 | } else if (ios->kern_buff) { | 852 | } else if (ios->kern_buff) { |
582 | ret = osd_req_write_kern(or, _ios_obj(ios, dev), | 853 | per_dev->offset = ios->si.obj_offset; |
854 | per_dev->dev = ios->si.dev + dev; | ||
855 | |||
856 | /* no cross device without page array */ | ||
857 | BUG_ON((ios->layout->group_width > 1) && | ||
858 | (ios->si.unit_off + ios->length > | ||
859 | ios->layout->stripe_unit)); | ||
860 | |||
861 | ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev), | ||
583 | per_dev->offset, | 862 | per_dev->offset, |
584 | ios->kern_buff, ios->length); | 863 | ios->kern_buff, ios->length); |
585 | if (unlikely(ret)) | 864 | if (unlikely(ret)) |
@@ -588,7 +867,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) | |||
588 | "length=0x%llx dev=%d\n", | 867 | "length=0x%llx dev=%d\n", |
589 | _LLU(_ios_obj(ios, dev)->id), | 868 | _LLU(_ios_obj(ios, dev)->id), |
590 | _LLU(per_dev->offset), | 869 | _LLU(per_dev->offset), |
591 | _LLU(ios->length), dev); | 870 | _LLU(ios->length), per_dev->dev); |
592 | } else { | 871 | } else { |
593 | osd_req_set_attributes(or, _ios_obj(ios, dev)); | 872 | osd_req_set_attributes(or, _ios_obj(ios, dev)); |
594 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | 873 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", |
@@ -614,6 +893,14 @@ int ore_write(struct ore_io_state *ios) | |||
614 | int i; | 893 | int i; |
615 | int ret; | 894 | int ret; |
616 | 895 | ||
896 | if (unlikely(ios->sp2d && !ios->r4w)) { | ||
897 | /* A library is attempting a RAID-write without providing | ||
898 | * a pages lock interface. | ||
899 | */ | ||
900 | WARN_ON_ONCE(1); | ||
901 | return -ENOTSUPP; | ||
902 | } | ||
903 | |||
617 | ret = _prepare_for_striping(ios); | 904 | ret = _prepare_for_striping(ios); |
618 | if (unlikely(ret)) | 905 | if (unlikely(ret)) |
619 | return ret; | 906 | return ret; |
@@ -629,7 +916,7 @@ int ore_write(struct ore_io_state *ios) | |||
629 | } | 916 | } |
630 | EXPORT_SYMBOL(ore_write); | 917 | EXPORT_SYMBOL(ore_write); |
631 | 918 | ||
632 | static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | 919 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) |
633 | { | 920 | { |
634 | struct osd_request *or; | 921 | struct osd_request *or; |
635 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 922 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
@@ -648,22 +935,27 @@ static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) | |||
648 | per_dev->or = or; | 935 | per_dev->or = or; |
649 | 936 | ||
650 | if (ios->pages) { | 937 | if (ios->pages) { |
651 | osd_req_read(or, obj, per_dev->offset, | 938 | if (per_dev->cur_sg) { |
652 | per_dev->bio, per_dev->length); | 939 | /* finalize the last sg_entry */ |
940 | _ore_add_sg_seg(per_dev, 0, false); | ||
941 | if (unlikely(!per_dev->cur_sg)) | ||
942 | return 0; /* Skip parity only device */ | ||
943 | |||
944 | osd_req_read_sg(or, obj, per_dev->bio, | ||
945 | per_dev->sglist, per_dev->cur_sg); | ||
946 | } else { | ||
947 | /* The no raid case */ | ||
948 | osd_req_read(or, obj, per_dev->offset, | ||
949 | per_dev->bio, per_dev->length); | ||
950 | } | ||
951 | |||
653 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 952 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
654 | " dev=%d\n", _LLU(obj->id), | 953 | " dev=%d sg_len=%d\n", _LLU(obj->id), |
655 | _LLU(per_dev->offset), _LLU(per_dev->length), | 954 | _LLU(per_dev->offset), _LLU(per_dev->length), |
656 | first_dev); | 955 | first_dev, per_dev->cur_sg); |
657 | } else if (ios->kern_buff) { | ||
658 | int ret = osd_req_read_kern(or, obj, per_dev->offset, | ||
659 | ios->kern_buff, ios->length); | ||
660 | ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | ||
661 | "length=0x%llx dev=%d ret=>%d\n", | ||
662 | _LLU(obj->id), _LLU(per_dev->offset), | ||
663 | _LLU(ios->length), first_dev, ret); | ||
664 | if (unlikely(ret)) | ||
665 | return ret; | ||
666 | } else { | 956 | } else { |
957 | BUG_ON(ios->kern_buff); | ||
958 | |||
667 | osd_req_get_attributes(or, obj); | 959 | osd_req_get_attributes(or, obj); |
668 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | 960 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", |
669 | _LLU(obj->id), | 961 | _LLU(obj->id), |
@@ -688,7 +980,7 @@ int ore_read(struct ore_io_state *ios) | |||
688 | return ret; | 980 | return ret; |
689 | 981 | ||
690 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 982 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { |
691 | ret = _read_mirror(ios, i); | 983 | ret = _ore_read_mirror(ios, i); |
692 | if (unlikely(ret)) | 984 | if (unlikely(ret)) |
693 | return ret; | 985 | return ret; |
694 | } | 986 | } |
@@ -744,31 +1036,29 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, | |||
744 | } | 1036 | } |
745 | 1037 | ||
746 | struct _trunc_info { | 1038 | struct _trunc_info { |
747 | struct _striping_info si; | 1039 | struct ore_striping_info si; |
748 | u64 prev_group_obj_off; | 1040 | u64 prev_group_obj_off; |
749 | u64 next_group_obj_off; | 1041 | u64 next_group_obj_off; |
750 | 1042 | ||
751 | unsigned first_group_dev; | 1043 | unsigned first_group_dev; |
752 | unsigned nex_group_dev; | 1044 | unsigned nex_group_dev; |
753 | unsigned max_devs; | ||
754 | }; | 1045 | }; |
755 | 1046 | ||
756 | void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | 1047 | static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, |
757 | struct _trunc_info *ti) | 1048 | struct _trunc_info *ti) |
758 | { | 1049 | { |
759 | unsigned stripe_unit = layout->stripe_unit; | 1050 | unsigned stripe_unit = layout->stripe_unit; |
760 | 1051 | ||
761 | _calc_stripe_info(layout, file_offset, &ti->si); | 1052 | ore_calc_stripe_info(layout, file_offset, 0, &ti->si); |
762 | 1053 | ||
763 | ti->prev_group_obj_off = ti->si.M * stripe_unit; | 1054 | ti->prev_group_obj_off = ti->si.M * stripe_unit; |
764 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; | 1055 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; |
765 | 1056 | ||
766 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); | 1057 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); |
767 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; | 1058 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; |
768 | ti->max_devs = layout->group_width * layout->group_count; | ||
769 | } | 1059 | } |
770 | 1060 | ||
771 | int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | 1061 | int ore_truncate(struct ore_layout *layout, struct ore_components *oc, |
772 | u64 size) | 1062 | u64 size) |
773 | { | 1063 | { |
774 | struct ore_io_state *ios; | 1064 | struct ore_io_state *ios; |
@@ -779,22 +1069,22 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
779 | struct _trunc_info ti; | 1069 | struct _trunc_info ti; |
780 | int i, ret; | 1070 | int i, ret; |
781 | 1071 | ||
782 | ret = ore_get_io_state(layout, comps, &ios); | 1072 | ret = ore_get_io_state(layout, oc, &ios); |
783 | if (unlikely(ret)) | 1073 | if (unlikely(ret)) |
784 | return ret; | 1074 | return ret; |
785 | 1075 | ||
786 | _calc_trunk_info(ios->layout, size, &ti); | 1076 | _calc_trunk_info(ios->layout, size, &ti); |
787 | 1077 | ||
788 | size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), | 1078 | size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), |
789 | GFP_KERNEL); | 1079 | GFP_KERNEL); |
790 | if (unlikely(!size_attrs)) { | 1080 | if (unlikely(!size_attrs)) { |
791 | ret = -ENOMEM; | 1081 | ret = -ENOMEM; |
792 | goto out; | 1082 | goto out; |
793 | } | 1083 | } |
794 | 1084 | ||
795 | ios->numdevs = ios->comps->numdevs; | 1085 | ios->numdevs = ios->oc->numdevs; |
796 | 1086 | ||
797 | for (i = 0; i < ti.max_devs; ++i) { | 1087 | for (i = 0; i < ios->numdevs; ++i) { |
798 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | 1088 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
799 | u64 obj_size; | 1089 | u64 obj_size; |
800 | 1090 | ||
@@ -815,7 +1105,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | |||
815 | size_attr->attr.val_ptr = &size_attr->newsize; | 1105 | size_attr->attr.val_ptr = &size_attr->newsize; |
816 | 1106 | ||
817 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", | 1107 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", |
818 | _LLU(comps->comps->obj.id), _LLU(obj_size), i); | 1108 | _LLU(oc->comps->obj.id), _LLU(obj_size), i); |
819 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | 1109 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, |
820 | &size_attr->attr); | 1110 | &size_attr->attr); |
821 | if (unlikely(ret)) | 1111 | if (unlikely(ret)) |
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 000000000000..29c47e5c4a86 --- /dev/null +++ b/fs/exofs/ore_raid.c | |||
@@ -0,0 +1,660 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <linux/gfp.h> | ||
17 | #include <linux/async_tx.h> | ||
18 | |||
19 | #include "ore_raid.h" | ||
20 | |||
21 | #undef ORE_DBGMSG2 | ||
22 | #define ORE_DBGMSG2 ORE_DBGMSG | ||
23 | |||
24 | struct page *_raid_page_alloc(void) | ||
25 | { | ||
26 | return alloc_page(GFP_KERNEL); | ||
27 | } | ||
28 | |||
29 | void _raid_page_free(struct page *p) | ||
30 | { | ||
31 | __free_page(p); | ||
32 | } | ||
33 | |||
34 | /* This struct is forward declared in ore_io_state, but is private to here. | ||
35 | * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. | ||
36 | * | ||
37 | * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. | ||
38 | * Ascending page index access is sp2d(p-minor, c-major). But storage is | ||
39 | * sp2d[p-minor][c-major], so it can be properly presented to the async-xor | ||
40 | * API. | ||
41 | */ | ||
42 | struct __stripe_pages_2d { | ||
43 | /* Cache some hot path repeated calculations */ | ||
44 | unsigned parity; | ||
45 | unsigned data_devs; | ||
46 | unsigned pages_in_unit; | ||
47 | |||
48 | bool needed; | ||
49 | |||
50 | /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ | ||
51 | struct __1_page_stripe { | ||
52 | bool alloc; | ||
53 | unsigned write_count; | ||
54 | struct async_submit_ctl submit; | ||
55 | struct dma_async_tx_descriptor *tx; | ||
56 | |||
57 | /* The size of this array is data_devs + parity */ | ||
58 | struct page **pages; | ||
59 | struct page **scribble; | ||
60 | /* bool array, size of this array is data_devs */ | ||
61 | char *page_is_read; | ||
62 | } _1p_stripes[]; | ||
63 | }; | ||
64 | |||
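To illustrate the corner turn (a sketch, not code from the patch): the page that is the @c'th data unit of stripe-page @p lands in

	sp2d->_1p_stripes[p].pages[c]

so each _1p_stripes[p].pages[] row is laid out exactly as the src_list that async_xor() consumes, with the parity page stored at index data_devs; see _ore_add_stripe_page() and _gen_xor_unit() below.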
65 | /* This can get bigger than a page. So support multiple page allocations. | ||
66 | * _sp2d_free should be called even if _sp2d_alloc fails (by returning | ||
67 | * non-zero). | ||
68 | */ | ||
69 | static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, | ||
70 | unsigned parity, struct __stripe_pages_2d **psp2d) | ||
71 | { | ||
72 | struct __stripe_pages_2d *sp2d; | ||
73 | unsigned data_devs = group_width - parity; | ||
74 | struct _alloc_all_bytes { | ||
75 | struct __alloc_stripe_pages_2d { | ||
76 | struct __stripe_pages_2d sp2d; | ||
77 | struct __1_page_stripe _1p_stripes[pages_in_unit]; | ||
78 | } __asp2d; | ||
79 | struct __alloc_1p_arrays { | ||
80 | struct page *pages[group_width]; | ||
81 | struct page *scribble[group_width]; | ||
82 | char page_is_read[data_devs]; | ||
83 | } __a1pa[pages_in_unit]; | ||
84 | } *_aab; | ||
85 | struct __alloc_1p_arrays *__a1pa; | ||
86 | struct __alloc_1p_arrays *__a1pa_end; | ||
87 | const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); | ||
88 | unsigned num_a1pa, alloc_size, i; | ||
89 | |||
90 | /* FIXME: check these numbers in ore_verify_layout */ | ||
91 | BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); | ||
92 | BUG_ON(sizeof__a1pa > PAGE_SIZE); | ||
93 | |||
94 | if (sizeof(*_aab) > PAGE_SIZE) { | ||
95 | num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; | ||
96 | alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; | ||
97 | } else { | ||
98 | num_a1pa = pages_in_unit; | ||
99 | alloc_size = sizeof(*_aab); | ||
100 | } | ||
101 | |||
102 | _aab = kzalloc(alloc_size, GFP_KERNEL); | ||
103 | if (unlikely(!_aab)) { | ||
104 | ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); | ||
105 | return -ENOMEM; | ||
106 | } | ||
107 | |||
108 | sp2d = &_aab->__asp2d.sp2d; | ||
109 | *psp2d = sp2d; /* From here Just call _sp2d_free */ | ||
110 | |||
111 | __a1pa = _aab->__a1pa; | ||
112 | __a1pa_end = __a1pa + num_a1pa; | ||
113 | |||
114 | for (i = 0; i < pages_in_unit; ++i) { | ||
115 | if (unlikely(__a1pa >= __a1pa_end)) { | ||
116 | num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, | ||
117 | pages_in_unit - i); | ||
118 | |||
119 | __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); | ||
120 | if (unlikely(!__a1pa)) { | ||
121 | ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", | ||
122 | num_a1pa); | ||
123 | return -ENOMEM; | ||
124 | } | ||
125 | __a1pa_end = __a1pa + num_a1pa; | ||
126 | /* First *pages is marked for kfree of the buffer */ | ||
127 | sp2d->_1p_stripes[i].alloc = true; | ||
128 | } | ||
129 | |||
130 | sp2d->_1p_stripes[i].pages = __a1pa->pages; | ||
131 | sp2d->_1p_stripes[i].scribble = __a1pa->scribble; | ||
132 | sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; | ||
133 | ++__a1pa; | ||
134 | } | ||
135 | |||
136 | sp2d->parity = parity; | ||
137 | sp2d->data_devs = data_devs; | ||
138 | sp2d->pages_in_unit = pages_in_unit; | ||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | static void _sp2d_reset(struct __stripe_pages_2d *sp2d, | ||
143 | const struct _ore_r4w_op *r4w, void *priv) | ||
144 | { | ||
145 | unsigned data_devs = sp2d->data_devs; | ||
146 | unsigned group_width = data_devs + sp2d->parity; | ||
147 | unsigned p; | ||
148 | |||
149 | if (!sp2d->needed) | ||
150 | return; | ||
151 | |||
152 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
153 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
154 | |||
155 | if (_1ps->write_count < group_width) { | ||
156 | unsigned c; | ||
157 | |||
158 | for (c = 0; c < data_devs; c++) | ||
159 | if (_1ps->page_is_read[c]) { | ||
160 | struct page *page = _1ps->pages[c]; | ||
161 | |||
162 | r4w->put_page(priv, page); | ||
163 | _1ps->page_is_read[c] = false; | ||
164 | } | ||
165 | } | ||
166 | |||
167 | memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); | ||
168 | _1ps->write_count = 0; | ||
169 | _1ps->tx = NULL; | ||
170 | } | ||
171 | |||
172 | sp2d->needed = false; | ||
173 | } | ||
174 | |||
175 | static void _sp2d_free(struct __stripe_pages_2d *sp2d) | ||
176 | { | ||
177 | unsigned i; | ||
178 | |||
179 | if (!sp2d) | ||
180 | return; | ||
181 | |||
182 | for (i = 0; i < sp2d->pages_in_unit; ++i) { | ||
183 | if (sp2d->_1p_stripes[i].alloc) | ||
184 | kfree(sp2d->_1p_stripes[i].pages); | ||
185 | } | ||
186 | |||
187 | kfree(sp2d); | ||
188 | } | ||
189 | |||
190 | static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) | ||
191 | { | ||
192 | unsigned p; | ||
193 | |||
194 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
195 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
196 | |||
197 | if (_1ps->write_count) | ||
198 | return p; | ||
199 | } | ||
200 | |||
201 | return ~0; | ||
202 | } | ||
203 | |||
204 | static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) | ||
205 | { | ||
206 | int p; /* must be signed: the loop below counts down past zero */ | ||
207 | |||
208 | for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { | ||
209 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
210 | |||
211 | if (_1ps->write_count) | ||
212 | return p; | ||
213 | } | ||
214 | |||
215 | return ~0; | ||
216 | } | ||
217 | |||
218 | static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) | ||
219 | { | ||
220 | unsigned p; | ||
221 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
222 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
223 | |||
224 | if (!_1ps->write_count) | ||
225 | continue; | ||
226 | |||
227 | init_async_submit(&_1ps->submit, | ||
228 | ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, | ||
229 | NULL, | ||
230 | NULL, NULL, | ||
231 | (addr_conv_t *)_1ps->scribble); | ||
232 | |||
233 | /* TODO: raid6 */ | ||
234 | _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, | ||
235 | 0, sp2d->data_devs, PAGE_SIZE, | ||
236 | &_1ps->submit); | ||
237 | } | ||
238 | |||
239 | for (p = 0; p < sp2d->pages_in_unit; p++) { | ||
240 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
241 | /* NOTE: We wait for HW synchronously (I don't have such HW | ||
242 | * to test with.) Is parallelism needed with today's multi | ||
243 | * cores? | ||
244 | */ | ||
245 | async_tx_issue_pending(_1ps->tx); | ||
246 | } | ||
247 | } | ||
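
The async_xor() call above computes the RAID5 parity unit: parity is simply the byte-wise XOR of the data-unit pages, offloaded to a DMA/XOR engine when one exists. A synchronous sketch of the same computation, on plain buffers rather than struct page (illustrative only):

	#include <stddef.h>

	/* parity[i] = data[0][i] ^ data[1][i] ^ ... ^ data[data_devs - 1][i] */
	static void xor_parity(unsigned char *parity, unsigned char * const *data,
			       unsigned data_devs, size_t len)
	{
		size_t i;
		unsigned d;

		for (i = 0; i < len; i++) {
			unsigned char x = 0;

			for (d = 0; d < data_devs; d++)
				x ^= data[d][i];
			parity[i] = x;
		}
	}
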
248 | |||
249 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
250 | struct ore_striping_info *si, struct page *page) | ||
251 | { | ||
252 | struct __1_page_stripe *_1ps; | ||
253 | |||
254 | sp2d->needed = true; | ||
255 | |||
256 | _1ps = &sp2d->_1p_stripes[si->cur_pg]; | ||
257 | _1ps->pages[si->cur_comp] = page; | ||
258 | ++_1ps->write_count; | ||
259 | |||
260 | si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; | ||
261 | /* si->cur_comp is advanced outside, in the main loop */ | ||
262 | } | ||
263 | |||
264 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
265 | bool not_last) | ||
266 | { | ||
267 | struct osd_sg_entry *sge; | ||
268 | |||
269 | ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " | ||
270 | "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", | ||
271 | per_dev->dev, cur_len, not_last, per_dev->cur_sg, | ||
272 | _LLU(per_dev->offset), per_dev->length, | ||
273 | per_dev->last_sgs_total); | ||
274 | |||
275 | if (!per_dev->cur_sg) { | ||
276 | sge = per_dev->sglist; | ||
277 | |||
278 | /* First time we prepare two entries */ | ||
279 | if (per_dev->length) { | ||
280 | ++per_dev->cur_sg; | ||
281 | sge->offset = per_dev->offset; | ||
282 | sge->len = per_dev->length; | ||
283 | } else { | ||
284 | /* Here the parity is the first unit of this object. | ||
285 | * This happens every time we reach a parity device on | ||
286 | * the same stripe as the per_dev->offset. We need to | ||
287 | * just skip this unit. | ||
288 | */ | ||
289 | per_dev->offset += cur_len; | ||
290 | return; | ||
291 | } | ||
292 | } else { | ||
293 | /* finalize the last one */ | ||
294 | sge = &per_dev->sglist[per_dev->cur_sg - 1]; | ||
295 | sge->len = per_dev->length - per_dev->last_sgs_total; | ||
296 | } | ||
297 | |||
298 | if (not_last) { | ||
299 | /* Partly prepare the next one */ | ||
300 | struct osd_sg_entry *next_sge = sge + 1; | ||
301 | |||
302 | ++per_dev->cur_sg; | ||
303 | next_sge->offset = sge->offset + sge->len + cur_len; | ||
304 | /* Save cur len so we know how much was added next time */ | ||
305 | per_dev->last_sgs_total = per_dev->length; | ||
306 | next_sge->len = 0; | ||
307 | } else if (!sge->len) { | ||
308 | /* Optimize for when the last unit is a parity */ | ||
309 | --per_dev->cur_sg; | ||
310 | } | ||
311 | } | ||
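
A hedged trace of the bookkeeping above, with an assumed 64KiB stripe_unit: after one data unit, a skipped parity-sized gap, and a second data unit, the device's sglist holds two entries, the second opened at sge->offset + sge->len + cur_len exactly as in the not_last branch.

	#include <stdio.h>

	struct sge { unsigned long long offset; unsigned len; };

	int main(void)
	{
		unsigned unit = 0x10000;	/* assumed stripe_unit */
		struct sge sg[2];

		sg[0].offset = 0;		/* first data unit */
		sg[0].len = unit;
		sg[1].offset = sg[0].offset + sg[0].len + unit; /* skip parity gap */
		sg[1].len = unit;		/* second data unit */

		printf("sge[0]={0x%llx,0x%x} sge[1]={0x%llx,0x%x}\n",
		       sg[0].offset, sg[0].len, sg[1].offset, sg[1].len);
		return 0;	/* prints sge[0]={0x0,0x10000} sge[1]={0x20000,0x10000} */
	}
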
312 | |||
313 | static int _alloc_read_4_write(struct ore_io_state *ios) | ||
314 | { | ||
315 | struct ore_layout *layout = ios->layout; | ||
316 | int ret; | ||
317 | /* We want to read only those pages not in cache, so the worst case | ||
318 | * is a stripe populated with every other page | ||
319 | */ | ||
320 | unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; | ||
321 | |||
322 | ret = _ore_get_io_state(layout, ios->oc, | ||
323 | layout->group_width * layout->mirrors_p1, | ||
324 | sgs_per_dev, 0, &ios->ios_read_4_write); | ||
325 | return ret; | ||
326 | } | ||
327 | |||
328 | /* @si contains info of the to-be-inserted page. Update of @si should be | ||
329 | * maintained by caller. Specifically si->dev, si->obj_offset, ... | ||
330 | */ | ||
331 | static int _add_to_read_4_write(struct ore_io_state *ios, | ||
332 | struct ore_striping_info *si, struct page *page) | ||
333 | { | ||
334 | struct request_queue *q; | ||
335 | struct ore_per_dev_state *per_dev; | ||
336 | struct ore_io_state *read_ios; | ||
337 | unsigned first_dev = si->dev - (si->dev % | ||
338 | (ios->layout->group_width * ios->layout->mirrors_p1)); | ||
339 | unsigned comp = si->dev - first_dev; | ||
340 | unsigned added_len; | ||
341 | |||
342 | if (!ios->ios_read_4_write) { | ||
343 | int ret = _alloc_read_4_write(ios); | ||
344 | |||
345 | if (unlikely(ret)) | ||
346 | return ret; | ||
347 | } | ||
348 | |||
349 | read_ios = ios->ios_read_4_write; | ||
350 | read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; | ||
351 | |||
352 | per_dev = &read_ios->per_dev[comp]; | ||
353 | if (!per_dev->length) { | ||
354 | per_dev->bio = bio_kmalloc(GFP_KERNEL, | ||
355 | ios->sp2d->pages_in_unit); | ||
356 | if (unlikely(!per_dev->bio)) { | ||
357 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", | ||
358 | ios->sp2d->pages_in_unit); | ||
359 | return -ENOMEM; | ||
360 | } | ||
361 | per_dev->offset = si->obj_offset; | ||
362 | per_dev->dev = si->dev; | ||
363 | } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { | ||
364 | u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); | ||
365 | |||
366 | _ore_add_sg_seg(per_dev, gap, true); | ||
367 | } | ||
368 | q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); | ||
369 | added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); | ||
370 | if (unlikely(added_len != PAGE_SIZE)) { | ||
371 | ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", | ||
372 | per_dev->bio->bi_vcnt); | ||
373 | return -ENOMEM; | ||
374 | } | ||
375 | |||
376 | per_dev->length += PAGE_SIZE; | ||
377 | return 0; | ||
378 | } | ||
379 | |||
380 | static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) | ||
381 | { | ||
382 | struct bio_vec *bv; | ||
383 | unsigned i, d; | ||
384 | |||
385 | /* loop on all devices all pages */ | ||
386 | for (d = 0; d < ios->numdevs; d++) { | ||
387 | struct bio *bio = ios->per_dev[d].bio; | ||
388 | |||
389 | if (!bio) | ||
390 | continue; | ||
391 | |||
392 | __bio_for_each_segment(bv, bio, i, 0) { | ||
393 | struct page *page = bv->bv_page; | ||
394 | |||
395 | SetPageUptodate(page); | ||
396 | if (PageError(page)) | ||
397 | ClearPageError(page); | ||
398 | } | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* read_4_write is hacked to read the start of the first stripe and/or | ||
403 | * the end of the last stripe. If needed, with an sg-gap at each device/page. | ||
404 | * It is assumed to be called after the to_be_written pages of the first stripe | ||
405 | * are populating ios->sp2d[][] | ||
406 | * | ||
407 | * NOTE: We call ios->r4w->get_page for all pages needed for parity | ||
408 | * calculations. These pages are held at sp2d[p].pages[c] but with | ||
409 | * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are | ||
410 | * released via ios->r4w->put_page(). get_page might signal that the page | ||
411 | * is already @uptodate=true, so we need not read it, only release it after IO. | ||
412 | * | ||
413 | * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger than | ||
414 | * the to-be-written count, we should consider the xor-in-place mode. | ||
415 | * need_to_read_pages_count is the actual number of pages not present in cache. | ||
416 | * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough | ||
417 | * approximation? In this mode the read pages are put in the empty places of | ||
418 | * ios->sp2d[p][*], xor is calculated the same way. These pages are | ||
419 | * allocated/freed and don't go through cache | ||
420 | */ | ||
421 | static int _read_4_write(struct ore_io_state *ios) | ||
422 | { | ||
423 | struct ore_io_state *ios_read; | ||
424 | struct ore_striping_info read_si; | ||
425 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
426 | u64 offset = ios->si.first_stripe_start; | ||
427 | u64 last_stripe_end; | ||
428 | unsigned bytes_in_stripe = ios->si.bytes_in_stripe; | ||
429 | unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; | ||
430 | int ret; | ||
431 | |||
432 | if (offset == ios->offset) /* Go to start collect $200 */ | ||
433 | goto read_last_stripe; | ||
434 | |||
435 | min_p = _sp2d_min_pg(sp2d); | ||
436 | max_p = _sp2d_max_pg(sp2d); | ||
437 | |||
438 | for (c = 0; ; c++) { | ||
439 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
440 | read_si.obj_offset += min_p * PAGE_SIZE; | ||
441 | offset += min_p * PAGE_SIZE; | ||
442 | for (p = min_p; p <= max_p; p++) { | ||
443 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
444 | struct page **pp = &_1ps->pages[c]; | ||
445 | bool uptodate; | ||
446 | |||
447 | if (*pp) | ||
448 | /* to-be-written pages start here */ | ||
449 | goto read_last_stripe; | ||
450 | |||
451 | *pp = ios->r4w->get_page(ios->private, offset, | ||
452 | &uptodate); | ||
453 | if (unlikely(!*pp)) | ||
454 | return -ENOMEM; | ||
455 | |||
456 | if (!uptodate) | ||
457 | _add_to_read_4_write(ios, &read_si, *pp); | ||
458 | |||
459 | /* Mark read-pages to be cache_released */ | ||
460 | _1ps->page_is_read[c] = true; | ||
461 | read_si.obj_offset += PAGE_SIZE; | ||
462 | offset += PAGE_SIZE; | ||
463 | } | ||
464 | offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; | ||
465 | } | ||
466 | |||
467 | read_last_stripe: | ||
468 | offset = ios->offset + (ios->length + PAGE_SIZE - 1) / | ||
469 | PAGE_SIZE * PAGE_SIZE; | ||
470 | last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) | ||
471 | * bytes_in_stripe; | ||
472 | if (offset == last_stripe_end) /* Optimize for the aligned case */ | ||
473 | goto read_it; | ||
474 | |||
475 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
476 | p = read_si.unit_off / PAGE_SIZE; | ||
477 | c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, | ||
478 | ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); | ||
479 | |||
480 | BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); | ||
481 | /* unaligned IO must be within a single stripe */ | ||
482 | |||
483 | if (min_p == sp2d->pages_in_unit) { | ||
484 | /* Didn't do it yet */ | ||
485 | min_p = _sp2d_min_pg(sp2d); | ||
486 | max_p = _sp2d_max_pg(sp2d); | ||
487 | } | ||
488 | |||
489 | while (offset < last_stripe_end) { | ||
490 | struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; | ||
491 | |||
492 | if ((min_p <= p) && (p <= max_p)) { | ||
493 | struct page *page; | ||
494 | bool uptodate; | ||
495 | |||
496 | BUG_ON(_1ps->pages[c]); | ||
497 | page = ios->r4w->get_page(ios->private, offset, | ||
498 | &uptodate); | ||
499 | if (unlikely(!page)) | ||
500 | return -ENOMEM; | ||
501 | |||
502 | _1ps->pages[c] = page; | ||
503 | /* Mark read-pages to be cache_released */ | ||
504 | _1ps->page_is_read[c] = true; | ||
505 | if (!uptodate) | ||
506 | _add_to_read_4_write(ios, &read_si, page); | ||
507 | } | ||
508 | |||
509 | offset += PAGE_SIZE; | ||
510 | if (p == (sp2d->pages_in_unit - 1)) { | ||
511 | ++c; | ||
512 | p = 0; | ||
513 | ore_calc_stripe_info(ios->layout, offset, 0, &read_si); | ||
514 | } else { | ||
515 | read_si.obj_offset += PAGE_SIZE; | ||
516 | ++p; | ||
517 | } | ||
518 | } | ||
519 | |||
520 | read_it: | ||
521 | ios_read = ios->ios_read_4_write; | ||
522 | if (!ios_read) | ||
523 | return 0; | ||
524 | |||
525 | /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change | ||
526 | * to check for per_dev->bio | ||
527 | */ | ||
528 | ios_read->pages = ios->pages; | ||
529 | |||
530 | /* Now read these devices */ | ||
531 | for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { | ||
532 | ret = _ore_read_mirror(ios_read, i); | ||
533 | if (unlikely(ret)) | ||
534 | return ret; | ||
535 | } | ||
536 | |||
537 | ret = ore_io_execute(ios_read); /* Synchronous execution */ | ||
538 | if (unlikely(ret)) { | ||
539 | ORE_DBGMSG("!! ore_io_execute => %d\n", ret); | ||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | _mark_read4write_pages_uptodate(ios_read, ret); | ||
544 | return 0; | ||
545 | } | ||
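
The two round-ups at read_last_stripe above, as a stand-alone sketch with assumed example values (plain 64-bit division stands in for div_u64): first the write end is rounded up to a page, then up to the end of its containing stripe.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long page = 4096, stripe = 5 * 65536; /* assumed layout */
		unsigned long long io_off = 0x50000, io_len = 0x4c20;
		unsigned long long end, stripe_end;

		end = io_off + (io_len + page - 1) / page * page;  /* page round-up */
		stripe_end = (end + stripe - 1) / stripe * stripe; /* stripe round-up */
		printf("end=0x%llx stripe_end=0x%llx\n", end, stripe_end);
		return 0;	/* prints end=0x55000 stripe_end=0xa0000 */
	}
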
546 | |||
547 | /* In writes @cur_len means length left, i.e. cur_len==0 is the last parity unit */ | ||
548 | int _ore_add_parity_unit(struct ore_io_state *ios, | ||
549 | struct ore_striping_info *si, | ||
550 | struct ore_per_dev_state *per_dev, | ||
551 | unsigned cur_len) | ||
552 | { | ||
553 | if (ios->reading) { | ||
554 | BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); | ||
555 | _ore_add_sg_seg(per_dev, cur_len, true); | ||
556 | } else { | ||
557 | struct __stripe_pages_2d *sp2d = ios->sp2d; | ||
558 | struct page **pages = ios->parity_pages + ios->cur_par_page; | ||
559 | unsigned num_pages; | ||
560 | unsigned array_start = 0; | ||
561 | unsigned i; | ||
562 | int ret; | ||
563 | |||
564 | si->cur_pg = _sp2d_min_pg(sp2d); | ||
565 | num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; | ||
566 | |||
567 | if (!cur_len) /* If last stripe operate on parity comp */ | ||
568 | si->cur_comp = sp2d->data_devs; | ||
569 | |||
570 | if (!per_dev->length) { | ||
571 | per_dev->offset += si->cur_pg * PAGE_SIZE; | ||
572 | /* If first stripe, read in all read4write pages | ||
573 | * (if needed) before we calculate the first parity. | ||
574 | */ | ||
575 | _read_4_write(ios); | ||
576 | } | ||
577 | |||
578 | for (i = 0; i < num_pages; i++) { | ||
579 | pages[i] = _raid_page_alloc(); | ||
580 | if (unlikely(!pages[i])) | ||
581 | return -ENOMEM; | ||
582 | |||
583 | ++(ios->cur_par_page); | ||
584 | } | ||
585 | |||
586 | BUG_ON(si->cur_comp != sp2d->data_devs); | ||
587 | BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); | ||
588 | |||
589 | ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, | ||
590 | per_dev, num_pages * PAGE_SIZE); | ||
591 | if (unlikely(ret)) | ||
592 | return ret; | ||
593 | |||
594 | /* TODO: raid6 if (last_parity_dev) */ | ||
595 | _gen_xor_unit(sp2d); | ||
596 | _sp2d_reset(sp2d, ios->r4w, ios->private); | ||
597 | } | ||
598 | return 0; | ||
599 | } | ||
600 | |||
601 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) | ||
602 | { | ||
603 | struct ore_layout *layout = ios->layout; | ||
604 | |||
605 | if (ios->parity_pages) { | ||
606 | unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; | ||
607 | unsigned stripe_size = ios->si.bytes_in_stripe; | ||
608 | u64 last_stripe, first_stripe; | ||
609 | |||
610 | if (_sp2d_alloc(pages_in_unit, layout->group_width, | ||
611 | layout->parity, &ios->sp2d)) { | ||
612 | return -ENOMEM; | ||
613 | } | ||
614 | |||
615 | BUG_ON(ios->offset % PAGE_SIZE); | ||
616 | |||
617 | /* Round io down to the last full stripe */ | ||
618 | first_stripe = div_u64(ios->offset, stripe_size); | ||
619 | last_stripe = div_u64(ios->offset + ios->length, stripe_size); | ||
620 | |||
621 | /* If an IO spans more than a single stripe it must end at | ||
622 | * a stripe boundary. The remainder at the end is pushed into the | ||
623 | * next IO. | ||
624 | */ | ||
625 | if (last_stripe != first_stripe) { | ||
626 | ios->length = last_stripe * stripe_size - ios->offset; | ||
627 | |||
628 | BUG_ON(!ios->length); | ||
629 | ios->nr_pages = (ios->length + PAGE_SIZE - 1) / | ||
630 | PAGE_SIZE; | ||
631 | ios->si.length = ios->length; /* make it consistent */ | ||
632 | } | ||
633 | } | ||
634 | return 0; | ||
635 | } | ||
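
The stripe trim above, rendered as a stand-alone sketch with assumed layout numbers: an IO that crosses a stripe boundary is cut at the last full stripe, and the tail is left for the next IO.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long stripe_size = 5 * 65536;	/* assumed 5 data units */
		unsigned long long offset = 100 * 4096, length = 600 * 4096;
		unsigned long long first = offset / stripe_size;
		unsigned long long last = (offset + length) / stripe_size;

		if (last != first)
			length = last * stripe_size - offset;	/* tail -> next IO */
		printf("trimmed length=0x%llx\n", length);	/* 0x21c000 */
		return 0;
	}
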
636 | |||
637 | void _ore_free_raid_stuff(struct ore_io_state *ios) | ||
638 | { | ||
639 | if (ios->sp2d) { /* writing and raid */ | ||
640 | unsigned i; | ||
641 | |||
642 | for (i = 0; i < ios->cur_par_page; i++) { | ||
643 | struct page *page = ios->parity_pages[i]; | ||
644 | |||
645 | if (page) | ||
646 | _raid_page_free(page); | ||
647 | } | ||
648 | if (ios->extra_part_alloc) | ||
649 | kfree(ios->parity_pages); | ||
650 | /* If IO returned an error, pages might need unlocking */ | ||
651 | _sp2d_reset(ios->sp2d, ios->r4w, ios->private); | ||
652 | _sp2d_free(ios->sp2d); | ||
653 | } else { | ||
654 | /* Will only be set if raid reading && sglist is big */ | ||
655 | if (ios->extra_part_alloc) | ||
656 | kfree(ios->per_dev[0].sglist); | ||
657 | } | ||
658 | if (ios->ios_read_4_write) | ||
659 | ore_put_io_state(ios->ios_read_4_write); | ||
660 | } | ||
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 000000000000..2ffd2c3c6e46 --- /dev/null +++ b/fs/exofs/ore_raid.h | |||
@@ -0,0 +1,79 @@ | |||
1 | /* | ||
2 | * Copyright (C) from 2011 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of the objects raid engine (ore). | ||
6 | * | ||
7 | * It is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as published | ||
9 | * by the Free Software Foundation. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with "ore". If not, write to the Free Software Foundation, Inc: | ||
13 | * "Free Software Foundation <info@fsf.org>" | ||
14 | */ | ||
15 | |||
16 | #include <scsi/osd_ore.h> | ||
17 | |||
18 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) | ||
19 | |||
20 | #ifdef CONFIG_EXOFS_DEBUG | ||
21 | #define ORE_DBGMSG(fmt, a...) \ | ||
22 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) | ||
23 | #else | ||
24 | #define ORE_DBGMSG(fmt, a...) \ | ||
25 | do { if (0) printk(fmt, ##a); } while (0) | ||
26 | #endif | ||
27 | |||
29 | /* u64 has problems with printk; this casts it to unsigned long long */ | ||
29 | #define _LLU(x) (unsigned long long)(x) | ||
30 | |||
31 | #define ORE_DBGMSG2(M...) do {} while (0) | ||
32 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ | ||
33 | |||
34 | /* Calculate the component order in a stripe, e.g. the logical data unit | ||
35 | * address within the stripe of @dev given the @par_dev of this stripe. | ||
36 | */ | ||
37 | static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, | ||
38 | unsigned par_dev, unsigned dev) | ||
39 | { | ||
40 | unsigned first_dev = dev - dev % devs_in_group; | ||
41 | |||
42 | dev -= first_dev; | ||
43 | par_dev -= first_dev; | ||
44 | |||
45 | if (devs_in_group == par_dev) /* The raid 0 case */ | ||
46 | return dev / mirrors_p1; | ||
47 | /* raid4/5/6 case */ | ||
48 | return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / | ||
49 | mirrors_p1; | ||
50 | } | ||
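
The formula is easiest to check by example. This sketch applies the same arithmetic as _dev_order() to an assumed 6-device RAID5 group (5 data + 1 parity, mirrors_p1 == 1) with parity on device 2 for the current stripe:

	#include <stdio.h>

	static unsigned dev_order(unsigned devs_in_group, unsigned mirrors_p1,
				  unsigned par_dev, unsigned dev)
	{
		return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
			mirrors_p1;
	}

	int main(void)
	{
		unsigned par_dev = 2, dev;

		for (dev = 0; dev < 6; dev++) {
			if (dev == par_dev)
				printf("dev %u: parity\n", dev);
			else	/* devs 3,4,5,0,1 hold data units 0,1,2,3,4 */
				printf("dev %u: data unit %u\n", dev,
				       dev_order(6, 1, par_dev, dev));
		}
		return 0;
	}
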
51 | |||
52 | /* ore_raid.c stuff needed by ore.c */ | ||
53 | int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); | ||
54 | void _ore_free_raid_stuff(struct ore_io_state *ios); | ||
55 | |||
56 | void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, | ||
57 | bool not_last); | ||
58 | int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, | ||
59 | struct ore_per_dev_state *per_dev, unsigned cur_len); | ||
60 | void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
61 | struct ore_striping_info *si, struct page *page); | ||
62 | static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, | ||
63 | struct ore_striping_info *si, struct page *page) | ||
64 | { | ||
65 | if (!sp2d) /* Inline the fast path */ | ||
66 | return; /* There is no raid stuff */ | ||
67 | _ore_add_stripe_page(sp2d, si, page); | ||
68 | } | ||
69 | |||
70 | /* ore.c stuff needed by ore_raid.c */ | ||
71 | int _ore_get_io_state(struct ore_layout *layout, | ||
72 | struct ore_components *oc, unsigned numdevs, | ||
73 | unsigned sgs_per_dev, unsigned num_par_pages, | ||
74 | struct ore_io_state **pios); | ||
75 | int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, | ||
76 | unsigned pgbase, struct page **pages, | ||
77 | struct ore_per_dev_state *per_dev, int cur_len); | ||
78 | int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); | ||
79 | int ore_io_execute(struct ore_io_state *ios); | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 274894053b02..057b237b8b69 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -266,7 +266,7 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) | |||
266 | struct ore_io_state *ios; | 266 | struct ore_io_state *ios; |
267 | int ret; | 267 | int ret; |
268 | 268 | ||
269 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 269 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
270 | if (unlikely(ret)) { | 270 | if (unlikely(ret)) { |
271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
272 | return ret; | 272 | return ret; |
@@ -321,7 +321,7 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | |||
321 | struct ore_io_state *ios; | 321 | struct ore_io_state *ios; |
322 | int ret; | 322 | int ret; |
323 | 323 | ||
324 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 324 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
325 | if (unlikely(ret)) { | 325 | if (unlikely(ret)) { |
326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); | 326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
327 | return ret; | 327 | return ret; |
@@ -355,12 +355,12 @@ static const struct export_operations exofs_export_ops; | |||
355 | /* | 355 | /* |
356 | * Write the superblock to the OSD | 356 | * Write the superblock to the OSD |
357 | */ | 357 | */ |
358 | int exofs_sync_fs(struct super_block *sb, int wait) | 358 | static int exofs_sync_fs(struct super_block *sb, int wait) |
359 | { | 359 | { |
360 | struct exofs_sb_info *sbi; | 360 | struct exofs_sb_info *sbi; |
361 | struct exofs_fscb *fscb; | 361 | struct exofs_fscb *fscb; |
362 | struct ore_comp one_comp; | 362 | struct ore_comp one_comp; |
363 | struct ore_components comps; | 363 | struct ore_components oc; |
364 | struct ore_io_state *ios; | 364 | struct ore_io_state *ios; |
365 | int ret = -ENOMEM; | 365 | int ret = -ENOMEM; |
366 | 366 | ||
@@ -378,9 +378,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
378 | * the writeable info is set in exofs_sbi_write_stats() above. | 378 | * the writeable info is set in exofs_sbi_write_stats() above. |
379 | */ | 379 | */ |
380 | 380 | ||
381 | exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); | 381 | exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); |
382 | 382 | ||
383 | ret = ore_get_io_state(&sbi->layout, &comps, &ios); | 383 | ret = ore_get_io_state(&sbi->layout, &oc, &ios); |
384 | if (unlikely(ret)) | 384 | if (unlikely(ret)) |
385 | goto out; | 385 | goto out; |
386 | 386 | ||
@@ -429,19 +429,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, | |||
429 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); | 429 | msg, dev_path ?: "", odi->osdname, _LLU(pid)); |
430 | } | 430 | } |
431 | 431 | ||
432 | void exofs_free_sbi(struct exofs_sb_info *sbi) | 432 | static void exofs_free_sbi(struct exofs_sb_info *sbi) |
433 | { | 433 | { |
434 | while (sbi->comps.numdevs) { | 434 | unsigned numdevs = sbi->oc.numdevs; |
435 | int i = --sbi->comps.numdevs; | 435 | |
436 | struct osd_dev *od = sbi->comps.ods[i]; | 436 | while (numdevs) { |
437 | unsigned i = --numdevs; | ||
438 | struct osd_dev *od = ore_comp_dev(&sbi->oc, i); | ||
437 | 439 | ||
438 | if (od) { | 440 | if (od) { |
439 | sbi->comps.ods[i] = NULL; | 441 | ore_comp_set_dev(&sbi->oc, i, NULL); |
440 | osduld_put_device(od); | 442 | osduld_put_device(od); |
441 | } | 443 | } |
442 | } | 444 | } |
443 | if (sbi->comps.ods != sbi->_min_one_dev) | 445 | kfree(sbi->oc.ods); |
444 | kfree(sbi->comps.ods); | ||
445 | kfree(sbi); | 446 | kfree(sbi); |
446 | } | 447 | } |
447 | 448 | ||
@@ -468,7 +469,7 @@ static void exofs_put_super(struct super_block *sb) | |||
468 | msecs_to_jiffies(100)); | 469 | msecs_to_jiffies(100)); |
469 | } | 470 | } |
470 | 471 | ||
471 | _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], | 472 | _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), |
472 | sbi->one_comp.obj.partition); | 473 | sbi->one_comp.obj.partition); |
473 | 474 | ||
474 | bdi_destroy(&sbi->bdi); | 475 | bdi_destroy(&sbi->bdi); |
@@ -479,76 +480,20 @@ static void exofs_put_super(struct super_block *sb) | |||
479 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | 480 | static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, |
480 | struct exofs_device_table *dt) | 481 | struct exofs_device_table *dt) |
481 | { | 482 | { |
482 | u64 stripe_length; | 483 | int ret; |
483 | 484 | ||
484 | sbi->data_map.odm_num_comps = | 485 | sbi->layout.stripe_unit = |
485 | le32_to_cpu(dt->dt_data_map.cb_num_comps); | ||
486 | sbi->data_map.odm_stripe_unit = | ||
487 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); | 486 | le64_to_cpu(dt->dt_data_map.cb_stripe_unit); |
488 | sbi->data_map.odm_group_width = | 487 | sbi->layout.group_width = |
489 | le32_to_cpu(dt->dt_data_map.cb_group_width); | 488 | le32_to_cpu(dt->dt_data_map.cb_group_width); |
490 | sbi->data_map.odm_group_depth = | 489 | sbi->layout.group_depth = |
491 | le32_to_cpu(dt->dt_data_map.cb_group_depth); | 490 | le32_to_cpu(dt->dt_data_map.cb_group_depth); |
492 | sbi->data_map.odm_mirror_cnt = | 491 | sbi->layout.mirrors_p1 = |
493 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); | 492 | le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; |
494 | sbi->data_map.odm_raid_algorithm = | 493 | sbi->layout.raid_algorithm = |
495 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); | 494 | le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); |
496 | 495 | ||
497 | /* FIXME: Only raid0 for now. if not so, do not mount */ | 496 | ret = ore_verify_layout(numdevs, &sbi->layout); |
498 | if (sbi->data_map.odm_num_comps != numdevs) { | ||
499 | EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", | ||
500 | sbi->data_map.odm_num_comps, numdevs); | ||
501 | return -EINVAL; | ||
502 | } | ||
503 | if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
504 | EXOFS_ERR("Only RAID_0 for now\n"); | ||
505 | return -EINVAL; | ||
506 | } | ||
507 | if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { | ||
508 | EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", | ||
509 | numdevs, sbi->data_map.odm_mirror_cnt); | ||
510 | return -EINVAL; | ||
511 | } | ||
512 | |||
513 | if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { | ||
514 | EXOFS_ERR("Stripe Unit(0x%llx)" | ||
515 | " must be Multples of PAGE_SIZE(0x%lx)\n", | ||
516 | _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); | ||
517 | return -EINVAL; | ||
518 | } | ||
519 | |||
520 | sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; | ||
521 | sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; | ||
522 | |||
523 | if (sbi->data_map.odm_group_width) { | ||
524 | sbi->layout.group_width = sbi->data_map.odm_group_width; | ||
525 | sbi->layout.group_depth = sbi->data_map.odm_group_depth; | ||
526 | if (!sbi->layout.group_depth) { | ||
527 | EXOFS_ERR("group_depth == 0 && group_width != 0\n"); | ||
528 | return -EINVAL; | ||
529 | } | ||
530 | sbi->layout.group_count = sbi->data_map.odm_num_comps / | ||
531 | sbi->layout.mirrors_p1 / | ||
532 | sbi->data_map.odm_group_width; | ||
533 | } else { | ||
534 | if (sbi->data_map.odm_group_depth) { | ||
535 | printk(KERN_NOTICE "Warning: group_depth ignored " | ||
536 | "group_width == 0 && group_depth == %d\n", | ||
537 | sbi->data_map.odm_group_depth); | ||
538 | sbi->data_map.odm_group_depth = 0; | ||
539 | } | ||
540 | sbi->layout.group_width = sbi->data_map.odm_num_comps / | ||
541 | sbi->layout.mirrors_p1; | ||
542 | sbi->layout.group_depth = -1; | ||
543 | sbi->layout.group_count = 1; | ||
544 | } | ||
545 | |||
546 | stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; | ||
547 | if (stripe_length >= (1ULL << 32)) { | ||
548 | EXOFS_ERR("Total Stripe length(0x%llx)" | ||
549 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | 497 | ||
553 | EXOFS_DBGMSG("exofs: layout: " | 498 | EXOFS_DBGMSG("exofs: layout: " |
554 | "num_comps=%u stripe_unit=0x%x group_width=%u " | 499 | "num_comps=%u stripe_unit=0x%x group_width=%u " |
@@ -558,8 +503,8 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | |||
558 | sbi->layout.group_width, | 503 | sbi->layout.group_width, |
559 | _LLU(sbi->layout.group_depth), | 504 | _LLU(sbi->layout.group_depth), |
560 | sbi->layout.mirrors_p1, | 505 | sbi->layout.mirrors_p1, |
561 | sbi->data_map.odm_raid_algorithm); | 506 | sbi->layout.raid_algorithm); |
562 | return 0; | 507 | return ret; |
563 | } | 508 | } |
564 | 509 | ||
565 | static unsigned __ra_pages(struct ore_layout *layout) | 510 | static unsigned __ra_pages(struct ore_layout *layout) |
@@ -605,12 +550,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, | |||
605 | return !(odi->systemid_len || odi->osdname_len); | 550 | return !(odi->systemid_len || odi->osdname_len); |
606 | } | 551 | } |
607 | 552 | ||
553 | int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, | ||
554 | struct exofs_dev **peds) | ||
555 | { | ||
556 | struct __alloc_ore_devs_and_exofs_devs { | ||
557 | /* Twice bigger table: See exofs_init_comps() and comment at | ||
558 | * exofs_read_lookup_dev_table() | ||
559 | */ | ||
560 | struct ore_dev *oreds[numdevs * 2 - 1]; | ||
561 | struct exofs_dev eds[numdevs]; | ||
562 | } *aoded; | ||
563 | struct exofs_dev *eds; | ||
564 | unsigned i; | ||
565 | |||
566 | aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); | ||
567 | if (unlikely(!aoded)) { | ||
568 | EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", | ||
569 | numdevs); | ||
570 | return -ENOMEM; | ||
571 | } | ||
572 | |||
573 | sbi->oc.ods = aoded->oreds; | ||
574 | *peds = eds = aoded->eds; | ||
575 | for (i = 0; i < numdevs; ++i) | ||
576 | aoded->oreds[i] = &eds[i].ored; | ||
577 | return 0; | ||
578 | } | ||
579 | |||
608 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | 580 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, |
609 | struct osd_dev *fscb_od, | 581 | struct osd_dev *fscb_od, |
610 | unsigned table_count) | 582 | unsigned table_count) |
611 | { | 583 | { |
612 | struct ore_comp comp; | 584 | struct ore_comp comp; |
613 | struct exofs_device_table *dt; | 585 | struct exofs_device_table *dt; |
586 | struct exofs_dev *eds; | ||
614 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + | 587 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + |
615 | sizeof(*dt); | 588 | sizeof(*dt); |
616 | unsigned numdevs, i; | 589 | unsigned numdevs, i; |
@@ -623,7 +596,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
623 | return -ENOMEM; | 596 | return -ENOMEM; |
624 | } | 597 | } |
625 | 598 | ||
626 | sbi->comps.numdevs = 0; | 599 | sbi->oc.numdevs = 0; |
627 | 600 | ||
628 | comp.obj.partition = sbi->one_comp.obj.partition; | 601 | comp.obj.partition = sbi->one_comp.obj.partition; |
629 | comp.obj.id = EXOFS_DEVTABLE_ID; | 602 | comp.obj.id = EXOFS_DEVTABLE_ID; |
@@ -647,20 +620,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
647 | if (unlikely(ret)) | 620 | if (unlikely(ret)) |
648 | goto out; | 621 | goto out; |
649 | 622 | ||
650 | if (likely(numdevs > 1)) { | 623 | ret = __alloc_dev_table(sbi, numdevs, &eds); |
651 | unsigned size = numdevs * sizeof(sbi->comps.ods[0]); | 624 | if (unlikely(ret)) |
652 | 625 | goto out; | |
653 | /* Twice bigger table: See exofs_init_comps() and below | 626 | /* exofs round-robins the device table view according to inode |
654 | * comment | 627 | * number. We hold a twice-bigger table, hence inodes can point
655 | */ | 628 | * to any device and have a sequential view of the table |
656 | sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); | 629 | * starting at this device. See exofs_init_comps() |
657 | if (unlikely(!sbi->comps.ods)) { | 630 | */ |
658 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | 631 | memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], |
659 | numdevs); | 632 | (numdevs - 1) * sizeof(sbi->oc.ods[0])); |
660 | ret = -ENOMEM; | ||
661 | goto out; | ||
662 | } | ||
663 | } | ||
664 | 633 | ||
665 | for (i = 0; i < numdevs; i++) { | 634 | for (i = 0; i < numdevs; i++) { |
666 | struct exofs_fscb fscb; | 635 | struct exofs_fscb fscb; |
@@ -676,13 +645,16 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
676 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", | 645 | printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", |
677 | i, odi.osdname); | 646 | i, odi.osdname); |
678 | 647 | ||
648 | /* the exofs id is currently the table index */ | ||
649 | eds[i].did = i; | ||
650 | |||
679 | /* On all devices the device table is identical. The user can | 651 | /* On all devices the device table is identical. The user can |
680 | * specify any one of the participating devices on the command | 652 | * specify any one of the participating devices on the command |
681 | * line. We always keep them in device-table order. | 653 | * line. We always keep them in device-table order. |
682 | */ | 654 | */ |
683 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { | 655 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { |
684 | sbi->comps.ods[i] = fscb_od; | 656 | eds[i].ored.od = fscb_od; |
685 | ++sbi->comps.numdevs; | 657 | ++sbi->oc.numdevs; |
686 | fscb_od = NULL; | 658 | fscb_od = NULL; |
687 | continue; | 659 | continue; |
688 | } | 660 | } |
@@ -695,8 +667,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
695 | goto out; | 667 | goto out; |
696 | } | 668 | } |
697 | 669 | ||
698 | sbi->comps.ods[i] = od; | 670 | eds[i].ored.od = od; |
699 | ++sbi->comps.numdevs; | 671 | ++sbi->oc.numdevs; |
700 | 672 | ||
701 | /* Read the fscb of the other devices to make sure the FS | 673 | /* Read the fscb of the other devices to make sure the FS |
702 | * partition is there. | 674 | * partition is there. |
@@ -718,21 +690,10 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, | |||
718 | 690 | ||
719 | out: | 691 | out: |
720 | kfree(dt); | 692 | kfree(dt); |
721 | if (likely(!ret)) { | 693 | if (unlikely(fscb_od && !ret)) { |
722 | unsigned numdevs = sbi->comps.numdevs; | ||
723 | |||
724 | if (unlikely(fscb_od)) { | ||
725 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); | 694 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); |
726 | osduld_put_device(fscb_od); | 695 | osduld_put_device(fscb_od); |
727 | return -EINVAL; | 696 | return -EINVAL; |
728 | } | ||
729 | /* exofs round-robins the device table view according to inode | ||
730 | * number. We hold a: twice bigger table hence inodes can point | ||
731 | * to any device and have a sequential view of the table | ||
732 | * starting at this device. See exofs_init_comps() | ||
733 | */ | ||
734 | for (i = 0; i < numdevs - 1; ++i) | ||
735 | sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; | ||
736 | } | 697 | } |
737 | return ret; | 698 | return ret; |
738 | } | 699 | } |
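
The "twice bigger" table the comment above refers to works like this sketch (illustrative names): with the first numdevs - 1 entries mirrored after the table, an inode whose view starts at any device d can treat &ods[d] as a plain sequential array of numdevs devices, with no modulo on each access.

	#include <stdio.h>

	int main(void)
	{
		unsigned numdevs = 4, i, start = 2;	/* e.g. inode# % numdevs == 2 */
		const char *ods[2 * 4 - 1] = { "odA", "odB", "odC", "odD" };

		for (i = 0; i < numdevs - 1; i++)
			ods[numdevs + i] = ods[i];	/* mirror all but the last */

		for (i = 0; i < numdevs; i++)		/* sequential view from 2 */
			printf("%s ", ods[start + i]);
		printf("\n");				/* prints: odC odD odA odB */
		return 0;
	}
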
@@ -783,10 +744,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
783 | sbi->one_comp.obj.partition = opts->pid; | 744 | sbi->one_comp.obj.partition = opts->pid; |
784 | sbi->one_comp.obj.id = 0; | 745 | sbi->one_comp.obj.id = 0; |
785 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); | 746 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); |
786 | sbi->comps.numdevs = 1; | 747 | sbi->oc.numdevs = 1; |
787 | sbi->comps.single_comp = EC_SINGLE_COMP; | 748 | sbi->oc.single_comp = EC_SINGLE_COMP; |
788 | sbi->comps.comps = &sbi->one_comp; | 749 | sbi->oc.comps = &sbi->one_comp; |
789 | sbi->comps.ods = sbi->_min_one_dev; | ||
790 | 750 | ||
791 | /* fill in some other data by hand */ | 751 | /* fill in some other data by hand */ |
792 | memset(sb->s_id, 0, sizeof(sb->s_id)); | 752 | memset(sb->s_id, 0, sizeof(sb->s_id)); |
@@ -835,7 +795,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
835 | if (unlikely(ret)) | 795 | if (unlikely(ret)) |
836 | goto free_sbi; | 796 | goto free_sbi; |
837 | } else { | 797 | } else { |
838 | sbi->comps.ods[0] = od; | 798 | struct exofs_dev *eds; |
799 | |||
800 | ret = __alloc_dev_table(sbi, 1, &eds); | ||
801 | if (unlikely(ret)) | ||
802 | goto free_sbi; | ||
803 | |||
804 | ore_comp_set_dev(&sbi->oc, 0, od); | ||
839 | } | 805 | } |
840 | 806 | ||
841 | __sbi_read_stats(sbi); | 807 | __sbi_read_stats(sbi); |
@@ -875,7 +841,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
875 | goto free_sbi; | 841 | goto free_sbi; |
876 | } | 842 | } |
877 | 843 | ||
878 | _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], | 844 | _exofs_print_device("Mounting", opts->dev_name, |
845 | ore_comp_dev(&sbi->oc, 0), | ||
879 | sbi->one_comp.obj.partition); | 846 | sbi->one_comp.obj.partition); |
880 | return 0; | 847 | return 0; |
881 | 848 | ||
@@ -924,7 +891,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
924 | uint64_t used = ULLONG_MAX; | 891 | uint64_t used = ULLONG_MAX; |
925 | int ret; | 892 | int ret; |
926 | 893 | ||
927 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); | 894 | ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); |
928 | if (ret) { | 895 | if (ret) { |
929 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); | 896 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); |
930 | return ret; | 897 | return ret; |
@@ -981,7 +948,7 @@ static const struct super_operations exofs_sops = { | |||
981 | * EXPORT OPERATIONS | 948 | * EXPORT OPERATIONS |
982 | *****************************************************************************/ | 949 | *****************************************************************************/ |
983 | 950 | ||
984 | struct dentry *exofs_get_parent(struct dentry *child) | 951 | static struct dentry *exofs_get_parent(struct dentry *child) |
985 | { | 952 | { |
986 | unsigned long ino = exofs_parent_ino(child); | 953 | unsigned long ino = exofs_parent_ino(child); |
987 | 954 | ||
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 5d979b4347b0..c922adc8ef41 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c | |||
@@ -46,28 +46,30 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, | |||
46 | value, size, flags); | 46 | value, size, flags); |
47 | } | 47 | } |
48 | 48 | ||
49 | int | 49 | int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
50 | ext2_init_security(struct inode *inode, struct inode *dir, | 50 | void *fs_info) |
51 | const struct qstr *qstr) | ||
52 | { | 51 | { |
53 | int err; | 52 | const struct xattr *xattr; |
54 | size_t len; | 53 | int err = 0; |
55 | void *value; | ||
56 | char *name; | ||
57 | 54 | ||
58 | err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); | 55 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
59 | if (err) { | 56 | err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, |
60 | if (err == -EOPNOTSUPP) | 57 | xattr->name, xattr->value, |
61 | return 0; | 58 | xattr->value_len, 0); |
62 | return err; | 59 | if (err < 0) |
60 | break; | ||
63 | } | 61 | } |
64 | err = ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, | ||
65 | name, value, len, 0); | ||
66 | kfree(name); | ||
67 | kfree(value); | ||
68 | return err; | 62 | return err; |
69 | } | 63 | } |
70 | 64 | ||
65 | int | ||
66 | ext2_init_security(struct inode *inode, struct inode *dir, | ||
67 | const struct qstr *qstr) | ||
68 | { | ||
69 | return security_inode_init_security(inode, dir, qstr, | ||
70 | &ext2_initxattrs, NULL); | ||
71 | } | ||
72 | |||
71 | const struct xattr_handler ext2_xattr_security_handler = { | 73 | const struct xattr_handler ext2_xattr_security_handler = { |
72 | .prefix = XATTR_SECURITY_PREFIX, | 74 | .prefix = XATTR_SECURITY_PREFIX, |
73 | .list = ext2_xattr_security_list, | 75 | .list = ext2_xattr_security_list, |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 04da6acde85d..12661e1deedd 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
@@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode, | |||
1134 | return bh; | 1134 | return bh; |
1135 | if (buffer_uptodate(bh)) | 1135 | if (buffer_uptodate(bh)) |
1136 | return bh; | 1136 | return bh; |
1137 | ll_rw_block(READ_META, 1, &bh); | 1137 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
1138 | wait_on_buffer(bh); | 1138 | wait_on_buffer(bh); |
1139 | if (buffer_uptodate(bh)) | 1139 | if (buffer_uptodate(bh)) |
1140 | return bh; | 1140 | return bh; |
@@ -2807,7 +2807,7 @@ make_io: | |||
2807 | trace_ext3_load_inode(inode); | 2807 | trace_ext3_load_inode(inode); |
2808 | get_bh(bh); | 2808 | get_bh(bh); |
2809 | bh->b_end_io = end_buffer_read_sync; | 2809 | bh->b_end_io = end_buffer_read_sync; |
2810 | submit_bh(READ_META, bh); | 2810 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
2811 | wait_on_buffer(bh); | 2811 | wait_on_buffer(bh); |
2812 | if (!buffer_uptodate(bh)) { | 2812 | if (!buffer_uptodate(bh)) { |
2813 | ext3_error(inode->i_sb, "ext3_get_inode_loc", | 2813 | ext3_error(inode->i_sb, "ext3_get_inode_loc", |
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6e18a0b7750d..0629e09f6511 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
@@ -922,7 +922,8 @@ restart: | |||
922 | bh = ext3_getblk(NULL, dir, b++, 0, &err); | 922 | bh = ext3_getblk(NULL, dir, b++, 0, &err); |
923 | bh_use[ra_max] = bh; | 923 | bh_use[ra_max] = bh; |
924 | if (bh) | 924 | if (bh) |
925 | ll_rw_block(READ_META, 1, &bh); | 925 | ll_rw_block(READ | REQ_META | REQ_PRIO, |
926 | 1, &bh); | ||
926 | } | 927 | } |
927 | } | 928 | } |
928 | if ((bh = bh_use[ra_ptr++]) == NULL) | 929 | if ((bh = bh_use[ra_ptr++]) == NULL) |
@@ -2209,9 +2210,11 @@ static int ext3_symlink (struct inode * dir, | |||
2209 | /* | 2210 | /* |
2210 | * For non-fast symlinks, we just allocate inode and put it on | 2211 | * For non-fast symlinks, we just allocate inode and put it on |
2211 | * orphan list in the first transaction => we need bitmap, | 2212 | * orphan list in the first transaction => we need bitmap, |
2212 | * group descriptor, sb, inode block, quota blocks. | 2213 | * group descriptor, sb, inode block, quota blocks, and |
2214 | * possibly selinux xattr blocks. | ||
2213 | */ | 2215 | */ |
2214 | credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | 2216 | credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + |
2217 | EXT3_XATTR_TRANS_BLOCKS; | ||
2215 | } else { | 2218 | } else { |
2216 | /* | 2219 | /* |
2217 | * Fast symlink. We have to add entry to directory | 2220 | * Fast symlink. We have to add entry to directory |
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index b8d9f83aa5c5..3c218b8a51d4 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c | |||
@@ -48,28 +48,32 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, | |||
48 | name, value, size, flags); | 48 | name, value, size, flags); |
49 | } | 49 | } |
50 | 50 | ||
51 | int | 51 | int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
52 | ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, | 52 | void *fs_info) |
53 | const struct qstr *qstr) | ||
54 | { | 53 | { |
55 | int err; | 54 | const struct xattr *xattr; |
56 | size_t len; | 55 | handle_t *handle = fs_info; |
57 | void *value; | 56 | int err = 0; |
58 | char *name; | ||
59 | 57 | ||
60 | err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); | 58 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
61 | if (err) { | 59 | err = ext3_xattr_set_handle(handle, inode, |
62 | if (err == -EOPNOTSUPP) | 60 | EXT3_XATTR_INDEX_SECURITY, |
63 | return 0; | 61 | xattr->name, xattr->value, |
64 | return err; | 62 | xattr->value_len, 0); |
63 | if (err < 0) | ||
64 | break; | ||
65 | } | 65 | } |
66 | err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY, | ||
67 | name, value, len, 0); | ||
68 | kfree(name); | ||
69 | kfree(value); | ||
70 | return err; | 66 | return err; |
71 | } | 67 | } |
72 | 68 | ||
69 | int | ||
70 | ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir, | ||
71 | const struct qstr *qstr) | ||
72 | { | ||
73 | return security_inode_init_security(inode, dir, qstr, | ||
74 | &ext3_initxattrs, handle); | ||
75 | } | ||
76 | |||
73 | const struct xattr_handler ext3_xattr_security_handler = { | 77 | const struct xattr_handler ext3_xattr_security_handler = { |
74 | .prefix = XATTR_SECURITY_PREFIX, | 78 | .prefix = XATTR_SECURITY_PREFIX, |
75 | .list = ext3_xattr_security_list, | 79 | .list = ext3_xattr_security_list, |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e717dfd2f2b4..b7d7bd0f066e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -175,6 +175,7 @@ struct mpage_da_data { | |||
175 | */ | 175 | */ |
176 | #define EXT4_IO_END_UNWRITTEN 0x0001 | 176 | #define EXT4_IO_END_UNWRITTEN 0x0001 |
177 | #define EXT4_IO_END_ERROR 0x0002 | 177 | #define EXT4_IO_END_ERROR 0x0002 |
178 | #define EXT4_IO_END_QUEUED 0x0004 | ||
178 | 179 | ||
179 | struct ext4_io_page { | 180 | struct ext4_io_page { |
180 | struct page *p_page; | 181 | struct page *p_page; |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index bb85757689b6..5802fa1dab18 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode) | |||
289 | 289 | ||
290 | static inline int ext4_should_writeback_data(struct inode *inode) | 290 | static inline int ext4_should_writeback_data(struct inode *inode) |
291 | { | 291 | { |
292 | if (!S_ISREG(inode->i_mode)) | ||
293 | return 0; | ||
294 | if (EXT4_JOURNAL(inode) == NULL) | 292 | if (EXT4_JOURNAL(inode) == NULL) |
295 | return 1; | 293 | return 1; |
294 | if (!S_ISREG(inode->i_mode)) | ||
295 | return 0; | ||
296 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) | 296 | if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) |
297 | return 0; | 297 | return 0; |
298 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) | 298 | if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index e4095e988eba..b9548f477bb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -224,53 +224,8 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) | |||
224 | maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; | 224 | maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; |
225 | else | 225 | else |
226 | maxbytes = inode->i_sb->s_maxbytes; | 226 | maxbytes = inode->i_sb->s_maxbytes; |
227 | mutex_lock(&inode->i_mutex); | ||
228 | switch (origin) { | ||
229 | case SEEK_END: | ||
230 | offset += inode->i_size; | ||
231 | break; | ||
232 | case SEEK_CUR: | ||
233 | if (offset == 0) { | ||
234 | mutex_unlock(&inode->i_mutex); | ||
235 | return file->f_pos; | ||
236 | } | ||
237 | offset += file->f_pos; | ||
238 | break; | ||
239 | case SEEK_DATA: | ||
240 | /* | ||
241 | * In the generic case the entire file is data, so as long as | ||
242 | * offset isn't at the end of the file then the offset is data. | ||
243 | */ | ||
244 | if (offset >= inode->i_size) { | ||
245 | mutex_unlock(&inode->i_mutex); | ||
246 | return -ENXIO; | ||
247 | } | ||
248 | break; | ||
249 | case SEEK_HOLE: | ||
250 | /* | ||
251 | * There is a virtual hole at the end of the file, so as long as | ||
252 | * offset isn't i_size or larger, return i_size. | ||
253 | */ | ||
254 | if (offset >= inode->i_size) { | ||
255 | mutex_unlock(&inode->i_mutex); | ||
256 | return -ENXIO; | ||
257 | } | ||
258 | offset = inode->i_size; | ||
259 | break; | ||
260 | } | ||
261 | |||
262 | if (offset < 0 || offset > maxbytes) { | ||
263 | mutex_unlock(&inode->i_mutex); | ||
264 | return -EINVAL; | ||
265 | } | ||
266 | |||
267 | if (offset != file->f_pos) { | ||
268 | file->f_pos = offset; | ||
269 | file->f_version = 0; | ||
270 | } | ||
271 | mutex_unlock(&inode->i_mutex); | ||
272 | 227 | ||
273 | return offset; | 228 | return generic_file_llseek_size(file, offset, origin, maxbytes); |
274 | } | 229 | } |
275 | 230 | ||
276 | const struct file_operations ext4_file_operations = { | 231 | const struct file_operations ext4_file_operations = { |
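
The deleted SEEK_DATA/SEEK_HOLE cases are exactly what generic_file_llseek_size() now provides: the whole file is data, and there is a single virtual hole at i_size. A user-space sketch of those semantics (assumes SEEK_HOLE support in the kernel and libc; the file path is hypothetical):

	#define _GNU_SOURCE	/* for SEEK_HOLE */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/somefile", O_RDONLY);	/* hypothetical path */
		off_t hole;

		if (fd < 0)
			return 1;
		/* With no finer-grained hole tracking this is simply i_size. */
		hole = lseek(fd, 0, SEEK_HOLE);
		printf("first hole at %lld\n", (long long)hole);
		close(fd);
		return 0;
	}
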
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b8602cde5b5a..0962642119c0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -800,12 +800,17 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
800 | } | 800 | } |
801 | 801 | ||
802 | retry: | 802 | retry: |
803 | if (rw == READ && ext4_should_dioread_nolock(inode)) | 803 | if (rw == READ && ext4_should_dioread_nolock(inode)) { |
804 | if (unlikely(!list_empty(&ei->i_completed_io_list))) { | ||
805 | mutex_lock(&inode->i_mutex); | ||
806 | ext4_flush_completed_IO(inode); | ||
807 | mutex_unlock(&inode->i_mutex); | ||
808 | } | ||
804 | ret = __blockdev_direct_IO(rw, iocb, inode, | 809 | ret = __blockdev_direct_IO(rw, iocb, inode, |
805 | inode->i_sb->s_bdev, iov, | 810 | inode->i_sb->s_bdev, iov, |
806 | offset, nr_segs, | 811 | offset, nr_segs, |
807 | ext4_get_block, NULL, NULL, 0); | 812 | ext4_get_block, NULL, NULL, 0); |
808 | else { | 813 | } else { |
809 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | 814 | ret = blockdev_direct_IO(rw, iocb, inode, iov, |
810 | offset, nr_segs, ext4_get_block); | 815 | offset, nr_segs, ext4_get_block); |
811 | 816 | ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d47264cafee0..986e2388f031 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -120,6 +120,9 @@ void ext4_evict_inode(struct inode *inode) | |||
120 | int err; | 120 | int err; |
121 | 121 | ||
122 | trace_ext4_evict_inode(inode); | 122 | trace_ext4_evict_inode(inode); |
123 | |||
124 | ext4_ioend_wait(inode); | ||
125 | |||
123 | if (inode->i_nlink) { | 126 | if (inode->i_nlink) { |
124 | /* | 127 | /* |
125 | * When journalling data dirty buffers are tracked only in the | 128 | * When journalling data dirty buffers are tracked only in the |
@@ -644,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | |||
644 | return bh; | 647 | return bh; |
645 | if (buffer_uptodate(bh)) | 648 | if (buffer_uptodate(bh)) |
646 | return bh; | 649 | return bh; |
647 | ll_rw_block(READ_META, 1, &bh); | 650 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
648 | wait_on_buffer(bh); | 651 | wait_on_buffer(bh); |
649 | if (buffer_uptodate(bh)) | 652 | if (buffer_uptodate(bh)) |
650 | return bh; | 653 | return bh; |
@@ -983,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file, | |||
983 | from = pos & (PAGE_CACHE_SIZE - 1); | 986 | from = pos & (PAGE_CACHE_SIZE - 1); |
984 | to = from + len; | 987 | to = from + len; |
985 | 988 | ||
989 | BUG_ON(!ext4_handle_valid(handle)); | ||
990 | |||
986 | if (copied < len) { | 991 | if (copied < len) { |
987 | if (!PageUptodate(page)) | 992 | if (!PageUptodate(page)) |
988 | copied = 0; | 993 | copied = 0; |
@@ -1283,7 +1288,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1283 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) | 1288 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) |
1284 | err = ext4_bio_write_page(&io_submit, page, | 1289 | err = ext4_bio_write_page(&io_submit, page, |
1285 | len, mpd->wbc); | 1290 | len, mpd->wbc); |
1286 | else | 1291 | else if (buffer_uninit(page_bufs)) { |
1292 | ext4_set_bh_endio(page_bufs, inode); | ||
1293 | err = block_write_full_page_endio(page, | ||
1294 | noalloc_get_block_write, | ||
1295 | mpd->wbc, ext4_end_io_buffer_write); | ||
1296 | } else | ||
1287 | err = block_write_full_page(page, | 1297 | err = block_write_full_page(page, |
1288 | noalloc_get_block_write, mpd->wbc); | 1298 | noalloc_get_block_write, mpd->wbc); |
1289 | 1299 | ||
@@ -1699,6 +1709,8 @@ static int __ext4_journalled_writepage(struct page *page, | |||
1699 | goto out; | 1709 | goto out; |
1700 | } | 1710 | } |
1701 | 1711 | ||
1712 | BUG_ON(!ext4_handle_valid(handle)); | ||
1713 | |||
1702 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, | 1714 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, |
1703 | do_journal_get_write_access); | 1715 | do_journal_get_write_access); |
1704 | 1716 | ||
@@ -2668,8 +2680,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
2668 | goto out; | 2680 | goto out; |
2669 | } | 2681 | } |
2670 | 2682 | ||
2671 | io_end->flag = EXT4_IO_END_UNWRITTEN; | 2683 | /* |
2684 | * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, | ||
2685 | * but being more careful is always safe for future changes.
2686 | */ | ||
2672 | inode = io_end->inode; | 2687 | inode = io_end->inode; |
2688 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
2689 | io_end->flag |= EXT4_IO_END_UNWRITTEN; | ||
2690 | atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); | ||
2691 | } | ||
2673 | 2692 | ||
2674 | /* Add the io_end to per-inode completed io list*/ | 2693 | /* Add the io_end to per-inode completed io list*/ |
2675 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | 2694 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); |
@@ -3279,7 +3298,7 @@ make_io: | |||
3279 | trace_ext4_load_inode(inode); | 3298 | trace_ext4_load_inode(inode); |
3280 | get_bh(bh); | 3299 | get_bh(bh); |
3281 | bh->b_end_io = end_buffer_read_sync; | 3300 | bh->b_end_io = end_buffer_read_sync; |
3282 | submit_bh(READ_META, bh); | 3301 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
3283 | wait_on_buffer(bh); | 3302 | wait_on_buffer(bh); |
3284 | if (!buffer_uptodate(bh)) { | 3303 | if (!buffer_uptodate(bh)) { |
3285 | EXT4_ERROR_INODE_BLOCK(inode, block, | 3304 | EXT4_ERROR_INODE_BLOCK(inode, block, |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 565a154e22d4..1c924faeb6c8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -922,7 +922,8 @@ restart: | |||
922 | bh = ext4_getblk(NULL, dir, b++, 0, &err); | 922 | bh = ext4_getblk(NULL, dir, b++, 0, &err); |
923 | bh_use[ra_max] = bh; | 923 | bh_use[ra_max] = bh; |
924 | if (bh) | 924 | if (bh) |
925 | ll_rw_block(READ_META, 1, &bh); | 925 | ll_rw_block(READ | REQ_META | REQ_PRIO, |
926 | 1, &bh); | ||
926 | } | 927 | } |
927 | } | 928 | } |
928 | if ((bh = bh_use[ra_ptr++]) == NULL) | 929 | if ((bh = bh_use[ra_ptr++]) == NULL) |
@@ -2253,9 +2254,11 @@ static int ext4_symlink(struct inode *dir, | |||
2253 | /* | 2254 | /* |
2254 | * For non-fast symlinks, we just allocate inode and put it on | 2255 | * For non-fast symlinks, we just allocate inode and put it on |
2255 | * orphan list in the first transaction => we need bitmap, | 2256 | * orphan list in the first transaction => we need bitmap, |
2256 | * group descriptor, sb, inode block, quota blocks. | 2257 | * group descriptor, sb, inode block, quota blocks, and |
2258 | * possibly SELinux xattr blocks. | ||
2257 | */ | 2259 | */ |
2258 | credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | 2260 | credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + |
2261 | EXT4_XATTR_TRANS_BLOCKS; | ||
2259 | } else { | 2262 | } else { |
2260 | /* | 2263 | /* |
2261 | * Fast symlink. We have to add entry to directory | 2264 | * Fast symlink. We have to add entry to directory |
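The credit bump pairs with the xattr_security.c conversion below: ext4_init_security() may now write a security xattr inside this same transaction, so the handle has to reserve for it up front. A hedged sketch of how the reservation is made (error handling trimmed):

    /* Sketch: credits for a non-fast symlink. The constant 4 covers
     * bitmap, group descriptor, sb and inode block, as the comment
     * says; EXT4_XATTR_TRANS_BLOCKS covers a possible security xattr. */
    int credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
                  EXT4_XATTR_TRANS_BLOCKS;
    handle_t *handle = ext4_journal_start(dir, credits);

    if (IS_ERR(handle))
            return PTR_ERR(handle);

Without the extra term, the xattr write could exhaust the handle's credits mid-transaction.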
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 430c401d0895..92f38ee13f8a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) | |||
142 | unsigned long flags; | 142 | unsigned long flags; |
143 | int ret; | 143 | int ret; |
144 | 144 | ||
145 | mutex_lock(&inode->i_mutex); | 145 | if (!mutex_trylock(&inode->i_mutex)) { |
146 | /* | ||
147 | * Requeue the work instead of waiting so that the work | ||
148 | * items queued after this can be processed. | ||
149 | */ | ||
150 | queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); | ||
151 | /* | ||
152 | * To prevent the ext4-dio-unwritten thread from keeping | ||
153 | * requeueing end_io requests and occupying cpu for too long, | ||
154 | * yield the cpu if it sees an end_io request that has already | ||
155 | * been requeued. | ||
156 | */ | ||
157 | if (io->flag & EXT4_IO_END_QUEUED) | ||
158 | yield(); | ||
159 | io->flag |= EXT4_IO_END_QUEUED; | ||
160 | return; | ||
161 | } | ||
146 | ret = ext4_end_io_nolock(io); | 162 | ret = ext4_end_io_nolock(io); |
147 | if (ret < 0) { | 163 | if (ret < 0) { |
148 | mutex_unlock(&inode->i_mutex); | 164 | mutex_unlock(&inode->i_mutex); |
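The worker cannot simply block on i_mutex: one inode holding the mutex would stall every later end_io item on the shared dio_unwritten_wq. The trylock/requeue/yield shape, condensed from the hunk (all names from the diff):

    if (!mutex_trylock(&inode->i_mutex)) {
            /* Requeue so items queued behind this one still run */
            queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
            /* Back off if this item was already requeued once, so the
             * worker does not spin against a long-held i_mutex. */
            if (io->flag & EXT4_IO_END_QUEUED)
                    yield();
            io->flag |= EXT4_IO_END_QUEUED;
            return;
    }

Note the flag is set after the check, so the first requeue is free and only repeat offenders yield.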
@@ -334,8 +350,10 @@ submit_and_retry: | |||
334 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && | 350 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && |
335 | (io_end->pages[io_end->num_io_pages-1] != io_page)) | 351 | (io_end->pages[io_end->num_io_pages-1] != io_page)) |
336 | goto submit_and_retry; | 352 | goto submit_and_retry; |
337 | if (buffer_uninit(bh)) | 353 | if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
338 | io->io_end->flag |= EXT4_IO_END_UNWRITTEN; | 354 | io_end->flag |= EXT4_IO_END_UNWRITTEN; |
355 | atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); | ||
356 | } | ||
339 | io->io_end->size += bh->b_size; | 357 | io->io_end->size += bh->b_size; |
340 | io->io_next_block++; | 358 | io->io_next_block++; |
341 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 359 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4687fea0c00f..44d0c8db2239 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -919,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head) | |||
919 | 919 | ||
920 | static void ext4_destroy_inode(struct inode *inode) | 920 | static void ext4_destroy_inode(struct inode *inode) |
921 | { | 921 | { |
922 | ext4_ioend_wait(inode); | ||
923 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { | 922 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { |
924 | ext4_msg(inode->i_sb, KERN_ERR, | 923 | ext4_msg(inode->i_sb, KERN_ERR, |
925 | "Inode %lu (%p): orphan list check failed!", | 924 | "Inode %lu (%p): orphan list check failed!", |
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 007c3bfbf094..34e4350dd4d9 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c | |||
@@ -48,28 +48,32 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, | |||
48 | name, value, size, flags); | 48 | name, value, size, flags); |
49 | } | 49 | } |
50 | 50 | ||
51 | int | 51 | int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
52 | ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, | 52 | void *fs_info) |
53 | const struct qstr *qstr) | ||
54 | { | 53 | { |
55 | int err; | 54 | const struct xattr *xattr; |
56 | size_t len; | 55 | handle_t *handle = fs_info; |
57 | void *value; | 56 | int err = 0; |
58 | char *name; | ||
59 | 57 | ||
60 | err = security_inode_init_security(inode, dir, qstr, &name, &value, &len); | 58 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
61 | if (err) { | 59 | err = ext4_xattr_set_handle(handle, inode, |
62 | if (err == -EOPNOTSUPP) | 60 | EXT4_XATTR_INDEX_SECURITY, |
63 | return 0; | 61 | xattr->name, xattr->value, |
64 | return err; | 62 | xattr->value_len, 0); |
63 | if (err < 0) | ||
64 | break; | ||
65 | } | 65 | } |
66 | err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, | ||
67 | name, value, len, 0); | ||
68 | kfree(name); | ||
69 | kfree(value); | ||
70 | return err; | 66 | return err; |
71 | } | 67 | } |
72 | 68 | ||
69 | int | ||
70 | ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, | ||
71 | const struct qstr *qstr) | ||
72 | { | ||
73 | return security_inode_init_security(inode, dir, qstr, | ||
74 | &ext4_initxattrs, handle); | ||
75 | } | ||
76 | |||
73 | const struct xattr_handler ext4_xattr_security_handler = { | 77 | const struct xattr_handler ext4_xattr_security_handler = { |
74 | .prefix = XATTR_SECURITY_PREFIX, | 78 | .prefix = XATTR_SECURITY_PREFIX, |
75 | .list = ext4_xattr_security_list, | 79 | .list = ext4_xattr_security_list, |
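This is the tree-wide conversion of security_inode_init_security() to a callback API (the ext2/ext3 hunks in the diffstat are the same change): instead of returning one name/value/len triple for the filesystem to write and free, the LSM passes a NULL-terminated array of xattrs to an initxattrs callback, with an opaque fs_info pointer (here the journal handle) riding along. A hedged sketch of the contract for some other filesystem; the myfs_* names are hypothetical:

    /* Hypothetical filesystem: persist each security xattr the LSM
     * produced. fs_info carries whatever context the fs handed to
     * security_inode_init_security(). */
    static int myfs_initxattrs(struct inode *inode,
                               const struct xattr *xattr_array,
                               void *fs_info)
    {
            const struct xattr *xattr;
            int err = 0;

            for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                    err = myfs_setxattr(inode, xattr->name, /* hypothetical */
                                        xattr->value, xattr->value_len);
                    if (err < 0)
                            break;
            }
            return err;
    }

    static int myfs_init_security(struct inode *inode, struct inode *dir,
                                  const struct qstr *qstr)
    {
            return security_inode_init_security(inode, dir, qstr,
                                                &myfs_initxattrs, NULL);
    }

For ext4 the payoff is that every xattr lands inside the caller's transaction, and the kfree(name)/kfree(value) boilerplate moves into the security layer.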
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4ad64732cbce..5efbd5d7701a 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
@@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, | |||
1231 | struct super_block *sb = dir->i_sb; | 1231 | struct super_block *sb = dir->i_sb; |
1232 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | 1232 | struct msdos_sb_info *sbi = MSDOS_SB(sb); |
1233 | struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ | 1233 | struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ |
1234 | struct msdos_dir_entry *de; | 1234 | struct msdos_dir_entry *uninitialized_var(de); |
1235 | int err, free_slots, i, nr_bhs; | 1235 | int err, free_slots, i, nr_bhs; |
1236 | loff_t pos, i_pos; | 1236 | loff_t pos, i_pos; |
1237 | 1237 | ||
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5942fec22c65..1726d7303047 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
@@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, | |||
1188 | out: | 1188 | out: |
1189 | /* UTF-8 doesn't provide FAT semantics */ | 1189 | /* UTF-8 doesn't provide FAT semantics */ |
1190 | if (!strcmp(opts->iocharset, "utf8")) { | 1190 | if (!strcmp(opts->iocharset, "utf8")) { |
1191 | fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" | 1191 | fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" |
1192 | " for FAT filesystems, filesystem will be " | 1192 | " for FAT filesystems, filesystem will be " |
1193 | "case sensitive!\n"); | 1193 | "case sensitive!"); |
1194 | } | 1194 | } |
1195 | 1195 | ||
1196 | /* If user doesn't specify allow_utime, it's initialized from dmask. */ | 1196 | /* If user doesn't specify allow_utime, it's initialized from dmask. */ |
@@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, | |||
1367 | sbi->free_clusters = -1; /* Don't know yet */ | 1367 | sbi->free_clusters = -1; /* Don't know yet */ |
1368 | sbi->free_clus_valid = 0; | 1368 | sbi->free_clus_valid = 0; |
1369 | sbi->prev_free = FAT_START_ENT; | 1369 | sbi->prev_free = FAT_START_ENT; |
1370 | sb->s_maxbytes = 0xffffffff; | ||
1370 | 1371 | ||
1371 | if (!sbi->fat_length && b->fat32_length) { | 1372 | if (!sbi->fat_length && b->fat32_length) { |
1372 | struct fat_boot_fsinfo *fsinfo; | 1373 | struct fat_boot_fsinfo *fsinfo; |
@@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, | |||
1377 | sbi->fat_length = le32_to_cpu(b->fat32_length); | 1378 | sbi->fat_length = le32_to_cpu(b->fat32_length); |
1378 | sbi->root_cluster = le32_to_cpu(b->root_cluster); | 1379 | sbi->root_cluster = le32_to_cpu(b->root_cluster); |
1379 | 1380 | ||
1380 | sb->s_maxbytes = 0xffffffff; | ||
1381 | |||
1382 | /* MC - if info_sector is 0, don't multiply by 0 */ | 1381 | /* MC - if info_sector is 0, don't multiply by 0 */ |
1383 | sbi->fsinfo_sector = le16_to_cpu(b->info_sector); | 1382 | sbi->fsinfo_sector = le16_to_cpu(b->info_sector); |
1384 | if (sbi->fsinfo_sector == 0) | 1383 | if (sbi->fsinfo_sector == 0) |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 640fc229df10..5cb8614508c3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, | |||
258 | forget->forget_one.nlookup = nlookup; | 258 | forget->forget_one.nlookup = nlookup; |
259 | 259 | ||
260 | spin_lock(&fc->lock); | 260 | spin_lock(&fc->lock); |
261 | fc->forget_list_tail->next = forget; | 261 | if (fc->connected) { |
262 | fc->forget_list_tail = forget; | 262 | fc->forget_list_tail->next = forget; |
263 | wake_up(&fc->waitq); | 263 | fc->forget_list_tail = forget; |
264 | kill_fasync(&fc->fasync, SIGIO, POLL_IN); | 264 | wake_up(&fc->waitq); |
265 | kill_fasync(&fc->fasync, SIGIO, POLL_IN); | ||
266 | } else { | ||
267 | kfree(forget); | ||
268 | } | ||
265 | spin_unlock(&fc->lock); | 269 | spin_unlock(&fc->lock); |
266 | } | 270 | } |
267 | 271 | ||
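Queueing a forget against a dead connection would leave it on a list no reader will ever drain, so the hunk frees it instead. Condensed, with all names from the diff:

    spin_lock(&fc->lock);
    if (fc->connected) {
            fc->forget_list_tail->next = forget;
            fc->forget_list_tail = forget;
            wake_up(&fc->waitq);
            kill_fasync(&fc->fasync, SIGIO, POLL_IN);
    } else {
            kfree(forget);  /* nobody will dequeue it; avoid the leak */
    }
    spin_unlock(&fc->lock);

fc->connected is cleared under the same spinlock, so the check and the enqueue are atomic with respect to connection teardown.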
@@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, | |||
1358 | if (outarg.namelen > FUSE_NAME_MAX) | 1362 | if (outarg.namelen > FUSE_NAME_MAX) |
1359 | goto err; | 1363 | goto err; |
1360 | 1364 | ||
1365 | err = -EINVAL; | ||
1366 | if (size != sizeof(outarg) + outarg.namelen + 1) | ||
1367 | goto err; | ||
1368 | |||
1361 | name.name = buf; | 1369 | name.name = buf; |
1362 | name.len = outarg.namelen; | 1370 | name.len = outarg.namelen; |
1363 | err = fuse_copy_one(cs, buf, outarg.namelen + 1); | 1371 | err = fuse_copy_one(cs, buf, outarg.namelen + 1); |
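The added check validates the notify message's total size against the name length the daemon declared; a mismatch would desynchronize the device stream, either leaving stale bytes behind or letting the name copy run into the next message. The validation itself is a plain identity:

    /* The message must be exactly header + name + terminating NUL */
    err = -EINVAL;
    if (size != sizeof(outarg) + outarg.namelen + 1)
            goto err;

Doing it before fuse_copy_one() keeps the error path free of partially consumed request state.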
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d480d9af46c9..594f07a81c28 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/compat.h> | 16 | #include <linux/compat.h> |
17 | #include <linux/swap.h> | ||
17 | 18 | ||
18 | static const struct file_operations fuse_direct_io_file_operations; | 19 | static const struct file_operations fuse_direct_io_file_operations; |
19 | 20 | ||
@@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode) | |||
245 | req = ff->reserved_req; | 246 | req = ff->reserved_req; |
246 | fuse_prepare_release(ff, file->f_flags, opcode); | 247 | fuse_prepare_release(ff, file->f_flags, opcode); |
247 | 248 | ||
249 | if (ff->flock) { | ||
250 | struct fuse_release_in *inarg = &req->misc.release.in; | ||
251 | inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; | ||
252 | inarg->lock_owner = fuse_lock_owner_id(ff->fc, | ||
253 | (fl_owner_t) file); | ||
254 | } | ||
248 | /* Hold vfsmount and dentry until release is finished */ | 255 | /* Hold vfsmount and dentry until release is finished */ |
249 | path_get(&file->f_path); | 256 | path_get(&file->f_path); |
250 | req->misc.release.path = file->f_path; | 257 | req->misc.release.path = file->f_path; |
@@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, | |||
755 | return req->misc.write.out.size; | 762 | return req->misc.write.out.size; |
756 | } | 763 | } |
757 | 764 | ||
758 | static int fuse_write_begin(struct file *file, struct address_space *mapping, | ||
759 | loff_t pos, unsigned len, unsigned flags, | ||
760 | struct page **pagep, void **fsdata) | ||
761 | { | ||
762 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
763 | |||
764 | *pagep = grab_cache_page_write_begin(mapping, index, flags); | ||
765 | if (!*pagep) | ||
766 | return -ENOMEM; | ||
767 | return 0; | ||
768 | } | ||
769 | |||
770 | void fuse_write_update_size(struct inode *inode, loff_t pos) | 765 | void fuse_write_update_size(struct inode *inode, loff_t pos) |
771 | { | 766 | { |
772 | struct fuse_conn *fc = get_fuse_conn(inode); | 767 | struct fuse_conn *fc = get_fuse_conn(inode); |
@@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos) | |||
779 | spin_unlock(&fc->lock); | 774 | spin_unlock(&fc->lock); |
780 | } | 775 | } |
781 | 776 | ||
782 | static int fuse_buffered_write(struct file *file, struct inode *inode, | ||
783 | loff_t pos, unsigned count, struct page *page) | ||
784 | { | ||
785 | int err; | ||
786 | size_t nres; | ||
787 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
788 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
789 | struct fuse_req *req; | ||
790 | |||
791 | if (is_bad_inode(inode)) | ||
792 | return -EIO; | ||
793 | |||
794 | /* | ||
795 | * Make sure writepages on the same page are not mixed up with | ||
796 | * plain writes. | ||
797 | */ | ||
798 | fuse_wait_on_page_writeback(inode, page->index); | ||
799 | |||
800 | req = fuse_get_req(fc); | ||
801 | if (IS_ERR(req)) | ||
802 | return PTR_ERR(req); | ||
803 | |||
804 | req->in.argpages = 1; | ||
805 | req->num_pages = 1; | ||
806 | req->pages[0] = page; | ||
807 | req->page_offset = offset; | ||
808 | nres = fuse_send_write(req, file, pos, count, NULL); | ||
809 | err = req->out.h.error; | ||
810 | fuse_put_request(fc, req); | ||
811 | if (!err && !nres) | ||
812 | err = -EIO; | ||
813 | if (!err) { | ||
814 | pos += nres; | ||
815 | fuse_write_update_size(inode, pos); | ||
816 | if (count == PAGE_CACHE_SIZE) | ||
817 | SetPageUptodate(page); | ||
818 | } | ||
819 | fuse_invalidate_attr(inode); | ||
820 | return err ? err : nres; | ||
821 | } | ||
822 | |||
823 | static int fuse_write_end(struct file *file, struct address_space *mapping, | ||
824 | loff_t pos, unsigned len, unsigned copied, | ||
825 | struct page *page, void *fsdata) | ||
826 | { | ||
827 | struct inode *inode = mapping->host; | ||
828 | int res = 0; | ||
829 | |||
830 | if (copied) | ||
831 | res = fuse_buffered_write(file, inode, pos, copied, page); | ||
832 | |||
833 | unlock_page(page); | ||
834 | page_cache_release(page); | ||
835 | return res; | ||
836 | } | ||
837 | |||
838 | static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, | 777 | static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, |
839 | struct inode *inode, loff_t pos, | 778 | struct inode *inode, loff_t pos, |
840 | size_t count) | 779 | size_t count) |
@@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
908 | pagefault_enable(); | 847 | pagefault_enable(); |
909 | flush_dcache_page(page); | 848 | flush_dcache_page(page); |
910 | 849 | ||
850 | mark_page_accessed(page); | ||
851 | |||
911 | if (!tmp) { | 852 | if (!tmp) { |
912 | unlock_page(page); | 853 | unlock_page(page); |
913 | page_cache_release(page); | 854 | page_cache_release(page); |
@@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) | |||
1559 | struct fuse_conn *fc = get_fuse_conn(inode); | 1500 | struct fuse_conn *fc = get_fuse_conn(inode); |
1560 | int err; | 1501 | int err; |
1561 | 1502 | ||
1562 | if (fc->no_lock) { | 1503 | if (fc->no_flock) { |
1563 | err = flock_lock_file_wait(file, fl); | 1504 | err = flock_lock_file_wait(file, fl); |
1564 | } else { | 1505 | } else { |
1506 | struct fuse_file *ff = file->private_data; | ||
1507 | |||
1565 | /* emulate flock with POSIX locks */ | 1508 | /* emulate flock with POSIX locks */ |
1566 | fl->fl_owner = (fl_owner_t) file; | 1509 | fl->fl_owner = (fl_owner_t) file; |
1510 | ff->flock = true; | ||
1567 | err = fuse_setlk(file, fl, 1); | 1511 | err = fuse_setlk(file, fl, 1); |
1568 | } | 1512 | } |
1569 | 1513 | ||
@@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = { | |||
2201 | .readpage = fuse_readpage, | 2145 | .readpage = fuse_readpage, |
2202 | .writepage = fuse_writepage, | 2146 | .writepage = fuse_writepage, |
2203 | .launder_page = fuse_launder_page, | 2147 | .launder_page = fuse_launder_page, |
2204 | .write_begin = fuse_write_begin, | ||
2205 | .write_end = fuse_write_end, | ||
2206 | .readpages = fuse_readpages, | 2148 | .readpages = fuse_readpages, |
2207 | .set_page_dirty = __set_page_dirty_nobuffers, | 2149 | .set_page_dirty = __set_page_dirty_nobuffers, |
2208 | .bmap = fuse_bmap, | 2150 | .bmap = fuse_bmap, |
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c6aa2d4b8517..cf6db0a93219 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
@@ -135,6 +135,9 @@ struct fuse_file { | |||
135 | 135 | ||
136 | /** Wait queue head for poll */ | 136 | /** Wait queue head for poll */ |
137 | wait_queue_head_t poll_wait; | 137 | wait_queue_head_t poll_wait; |
138 | |||
139 | /** Has flock been performed on this file? */ | ||
140 | bool flock:1; | ||
138 | }; | 141 | }; |
139 | 142 | ||
140 | /** One input argument of a request */ | 143 | /** One input argument of a request */ |
@@ -448,7 +451,7 @@ struct fuse_conn { | |||
448 | /** Is removexattr not implemented by fs? */ | 451 | /** Is removexattr not implemented by fs? */ |
449 | unsigned no_removexattr:1; | 452 | unsigned no_removexattr:1; |
450 | 453 | ||
451 | /** Are file locking primitives not implemented by fs? */ | 454 | /** Are POSIX file locking primitives not implemented by fs? */ |
452 | unsigned no_lock:1; | 455 | unsigned no_lock:1; |
453 | 456 | ||
454 | /** Is access not implemented by fs? */ | 457 | /** Is access not implemented by fs? */ |
@@ -472,6 +475,9 @@ struct fuse_conn { | |||
472 | /** Don't apply umask to creation modes */ | 475 | /** Don't apply umask to creation modes */ |
473 | unsigned dont_mask:1; | 476 | unsigned dont_mask:1; |
474 | 477 | ||
478 | /** Are BSD file locking primitives not implemented by fs? */ | ||
479 | unsigned no_flock:1; | ||
480 | |||
475 | /** The number of requests waiting for completion */ | 481 | /** The number of requests waiting for completion */ |
476 | atomic_t num_waiting; | 482 | atomic_t num_waiting; |
477 | 483 | ||
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 38f84cd48b67..add96f6ffda5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -71,7 +71,7 @@ struct fuse_mount_data { | |||
71 | unsigned blksize; | 71 | unsigned blksize; |
72 | }; | 72 | }; |
73 | 73 | ||
74 | struct fuse_forget_link *fuse_alloc_forget() | 74 | struct fuse_forget_link *fuse_alloc_forget(void) |
75 | { | 75 | { |
76 | return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); | 76 | return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); |
77 | } | 77 | } |
@@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
809 | fc->async_read = 1; | 809 | fc->async_read = 1; |
810 | if (!(arg->flags & FUSE_POSIX_LOCKS)) | 810 | if (!(arg->flags & FUSE_POSIX_LOCKS)) |
811 | fc->no_lock = 1; | 811 | fc->no_lock = 1; |
812 | if (arg->minor >= 17) { | ||
813 | if (!(arg->flags & FUSE_FLOCK_LOCKS)) | ||
814 | fc->no_flock = 1; | ||
815 | } else { | ||
816 | if (!(arg->flags & FUSE_POSIX_LOCKS)) | ||
817 | fc->no_flock = 1; | ||
818 | } | ||
812 | if (arg->flags & FUSE_ATOMIC_O_TRUNC) | 819 | if (arg->flags & FUSE_ATOMIC_O_TRUNC) |
813 | fc->atomic_o_trunc = 1; | 820 | fc->atomic_o_trunc = 1; |
814 | if (arg->minor >= 9) { | 821 | if (arg->minor >= 9) { |
@@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
823 | } else { | 830 | } else { |
824 | ra_pages = fc->max_read / PAGE_CACHE_SIZE; | 831 | ra_pages = fc->max_read / PAGE_CACHE_SIZE; |
825 | fc->no_lock = 1; | 832 | fc->no_lock = 1; |
833 | fc->no_flock = 1; | ||
826 | } | 834 | } |
827 | 835 | ||
828 | fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); | 836 | fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); |
@@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) | |||
843 | arg->minor = FUSE_KERNEL_MINOR_VERSION; | 851 | arg->minor = FUSE_KERNEL_MINOR_VERSION; |
844 | arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; | 852 | arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; |
845 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | | 853 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | |
846 | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; | 854 | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | |
855 | FUSE_FLOCK_LOCKS; | ||
847 | req->in.h.opcode = FUSE_INIT; | 856 | req->in.h.opcode = FUSE_INIT; |
848 | req->in.numargs = 1; | 857 | req->in.numargs = 1; |
849 | req->in.args[0].size = sizeof(*arg); | 858 | req->in.args[0].size = sizeof(*arg); |
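The negotiation is protocol-versioned: FUSE_FLOCK_LOCKS only exists from minor 17, so for older servers flock support is inferred from the POSIX-locks bit, which is what earlier kernels emulated flock with. Condensed from the hunk:

    if (arg->minor >= 17) {
            if (!(arg->flags & FUSE_FLOCK_LOCKS))
                    fc->no_flock = 1;
    } else {
            /* pre-17 servers: flock was emulated via POSIX locks */
            if (!(arg->flags & FUSE_POSIX_LOCKS))
                    fc->no_flock = 1;
    }

Together with ff->flock and FUSE_RELEASE_FLOCK_UNLOCK in the file.c hunks above, this lets a new server implement real BSD locks while old servers keep the emulation unchanged.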
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 34501b64bc47..65978d7885c8 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c | |||
@@ -82,7 +82,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) | |||
82 | iattr.ia_valid = ATTR_MODE; | 82 | iattr.ia_valid = ATTR_MODE; |
83 | iattr.ia_mode = mode; | 83 | iattr.ia_mode = mode; |
84 | 84 | ||
85 | error = gfs2_setattr_simple(GFS2_I(inode), &iattr); | 85 | error = gfs2_setattr_simple(inode, &iattr); |
86 | } | 86 | } |
87 | 87 | ||
88 | return error; | 88 | return error; |
@@ -160,6 +160,7 @@ out: | |||
160 | 160 | ||
161 | int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) | 161 | int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) |
162 | { | 162 | { |
163 | struct inode *inode = &ip->i_inode; | ||
163 | struct posix_acl *acl; | 164 | struct posix_acl *acl; |
164 | char *data; | 165 | char *data; |
165 | unsigned int len; | 166 | unsigned int len; |
@@ -169,7 +170,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) | |||
169 | if (IS_ERR(acl)) | 170 | if (IS_ERR(acl)) |
170 | return PTR_ERR(acl); | 171 | return PTR_ERR(acl); |
171 | if (!acl) | 172 | if (!acl) |
172 | return gfs2_setattr_simple(ip, attr); | 173 | return gfs2_setattr_simple(inode, attr); |
173 | 174 | ||
174 | error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); | 175 | error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); |
175 | if (error) | 176 | if (error) |
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f9fbbe96c222..4858e1fed8b1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -663,7 +663,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
663 | if (&ip->i_inode == sdp->sd_rindex) | 663 | if (&ip->i_inode == sdp->sd_rindex) |
664 | rblocks += 2 * RES_STATFS; | 664 | rblocks += 2 * RES_STATFS; |
665 | if (alloc_required) | 665 | if (alloc_required) |
666 | rblocks += gfs2_rg_blocks(al); | 666 | rblocks += gfs2_rg_blocks(ip); |
667 | 667 | ||
668 | error = gfs2_trans_begin(sdp, rblocks, | 668 | error = gfs2_trans_begin(sdp, rblocks, |
669 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); | 669 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); |
@@ -787,7 +787,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
787 | u64 to = pos + copied; | 787 | u64 to = pos + copied; |
788 | void *kaddr; | 788 | void *kaddr; |
789 | unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); | 789 | unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); |
790 | struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; | ||
791 | 790 | ||
792 | BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); | 791 | BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); |
793 | kaddr = kmap_atomic(page, KM_USER0); | 792 | kaddr = kmap_atomic(page, KM_USER0); |
@@ -804,7 +803,6 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
804 | if (copied) { | 803 | if (copied) { |
805 | if (inode->i_size < to) | 804 | if (inode->i_size < to) |
806 | i_size_write(inode, to); | 805 | i_size_write(inode, to); |
807 | gfs2_dinode_out(ip, di); | ||
808 | mark_inode_dirty(inode); | 806 | mark_inode_dirty(inode); |
809 | } | 807 | } |
810 | 808 | ||
@@ -873,10 +871,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
873 | gfs2_page_add_databufs(ip, page, from, to); | 871 | gfs2_page_add_databufs(ip, page, from, to); |
874 | 872 | ||
875 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); | 873 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); |
876 | if (ret > 0) { | ||
877 | gfs2_dinode_out(ip, dibh->b_data); | ||
878 | mark_inode_dirty(inode); | ||
879 | } | ||
880 | 874 | ||
881 | if (inode == sdp->sd_rindex) { | 875 | if (inode == sdp->sd_rindex) { |
882 | adjust_fs_space(inode); | 876 | adjust_fs_space(inode); |
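Both aops.c hunks drop the inline gfs2_dinode_out() at write_end time: dirtying the VFS inode is enough, and the dinode encode happens once on the common inode-dirtying path (via ->dirty_inode in this series) instead of on every buffered write. The surviving tail of gfs2_stuffed_write_end(), names from the diff:

    if (copied) {
            if (inode->i_size < to)
                    i_size_write(inode, to);
            /* dinode encode deferred to the dirty-inode path */
            mark_inode_dirty(inode);
    }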
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7878c473ae62..41d494d79709 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
11 | #include <linux/completion.h> | 11 | #include <linux/completion.h> |
12 | #include <linux/buffer_head.h> | 12 | #include <linux/buffer_head.h> |
13 | #include <linux/blkdev.h> | ||
13 | #include <linux/gfs2_ondisk.h> | 14 | #include <linux/gfs2_ondisk.h> |
14 | #include <linux/crc32.h> | 15 | #include <linux/crc32.h> |
15 | 16 | ||
@@ -36,11 +37,6 @@ struct metapath { | |||
36 | __u16 mp_list[GFS2_MAX_META_HEIGHT]; | 37 | __u16 mp_list[GFS2_MAX_META_HEIGHT]; |
37 | }; | 38 | }; |
38 | 39 | ||
39 | typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, | ||
40 | struct buffer_head *bh, __be64 *top, | ||
41 | __be64 *bottom, unsigned int height, | ||
42 | void *data); | ||
43 | |||
44 | struct strip_mine { | 40 | struct strip_mine { |
45 | int sm_first; | 41 | int sm_first; |
46 | unsigned int sm_height; | 42 | unsigned int sm_height; |
@@ -273,6 +269,30 @@ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp | |||
273 | return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; | 269 | return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; |
274 | } | 270 | } |
275 | 271 | ||
272 | static void gfs2_metapath_ra(struct gfs2_glock *gl, | ||
273 | const struct buffer_head *bh, const __be64 *pos) | ||
274 | { | ||
275 | struct buffer_head *rabh; | ||
276 | const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size); | ||
277 | const __be64 *t; | ||
278 | |||
279 | for (t = pos; t < endp; t++) { | ||
280 | if (!*t) | ||
281 | continue; | ||
282 | |||
283 | rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); | ||
284 | if (trylock_buffer(rabh)) { | ||
285 | if (!buffer_uptodate(rabh)) { | ||
286 | rabh->b_end_io = end_buffer_read_sync; | ||
287 | submit_bh(READA | REQ_META, rabh); | ||
288 | continue; | ||
289 | } | ||
290 | unlock_buffer(rabh); | ||
291 | } | ||
292 | brelse(rabh); | ||
293 | } | ||
294 | } | ||
295 | |||
276 | /** | 296 | /** |
277 | * lookup_metapath - Walk the metadata tree to a specific point | 297 | * lookup_metapath - Walk the metadata tree to a specific point |
278 | * @ip: The inode | 298 | * @ip: The inode |
@@ -432,12 +452,14 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
432 | { | 452 | { |
433 | struct gfs2_inode *ip = GFS2_I(inode); | 453 | struct gfs2_inode *ip = GFS2_I(inode); |
434 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 454 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
455 | struct super_block *sb = sdp->sd_vfs; | ||
435 | struct buffer_head *dibh = mp->mp_bh[0]; | 456 | struct buffer_head *dibh = mp->mp_bh[0]; |
436 | u64 bn, dblock = 0; | 457 | u64 bn, dblock = 0; |
437 | unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; | 458 | unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; |
438 | unsigned dblks = 0; | 459 | unsigned dblks = 0; |
439 | unsigned ptrs_per_blk; | 460 | unsigned ptrs_per_blk; |
440 | const unsigned end_of_metadata = height - 1; | 461 | const unsigned end_of_metadata = height - 1; |
462 | int ret; | ||
441 | int eob = 0; | 463 | int eob = 0; |
442 | enum alloc_state state; | 464 | enum alloc_state state; |
443 | __be64 *ptr; | 465 | __be64 *ptr; |
@@ -540,6 +562,15 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
540 | dblock = bn; | 562 | dblock = bn; |
541 | while (n-- > 0) | 563 | while (n-- > 0) |
542 | *ptr++ = cpu_to_be64(bn++); | 564 | *ptr++ = cpu_to_be64(bn++); |
565 | if (buffer_zeronew(bh_map)) { | ||
566 | ret = sb_issue_zeroout(sb, dblock, dblks, | ||
567 | GFP_NOFS); | ||
568 | if (ret) { | ||
569 | fs_err(sdp, | ||
570 | "Failed to zero data buffers\n"); | ||
571 | clear_buffer_zeronew(bh_map); | ||
572 | } | ||
573 | } | ||
543 | break; | 574 | break; |
544 | } | 575 | } |
545 | } while ((state != ALLOC_DATA) || !dblock); | 576 | } while ((state != ALLOC_DATA) || !dblock); |
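Freshly fallocated blocks must read back as zeroes, and the old code guaranteed that by pushing zero-filled pages through the page cache. The new buffer_zeronew() hint lets the allocator zero the extent directly at the block layer instead. The consumer side, matching the hunk:

    if (buffer_zeronew(bh_map)) {
            /* Zero the newly allocated extent below the page cache */
            ret = sb_issue_zeroout(sb, dblock, dblks, GFP_NOFS);
            if (ret) {
                    fs_err(sdp, "Failed to zero data buffers\n");
                    clear_buffer_zeronew(bh_map);
            }
    }

On failure the hint is simply cleared and the error logged; the design point is that fallocate no longer instantiates a page for every block it preallocates (see the fallocate_chunk() rewrite in gfs2/file.c below).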
@@ -668,76 +699,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi | |||
668 | } | 699 | } |
669 | 700 | ||
670 | /** | 701 | /** |
671 | * recursive_scan - recursively scan through the end of a file | ||
672 | * @ip: the inode | ||
673 | * @dibh: the dinode buffer | ||
674 | * @mp: the path through the metadata to the point to start | ||
675 | * @height: the height the recursion is at | ||
676 | * @block: the indirect block to look at | ||
677 | * @first: 1 if this is the first block | ||
678 | * @bc: the call to make for each piece of metadata | ||
679 | * @data: data opaque to this function to pass to @bc | ||
680 | * | ||
681 | * When this is first called @height and @block should be zero and | ||
682 | * @first should be 1. | ||
683 | * | ||
684 | * Returns: errno | ||
685 | */ | ||
686 | |||
687 | static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, | ||
688 | struct metapath *mp, unsigned int height, | ||
689 | u64 block, int first, block_call_t bc, | ||
690 | void *data) | ||
691 | { | ||
692 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
693 | struct buffer_head *bh = NULL; | ||
694 | __be64 *top, *bottom; | ||
695 | u64 bn; | ||
696 | int error; | ||
697 | int mh_size = sizeof(struct gfs2_meta_header); | ||
698 | |||
699 | if (!height) { | ||
700 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
701 | if (error) | ||
702 | return error; | ||
703 | dibh = bh; | ||
704 | |||
705 | top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; | ||
706 | bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; | ||
707 | } else { | ||
708 | error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); | ||
709 | if (error) | ||
710 | return error; | ||
711 | |||
712 | top = (__be64 *)(bh->b_data + mh_size) + | ||
713 | (first ? mp->mp_list[height] : 0); | ||
714 | |||
715 | bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; | ||
716 | } | ||
717 | |||
718 | error = bc(ip, dibh, bh, top, bottom, height, data); | ||
719 | if (error) | ||
720 | goto out; | ||
721 | |||
722 | if (height < ip->i_height - 1) | ||
723 | for (; top < bottom; top++, first = 0) { | ||
724 | if (!*top) | ||
725 | continue; | ||
726 | |||
727 | bn = be64_to_cpu(*top); | ||
728 | |||
729 | error = recursive_scan(ip, dibh, mp, height + 1, bn, | ||
730 | first, bc, data); | ||
731 | if (error) | ||
732 | break; | ||
733 | } | ||
734 | |||
735 | out: | ||
736 | brelse(bh); | ||
737 | return error; | ||
738 | } | ||
739 | |||
740 | /** | ||
741 | * do_strip - Look for a particular layer of the file and strip it off | 702 | * do_strip - Look for a particular layer of the file and strip it off |
742 | * @ip: the inode | 703 | * @ip: the inode |
743 | * @dibh: the dinode buffer | 704 | * @dibh: the dinode buffer |
@@ -752,9 +713,8 @@ out: | |||
752 | 713 | ||
753 | static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | 714 | static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, |
754 | struct buffer_head *bh, __be64 *top, __be64 *bottom, | 715 | struct buffer_head *bh, __be64 *top, __be64 *bottom, |
755 | unsigned int height, void *data) | 716 | unsigned int height, struct strip_mine *sm) |
756 | { | 717 | { |
757 | struct strip_mine *sm = data; | ||
758 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 718 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
759 | struct gfs2_rgrp_list rlist; | 719 | struct gfs2_rgrp_list rlist; |
760 | u64 bn, bstart; | 720 | u64 bn, bstart; |
@@ -783,11 +743,6 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
783 | else if (ip->i_depth) | 743 | else if (ip->i_depth) |
784 | revokes = sdp->sd_inptrs; | 744 | revokes = sdp->sd_inptrs; |
785 | 745 | ||
786 | if (ip != GFS2_I(sdp->sd_rindex)) | ||
787 | error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); | ||
788 | else if (!sdp->sd_rgrps) | ||
789 | error = gfs2_ri_update(ip); | ||
790 | |||
791 | if (error) | 746 | if (error) |
792 | return error; | 747 | return error; |
793 | 748 | ||
@@ -805,7 +760,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
805 | blen++; | 760 | blen++; |
806 | else { | 761 | else { |
807 | if (bstart) | 762 | if (bstart) |
808 | gfs2_rlist_add(sdp, &rlist, bstart); | 763 | gfs2_rlist_add(ip, &rlist, bstart); |
809 | 764 | ||
810 | bstart = bn; | 765 | bstart = bn; |
811 | blen = 1; | 766 | blen = 1; |
@@ -813,7 +768,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
813 | } | 768 | } |
814 | 769 | ||
815 | if (bstart) | 770 | if (bstart) |
816 | gfs2_rlist_add(sdp, &rlist, bstart); | 771 | gfs2_rlist_add(ip, &rlist, bstart); |
817 | else | 772 | else |
818 | goto out; /* Nothing to do */ | 773 | goto out; /* Nothing to do */ |
819 | 774 | ||
@@ -887,12 +842,82 @@ out_rg_gunlock: | |||
887 | out_rlist: | 842 | out_rlist: |
888 | gfs2_rlist_free(&rlist); | 843 | gfs2_rlist_free(&rlist); |
889 | out: | 844 | out: |
890 | if (ip != GFS2_I(sdp->sd_rindex)) | ||
891 | gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh); | ||
892 | return error; | 845 | return error; |
893 | } | 846 | } |
894 | 847 | ||
895 | /** | 848 | /** |
849 | * recursive_scan - recursively scan through the end of a file | ||
850 | * @ip: the inode | ||
851 | * @dibh: the dinode buffer | ||
852 | * @mp: the path through the metadata to the point to start | ||
853 | * @height: the height the recursion is at | ||
854 | * @block: the indirect block to look at | ||
855 | * @first: 1 if this is the first block | ||
856 | * @sm: the strip_mine structure passed through to do_strip() | ||
857 | * | ||
858 | * When this is first called @height and @block should be zero and | ||
859 | * @first should be 1. | ||
860 | * | ||
861 | * Returns: errno | ||
862 | */ | ||
863 | |||
864 | static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, | ||
865 | struct metapath *mp, unsigned int height, | ||
866 | u64 block, int first, struct strip_mine *sm) | ||
867 | { | ||
868 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
869 | struct buffer_head *bh = NULL; | ||
870 | __be64 *top, *bottom; | ||
871 | u64 bn; | ||
872 | int error; | ||
873 | int mh_size = sizeof(struct gfs2_meta_header); | ||
874 | |||
875 | if (!height) { | ||
876 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
877 | if (error) | ||
878 | return error; | ||
879 | dibh = bh; | ||
880 | |||
881 | top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; | ||
882 | bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; | ||
883 | } else { | ||
884 | error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); | ||
885 | if (error) | ||
886 | return error; | ||
887 | |||
888 | top = (__be64 *)(bh->b_data + mh_size) + | ||
889 | (first ? mp->mp_list[height] : 0); | ||
890 | |||
891 | bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; | ||
892 | } | ||
893 | |||
894 | error = do_strip(ip, dibh, bh, top, bottom, height, sm); | ||
895 | if (error) | ||
896 | goto out; | ||
897 | |||
898 | if (height < ip->i_height - 1) { | ||
899 | |||
900 | gfs2_metapath_ra(ip->i_gl, bh, top); | ||
901 | |||
902 | for (; top < bottom; top++, first = 0) { | ||
903 | if (!*top) | ||
904 | continue; | ||
905 | |||
906 | bn = be64_to_cpu(*top); | ||
907 | |||
908 | error = recursive_scan(ip, dibh, mp, height + 1, bn, | ||
909 | first, sm); | ||
910 | if (error) | ||
911 | break; | ||
912 | } | ||
913 | } | ||
914 | out: | ||
915 | brelse(bh); | ||
916 | return error; | ||
917 | } | ||
918 | |||
919 | |||
920 | /** | ||
896 | * gfs2_block_truncate_page - Deal with zeroing out data for truncate | 921 | * gfs2_block_truncate_page - Deal with zeroing out data for truncate |
897 | * | 922 | * |
898 | * This is partly borrowed from ext3. | 923 | * This is partly borrowed from ext3. |
@@ -1031,7 +1056,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size) | |||
1031 | sm.sm_first = !!size; | 1056 | sm.sm_first = !!size; |
1032 | sm.sm_height = height; | 1057 | sm.sm_height = height; |
1033 | 1058 | ||
1034 | error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); | 1059 | error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); |
1035 | if (error) | 1060 | if (error) |
1036 | break; | 1061 | break; |
1037 | } | 1062 | } |
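recursive_scan() moves below do_strip() so the block_call_t indirection can be deleted outright, and it gains readahead: before descending, gfs2_metapath_ra() issues READA | REQ_META for every pointer in the current indirect block. The trylock pattern keeps the readahead strictly opportunistic:

    rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
    if (trylock_buffer(rabh)) {
            if (!buffer_uptodate(rabh)) {
                    rabh->b_end_io = end_buffer_read_sync;
                    /* end_buffer_read_sync unlocks and releases rabh */
                    submit_bh(READA | REQ_META, rabh);
                    continue;
            }
            unlock_buffer(rabh);
    }
    brelse(rabh);

Buffers that are already locked or already uptodate are skipped at no cost, so truncating a large file overlaps its metadata reads instead of faulting them in one level at a time.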
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1cc2f8ec52a2..8ccad2467cb6 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
@@ -240,16 +240,15 @@ fail: | |||
240 | return error; | 240 | return error; |
241 | } | 241 | } |
242 | 242 | ||
243 | static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, | 243 | static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, |
244 | u64 offset, unsigned int size) | 244 | unsigned int size) |
245 | { | 245 | { |
246 | struct buffer_head *dibh; | 246 | struct buffer_head *dibh; |
247 | int error; | 247 | int error; |
248 | 248 | ||
249 | error = gfs2_meta_inode_buffer(ip, &dibh); | 249 | error = gfs2_meta_inode_buffer(ip, &dibh); |
250 | if (!error) { | 250 | if (!error) { |
251 | offset += sizeof(struct gfs2_dinode); | 251 | memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); |
252 | memcpy(buf, dibh->b_data + offset, size); | ||
253 | brelse(dibh); | 252 | brelse(dibh); |
254 | } | 253 | } |
255 | 254 | ||
@@ -261,13 +260,12 @@ static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, | |||
261 | * gfs2_dir_read_data - Read data from a directory inode | 260 | * gfs2_dir_read_data - Read data from a directory inode |
262 | * @ip: The GFS2 Inode | 261 | * @ip: The GFS2 Inode |
263 | * @buf: The buffer to place result into | 262 | * @buf: The buffer to place result into |
264 | * @offset: File offset to begin jdata reading from | ||
265 | * @size: Amount of data to transfer | 263 | * @size: Amount of data to transfer |
266 | * | 264 | * |
267 | * Returns: The amount of data actually copied or the error | 265 | * Returns: The amount of data actually copied or the error |
268 | */ | 266 | */ |
269 | static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | 267 | static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, |
270 | unsigned int size, unsigned ra) | 268 | unsigned int size) |
271 | { | 269 | { |
272 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 270 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
273 | u64 lblock, dblock; | 271 | u64 lblock, dblock; |
@@ -275,24 +273,14 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | |||
275 | unsigned int o; | 273 | unsigned int o; |
276 | int copied = 0; | 274 | int copied = 0; |
277 | int error = 0; | 275 | int error = 0; |
278 | u64 disksize = i_size_read(&ip->i_inode); | ||
279 | |||
280 | if (offset >= disksize) | ||
281 | return 0; | ||
282 | |||
283 | if (offset + size > disksize) | ||
284 | size = disksize - offset; | ||
285 | |||
286 | if (!size) | ||
287 | return 0; | ||
288 | 276 | ||
289 | if (gfs2_is_stuffed(ip)) | 277 | if (gfs2_is_stuffed(ip)) |
290 | return gfs2_dir_read_stuffed(ip, buf, offset, size); | 278 | return gfs2_dir_read_stuffed(ip, buf, size); |
291 | 279 | ||
292 | if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) | 280 | if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) |
293 | return -EINVAL; | 281 | return -EINVAL; |
294 | 282 | ||
295 | lblock = offset; | 283 | lblock = 0; |
296 | o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); | 284 | o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); |
297 | 285 | ||
298 | while (copied < size) { | 286 | while (copied < size) { |
@@ -311,8 +299,6 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | |||
311 | if (error || !dblock) | 299 | if (error || !dblock) |
312 | goto fail; | 300 | goto fail; |
313 | BUG_ON(extlen < 1); | 301 | BUG_ON(extlen < 1); |
314 | if (!ra) | ||
315 | extlen = 1; | ||
316 | bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); | 302 | bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); |
317 | } else { | 303 | } else { |
318 | error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); | 304 | error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); |
@@ -328,7 +314,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | |||
328 | extlen--; | 314 | extlen--; |
329 | memcpy(buf, bh->b_data + o, amount); | 315 | memcpy(buf, bh->b_data + o, amount); |
330 | brelse(bh); | 316 | brelse(bh); |
331 | buf += amount; | 317 | buf += (amount/sizeof(__be64)); |
332 | copied += amount; | 318 | copied += amount; |
333 | lblock++; | 319 | lblock++; |
334 | o = sizeof(struct gfs2_meta_header); | 320 | o = sizeof(struct gfs2_meta_header); |
@@ -371,7 +357,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) | |||
371 | if (hc == NULL) | 357 | if (hc == NULL) |
372 | return ERR_PTR(-ENOMEM); | 358 | return ERR_PTR(-ENOMEM); |
373 | 359 | ||
374 | ret = gfs2_dir_read_data(ip, (char *)hc, 0, hsize, 1); | 360 | ret = gfs2_dir_read_data(ip, hc, hsize); |
375 | if (ret < 0) { | 361 | if (ret < 0) { |
376 | kfree(hc); | 362 | kfree(hc); |
377 | return ERR_PTR(ret); | 363 | return ERR_PTR(ret); |
@@ -1695,7 +1681,6 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) | |||
1695 | const struct qstr *name = &dentry->d_name; | 1681 | const struct qstr *name = &dentry->d_name; |
1696 | struct gfs2_dirent *dent, *prev = NULL; | 1682 | struct gfs2_dirent *dent, *prev = NULL; |
1697 | struct buffer_head *bh; | 1683 | struct buffer_head *bh; |
1698 | int error; | ||
1699 | 1684 | ||
1700 | /* Returns _either_ the entry (if its first in block) or the | 1685 | /* Returns _either_ the entry (if its first in block) or the |
1701 | previous entry otherwise */ | 1686 | previous entry otherwise */ |
@@ -1724,22 +1709,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) | |||
1724 | } | 1709 | } |
1725 | brelse(bh); | 1710 | brelse(bh); |
1726 | 1711 | ||
1727 | error = gfs2_meta_inode_buffer(dip, &bh); | ||
1728 | if (error) | ||
1729 | return error; | ||
1730 | |||
1731 | if (!dip->i_entries) | 1712 | if (!dip->i_entries) |
1732 | gfs2_consist_inode(dip); | 1713 | gfs2_consist_inode(dip); |
1733 | gfs2_trans_add_bh(dip->i_gl, bh, 1); | ||
1734 | dip->i_entries--; | 1714 | dip->i_entries--; |
1735 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; | 1715 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; |
1736 | if (S_ISDIR(dentry->d_inode->i_mode)) | 1716 | if (S_ISDIR(dentry->d_inode->i_mode)) |
1737 | drop_nlink(&dip->i_inode); | 1717 | drop_nlink(&dip->i_inode); |
1738 | gfs2_dinode_out(dip, bh->b_data); | ||
1739 | brelse(bh); | ||
1740 | mark_inode_dirty(&dip->i_inode); | 1718 | mark_inode_dirty(&dip->i_inode); |
1741 | 1719 | ||
1742 | return error; | 1720 | return 0; |
1743 | } | 1721 | } |
1744 | 1722 | ||
1745 | /** | 1723 | /** |
@@ -1829,10 +1807,6 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
1829 | if (error) | 1807 | if (error) |
1830 | goto out_put; | 1808 | goto out_put; |
1831 | 1809 | ||
1832 | error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); | ||
1833 | if (error) | ||
1834 | goto out_qs; | ||
1835 | |||
1836 | /* Count the number of leaves */ | 1810 | /* Count the number of leaves */ |
1837 | bh = leaf_bh; | 1811 | bh = leaf_bh; |
1838 | 1812 | ||
@@ -1847,7 +1821,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
1847 | if (blk != leaf_no) | 1821 | if (blk != leaf_no) |
1848 | brelse(bh); | 1822 | brelse(bh); |
1849 | 1823 | ||
1850 | gfs2_rlist_add(sdp, &rlist, blk); | 1824 | gfs2_rlist_add(dip, &rlist, blk); |
1851 | l_blocks++; | 1825 | l_blocks++; |
1852 | } | 1826 | } |
1853 | 1827 | ||
@@ -1911,8 +1885,6 @@ out_rg_gunlock: | |||
1911 | gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); | 1885 | gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); |
1912 | out_rlist: | 1886 | out_rlist: |
1913 | gfs2_rlist_free(&rlist); | 1887 | gfs2_rlist_free(&rlist); |
1914 | gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); | ||
1915 | out_qs: | ||
1916 | gfs2_quota_unhold(dip); | 1888 | gfs2_quota_unhold(dip); |
1917 | out_put: | 1889 | out_put: |
1918 | gfs2_alloc_put(dip); | 1890 | gfs2_alloc_put(dip); |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index edeb9e802903..ce36a56dfeac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -59,15 +59,24 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) | |||
59 | struct gfs2_holder i_gh; | 59 | struct gfs2_holder i_gh; |
60 | loff_t error; | 60 | loff_t error; |
61 | 61 | ||
62 | if (origin == 2) { | 62 | switch (origin) { |
63 | case SEEK_END: /* These reference inode->i_size */ | ||
64 | case SEEK_DATA: | ||
65 | case SEEK_HOLE: | ||
63 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, | 66 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, |
64 | &i_gh); | 67 | &i_gh); |
65 | if (!error) { | 68 | if (!error) { |
66 | error = generic_file_llseek_unlocked(file, offset, origin); | 69 | error = generic_file_llseek(file, offset, origin); |
67 | gfs2_glock_dq_uninit(&i_gh); | 70 | gfs2_glock_dq_uninit(&i_gh); |
68 | } | 71 | } |
69 | } else | 72 | break; |
70 | error = generic_file_llseek_unlocked(file, offset, origin); | 73 | case SEEK_CUR: |
74 | case SEEK_SET: | ||
75 | error = generic_file_llseek(file, offset, origin); | ||
76 | break; | ||
77 | default: | ||
78 | error = -EINVAL; | ||
79 | } | ||
71 | 80 | ||
72 | return error; | 81 | return error; |
73 | } | 82 | } |
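SEEK_END, SEEK_DATA and SEEK_HOLE all consult i_size, which on GFS2 is only trustworthy under a glock, so those cases wrap generic_file_llseek() in a shared lock; SEEK_SET and SEEK_CUR are purely positional and stay lockless. The shape, condensed:

    switch (origin) {
    case SEEK_END:
    case SEEK_DATA:
    case SEEK_HOLE:
            /* i_size may be stale cluster-wide without the glock */
            error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
                                       LM_FLAG_ANY, &i_gh);
            if (!error) {
                    error = generic_file_llseek(file, offset, origin);
                    gfs2_glock_dq_uninit(&i_gh);
            }
            break;
    case SEEK_CUR:
    case SEEK_SET:
            error = generic_file_llseek(file, offset, origin);
            break;
    default:
            error = -EINVAL;
    }

The explicit default also rejects unknown origins before any lock is taken.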
@@ -357,8 +366,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
357 | unsigned int data_blocks, ind_blocks, rblocks; | 366 | unsigned int data_blocks, ind_blocks, rblocks; |
358 | struct gfs2_holder gh; | 367 | struct gfs2_holder gh; |
359 | struct gfs2_alloc *al; | 368 | struct gfs2_alloc *al; |
369 | loff_t size; | ||
360 | int ret; | 370 | int ret; |
361 | 371 | ||
372 | /* Wait if the fs is frozen. This is racy, so we check again later | ||
373 | * and retry if the fs has been frozen after the page lock has | ||
374 | * been acquired. | ||
375 | */ | ||
376 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
377 | |||
362 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | 378 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); |
363 | ret = gfs2_glock_nq(&gh); | 379 | ret = gfs2_glock_nq(&gh); |
364 | if (ret) | 380 | if (ret) |
@@ -367,8 +383,15 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
367 | set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); | 383 | set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); |
368 | set_bit(GIF_SW_PAGED, &ip->i_flags); | 384 | set_bit(GIF_SW_PAGED, &ip->i_flags); |
369 | 385 | ||
370 | if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) | 386 | if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) { |
387 | lock_page(page); | ||
388 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | ||
389 | ret = -EAGAIN; | ||
390 | unlock_page(page); | ||
391 | } | ||
371 | goto out_unlock; | 392 | goto out_unlock; |
393 | } | ||
394 | |||
372 | ret = -ENOMEM; | 395 | ret = -ENOMEM; |
373 | al = gfs2_alloc_get(ip); | 396 | al = gfs2_alloc_get(ip); |
374 | if (al == NULL) | 397 | if (al == NULL) |
@@ -388,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
388 | rblocks += data_blocks ? data_blocks : 1; | 411 | rblocks += data_blocks ? data_blocks : 1; |
389 | if (ind_blocks || data_blocks) { | 412 | if (ind_blocks || data_blocks) { |
390 | rblocks += RES_STATFS + RES_QUOTA; | 413 | rblocks += RES_STATFS + RES_QUOTA; |
391 | rblocks += gfs2_rg_blocks(al); | 414 | rblocks += gfs2_rg_blocks(ip); |
392 | } | 415 | } |
393 | ret = gfs2_trans_begin(sdp, rblocks, 0); | 416 | ret = gfs2_trans_begin(sdp, rblocks, 0); |
394 | if (ret) | 417 | if (ret) |
@@ -396,21 +419,29 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
396 | 419 | ||
397 | lock_page(page); | 420 | lock_page(page); |
398 | ret = -EINVAL; | 421 | ret = -EINVAL; |
399 | last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; | 422 | size = i_size_read(inode); |
400 | if (page->index > last_index) | 423 | last_index = (size - 1) >> PAGE_CACHE_SHIFT; |
401 | goto out_unlock_page; | 424 | /* Check page index against inode size */ |
425 | if (size == 0 || (page->index > last_index)) | ||
426 | goto out_trans_end; | ||
427 | |||
428 | ret = -EAGAIN; | ||
429 | /* If truncated, we must retry the operation, since we may have | ||
430 | * raced with the glock demotion code. | ||
431 | */ | ||
432 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) | ||
433 | goto out_trans_end; | ||
434 | |||
435 | /* Unstuff, if required, and allocate backing blocks for page */ | ||
402 | ret = 0; | 436 | ret = 0; |
403 | if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) | 437 | if (gfs2_is_stuffed(ip)) |
404 | goto out_unlock_page; | ||
405 | if (gfs2_is_stuffed(ip)) { | ||
406 | ret = gfs2_unstuff_dinode(ip, page); | 438 | ret = gfs2_unstuff_dinode(ip, page); |
407 | if (ret) | 439 | if (ret == 0) |
408 | goto out_unlock_page; | 440 | ret = gfs2_allocate_page_backing(page); |
409 | } | ||
410 | ret = gfs2_allocate_page_backing(page); | ||
411 | 441 | ||
412 | out_unlock_page: | 442 | out_trans_end: |
413 | unlock_page(page); | 443 | if (ret) |
444 | unlock_page(page); | ||
414 | gfs2_trans_end(sdp); | 445 | gfs2_trans_end(sdp); |
415 | out_trans_fail: | 446 | out_trans_fail: |
416 | gfs2_inplace_release(ip); | 447 | gfs2_inplace_release(ip); |
@@ -422,11 +453,17 @@ out_unlock: | |||
422 | gfs2_glock_dq(&gh); | 453 | gfs2_glock_dq(&gh); |
423 | out: | 454 | out: |
424 | gfs2_holder_uninit(&gh); | 455 | gfs2_holder_uninit(&gh); |
425 | if (ret == -ENOMEM) | 456 | if (ret == 0) { |
426 | ret = VM_FAULT_OOM; | 457 | set_page_dirty(page); |
427 | else if (ret) | 458 | /* This check must be post dropping of transaction lock */ |
428 | ret = VM_FAULT_SIGBUS; | 459 | if (inode->i_sb->s_frozen == SB_UNFROZEN) { |
429 | return ret; | 460 | wait_on_page_writeback(page); |
461 | } else { | ||
462 | ret = -EAGAIN; | ||
463 | unlock_page(page); | ||
464 | } | ||
465 | } | ||
466 | return block_page_mkwrite_return(ret); | ||
430 | } | 467 | } |
431 | 468 | ||
432 | static const struct vm_operations_struct gfs2_vm_ops = { | 469 | static const struct vm_operations_struct gfs2_vm_ops = { |
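gfs2_page_mkwrite() now funnels every exit through block_page_mkwrite_return(), which maps the errno convention onto fault codes (roughly: 0 becomes VM_FAULT_LOCKED, -EAGAIN becomes VM_FAULT_NOPAGE so the fault is retried, -ENOMEM becomes VM_FAULT_OOM, anything else VM_FAULT_SIGBUS), replacing the hand-rolled translation. The frozen-fs handling brackets the fault: an advisory vfs_check_frozen() up front, then this recheck once the transaction lock is dropped:

    if (ret == 0) {
            set_page_dirty(page);
            /* must be re-checked after dropping the transaction lock */
            if (inode->i_sb->s_frozen == SB_UNFROZEN) {
                    wait_on_page_writeback(page);
            } else {
                    ret = -EAGAIN;  /* fs froze under us: retry the fault */
                    unlock_page(page);
            }
    }
    return block_page_mkwrite_return(ret);

That is also why the truncation races earlier in the function now return -EAGAIN rather than -EINVAL: the caller retries instead of taking SIGBUS.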
@@ -551,8 +588,16 @@ static int gfs2_close(struct inode *inode, struct file *file) | |||
551 | * @end: the end position in the file to sync | 588 | * @end: the end position in the file to sync |
552 | * @datasync: set if we can ignore timestamp changes | 589 | * @datasync: set if we can ignore timestamp changes |
553 | * | 590 | * |
554 | * The VFS will flush data for us. We only need to worry | 591 | * We split the data flushing here so that we don't wait for the data |
555 | * about metadata here. | 592 | * until after we've also sent the metadata to disk. Note that for |
593 | * data=ordered, we will write & wait for the data at the log flush | ||
594 | * stage anyway, so this is unlikely to make much of a difference | ||
595 | * except in the data=writeback case. | ||
596 | * | ||
597 | * If the fdatawrite fails due to any reason except -EIO, we will | ||
598 | * continue the remainder of the fsync, although we'll still report | ||
599 | * the error at the end. This is to match filemap_write_and_wait_range() | ||
600 | * behaviour. | ||
556 | * | 601 | * |
557 | * Returns: errno | 602 | * Returns: errno |
558 | */ | 603 | */ |
@@ -560,30 +605,34 @@ static int gfs2_close(struct inode *inode, struct file *file) | |||
560 | static int gfs2_fsync(struct file *file, loff_t start, loff_t end, | 605 | static int gfs2_fsync(struct file *file, loff_t start, loff_t end, |
561 | int datasync) | 606 | int datasync) |
562 | { | 607 | { |
563 | struct inode *inode = file->f_mapping->host; | 608 | struct address_space *mapping = file->f_mapping; |
609 | struct inode *inode = mapping->host; | ||
564 | int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); | 610 | int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); |
565 | struct gfs2_inode *ip = GFS2_I(inode); | 611 | struct gfs2_inode *ip = GFS2_I(inode); |
566 | int ret; | 612 | int ret, ret1 = 0; |
567 | 613 | ||
568 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 614 | if (mapping->nrpages) { |
569 | if (ret) | 615 | ret1 = filemap_fdatawrite_range(mapping, start, end); |
570 | return ret; | 616 | if (ret1 == -EIO) |
571 | mutex_lock(&inode->i_mutex); | 617 | return ret1; |
618 | } | ||
572 | 619 | ||
573 | if (datasync) | 620 | if (datasync) |
574 | sync_state &= ~I_DIRTY_SYNC; | 621 | sync_state &= ~I_DIRTY_SYNC; |
575 | 622 | ||
576 | if (sync_state) { | 623 | if (sync_state) { |
577 | ret = sync_inode_metadata(inode, 1); | 624 | ret = sync_inode_metadata(inode, 1); |
578 | if (ret) { | 625 | if (ret) |
579 | mutex_unlock(&inode->i_mutex); | ||
580 | return ret; | 626 | return ret; |
581 | } | 627 | if (gfs2_is_jdata(ip)) |
582 | gfs2_ail_flush(ip->i_gl); | 628 | filemap_write_and_wait(mapping); |
629 | gfs2_ail_flush(ip->i_gl, 1); | ||
583 | } | 630 | } |
584 | 631 | ||
585 | mutex_unlock(&inode->i_mutex); | 632 | if (mapping->nrpages) |
586 | return 0; | 633 | ret = filemap_fdatawait_range(mapping, start, end); |
634 | |||
635 | return ret ? ret : ret1; | ||
587 | } | 636 | } |
588 | 637 | ||
589 | /** | 638 | /** |
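The rewritten gfs2_fsync() pipelines the two waits the new comment describes: start data writeback, sync the metadata while that I/O is in flight, then wait for the data. Condensed, with the error plumbing that mirrors filemap_write_and_wait_range():

    int ret = 0, ret1 = 0;

    if (mapping->nrpages) {
            ret1 = filemap_fdatawrite_range(mapping, start, end);
            if (ret1 == -EIO)       /* only -EIO aborts the fsync early */
                    return ret1;
    }

    /* ... sync inode metadata and flush the AIL here ... */

    if (mapping->nrpages)
            ret = filemap_fdatawait_range(mapping, start, end);
    return ret ? ret : ret1;

It also drops the i_mutex round-trip the old code made, since nothing left in the sequence needs it.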
@@ -620,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
620 | return generic_file_aio_write(iocb, iov, nr_segs, pos); | 669 | return generic_file_aio_write(iocb, iov, nr_segs, pos); |
621 | } | 670 | } |
622 | 671 | ||
623 | static int empty_write_end(struct page *page, unsigned from, | ||
624 | unsigned to, int mode) | ||
625 | { | ||
626 | struct inode *inode = page->mapping->host; | ||
627 | struct gfs2_inode *ip = GFS2_I(inode); | ||
628 | struct buffer_head *bh; | ||
629 | unsigned offset, blksize = 1 << inode->i_blkbits; | ||
630 | pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; | ||
631 | |||
632 | zero_user(page, from, to-from); | ||
633 | mark_page_accessed(page); | ||
634 | |||
635 | if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) { | ||
636 | if (!gfs2_is_writeback(ip)) | ||
637 | gfs2_page_add_databufs(ip, page, from, to); | ||
638 | |||
639 | block_commit_write(page, from, to); | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | offset = 0; | ||
644 | bh = page_buffers(page); | ||
645 | while (offset < to) { | ||
646 | if (offset >= from) { | ||
647 | set_buffer_uptodate(bh); | ||
648 | mark_buffer_dirty(bh); | ||
649 | clear_buffer_new(bh); | ||
650 | write_dirty_buffer(bh, WRITE); | ||
651 | } | ||
652 | offset += blksize; | ||
653 | bh = bh->b_this_page; | ||
654 | } | ||
655 | |||
656 | offset = 0; | ||
657 | bh = page_buffers(page); | ||
658 | while (offset < to) { | ||
659 | if (offset >= from) { | ||
660 | wait_on_buffer(bh); | ||
661 | if (!buffer_uptodate(bh)) | ||
662 | return -EIO; | ||
663 | } | ||
664 | offset += blksize; | ||
665 | bh = bh->b_this_page; | ||
666 | } | ||
667 | return 0; | ||
668 | } | ||
669 | |||
670 | static int needs_empty_write(sector_t block, struct inode *inode) | ||
671 | { | ||
672 | int error; | ||
673 | struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; | ||
674 | |||
675 | bh_map.b_size = 1 << inode->i_blkbits; | ||
676 | error = gfs2_block_map(inode, block, &bh_map, 0); | ||
677 | if (unlikely(error)) | ||
678 | return error; | ||
679 | return !buffer_mapped(&bh_map); | ||
680 | } | ||
681 | |||
682 | static int write_empty_blocks(struct page *page, unsigned from, unsigned to, | ||
683 | int mode) | ||
684 | { | ||
685 | struct inode *inode = page->mapping->host; | ||
686 | unsigned start, end, next, blksize; | ||
687 | sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
688 | int ret; | ||
689 | |||
690 | blksize = 1 << inode->i_blkbits; | ||
691 | next = end = 0; | ||
692 | while (next < from) { | ||
693 | next += blksize; | ||
694 | block++; | ||
695 | } | ||
696 | start = next; | ||
697 | do { | ||
698 | next += blksize; | ||
699 | ret = needs_empty_write(block, inode); | ||
700 | if (unlikely(ret < 0)) | ||
701 | return ret; | ||
702 | if (ret == 0) { | ||
703 | if (end) { | ||
704 | ret = __block_write_begin(page, start, end - start, | ||
705 | gfs2_block_map); | ||
706 | if (unlikely(ret)) | ||
707 | return ret; | ||
708 | ret = empty_write_end(page, start, end, mode); | ||
709 | if (unlikely(ret)) | ||
710 | return ret; | ||
711 | end = 0; | ||
712 | } | ||
713 | start = next; | ||
714 | } | ||
715 | else | ||
716 | end = next; | ||
717 | block++; | ||
718 | } while (next < to); | ||
719 | |||
720 | if (end) { | ||
721 | ret = __block_write_begin(page, start, end - start, gfs2_block_map); | ||
722 | if (unlikely(ret)) | ||
723 | return ret; | ||
724 | ret = empty_write_end(page, start, end, mode); | ||
725 | if (unlikely(ret)) | ||
726 | return ret; | ||
727 | } | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | |||
732 | static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, | 672 | static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, |
733 | int mode) | 673 | int mode) |
734 | { | 674 | { |
735 | struct gfs2_inode *ip = GFS2_I(inode); | 675 | struct gfs2_inode *ip = GFS2_I(inode); |
736 | struct buffer_head *dibh; | 676 | struct buffer_head *dibh; |
737 | int error; | 677 | int error; |
738 | u64 start = offset >> PAGE_CACHE_SHIFT; | 678 | unsigned int nr_blks; |
739 | unsigned int start_offset = offset & ~PAGE_CACHE_MASK; | 679 | sector_t lblock = offset >> inode->i_blkbits; |
740 | u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; | ||
741 | pgoff_t curr; | ||
742 | struct page *page; | ||
743 | unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; | ||
744 | unsigned int from, to; | ||
745 | |||
746 | if (!end_offset) | ||
747 | end_offset = PAGE_CACHE_SIZE; | ||
748 | 680 | ||
749 | error = gfs2_meta_inode_buffer(ip, &dibh); | 681 | error = gfs2_meta_inode_buffer(ip, &dibh); |
750 | if (unlikely(error)) | 682 | if (unlikely(error)) |
751 | goto out; | 683 | return error; |
752 | 684 | ||
753 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 685 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
754 | 686 | ||
@@ -758,40 +690,31 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, | |||
758 | goto out; | 690 | goto out; |
759 | } | 691 | } |
760 | 692 | ||
761 | curr = start; | 693 | while (len) { |
762 | offset = start << PAGE_CACHE_SHIFT; | 694 | struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; |
763 | from = start_offset; | 695 | bh_map.b_size = len; |
764 | to = PAGE_CACHE_SIZE; | 696 | set_buffer_zeronew(&bh_map); |
765 | while (curr <= end) { | ||
766 | page = grab_cache_page_write_begin(inode->i_mapping, curr, | ||
767 | AOP_FLAG_NOFS); | ||
768 | if (unlikely(!page)) { | ||
769 | error = -ENOMEM; | ||
770 | goto out; | ||
771 | } | ||
772 | 697 | ||
773 | if (curr == end) | 698 | error = gfs2_block_map(inode, lblock, &bh_map, 1); |
774 | to = end_offset; | 699 | if (unlikely(error)) |
775 | error = write_empty_blocks(page, from, to, mode); | ||
776 | if (!error && offset + to > inode->i_size && | ||
777 | !(mode & FALLOC_FL_KEEP_SIZE)) { | ||
778 | i_size_write(inode, offset + to); | ||
779 | } | ||
780 | unlock_page(page); | ||
781 | page_cache_release(page); | ||
782 | if (error) | ||
783 | goto out; | 700 | goto out; |
784 | curr++; | 701 | len -= bh_map.b_size; |
785 | offset += PAGE_CACHE_SIZE; | 702 | nr_blks = bh_map.b_size >> inode->i_blkbits; |
786 | from = 0; | 703 | lblock += nr_blks; |
704 | if (!buffer_new(&bh_map)) | ||
705 | continue; | ||
706 | if (unlikely(!buffer_zeronew(&bh_map))) { | ||
707 | error = -EIO; | ||
708 | goto out; | ||
709 | } | ||
787 | } | 710 | } |
711 | if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) | ||
712 | i_size_write(inode, offset + len); | ||
788 | 713 | ||
789 | gfs2_dinode_out(ip, dibh->b_data); | ||
790 | mark_inode_dirty(inode); | 714 | mark_inode_dirty(inode); |
791 | 715 | ||
792 | brelse(dibh); | ||
793 | |||
794 | out: | 716 | out: |
717 | brelse(dibh); | ||
795 | return error; | 718 | return error; |
796 | } | 719 | } |
797 | 720 | ||
@@ -799,7 +722,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, | |||
799 | unsigned int *data_blocks, unsigned int *ind_blocks) | 722 | unsigned int *data_blocks, unsigned int *ind_blocks) |
800 | { | 723 | { |
801 | const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 724 | const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
802 | unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; | 725 | unsigned int max_blocks = ip->i_rgd->rd_free_clone; |
803 | unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); | 726 | unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); |
804 | 727 | ||
805 | for (tmp = max_data; tmp > sdp->sd_diptrs;) { | 728 | for (tmp = max_data; tmp > sdp->sd_diptrs;) { |
@@ -831,6 +754,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, | |||
831 | int error; | 754 | int error; |
832 | loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); | 755 | loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); |
833 | loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; | 756 | loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; |
757 | loff_t max_chunk_size = UINT_MAX & bsize_mask; | ||
834 | next = (next + 1) << sdp->sd_sb.sb_bsize_shift; | 758 | next = (next + 1) << sdp->sd_sb.sb_bsize_shift; |
835 | 759 | ||
836 | /* We only support the FALLOC_FL_KEEP_SIZE mode */ | 760 | /* We only support the FALLOC_FL_KEEP_SIZE mode */ |
@@ -884,11 +808,12 @@ retry: | |||
884 | goto out_qunlock; | 808 | goto out_qunlock; |
885 | } | 809 | } |
886 | max_bytes = bytes; | 810 | max_bytes = bytes; |
887 | calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); | 811 | calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, |
812 | &max_bytes, &data_blocks, &ind_blocks); | ||
888 | al->al_requested = data_blocks + ind_blocks; | 813 | al->al_requested = data_blocks + ind_blocks; |
889 | 814 | ||
890 | rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + | 815 | rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + |
891 | RES_RG_HDR + gfs2_rg_blocks(al); | 816 | RES_RG_HDR + gfs2_rg_blocks(ip); |
892 | if (gfs2_is_jdata(ip)) | 817 | if (gfs2_is_jdata(ip)) |
893 | rblocks += data_blocks ? data_blocks : 1; | 818 | rblocks += data_blocks ? data_blocks : 1; |
894 | 819 | ||
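The max_chunk_size cap introduced above exists because bh_map.b_size in the mapping loop is an unsigned int: each fallocate pass is clamped to the largest block-aligned value that still fits in 32 bits. The arithmetic, checked in isolation:

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
        long long bsize = 4096;                  /* sb_bsize, for example */
        long long bsize_mask = ~(bsize - 1);
        long long max_chunk_size = UINT_MAX & bsize_mask;
        long long len = 8LL << 30;               /* an 8 GiB request */

        if (len > max_chunk_size)
            len = max_chunk_size;
        /* 4294963200 = UINT_MAX rounded down to a 4 KiB boundary */
        printf("chunk capped to %lld bytes\n", len);
        return 0;
    }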
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index da21ecaafcc2..78418b4fa857 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,40 +28,55 @@ | |||
28 | #include "trans.h" | 28 | #include "trans.h" |
29 | #include "dir.h" | 29 | #include "dir.h" |
30 | 30 | ||
31 | static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) | ||
32 | { | ||
33 | fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", | ||
34 | bh, (unsigned long long)bh->b_blocknr, bh->b_state, | ||
35 | bh->b_page->mapping, bh->b_page->flags); | ||
36 | fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n", | ||
37 | gl->gl_name.ln_type, gl->gl_name.ln_number, | ||
38 | gfs2_glock2aspace(gl)); | ||
39 | gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n"); | ||
40 | } | ||
41 | |||
31 | /** | 42 | /** |
32 | * __gfs2_ail_flush - remove all buffers for a given lock from the AIL | 43 | * __gfs2_ail_flush - remove all buffers for a given lock from the AIL |
33 | * @gl: the glock | 44 | * @gl: the glock |
45 | * @fsync: set when called from fsync (not all buffers will be clean) | ||
34 | * | 46 | * |
35 | * None of the buffers should be dirty, locked, or pinned. | 47 | * None of the buffers should be dirty, locked, or pinned. |
36 | */ | 48 | */ |
37 | 49 | ||
38 | static void __gfs2_ail_flush(struct gfs2_glock *gl) | 50 | static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) |
39 | { | 51 | { |
40 | struct gfs2_sbd *sdp = gl->gl_sbd; | 52 | struct gfs2_sbd *sdp = gl->gl_sbd; |
41 | struct list_head *head = &gl->gl_ail_list; | 53 | struct list_head *head = &gl->gl_ail_list; |
42 | struct gfs2_bufdata *bd; | 54 | struct gfs2_bufdata *bd, *tmp; |
43 | struct buffer_head *bh; | 55 | struct buffer_head *bh; |
56 | const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); | ||
57 | sector_t blocknr; | ||
44 | 58 | ||
59 | gfs2_log_lock(sdp); | ||
45 | spin_lock(&sdp->sd_ail_lock); | 60 | spin_lock(&sdp->sd_ail_lock); |
46 | while (!list_empty(head)) { | 61 | list_for_each_entry_safe(bd, tmp, head, bd_ail_gl_list) { |
47 | bd = list_entry(head->next, struct gfs2_bufdata, | ||
48 | bd_ail_gl_list); | ||
49 | bh = bd->bd_bh; | 62 | bh = bd->bd_bh; |
50 | gfs2_remove_from_ail(bd); | 63 | if (bh->b_state & b_state) { |
51 | bd->bd_bh = NULL; | 64 | if (fsync) |
65 | continue; | ||
66 | gfs2_ail_error(gl, bh); | ||
67 | } | ||
68 | blocknr = bh->b_blocknr; | ||
52 | bh->b_private = NULL; | 69 | bh->b_private = NULL; |
53 | spin_unlock(&sdp->sd_ail_lock); | 70 | gfs2_remove_from_ail(bd); /* drops ref on bh */ |
54 | 71 | ||
55 | bd->bd_blkno = bh->b_blocknr; | 72 | bd->bd_bh = NULL; |
56 | gfs2_log_lock(sdp); | 73 | bd->bd_blkno = blocknr; |
57 | gfs2_assert_withdraw(sdp, !buffer_busy(bh)); | ||
58 | gfs2_trans_add_revoke(sdp, bd); | ||
59 | gfs2_log_unlock(sdp); | ||
60 | 74 | ||
61 | spin_lock(&sdp->sd_ail_lock); | 75 | gfs2_trans_add_revoke(sdp, bd); |
62 | } | 76 | } |
63 | gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); | 77 | BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); |
64 | spin_unlock(&sdp->sd_ail_lock); | 78 | spin_unlock(&sdp->sd_ail_lock); |
79 | gfs2_log_unlock(sdp); | ||
65 | } | 80 | } |
66 | 81 | ||
67 | 82 | ||
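__gfs2_ail_flush above switches to list_for_each_entry_safe so entries can be unlinked mid-walk, and in the fsync case it simply skips buffers that are still dirty, pinned, or locked instead of treating them as an error. A userspace sketch of the remove-while-iterating pattern with a hand-rolled singly linked list (the kernel macro caches the next pointer the same way before the current entry can be freed):

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        struct node *next;
        int busy;        /* stands in for dirty/pinned/locked */
    };

    int main(void)
    {
        struct node *head = NULL, *cur, *nxt, **link;
        for (int i = 0; i < 4; i++) {
            cur = malloc(sizeof(*cur));
            cur->busy = (i == 2);
            cur->next = head;
            head = cur;
        }

        int fsync_mode = 1;
        /* Cache nxt before possibly freeing cur - the _safe idiom. */
        for (link = &head, cur = head; cur; cur = nxt) {
            nxt = cur->next;
            if (fsync_mode && cur->busy) {   /* skip, revisit later */
                link = &cur->next;
                continue;
            }
            *link = nxt;                     /* unlink and release */
            free(cur);
        }
        printf("remaining busy nodes: %d\n", head ? 1 : 0);
        return 0;
    }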
@@ -84,13 +99,13 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) | |||
84 | BUG_ON(current->journal_info); | 99 | BUG_ON(current->journal_info); |
85 | current->journal_info = &tr; | 100 | current->journal_info = &tr; |
86 | 101 | ||
87 | __gfs2_ail_flush(gl); | 102 | __gfs2_ail_flush(gl, 0); |
88 | 103 | ||
89 | gfs2_trans_end(sdp); | 104 | gfs2_trans_end(sdp); |
90 | gfs2_log_flush(sdp, NULL); | 105 | gfs2_log_flush(sdp, NULL); |
91 | } | 106 | } |
92 | 107 | ||
93 | void gfs2_ail_flush(struct gfs2_glock *gl) | 108 | void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) |
94 | { | 109 | { |
95 | struct gfs2_sbd *sdp = gl->gl_sbd; | 110 | struct gfs2_sbd *sdp = gl->gl_sbd; |
96 | unsigned int revokes = atomic_read(&gl->gl_ail_count); | 111 | unsigned int revokes = atomic_read(&gl->gl_ail_count); |
@@ -102,7 +117,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) | |||
102 | ret = gfs2_trans_begin(sdp, 0, revokes); | 117 | ret = gfs2_trans_begin(sdp, 0, revokes); |
103 | if (ret) | 118 | if (ret) |
104 | return; | 119 | return; |
105 | __gfs2_ail_flush(gl); | 120 | __gfs2_ail_flush(gl, fsync); |
106 | gfs2_trans_end(sdp); | 121 | gfs2_trans_end(sdp); |
107 | gfs2_log_flush(sdp, NULL); | 122 | gfs2_log_flush(sdp, NULL); |
108 | } | 123 | } |
@@ -119,6 +134,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl) | |||
119 | static void rgrp_go_sync(struct gfs2_glock *gl) | 134 | static void rgrp_go_sync(struct gfs2_glock *gl) |
120 | { | 135 | { |
121 | struct address_space *metamapping = gfs2_glock2aspace(gl); | 136 | struct address_space *metamapping = gfs2_glock2aspace(gl); |
137 | struct gfs2_rgrpd *rgd; | ||
122 | int error; | 138 | int error; |
123 | 139 | ||
124 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) | 140 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) |
@@ -130,6 +146,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) | |||
130 | error = filemap_fdatawait(metamapping); | 146 | error = filemap_fdatawait(metamapping); |
131 | mapping_set_error(metamapping, error); | 147 | mapping_set_error(metamapping, error); |
132 | gfs2_ail_empty_gl(gl); | 148 | gfs2_ail_empty_gl(gl); |
149 | |||
150 | spin_lock(&gl->gl_spin); | ||
151 | rgd = gl->gl_object; | ||
152 | if (rgd) | ||
153 | gfs2_free_clones(rgd); | ||
154 | spin_unlock(&gl->gl_spin); | ||
133 | } | 155 | } |
134 | 156 | ||
135 | /** | 157 | /** |
@@ -430,33 +452,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
430 | } | 452 | } |
431 | 453 | ||
432 | /** | 454 | /** |
433 | * rgrp_go_lock - operation done after an rgrp lock is locked by | ||
434 | * a first holder on this node. | ||
435 | * @gl: the glock | ||
436 | * @flags: | ||
437 | * | ||
438 | * Returns: errno | ||
439 | */ | ||
440 | |||
441 | static int rgrp_go_lock(struct gfs2_holder *gh) | ||
442 | { | ||
443 | return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); | ||
444 | } | ||
445 | |||
446 | /** | ||
447 | * rgrp_go_unlock - operation done before an rgrp lock is unlocked by | ||
448 | * a last holder on this node. | ||
449 | * @gl: the glock | ||
450 | * @flags: | ||
451 | * | ||
452 | */ | ||
453 | |||
454 | static void rgrp_go_unlock(struct gfs2_holder *gh) | ||
455 | { | ||
456 | gfs2_rgrp_bh_put(gh->gh_gl->gl_object); | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * trans_go_sync - promote/demote the transaction glock | 455 | * trans_go_sync - promote/demote the transaction glock |
461 | * @gl: the glock | 456 | * @gl: the glock |
462 | * @state: the requested state | 457 | * @state: the requested state |
@@ -558,8 +553,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = { | |||
558 | const struct gfs2_glock_operations gfs2_rgrp_glops = { | 553 | const struct gfs2_glock_operations gfs2_rgrp_glops = { |
559 | .go_xmote_th = rgrp_go_sync, | 554 | .go_xmote_th = rgrp_go_sync, |
560 | .go_inval = rgrp_go_inval, | 555 | .go_inval = rgrp_go_inval, |
561 | .go_lock = rgrp_go_lock, | 556 | .go_lock = gfs2_rgrp_go_lock, |
562 | .go_unlock = rgrp_go_unlock, | 557 | .go_unlock = gfs2_rgrp_go_unlock, |
563 | .go_dump = gfs2_rgrp_dump, | 558 | .go_dump = gfs2_rgrp_dump, |
564 | .go_type = LM_TYPE_RGRP, | 559 | .go_type = LM_TYPE_RGRP, |
565 | .go_flags = GLOF_ASPACE, | 560 | .go_flags = GLOF_ASPACE, |
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 6fce409b5a50..bf95a2dc1662 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,6 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; | |||
23 | extern const struct gfs2_glock_operations gfs2_journal_glops; | 23 | extern const struct gfs2_glock_operations gfs2_journal_glops; |
24 | extern const struct gfs2_glock_operations *gfs2_glops_list[]; | 24 | extern const struct gfs2_glock_operations *gfs2_glops_list[]; |
25 | 25 | ||
26 | extern void gfs2_ail_flush(struct gfs2_glock *gl); | 26 | extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); |
27 | 27 | ||
28 | #endif /* __GLOPS_DOT_H__ */ | 28 | #endif /* __GLOPS_DOT_H__ */ |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 892ac37de8ae..7389dfdcc9ef 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | #include <linux/rculist_bl.h> | 19 | #include <linux/rculist_bl.h> |
20 | #include <linux/completion.h> | 20 | #include <linux/completion.h> |
21 | #include <linux/rbtree.h> | ||
21 | 22 | ||
22 | #define DIO_WAIT 0x00000010 | 23 | #define DIO_WAIT 0x00000010 |
23 | #define DIO_METADATA 0x00000020 | 24 | #define DIO_METADATA 0x00000020 |
@@ -78,8 +79,7 @@ struct gfs2_bitmap { | |||
78 | }; | 79 | }; |
79 | 80 | ||
80 | struct gfs2_rgrpd { | 81 | struct gfs2_rgrpd { |
81 | struct list_head rd_list; /* Link with superblock */ | 82 | struct rb_node rd_node; /* Link with superblock */ |
82 | struct list_head rd_list_mru; | ||
83 | struct gfs2_glock *rd_gl; /* Glock for this rgrp */ | 83 | struct gfs2_glock *rd_gl; /* Glock for this rgrp */ |
84 | u64 rd_addr; /* grp block disk address */ | 84 | u64 rd_addr; /* grp block disk address */ |
85 | u64 rd_data0; /* first data location */ | 85 | u64 rd_data0; /* first data location */ |
@@ -91,10 +91,7 @@ struct gfs2_rgrpd { | |||
91 | u32 rd_dinodes; | 91 | u32 rd_dinodes; |
92 | u64 rd_igeneration; | 92 | u64 rd_igeneration; |
93 | struct gfs2_bitmap *rd_bits; | 93 | struct gfs2_bitmap *rd_bits; |
94 | struct mutex rd_mutex; | ||
95 | struct gfs2_log_element rd_le; | ||
96 | struct gfs2_sbd *rd_sbd; | 94 | struct gfs2_sbd *rd_sbd; |
97 | unsigned int rd_bh_count; | ||
98 | u32 rd_last_alloc; | 95 | u32 rd_last_alloc; |
99 | u32 rd_flags; | 96 | u32 rd_flags; |
100 | #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ | 97 | #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ |
@@ -106,12 +103,15 @@ struct gfs2_rgrpd { | |||
106 | enum gfs2_state_bits { | 103 | enum gfs2_state_bits { |
107 | BH_Pinned = BH_PrivateStart, | 104 | BH_Pinned = BH_PrivateStart, |
108 | BH_Escaped = BH_PrivateStart + 1, | 105 | BH_Escaped = BH_PrivateStart + 1, |
106 | BH_Zeronew = BH_PrivateStart + 2, | ||
109 | }; | 107 | }; |
110 | 108 | ||
111 | BUFFER_FNS(Pinned, pinned) | 109 | BUFFER_FNS(Pinned, pinned) |
112 | TAS_BUFFER_FNS(Pinned, pinned) | 110 | TAS_BUFFER_FNS(Pinned, pinned) |
113 | BUFFER_FNS(Escaped, escaped) | 111 | BUFFER_FNS(Escaped, escaped) |
114 | TAS_BUFFER_FNS(Escaped, escaped) | 112 | TAS_BUFFER_FNS(Escaped, escaped) |
113 | BUFFER_FNS(Zeronew, zeronew) | ||
114 | TAS_BUFFER_FNS(Zeronew, zeronew) | ||
115 | 115 | ||
116 | struct gfs2_bufdata { | 116 | struct gfs2_bufdata { |
117 | struct buffer_head *bd_bh; | 117 | struct buffer_head *bd_bh; |
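BH_Zeronew above claims the next private buffer-state bit, and the BUFFER_FNS/TAS_BUFFER_FNS invocations generate set_buffer_zeronew(), clear_buffer_zeronew() and buffer_zeronew() (plus a test-and-set variant) over b_state. A simplified userspace rendering of the generator macro (non-atomic, unlike the kernel's bitops):

    #include <stdio.h>

    struct buf { unsigned long state; };

    /* Simplified BUFFER_FNS: expands into set/clear/test helpers for a bit. */
    #define BUF_FNS(bit, name)                                                  \
    static void set_buf_##name(struct buf *b)   { b->state |=  1UL << (bit); }  \
    static void clear_buf_##name(struct buf *b) { b->state &= ~(1UL << (bit)); }\
    static int  buf_##name(const struct buf *b) { return !!(b->state & (1UL << (bit))); }

    enum { BUF_ZERONEW = 2 };        /* analogue of BH_Zeronew */
    BUF_FNS(BUF_ZERONEW, zeronew)

    int main(void)
    {
        struct buf b = { 0 };
        set_buf_zeronew(&b);
        printf("zeronew=%d\n", buf_zeronew(&b));   /* 1 */
        clear_buf_zeronew(&b);
        printf("zeronew=%d\n", buf_zeronew(&b));   /* 0 */
        return 0;
    }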
@@ -246,7 +246,6 @@ struct gfs2_glock { | |||
246 | 246 | ||
247 | struct gfs2_alloc { | 247 | struct gfs2_alloc { |
248 | /* Quota stuff */ | 248 | /* Quota stuff */ |
249 | |||
250 | struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; | 249 | struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; |
251 | struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; | 250 | struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; |
252 | unsigned int al_qd_num; | 251 | unsigned int al_qd_num; |
@@ -255,18 +254,13 @@ struct gfs2_alloc { | |||
255 | u32 al_alloced; /* Filled in by gfs2_alloc_*() */ | 254 | u32 al_alloced; /* Filled in by gfs2_alloc_*() */ |
256 | 255 | ||
257 | /* Filled in by gfs2_inplace_reserve() */ | 256 | /* Filled in by gfs2_inplace_reserve() */ |
258 | |||
259 | unsigned int al_line; | ||
260 | char *al_file; | ||
261 | struct gfs2_holder al_ri_gh; | ||
262 | struct gfs2_holder al_rgd_gh; | 257 | struct gfs2_holder al_rgd_gh; |
263 | struct gfs2_rgrpd *al_rgd; | ||
264 | |||
265 | }; | 258 | }; |
266 | 259 | ||
267 | enum { | 260 | enum { |
268 | GIF_INVALID = 0, | 261 | GIF_INVALID = 0, |
269 | GIF_QD_LOCKED = 1, | 262 | GIF_QD_LOCKED = 1, |
263 | GIF_ALLOC_FAILED = 2, | ||
270 | GIF_SW_PAGED = 3, | 264 | GIF_SW_PAGED = 3, |
271 | }; | 265 | }; |
272 | 266 | ||
@@ -282,6 +276,7 @@ struct gfs2_inode { | |||
282 | struct gfs2_holder i_iopen_gh; | 276 | struct gfs2_holder i_iopen_gh; |
283 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ | 277 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ |
284 | struct gfs2_alloc *i_alloc; | 278 | struct gfs2_alloc *i_alloc; |
279 | struct gfs2_rgrpd *i_rgd; | ||
285 | u64 i_goal; /* goal block for allocations */ | 280 | u64 i_goal; /* goal block for allocations */ |
286 | struct rw_semaphore i_rw_mutex; | 281 | struct rw_semaphore i_rw_mutex; |
287 | struct list_head i_trunc_list; | 282 | struct list_head i_trunc_list; |
@@ -574,9 +569,7 @@ struct gfs2_sbd { | |||
574 | int sd_rindex_uptodate; | 569 | int sd_rindex_uptodate; |
575 | spinlock_t sd_rindex_spin; | 570 | spinlock_t sd_rindex_spin; |
576 | struct mutex sd_rindex_mutex; | 571 | struct mutex sd_rindex_mutex; |
577 | struct list_head sd_rindex_list; | 572 | struct rb_root sd_rindex_tree; |
578 | struct list_head sd_rindex_mru_list; | ||
579 | struct gfs2_rgrpd *sd_rindex_forward; | ||
580 | unsigned int sd_rgrps; | 573 | unsigned int sd_rgrps; |
581 | unsigned int sd_max_rg_data; | 574 | unsigned int sd_max_rg_data; |
582 | 575 | ||
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 900cf986aadc..cfd4959b218c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -583,7 +583,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, | |||
583 | goto fail_quota_locks; | 583 | goto fail_quota_locks; |
584 | 584 | ||
585 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + | 585 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + |
586 | al->al_rgd->rd_length + | 586 | dip->i_rgd->rd_length + |
587 | 2 * RES_DINODE + | 587 | 2 * RES_DINODE + |
588 | RES_STATFS + RES_QUOTA, 0); | 588 | RES_STATFS + RES_QUOTA, 0); |
589 | if (error) | 589 | if (error) |
@@ -613,8 +613,7 @@ fail_end_trans: | |||
613 | gfs2_trans_end(sdp); | 613 | gfs2_trans_end(sdp); |
614 | 614 | ||
615 | fail_ipreserv: | 615 | fail_ipreserv: |
616 | if (dip->i_alloc->al_rgd) | 616 | gfs2_inplace_release(dip); |
617 | gfs2_inplace_release(dip); | ||
618 | 617 | ||
619 | fail_quota_locks: | 618 | fail_quota_locks: |
620 | gfs2_quota_unlock(dip); | 619 | gfs2_quota_unlock(dip); |
@@ -624,31 +623,29 @@ fail: | |||
624 | return error; | 623 | return error; |
625 | } | 624 | } |
626 | 625 | ||
627 | static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, | 626 | int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
628 | const struct qstr *qstr) | 627 | void *fs_info) |
629 | { | 628 | { |
630 | int err; | 629 | const struct xattr *xattr; |
631 | size_t len; | 630 | int err = 0; |
632 | void *value; | 631 | |
633 | char *name; | 632 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
634 | 633 | err = __gfs2_xattr_set(inode, xattr->name, xattr->value, | |
635 | err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, | 634 | xattr->value_len, 0, |
636 | &name, &value, &len); | 635 | GFS2_EATYPE_SECURITY); |
637 | 636 | if (err < 0) | |
638 | if (err) { | 637 | break; |
639 | if (err == -EOPNOTSUPP) | ||
640 | return 0; | ||
641 | return err; | ||
642 | } | 638 | } |
643 | |||
644 | err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0, | ||
645 | GFS2_EATYPE_SECURITY); | ||
646 | kfree(value); | ||
647 | kfree(name); | ||
648 | |||
649 | return err; | 639 | return err; |
650 | } | 640 | } |
651 | 641 | ||
642 | static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, | ||
643 | const struct qstr *qstr) | ||
644 | { | ||
645 | return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, | ||
646 | &gfs2_initxattrs, NULL); | ||
647 | } | ||
648 | |||
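gfs2_security_init shrinks to a single call above because the security layer now hands the filesystem an initxattrs callback together with an array of xattrs whose end is marked by a NULL name, so the filesystem can store however many attributes the LSM produces. The iteration convention, sketched standalone (struct xattr and the setter here are stand-ins for the kernel types):

    #include <stdio.h>
    #include <stddef.h>

    struct xattr {
        const char *name;     /* NULL name terminates the array */
        const void *value;
        size_t value_len;
    };

    static int set_security_xattr(const struct xattr *x)
    {
        printf("storing %s (%zu bytes)\n", x->name, x->value_len);
        return 0;
    }

    /* Shape of the initxattrs callback: walk until the NULL-name sentinel. */
    static int initxattrs(const struct xattr *xattr_array)
    {
        int err = 0;
        for (const struct xattr *x = xattr_array; x->name != NULL; x++) {
            err = set_security_xattr(x);
            if (err < 0)
                break;
        }
        return err;
    }

    int main(void)
    {
        struct xattr attrs[] = {
            { "security.selinux", "ctx", 3 },
            { NULL, NULL, 0 },
        };
        return initxattrs(attrs);
    }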
652 | /** | 649 | /** |
653 | * gfs2_create_inode - Create a new inode | 650 | * gfs2_create_inode - Create a new inode |
654 | * @dir: The parent directory | 651 | * @dir: The parent directory |
@@ -663,7 +660,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, | |||
663 | 660 | ||
664 | static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | 661 | static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, |
665 | unsigned int mode, dev_t dev, const char *symname, | 662 | unsigned int mode, dev_t dev, const char *symname, |
666 | unsigned int size) | 663 | unsigned int size, int excl) |
667 | { | 664 | { |
668 | const struct qstr *name = &dentry->d_name; | 665 | const struct qstr *name = &dentry->d_name; |
669 | struct gfs2_holder ghs[2]; | 666 | struct gfs2_holder ghs[2]; |
@@ -683,6 +680,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
683 | goto fail; | 680 | goto fail; |
684 | 681 | ||
685 | error = create_ok(dip, name, mode); | 682 | error = create_ok(dip, name, mode); |
683 | if ((error == -EEXIST) && S_ISREG(mode) && !excl) { | ||
684 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); | ||
685 | gfs2_glock_dq_uninit(ghs); | ||
686 | d_instantiate(dentry, inode); | ||
687 | return IS_ERR(inode) ? PTR_ERR(inode) : 0; | ||
688 | } | ||
686 | if (error) | 689 | if (error) |
687 | goto fail_gunlock; | 690 | goto fail_gunlock; |
688 | 691 | ||
@@ -725,21 +728,22 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
725 | brelse(bh); | 728 | brelse(bh); |
726 | 729 | ||
727 | gfs2_trans_end(sdp); | 730 | gfs2_trans_end(sdp); |
728 | if (dip->i_alloc->al_rgd) | 731 | gfs2_inplace_release(dip); |
729 | gfs2_inplace_release(dip); | ||
730 | gfs2_quota_unlock(dip); | 732 | gfs2_quota_unlock(dip); |
731 | gfs2_alloc_put(dip); | 733 | gfs2_alloc_put(dip); |
732 | gfs2_glock_dq_uninit_m(2, ghs); | ||
733 | mark_inode_dirty(inode); | 734 | mark_inode_dirty(inode); |
735 | gfs2_glock_dq_uninit_m(2, ghs); | ||
734 | d_instantiate(dentry, inode); | 736 | d_instantiate(dentry, inode); |
735 | return 0; | 737 | return 0; |
736 | 738 | ||
737 | fail_gunlock2: | 739 | fail_gunlock2: |
738 | gfs2_glock_dq_uninit(ghs + 1); | 740 | gfs2_glock_dq_uninit(ghs + 1); |
739 | if (inode && !IS_ERR(inode)) | ||
740 | iput(inode); | ||
741 | fail_gunlock: | 741 | fail_gunlock: |
742 | gfs2_glock_dq_uninit(ghs); | 742 | gfs2_glock_dq_uninit(ghs); |
743 | if (inode && !IS_ERR(inode)) { | ||
744 | set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); | ||
745 | iput(inode); | ||
746 | } | ||
743 | fail: | 747 | fail: |
744 | if (bh) | 748 | if (bh) |
745 | brelse(bh); | 749 | brelse(bh); |
@@ -758,24 +762,10 @@ fail: | |||
758 | static int gfs2_create(struct inode *dir, struct dentry *dentry, | 762 | static int gfs2_create(struct inode *dir, struct dentry *dentry, |
759 | int mode, struct nameidata *nd) | 763 | int mode, struct nameidata *nd) |
760 | { | 764 | { |
761 | struct inode *inode; | 765 | int excl = 0; |
762 | int ret; | 766 | if (nd && (nd->flags & LOOKUP_EXCL)) |
763 | 767 | excl = 1; | |
764 | for (;;) { | 768 | return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); |
765 | ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); | ||
766 | if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) | ||
767 | return ret; | ||
768 | |||
769 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); | ||
770 | if (inode) { | ||
771 | if (!IS_ERR(inode)) | ||
772 | break; | ||
773 | return PTR_ERR(inode); | ||
774 | } | ||
775 | } | ||
776 | |||
777 | d_instantiate(dentry, inode); | ||
778 | return 0; | ||
779 | } | 769 | } |
780 | 770 | ||
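gfs2_create above loses its retry loop: non-exclusive create is resolved inside gfs2_create_inode, where -EEXIST combined with !excl degrades into a lookup of the existing inode under the same directory lock, closing the create/lookup race the old loop papered over. A control-flow sketch of that create-or-open shape (try_create and lookup_existing are mock stand-ins):

    #include <stdio.h>
    #include <errno.h>

    static int file_exists = 1;

    static int try_create(const char *name)
    {
        (void)name;
        return file_exists ? -EEXIST : 0;
    }

    static int lookup_existing(const char *name)
    {
        printf("reusing existing \"%s\"\n", name);
        return 0;
    }

    /* EEXIST on a non-exclusive create becomes a plain lookup,
     * all under one lock - no retry loop in the caller. */
    static int create_or_open(const char *name, int excl)
    {
        int err = try_create(name);
        if (err == -EEXIST && !excl)
            return lookup_existing(name);
        return err;
    }

    int main(void)
    {
        return create_or_open("afile", 0) ? 1 : 0;
    }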
781 | /** | 771 | /** |
@@ -902,7 +892,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
902 | goto out_gunlock_q; | 892 | goto out_gunlock_q; |
903 | 893 | ||
904 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + | 894 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + |
905 | gfs2_rg_blocks(al) + | 895 | gfs2_rg_blocks(dip) + |
906 | 2 * RES_DINODE + RES_STATFS + | 896 | 2 * RES_DINODE + RES_STATFS + |
907 | RES_QUOTA, 0); | 897 | RES_QUOTA, 0); |
908 | if (error) | 898 | if (error) |
@@ -924,8 +914,9 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
924 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 914 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
925 | inc_nlink(&ip->i_inode); | 915 | inc_nlink(&ip->i_inode); |
926 | ip->i_inode.i_ctime = CURRENT_TIME; | 916 | ip->i_inode.i_ctime = CURRENT_TIME; |
927 | gfs2_dinode_out(ip, dibh->b_data); | 917 | ihold(inode); |
928 | mark_inode_dirty(&ip->i_inode); | 918 | d_instantiate(dentry, inode); |
919 | mark_inode_dirty(inode); | ||
929 | 920 | ||
930 | out_brelse: | 921 | out_brelse: |
931 | brelse(dibh); | 922 | brelse(dibh); |
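The gfs2_link change above moves d_instantiate inside the transaction and takes ihold first, so the dentry owns its own inode reference before anyone else can see or drop it. The grab-before-publish refcount pattern in miniature:

    #include <stdio.h>

    struct obj { int refcount; };

    static void hold(struct obj *o) { o->refcount++; }

    /* Publish o to a second owner: take the new reference *before*
     * the other owner can observe (and later drop) the pointer. */
    static void publish(struct obj *o, struct obj **slot)
    {
        hold(o);
        *slot = o;
    }

    int main(void)
    {
        struct obj inode = { .refcount = 1 };    /* caller's reference */
        struct obj *dentry_inode = NULL;
        publish(&inode, &dentry_inode);
        printf("refcount=%d\n", inode.refcount); /* 2: caller + dentry */
        return 0;
    }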
@@ -947,11 +938,6 @@ out_child: | |||
947 | out_parent: | 938 | out_parent: |
948 | gfs2_holder_uninit(ghs); | 939 | gfs2_holder_uninit(ghs); |
949 | gfs2_holder_uninit(ghs + 1); | 940 | gfs2_holder_uninit(ghs + 1); |
950 | if (!error) { | ||
951 | ihold(inode); | ||
952 | d_instantiate(dentry, inode); | ||
953 | mark_inode_dirty(inode); | ||
954 | } | ||
955 | return error; | 941 | return error; |
956 | } | 942 | } |
957 | 943 | ||
@@ -1024,8 +1010,6 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, | |||
1024 | clear_nlink(inode); | 1010 | clear_nlink(inode); |
1025 | else | 1011 | else |
1026 | drop_nlink(inode); | 1012 | drop_nlink(inode); |
1027 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
1028 | gfs2_dinode_out(ip, bh->b_data); | ||
1029 | mark_inode_dirty(inode); | 1013 | mark_inode_dirty(inode); |
1030 | if (inode->i_nlink == 0) | 1014 | if (inode->i_nlink == 0) |
1031 | gfs2_unlink_di(inode); | 1015 | gfs2_unlink_di(inode); |
@@ -1053,13 +1037,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) | |||
1053 | struct buffer_head *bh; | 1037 | struct buffer_head *bh; |
1054 | struct gfs2_holder ghs[3]; | 1038 | struct gfs2_holder ghs[3]; |
1055 | struct gfs2_rgrpd *rgd; | 1039 | struct gfs2_rgrpd *rgd; |
1056 | struct gfs2_holder ri_gh; | ||
1057 | int error; | 1040 | int error; |
1058 | 1041 | ||
1059 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
1060 | if (error) | ||
1061 | return error; | ||
1062 | |||
1063 | gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); | 1042 | gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); |
1064 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); | 1043 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); |
1065 | 1044 | ||
@@ -1116,7 +1095,6 @@ out_child: | |||
1116 | gfs2_glock_dq(ghs); | 1095 | gfs2_glock_dq(ghs); |
1117 | out_parent: | 1096 | out_parent: |
1118 | gfs2_holder_uninit(ghs); | 1097 | gfs2_holder_uninit(ghs); |
1119 | gfs2_glock_dq_uninit(&ri_gh); | ||
1120 | return error; | 1098 | return error; |
1121 | } | 1099 | } |
1122 | 1100 | ||
@@ -1139,7 +1117,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, | |||
1139 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) | 1117 | if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) |
1140 | return -ENAMETOOLONG; | 1118 | return -ENAMETOOLONG; |
1141 | 1119 | ||
1142 | return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); | 1120 | return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); |
1143 | } | 1121 | } |
1144 | 1122 | ||
1145 | /** | 1123 | /** |
@@ -1153,7 +1131,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, | |||
1153 | 1131 | ||
1154 | static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 1132 | static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) |
1155 | { | 1133 | { |
1156 | return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); | 1134 | return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); |
1157 | } | 1135 | } |
1158 | 1136 | ||
1159 | /** | 1137 | /** |
@@ -1168,7 +1146,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
1168 | static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, | 1146 | static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, |
1169 | dev_t dev) | 1147 | dev_t dev) |
1170 | { | 1148 | { |
1171 | return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); | 1149 | return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); |
1172 | } | 1150 | } |
1173 | 1151 | ||
1174 | /* | 1152 | /* |
@@ -1234,7 +1212,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
1234 | struct gfs2_inode *ip = GFS2_I(odentry->d_inode); | 1212 | struct gfs2_inode *ip = GFS2_I(odentry->d_inode); |
1235 | struct gfs2_inode *nip = NULL; | 1213 | struct gfs2_inode *nip = NULL; |
1236 | struct gfs2_sbd *sdp = GFS2_SB(odir); | 1214 | struct gfs2_sbd *sdp = GFS2_SB(odir); |
1237 | struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; | 1215 | struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; |
1238 | struct gfs2_rgrpd *nrgd; | 1216 | struct gfs2_rgrpd *nrgd; |
1239 | unsigned int num_gh; | 1217 | unsigned int num_gh; |
1240 | int dir_rename = 0; | 1218 | int dir_rename = 0; |
@@ -1248,10 +1226,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
1248 | return 0; | 1226 | return 0; |
1249 | } | 1227 | } |
1250 | 1228 | ||
1251 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
1252 | if (error) | ||
1253 | return error; | ||
1254 | |||
1255 | if (odip != ndip) { | 1229 | if (odip != ndip) { |
1256 | error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, | 1230 | error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, |
1257 | 0, &r_gh); | 1231 | 0, &r_gh); |
@@ -1388,12 +1362,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
1388 | 1362 | ||
1389 | al->al_requested = sdp->sd_max_dirres; | 1363 | al->al_requested = sdp->sd_max_dirres; |
1390 | 1364 | ||
1391 | error = gfs2_inplace_reserve_ri(ndip); | 1365 | error = gfs2_inplace_reserve(ndip); |
1392 | if (error) | 1366 | if (error) |
1393 | goto out_gunlock_q; | 1367 | goto out_gunlock_q; |
1394 | 1368 | ||
1395 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + | 1369 | error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + |
1396 | gfs2_rg_blocks(al) + | 1370 | gfs2_rg_blocks(ndip) + |
1397 | 4 * RES_DINODE + 4 * RES_LEAF + | 1371 | 4 * RES_DINODE + 4 * RES_LEAF + |
1398 | RES_STATFS + RES_QUOTA + 4, 0); | 1372 | RES_STATFS + RES_QUOTA + 4, 0); |
1399 | if (error) | 1373 | if (error) |
@@ -1459,7 +1433,6 @@ out_gunlock_r: | |||
1459 | if (r_gh.gh_gl) | 1433 | if (r_gh.gh_gl) |
1460 | gfs2_glock_dq_uninit(&r_gh); | 1434 | gfs2_glock_dq_uninit(&r_gh); |
1461 | out: | 1435 | out: |
1462 | gfs2_glock_dq_uninit(&ri_gh); | ||
1463 | return error; | 1436 | return error; |
1464 | } | 1437 | } |
1465 | 1438 | ||
@@ -1563,21 +1536,10 @@ int gfs2_permission(struct inode *inode, int mask) | |||
1563 | return error; | 1536 | return error; |
1564 | } | 1537 | } |
1565 | 1538 | ||
1566 | static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) | 1539 | static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) |
1567 | { | 1540 | { |
1568 | struct inode *inode = &ip->i_inode; | ||
1569 | struct buffer_head *dibh; | ||
1570 | int error; | ||
1571 | |||
1572 | error = gfs2_meta_inode_buffer(ip, &dibh); | ||
1573 | if (error) | ||
1574 | return error; | ||
1575 | |||
1576 | setattr_copy(inode, attr); | 1541 | setattr_copy(inode, attr); |
1577 | mark_inode_dirty(inode); | 1542 | mark_inode_dirty(inode); |
1578 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
1579 | gfs2_dinode_out(ip, dibh->b_data); | ||
1580 | brelse(dibh); | ||
1581 | return 0; | 1543 | return 0; |
1582 | } | 1544 | } |
1583 | 1545 | ||
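__gfs2_setattr_simple above stops writing the dinode buffer by hand: it updates the in-core inode and marks it dirty, leaving the on-disk copy to the generic write_inode path. The mark-dirty-and-defer pattern, sketched (field names are illustrative):

    #include <stdio.h>

    struct inode { int mode; int dirty; };

    static void setattr(struct inode *i, int mode)
    {
        i->mode = mode;      /* update in-core state only */
        i->dirty = 1;        /* defer the disk write */
    }

    /* Called later by the writeback machinery, not by setattr itself. */
    static void write_inode(struct inode *i)
    {
        if (!i->dirty)
            return;
        printf("flushing inode, mode=%o\n", i->mode);
        i->dirty = 0;
    }

    int main(void)
    {
        struct inode i = { 0644, 0 };
        setattr(&i, 0600);
        write_inode(&i);     /* one write absorbs any number of setattrs */
        return 0;
    }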
@@ -1589,19 +1551,19 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) | |||
1589 | * Returns: errno | 1551 | * Returns: errno |
1590 | */ | 1552 | */ |
1591 | 1553 | ||
1592 | int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) | 1554 | int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) |
1593 | { | 1555 | { |
1594 | int error; | 1556 | int error; |
1595 | 1557 | ||
1596 | if (current->journal_info) | 1558 | if (current->journal_info) |
1597 | return __gfs2_setattr_simple(ip, attr); | 1559 | return __gfs2_setattr_simple(inode, attr); |
1598 | 1560 | ||
1599 | error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); | 1561 | error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); |
1600 | if (error) | 1562 | if (error) |
1601 | return error; | 1563 | return error; |
1602 | 1564 | ||
1603 | error = __gfs2_setattr_simple(ip, attr); | 1565 | error = __gfs2_setattr_simple(inode, attr); |
1604 | gfs2_trans_end(GFS2_SB(&ip->i_inode)); | 1566 | gfs2_trans_end(GFS2_SB(inode)); |
1605 | return error; | 1567 | return error; |
1606 | } | 1568 | } |
1607 | 1569 | ||
@@ -1639,7 +1601,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) | |||
1639 | if (error) | 1601 | if (error) |
1640 | goto out_gunlock_q; | 1602 | goto out_gunlock_q; |
1641 | 1603 | ||
1642 | error = gfs2_setattr_simple(ip, attr); | 1604 | error = gfs2_setattr_simple(inode, attr); |
1643 | if (error) | 1605 | if (error) |
1644 | goto out_end_trans; | 1606 | goto out_end_trans; |
1645 | 1607 | ||
@@ -1695,12 +1657,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
1695 | else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) | 1657 | else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) |
1696 | error = gfs2_acl_chmod(ip, attr); | 1658 | error = gfs2_acl_chmod(ip, attr); |
1697 | else | 1659 | else |
1698 | error = gfs2_setattr_simple(ip, attr); | 1660 | error = gfs2_setattr_simple(inode, attr); |
1699 | 1661 | ||
1700 | out: | 1662 | out: |
1701 | gfs2_glock_dq_uninit(&i_gh); | ||
1702 | if (!error) | 1663 | if (!error) |
1703 | mark_inode_dirty(inode); | 1664 | mark_inode_dirty(inode); |
1665 | gfs2_glock_dq_uninit(&i_gh); | ||
1704 | return error; | 1666 | return error; |
1705 | } | 1667 | } |
1706 | 1668 | ||
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 8d90e0c07672..276e7b52b658 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,7 +109,7 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); | |||
109 | extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | 109 | extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, |
110 | int is_root); | 110 | int is_root); |
111 | extern int gfs2_permission(struct inode *inode, int mask); | 111 | extern int gfs2_permission(struct inode *inode, int mask); |
112 | extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); | 112 | extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); |
113 | extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); | 113 | extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); |
114 | extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); | 114 | extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); |
115 | 115 | ||
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 85c62923ee29..598646434362 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) | |||
624 | bh->b_end_io = end_buffer_write_sync; | 624 | bh->b_end_io = end_buffer_write_sync; |
625 | get_bh(bh); | 625 | get_bh(bh); |
626 | if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) | 626 | if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) |
627 | submit_bh(WRITE_SYNC | REQ_META, bh); | 627 | submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); |
628 | else | 628 | else |
629 | submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); | 629 | submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh); |
630 | wait_on_buffer(bh); | 630 | wait_on_buffer(bh); |
631 | 631 | ||
632 | if (!buffer_uptodate(bh)) | 632 | if (!buffer_uptodate(bh)) |
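The log-header writes above gain REQ_PRIO so the block layer treats journal metadata as high-priority I/O; the barrier decision (plain WRITE_SYNC versus WRITE_FLUSH_FUA) is unchanged. Composing and testing such flag masks, in a standalone sketch (the numeric bit values here are illustrative, not the kernel's):

    #include <stdio.h>

    enum {
        REQ_META  = 1 << 0,    /* illustrative values only */
        REQ_PRIO  = 1 << 1,
        REQ_FLUSH = 1 << 2,
        REQ_FUA   = 1 << 3,
        REQ_SYNC  = 1 << 4,
    };

    int main(void)
    {
        int nobarriers = 0;
        int flags = REQ_META | REQ_PRIO;   /* always tag journal metadata */

        if (nobarriers)
            flags |= REQ_SYNC;
        else
            flags |= REQ_SYNC | REQ_FLUSH | REQ_FUA;

        printf("fua=%d prio=%d\n", !!(flags & REQ_FUA), !!(flags & REQ_PRIO));
        return 0;
    }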
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 05bbb124699f..0301be655b12 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -60,6 +60,29 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) | |||
60 | trace_gfs2_pin(bd, 1); | 60 | trace_gfs2_pin(bd, 1); |
61 | } | 61 | } |
62 | 62 | ||
63 | static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) | ||
64 | { | ||
65 | return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; | ||
66 | } | ||
67 | |||
68 | static void maybe_release_space(struct gfs2_bufdata *bd) | ||
69 | { | ||
70 | struct gfs2_glock *gl = bd->bd_gl; | ||
71 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
72 | struct gfs2_rgrpd *rgd = gl->gl_object; | ||
73 | unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; | ||
74 | struct gfs2_bitmap *bi = rgd->rd_bits + index; | ||
75 | |||
76 | if (bi->bi_clone == 0) | ||
77 | return; | ||
78 | if (sdp->sd_args.ar_discard) | ||
79 | gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); | ||
80 | memcpy(bi->bi_clone + bi->bi_offset, | ||
81 | bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); | ||
82 | clear_bit(GBF_FULL, &bi->bi_flags); | ||
83 | rgd->rd_free_clone = rgd->rd_free; | ||
84 | } | ||
85 | |||
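maybe_release_space above runs when an rgrp bitmap buffer is unpinned: it copies the committed bitmap over the clone and resets rd_free_clone, so blocks freed in the last transaction become allocatable (and optionally discarded) at log-commit time instead of waiting for the rgrp glock to be released. The copy itself is a plain memcpy of the bitmap window, shown here with toy buffers:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* b_data: committed bitmap; bi_clone: stale copy that was
         * protecting freed-but-not-yet-committed blocks. */
        unsigned char b_data[16]   = { 0x00, 0x0f };
        unsigned char bi_clone[16] = { 0xff, 0xff };
        size_t bi_offset = 0, bi_len = 16;
        unsigned int rd_free = 100, rd_free_clone = 90;

        memcpy(bi_clone + bi_offset, b_data + bi_offset, bi_len);
        rd_free_clone = rd_free;    /* clone-protected space released */

        printf("free_clone=%u first byte=%#x\n", rd_free_clone, bi_clone[0]);
        return 0;
    }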
63 | /** | 86 | /** |
64 | * gfs2_unpin - Unpin a buffer | 87 | * gfs2_unpin - Unpin a buffer |
65 | * @sdp: the filesystem the buffer belongs to | 88 | * @sdp: the filesystem the buffer belongs to |
@@ -81,6 +104,9 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, | |||
81 | mark_buffer_dirty(bh); | 104 | mark_buffer_dirty(bh); |
82 | clear_buffer_pinned(bh); | 105 | clear_buffer_pinned(bh); |
83 | 106 | ||
107 | if (buffer_is_rgrp(bd)) | ||
108 | maybe_release_space(bd); | ||
109 | |||
84 | spin_lock(&sdp->sd_ail_lock); | 110 | spin_lock(&sdp->sd_ail_lock); |
85 | if (bd->bd_ail) { | 111 | if (bd->bd_ail) { |
86 | list_del(&bd->bd_ail_st_list); | 112 | list_del(&bd->bd_ail_st_list); |
@@ -469,42 +495,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) | |||
469 | gfs2_revoke_clean(sdp); | 495 | gfs2_revoke_clean(sdp); |
470 | } | 496 | } |
471 | 497 | ||
472 | static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) | ||
473 | { | ||
474 | struct gfs2_rgrpd *rgd; | ||
475 | struct gfs2_trans *tr = current->journal_info; | ||
476 | |||
477 | tr->tr_touched = 1; | ||
478 | |||
479 | rgd = container_of(le, struct gfs2_rgrpd, rd_le); | ||
480 | |||
481 | gfs2_log_lock(sdp); | ||
482 | if (!list_empty(&le->le_list)){ | ||
483 | gfs2_log_unlock(sdp); | ||
484 | return; | ||
485 | } | ||
486 | gfs2_rgrp_bh_hold(rgd); | ||
487 | sdp->sd_log_num_rg++; | ||
488 | list_add(&le->le_list, &sdp->sd_log_le_rg); | ||
489 | gfs2_log_unlock(sdp); | ||
490 | } | ||
491 | |||
492 | static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) | ||
493 | { | ||
494 | struct list_head *head = &sdp->sd_log_le_rg; | ||
495 | struct gfs2_rgrpd *rgd; | ||
496 | |||
497 | while (!list_empty(head)) { | ||
498 | rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); | ||
499 | list_del_init(&rgd->rd_le.le_list); | ||
500 | sdp->sd_log_num_rg--; | ||
501 | |||
502 | gfs2_rgrp_repolish_clones(rgd); | ||
503 | gfs2_rgrp_bh_put(rgd); | ||
504 | } | ||
505 | gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); | ||
506 | } | ||
507 | |||
508 | /** | 498 | /** |
509 | * databuf_lo_add - Add a databuf to the transaction. | 499 | * databuf_lo_add - Add a databuf to the transaction. |
510 | * | 500 | * |
@@ -705,8 +695,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, | |||
705 | 695 | ||
706 | brelse(bh_log); | 696 | brelse(bh_log); |
707 | brelse(bh_ip); | 697 | brelse(bh_ip); |
708 | if (error) | ||
709 | break; | ||
710 | 698 | ||
711 | sdp->sd_replayed_blocks++; | 699 | sdp->sd_replayed_blocks++; |
712 | } | 700 | } |
@@ -771,8 +759,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { | |||
771 | }; | 759 | }; |
772 | 760 | ||
773 | const struct gfs2_log_operations gfs2_rg_lops = { | 761 | const struct gfs2_log_operations gfs2_rg_lops = { |
774 | .lo_add = rg_lo_add, | ||
775 | .lo_after_commit = rg_lo_after_commit, | ||
776 | .lo_name = "rg", | 762 | .lo_name = "rg", |
777 | }; | 763 | }; |
778 | 764 | ||
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 747238cd9f96..be29858900f6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
37 | { | 37 | { |
38 | struct buffer_head *bh, *head; | 38 | struct buffer_head *bh, *head; |
39 | int nr_underway = 0; | 39 | int nr_underway = 0; |
40 | int write_op = REQ_META | | 40 | int write_op = REQ_META | REQ_PRIO | |
41 | (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); | 41 | (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); |
42 | 42 | ||
43 | BUG_ON(!PageLocked(page)); | 43 | BUG_ON(!PageLocked(page)); |
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, | |||
225 | } | 225 | } |
226 | bh->b_end_io = end_buffer_read_sync; | 226 | bh->b_end_io = end_buffer_read_sync; |
227 | get_bh(bh); | 227 | get_bh(bh); |
228 | submit_bh(READ_SYNC | REQ_META, bh); | 228 | submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); |
229 | if (!(flags & DIO_WAIT)) | 229 | if (!(flags & DIO_WAIT)) |
230 | return 0; | 230 | return 0; |
231 | 231 | ||
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) | |||
435 | if (buffer_uptodate(first_bh)) | 435 | if (buffer_uptodate(first_bh)) |
436 | goto out; | 436 | goto out; |
437 | if (!buffer_locked(first_bh)) | 437 | if (!buffer_locked(first_bh)) |
438 | ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); | 438 | ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh); |
439 | 439 | ||
440 | dblock++; | 440 | dblock++; |
441 | extlen--; | 441 | extlen--; |
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3bc073a4cf82..7e823bbd2453 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -77,8 +77,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
77 | 77 | ||
78 | spin_lock_init(&sdp->sd_rindex_spin); | 78 | spin_lock_init(&sdp->sd_rindex_spin); |
79 | mutex_init(&sdp->sd_rindex_mutex); | 79 | mutex_init(&sdp->sd_rindex_mutex); |
80 | INIT_LIST_HEAD(&sdp->sd_rindex_list); | 80 | sdp->sd_rindex_tree.rb_node = NULL; |
81 | INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); | ||
82 | 81 | ||
83 | INIT_LIST_HEAD(&sdp->sd_jindex_list); | 82 | INIT_LIST_HEAD(&sdp->sd_jindex_list); |
84 | spin_lock_init(&sdp->sd_jindex_spin); | 83 | spin_lock_init(&sdp->sd_jindex_spin); |
@@ -224,7 +223,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) | |||
224 | 223 | ||
225 | bio->bi_end_io = end_bio_io_page; | 224 | bio->bi_end_io = end_bio_io_page; |
226 | bio->bi_private = page; | 225 | bio->bi_private = page; |
227 | submit_bio(READ_SYNC | REQ_META, bio); | 226 | submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio); |
228 | wait_on_page_locked(page); | 227 | wait_on_page_locked(page); |
229 | bio_put(bio); | 228 | bio_put(bio); |
230 | if (!PageUptodate(page)) { | 229 | if (!PageUptodate(page)) { |
@@ -652,7 +651,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) | |||
652 | fs_err(sdp, "can't lookup journal index: %d\n", error); | 651 | fs_err(sdp, "can't lookup journal index: %d\n", error); |
653 | return PTR_ERR(sdp->sd_jindex); | 652 | return PTR_ERR(sdp->sd_jindex); |
654 | } | 653 | } |
655 | ip = GFS2_I(sdp->sd_jindex); | ||
656 | 654 | ||
657 | /* Load in the journal index special file */ | 655 | /* Load in the journal index special file */ |
658 | 656 | ||
@@ -764,7 +762,6 @@ fail: | |||
764 | static int init_inodes(struct gfs2_sbd *sdp, int undo) | 762 | static int init_inodes(struct gfs2_sbd *sdp, int undo) |
765 | { | 763 | { |
766 | int error = 0; | 764 | int error = 0; |
767 | struct gfs2_inode *ip; | ||
768 | struct inode *master = sdp->sd_master_dir->d_inode; | 765 | struct inode *master = sdp->sd_master_dir->d_inode; |
769 | 766 | ||
770 | if (undo) | 767 | if (undo) |
@@ -789,7 +786,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) | |||
789 | fs_err(sdp, "can't get resource index inode: %d\n", error); | 786 | fs_err(sdp, "can't get resource index inode: %d\n", error); |
790 | goto fail_statfs; | 787 | goto fail_statfs; |
791 | } | 788 | } |
792 | ip = GFS2_I(sdp->sd_rindex); | ||
793 | sdp->sd_rindex_uptodate = 0; | 789 | sdp->sd_rindex_uptodate = 0; |
794 | 790 | ||
795 | /* Read in the quota inode */ | 791 | /* Read in the quota inode */ |
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 42e8d23bc047..7e528dc14f85 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -638,15 +638,18 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, | |||
638 | unsigned long index = loc >> PAGE_CACHE_SHIFT; | 638 | unsigned long index = loc >> PAGE_CACHE_SHIFT; |
639 | unsigned offset = loc & (PAGE_CACHE_SIZE - 1); | 639 | unsigned offset = loc & (PAGE_CACHE_SIZE - 1); |
640 | unsigned blocksize, iblock, pos; | 640 | unsigned blocksize, iblock, pos; |
641 | struct buffer_head *bh, *dibh; | 641 | struct buffer_head *bh; |
642 | struct page *page; | 642 | struct page *page; |
643 | void *kaddr, *ptr; | 643 | void *kaddr, *ptr; |
644 | struct gfs2_quota q, *qp; | 644 | struct gfs2_quota q, *qp; |
645 | int err, nbytes; | 645 | int err, nbytes; |
646 | u64 size; | 646 | u64 size; |
647 | 647 | ||
648 | if (gfs2_is_stuffed(ip)) | 648 | if (gfs2_is_stuffed(ip)) { |
649 | gfs2_unstuff_dinode(ip, NULL); | 649 | err = gfs2_unstuff_dinode(ip, NULL); |
650 | if (err) | ||
651 | return err; | ||
652 | } | ||
650 | 653 | ||
651 | memset(&q, 0, sizeof(struct gfs2_quota)); | 654 | memset(&q, 0, sizeof(struct gfs2_quota)); |
652 | err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); | 655 | err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); |
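gfs2_adjust_quota above used to ignore a failing gfs2_unstuff_dinode and press on against a still-stuffed inode; the error now short-circuits the update (gfs2_quota_lock gets the same treatment for gfs2_quota_hold later in this file). The guard pattern, isolated with mock names:

    #include <stdio.h>
    #include <errno.h>

    static int unstuff(int *stuffed)
    {
        (void)stuffed;
        return -ENOSPC;      /* pretend the conversion failed */
    }

    static int adjust(int *stuffed)
    {
        if (*stuffed) {
            int err = unstuff(stuffed);
            if (err)
                return err;  /* propagate; do not press on */
        }
        /* ... the write would only run against an unstuffed inode ... */
        return 0;
    }

    int main(void)
    {
        int stuffed = 1;
        printf("adjust() = %d\n", adjust(&stuffed));  /* negative errno */
        return 0;
    }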
@@ -709,7 +712,7 @@ get_a_page: | |||
709 | set_buffer_uptodate(bh); | 712 | set_buffer_uptodate(bh); |
710 | 713 | ||
711 | if (!buffer_uptodate(bh)) { | 714 | if (!buffer_uptodate(bh)) { |
712 | ll_rw_block(READ_META, 1, &bh); | 715 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
713 | wait_on_buffer(bh); | 716 | wait_on_buffer(bh); |
714 | if (!buffer_uptodate(bh)) | 717 | if (!buffer_uptodate(bh)) |
715 | goto unlock_out; | 718 | goto unlock_out; |
@@ -736,22 +739,13 @@ get_a_page: | |||
736 | goto get_a_page; | 739 | goto get_a_page; |
737 | } | 740 | } |
738 | 741 | ||
739 | /* Update the disk inode timestamp and size (if extended) */ | ||
740 | err = gfs2_meta_inode_buffer(ip, &dibh); | ||
741 | if (err) | ||
742 | goto out; | ||
743 | |||
744 | size = loc + sizeof(struct gfs2_quota); | 742 | size = loc + sizeof(struct gfs2_quota); |
745 | if (size > inode->i_size) | 743 | if (size > inode->i_size) |
746 | i_size_write(inode, size); | 744 | i_size_write(inode, size); |
747 | inode->i_mtime = inode->i_atime = CURRENT_TIME; | 745 | inode->i_mtime = inode->i_atime = CURRENT_TIME; |
748 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | ||
749 | gfs2_dinode_out(ip, dibh->b_data); | ||
750 | brelse(dibh); | ||
751 | mark_inode_dirty(inode); | 746 | mark_inode_dirty(inode); |
752 | |||
753 | out: | ||
754 | return err; | 747 | return err; |
748 | |||
755 | unlock_out: | 749 | unlock_out: |
756 | unlock_page(page); | 750 | unlock_page(page); |
757 | page_cache_release(page); | 751 | page_cache_release(page); |
@@ -822,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
822 | goto out_alloc; | 816 | goto out_alloc; |
823 | 817 | ||
824 | if (nalloc) | 818 | if (nalloc) |
825 | blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS; | 819 | blocks += gfs2_rg_blocks(ip) + nalloc * ind_blocks + RES_STATFS; |
826 | 820 | ||
827 | error = gfs2_trans_begin(sdp, blocks, 0); | 821 | error = gfs2_trans_begin(sdp, blocks, 0); |
828 | if (error) | 822 | if (error) |
@@ -936,7 +930,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) | |||
936 | unsigned int x; | 930 | unsigned int x; |
937 | int error = 0; | 931 | int error = 0; |
938 | 932 | ||
939 | gfs2_quota_hold(ip, uid, gid); | 933 | error = gfs2_quota_hold(ip, uid, gid); |
934 | if (error) | ||
935 | return error; | ||
940 | 936 | ||
941 | if (capable(CAP_SYS_RESOURCE) || | 937 | if (capable(CAP_SYS_RESOURCE) || |
942 | sdp->sd_args.ar_quota != GFS2_QUOTA_ON) | 938 | sdp->sd_args.ar_quota != GFS2_QUOTA_ON) |
@@ -1607,7 +1603,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, | |||
1607 | error = gfs2_inplace_reserve(ip); | 1603 | error = gfs2_inplace_reserve(ip); |
1608 | if (error) | 1604 | if (error) |
1609 | goto out_alloc; | 1605 | goto out_alloc; |
1610 | blocks += gfs2_rg_blocks(al); | 1606 | blocks += gfs2_rg_blocks(ip); |
1611 | } | 1607 | } |
1612 | 1608 | ||
1613 | /* Some quotas span block boundaries and can update two blocks, | 1609 | /* Some quotas span block boundaries and can update two blocks, |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7f8af1eb02de..96bd6d759f29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/gfs2_ondisk.h> | 15 | #include <linux/gfs2_ondisk.h> |
16 | #include <linux/prefetch.h> | 16 | #include <linux/prefetch.h> |
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/rbtree.h> | ||
18 | 19 | ||
19 | #include "gfs2.h" | 20 | #include "gfs2.h" |
20 | #include "incore.h" | 21 | #include "incore.h" |
@@ -328,18 +329,22 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) | |||
328 | 329 | ||
329 | struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) | 330 | struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) |
330 | { | 331 | { |
331 | struct gfs2_rgrpd *rgd; | 332 | struct rb_node **newn; |
333 | struct gfs2_rgrpd *cur; | ||
332 | 334 | ||
333 | spin_lock(&sdp->sd_rindex_spin); | 335 | spin_lock(&sdp->sd_rindex_spin); |
334 | 336 | newn = &sdp->sd_rindex_tree.rb_node; | |
335 | list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { | 337 | while (*newn) { |
336 | if (rgrp_contains_block(rgd, blk)) { | 338 | cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); |
337 | list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); | 339 | if (blk < cur->rd_addr) |
340 | newn = &((*newn)->rb_left); | ||
341 | else if (blk >= cur->rd_data0 + cur->rd_data) | ||
342 | newn = &((*newn)->rb_right); | ||
343 | else { | ||
338 | spin_unlock(&sdp->sd_rindex_spin); | 344 | spin_unlock(&sdp->sd_rindex_spin); |
339 | return rgd; | 345 | return cur; |
340 | } | 346 | } |
341 | } | 347 | } |
342 | |||
343 | spin_unlock(&sdp->sd_rindex_spin); | 348 | spin_unlock(&sdp->sd_rindex_spin); |
344 | 349 | ||
345 | return NULL; | 350 | return NULL; |
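gfs2_blk2rgrpd above replaces the MRU-list scan with an O(log n) descent of the rgrp tree: go left when the block precedes the rgrp's start address, right when it lies at or past rd_data0 + rd_data, otherwise the block falls inside this rgrp. The same range search over a plain binary search tree (simplified to one start/length pair per node; the rgrp version distinguishes the header address rd_addr from the data window at rd_data0, and embeds struct rb_node in the rgrp):

    #include <stdio.h>
    #include <stddef.h>

    struct range_node {
        unsigned long long start, len;        /* covers [start, start+len) */
        struct range_node *left, *right;
    };

    static struct range_node *range_lookup(struct range_node *n,
                                           unsigned long long blk)
    {
        while (n) {
            if (blk < n->start)
                n = n->left;                  /* before this range */
            else if (blk >= n->start + n->len)
                n = n->right;                 /* past this range */
            else
                return n;                     /* inside it */
        }
        return NULL;
    }

    int main(void)
    {
        struct range_node a    = { 0,   100, NULL, NULL };
        struct range_node c    = { 200, 100, NULL, NULL };
        struct range_node root = { 100, 100, &a, &c };

        struct range_node *hit = range_lookup(&root, 250);
        printf("block 250 -> range starting at %llu\n",
               hit ? hit->start : 0);
        return 0;
    }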
@@ -354,8 +359,15 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) | |||
354 | 359 | ||
355 | struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) | 360 | struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) |
356 | { | 361 | { |
357 | gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); | 362 | const struct rb_node *n; |
358 | return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); | 363 | struct gfs2_rgrpd *rgd; |
364 | |||
365 | spin_lock(&sdp->sd_rindex_spin); | ||
366 | n = rb_first(&sdp->sd_rindex_tree); | ||
367 | rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); | ||
368 | spin_unlock(&sdp->sd_rindex_spin); | ||
369 | |||
370 | return rgd; | ||
359 | } | 371 | } |
360 | 372 | ||
361 | /** | 373 | /** |
@@ -367,47 +379,60 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) | |||
367 | 379 | ||
368 | struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) | 380 | struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) |
369 | { | 381 | { |
370 | if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) | 382 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
383 | const struct rb_node *n; | ||
384 | |||
385 | spin_lock(&sdp->sd_rindex_spin); | ||
386 | n = rb_next(&rgd->rd_node); | ||
387 | if (n == NULL) | ||
388 | n = rb_first(&sdp->sd_rindex_tree); | ||
389 | |||
390 | if (unlikely(&rgd->rd_node == n)) { | ||
391 | spin_unlock(&sdp->sd_rindex_spin); | ||
371 | return NULL; | 392 | return NULL; |
372 | return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); | 393 | } |
394 | rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); | ||
395 | spin_unlock(&sdp->sd_rindex_spin); | ||
396 | return rgd; | ||
373 | } | 397 | } |
374 | 398 | ||
375 | static void clear_rgrpdi(struct gfs2_sbd *sdp) | 399 | void gfs2_free_clones(struct gfs2_rgrpd *rgd) |
376 | { | 400 | { |
377 | struct list_head *head; | 401 | int x; |
402 | |||
403 | for (x = 0; x < rgd->rd_length; x++) { | ||
404 | struct gfs2_bitmap *bi = rgd->rd_bits + x; | ||
405 | kfree(bi->bi_clone); | ||
406 | bi->bi_clone = NULL; | ||
407 | } | ||
408 | } | ||
409 | |||
410 | void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) | ||
411 | { | ||
412 | struct rb_node *n; | ||
378 | struct gfs2_rgrpd *rgd; | 413 | struct gfs2_rgrpd *rgd; |
379 | struct gfs2_glock *gl; | 414 | struct gfs2_glock *gl; |
380 | 415 | ||
381 | spin_lock(&sdp->sd_rindex_spin); | 416 | while ((n = rb_first(&sdp->sd_rindex_tree))) { |
382 | sdp->sd_rindex_forward = NULL; | 417 | rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); |
383 | spin_unlock(&sdp->sd_rindex_spin); | ||
384 | |||
385 | head = &sdp->sd_rindex_list; | ||
386 | while (!list_empty(head)) { | ||
387 | rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); | ||
388 | gl = rgd->rd_gl; | 418 | gl = rgd->rd_gl; |
389 | 419 | ||
390 | list_del(&rgd->rd_list); | 420 | rb_erase(n, &sdp->sd_rindex_tree); |
391 | list_del(&rgd->rd_list_mru); | ||
392 | 421 | ||
393 | if (gl) { | 422 | if (gl) { |
423 | spin_lock(&gl->gl_spin); | ||
394 | gl->gl_object = NULL; | 424 | gl->gl_object = NULL; |
425 | spin_unlock(&gl->gl_spin); | ||
395 | gfs2_glock_add_to_lru(gl); | 426 | gfs2_glock_add_to_lru(gl); |
396 | gfs2_glock_put(gl); | 427 | gfs2_glock_put(gl); |
397 | } | 428 | } |
398 | 429 | ||
430 | gfs2_free_clones(rgd); | ||
399 | kfree(rgd->rd_bits); | 431 | kfree(rgd->rd_bits); |
400 | kmem_cache_free(gfs2_rgrpd_cachep, rgd); | 432 | kmem_cache_free(gfs2_rgrpd_cachep, rgd); |
401 | } | 433 | } |
402 | } | 434 | } |
403 | 435 | ||
404 | void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) | ||
405 | { | ||
406 | mutex_lock(&sdp->sd_rindex_mutex); | ||
407 | clear_rgrpdi(sdp); | ||
408 | mutex_unlock(&sdp->sd_rindex_mutex); | ||
409 | } | ||
410 | |||
411 | static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) | 436 | static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) |
412 | { | 437 | { |
413 | printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); | 438 | printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); |
@@ -524,22 +549,34 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) | |||
524 | return total_data; | 549 | return total_data; |
525 | } | 550 | } |
526 | 551 | ||
527 | static void gfs2_rindex_in(struct gfs2_rgrpd *rgd, const void *buf) | 552 | static void rgd_insert(struct gfs2_rgrpd *rgd) |
528 | { | 553 | { |
529 | const struct gfs2_rindex *str = buf; | 554 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
555 | struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; | ||
556 | |||
557 | /* Figure out where to put new node */ | ||
558 | while (*newn) { | ||
559 | struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, | ||
560 | rd_node); | ||
561 | |||
562 | parent = *newn; | ||
563 | if (rgd->rd_addr < cur->rd_addr) | ||
564 | newn = &((*newn)->rb_left); | ||
565 | else if (rgd->rd_addr > cur->rd_addr) | ||
566 | newn = &((*newn)->rb_right); | ||
567 | else | ||
568 | return; | ||
569 | } | ||
530 | 570 | ||
531 | rgd->rd_addr = be64_to_cpu(str->ri_addr); | 571 | rb_link_node(&rgd->rd_node, parent, newn); |
532 | rgd->rd_length = be32_to_cpu(str->ri_length); | 572 | rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); |
533 | rgd->rd_data0 = be64_to_cpu(str->ri_data0); | ||
534 | rgd->rd_data = be32_to_cpu(str->ri_data); | ||
535 | rgd->rd_bitbytes = be32_to_cpu(str->ri_bitbytes); | ||
536 | } | 573 | } |
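rgd_insert() above is the stock rbtree insertion idiom: walk down recording the parent and the link pointer to patch, bail out if the key already exists, then let rb_link_node() and rb_insert_color() splice the node in and rebalance. A sketch of the same walk over the rgrp_node type from the lookup sketch earlier, minus the red-black rebalancing that the kernel helpers supply:

/* Unbalanced-BST version of rgd_insert(), keyed on addr. */
static void rgrp_insert(struct rgrp_node **root, struct rgrp_node *rgd)
{
    struct rgrp_node **newn = root;

    while (*newn) {
        struct rgrp_node *cur = *newn;

        if (rgd->addr < cur->addr)
            newn = &cur->left;
        else if (rgd->addr > cur->addr)
            newn = &cur->right;
        else
            return;                 /* duplicate key: already indexed */
    }
    rgd->left = rgd->right = NULL;
    *newn = rgd;                    /* rb_link_node() equivalent, minus rebalance */
}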
537 | 574 | ||
538 | /** | 575 | /** |
539 | * read_rindex_entry - Pull in a new resource index entry from the disk | 576 | * read_rindex_entry - Pull in a new resource index entry from the disk |
540 | * @gl: The glock covering the rindex inode | 577 | * @gl: The glock covering the rindex inode |
541 | * | 578 | * |
542 | * Returns: 0 on success, error code otherwise | 579 | * Returns: 0 on success, > 0 on EOF, error code otherwise |
543 | */ | 580 | */ |
544 | 581 | ||
545 | static int read_rindex_entry(struct gfs2_inode *ip, | 582 | static int read_rindex_entry(struct gfs2_inode *ip, |
@@ -547,44 +584,53 @@ static int read_rindex_entry(struct gfs2_inode *ip, | |||
547 | { | 584 | { |
548 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 585 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
549 | loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); | 586 | loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); |
550 | char buf[sizeof(struct gfs2_rindex)]; | 587 | struct gfs2_rindex buf; |
551 | int error; | 588 | int error; |
552 | struct gfs2_rgrpd *rgd; | 589 | struct gfs2_rgrpd *rgd; |
553 | 590 | ||
554 | error = gfs2_internal_read(ip, ra_state, buf, &pos, | 591 | if (pos >= i_size_read(&ip->i_inode)) |
592 | return 1; | ||
593 | |||
594 | error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, | ||
555 | sizeof(struct gfs2_rindex)); | 595 | sizeof(struct gfs2_rindex)); |
556 | if (!error) | 596 | |
557 | return 0; | 597 | if (error != sizeof(struct gfs2_rindex)) |
558 | if (error != sizeof(struct gfs2_rindex)) { | 598 | return (error == 0) ? 1 : error; |
559 | if (error > 0) | ||
560 | error = -EIO; | ||
561 | return error; | ||
562 | } | ||
563 | 599 | ||
564 | rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); | 600 | rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); |
565 | error = -ENOMEM; | 601 | error = -ENOMEM; |
566 | if (!rgd) | 602 | if (!rgd) |
567 | return error; | 603 | return error; |
568 | 604 | ||
569 | mutex_init(&rgd->rd_mutex); | ||
570 | lops_init_le(&rgd->rd_le, &gfs2_rg_lops); | ||
571 | rgd->rd_sbd = sdp; | 605 | rgd->rd_sbd = sdp; |
606 | rgd->rd_addr = be64_to_cpu(buf.ri_addr); | ||
607 | rgd->rd_length = be32_to_cpu(buf.ri_length); | ||
608 | rgd->rd_data0 = be64_to_cpu(buf.ri_data0); | ||
609 | rgd->rd_data = be32_to_cpu(buf.ri_data); | ||
610 | rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); | ||
572 | 611 | ||
573 | list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); | ||
574 | list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); | ||
575 | |||
576 | gfs2_rindex_in(rgd, buf); | ||
577 | error = compute_bitstructs(rgd); | 612 | error = compute_bitstructs(rgd); |
578 | if (error) | 613 | if (error) |
579 | return error; | 614 | goto fail; |
580 | 615 | ||
581 | error = gfs2_glock_get(sdp, rgd->rd_addr, | 616 | error = gfs2_glock_get(sdp, rgd->rd_addr, |
582 | &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); | 617 | &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); |
583 | if (error) | 618 | if (error) |
584 | return error; | 619 | goto fail; |
585 | 620 | ||
586 | rgd->rd_gl->gl_object = rgd; | 621 | rgd->rd_gl->gl_object = rgd; |
587 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; | 622 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; |
623 | if (rgd->rd_data > sdp->sd_max_rg_data) | ||
624 | sdp->sd_max_rg_data = rgd->rd_data; | ||
625 | spin_lock(&sdp->sd_rindex_spin); | ||
626 | rgd_insert(rgd); | ||
627 | sdp->sd_rgrps++; | ||
628 | spin_unlock(&sdp->sd_rindex_spin); | ||
629 | return error; | ||
630 | |||
631 | fail: | ||
632 | kfree(rgd->rd_bits); | ||
633 | kmem_cache_free(gfs2_rgrpd_cachep, rgd); | ||
588 | return error; | 634 | return error; |
589 | } | 635 | } |
590 | 636 | ||
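read_rindex_entry() now reads straight into a struct gfs2_rindex on the stack, converts the big-endian fields in place, and signals a clean EOF by returning 1 once the offset reaches the inode size. A userspace sketch of the same fixed-size-record reader and its loop-until-EOF consumer; the layout is abbreviated to the fields used here (the real struct gfs2_rindex also carries padding and reserved bytes), and be64toh/be32toh are the glibc byte-order helpers:

#include <endian.h>     /* be64toh/be32toh (glibc) */
#include <stdint.h>
#include <stdio.h>

/* Abbreviated model of the on-disk rindex record (all big-endian). */
struct rindex_disk {
    uint64_t ri_addr;
    uint32_t ri_length;
    uint64_t ri_data0;
    uint32_t ri_data;
    uint32_t ri_bitbytes;
} __attribute__((packed));

/* Same contract as read_rindex_entry(): 0 = entry consumed,
 * 1 = clean EOF, negative = error. */
static int read_entry(FILE *f)
{
    struct rindex_disk buf;
    size_t n = fread(&buf, 1, sizeof(buf), f);

    if (n == 0 && feof(f))
        return 1;                   /* EOF on a record boundary */
    if (n != sizeof(buf))
        return -1;                  /* short or failed read */

    printf("rgrp at %llu: %u data blocks\n",
           (unsigned long long)be64toh(buf.ri_addr),
           (unsigned)be32toh(buf.ri_data));
    return 0;
}

/* Consumer loops exactly like gfs2_ri_update(): stop on EOF or error. */
static int read_all(FILE *f)
{
    int error;

    do {
        error = read_entry(f);
    } while (error == 0);
    return error < 0 ? error : 0;
}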
@@ -595,40 +641,28 @@ static int read_rindex_entry(struct gfs2_inode *ip, | |||
595 | * Returns: 0 on successful update, error code otherwise | 641 | * Returns: 0 on successful update, error code otherwise |
596 | */ | 642 | */ |
597 | 643 | ||
598 | int gfs2_ri_update(struct gfs2_inode *ip) | 644 | static int gfs2_ri_update(struct gfs2_inode *ip) |
599 | { | 645 | { |
600 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 646 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
601 | struct inode *inode = &ip->i_inode; | 647 | struct inode *inode = &ip->i_inode; |
602 | struct file_ra_state ra_state; | 648 | struct file_ra_state ra_state; |
603 | u64 rgrp_count = i_size_read(inode); | ||
604 | struct gfs2_rgrpd *rgd; | ||
605 | unsigned int max_data = 0; | ||
606 | int error; | 649 | int error; |
607 | 650 | ||
608 | do_div(rgrp_count, sizeof(struct gfs2_rindex)); | ||
609 | clear_rgrpdi(sdp); | ||
610 | |||
611 | file_ra_state_init(&ra_state, inode->i_mapping); | 651 | file_ra_state_init(&ra_state, inode->i_mapping); |
612 | for (sdp->sd_rgrps = 0; sdp->sd_rgrps < rgrp_count; sdp->sd_rgrps++) { | 652 | do { |
613 | error = read_rindex_entry(ip, &ra_state); | 653 | error = read_rindex_entry(ip, &ra_state); |
614 | if (error) { | 654 | } while (error == 0); |
615 | clear_rgrpdi(sdp); | 655 | |
616 | return error; | 656 | if (error < 0) |
617 | } | 657 | return error; |
618 | } | ||
619 | 658 | ||
620 | list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) | ||
621 | if (rgd->rd_data > max_data) | ||
622 | max_data = rgd->rd_data; | ||
623 | sdp->sd_max_rg_data = max_data; | ||
624 | sdp->sd_rindex_uptodate = 1; | 659 | sdp->sd_rindex_uptodate = 1; |
625 | return 0; | 660 | return 0; |
626 | } | 661 | } |
627 | 662 | ||
628 | /** | 663 | /** |
629 | * gfs2_rindex_hold - Grab a lock on the rindex | 664 | * gfs2_rindex_update - Update the rindex if required |
630 | * @sdp: The GFS2 superblock | 665 | * @sdp: The GFS2 superblock |
631 | * @ri_gh: the glock holder | ||
632 | * | 666 | * |
633 | * We grab a lock on the rindex inode to make sure that it doesn't | 667 | * We grab a lock on the rindex inode to make sure that it doesn't |
634 | * change whilst we are performing an operation. We keep this lock | 668 | * change whilst we are performing an operation. We keep this lock |
@@ -640,30 +674,29 @@ int gfs2_ri_update(struct gfs2_inode *ip) | |||
640 | * special file, which might have been updated if someone expanded the | 674 | * special file, which might have been updated if someone expanded the |
641 | * filesystem (via gfs2_grow utility), which adds new resource groups. | 675 | * filesystem (via gfs2_grow utility), which adds new resource groups. |
642 | * | 676 | * |
643 | * Returns: 0 on success, error code otherwise | 677 | * Returns: 0 on success, error code otherwise |
644 | */ | 678 | */ |
645 | 679 | ||
646 | int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) | 680 | int gfs2_rindex_update(struct gfs2_sbd *sdp) |
647 | { | 681 | { |
648 | struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); | 682 | struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); |
649 | struct gfs2_glock *gl = ip->i_gl; | 683 | struct gfs2_glock *gl = ip->i_gl; |
650 | int error; | 684 | struct gfs2_holder ri_gh; |
651 | 685 | int error = 0; | |
652 | error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); | ||
653 | if (error) | ||
654 | return error; | ||
655 | 686 | ||
656 | /* Read new copy from disk if we don't have the latest */ | 687 | /* Read new copy from disk if we don't have the latest */ |
657 | if (!sdp->sd_rindex_uptodate) { | 688 | if (!sdp->sd_rindex_uptodate) { |
658 | mutex_lock(&sdp->sd_rindex_mutex); | 689 | mutex_lock(&sdp->sd_rindex_mutex); |
659 | if (!sdp->sd_rindex_uptodate) { | 690 | error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); |
691 | if (error) | ||
692 | return error; | ||
693 | if (!sdp->sd_rindex_uptodate) | ||
660 | error = gfs2_ri_update(ip); | 694 | error = gfs2_ri_update(ip); |
661 | if (error) | 695 | gfs2_glock_dq_uninit(&ri_gh); |
662 | gfs2_glock_dq_uninit(ri_gh); | ||
663 | } | ||
664 | mutex_unlock(&sdp->sd_rindex_mutex); | 696 | mutex_unlock(&sdp->sd_rindex_mutex); |
665 | } | 697 | } |
666 | 698 | ||
699 | |||
667 | return error; | 700 | return error; |
668 | } | 701 | } |
669 | 702 | ||
@@ -694,7 +727,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) | |||
694 | } | 727 | } |
695 | 728 | ||
696 | /** | 729 | /** |
697 | * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps | 730 | * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps |
698 | * @rgd: the struct gfs2_rgrpd describing the RG to read in | 731 | * @gh: the glock holder for the resource group |
699 | * | 732 | * |
700 | * Read in all of a Resource Group's header and bitmap blocks. | 733 | * Read in all of a Resource Group's header and bitmap blocks. |
@@ -703,8 +736,9 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) | |||
703 | * Returns: errno | 736 | * Returns: errno |
704 | */ | 737 | */ |
705 | 738 | ||
706 | int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | 739 | int gfs2_rgrp_go_lock(struct gfs2_holder *gh) |
707 | { | 740 | { |
741 | struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; | ||
708 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 742 | struct gfs2_sbd *sdp = rgd->rd_sbd; |
709 | struct gfs2_glock *gl = rgd->rd_gl; | 743 | struct gfs2_glock *gl = rgd->rd_gl; |
710 | unsigned int length = rgd->rd_length; | 744 | unsigned int length = rgd->rd_length; |
@@ -712,17 +746,6 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | |||
712 | unsigned int x, y; | 746 | unsigned int x, y; |
713 | int error; | 747 | int error; |
714 | 748 | ||
715 | mutex_lock(&rgd->rd_mutex); | ||
716 | |||
717 | spin_lock(&sdp->sd_rindex_spin); | ||
718 | if (rgd->rd_bh_count) { | ||
719 | rgd->rd_bh_count++; | ||
720 | spin_unlock(&sdp->sd_rindex_spin); | ||
721 | mutex_unlock(&rgd->rd_mutex); | ||
722 | return 0; | ||
723 | } | ||
724 | spin_unlock(&sdp->sd_rindex_spin); | ||
725 | |||
726 | for (x = 0; x < length; x++) { | 749 | for (x = 0; x < length; x++) { |
727 | bi = rgd->rd_bits + x; | 750 | bi = rgd->rd_bits + x; |
728 | error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); | 751 | error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); |
@@ -747,15 +770,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | |||
747 | clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); | 770 | clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); |
748 | gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); | 771 | gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); |
749 | rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); | 772 | rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); |
773 | rgd->rd_free_clone = rgd->rd_free; | ||
750 | } | 774 | } |
751 | 775 | ||
752 | spin_lock(&sdp->sd_rindex_spin); | ||
753 | rgd->rd_free_clone = rgd->rd_free; | ||
754 | rgd->rd_bh_count++; | ||
755 | spin_unlock(&sdp->sd_rindex_spin); | ||
756 | |||
757 | mutex_unlock(&rgd->rd_mutex); | ||
758 | |||
759 | return 0; | 776 | return 0; |
760 | 777 | ||
761 | fail: | 778 | fail: |
@@ -765,52 +782,32 @@ fail: | |||
765 | bi->bi_bh = NULL; | 782 | bi->bi_bh = NULL; |
766 | gfs2_assert_warn(sdp, !bi->bi_clone); | 783 | gfs2_assert_warn(sdp, !bi->bi_clone); |
767 | } | 784 | } |
768 | mutex_unlock(&rgd->rd_mutex); | ||
769 | 785 | ||
770 | return error; | 786 | return error; |
771 | } | 787 | } |
772 | 788 | ||
773 | void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) | ||
774 | { | ||
775 | struct gfs2_sbd *sdp = rgd->rd_sbd; | ||
776 | |||
777 | spin_lock(&sdp->sd_rindex_spin); | ||
778 | gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); | ||
779 | rgd->rd_bh_count++; | ||
780 | spin_unlock(&sdp->sd_rindex_spin); | ||
781 | } | ||
782 | |||
783 | /** | 789 | /** |
784 | * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() | 790 | * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_go_lock() |
785 | * @rgd: the struct gfs2_rgrpd describing the RG to read in | 791 | * @gh: the glock holder for the resource group |
786 | * | 792 | * |
787 | */ | 793 | */ |
788 | 794 | ||
789 | void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) | 795 | void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) |
790 | { | 796 | { |
791 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 797 | struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; |
792 | int x, length = rgd->rd_length; | 798 | int x, length = rgd->rd_length; |
793 | 799 | ||
794 | spin_lock(&sdp->sd_rindex_spin); | ||
795 | gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); | ||
796 | if (--rgd->rd_bh_count) { | ||
797 | spin_unlock(&sdp->sd_rindex_spin); | ||
798 | return; | ||
799 | } | ||
800 | |||
801 | for (x = 0; x < length; x++) { | 800 | for (x = 0; x < length; x++) { |
802 | struct gfs2_bitmap *bi = rgd->rd_bits + x; | 801 | struct gfs2_bitmap *bi = rgd->rd_bits + x; |
803 | kfree(bi->bi_clone); | ||
804 | bi->bi_clone = NULL; | ||
805 | brelse(bi->bi_bh); | 802 | brelse(bi->bi_bh); |
806 | bi->bi_bh = NULL; | 803 | bi->bi_bh = NULL; |
807 | } | 804 | } |
808 | 805 | ||
809 | spin_unlock(&sdp->sd_rindex_spin); | ||
810 | } | 806 | } |
811 | 807 | ||
812 | static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | 808 | void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, |
813 | const struct gfs2_bitmap *bi) | 809 | struct buffer_head *bh, |
810 | const struct gfs2_bitmap *bi) | ||
814 | { | 811 | { |
815 | struct super_block *sb = sdp->sd_vfs; | 812 | struct super_block *sb = sdp->sd_vfs; |
816 | struct block_device *bdev = sb->s_bdev; | 813 | struct block_device *bdev = sb->s_bdev; |
@@ -823,7 +820,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | |||
823 | unsigned int x; | 820 | unsigned int x; |
824 | 821 | ||
825 | for (x = 0; x < bi->bi_len; x++) { | 822 | for (x = 0; x < bi->bi_len; x++) { |
826 | const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x; | 823 | const u8 *orig = bh->b_data + bi->bi_offset + x; |
827 | const u8 *clone = bi->bi_clone + bi->bi_offset + x; | 824 | const u8 *clone = bi->bi_clone + bi->bi_offset + x; |
828 | u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); | 825 | u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); |
829 | diff &= 0x55; | 826 | diff &= 0x55; |
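The bitmap arithmetic above is worth unpacking: GFS2 packs four blocks per bitmap byte at two bits each, with 0 meaning free, so (b | b >> 1) & 0x55 collapses each two-bit pair into a single in-use bit. ANDing "free in the on-disk bitmap" with "used in the clone" therefore flags exactly the blocks freed since the clone was taken, i.e. the candidates for a discard request. A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

/* Leaves the low bit of each 2-bit pair set iff the pair is non-zero,
 * i.e. the block is in some in-use state. */
static uint8_t in_use_mask(uint8_t b)
{
    return (b | (b >> 1)) & 0x55;
}

int main(void)
{
    uint8_t orig  = 0x00;   /* all four blocks now free on disk */
    uint8_t clone = 0x44;   /* pairs 1 and 3 were in use in the clone */

    /* free now AND used in the snapshot => freed since the clone */
    uint8_t diff = ~in_use_mask(orig) & in_use_mask(clone) & 0x55;

    printf("discard candidate mask: 0x%02x\n", diff);   /* prints 0x44 */
    return 0;
}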
@@ -862,28 +859,6 @@ fail: | |||
862 | sdp->sd_args.ar_discard = 0; | 859 | sdp->sd_args.ar_discard = 0; |
863 | } | 860 | } |
864 | 861 | ||
865 | void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) | ||
866 | { | ||
867 | struct gfs2_sbd *sdp = rgd->rd_sbd; | ||
868 | unsigned int length = rgd->rd_length; | ||
869 | unsigned int x; | ||
870 | |||
871 | for (x = 0; x < length; x++) { | ||
872 | struct gfs2_bitmap *bi = rgd->rd_bits + x; | ||
873 | if (!bi->bi_clone) | ||
874 | continue; | ||
875 | if (sdp->sd_args.ar_discard) | ||
876 | gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); | ||
877 | clear_bit(GBF_FULL, &bi->bi_flags); | ||
878 | memcpy(bi->bi_clone + bi->bi_offset, | ||
879 | bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); | ||
880 | } | ||
881 | |||
882 | spin_lock(&sdp->sd_rindex_spin); | ||
883 | rgd->rd_free_clone = rgd->rd_free; | ||
884 | spin_unlock(&sdp->sd_rindex_spin); | ||
885 | } | ||
886 | |||
887 | /** | 862 | /** |
888 | * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode | 863 | * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode |
889 | * @ip: the incore GFS2 inode structure | 864 | * @ip: the incore GFS2 inode structure |
@@ -893,38 +868,35 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) | |||
893 | 868 | ||
894 | struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) | 869 | struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) |
895 | { | 870 | { |
871 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
872 | int error; | ||
896 | BUG_ON(ip->i_alloc != NULL); | 873 | BUG_ON(ip->i_alloc != NULL); |
897 | ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); | 874 | ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); |
875 | error = gfs2_rindex_update(sdp); | ||
876 | if (error) | ||
877 | fs_warn(sdp, "rindex update returns %d\n", error); | ||
898 | return ip->i_alloc; | 878 | return ip->i_alloc; |
899 | } | 879 | } |
900 | 880 | ||
901 | /** | 881 | /** |
902 | * try_rgrp_fit - See if a given reservation will fit in a given RG | 882 | * try_rgrp_fit - See if a given reservation will fit in a given RG |
903 | * @rgd: the RG data | 883 | * @rgd: the RG data |
904 | * @al: the struct gfs2_alloc structure describing the reservation | 884 | * @ip: the inode |
905 | * | 885 | * |
906 | * If there's room for the requested blocks to be allocated from the RG: | 886 | * If there's room for the requested blocks to be allocated from the RG: |
907 | * Sets the $al_rgd field in @al. | ||
908 | * | 887 | * |
909 | * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) | 888 | * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) |
910 | */ | 889 | */ |
911 | 890 | ||
912 | static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) | 891 | static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) |
913 | { | 892 | { |
914 | struct gfs2_sbd *sdp = rgd->rd_sbd; | 893 | const struct gfs2_alloc *al = ip->i_alloc; |
915 | int ret = 0; | ||
916 | 894 | ||
917 | if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) | 895 | if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) |
918 | return 0; | 896 | return 0; |
919 | 897 | if (rgd->rd_free_clone >= al->al_requested) | |
920 | spin_lock(&sdp->sd_rindex_spin); | 898 | return 1; |
921 | if (rgd->rd_free_clone >= al->al_requested) { | 899 | return 0; |
922 | al->al_rgd = rgd; | ||
923 | ret = 1; | ||
924 | } | ||
925 | spin_unlock(&sdp->sd_rindex_spin); | ||
926 | |||
927 | return ret; | ||
928 | } | 900 | } |
929 | 901 | ||
930 | /** | 902 | /** |
@@ -992,76 +964,6 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip | |||
992 | } | 964 | } |
993 | 965 | ||
994 | /** | 966 | /** |
995 | * recent_rgrp_next - get next RG from "recent" list | ||
996 | * @cur_rgd: current rgrp | ||
997 | * | ||
998 | * Returns: The next rgrp in the recent list | ||
999 | */ | ||
1000 | |||
1001 | static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd) | ||
1002 | { | ||
1003 | struct gfs2_sbd *sdp = cur_rgd->rd_sbd; | ||
1004 | struct list_head *head; | ||
1005 | struct gfs2_rgrpd *rgd; | ||
1006 | |||
1007 | spin_lock(&sdp->sd_rindex_spin); | ||
1008 | head = &sdp->sd_rindex_mru_list; | ||
1009 | if (unlikely(cur_rgd->rd_list_mru.next == head)) { | ||
1010 | spin_unlock(&sdp->sd_rindex_spin); | ||
1011 | return NULL; | ||
1012 | } | ||
1013 | rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru); | ||
1014 | spin_unlock(&sdp->sd_rindex_spin); | ||
1015 | return rgd; | ||
1016 | } | ||
1017 | |||
1018 | /** | ||
1019 | * forward_rgrp_get - get an rgrp to try next from full list | ||
1020 | * @sdp: The GFS2 superblock | ||
1021 | * | ||
1022 | * Returns: The rgrp to try next | ||
1023 | */ | ||
1024 | |||
1025 | static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) | ||
1026 | { | ||
1027 | struct gfs2_rgrpd *rgd; | ||
1028 | unsigned int journals = gfs2_jindex_size(sdp); | ||
1029 | unsigned int rg = 0, x; | ||
1030 | |||
1031 | spin_lock(&sdp->sd_rindex_spin); | ||
1032 | |||
1033 | rgd = sdp->sd_rindex_forward; | ||
1034 | if (!rgd) { | ||
1035 | if (sdp->sd_rgrps >= journals) | ||
1036 | rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; | ||
1037 | |||
1038 | for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; | ||
1039 | x++, rgd = gfs2_rgrpd_get_next(rgd)) | ||
1040 | /* Do Nothing */; | ||
1041 | |||
1042 | sdp->sd_rindex_forward = rgd; | ||
1043 | } | ||
1044 | |||
1045 | spin_unlock(&sdp->sd_rindex_spin); | ||
1046 | |||
1047 | return rgd; | ||
1048 | } | ||
1049 | |||
1050 | /** | ||
1051 | * forward_rgrp_set - set the forward rgrp pointer | ||
1052 | * @sdp: the filesystem | ||
1053 | * @rgd: The new forward rgrp | ||
1054 | * | ||
1055 | */ | ||
1056 | |||
1057 | static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) | ||
1058 | { | ||
1059 | spin_lock(&sdp->sd_rindex_spin); | ||
1060 | sdp->sd_rindex_forward = rgd; | ||
1061 | spin_unlock(&sdp->sd_rindex_spin); | ||
1062 | } | ||
1063 | |||
1064 | /** | ||
1065 | * get_local_rgrp - Choose and lock a rgrp for allocation | 967 | * get_local_rgrp - Choose and lock a rgrp for allocation |
1066 | * @ip: the inode to reserve space for | 968 | * @ip: the inode to reserve space for |
1067 | * @rgp: the chosen and locked rgrp | 969 | * @rgp: the chosen and locked rgrp |
@@ -1076,14 +978,18 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
1076 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 978 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
1077 | struct gfs2_rgrpd *rgd, *begin = NULL; | 979 | struct gfs2_rgrpd *rgd, *begin = NULL; |
1078 | struct gfs2_alloc *al = ip->i_alloc; | 980 | struct gfs2_alloc *al = ip->i_alloc; |
1079 | int flags = LM_FLAG_TRY; | ||
1080 | int skipped = 0; | ||
1081 | int loops = 0; | ||
1082 | int error, rg_locked; | 981 | int error, rg_locked; |
982 | int loops = 0; | ||
983 | |||
984 | if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) | ||
985 | rgd = begin = ip->i_rgd; | ||
986 | else | ||
987 | rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); | ||
1083 | 988 | ||
1084 | rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); | 989 | if (rgd == NULL) |
990 | return -EBADSLT; | ||
1085 | 991 | ||
1086 | while (rgd) { | 992 | while (loops < 3) { |
1087 | rg_locked = 0; | 993 | rg_locked = 0; |
1088 | 994 | ||
1089 | if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { | 995 | if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { |
@@ -1095,92 +1001,36 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
1095 | } | 1001 | } |
1096 | switch (error) { | 1002 | switch (error) { |
1097 | case 0: | 1003 | case 0: |
1098 | if (try_rgrp_fit(rgd, al)) | 1004 | if (try_rgrp_fit(rgd, ip)) { |
1099 | goto out; | 1005 | ip->i_rgd = rgd; |
1006 | return 0; | ||
1007 | } | ||
1100 | if (rgd->rd_flags & GFS2_RDF_CHECK) | 1008 | if (rgd->rd_flags & GFS2_RDF_CHECK) |
1101 | try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); | 1009 | try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); |
1102 | if (!rg_locked) | 1010 | if (!rg_locked) |
1103 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1011 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1104 | /* fall through */ | 1012 | /* fall through */ |
1105 | case GLR_TRYFAILED: | 1013 | case GLR_TRYFAILED: |
1106 | rgd = recent_rgrp_next(rgd); | 1014 | rgd = gfs2_rgrpd_get_next(rgd); |
1107 | break; | 1015 | if (rgd == begin) |
1108 | 1016 | loops++; | |
1109 | default: | ||
1110 | return error; | ||
1111 | } | ||
1112 | } | ||
1113 | |||
1114 | /* Go through full list of rgrps */ | ||
1115 | |||
1116 | begin = rgd = forward_rgrp_get(sdp); | ||
1117 | |||
1118 | for (;;) { | ||
1119 | rg_locked = 0; | ||
1120 | |||
1121 | if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { | ||
1122 | rg_locked = 1; | ||
1123 | error = 0; | ||
1124 | } else { | ||
1125 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, | ||
1126 | &al->al_rgd_gh); | ||
1127 | } | ||
1128 | switch (error) { | ||
1129 | case 0: | ||
1130 | if (try_rgrp_fit(rgd, al)) | ||
1131 | goto out; | ||
1132 | if (rgd->rd_flags & GFS2_RDF_CHECK) | ||
1133 | try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); | ||
1134 | if (!rg_locked) | ||
1135 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | ||
1136 | break; | ||
1137 | |||
1138 | case GLR_TRYFAILED: | ||
1139 | skipped++; | ||
1140 | break; | 1017 | break; |
1141 | |||
1142 | default: | 1018 | default: |
1143 | return error; | 1019 | return error; |
1144 | } | 1020 | } |
1145 | |||
1146 | rgd = gfs2_rgrpd_get_next(rgd); | ||
1147 | if (!rgd) | ||
1148 | rgd = gfs2_rgrpd_get_first(sdp); | ||
1149 | |||
1150 | if (rgd == begin) { | ||
1151 | if (++loops >= 3) | ||
1152 | return -ENOSPC; | ||
1153 | if (!skipped) | ||
1154 | loops++; | ||
1155 | flags = 0; | ||
1156 | if (loops == 2) | ||
1157 | gfs2_log_flush(sdp, NULL); | ||
1158 | } | ||
1159 | } | 1021 | } |
1160 | 1022 | ||
1161 | out: | 1023 | return -ENOSPC; |
1162 | if (begin) { | ||
1163 | spin_lock(&sdp->sd_rindex_spin); | ||
1164 | list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); | ||
1165 | spin_unlock(&sdp->sd_rindex_spin); | ||
1166 | rgd = gfs2_rgrpd_get_next(rgd); | ||
1167 | if (!rgd) | ||
1168 | rgd = gfs2_rgrpd_get_first(sdp); | ||
1169 | forward_rgrp_set(sdp, rgd); | ||
1170 | } | ||
1171 | |||
1172 | return 0; | ||
1173 | } | 1024 | } |
1174 | 1025 | ||
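With the MRU list and the forward pointer gone, get_local_rgrp() above reduces to one cyclic scan: start from the cached rgrp (or the one derived from i_goal), step forward with wraparound, and return -ENOSPC after three full passes, with -EBADSLT reserved for the case where no rgrp covers the goal block at all. The same control flow over a plain array, where slots stand in for resource groups and the modulo step plays the role of the now-wrapping gfs2_rgrpd_get_next():

#include <errno.h>

/* Cyclic scan: give up once the cursor has passed `start' three times. */
static int pick_slot(const unsigned *free_blocks, unsigned nslots,
                     unsigned start, unsigned need)
{
    unsigned i = start;
    int loops = 0;

    while (loops < 3) {
        if (free_blocks[i] >= need)
            return (int)i;              /* this slot fits */
        i = (i + 1) % nslots;           /* wrap, like get_next() now does */
        if (i == start)
            loops++;
    }
    return -ENOSPC;
}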
1175 | /** | 1026 | /** |
1176 | * gfs2_inplace_reserve_i - Reserve space in the filesystem | 1027 | * gfs2_inplace_reserve - Reserve space in the filesystem |
1177 | * @ip: the inode to reserve space for | 1028 | * @ip: the inode to reserve space for |
1178 | * | 1029 | * |
1179 | * Returns: errno | 1030 | * Returns: errno |
1180 | */ | 1031 | */ |
1181 | 1032 | ||
1182 | int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, | 1033 | int gfs2_inplace_reserve(struct gfs2_inode *ip) |
1183 | char *file, unsigned int line) | ||
1184 | { | 1034 | { |
1185 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1035 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
1186 | struct gfs2_alloc *al = ip->i_alloc; | 1036 | struct gfs2_alloc *al = ip->i_alloc; |
@@ -1191,45 +1041,22 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, | |||
1191 | if (gfs2_assert_warn(sdp, al->al_requested)) | 1041 | if (gfs2_assert_warn(sdp, al->al_requested)) |
1192 | return -EINVAL; | 1042 | return -EINVAL; |
1193 | 1043 | ||
1194 | if (hold_rindex) { | ||
1195 | /* We need to hold the rindex unless the inode we're using is | ||
1196 | the rindex itself, in which case it's already held. */ | ||
1197 | if (ip != GFS2_I(sdp->sd_rindex)) | ||
1198 | error = gfs2_rindex_hold(sdp, &al->al_ri_gh); | ||
1199 | else if (!sdp->sd_rgrps) /* We may not have the rindex read | ||
1200 | in, so: */ | ||
1201 | error = gfs2_ri_update(ip); | ||
1202 | if (error) | ||
1203 | return error; | ||
1204 | } | ||
1205 | |||
1206 | try_again: | ||
1207 | do { | 1044 | do { |
1208 | error = get_local_rgrp(ip, &last_unlinked); | 1045 | error = get_local_rgrp(ip, &last_unlinked); |
1209 | /* If there is no space, flushing the log may release some */ | 1046 | if (error != -ENOSPC) |
1210 | if (error) { | 1047 | break; |
1211 | if (ip == GFS2_I(sdp->sd_rindex) && | 1048 | /* Check that fs hasn't grown if writing to rindex */ |
1212 | !sdp->sd_rindex_uptodate) { | 1049 | if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { |
1213 | error = gfs2_ri_update(ip); | 1050 | error = gfs2_ri_update(ip); |
1214 | if (error) | 1051 | if (error) |
1215 | return error; | 1052 | break; |
1216 | goto try_again; | 1053 | continue; |
1217 | } | ||
1218 | gfs2_log_flush(sdp, NULL); | ||
1219 | } | 1054 | } |
1220 | } while (error && tries++ < 3); | 1055 | /* Flushing the log may release space */ |
1221 | 1056 | gfs2_log_flush(sdp, NULL); | |
1222 | if (error) { | 1057 | } while (tries++ < 3); |
1223 | if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) | ||
1224 | gfs2_glock_dq_uninit(&al->al_ri_gh); | ||
1225 | return error; | ||
1226 | } | ||
1227 | |||
1228 | /* no error, so we have the rgrp set in the inode's allocation. */ | ||
1229 | al->al_file = file; | ||
1230 | al->al_line = line; | ||
1231 | 1058 | ||
1232 | return 0; | 1059 | return error; |
1233 | } | 1060 | } |
1234 | 1061 | ||
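gfs2_inplace_reserve() then wraps that scan in a bounded retry: -ENOSPC may only mean the free space is still tied up in the journal, so flushing the log and retrying can succeed where the first pass failed. A toy model of the loop; try_allocate() and flush_log() are invented stand-ins for get_local_rgrp() and gfs2_log_flush():

#include <errno.h>

static unsigned pinned_blocks = 64;     /* freed but held by the journal */
static unsigned available_blocks;

static int try_allocate(unsigned need)
{
    return available_blocks >= need ? 0 : -ENOSPC;
}

static void flush_log(void)
{
    available_blocks += pinned_blocks;  /* flushing releases pinned space */
    pinned_blocks = 0;
}

static int reserve(unsigned need)
{
    int tries = 0, error;

    do {
        error = try_allocate(need);
        if (error != -ENOSPC)
            break;
        flush_log();                    /* may turn ENOSPC into success */
    } while (tries++ < 3);

    return error;
}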
1235 | /** | 1062 | /** |
@@ -1241,20 +1068,10 @@ try_again: | |||
1241 | 1068 | ||
1242 | void gfs2_inplace_release(struct gfs2_inode *ip) | 1069 | void gfs2_inplace_release(struct gfs2_inode *ip) |
1243 | { | 1070 | { |
1244 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
1245 | struct gfs2_alloc *al = ip->i_alloc; | 1071 | struct gfs2_alloc *al = ip->i_alloc; |
1246 | 1072 | ||
1247 | if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) | ||
1248 | fs_warn(sdp, "al_alloced = %u, al_requested = %u " | ||
1249 | "al_file = %s, al_line = %u\n", | ||
1250 | al->al_alloced, al->al_requested, al->al_file, | ||
1251 | al->al_line); | ||
1252 | |||
1253 | al->al_rgd = NULL; | ||
1254 | if (al->al_rgd_gh.gh_gl) | 1073 | if (al->al_rgd_gh.gh_gl) |
1255 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1074 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1256 | if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl) | ||
1257 | gfs2_glock_dq_uninit(&al->al_ri_gh); | ||
1258 | } | 1075 | } |
1259 | 1076 | ||
1260 | /** | 1077 | /** |
@@ -1352,6 +1169,7 @@ do_search: | |||
1352 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone | 1169 | /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone |
1353 | bitmaps, so we must search the originals for that. */ | 1170 | bitmaps, so we must search the originals for that. */ |
1354 | buffer = bi->bi_bh->b_data + bi->bi_offset; | 1171 | buffer = bi->bi_bh->b_data + bi->bi_offset; |
1172 | WARN_ON(!buffer_uptodate(bi->bi_bh)); | ||
1355 | if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) | 1173 | if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) |
1356 | buffer = bi->bi_clone + bi->bi_offset; | 1174 | buffer = bi->bi_clone + bi->bi_offset; |
1357 | 1175 | ||
@@ -1371,6 +1189,7 @@ skip: | |||
1371 | 1189 | ||
1372 | if (blk == BFITNOENT) | 1190 | if (blk == BFITNOENT) |
1373 | return blk; | 1191 | return blk; |
1192 | |||
1374 | *n = 1; | 1193 | *n = 1; |
1375 | if (old_state == new_state) | 1194 | if (old_state == new_state) |
1376 | goto out; | 1195 | goto out; |
@@ -1503,7 +1322,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) | |||
1503 | if (al == NULL) | 1322 | if (al == NULL) |
1504 | return -ECANCELED; | 1323 | return -ECANCELED; |
1505 | 1324 | ||
1506 | rgd = al->al_rgd; | 1325 | rgd = ip->i_rgd; |
1507 | 1326 | ||
1508 | if (rgrp_contains_block(rgd, ip->i_goal)) | 1327 | if (rgrp_contains_block(rgd, ip->i_goal)) |
1509 | goal = ip->i_goal - rgd->rd_data0; | 1328 | goal = ip->i_goal - rgd->rd_data0; |
@@ -1518,7 +1337,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) | |||
1518 | 1337 | ||
1519 | rgd->rd_last_alloc = blk; | 1338 | rgd->rd_last_alloc = blk; |
1520 | block = rgd->rd_data0 + blk; | 1339 | block = rgd->rd_data0 + blk; |
1521 | ip->i_goal = block; | 1340 | ip->i_goal = block + *n - 1; |
1522 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1341 | error = gfs2_meta_inode_buffer(ip, &dibh); |
1523 | if (error == 0) { | 1342 | if (error == 0) { |
1524 | struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; | 1343 | struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; |
@@ -1539,9 +1358,7 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) | |||
1539 | gfs2_statfs_change(sdp, 0, -(s64)*n, 0); | 1358 | gfs2_statfs_change(sdp, 0, -(s64)*n, 0); |
1540 | gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); | 1359 | gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid); |
1541 | 1360 | ||
1542 | spin_lock(&sdp->sd_rindex_spin); | ||
1543 | rgd->rd_free_clone -= *n; | 1361 | rgd->rd_free_clone -= *n; |
1544 | spin_unlock(&sdp->sd_rindex_spin); | ||
1545 | trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); | 1362 | trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED); |
1546 | *bn = block; | 1363 | *bn = block; |
1547 | return 0; | 1364 | return 0; |
@@ -1564,7 +1381,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) | |||
1564 | { | 1381 | { |
1565 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 1382 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
1566 | struct gfs2_alloc *al = dip->i_alloc; | 1383 | struct gfs2_alloc *al = dip->i_alloc; |
1567 | struct gfs2_rgrpd *rgd = al->al_rgd; | 1384 | struct gfs2_rgrpd *rgd = dip->i_rgd; |
1568 | u32 blk; | 1385 | u32 blk; |
1569 | u64 block; | 1386 | u64 block; |
1570 | unsigned int n = 1; | 1387 | unsigned int n = 1; |
@@ -1594,9 +1411,7 @@ int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) | |||
1594 | gfs2_statfs_change(sdp, 0, -1, +1); | 1411 | gfs2_statfs_change(sdp, 0, -1, +1); |
1595 | gfs2_trans_add_unrevoke(sdp, block, 1); | 1412 | gfs2_trans_add_unrevoke(sdp, block, 1); |
1596 | 1413 | ||
1597 | spin_lock(&sdp->sd_rindex_spin); | ||
1598 | rgd->rd_free_clone--; | 1414 | rgd->rd_free_clone--; |
1599 | spin_unlock(&sdp->sd_rindex_spin); | ||
1600 | trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); | 1415 | trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); |
1601 | *bn = block; | 1416 | *bn = block; |
1602 | return 0; | 1417 | return 0; |
@@ -1629,8 +1444,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) | |||
1629 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1444 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1630 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1445 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
1631 | 1446 | ||
1632 | gfs2_trans_add_rg(rgd); | ||
1633 | |||
1634 | /* Directories keep their data in the metadata address space */ | 1447 | /* Directories keep their data in the metadata address space */ |
1635 | if (meta || ip->i_depth) | 1448 | if (meta || ip->i_depth) |
1636 | gfs2_meta_wipe(ip, bstart, blen); | 1449 | gfs2_meta_wipe(ip, bstart, blen); |
@@ -1666,7 +1479,6 @@ void gfs2_unlink_di(struct inode *inode) | |||
1666 | trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); | 1479 | trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); |
1667 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1480 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1668 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1481 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
1669 | gfs2_trans_add_rg(rgd); | ||
1670 | } | 1482 | } |
1671 | 1483 | ||
1672 | static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) | 1484 | static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) |
@@ -1688,7 +1500,6 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) | |||
1688 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1500 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
1689 | 1501 | ||
1690 | gfs2_statfs_change(sdp, 0, +1, -1); | 1502 | gfs2_statfs_change(sdp, 0, +1, -1); |
1691 | gfs2_trans_add_rg(rgd); | ||
1692 | } | 1503 | } |
1693 | 1504 | ||
1694 | 1505 | ||
@@ -1714,41 +1525,33 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) | |||
1714 | int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) | 1525 | int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) |
1715 | { | 1526 | { |
1716 | struct gfs2_rgrpd *rgd; | 1527 | struct gfs2_rgrpd *rgd; |
1717 | struct gfs2_holder ri_gh, rgd_gh; | 1528 | struct gfs2_holder rgd_gh; |
1718 | struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); | ||
1719 | int ri_locked = 0; | ||
1720 | int error; | 1529 | int error; |
1721 | 1530 | ||
1722 | if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { | 1531 | error = gfs2_rindex_update(sdp); |
1723 | error = gfs2_rindex_hold(sdp, &ri_gh); | 1532 | if (error) |
1724 | if (error) | 1533 | return error; |
1725 | goto fail; | ||
1726 | ri_locked = 1; | ||
1727 | } | ||
1728 | 1534 | ||
1729 | error = -EINVAL; | 1535 | error = -EINVAL; |
1730 | rgd = gfs2_blk2rgrpd(sdp, no_addr); | 1536 | rgd = gfs2_blk2rgrpd(sdp, no_addr); |
1731 | if (!rgd) | 1537 | if (!rgd) |
1732 | goto fail_rindex; | 1538 | goto fail; |
1733 | 1539 | ||
1734 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); | 1540 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); |
1735 | if (error) | 1541 | if (error) |
1736 | goto fail_rindex; | 1542 | goto fail; |
1737 | 1543 | ||
1738 | if (gfs2_get_block_type(rgd, no_addr) != type) | 1544 | if (gfs2_get_block_type(rgd, no_addr) != type) |
1739 | error = -ESTALE; | 1545 | error = -ESTALE; |
1740 | 1546 | ||
1741 | gfs2_glock_dq_uninit(&rgd_gh); | 1547 | gfs2_glock_dq_uninit(&rgd_gh); |
1742 | fail_rindex: | ||
1743 | if (ri_locked) | ||
1744 | gfs2_glock_dq_uninit(&ri_gh); | ||
1745 | fail: | 1548 | fail: |
1746 | return error; | 1549 | return error; |
1747 | } | 1550 | } |
1748 | 1551 | ||
1749 | /** | 1552 | /** |
1750 | * gfs2_rlist_add - add a RG to a list of RGs | 1553 | * gfs2_rlist_add - add a RG to a list of RGs |
1751 | * @sdp: the filesystem | 1554 | * @ip: the inode |
1752 | * @rlist: the list of resource groups | 1555 | * @rlist: the list of resource groups |
1753 | * @block: the block | 1556 | * @block: the block |
1754 | * | 1557 | * |
@@ -1758,9 +1561,10 @@ fail: | |||
1758 | * | 1561 | * |
1759 | */ | 1562 | */ |
1760 | 1563 | ||
1761 | void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | 1564 | void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, |
1762 | u64 block) | 1565 | u64 block) |
1763 | { | 1566 | { |
1567 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
1764 | struct gfs2_rgrpd *rgd; | 1568 | struct gfs2_rgrpd *rgd; |
1765 | struct gfs2_rgrpd **tmp; | 1569 | struct gfs2_rgrpd **tmp; |
1766 | unsigned int new_space; | 1570 | unsigned int new_space; |
@@ -1769,12 +1573,15 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | |||
1769 | if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) | 1573 | if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) |
1770 | return; | 1574 | return; |
1771 | 1575 | ||
1772 | rgd = gfs2_blk2rgrpd(sdp, block); | 1576 | if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) |
1577 | rgd = ip->i_rgd; | ||
1578 | else | ||
1579 | rgd = gfs2_blk2rgrpd(sdp, block); | ||
1773 | if (!rgd) { | 1580 | if (!rgd) { |
1774 | if (gfs2_consist(sdp)) | 1581 | fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); |
1775 | fs_err(sdp, "block = %llu\n", (unsigned long long)block); | ||
1776 | return; | 1582 | return; |
1777 | } | 1583 | } |
1584 | ip->i_rgd = rgd; | ||
1778 | 1585 | ||
1779 | for (x = 0; x < rlist->rl_rgrps; x++) | 1586 | for (x = 0; x < rlist->rl_rgrps; x++) |
1780 | if (rlist->rl_rgd[x] == rgd) | 1587 | if (rlist->rl_rgd[x] == rgd) |
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index d253f9a8c70e..cf5c50180192 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,18 +18,15 @@ struct gfs2_holder; | |||
18 | 18 | ||
19 | extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); | 19 | extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); |
20 | 20 | ||
21 | struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); | 21 | extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); |
22 | struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); | 22 | extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); |
23 | struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); | 23 | extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); |
24 | 24 | ||
25 | extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); | 25 | extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); |
26 | extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); | 26 | extern int gfs2_rindex_update(struct gfs2_sbd *sdp); |
27 | 27 | extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); | |
28 | extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); | 28 | extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); |
29 | extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); | 29 | extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); |
30 | extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); | ||
31 | |||
32 | extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); | ||
33 | 30 | ||
34 | extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); | 31 | extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); |
35 | static inline void gfs2_alloc_put(struct gfs2_inode *ip) | 32 | static inline void gfs2_alloc_put(struct gfs2_inode *ip) |
@@ -39,16 +36,9 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) | |||
39 | ip->i_alloc = NULL; | 36 | ip->i_alloc = NULL; |
40 | } | 37 | } |
41 | 38 | ||
42 | extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, | 39 | extern int gfs2_inplace_reserve(struct gfs2_inode *ip); |
43 | char *file, unsigned int line); | ||
44 | #define gfs2_inplace_reserve(ip) \ | ||
45 | gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__) | ||
46 | #define gfs2_inplace_reserve_ri(ip) \ | ||
47 | gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__) | ||
48 | |||
49 | extern void gfs2_inplace_release(struct gfs2_inode *ip); | 40 | extern void gfs2_inplace_release(struct gfs2_inode *ip); |
50 | 41 | ||
51 | extern int gfs2_ri_update(struct gfs2_inode *ip); | ||
52 | extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); | 42 | extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); |
53 | extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); | 43 | extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); |
54 | 44 | ||
@@ -66,11 +56,14 @@ struct gfs2_rgrp_list { | |||
66 | struct gfs2_holder *rl_ghs; | 56 | struct gfs2_holder *rl_ghs; |
67 | }; | 57 | }; |
68 | 58 | ||
69 | extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, | 59 | extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, |
70 | u64 block); | 60 | u64 block); |
71 | extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); | 61 | extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); |
72 | extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); | 62 | extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); |
73 | extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); | 63 | extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); |
74 | extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); | 64 | extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); |
65 | extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | ||
66 | struct buffer_head *bh, | ||
67 | const struct gfs2_bitmap *bi); | ||
75 | 68 | ||
76 | #endif /* __RGRP_DOT_H__ */ | 69 | #endif /* __RGRP_DOT_H__ */ |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b7beadd9ba4c..71e420989f77 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -752,51 +752,77 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
752 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 752 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
753 | struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); | 753 | struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); |
754 | struct backing_dev_info *bdi = metamapping->backing_dev_info; | 754 | struct backing_dev_info *bdi = metamapping->backing_dev_info; |
755 | struct gfs2_holder gh; | 755 | int ret = 0; |
756 | |||
757 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
758 | gfs2_log_flush(GFS2_SB(inode), ip->i_gl); | ||
759 | if (bdi->dirty_exceeded) | ||
760 | gfs2_ail1_flush(sdp, wbc); | ||
761 | else | ||
762 | filemap_fdatawrite(metamapping); | ||
763 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
764 | ret = filemap_fdatawait(metamapping); | ||
765 | if (ret) | ||
766 | mark_inode_dirty_sync(inode); | ||
767 | return ret; | ||
768 | } | ||
769 | |||
770 | /** | ||
771 | * gfs2_dirty_inode - check for atime updates | ||
772 | * @inode: The inode in question | ||
773 | * @flags: The type of dirty | ||
774 | * | ||
775 | * Unfortunately it can be called under any combination of inode | ||
776 | * glock and transaction lock, so we have to check carefully. | ||
777 | * | ||
778 | * At the moment this deals only with atime - it should be possible | ||
779 | * to expand that role in future, once a review of the locking has | ||
780 | * been carried out. | ||
781 | */ | ||
782 | |||
783 | static void gfs2_dirty_inode(struct inode *inode, int flags) | ||
784 | { | ||
785 | struct gfs2_inode *ip = GFS2_I(inode); | ||
786 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
756 | struct buffer_head *bh; | 787 | struct buffer_head *bh; |
757 | struct timespec atime; | 788 | struct gfs2_holder gh; |
758 | struct gfs2_dinode *di; | 789 | int need_unlock = 0; |
759 | int ret = -EAGAIN; | 790 | int need_endtrans = 0; |
760 | int unlock_required = 0; | 791 | int ret; |
761 | 792 | ||
762 | /* Skip timestamp update, if this is from a memalloc */ | 793 | if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) |
763 | if (current->flags & PF_MEMALLOC) | 794 | return; |
764 | goto do_flush; | 795 | |
765 | if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { | 796 | if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { |
766 | ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | 797 | ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); |
767 | if (ret) | 798 | if (ret) { |
768 | goto do_flush; | 799 | fs_err(sdp, "dirty_inode: glock %d\n", ret); |
769 | unlock_required = 1; | 800 | return; |
801 | } | ||
802 | need_unlock = 1; | ||
770 | } | 803 | } |
771 | ret = gfs2_trans_begin(sdp, RES_DINODE, 0); | 804 | |
772 | if (ret) | 805 | if (current->journal_info == NULL) { |
773 | goto do_unlock; | 806 | ret = gfs2_trans_begin(sdp, RES_DINODE, 0); |
807 | if (ret) { | ||
808 | fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); | ||
809 | goto out; | ||
810 | } | ||
811 | need_endtrans = 1; | ||
812 | } | ||
813 | |||
774 | ret = gfs2_meta_inode_buffer(ip, &bh); | 814 | ret = gfs2_meta_inode_buffer(ip, &bh); |
775 | if (ret == 0) { | 815 | if (ret == 0) { |
776 | di = (struct gfs2_dinode *)bh->b_data; | 816 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
777 | atime.tv_sec = be64_to_cpu(di->di_atime); | 817 | gfs2_dinode_out(ip, bh->b_data); |
778 | atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); | ||
779 | if (timespec_compare(&inode->i_atime, &atime) > 0) { | ||
780 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
781 | gfs2_dinode_out(ip, bh->b_data); | ||
782 | } | ||
783 | brelse(bh); | 818 | brelse(bh); |
784 | } | 819 | } |
785 | gfs2_trans_end(sdp); | 820 | |
786 | do_unlock: | 821 | if (need_endtrans) |
787 | if (unlock_required) | 822 | gfs2_trans_end(sdp); |
823 | out: | ||
824 | if (need_unlock) | ||
788 | gfs2_glock_dq_uninit(&gh); | 825 | gfs2_glock_dq_uninit(&gh); |
789 | do_flush: | ||
790 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
791 | gfs2_log_flush(GFS2_SB(inode), ip->i_gl); | ||
792 | filemap_fdatawrite(metamapping); | ||
793 | if (bdi->dirty_exceeded) | ||
794 | gfs2_ail1_flush(sdp, wbc); | ||
795 | if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) | ||
796 | ret = filemap_fdatawait(metamapping); | ||
797 | if (ret) | ||
798 | mark_inode_dirty_sync(inode); | ||
799 | return ret; | ||
800 | } | 826 | } |
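The new gfs2_dirty_inode() must work whether or not the caller already holds the inode glock or an open transaction, so it acquires each conditionally, records what it took, and unwinds only that on exit. A skeleton of the pattern, with trivially stubbed state standing in for glocks and the current->journal_info test:

#include <stdbool.h>

struct ctx { bool locked, in_trans; };

static bool lock_held(struct ctx *c)      { return c->locked; }
static void lock_take(struct ctx *c)      { c->locked = true; }
static void lock_drop(struct ctx *c)      { c->locked = false; }
static bool in_transaction(struct ctx *c) { return c->in_trans; }
static void begin_trans(struct ctx *c)    { c->in_trans = true; }
static void end_trans(struct ctx *c)      { c->in_trans = false; }
static void write_dinode(struct ctx *c)   { (void)c; /* the real work */ }

static void dirty_inode(struct ctx *c)
{
    bool need_unlock = false, need_endtrans = false;

    if (!lock_held(c)) {
        lock_take(c);
        need_unlock = true;
    }
    if (!in_transaction(c)) {
        begin_trans(c);
        need_endtrans = true;
    }

    write_dinode(c);

    if (need_endtrans)                  /* release only what we took */
        end_trans(c);
    if (need_unlock)
        lock_drop(c);
}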
801 | 827 | ||
802 | /** | 828 | /** |
@@ -1011,7 +1037,6 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd, | |||
1011 | 1037 | ||
1012 | static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | 1038 | static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) |
1013 | { | 1039 | { |
1014 | struct gfs2_holder ri_gh; | ||
1015 | struct gfs2_rgrpd *rgd_next; | 1040 | struct gfs2_rgrpd *rgd_next; |
1016 | struct gfs2_holder *gha, *gh; | 1041 | struct gfs2_holder *gha, *gh; |
1017 | unsigned int slots = 64; | 1042 | unsigned int slots = 64; |
@@ -1024,10 +1049,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host | |||
1024 | if (!gha) | 1049 | if (!gha) |
1025 | return -ENOMEM; | 1050 | return -ENOMEM; |
1026 | 1051 | ||
1027 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
1028 | if (error) | ||
1029 | goto out; | ||
1030 | |||
1031 | rgd_next = gfs2_rgrpd_get_first(sdp); | 1052 | rgd_next = gfs2_rgrpd_get_first(sdp); |
1032 | 1053 | ||
1033 | for (;;) { | 1054 | for (;;) { |
@@ -1070,9 +1091,6 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host | |||
1070 | yield(); | 1091 | yield(); |
1071 | } | 1092 | } |
1072 | 1093 | ||
1073 | gfs2_glock_dq_uninit(&ri_gh); | ||
1074 | |||
1075 | out: | ||
1076 | kfree(gha); | 1094 | kfree(gha); |
1077 | return error; | 1095 | return error; |
1078 | } | 1096 | } |
@@ -1124,6 +1142,10 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1124 | struct gfs2_statfs_change_host sc; | 1142 | struct gfs2_statfs_change_host sc; |
1125 | int error; | 1143 | int error; |
1126 | 1144 | ||
1145 | error = gfs2_rindex_update(sdp); | ||
1146 | if (error) | ||
1147 | return error; | ||
1148 | |||
1127 | if (gfs2_tune_get(sdp, gt_statfs_slow)) | 1149 | if (gfs2_tune_get(sdp, gt_statfs_slow)) |
1128 | error = gfs2_statfs_slow(sdp, &sc); | 1150 | error = gfs2_statfs_slow(sdp, &sc); |
1129 | else | 1151 | else |
@@ -1394,21 +1416,17 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) | |||
1394 | if (error) | 1416 | if (error) |
1395 | goto out; | 1417 | goto out; |
1396 | 1418 | ||
1397 | error = gfs2_rindex_hold(sdp, &al->al_ri_gh); | ||
1398 | if (error) | ||
1399 | goto out_qs; | ||
1400 | |||
1401 | rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); | 1419 | rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); |
1402 | if (!rgd) { | 1420 | if (!rgd) { |
1403 | gfs2_consist_inode(ip); | 1421 | gfs2_consist_inode(ip); |
1404 | error = -EIO; | 1422 | error = -EIO; |
1405 | goto out_rindex_relse; | 1423 | goto out_qs; |
1406 | } | 1424 | } |
1407 | 1425 | ||
1408 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, | 1426 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, |
1409 | &al->al_rgd_gh); | 1427 | &al->al_rgd_gh); |
1410 | if (error) | 1428 | if (error) |
1411 | goto out_rindex_relse; | 1429 | goto out_qs; |
1412 | 1430 | ||
1413 | error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, | 1431 | error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, |
1414 | sdp->sd_jdesc->jd_blocks); | 1432 | sdp->sd_jdesc->jd_blocks); |
@@ -1423,8 +1441,6 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) | |||
1423 | 1441 | ||
1424 | out_rg_gunlock: | 1442 | out_rg_gunlock: |
1425 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1443 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1426 | out_rindex_relse: | ||
1427 | gfs2_glock_dq_uninit(&al->al_ri_gh); | ||
1428 | out_qs: | 1444 | out_qs: |
1429 | gfs2_quota_unhold(ip); | 1445 | gfs2_quota_unhold(ip); |
1430 | out: | 1446 | out: |
@@ -1471,9 +1487,11 @@ static void gfs2_evict_inode(struct inode *inode) | |||
1471 | goto out; | 1487 | goto out; |
1472 | } | 1488 | } |
1473 | 1489 | ||
1474 | error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); | 1490 | if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { |
1475 | if (error) | 1491 | error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); |
1476 | goto out_truncate; | 1492 | if (error) |
1493 | goto out_truncate; | ||
1494 | } | ||
1477 | 1495 | ||
1478 | if (test_bit(GIF_INVALID, &ip->i_flags)) { | 1496 | if (test_bit(GIF_INVALID, &ip->i_flags)) { |
1479 | error = gfs2_inode_refresh(ip); | 1497 | error = gfs2_inode_refresh(ip); |
@@ -1513,6 +1531,10 @@ static void gfs2_evict_inode(struct inode *inode) | |||
1513 | goto out_unlock; | 1531 | goto out_unlock; |
1514 | 1532 | ||
1515 | out_truncate: | 1533 | out_truncate: |
1534 | gfs2_log_flush(sdp, ip->i_gl); | ||
1535 | write_inode_now(inode, 1); | ||
1536 | gfs2_ail_flush(ip->i_gl, 0); | ||
1537 | |||
1516 | /* Case 2 starts here */ | 1538 | /* Case 2 starts here */ |
1517 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | 1539 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); |
1518 | if (error) | 1540 | if (error) |
@@ -1552,6 +1574,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) | |||
1552 | if (ip) { | 1574 | if (ip) { |
1553 | ip->i_flags = 0; | 1575 | ip->i_flags = 0; |
1554 | ip->i_gl = NULL; | 1576 | ip->i_gl = NULL; |
1577 | ip->i_rgd = NULL; | ||
1555 | } | 1578 | } |
1556 | return &ip->i_inode; | 1579 | return &ip->i_inode; |
1557 | } | 1580 | } |
@@ -1572,6 +1595,7 @@ const struct super_operations gfs2_super_ops = { | |||
1572 | .alloc_inode = gfs2_alloc_inode, | 1595 | .alloc_inode = gfs2_alloc_inode, |
1573 | .destroy_inode = gfs2_destroy_inode, | 1596 | .destroy_inode = gfs2_destroy_inode, |
1574 | .write_inode = gfs2_write_inode, | 1597 | .write_inode = gfs2_write_inode, |
1598 | .dirty_inode = gfs2_dirty_inode, | ||
1575 | .evict_inode = gfs2_evict_inode, | 1599 | .evict_inode = gfs2_evict_inode, |
1576 | .put_super = gfs2_put_super, | 1600 | .put_super = gfs2_put_super, |
1577 | .sync_fs = gfs2_sync_fs, | 1601 | .sync_fs = gfs2_sync_fs, |
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 9ec73a854111..86ac75d99d31 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -185,8 +185,3 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) | |||
185 | gfs2_log_unlock(sdp); | 185 | gfs2_log_unlock(sdp); |
186 | } | 186 | } |
187 | 187 | ||
188 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) | ||
189 | { | ||
190 | lops_add(rgd->rd_sbd, &rgd->rd_le); | ||
191 | } | ||
192 | |||
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index fb56b783e028..f8f101ef600c 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -28,20 +28,20 @@ struct gfs2_glock; | |||
28 | 28 | ||
29 | /* reserve either the number of blocks to be allocated plus the rg header | 29 | /* reserve either the number of blocks to be allocated plus the rg header |
30 | * block, or all of the blocks in the rg, whichever is smaller */ | 30 | * block, or all of the blocks in the rg, whichever is smaller */ |
31 | static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al) | 31 | static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip) |
32 | { | 32 | { |
33 | return (al->al_requested < al->al_rgd->rd_length)? | 33 | const struct gfs2_alloc *al = ip->i_alloc; |
34 | al->al_requested + 1 : al->al_rgd->rd_length; | 34 | if (al->al_requested < ip->i_rgd->rd_length) |
35 | return al->al_requested + 1; | ||
36 | return ip->i_rgd->rd_length; | ||
35 | } | 37 | } |
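The reworked gfs2_rg_blocks() above reads as: reserve the requested blocks plus one for the rgrp header, capped at the rgrp's total length. For example, requesting 4 blocks against a 10-block rgrp reserves 5, while requesting 20 caps out at 10. As a standalone helper:

/* min(requested + 1 header block, whole rgrp), as in gfs2_rg_blocks(). */
static inline unsigned int rg_blocks(unsigned int requested,
                                     unsigned int rd_length)
{
    return requested < rd_length ? requested + 1 : rd_length;
}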
36 | 38 | ||
37 | int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, | 39 | extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, |
38 | unsigned int revokes); | 40 | unsigned int revokes); |
39 | 41 | ||
40 | void gfs2_trans_end(struct gfs2_sbd *sdp); | 42 | extern void gfs2_trans_end(struct gfs2_sbd *sdp); |
41 | 43 | extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); | |
42 | void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); | 44 | extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); |
43 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); | 45 | extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); |
44 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); | ||
45 | void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); | ||
46 | 46 | ||
47 | #endif /* __TRANS_DOT_H__ */ | 47 | #endif /* __TRANS_DOT_H__ */ |
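For readers tracing the reworked gfs2_rg_blocks() above, a standalone sketch of the reservation math, with the gfs2 structs reduced to the two fields the helper actually reads; the numbers are illustrative, not from any real resource group:

	/* Standalone sketch of the reservation calculation above. */
	#include <stdio.h>

	struct rgrpd_sketch { unsigned int rd_length; };
	struct alloc_sketch { unsigned int al_requested; };
	struct inode_sketch {
		const struct alloc_sketch *i_alloc;
		const struct rgrpd_sketch *i_rgd;
	};

	static unsigned int rg_blocks_sketch(const struct inode_sketch *ip)
	{
		const struct alloc_sketch *al = ip->i_alloc;

		/* requested blocks plus one rg header block, capped at rg size */
		if (al->al_requested < ip->i_rgd->rd_length)
			return al->al_requested + 1;
		return ip->i_rgd->rd_length;
	}

	int main(void)
	{
		struct rgrpd_sketch rgd = { .rd_length = 512 };
		struct alloc_sketch al = { .al_requested = 8 };
		struct inode_sketch ip = { .i_alloc = &al, .i_rgd = &rgd };

		printf("%u\n", rg_blocks_sketch(&ip));	/* 9: 8 blocks + header */
		al.al_requested = 4096;
		printf("%u\n", rg_blocks_sketch(&ip));	/* 512: capped at rg length */
		return 0;
	}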
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 439b61c03262..71d7bf830c09 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c | |||
@@ -332,15 +332,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, | |||
332 | if (error) | 332 | if (error) |
333 | goto out_alloc; | 333 | goto out_alloc; |
334 | 334 | ||
335 | error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); | ||
336 | if (error) | ||
337 | goto out_quota; | ||
338 | |||
339 | error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); | 335 | error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); |
340 | 336 | ||
341 | gfs2_glock_dq_uninit(&al->al_ri_gh); | ||
342 | |||
343 | out_quota: | ||
344 | gfs2_quota_unhold(ip); | 337 | gfs2_quota_unhold(ip); |
345 | out_alloc: | 338 | out_alloc: |
346 | gfs2_alloc_put(ip); | 339 | gfs2_alloc_put(ip); |
@@ -734,7 +727,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
734 | goto out_gunlock_q; | 727 | goto out_gunlock_q; |
735 | 728 | ||
736 | error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), | 729 | error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), |
737 | blks + gfs2_rg_blocks(al) + | 730 | blks + gfs2_rg_blocks(ip) + |
738 | RES_DINODE + RES_STATFS + RES_QUOTA, 0); | 731 | RES_DINODE + RES_STATFS + RES_QUOTA, 0); |
739 | if (error) | 732 | if (error) |
740 | goto out_ipres; | 733 | goto out_ipres; |
@@ -1296,7 +1289,8 @@ fail: | |||
1296 | 1289 | ||
1297 | int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) | 1290 | int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) |
1298 | { | 1291 | { |
1299 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1292 | struct inode *inode = &ip->i_inode; |
1293 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
1300 | struct gfs2_ea_location el; | 1294 | struct gfs2_ea_location el; |
1301 | int error; | 1295 | int error; |
1302 | 1296 | ||
@@ -1319,7 +1313,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) | |||
1319 | if (error) | 1313 | if (error) |
1320 | return error; | 1314 | return error; |
1321 | 1315 | ||
1322 | error = gfs2_setattr_simple(ip, attr); | 1316 | error = gfs2_setattr_simple(inode, attr); |
1323 | gfs2_trans_end(sdp); | 1317 | gfs2_trans_end(sdp); |
1324 | return error; | 1318 | return error; |
1325 | } | 1319 | } |
@@ -1362,14 +1356,14 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
1362 | blen++; | 1356 | blen++; |
1363 | else { | 1357 | else { |
1364 | if (bstart) | 1358 | if (bstart) |
1365 | gfs2_rlist_add(sdp, &rlist, bstart); | 1359 | gfs2_rlist_add(ip, &rlist, bstart); |
1366 | bstart = bn; | 1360 | bstart = bn; |
1367 | blen = 1; | 1361 | blen = 1; |
1368 | } | 1362 | } |
1369 | blks++; | 1363 | blks++; |
1370 | } | 1364 | } |
1371 | if (bstart) | 1365 | if (bstart) |
1372 | gfs2_rlist_add(sdp, &rlist, bstart); | 1366 | gfs2_rlist_add(ip, &rlist, bstart); |
1373 | else | 1367 | else |
1374 | goto out; | 1368 | goto out; |
1375 | 1369 | ||
@@ -1501,24 +1495,18 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) | |||
1501 | if (error) | 1495 | if (error) |
1502 | goto out_alloc; | 1496 | goto out_alloc; |
1503 | 1497 | ||
1504 | error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); | ||
1505 | if (error) | ||
1506 | goto out_quota; | ||
1507 | |||
1508 | error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); | 1498 | error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); |
1509 | if (error) | 1499 | if (error) |
1510 | goto out_rindex; | 1500 | goto out_quota; |
1511 | 1501 | ||
1512 | if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { | 1502 | if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { |
1513 | error = ea_dealloc_indirect(ip); | 1503 | error = ea_dealloc_indirect(ip); |
1514 | if (error) | 1504 | if (error) |
1515 | goto out_rindex; | 1505 | goto out_quota; |
1516 | } | 1506 | } |
1517 | 1507 | ||
1518 | error = ea_dealloc_block(ip); | 1508 | error = ea_dealloc_block(ip); |
1519 | 1509 | ||
1520 | out_rindex: | ||
1521 | gfs2_glock_dq_uninit(&al->al_ri_gh); | ||
1522 | out_quota: | 1510 | out_quota: |
1523 | gfs2_quota_unhold(ip); | 1511 | gfs2_quota_unhold(ip); |
1524 | out_alloc: | 1512 | out_alloc: |
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index c106ca22e812..d24a9b666a23 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c | |||
@@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
344 | struct inode *root, *inode; | 344 | struct inode *root, *inode; |
345 | struct qstr str; | 345 | struct qstr str; |
346 | struct nls_table *nls = NULL; | 346 | struct nls_table *nls = NULL; |
347 | u64 last_fs_block, last_fs_page; | ||
347 | int err; | 348 | int err; |
348 | 349 | ||
349 | err = -EINVAL; | 350 | err = -EINVAL; |
@@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) | |||
399 | if (!sbi->rsrc_clump_blocks) | 400 | if (!sbi->rsrc_clump_blocks) |
400 | sbi->rsrc_clump_blocks = 1; | 401 | sbi->rsrc_clump_blocks = 1; |
401 | 402 | ||
402 | err = generic_check_addressable(sbi->alloc_blksz_shift, | 403 | err = -EFBIG; |
403 | sbi->total_blocks); | 404 | last_fs_block = sbi->total_blocks - 1; |
404 | if (err) { | 405 | last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >> |
406 | PAGE_CACHE_SHIFT; | ||
407 | |||
408 | if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) || | ||
409 | (last_fs_page > (pgoff_t)(~0ULL))) { | ||
405 | printk(KERN_ERR "hfs: filesystem size too large.\n"); | 410 | printk(KERN_ERR "hfs: filesystem size too large.\n"); |
406 | goto out_free_vhdr; | 411 | goto out_free_vhdr; |
407 | } | 412 | } |
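The open-coded check above replaces generic_check_addressable() so that both the sector_t limit and the page-cache index limit are applied to the last block. A userspace sketch of the same arithmetic, assuming 4 KiB allocation blocks and pages (shift 12) and parameterizing the index widths to show why a 32-bit sector_t rejects a 16 TiB volume:

	#include <stdint.h>
	#include <stdio.h>

	static int fs_too_large(uint64_t total_blocks, unsigned blksz_shift,
				unsigned page_shift, unsigned sector_bits,
				unsigned pgoff_bits)
	{
		uint64_t last_fs_block = total_blocks - 1;
		uint64_t last_fs_page = (last_fs_block << blksz_shift) >> page_shift;
		uint64_t max_sector = (sector_bits == 64) ? UINT64_MAX
					: (UINT64_C(1) << sector_bits) - 1;
		uint64_t max_pgoff = (pgoff_bits == 64) ? UINT64_MAX
					: (UINT64_C(1) << pgoff_bits) - 1;

		/* last block in 512-byte sectors must fit in sector_t, and
		 * the last page index must fit in pgoff_t */
		return last_fs_block > (max_sector >> (blksz_shift - 9)) ||
		       last_fs_page > max_pgoff;
	}

	int main(void)
	{
		/* 2^32 blocks of 4 KiB (16 TiB): fine with 64-bit indexes... */
		printf("%d\n", fs_too_large(UINT64_C(1) << 32, 12, 12, 64, 64));
		/* ...too large when sector_t and pgoff_t are 32-bit */
		printf("%d\n", fs_too_large(UINT64_C(1) << 32, 12, 12, 32, 32));
		return 0;
	}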
@@ -525,8 +530,8 @@ out_close_cat_tree: | |||
525 | out_close_ext_tree: | 530 | out_close_ext_tree: |
526 | hfs_btree_close(sbi->ext_tree); | 531 | hfs_btree_close(sbi->ext_tree); |
527 | out_free_vhdr: | 532 | out_free_vhdr: |
528 | kfree(sbi->s_vhdr); | 533 | kfree(sbi->s_vhdr_buf); |
529 | kfree(sbi->s_backup_vhdr); | 534 | kfree(sbi->s_backup_vhdr_buf); |
530 | out_unload_nls: | 535 | out_unload_nls: |
531 | unload_nls(sbi->nls); | 536 | unload_nls(sbi->nls); |
532 | unload_nls(nls); | 537 | unload_nls(nls); |
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 10e515a0d452..7daf4b852d1c 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c | |||
@@ -272,9 +272,9 @@ reread: | |||
272 | return 0; | 272 | return 0; |
273 | 273 | ||
274 | out_free_backup_vhdr: | 274 | out_free_backup_vhdr: |
275 | kfree(sbi->s_backup_vhdr); | 275 | kfree(sbi->s_backup_vhdr_buf); |
276 | out_free_vhdr: | 276 | out_free_vhdr: |
277 | kfree(sbi->s_vhdr); | 277 | kfree(sbi->s_vhdr_buf); |
278 | out: | 278 | out: |
279 | return error; | 279 | return error; |
280 | } | 280 | } |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87b6e0421c12..ec889538e5a6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, | |||
491 | inode->i_op = &page_symlink_inode_operations; | 491 | inode->i_op = &page_symlink_inode_operations; |
492 | break; | 492 | break; |
493 | } | 493 | } |
494 | lockdep_annotate_inode_mutex_key(inode); | ||
494 | } | 495 | } |
495 | return inode; | 496 | return inode; |
496 | } | 497 | } |
diff --git a/fs/inode.c b/fs/inode.c index 73920d555c88..ecbb68dc7e2a 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -634,7 +634,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) | |||
634 | * inode to the back of the list so we don't spin on it. | 634 | * inode to the back of the list so we don't spin on it. |
635 | */ | 635 | */ |
636 | if (!spin_trylock(&inode->i_lock)) { | 636 | if (!spin_trylock(&inode->i_lock)) { |
637 | list_move(&inode->i_lru, &sb->s_inode_lru); | 637 | list_move_tail(&inode->i_lru, &sb->s_inode_lru); |
638 | continue; | 638 | continue; |
639 | } | 639 | } |
640 | 640 | ||
@@ -848,16 +848,9 @@ struct inode *new_inode(struct super_block *sb) | |||
848 | } | 848 | } |
849 | EXPORT_SYMBOL(new_inode); | 849 | EXPORT_SYMBOL(new_inode); |
850 | 850 | ||
851 | /** | ||
852 | * unlock_new_inode - clear the I_NEW state and wake up any waiters | ||
853 | * @inode: new inode to unlock | ||
854 | * | ||
855 | * Called when the inode is fully initialised to clear the new state of the | ||
856 | * inode and wake up anyone waiting for the inode to finish initialisation. | ||
857 | */ | ||
858 | void unlock_new_inode(struct inode *inode) | ||
859 | { | ||
860 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 851 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
852 | void lockdep_annotate_inode_mutex_key(struct inode *inode) | ||
853 | { | ||
861 | if (S_ISDIR(inode->i_mode)) { | 854 | if (S_ISDIR(inode->i_mode)) { |
862 | struct file_system_type *type = inode->i_sb->s_type; | 855 | struct file_system_type *type = inode->i_sb->s_type; |
863 | 856 | ||
@@ -873,7 +866,20 @@ void unlock_new_inode(struct inode *inode) | |||
873 | &type->i_mutex_dir_key); | 866 | &type->i_mutex_dir_key); |
874 | } | 867 | } |
875 | } | 868 | } |
869 | } | ||
870 | EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); | ||
876 | #endif | 871 | #endif |
872 | |||
873 | /** | ||
874 | * unlock_new_inode - clear the I_NEW state and wake up any waiters | ||
875 | * @inode: new inode to unlock | ||
876 | * | ||
877 | * Called when the inode is fully initialised to clear the new state of the | ||
878 | * inode and wake up anyone waiting for the inode to finish initialisation. | ||
879 | */ | ||
880 | void unlock_new_inode(struct inode *inode) | ||
881 | { | ||
882 | lockdep_annotate_inode_mutex_key(inode); | ||
877 | spin_lock(&inode->i_lock); | 883 | spin_lock(&inode->i_lock); |
878 | WARN_ON(!(inode->i_state & I_NEW)); | 884 | WARN_ON(!(inode->i_state & I_NEW)); |
879 | inode->i_state &= ~I_NEW; | 885 | inode->i_state &= ~I_NEW; |
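The refactoring above splits the lockdep i_mutex class annotation out of unlock_new_inode() into lockdep_annotate_inode_mutex_key(), so filesystems such as hugetlbfs that never mark inodes I_NEW can annotate directly. A hedged kernel-style sketch of that call pattern; sketchfs_fill_inode() is a hypothetical stand-in for filesystem-specific setup:

	/* Sketch: a filesystem that hands out inodes without I_NEW, so
	 * unlock_new_inode() is never called, annotates i_mutex itself. */
	static struct inode *sketchfs_get_inode(struct super_block *sb,
						umode_t mode)
	{
		struct inode *inode = new_inode(sb);

		if (inode) {
			inode->i_ino = get_next_ino();
			inode->i_mode = mode;
			sketchfs_fill_inode(inode);	/* hypothetical fs setup */
			/* pick the dir vs. non-dir i_mutex lockdep class, as
			 * unlock_new_inode() would otherwise have done */
			lockdep_annotate_inode_mutex_key(inode);
		}
		return inode;
	}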
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index cfeb7164b085..0f20208df602 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c | |||
@@ -22,26 +22,29 @@ | |||
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include "nodelist.h" | 23 | #include "nodelist.h" |
24 | 24 | ||
25 | /* ---- Initial Security Label Attachment -------------- */ | 25 | /* ---- Initial Security Label(s) Attachment callback --- */ |
26 | int jffs2_init_security(struct inode *inode, struct inode *dir, | 26 | int jffs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
27 | const struct qstr *qstr) | 27 | void *fs_info) |
28 | { | 28 | { |
29 | int rc; | 29 | const struct xattr *xattr; |
30 | size_t len; | 30 | int err = 0; |
31 | void *value; | ||
32 | char *name; | ||
33 | 31 | ||
34 | rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len); | 32 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
35 | if (rc) { | 33 | err = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, |
36 | if (rc == -EOPNOTSUPP) | 34 | xattr->name, xattr->value, |
37 | return 0; | 35 | xattr->value_len, 0); |
38 | return rc; | 36 | if (err < 0) |
37 | break; | ||
39 | } | 38 | } |
40 | rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0); | 39 | return err; |
40 | } | ||
41 | 41 | ||
42 | kfree(name); | 42 | /* ---- Initial Security Label(s) Attachment ----------- */ |
43 | kfree(value); | 43 | int jffs2_init_security(struct inode *inode, struct inode *dir, |
44 | return rc; | 44 | const struct qstr *qstr) |
45 | { | ||
46 | return security_inode_init_security(inode, dir, qstr, | ||
47 | &jffs2_initxattrs, NULL); | ||
45 | } | 48 | } |
46 | 49 | ||
47 | /* ---- XATTR Handler for "security.*" ----------------- */ | 50 | /* ---- XATTR Handler for "security.*" ----------------- */ |
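The rewritten helper reflects the new security_inode_init_security() contract: the LSM now assembles a NULL-name-terminated array of xattrs and hands it to a per-filesystem callback in one call, instead of returning a single name/value/len triple for the caller to persist and free. A minimal sketch of that contract for a hypothetical filesystem, mirroring the jffs2 code above; sketchfs_setxattr() stands in for the real setter:

	static int sketchfs_initxattrs(struct inode *inode,
				       const struct xattr *xattr_array,
				       void *fs_info)
	{
		const struct xattr *xattr;
		int err = 0;

		/* array is terminated by an entry with a NULL name */
		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
			err = sketchfs_setxattr(inode, xattr->name,
						xattr->value, xattr->value_len);
			if (err < 0)
				break;
		}
		return err;
	}

	static int sketchfs_init_security(struct inode *inode, struct inode *dir,
					  const struct qstr *qstr)
	{
		/* the last argument is an opaque cookie passed back as
		 * fs_info; jfs below uses it to carry its transaction id */
		return security_inode_init_security(inode, dir, qstr,
						    &sketchfs_initxattrs, NULL);
	}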
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index adcf92d3b603..7971f37534a3 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c | |||
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb) | |||
68 | /* | 68 | /* |
69 | * Wait for outstanding transactions to be written to log: | 69 | * Wait for outstanding transactions to be written to log: |
70 | */ | 70 | */ |
71 | jfs_flush_journal(log, 1); | 71 | jfs_flush_journal(log, 2); |
72 | 72 | ||
73 | /* | 73 | /* |
74 | * close fileset inode allocation map (aka fileset inode) | 74 | * close fileset inode allocation map (aka fileset inode) |
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb) | |||
146 | * | 146 | * |
147 | * remove file system from log active file system list. | 147 | * remove file system from log active file system list. |
148 | */ | 148 | */ |
149 | jfs_flush_journal(log, 1); | 149 | jfs_flush_journal(log, 2); |
150 | 150 | ||
151 | /* | 151 | /* |
152 | * Make sure all metadata makes it to disk | 152 | * Make sure all metadata makes it to disk |
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index e87fedef23db..26683e15b3ac 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c | |||
@@ -1089,38 +1089,37 @@ int jfs_removexattr(struct dentry *dentry, const char *name) | |||
1089 | } | 1089 | } |
1090 | 1090 | ||
1091 | #ifdef CONFIG_JFS_SECURITY | 1091 | #ifdef CONFIG_JFS_SECURITY |
1092 | int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, | 1092 | int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, |
1093 | const struct qstr *qstr) | 1093 | void *fs_info) |
1094 | { | 1094 | { |
1095 | int rc; | 1095 | const struct xattr *xattr; |
1096 | size_t len; | 1096 | tid_t *tid = fs_info; |
1097 | void *value; | ||
1098 | char *suffix; | ||
1099 | char *name; | 1097 | char *name; |
1100 | 1098 | int err = 0; | |
1101 | rc = security_inode_init_security(inode, dir, qstr, &suffix, &value, | 1099 | |
1102 | &len); | 1100 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
1103 | if (rc) { | 1101 | name = kmalloc(XATTR_SECURITY_PREFIX_LEN + |
1104 | if (rc == -EOPNOTSUPP) | 1102 | strlen(xattr->name) + 1, GFP_NOFS); |
1105 | return 0; | 1103 | if (!name) { |
1106 | return rc; | 1104 | err = -ENOMEM; |
1107 | } | 1105 | break; |
1108 | name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix), | 1106 | } |
1109 | GFP_NOFS); | 1107 | strcpy(name, XATTR_SECURITY_PREFIX); |
1110 | if (!name) { | 1108 | strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); |
1111 | rc = -ENOMEM; | 1109 | |
1112 | goto kmalloc_failed; | 1110 | err = __jfs_setxattr(*tid, inode, name, |
1111 | xattr->value, xattr->value_len, 0); | ||
1112 | kfree(name); | ||
1113 | if (err < 0) | ||
1114 | break; | ||
1113 | } | 1115 | } |
1114 | strcpy(name, XATTR_SECURITY_PREFIX); | 1116 | return err; |
1115 | strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); | 1117 | } |
1116 | |||
1117 | rc = __jfs_setxattr(tid, inode, name, value, len, 0); | ||
1118 | |||
1119 | kfree(name); | ||
1120 | kmalloc_failed: | ||
1121 | kfree(suffix); | ||
1122 | kfree(value); | ||
1123 | 1118 | ||
1124 | return rc; | 1119 | int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, |
1120 | const struct qstr *qstr) | ||
1121 | { | ||
1122 | return security_inode_init_security(inode, dir, qstr, | ||
1123 | &jfs_initxattrs, &tid); | ||
1125 | } | 1124 | } |
1126 | #endif | 1125 | #endif |
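The jfs callback differs from jffs2's only in that __jfs_setxattr() wants the full "security." name, so each bare suffix from the LSM must be re-prefixed. A small userspace sketch of that name assembly (the "selinux" suffix is just an example); the buffer is sized prefix + suffix + 1 for the terminating NUL:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define XATTR_SECURITY_PREFIX "security."
	#define XATTR_SECURITY_PREFIX_LEN (sizeof(XATTR_SECURITY_PREFIX) - 1)

	int main(void)
	{
		const char *suffix = "selinux";	/* example suffix from the LSM */
		char *name = malloc(XATTR_SECURITY_PREFIX_LEN +
				    strlen(suffix) + 1);

		if (!name)
			return 1;
		strcpy(name, XATTR_SECURITY_PREFIX);
		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
		printf("%s\n", name);		/* security.selinux */
		free(name);
		return 0;
	}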
diff --git a/fs/lockd/host.c b/fs/lockd/host.c index b7c99bfb3da6..6f29836ec0cb 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c | |||
@@ -316,14 +316,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, | |||
316 | struct hlist_node *pos; | 316 | struct hlist_node *pos; |
317 | struct nlm_host *host = NULL; | 317 | struct nlm_host *host = NULL; |
318 | struct nsm_handle *nsm = NULL; | 318 | struct nsm_handle *nsm = NULL; |
319 | struct sockaddr_in sin = { | 319 | struct sockaddr *src_sap = svc_daddr(rqstp); |
320 | .sin_family = AF_INET, | 320 | size_t src_len = rqstp->rq_daddrlen; |
321 | }; | ||
322 | struct sockaddr_in6 sin6 = { | ||
323 | .sin6_family = AF_INET6, | ||
324 | }; | ||
325 | struct sockaddr *src_sap; | ||
326 | size_t src_len = rqstp->rq_addrlen; | ||
327 | struct nlm_lookup_host_info ni = { | 321 | struct nlm_lookup_host_info ni = { |
328 | .server = 1, | 322 | .server = 1, |
329 | .sap = svc_addr(rqstp), | 323 | .sap = svc_addr(rqstp), |
@@ -340,21 +334,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, | |||
340 | 334 | ||
341 | mutex_lock(&nlm_host_mutex); | 335 | mutex_lock(&nlm_host_mutex); |
342 | 336 | ||
343 | switch (ni.sap->sa_family) { | ||
344 | case AF_INET: | ||
345 | sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; | ||
346 | src_sap = (struct sockaddr *)&sin; | ||
347 | break; | ||
348 | case AF_INET6: | ||
349 | ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); | ||
350 | src_sap = (struct sockaddr *)&sin6; | ||
351 | break; | ||
352 | default: | ||
353 | dprintk("lockd: %s failed; unrecognized address family\n", | ||
354 | __func__); | ||
355 | goto out; | ||
356 | } | ||
357 | |||
358 | if (time_after_eq(jiffies, next_gc)) | 337 | if (time_after_eq(jiffies, next_gc)) |
359 | nlm_gc_hosts(); | 338 | nlm_gc_hosts(); |
360 | 339 | ||
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index abfff9d7979d..c061b9aa7ddb 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c | |||
@@ -282,7 +282,7 @@ int lockd_up(void) | |||
282 | /* | 282 | /* |
283 | * Create the kernel thread and wait for it to start. | 283 | * Create the kernel thread and wait for it to start. |
284 | */ | 284 | */ |
285 | nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]); | 285 | nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); |
286 | if (IS_ERR(nlmsvc_rqst)) { | 286 | if (IS_ERR(nlmsvc_rqst)) { |
287 | error = PTR_ERR(nlmsvc_rqst); | 287 | error = PTR_ERR(nlmsvc_rqst); |
288 | nlmsvc_rqst = NULL; | 288 | nlmsvc_rqst = NULL; |
diff --git a/fs/locks.c b/fs/locks.c index 703f545097de..3b0d05dcd7c1 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
@@ -60,7 +60,7 @@ | |||
60 | * | 60 | * |
61 | * Initial implementation of mandatory locks. SunOS turned out to be | 61 | * Initial implementation of mandatory locks. SunOS turned out to be |
62 | * a rotten model, so I implemented the "obvious" semantics. | 62 | * a rotten model, so I implemented the "obvious" semantics. |
63 | * See 'Documentation/mandatory.txt' for details. | 63 | * See 'Documentation/filesystems/mandatory-locking.txt' for details. |
64 | * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. | 64 | * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. |
65 | * | 65 | * |
66 | * Don't allow mandatory locks on mmap()'ed files. Added simple functions to | 66 | * Don't allow mandatory locks on mmap()'ed files. Added simple functions to |
@@ -133,6 +133,20 @@ | |||
133 | #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) | 133 | #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) |
134 | #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) | 134 | #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) |
135 | 135 | ||
136 | static bool lease_breaking(struct file_lock *fl) | ||
137 | { | ||
138 | return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); | ||
139 | } | ||
140 | |||
141 | static int target_leasetype(struct file_lock *fl) | ||
142 | { | ||
143 | if (fl->fl_flags & FL_UNLOCK_PENDING) | ||
144 | return F_UNLCK; | ||
145 | if (fl->fl_flags & FL_DOWNGRADE_PENDING) | ||
146 | return F_RDLCK; | ||
147 | return fl->fl_type; | ||
148 | } | ||
149 | |||
136 | int leases_enable = 1; | 150 | int leases_enable = 1; |
137 | int lease_break_time = 45; | 151 | int lease_break_time = 45; |
138 | 152 | ||
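These helpers replace the old trick of OR-ing F_INPROGRESS into fl_type with two explicit pending flags, so a lease can record whether it is being broken all the way to F_UNLCK or merely downgraded to F_RDLCK. A standalone sketch of the mapping; the flag values are illustrative, not the kernel's:

	#include <stdio.h>
	#include <fcntl.h>		/* F_RDLCK, F_WRLCK, F_UNLCK */

	#define FL_DOWNGRADE_PENDING	0x100	/* illustrative values */
	#define FL_UNLOCK_PENDING	0x200

	struct fl_sketch { int fl_flags; int fl_type; };

	static int target_leasetype_sketch(const struct fl_sketch *fl)
	{
		if (fl->fl_flags & FL_UNLOCK_PENDING)
			return F_UNLCK;
		if (fl->fl_flags & FL_DOWNGRADE_PENDING)
			return F_RDLCK;
		return fl->fl_type;	/* no break in progress */
	}

	int main(void)
	{
		struct fl_sketch fl = { .fl_flags = 0, .fl_type = F_WRLCK };

		printf("%d\n", target_leasetype_sketch(&fl)); /* F_WRLCK: stable */
		fl.fl_flags |= FL_DOWNGRADE_PENDING;
		printf("%d\n", target_leasetype_sketch(&fl)); /* F_RDLCK */
		fl.fl_flags |= FL_UNLOCK_PENDING;  /* unlock wins over downgrade */
		printf("%d\n", target_leasetype_sketch(&fl)); /* F_UNLCK */
		return 0;
	}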
@@ -1119,6 +1133,17 @@ int locks_mandatory_area(int read_write, struct inode *inode, | |||
1119 | 1133 | ||
1120 | EXPORT_SYMBOL(locks_mandatory_area); | 1134 | EXPORT_SYMBOL(locks_mandatory_area); |
1121 | 1135 | ||
1136 | static void lease_clear_pending(struct file_lock *fl, int arg) | ||
1137 | { | ||
1138 | switch (arg) { | ||
1139 | case F_UNLCK: | ||
1140 | fl->fl_flags &= ~FL_UNLOCK_PENDING; | ||
1141 | /* fall through: */ | ||
1142 | case F_RDLCK: | ||
1143 | fl->fl_flags &= ~FL_DOWNGRADE_PENDING; | ||
1144 | } | ||
1145 | } | ||
1146 | |||
1122 | /* We already had a lease on this file; just change its type */ | 1147 | /* We already had a lease on this file; just change its type */ |
1123 | int lease_modify(struct file_lock **before, int arg) | 1148 | int lease_modify(struct file_lock **before, int arg) |
1124 | { | 1149 | { |
@@ -1127,6 +1152,7 @@ int lease_modify(struct file_lock **before, int arg) | |||
1127 | 1152 | ||
1128 | if (error) | 1153 | if (error) |
1129 | return error; | 1154 | return error; |
1155 | lease_clear_pending(fl, arg); | ||
1130 | locks_wake_up_blocks(fl); | 1156 | locks_wake_up_blocks(fl); |
1131 | if (arg == F_UNLCK) | 1157 | if (arg == F_UNLCK) |
1132 | locks_delete_lock(before); | 1158 | locks_delete_lock(before); |
@@ -1135,19 +1161,25 @@ int lease_modify(struct file_lock **before, int arg) | |||
1135 | 1161 | ||
1136 | EXPORT_SYMBOL(lease_modify); | 1162 | EXPORT_SYMBOL(lease_modify); |
1137 | 1163 | ||
1164 | static bool past_time(unsigned long then) | ||
1165 | { | ||
1166 | if (!then) | ||
1167 | /* 0 is a special value meaning "this never expires": */ | ||
1168 | return false; | ||
1169 | return time_after(jiffies, then); | ||
1170 | } | ||
1171 | |||
1138 | static void time_out_leases(struct inode *inode) | 1172 | static void time_out_leases(struct inode *inode) |
1139 | { | 1173 | { |
1140 | struct file_lock **before; | 1174 | struct file_lock **before; |
1141 | struct file_lock *fl; | 1175 | struct file_lock *fl; |
1142 | 1176 | ||
1143 | before = &inode->i_flock; | 1177 | before = &inode->i_flock; |
1144 | while ((fl = *before) && IS_LEASE(fl) && (fl->fl_type & F_INPROGRESS)) { | 1178 | while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { |
1145 | if ((fl->fl_break_time == 0) | 1179 | if (past_time(fl->fl_downgrade_time)) |
1146 | || time_before(jiffies, fl->fl_break_time)) { | 1180 | lease_modify(before, F_RDLCK); |
1147 | before = &fl->fl_next; | 1181 | if (past_time(fl->fl_break_time)) |
1148 | continue; | 1182 | lease_modify(before, F_UNLCK); |
1149 | } | ||
1150 | lease_modify(before, fl->fl_type & ~F_INPROGRESS); | ||
1151 | if (fl == *before) /* lease_modify may have freed fl */ | 1183 | if (fl == *before) /* lease_modify may have freed fl */ |
1152 | before = &fl->fl_next; | 1184 | before = &fl->fl_next; |
1153 | } | 1185 | } |
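past_time() gives time_out_leases() its two-deadline behaviour: a zero timestamp means that transition never expires, anything else is compared against jiffies. A userspace sketch with jiffies replaced by a plain counter; the signed-difference compare approximates the kernel's wraparound-safe time_after():

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long jiffies_sketch = 1000;

	static bool past_time_sketch(unsigned long then)
	{
		if (!then)
			return false;	/* 0 means "this never expires" */
		return (long)(jiffies_sketch - then) > 0;  /* ~time_after() */
	}

	int main(void)
	{
		printf("%d\n", past_time_sketch(0));	/* 0: never expires */
		printf("%d\n", past_time_sketch(900));	/* 1: deadline passed */
		printf("%d\n", past_time_sketch(2000));	/* 0: still in future */
		return 0;
	}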
@@ -1165,7 +1197,7 @@ static void time_out_leases(struct inode *inode) | |||
1165 | */ | 1197 | */ |
1166 | int __break_lease(struct inode *inode, unsigned int mode) | 1198 | int __break_lease(struct inode *inode, unsigned int mode) |
1167 | { | 1199 | { |
1168 | int error = 0, future; | 1200 | int error = 0; |
1169 | struct file_lock *new_fl, *flock; | 1201 | struct file_lock *new_fl, *flock; |
1170 | struct file_lock *fl; | 1202 | struct file_lock *fl; |
1171 | unsigned long break_time; | 1203 | unsigned long break_time; |
@@ -1182,24 +1214,13 @@ int __break_lease(struct inode *inode, unsigned int mode) | |||
1182 | if ((flock == NULL) || !IS_LEASE(flock)) | 1214 | if ((flock == NULL) || !IS_LEASE(flock)) |
1183 | goto out; | 1215 | goto out; |
1184 | 1216 | ||
1217 | if (!locks_conflict(flock, new_fl)) | ||
1218 | goto out; | ||
1219 | |||
1185 | for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) | 1220 | for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) |
1186 | if (fl->fl_owner == current->files) | 1221 | if (fl->fl_owner == current->files) |
1187 | i_have_this_lease = 1; | 1222 | i_have_this_lease = 1; |
1188 | 1223 | ||
1189 | if (want_write) { | ||
1190 | /* If we want write access, we have to revoke any lease. */ | ||
1191 | future = F_UNLCK | F_INPROGRESS; | ||
1192 | } else if (flock->fl_type & F_INPROGRESS) { | ||
1193 | /* If the lease is already being broken, we just leave it */ | ||
1194 | future = flock->fl_type; | ||
1195 | } else if (flock->fl_type & F_WRLCK) { | ||
1196 | /* Downgrade the exclusive lease to a read-only lease. */ | ||
1197 | future = F_RDLCK | F_INPROGRESS; | ||
1198 | } else { | ||
1199 | /* the existing lease was read-only, so we can read too. */ | ||
1200 | goto out; | ||
1201 | } | ||
1202 | |||
1203 | if (IS_ERR(new_fl) && !i_have_this_lease | 1224 | if (IS_ERR(new_fl) && !i_have_this_lease |
1204 | && ((mode & O_NONBLOCK) == 0)) { | 1225 | && ((mode & O_NONBLOCK) == 0)) { |
1205 | error = PTR_ERR(new_fl); | 1226 | error = PTR_ERR(new_fl); |
@@ -1214,12 +1235,18 @@ int __break_lease(struct inode *inode, unsigned int mode) | |||
1214 | } | 1235 | } |
1215 | 1236 | ||
1216 | for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { | 1237 | for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { |
1217 | if (fl->fl_type != future) { | 1238 | if (want_write) { |
1218 | fl->fl_type = future; | 1239 | if (fl->fl_flags & FL_UNLOCK_PENDING) |
1240 | continue; | ||
1241 | fl->fl_flags |= FL_UNLOCK_PENDING; | ||
1219 | fl->fl_break_time = break_time; | 1242 | fl->fl_break_time = break_time; |
1220 | /* lease must have lmops break callback */ | 1243 | } else { |
1221 | fl->fl_lmops->lm_break(fl); | 1244 | if (lease_breaking(flock)) |
1245 | continue; | ||
1246 | fl->fl_flags |= FL_DOWNGRADE_PENDING; | ||
1247 | fl->fl_downgrade_time = break_time; | ||
1222 | } | 1248 | } |
1249 | fl->fl_lmops->lm_break(fl); | ||
1223 | } | 1250 | } |
1224 | 1251 | ||
1225 | if (i_have_this_lease || (mode & O_NONBLOCK)) { | 1252 | if (i_have_this_lease || (mode & O_NONBLOCK)) { |
@@ -1243,10 +1270,13 @@ restart: | |||
1243 | if (error >= 0) { | 1270 | if (error >= 0) { |
1244 | if (error == 0) | 1271 | if (error == 0) |
1245 | time_out_leases(inode); | 1272 | time_out_leases(inode); |
1246 | /* Wait for the next lease that has not been broken yet */ | 1273 | /* |
1274 | * Wait for the next conflicting lease that has not been | ||
1275 | * broken yet | ||
1276 | */ | ||
1247 | for (flock = inode->i_flock; flock && IS_LEASE(flock); | 1277 | for (flock = inode->i_flock; flock && IS_LEASE(flock); |
1248 | flock = flock->fl_next) { | 1278 | flock = flock->fl_next) { |
1249 | if (flock->fl_type & F_INPROGRESS) | 1279 | if (locks_conflict(new_fl, flock)) |
1250 | goto restart; | 1280 | goto restart; |
1251 | } | 1281 | } |
1252 | error = 0; | 1282 | error = 0; |
@@ -1314,7 +1344,7 @@ int fcntl_getlease(struct file *filp) | |||
1314 | for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); | 1344 | for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl); |
1315 | fl = fl->fl_next) { | 1345 | fl = fl->fl_next) { |
1316 | if (fl->fl_file == filp) { | 1346 | if (fl->fl_file == filp) { |
1317 | type = fl->fl_type & ~F_INPROGRESS; | 1347 | type = target_leasetype(fl); |
1318 | break; | 1348 | break; |
1319 | } | 1349 | } |
1320 | } | 1350 | } |
@@ -1322,50 +1352,23 @@ int fcntl_getlease(struct file *filp) | |||
1322 | return type; | 1352 | return type; |
1323 | } | 1353 | } |
1324 | 1354 | ||
1325 | /** | 1355 | int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) |
1326 | * generic_setlease - sets a lease on an open file | ||
1327 | * @filp: file pointer | ||
1328 | * @arg: type of lease to obtain | ||
1329 | * @flp: input - file_lock to use, output - file_lock inserted | ||
1330 | * | ||
1331 | * The (input) flp->fl_lmops->lm_break function is required | ||
1332 | * by break_lease(). | ||
1333 | * | ||
1334 | * Called with file_lock_lock held. | ||
1335 | */ | ||
1336 | int generic_setlease(struct file *filp, long arg, struct file_lock **flp) | ||
1337 | { | 1356 | { |
1338 | struct file_lock *fl, **before, **my_before = NULL, *lease; | 1357 | struct file_lock *fl, **before, **my_before = NULL, *lease; |
1339 | struct dentry *dentry = filp->f_path.dentry; | 1358 | struct dentry *dentry = filp->f_path.dentry; |
1340 | struct inode *inode = dentry->d_inode; | 1359 | struct inode *inode = dentry->d_inode; |
1341 | int error, rdlease_count = 0, wrlease_count = 0; | 1360 | int error; |
1342 | 1361 | ||
1343 | lease = *flp; | 1362 | lease = *flp; |
1344 | 1363 | ||
1345 | error = -EACCES; | 1364 | error = -EAGAIN; |
1346 | if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) | 1365 | if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) |
1347 | goto out; | ||
1348 | error = -EINVAL; | ||
1349 | if (!S_ISREG(inode->i_mode)) | ||
1350 | goto out; | 1366 | goto out; |
1351 | error = security_file_lock(filp, arg); | 1367 | if ((arg == F_WRLCK) |
1352 | if (error) | 1368 | && ((dentry->d_count > 1) |
1369 | || (atomic_read(&inode->i_count) > 1))) | ||
1353 | goto out; | 1370 | goto out; |
1354 | 1371 | ||
1355 | time_out_leases(inode); | ||
1356 | |||
1357 | BUG_ON(!(*flp)->fl_lmops->lm_break); | ||
1358 | |||
1359 | if (arg != F_UNLCK) { | ||
1360 | error = -EAGAIN; | ||
1361 | if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) | ||
1362 | goto out; | ||
1363 | if ((arg == F_WRLCK) | ||
1364 | && ((dentry->d_count > 1) | ||
1365 | || (atomic_read(&inode->i_count) > 1))) | ||
1366 | goto out; | ||
1367 | } | ||
1368 | |||
1369 | /* | 1372 | /* |
1370 | * At this point, we know that if there is an exclusive | 1373 | * At this point, we know that if there is an exclusive |
1371 | * lease on this file, then we hold it on this filp | 1374 | * lease on this file, then we hold it on this filp |
@@ -1374,27 +1377,28 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) | |||
1374 | * then the file is not open by anyone (including us) | 1377 | * then the file is not open by anyone (including us) |
1375 | * except for this filp. | 1378 | * except for this filp. |
1376 | */ | 1379 | */ |
1380 | error = -EAGAIN; | ||
1377 | for (before = &inode->i_flock; | 1381 | for (before = &inode->i_flock; |
1378 | ((fl = *before) != NULL) && IS_LEASE(fl); | 1382 | ((fl = *before) != NULL) && IS_LEASE(fl); |
1379 | before = &fl->fl_next) { | 1383 | before = &fl->fl_next) { |
1380 | if (fl->fl_file == filp) | 1384 | if (fl->fl_file == filp) { |
1381 | my_before = before; | 1385 | my_before = before; |
1382 | else if (fl->fl_type == (F_INPROGRESS | F_UNLCK)) | 1386 | continue; |
1383 | /* | 1387 | } |
1384 | * Someone is in the process of opening this | 1388 | /* |
1385 | * file for writing so we may not take an | 1389 | * No exclusive leases if someone else has a lease on |
1386 | * exclusive lease on it. | 1390 | * this file: |
1387 | */ | 1391 | */ |
1388 | wrlease_count++; | 1392 | if (arg == F_WRLCK) |
1389 | else | 1393 | goto out; |
1390 | rdlease_count++; | 1394 | /* |
1395 | * Modifying our existing lease is OK, but not getting a | ||
1396 | * new lease if someone else is opening for write: | ||
1397 | */ | ||
1398 | if (fl->fl_flags & FL_UNLOCK_PENDING) | ||
1399 | goto out; | ||
1391 | } | 1400 | } |
1392 | 1401 | ||
1393 | error = -EAGAIN; | ||
1394 | if ((arg == F_RDLCK && (wrlease_count > 0)) || | ||
1395 | (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) | ||
1396 | goto out; | ||
1397 | |||
1398 | if (my_before != NULL) { | 1402 | if (my_before != NULL) { |
1399 | error = lease->fl_lmops->lm_change(my_before, arg); | 1403 | error = lease->fl_lmops->lm_change(my_before, arg); |
1400 | if (!error) | 1404 | if (!error) |
@@ -1402,9 +1406,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) | |||
1402 | goto out; | 1406 | goto out; |
1403 | } | 1407 | } |
1404 | 1408 | ||
1405 | if (arg == F_UNLCK) | ||
1406 | goto out; | ||
1407 | |||
1408 | error = -EINVAL; | 1409 | error = -EINVAL; |
1409 | if (!leases_enable) | 1410 | if (!leases_enable) |
1410 | goto out; | 1411 | goto out; |
@@ -1415,6 +1416,62 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) | |||
1415 | out: | 1416 | out: |
1416 | return error; | 1417 | return error; |
1417 | } | 1418 | } |
1419 | |||
1420 | int generic_delete_lease(struct file *filp, struct file_lock **flp) | ||
1421 | { | ||
1422 | struct file_lock *fl, **before; | ||
1423 | struct dentry *dentry = filp->f_path.dentry; | ||
1424 | struct inode *inode = dentry->d_inode; | ||
1425 | |||
1426 | for (before = &inode->i_flock; | ||
1427 | ((fl = *before) != NULL) && IS_LEASE(fl); | ||
1428 | before = &fl->fl_next) { | ||
1429 | if (fl->fl_file != filp) | ||
1430 | continue; | ||
1431 | return (*flp)->fl_lmops->lm_change(before, F_UNLCK); | ||
1432 | } | ||
1433 | return -EAGAIN; | ||
1434 | } | ||
1435 | |||
1436 | /** | ||
1437 | * generic_setlease - sets a lease on an open file | ||
1438 | * @filp: file pointer | ||
1439 | * @arg: type of lease to obtain | ||
1440 | * @flp: input - file_lock to use, output - file_lock inserted | ||
1441 | * | ||
1442 | * The (input) flp->fl_lmops->lm_break function is required | ||
1443 | * by break_lease(). | ||
1444 | * | ||
1445 | * Called with file_lock_lock held. | ||
1446 | */ | ||
1447 | int generic_setlease(struct file *filp, long arg, struct file_lock **flp) | ||
1448 | { | ||
1449 | struct dentry *dentry = filp->f_path.dentry; | ||
1450 | struct inode *inode = dentry->d_inode; | ||
1451 | int error; | ||
1452 | |||
1453 | if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) | ||
1454 | return -EACCES; | ||
1455 | if (!S_ISREG(inode->i_mode)) | ||
1456 | return -EINVAL; | ||
1457 | error = security_file_lock(filp, arg); | ||
1458 | if (error) | ||
1459 | return error; | ||
1460 | |||
1461 | time_out_leases(inode); | ||
1462 | |||
1463 | BUG_ON(!(*flp)->fl_lmops->lm_break); | ||
1464 | |||
1465 | switch (arg) { | ||
1466 | case F_UNLCK: | ||
1467 | return generic_delete_lease(filp, flp); | ||
1468 | case F_RDLCK: | ||
1469 | case F_WRLCK: | ||
1470 | return generic_add_lease(filp, arg, flp); | ||
1471 | default: | ||
1472 | BUG(); | ||
1473 | } | ||
1474 | } | ||
1418 | EXPORT_SYMBOL(generic_setlease); | 1475 | EXPORT_SYMBOL(generic_setlease); |
1419 | 1476 | ||
1420 | static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) | 1477 | static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) |
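From userspace the new dispatch is invisible: F_SETLEASE with F_RDLCK or F_WRLCK ends up in generic_add_lease(), while F_UNLCK goes to generic_delete_lease(). A minimal usage sketch; the path is arbitrary, error handling is abbreviated, and leases still require owning the file or holding CAP_LEASE:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/leasefile", O_RDONLY | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)  /* generic_add_lease() */
			perror("F_SETLEASE F_RDLCK");
		printf("lease now: %d\n", fcntl(fd, F_GETLEASE));
		if (fcntl(fd, F_SETLEASE, F_UNLCK) == -1)  /* generic_delete_lease() */
			perror("F_SETLEASE F_UNLCK");
		close(fd);
		return 0;
	}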
@@ -2126,7 +2183,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, | |||
2126 | } | 2183 | } |
2127 | } else if (IS_LEASE(fl)) { | 2184 | } else if (IS_LEASE(fl)) { |
2128 | seq_printf(f, "LEASE "); | 2185 | seq_printf(f, "LEASE "); |
2129 | if (fl->fl_type & F_INPROGRESS) | 2186 | if (lease_breaking(fl)) |
2130 | seq_printf(f, "BREAKING "); | 2187 | seq_printf(f, "BREAKING "); |
2131 | else if (fl->fl_file) | 2188 | else if (fl->fl_file) |
2132 | seq_printf(f, "ACTIVE "); | 2189 | seq_printf(f, "ACTIVE "); |
@@ -2142,7 +2199,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, | |||
2142 | : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); | 2199 | : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); |
2143 | } else { | 2200 | } else { |
2144 | seq_printf(f, "%s ", | 2201 | seq_printf(f, "%s ", |
2145 | (fl->fl_type & F_INPROGRESS) | 2202 | (lease_breaking(fl)) |
2146 | ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " | 2203 | ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ " |
2147 | : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); | 2204 | : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ "); |
2148 | } | 2205 | } |
diff --git a/fs/namei.c b/fs/namei.c index 2826db35dc25..7657be4352bf 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -221,14 +221,12 @@ static int check_acl(struct inode *inode, int mask) | |||
221 | } | 221 | } |
222 | 222 | ||
223 | /* | 223 | /* |
224 | * This does basic POSIX ACL permission checking | 224 | * This does the basic permission checking |
225 | */ | 225 | */ |
226 | static int acl_permission_check(struct inode *inode, int mask) | 226 | static int acl_permission_check(struct inode *inode, int mask) |
227 | { | 227 | { |
228 | unsigned int mode = inode->i_mode; | 228 | unsigned int mode = inode->i_mode; |
229 | 229 | ||
230 | mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; | ||
231 | |||
232 | if (current_user_ns() != inode_userns(inode)) | 230 | if (current_user_ns() != inode_userns(inode)) |
233 | goto other_perms; | 231 | goto other_perms; |
234 | 232 | ||
@@ -257,7 +255,7 @@ other_perms: | |||
257 | /** | 255 | /** |
258 | * generic_permission - check for access rights on a Posix-like filesystem | 256 | * generic_permission - check for access rights on a Posix-like filesystem |
259 | * @inode: inode to check access rights for | 257 | * @inode: inode to check access rights for |
260 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | 258 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) |
261 | * | 259 | * |
262 | * Used to check for read/write/execute permissions on a file. | 260 | * Used to check for read/write/execute permissions on a file. |
263 | * We use "fsuid" for this, letting us set arbitrary permissions | 261 | * We use "fsuid" for this, letting us set arbitrary permissions |
@@ -273,7 +271,7 @@ int generic_permission(struct inode *inode, int mask) | |||
273 | int ret; | 271 | int ret; |
274 | 272 | ||
275 | /* | 273 | /* |
276 | * Do the basic POSIX ACL permission checks. | 274 | * Do the basic permission checks. |
277 | */ | 275 | */ |
278 | ret = acl_permission_check(inode, mask); | 276 | ret = acl_permission_check(inode, mask); |
279 | if (ret != -EACCES) | 277 | if (ret != -EACCES) |
@@ -331,12 +329,14 @@ static inline int do_inode_permission(struct inode *inode, int mask) | |||
331 | /** | 329 | /** |
332 | * inode_permission - check for access rights to a given inode | 330 | * inode_permission - check for access rights to a given inode |
333 | * @inode: inode to check permission on | 331 | * @inode: inode to check permission on |
334 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | 332 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) |
335 | * | 333 | * |
336 | * Used to check for read/write/execute permissions on an inode. | 334 | * Used to check for read/write/execute permissions on an inode. |
337 | * We use "fsuid" for this, letting us set arbitrary permissions | 335 | * We use "fsuid" for this, letting us set arbitrary permissions |
338 | * for filesystem access without changing the "normal" uids which | 336 | * for filesystem access without changing the "normal" uids which |
339 | * are used for other things. | 337 | * are used for other things. |
338 | * | ||
339 | * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. | ||
340 | */ | 340 | */ |
341 | int inode_permission(struct inode *inode, int mask) | 341 | int inode_permission(struct inode *inode, int mask) |
342 | { | 342 | { |
@@ -721,31 +721,22 @@ static int follow_automount(struct path *path, unsigned flags, | |||
721 | if (!path->dentry->d_op || !path->dentry->d_op->d_automount) | 721 | if (!path->dentry->d_op || !path->dentry->d_op->d_automount) |
722 | return -EREMOTE; | 722 | return -EREMOTE; |
723 | 723 | ||
724 | /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT | 724 | /* We don't want to mount if someone's just doing a stat - |
725 | * and this is the terminal part of the path. | 725 | * unless they're stat'ing a directory and appended a '/' to |
726 | * the name. | ||
727 | * | ||
728 | * We do, however, want to mount if someone wants to open or | ||
729 | * create a file of any type under the mountpoint, wants to | ||
730 | * traverse through the mountpoint or wants to open the | ||
731 | * mounted directory. Also, autofs may mark negative dentries | ||
732 | * as being automount points. These will need the attentions | ||
733 | * of the daemon to instantiate them before they can be used. | ||
726 | */ | 734 | */ |
727 | if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) | 735 | if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | |
728 | return -EISDIR; /* we actually want to stop here */ | 736 | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && |
737 | path->dentry->d_inode) | ||
738 | return -EISDIR; | ||
729 | 739 | ||
730 | /* | ||
731 | * We don't want to mount if someone's just doing a stat and they've | ||
732 | * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and | ||
733 | * appended a '/' to the name. | ||
734 | */ | ||
735 | if (!(flags & LOOKUP_FOLLOW)) { | ||
736 | /* We do, however, want to mount if someone wants to open or | ||
737 | * create a file of any type under the mountpoint, wants to | ||
738 | * traverse through the mountpoint or wants to open the mounted | ||
739 | * directory. | ||
740 | * Also, autofs may mark negative dentries as being automount | ||
741 | * points. These will need the attentions of the daemon to | ||
742 | * instantiate them before they can be used. | ||
743 | */ | ||
744 | if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | | ||
745 | LOOKUP_OPEN | LOOKUP_CREATE)) && | ||
746 | path->dentry->d_inode) | ||
747 | return -EISDIR; | ||
748 | } | ||
749 | current->total_link_count++; | 740 | current->total_link_count++; |
750 | if (current->total_link_count >= 40) | 741 | if (current->total_link_count >= 40) |
751 | return -ELOOP; | 742 | return -ELOOP; |
@@ -2044,10 +2035,7 @@ static int may_open(struct path *path, int acc_mode, int flag) | |||
2044 | if (flag & O_NOATIME && !inode_owner_or_capable(inode)) | 2035 | if (flag & O_NOATIME && !inode_owner_or_capable(inode)) |
2045 | return -EPERM; | 2036 | return -EPERM; |
2046 | 2037 | ||
2047 | /* | 2038 | return 0; |
2048 | * Ensure there are no outstanding leases on the file. | ||
2049 | */ | ||
2050 | return break_lease(inode, flag); | ||
2051 | } | 2039 | } |
2052 | 2040 | ||
2053 | static int handle_truncate(struct file *filp) | 2041 | static int handle_truncate(struct file *filp) |
@@ -2619,6 +2607,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
2619 | if (!dir->i_op->rmdir) | 2607 | if (!dir->i_op->rmdir) |
2620 | return -EPERM; | 2608 | return -EPERM; |
2621 | 2609 | ||
2610 | dget(dentry); | ||
2622 | mutex_lock(&dentry->d_inode->i_mutex); | 2611 | mutex_lock(&dentry->d_inode->i_mutex); |
2623 | 2612 | ||
2624 | error = -EBUSY; | 2613 | error = -EBUSY; |
@@ -2639,6 +2628,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
2639 | 2628 | ||
2640 | out: | 2629 | out: |
2641 | mutex_unlock(&dentry->d_inode->i_mutex); | 2630 | mutex_unlock(&dentry->d_inode->i_mutex); |
2631 | dput(dentry); | ||
2642 | if (!error) | 2632 | if (!error) |
2643 | d_delete(dentry); | 2633 | d_delete(dentry); |
2644 | return error; | 2634 | return error; |
@@ -3028,6 +3018,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, | |||
3028 | if (error) | 3018 | if (error) |
3029 | return error; | 3019 | return error; |
3030 | 3020 | ||
3021 | dget(new_dentry); | ||
3031 | if (target) | 3022 | if (target) |
3032 | mutex_lock(&target->i_mutex); | 3023 | mutex_lock(&target->i_mutex); |
3033 | 3024 | ||
@@ -3048,6 +3039,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, | |||
3048 | out: | 3039 | out: |
3049 | if (target) | 3040 | if (target) |
3050 | mutex_unlock(&target->i_mutex); | 3041 | mutex_unlock(&target->i_mutex); |
3042 | dput(new_dentry); | ||
3051 | if (!error) | 3043 | if (!error) |
3052 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) | 3044 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) |
3053 | d_move(old_dentry,new_dentry); | 3045 | d_move(old_dentry,new_dentry); |
diff --git a/fs/namespace.c b/fs/namespace.c index 22bfe8273c68..e5e1c7d1839b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -1109,6 +1109,7 @@ static int show_vfsstat(struct seq_file *m, void *v) | |||
1109 | 1109 | ||
1110 | /* device */ | 1110 | /* device */ |
1111 | if (mnt->mnt_sb->s_op->show_devname) { | 1111 | if (mnt->mnt_sb->s_op->show_devname) { |
1112 | seq_puts(m, "device "); | ||
1112 | err = mnt->mnt_sb->s_op->show_devname(m, mnt); | 1113 | err = mnt->mnt_sb->s_op->show_devname(m, mnt); |
1113 | } else { | 1114 | } else { |
1114 | if (mnt->mnt_devname) { | 1115 | if (mnt->mnt_devname) { |
@@ -1757,7 +1758,7 @@ static int do_loopback(struct path *path, char *old_name, | |||
1757 | return err; | 1758 | return err; |
1758 | if (!old_name || !*old_name) | 1759 | if (!old_name || !*old_name) |
1759 | return -EINVAL; | 1760 | return -EINVAL; |
1760 | err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); | 1761 | err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); |
1761 | if (err) | 1762 | if (err) |
1762 | return err; | 1763 | return err; |
1763 | 1764 | ||
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index be020771c6b4..dbcd82126aed 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig | |||
@@ -79,12 +79,9 @@ config NFS_V4_1 | |||
79 | depends on NFS_FS && NFS_V4 && EXPERIMENTAL | 79 | depends on NFS_FS && NFS_V4 && EXPERIMENTAL |
80 | select SUNRPC_BACKCHANNEL | 80 | select SUNRPC_BACKCHANNEL |
81 | select PNFS_FILE_LAYOUT | 81 | select PNFS_FILE_LAYOUT |
82 | select PNFS_BLOCK | ||
83 | select MD | ||
84 | select BLK_DEV_DM | ||
85 | help | 82 | help |
86 | This option enables support for minor version 1 of the NFSv4 protocol | 83 | This option enables support for minor version 1 of the NFSv4 protocol |
87 | (RFC 5661 and RFC 5663) in the kernel's NFS client. | 84 | (RFC 5661) in the kernel's NFS client. |
88 | 85 | ||
89 | If unsure, say N. | 86 | If unsure, say N. |
90 | 87 | ||
@@ -93,16 +90,13 @@ config PNFS_FILE_LAYOUT | |||
93 | 90 | ||
94 | config PNFS_BLOCK | 91 | config PNFS_BLOCK |
95 | tristate | 92 | tristate |
93 | depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM | ||
94 | default m | ||
96 | 95 | ||
97 | config PNFS_OBJLAYOUT | 96 | config PNFS_OBJLAYOUT |
98 | tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" | 97 | tristate |
99 | depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD | 98 | depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD |
100 | help | 99 | default m |
101 | Say M here if you want your pNFS client to support the Objects Layout Driver. | ||
102 | Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and | ||
103 | upper level driver (SCSI_OSD_ULD). | ||
104 | |||
105 | If unsure, say N. | ||
106 | 100 | ||
107 | config ROOT_NFS | 101 | config ROOT_NFS |
108 | bool "Root file system on NFS" | 102 | bool "Root file system on NFS" |
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index e56564d2ef95..281ae95932c9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/namei.h> | 36 | #include <linux/namei.h> |
37 | #include <linux/bio.h> /* struct bio */ | 37 | #include <linux/bio.h> /* struct bio */ |
38 | #include <linux/buffer_head.h> /* various write calls */ | 38 | #include <linux/buffer_head.h> /* various write calls */ |
39 | #include <linux/prefetch.h> | ||
39 | 40 | ||
40 | #include "blocklayout.h" | 41 | #include "blocklayout.h" |
41 | 42 | ||
@@ -175,17 +176,6 @@ retry: | |||
175 | return bio; | 176 | return bio; |
176 | } | 177 | } |
177 | 178 | ||
178 | static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) | ||
179 | { | ||
180 | if (lseg->pls_range.iomode == IOMODE_RW) { | ||
181 | dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); | ||
182 | set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); | ||
183 | } else { | ||
184 | dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); | ||
185 | set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /* This is basically copied from mpage_end_io_read */ | 179 | /* This is basically copied from mpage_end_io_read */ |
190 | static void bl_end_io_read(struct bio *bio, int err) | 180 | static void bl_end_io_read(struct bio *bio, int err) |
191 | { | 181 | { |
@@ -205,7 +195,7 @@ static void bl_end_io_read(struct bio *bio, int err) | |||
205 | if (!uptodate) { | 195 | if (!uptodate) { |
206 | if (!rdata->pnfs_error) | 196 | if (!rdata->pnfs_error) |
207 | rdata->pnfs_error = -EIO; | 197 | rdata->pnfs_error = -EIO; |
208 | bl_set_lo_fail(rdata->lseg); | 198 | pnfs_set_lo_fail(rdata->lseg); |
209 | } | 199 | } |
210 | bio_put(bio); | 200 | bio_put(bio); |
211 | put_parallel(par); | 201 | put_parallel(par); |
@@ -302,6 +292,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) | |||
302 | bl_end_io_read, par); | 292 | bl_end_io_read, par); |
303 | if (IS_ERR(bio)) { | 293 | if (IS_ERR(bio)) { |
304 | rdata->pnfs_error = PTR_ERR(bio); | 294 | rdata->pnfs_error = PTR_ERR(bio); |
295 | bio = NULL; | ||
305 | goto out; | 296 | goto out; |
306 | } | 297 | } |
307 | } | 298 | } |
@@ -369,7 +360,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) | |||
369 | if (!uptodate) { | 360 | if (!uptodate) { |
370 | if (!wdata->pnfs_error) | 361 | if (!wdata->pnfs_error) |
371 | wdata->pnfs_error = -EIO; | 362 | wdata->pnfs_error = -EIO; |
372 | bl_set_lo_fail(wdata->lseg); | 363 | pnfs_set_lo_fail(wdata->lseg); |
373 | } | 364 | } |
374 | bio_put(bio); | 365 | bio_put(bio); |
375 | put_parallel(par); | 366 | put_parallel(par); |
@@ -385,7 +376,7 @@ static void bl_end_io_write(struct bio *bio, int err) | |||
385 | if (!uptodate) { | 376 | if (!uptodate) { |
386 | if (!wdata->pnfs_error) | 377 | if (!wdata->pnfs_error) |
387 | wdata->pnfs_error = -EIO; | 378 | wdata->pnfs_error = -EIO; |
388 | bl_set_lo_fail(wdata->lseg); | 379 | pnfs_set_lo_fail(wdata->lseg); |
389 | } | 380 | } |
390 | bio_put(bio); | 381 | bio_put(bio); |
391 | put_parallel(par); | 382 | put_parallel(par); |
@@ -542,6 +533,11 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) | |||
542 | fill_invalid_ext: | 533 | fill_invalid_ext: |
543 | dprintk("%s need to zero %d pages\n", __func__, npg_zero); | 534 | dprintk("%s need to zero %d pages\n", __func__, npg_zero); |
544 | for (;npg_zero > 0; npg_zero--) { | 535 | for (;npg_zero > 0; npg_zero--) { |
536 | if (bl_is_sector_init(be->be_inval, isect)) { | ||
537 | dprintk("isect %llu already init\n", | ||
538 | (unsigned long long)isect); | ||
539 | goto next_page; | ||
540 | } | ||
545 | /* page ref released in bl_end_io_write_zero */ | 541 | /* page ref released in bl_end_io_write_zero */ |
546 | index = isect >> PAGE_CACHE_SECTOR_SHIFT; | 542 | index = isect >> PAGE_CACHE_SECTOR_SHIFT; |
547 | dprintk("%s zero %dth page: index %lu isect %llu\n", | 543 | dprintk("%s zero %dth page: index %lu isect %llu\n", |
@@ -561,8 +557,7 @@ fill_invalid_ext: | |||
561 | * PageUptodate: It was read before | 557 | * PageUptodate: It was read before |
562 | * sector_initialized: already written out | 558 | * sector_initialized: already written out |
563 | */ | 559 | */ |
564 | if (PageDirty(page) || PageWriteback(page) || | 560 | if (PageDirty(page) || PageWriteback(page)) { |
565 | bl_is_sector_init(be->be_inval, isect)) { | ||
566 | print_page(page); | 561 | print_page(page); |
567 | unlock_page(page); | 562 | unlock_page(page); |
568 | page_cache_release(page); | 563 | page_cache_release(page); |
@@ -591,6 +586,7 @@ fill_invalid_ext: | |||
591 | bl_end_io_write_zero, par); | 586 | bl_end_io_write_zero, par); |
592 | if (IS_ERR(bio)) { | 587 | if (IS_ERR(bio)) { |
593 | wdata->pnfs_error = PTR_ERR(bio); | 588 | wdata->pnfs_error = PTR_ERR(bio); |
589 | bio = NULL; | ||
594 | goto out; | 590 | goto out; |
595 | } | 591 | } |
596 | /* FIXME: This should be done in bi_end_io */ | 592 | /* FIXME: This should be done in bi_end_io */ |
@@ -639,6 +635,7 @@ next_page: | |||
639 | bl_end_io_write, par); | 635 | bl_end_io_write, par); |
640 | if (IS_ERR(bio)) { | 636 | if (IS_ERR(bio)) { |
641 | wdata->pnfs_error = PTR_ERR(bio); | 637 | wdata->pnfs_error = PTR_ERR(bio); |
638 | bio = NULL; | ||
642 | goto out; | 639 | goto out; |
643 | } | 640 | } |
644 | isect += PAGE_CACHE_SECTORS; | 641 | isect += PAGE_CACHE_SECTORS; |
@@ -804,7 +801,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, | |||
804 | struct nfs4_deviceid *d_id) | 801 | struct nfs4_deviceid *d_id) |
805 | { | 802 | { |
806 | struct pnfs_device *dev; | 803 | struct pnfs_device *dev; |
807 | struct pnfs_block_dev *rv = NULL; | 804 | struct pnfs_block_dev *rv; |
808 | u32 max_resp_sz; | 805 | u32 max_resp_sz; |
809 | int max_pages; | 806 | int max_pages; |
810 | struct page **pages = NULL; | 807 | struct page **pages = NULL; |
@@ -822,18 +819,20 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, | |||
822 | dev = kmalloc(sizeof(*dev), GFP_NOFS); | 819 | dev = kmalloc(sizeof(*dev), GFP_NOFS); |
823 | if (!dev) { | 820 | if (!dev) { |
824 | dprintk("%s kmalloc failed\n", __func__); | 821 | dprintk("%s kmalloc failed\n", __func__); |
825 | return NULL; | 822 | return ERR_PTR(-ENOMEM); |
826 | } | 823 | } |
827 | 824 | ||
828 | pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); | 825 | pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); |
829 | if (pages == NULL) { | 826 | if (pages == NULL) { |
830 | kfree(dev); | 827 | kfree(dev); |
831 | return NULL; | 828 | return ERR_PTR(-ENOMEM); |
832 | } | 829 | } |
833 | for (i = 0; i < max_pages; i++) { | 830 | for (i = 0; i < max_pages; i++) { |
834 | pages[i] = alloc_page(GFP_NOFS); | 831 | pages[i] = alloc_page(GFP_NOFS); |
835 | if (!pages[i]) | 832 | if (!pages[i]) { |
833 | rv = ERR_PTR(-ENOMEM); | ||
836 | goto out_free; | 834 | goto out_free; |
835 | } | ||
837 | } | 836 | } |
838 | 837 | ||
839 | memcpy(&dev->dev_id, d_id, sizeof(*d_id)); | 838 | memcpy(&dev->dev_id, d_id, sizeof(*d_id)); |
@@ -846,8 +845,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, | |||
846 | dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); | 845 | dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); |
847 | rc = nfs4_proc_getdeviceinfo(server, dev); | 846 | rc = nfs4_proc_getdeviceinfo(server, dev); |
848 | dprintk("%s getdevice info returns %d\n", __func__, rc); | 847 | dprintk("%s getdevice info returns %d\n", __func__, rc); |
849 | if (rc) | 848 | if (rc) { |
849 | rv = ERR_PTR(rc); | ||
850 | goto out_free; | 850 | goto out_free; |
851 | } | ||
851 | 852 | ||
852 | rv = nfs4_blk_decode_device(server, dev); | 853 | rv = nfs4_blk_decode_device(server, dev); |
853 | out_free: | 854 | out_free: |
@@ -865,7 +866,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) | |||
865 | struct pnfs_devicelist *dlist = NULL; | 866 | struct pnfs_devicelist *dlist = NULL; |
866 | struct pnfs_block_dev *bdev; | 867 | struct pnfs_block_dev *bdev; |
867 | LIST_HEAD(block_disklist); | 868 | LIST_HEAD(block_disklist); |
868 | int status = 0, i; | 869 | int status, i; |
869 | 870 | ||
870 | dprintk("%s enter\n", __func__); | 871 | dprintk("%s enter\n", __func__); |
871 | 872 | ||
@@ -897,8 +898,8 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) | |||
897 | for (i = 0; i < dlist->num_devs; i++) { | 898 | for (i = 0; i < dlist->num_devs; i++) { |
898 | bdev = nfs4_blk_get_deviceinfo(server, fh, | 899 | bdev = nfs4_blk_get_deviceinfo(server, fh, |
899 | &dlist->dev_id[i]); | 900 | &dlist->dev_id[i]); |
900 | if (!bdev) { | 901 | if (IS_ERR(bdev)) { |
901 | status = -ENODEV; | 902 | status = PTR_ERR(bdev); |
902 | goto out_error; | 903 | goto out_error; |
903 | } | 904 | } |
904 | spin_lock(&b_mt_id->bm_lock); | 905 | spin_lock(&b_mt_id->bm_lock); |
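nfs4_blk_get_deviceinfo() now reports failure through the returned pointer itself rather than a bare NULL, so bl_set_layoutdriver() can propagate the real errno (-ENOMEM vs. the old blanket -ENODEV). A standalone sketch of the ERR_PTR/IS_ERR/PTR_ERR convention, with the kernel macros re-created here purely for illustration:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define PTR_ERR(ptr)	((long)(ptr))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

	static void *get_device_sketch(int fail)
	{
		void *dev;

		if (fail)
			return ERR_PTR(-ENOMEM);	/* was: return NULL */
		dev = malloc(16);
		return dev ? dev : ERR_PTR(-ENOMEM);
	}

	int main(void)
	{
		void *dev = get_device_sketch(1);

		if (IS_ERR(dev)) {
			printf("error %ld\n", PTR_ERR(dev)); /* -12 == -ENOMEM */
			return 1;
		}
		free(dev);
		return 0;
	}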
@@ -959,7 +960,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { | |||
959 | }; | 960 | }; |
960 | 961 | ||
961 | static const struct rpc_pipe_ops bl_upcall_ops = { | 962 | static const struct rpc_pipe_ops bl_upcall_ops = { |
962 | .upcall = bl_pipe_upcall, | 963 | .upcall = rpc_pipe_generic_upcall, |
963 | .downcall = bl_pipe_downcall, | 964 | .downcall = bl_pipe_downcall, |
964 | .destroy_msg = bl_pipe_destroy_msg, | 965 | .destroy_msg = bl_pipe_destroy_msg, |
965 | }; | 966 | }; |
@@ -988,17 +989,20 @@ static int __init nfs4blocklayout_init(void) | |||
988 | mnt, | 989 | mnt, |
989 | NFS_PIPE_DIRNAME, 0, &path); | 990 | NFS_PIPE_DIRNAME, 0, &path); |
990 | if (ret) | 991 | if (ret) |
991 | goto out_remove; | 992 | goto out_putrpc; |
992 | 993 | ||
993 | bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, | 994 | bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, |
994 | &bl_upcall_ops, 0); | 995 | &bl_upcall_ops, 0); |
996 | path_put(&path); | ||
995 | if (IS_ERR(bl_device_pipe)) { | 997 | if (IS_ERR(bl_device_pipe)) { |
996 | ret = PTR_ERR(bl_device_pipe); | 998 | ret = PTR_ERR(bl_device_pipe); |
997 | goto out_remove; | 999 | goto out_putrpc; |
998 | } | 1000 | } |
999 | out: | 1001 | out: |
1000 | return ret; | 1002 | return ret; |
1001 | 1003 | ||
1004 | out_putrpc: | ||
1005 | rpc_put_mount(); | ||
1002 | out_remove: | 1006 | out_remove: |
1003 | pnfs_unregister_layoutdriver(&blocklayout_type); | 1007 | pnfs_unregister_layoutdriver(&blocklayout_type); |
1004 | return ret; | 1008 | return ret; |
@@ -1011,6 +1015,7 @@ static void __exit nfs4blocklayout_exit(void) | |||
1011 | 1015 | ||
1012 | pnfs_unregister_layoutdriver(&blocklayout_type); | 1016 | pnfs_unregister_layoutdriver(&blocklayout_type); |
1013 | rpc_unlink(bl_device_pipe); | 1017 | rpc_unlink(bl_device_pipe); |
1018 | rpc_put_mount(); | ||
1014 | } | 1019 | } |
1015 | 1020 | ||
1016 | MODULE_ALIAS("nfs-layouttype4-3"); | 1021 | MODULE_ALIAS("nfs-layouttype4-3"); |
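
Note: the blocklayout.c changes above convert nfs4_blk_get_deviceinfo() from returning NULL on failure to returning ERR_PTR()-encoded errnos, so bl_set_layoutdriver() can report the real failure via PTR_ERR() instead of a blanket -ENODEV. The module init path also gains an out_putrpc label (with a matching rpc_put_mount() in the exit path) so the earlier rpc_get_mount() is balanced on every path, and path_put() drops the dentry reference once the pipe is created. A minimal sketch of the ERR_PTR convention, with hypothetical names:

    #include <linux/err.h>
    #include <linux/slab.h>

    static struct foo *foo_lookup(int id)
    {
            struct foo *f = kzalloc(sizeof(*f), GFP_NOFS);

            if (!f)
                    return ERR_PTR(-ENOMEM);   /* encode the errno in the pointer */
            if (id < 0) {
                    kfree(f);
                    return ERR_PTR(-EINVAL);
            }
            return f;
    }

    /* caller side */
    struct foo *f = foo_lookup(id);
    if (IS_ERR(f))
            return PTR_ERR(f);                 /* recover the original errno */
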
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f27d827960a3..42acf7ef5992 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -150,7 +150,7 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) | |||
150 | } | 150 | } |
151 | 151 | ||
152 | struct bl_dev_msg { | 152 | struct bl_dev_msg { |
153 | int status; | 153 | int32_t status; |
154 | uint32_t major, minor; | 154 | uint32_t major, minor; |
155 | }; | 155 | }; |
156 | 156 | ||
@@ -169,8 +169,6 @@ extern wait_queue_head_t bl_wq; | |||
169 | #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ | 169 | #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ |
170 | 170 | ||
171 | /* blocklayoutdev.c */ | 171 | /* blocklayoutdev.c */ |
172 | ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, | ||
173 | char __user *, size_t); | ||
174 | ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); | 172 | ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); |
175 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *); | 173 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *); |
176 | struct block_device *nfs4_blkdev_get(dev_t dev); | 174 | struct block_device *nfs4_blkdev_get(dev_t dev); |
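
Note: bl_dev_msg is the reply parsed out of the rpc_pipefs message exchanged with the userspace blocklayout daemon, so its status field moves from a plain int to int32_t to match the fixed-width major/minor fields; fixed-width types keep the on-pipe layout independent of compiler and architecture. The bl_pipe_upcall() prototype is dropped because the function itself is deleted in blocklayoutdev.c below.
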
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a83b393fb01c..d08ba9107fde 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -79,28 +79,6 @@ int nfs4_blkdev_put(struct block_device *bdev) | |||
79 | return blkdev_put(bdev, FMODE_READ); | 79 | return blkdev_put(bdev, FMODE_READ); |
80 | } | 80 | } |
81 | 81 | ||
82 | /* | ||
83 | * Shouldn't there be a rpc_generic_upcall() to do this for us? | ||
84 | */ | ||
85 | ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, | ||
86 | char __user *dst, size_t buflen) | ||
87 | { | ||
88 | char *data = (char *)msg->data + msg->copied; | ||
89 | size_t mlen = min(msg->len - msg->copied, buflen); | ||
90 | unsigned long left; | ||
91 | |||
92 | left = copy_to_user(dst, data, mlen); | ||
93 | if (left == mlen) { | ||
94 | msg->errno = -EFAULT; | ||
95 | return -EFAULT; | ||
96 | } | ||
97 | |||
98 | mlen -= left; | ||
99 | msg->copied += mlen; | ||
100 | msg->errno = 0; | ||
101 | return mlen; | ||
102 | } | ||
103 | |||
104 | static struct bl_dev_msg bl_mount_reply; | 82 | static struct bl_dev_msg bl_mount_reply; |
105 | 83 | ||
106 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | 84 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, |
@@ -131,7 +109,7 @@ struct pnfs_block_dev * | |||
131 | nfs4_blk_decode_device(struct nfs_server *server, | 109 | nfs4_blk_decode_device(struct nfs_server *server, |
132 | struct pnfs_device *dev) | 110 | struct pnfs_device *dev) |
133 | { | 111 | { |
134 | struct pnfs_block_dev *rv = NULL; | 112 | struct pnfs_block_dev *rv; |
135 | struct block_device *bd = NULL; | 113 | struct block_device *bd = NULL; |
136 | struct rpc_pipe_msg msg; | 114 | struct rpc_pipe_msg msg; |
137 | struct bl_msg_hdr bl_msg = { | 115 | struct bl_msg_hdr bl_msg = { |
@@ -141,7 +119,7 @@ nfs4_blk_decode_device(struct nfs_server *server, | |||
141 | uint8_t *dataptr; | 119 | uint8_t *dataptr; |
142 | DECLARE_WAITQUEUE(wq, current); | 120 | DECLARE_WAITQUEUE(wq, current); |
143 | struct bl_dev_msg *reply = &bl_mount_reply; | 121 | struct bl_dev_msg *reply = &bl_mount_reply; |
144 | int offset, len, i; | 122 | int offset, len, i, rc; |
145 | 123 | ||
146 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | 124 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); |
147 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | 125 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, |
@@ -168,8 +146,10 @@ nfs4_blk_decode_device(struct nfs_server *server, | |||
168 | 146 | ||
169 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | 147 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); |
170 | add_wait_queue(&bl_wq, &wq); | 148 | add_wait_queue(&bl_wq, &wq); |
171 | if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { | 149 | rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); |
150 | if (rc < 0) { | ||
172 | remove_wait_queue(&bl_wq, &wq); | 151 | remove_wait_queue(&bl_wq, &wq); |
152 | rv = ERR_PTR(rc); | ||
173 | goto out; | 153 | goto out; |
174 | } | 154 | } |
175 | 155 | ||
@@ -187,8 +167,9 @@ nfs4_blk_decode_device(struct nfs_server *server, | |||
187 | 167 | ||
188 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | 168 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); |
189 | if (IS_ERR(bd)) { | 169 | if (IS_ERR(bd)) { |
190 | dprintk("%s failed to open device : %ld\n", | 170 | rc = PTR_ERR(bd); |
191 | __func__, PTR_ERR(bd)); | 171 | dprintk("%s failed to open device : %d\n", __func__, rc); |
172 | rv = ERR_PTR(rc); | ||
192 | goto out; | 173 | goto out; |
193 | } | 174 | } |
194 | 175 | ||
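
Note: the deleted bl_pipe_upcall() was an open-coded copy of the same routine used by other rpc_pipefs clients; the comment it carried ("Shouldn't there be a rpc_generic_upcall() to do this for us?") is answered by pointing the .upcall op at the new shared rpc_pipe_generic_upcall(). The pattern the helper implements, reconstructed from the removed body:

    /* copy the queued message to userspace, resumable across short reads */
    ssize_t pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
                        char __user *dst, size_t buflen)
    {
            char *data = (char *)msg->data + msg->copied;
            size_t mlen = min(msg->len - msg->copied, buflen);
            unsigned long left;

            left = copy_to_user(dst, data, mlen);
            if (left == mlen) {
                    msg->errno = -EFAULT;   /* nothing was copied */
                    return -EFAULT;
            }
            mlen -= left;
            msg->copied += mlen;            /* next read resumes here */
            msg->errno = 0;
            return mlen;
    }

nfs4_blk_decode_device() is also switched to the ERR_PTR() convention, so upcall-queueing and blkdev-open failures now reach the caller as real errno values.
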
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index e3d294269058..516f3375e067 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,7 +125,7 @@ nfs4_callback_up(struct svc_serv *serv) | |||
125 | else | 125 | else |
126 | goto out_err; | 126 | goto out_err; |
127 | 127 | ||
128 | return svc_prepare_thread(serv, &serv->sv_pools[0]); | 128 | return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); |
129 | 129 | ||
130 | out_err: | 130 | out_err: |
131 | if (ret == 0) | 131 | if (ret == 0) |
@@ -199,7 +199,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) | |||
199 | INIT_LIST_HEAD(&serv->sv_cb_list); | 199 | INIT_LIST_HEAD(&serv->sv_cb_list); |
200 | spin_lock_init(&serv->sv_cb_lock); | 200 | spin_lock_init(&serv->sv_cb_lock); |
201 | init_waitqueue_head(&serv->sv_cb_waitq); | 201 | init_waitqueue_head(&serv->sv_cb_waitq); |
202 | rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); | 202 | rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); |
203 | if (IS_ERR(rqstp)) { | 203 | if (IS_ERR(rqstp)) { |
204 | svc_xprt_put(serv->sv_bc_xprt); | 204 | svc_xprt_put(serv->sv_bc_xprt); |
205 | serv->sv_bc_xprt = NULL; | 205 | serv->sv_bc_xprt = NULL; |
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b257383bb565..07df5f1d85e5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum { | |||
38 | struct cb_process_state { | 38 | struct cb_process_state { |
39 | __be32 drc_status; | 39 | __be32 drc_status; |
40 | struct nfs_client *clp; | 40 | struct nfs_client *clp; |
41 | int slotid; | ||
41 | }; | 42 | }; |
42 | 43 | ||
43 | struct cb_compound_hdr_arg { | 44 | struct cb_compound_hdr_arg { |
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall( | |||
166 | void *dummy, struct cb_process_state *cps); | 167 | void *dummy, struct cb_process_state *cps); |
167 | 168 | ||
168 | extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); | 169 | extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); |
169 | extern void nfs4_cb_take_slot(struct nfs_client *clp); | ||
170 | 170 | ||
171 | struct cb_devicenotifyitem { | 171 | struct cb_devicenotifyitem { |
172 | uint32_t cbd_notify_type; | 172 | uint32_t cbd_notify_type; |
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 74780f9f852c..43926add945b 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -348,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) | |||
348 | /* Normal */ | 348 | /* Normal */ |
349 | if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { | 349 | if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { |
350 | slot->seq_nr++; | 350 | slot->seq_nr++; |
351 | return htonl(NFS4_OK); | 351 | goto out_ok; |
352 | } | 352 | } |
353 | 353 | ||
354 | /* Replay */ | 354 | /* Replay */ |
@@ -367,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) | |||
367 | /* Wraparound */ | 367 | /* Wraparound */ |
368 | if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { | 368 | if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { |
369 | slot->seq_nr = 1; | 369 | slot->seq_nr = 1; |
370 | return htonl(NFS4_OK); | 370 | goto out_ok; |
371 | } | 371 | } |
372 | 372 | ||
373 | /* Misordered request */ | 373 | /* Misordered request */ |
374 | return htonl(NFS4ERR_SEQ_MISORDERED); | 374 | return htonl(NFS4ERR_SEQ_MISORDERED); |
375 | out_ok: | ||
376 | tbl->highest_used_slotid = args->csa_slotid; | ||
377 | return htonl(NFS4_OK); | ||
375 | } | 378 | } |
376 | 379 | ||
377 | /* | 380 | /* |
@@ -433,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, | |||
433 | struct cb_sequenceres *res, | 436 | struct cb_sequenceres *res, |
434 | struct cb_process_state *cps) | 437 | struct cb_process_state *cps) |
435 | { | 438 | { |
439 | struct nfs4_slot_table *tbl; | ||
436 | struct nfs_client *clp; | 440 | struct nfs_client *clp; |
437 | int i; | 441 | int i; |
438 | __be32 status = htonl(NFS4ERR_BADSESSION); | 442 | __be32 status = htonl(NFS4ERR_BADSESSION); |
439 | 443 | ||
440 | cps->clp = NULL; | ||
441 | |||
442 | clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); | 444 | clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); |
443 | if (clp == NULL) | 445 | if (clp == NULL) |
444 | goto out; | 446 | goto out; |
445 | 447 | ||
448 | tbl = &clp->cl_session->bc_slot_table; | ||
449 | |||
450 | spin_lock(&tbl->slot_tbl_lock); | ||
446 | /* state manager is resetting the session */ | 451 | /* state manager is resetting the session */ |
447 | if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { | 452 | if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { |
448 | status = NFS4ERR_DELAY; | 453 | spin_unlock(&tbl->slot_tbl_lock); |
454 | status = htonl(NFS4ERR_DELAY); | ||
455 | /* Return NFS4ERR_BADSESSION if we're draining the session | ||
456 | * in order to reset it. | ||
457 | */ | ||
458 | if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) | ||
459 | status = htonl(NFS4ERR_BADSESSION); | ||
449 | goto out; | 460 | goto out; |
450 | } | 461 | } |
451 | 462 | ||
452 | status = validate_seqid(&clp->cl_session->bc_slot_table, args); | 463 | status = validate_seqid(&clp->cl_session->bc_slot_table, args); |
464 | spin_unlock(&tbl->slot_tbl_lock); | ||
453 | if (status) | 465 | if (status) |
454 | goto out; | 466 | goto out; |
455 | 467 | ||
468 | cps->slotid = args->csa_slotid; | ||
469 | |||
456 | /* | 470 | /* |
457 | * Check for pending referring calls. If a match is found, a | 471 | * Check for pending referring calls. If a match is found, a |
458 | * related callback was received before the response to the original | 472 | * related callback was received before the response to the original |
@@ -469,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, | |||
469 | res->csr_slotid = args->csa_slotid; | 483 | res->csr_slotid = args->csa_slotid; |
470 | res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; | 484 | res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; |
471 | res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; | 485 | res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; |
472 | nfs4_cb_take_slot(clp); | ||
473 | 486 | ||
474 | out: | 487 | out: |
475 | cps->clp = clp; /* put in nfs4_callback_compound */ | 488 | cps->clp = clp; /* put in nfs4_callback_compound */ |
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c6c86a77e043..918ad647afea 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) | |||
754 | * Let the state manager know callback processing done. | 754 | * Let the state manager know callback processing done. |
755 | * A single slot, so highest used slotid is either 0 or -1 | 755 | * A single slot, so highest used slotid is either 0 or -1 |
756 | */ | 756 | */ |
757 | tbl->highest_used_slotid--; | 757 | tbl->highest_used_slotid = -1; |
758 | nfs4_check_drain_bc_complete(session); | 758 | nfs4_check_drain_bc_complete(session); |
759 | spin_unlock(&tbl->slot_tbl_lock); | 759 | spin_unlock(&tbl->slot_tbl_lock); |
760 | } | 760 | } |
761 | 761 | ||
762 | static void nfs4_cb_free_slot(struct nfs_client *clp) | 762 | static void nfs4_cb_free_slot(struct cb_process_state *cps) |
763 | { | 763 | { |
764 | if (clp && clp->cl_session) | 764 | if (cps->slotid != -1) |
765 | nfs4_callback_free_slot(clp->cl_session); | 765 | nfs4_callback_free_slot(cps->clp->cl_session); |
766 | } | ||
767 | |||
768 | /* A single slot, so highest used slotid is either 0 or -1 */ | ||
769 | void nfs4_cb_take_slot(struct nfs_client *clp) | ||
770 | { | ||
771 | struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; | ||
772 | |||
773 | spin_lock(&tbl->slot_tbl_lock); | ||
774 | tbl->highest_used_slotid++; | ||
775 | BUG_ON(tbl->highest_used_slotid != 0); | ||
776 | spin_unlock(&tbl->slot_tbl_lock); | ||
777 | } | 766 | } |
778 | 767 | ||
779 | #else /* CONFIG_NFS_V4_1 */ | 768 | #else /* CONFIG_NFS_V4_1 */ |
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) | |||
784 | return htonl(NFS4ERR_MINOR_VERS_MISMATCH); | 773 | return htonl(NFS4ERR_MINOR_VERS_MISMATCH); |
785 | } | 774 | } |
786 | 775 | ||
787 | static void nfs4_cb_free_slot(struct nfs_client *clp) | 776 | static void nfs4_cb_free_slot(struct cb_process_state *cps) |
788 | { | 777 | { |
789 | } | 778 | } |
790 | #endif /* CONFIG_NFS_V4_1 */ | 779 | #endif /* CONFIG_NFS_V4_1 */ |
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r | |||
866 | struct cb_process_state cps = { | 855 | struct cb_process_state cps = { |
867 | .drc_status = 0, | 856 | .drc_status = 0, |
868 | .clp = NULL, | 857 | .clp = NULL, |
858 | .slotid = -1, | ||
869 | }; | 859 | }; |
870 | unsigned int nops = 0; | 860 | unsigned int nops = 0; |
871 | 861 | ||
@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r | |||
906 | 896 | ||
907 | *hdr_res.status = status; | 897 | *hdr_res.status = status; |
908 | *hdr_res.nops = htonl(nops); | 898 | *hdr_res.nops = htonl(nops); |
909 | nfs4_cb_free_slot(cps.clp); | 899 | nfs4_cb_free_slot(&cps); |
910 | nfs_put_client(cps.clp); | 900 | nfs_put_client(cps.clp); |
911 | dprintk("%s: done, status = %u\n", __func__, ntohl(status)); | 901 | dprintk("%s: done, status = %u\n", __func__, ntohl(status)); |
912 | return rpc_success; | 902 | return rpc_success; |
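
Note: the callback hunks close a slot-accounting race in the NFSv4.1 backchannel. validate_seqid() now claims the single slot (tbl->highest_used_slotid = args->csa_slotid) while nfs4_callback_sequence() holds slot_tbl_lock, the claimed id is recorded in cb_process_state, and nfs4_cb_free_slot() releases the slot only when cps->slotid != -1. The old nfs4_cb_take_slot(), with its BUG_ON, goes away, and a compound that fails early can no longer free a slot it never took. nfs4_callback_sequence() also gains the missing htonl() around NFS4ERR_DELAY and answers NFS4ERR_BADSESSION when the session is being drained for a reset. The sentinel pattern, in outline:

    struct cb_process_state cps = { .slotid = -1 }; /* -1: no slot held */

    spin_lock(&tbl->slot_tbl_lock);
    status = validate_seqid(tbl, args);             /* claims the slot on success */
    spin_unlock(&tbl->slot_tbl_lock);
    if (status == htonl(NFS4_OK))
            cps.slotid = args->csa_slotid;

    /* ... process the rest of the compound ... */

    if (cps.slotid != -1)                           /* free only what was taken */
            nfs4_callback_free_slot(cps.clp->cl_session);
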
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5833fbbf59b0..873bf00d51a2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -336,11 +336,12 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, | |||
336 | const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; | 336 | const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; |
337 | const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; | 337 | const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; |
338 | 338 | ||
339 | if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && | 339 | if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) |
340 | sin1->sin6_scope_id != sin2->sin6_scope_id) | ||
341 | return 0; | 340 | return 0; |
341 | else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) | ||
342 | return sin1->sin6_scope_id == sin2->sin6_scope_id; | ||
342 | 343 | ||
343 | return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); | 344 | return 1; |
344 | } | 345 | } |
345 | #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ | 346 | #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ |
346 | static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, | 347 | static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, |
@@ -1867,6 +1868,10 @@ static int nfs_server_list_show(struct seq_file *m, void *v) | |||
1867 | /* display one transport per line on subsequent lines */ | 1868 | /* display one transport per line on subsequent lines */ |
1868 | clp = list_entry(v, struct nfs_client, cl_share_link); | 1869 | clp = list_entry(v, struct nfs_client, cl_share_link); |
1869 | 1870 | ||
1871 | /* Check if the client is initialized */ | ||
1872 | if (clp->cl_cons_state != NFS_CS_READY) | ||
1873 | return 0; | ||
1874 | |||
1870 | seq_printf(m, "v%u %s %s %3d %s\n", | 1875 | seq_printf(m, "v%u %s %s %3d %s\n", |
1871 | clp->rpc_ops->version, | 1876 | clp->rpc_ops->version, |
1872 | rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), | 1877 | rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), |
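
Note: nfs_sockaddr_match_ipaddr6() is restructured so that differing addresses never match, and the scope id is consulted only for link-local addresses, where the same fe80:: address can legitimately appear on several interfaces (the rewrite also switches from the ipv6_addr_scope() test to ipv6_addr_type()). The equivalent logic as a sketch:

    if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
            return 0;                               /* different addresses */
    if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
            return sin1->sin6_scope_id == sin2->sin6_scope_id;
    return 1;                                       /* non-link-local: address suffices */

The second hunk makes nfs_server_list_show() skip clients whose cl_cons_state is not yet NFS_CS_READY, so the /proc walker does not touch transport fields of a client that is still being initialised.
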
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 321a66bc3846..7f2654069806 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -240,7 +240,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct | |||
240 | sizeof(delegation->stateid.data)); | 240 | sizeof(delegation->stateid.data)); |
241 | delegation->type = res->delegation_type; | 241 | delegation->type = res->delegation_type; |
242 | delegation->maxsize = res->maxsize; | 242 | delegation->maxsize = res->maxsize; |
243 | delegation->change_attr = nfsi->change_attr; | 243 | delegation->change_attr = inode->i_version; |
244 | delegation->cred = get_rpccred(cred); | 244 | delegation->cred = get_rpccred(cred); |
245 | delegation->inode = inode; | 245 | delegation->inode = inode; |
246 | delegation->flags = 1<<NFS_DELEGATION_REFERENCED; | 246 | delegation->flags = 1<<NFS_DELEGATION_REFERENCED; |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 28b8c3f3cda3..91c01f0a4c3b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -180,8 +180,6 @@ force_reval: | |||
180 | 180 | ||
181 | static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) | 181 | static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) |
182 | { | 182 | { |
183 | loff_t loff; | ||
184 | |||
185 | dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", | 183 | dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", |
186 | filp->f_path.dentry->d_parent->d_name.name, | 184 | filp->f_path.dentry->d_parent->d_name.name, |
187 | filp->f_path.dentry->d_name.name, | 185 | filp->f_path.dentry->d_name.name, |
@@ -197,13 +195,9 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) | |||
197 | int retval = nfs_revalidate_file_size(inode, filp); | 195 | int retval = nfs_revalidate_file_size(inode, filp); |
198 | if (retval < 0) | 196 | if (retval < 0) |
199 | return (loff_t)retval; | 197 | return (loff_t)retval; |
198 | } | ||
200 | 199 | ||
201 | spin_lock(&inode->i_lock); | 200 | return generic_file_llseek(filp, offset, origin); |
202 | loff = generic_file_llseek_unlocked(filp, offset, origin); | ||
203 | spin_unlock(&inode->i_lock); | ||
204 | } else | ||
205 | loff = generic_file_llseek_unlocked(filp, offset, origin); | ||
206 | return loff; | ||
207 | } | 201 | } |
208 | 202 | ||
209 | /* | 203 | /* |
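
Note: nfs_file_llseek() no longer wraps generic_file_llseek_unlocked() in inode->i_lock for the SEEK_END case; after revalidating the file size it simply calls generic_file_llseek(), which performs the necessary locking itself. The body reduces to:

    if (origin == SEEK_END) {
            /* SEEK_END needs an up-to-date file size */
            int retval = nfs_revalidate_file_size(inode, filp);
            if (retval < 0)
                    return (loff_t)retval;
    }
    return generic_file_llseek(filp, offset, origin);
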
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 5b1006480bc2..7cf2c4699b08 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -212,7 +212,7 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, | |||
212 | auxdata.ctime = nfsi->vfs_inode.i_ctime; | 212 | auxdata.ctime = nfsi->vfs_inode.i_ctime; |
213 | 213 | ||
214 | if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) | 214 | if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) |
215 | auxdata.change_attr = nfsi->change_attr; | 215 | auxdata.change_attr = nfsi->vfs_inode.i_version; |
216 | 216 | ||
217 | if (bufmax > sizeof(auxdata)) | 217 | if (bufmax > sizeof(auxdata)) |
218 | bufmax = sizeof(auxdata); | 218 | bufmax = sizeof(auxdata); |
@@ -244,7 +244,7 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, | |||
244 | auxdata.ctime = nfsi->vfs_inode.i_ctime; | 244 | auxdata.ctime = nfsi->vfs_inode.i_ctime; |
245 | 245 | ||
246 | if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) | 246 | if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) |
247 | auxdata.change_attr = nfsi->change_attr; | 247 | auxdata.change_attr = nfsi->vfs_inode.i_version; |
248 | 248 | ||
249 | if (memcmp(data, &auxdata, datalen) != 0) | 249 | if (memcmp(data, &auxdata, datalen) != 0) |
250 | return FSCACHE_CHECKAUX_OBSOLETE; | 250 | return FSCACHE_CHECKAUX_OBSOLETE; |
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f20801ae0a16..47d1c6ff2d8e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -336,8 +336,6 @@ struct idmap { | |||
336 | struct idmap_hashtable idmap_group_hash; | 336 | struct idmap_hashtable idmap_group_hash; |
337 | }; | 337 | }; |
338 | 338 | ||
339 | static ssize_t idmap_pipe_upcall(struct file *, struct rpc_pipe_msg *, | ||
340 | char __user *, size_t); | ||
341 | static ssize_t idmap_pipe_downcall(struct file *, const char __user *, | 339 | static ssize_t idmap_pipe_downcall(struct file *, const char __user *, |
342 | size_t); | 340 | size_t); |
343 | static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); | 341 | static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); |
@@ -345,7 +343,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); | |||
345 | static unsigned int fnvhash32(const void *, size_t); | 343 | static unsigned int fnvhash32(const void *, size_t); |
346 | 344 | ||
347 | static const struct rpc_pipe_ops idmap_upcall_ops = { | 345 | static const struct rpc_pipe_ops idmap_upcall_ops = { |
348 | .upcall = idmap_pipe_upcall, | 346 | .upcall = rpc_pipe_generic_upcall, |
349 | .downcall = idmap_pipe_downcall, | 347 | .downcall = idmap_pipe_downcall, |
350 | .destroy_msg = idmap_pipe_destroy_msg, | 348 | .destroy_msg = idmap_pipe_destroy_msg, |
351 | }; | 349 | }; |
@@ -595,27 +593,6 @@ nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, | |||
595 | return ret; | 593 | return ret; |
596 | } | 594 | } |
597 | 595 | ||
598 | /* RPC pipefs upcall/downcall routines */ | ||
599 | static ssize_t | ||
600 | idmap_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, | ||
601 | char __user *dst, size_t buflen) | ||
602 | { | ||
603 | char *data = (char *)msg->data + msg->copied; | ||
604 | size_t mlen = min(msg->len, buflen); | ||
605 | unsigned long left; | ||
606 | |||
607 | left = copy_to_user(dst, data, mlen); | ||
608 | if (left == mlen) { | ||
609 | msg->errno = -EFAULT; | ||
610 | return -EFAULT; | ||
611 | } | ||
612 | |||
613 | mlen -= left; | ||
614 | msg->copied += mlen; | ||
615 | msg->errno = 0; | ||
616 | return mlen; | ||
617 | } | ||
618 | |||
619 | static ssize_t | 596 | static ssize_t |
620 | idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) | 597 | idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) |
621 | { | 598 | { |
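
Note: idmap_pipe_upcall() was another verbatim copy of the same routine (this one computed mlen as min(msg->len, buflen) without accounting for msg->copied); it too is replaced by the shared rpc_pipe_generic_upcall — see the sketch after the blocklayoutdev.c diff above.
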
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fe1203797b2b..4dc6d078f108 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -318,7 +318,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
318 | memset(&inode->i_atime, 0, sizeof(inode->i_atime)); | 318 | memset(&inode->i_atime, 0, sizeof(inode->i_atime)); |
319 | memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); | 319 | memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); |
320 | memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); | 320 | memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); |
321 | nfsi->change_attr = 0; | 321 | inode->i_version = 0; |
322 | inode->i_size = 0; | 322 | inode->i_size = 0; |
323 | inode->i_nlink = 0; | 323 | inode->i_nlink = 0; |
324 | inode->i_uid = -2; | 324 | inode->i_uid = -2; |
@@ -344,7 +344,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
344 | | NFS_INO_INVALID_ACCESS | 344 | | NFS_INO_INVALID_ACCESS |
345 | | NFS_INO_INVALID_ACL; | 345 | | NFS_INO_INVALID_ACL; |
346 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) | 346 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) |
347 | nfsi->change_attr = fattr->change_attr; | 347 | inode->i_version = fattr->change_attr; |
348 | else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) | 348 | else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) |
349 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | 349 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR |
350 | | NFS_INO_INVALID_DATA; | 350 | | NFS_INO_INVALID_DATA; |
@@ -897,8 +897,8 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr | |||
897 | 897 | ||
898 | if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) | 898 | if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) |
899 | && (fattr->valid & NFS_ATTR_FATTR_CHANGE) | 899 | && (fattr->valid & NFS_ATTR_FATTR_CHANGE) |
900 | && nfsi->change_attr == fattr->pre_change_attr) { | 900 | && inode->i_version == fattr->pre_change_attr) { |
901 | nfsi->change_attr = fattr->change_attr; | 901 | inode->i_version = fattr->change_attr; |
902 | if (S_ISDIR(inode->i_mode)) | 902 | if (S_ISDIR(inode->i_mode)) |
903 | nfsi->cache_validity |= NFS_INO_INVALID_DATA; | 903 | nfsi->cache_validity |= NFS_INO_INVALID_DATA; |
904 | ret |= NFS_INO_INVALID_ATTR; | 904 | ret |= NFS_INO_INVALID_ATTR; |
@@ -952,7 +952,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat | |||
952 | return -EIO; | 952 | return -EIO; |
953 | 953 | ||
954 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && | 954 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && |
955 | nfsi->change_attr != fattr->change_attr) | 955 | inode->i_version != fattr->change_attr) |
956 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; | 956 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; |
957 | 957 | ||
958 | /* Verify a few of the more important attributes */ | 958 | /* Verify a few of the more important attributes */ |
@@ -1163,7 +1163,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa | |||
1163 | } | 1163 | } |
1164 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && | 1164 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && |
1165 | (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { | 1165 | (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { |
1166 | fattr->pre_change_attr = NFS_I(inode)->change_attr; | 1166 | fattr->pre_change_attr = inode->i_version; |
1167 | fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; | 1167 | fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; |
1168 | } | 1168 | } |
1169 | if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && | 1169 | if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && |
@@ -1244,13 +1244,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1244 | 1244 | ||
1245 | /* More cache consistency checks */ | 1245 | /* More cache consistency checks */ |
1246 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { | 1246 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { |
1247 | if (nfsi->change_attr != fattr->change_attr) { | 1247 | if (inode->i_version != fattr->change_attr) { |
1248 | dprintk("NFS: change_attr change on server for file %s/%ld\n", | 1248 | dprintk("NFS: change_attr change on server for file %s/%ld\n", |
1249 | inode->i_sb->s_id, inode->i_ino); | 1249 | inode->i_sb->s_id, inode->i_ino); |
1250 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; | 1250 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; |
1251 | if (S_ISDIR(inode->i_mode)) | 1251 | if (S_ISDIR(inode->i_mode)) |
1252 | nfs_force_lookup_revalidate(inode); | 1252 | nfs_force_lookup_revalidate(inode); |
1253 | nfsi->change_attr = fattr->change_attr; | 1253 | inode->i_version = fattr->change_attr; |
1254 | } | 1254 | } |
1255 | } else if (server->caps & NFS_CAP_CHANGE_ATTR) | 1255 | } else if (server->caps & NFS_CAP_CHANGE_ATTR) |
1256 | invalid |= save_cache_validity; | 1256 | invalid |= save_cache_validity; |
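
Note: delegation.c, fscache-index.c and inode.c (and update_changeattr() in nfs4proc.c below) all stop using the NFS-private nfsi->change_attr and keep the NFSv4 change attribute in the generic inode->i_version, so VFS-level code can observe it without NFS-specific knowledge. The revalidation logic keeps its shape; condensed from nfs_update_inode() above:

    if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
        inode->i_version != fattr->change_attr) {
            /* server-side change: invalidate cached attributes and data */
            invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA;
            inode->i_version = fattr->change_attr;
    }
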
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913dd473..c1a1bd8ddf1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -457,13 +457,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) | |||
457 | PAGE_SIZE - 1) >> PAGE_SHIFT; | 457 | PAGE_SIZE - 1) >> PAGE_SHIFT; |
458 | } | 458 | } |
459 | 459 | ||
460 | /* | ||
461 | * Helper for restarting RPC calls in the possible presence of NFSv4.1 | ||
462 | * sessions. | ||
463 | */ | ||
464 | static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) | ||
465 | { | ||
466 | if (nfs4_has_session(clp)) | ||
467 | return rpc_restart_call_prepare(task); | ||
468 | return rpc_restart_call(task); | ||
469 | } | ||
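
Note: nfs_restart_rpc() existed only to pick rpc_restart_call_prepare() for v4.1 session mounts and plain rpc_restart_call() otherwise. Since rpc_restart_call_prepare() is safe in both cases — it only routes through the prepare step when the task defines one — every caller (the filelayout and nfs4proc paths below) is converted to call it directly and the wrapper is removed.
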
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1ec1a85fa71c..693ae22f8731 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,30 +13,6 @@ | |||
13 | 13 | ||
14 | struct idmap; | 14 | struct idmap; |
15 | 15 | ||
16 | /* | ||
17 | * In a seqid-mutating op, this macro controls which error return | ||
18 | * values trigger incrementation of the seqid. | ||
19 | * | ||
20 | * from rfc 3010: | ||
21 | * The client MUST monotonically increment the sequence number for the | ||
22 | * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE | ||
23 | * operations. This is true even in the event that the previous | ||
24 | * operation that used the sequence number received an error. The only | ||
25 | * exception to this rule is if the previous operation received one of | ||
26 | * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, | ||
27 | * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, | ||
28 | * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. | ||
29 | * | ||
30 | */ | ||
31 | #define seqid_mutating_err(err) \ | ||
32 | (((err) != NFSERR_STALE_CLIENTID) && \ | ||
33 | ((err) != NFSERR_STALE_STATEID) && \ | ||
34 | ((err) != NFSERR_BAD_STATEID) && \ | ||
35 | ((err) != NFSERR_BAD_SEQID) && \ | ||
36 | ((err) != NFSERR_BAD_XDR) && \ | ||
37 | ((err) != NFSERR_RESOURCE) && \ | ||
38 | ((err) != NFSERR_NOFILEHANDLE)) | ||
39 | |||
40 | enum nfs4_client_state { | 16 | enum nfs4_client_state { |
41 | NFS4CLNT_MANAGER_RUNNING = 0, | 17 | NFS4CLNT_MANAGER_RUNNING = 0, |
42 | NFS4CLNT_CHECK_LEASE, | 18 | NFS4CLNT_CHECK_LEASE, |
@@ -56,6 +32,9 @@ enum nfs4_session_state { | |||
56 | NFS4_SESSION_DRAINING, | 32 | NFS4_SESSION_DRAINING, |
57 | }; | 33 | }; |
58 | 34 | ||
35 | #define NFS4_RENEW_TIMEOUT 0x01 | ||
36 | #define NFS4_RENEW_DELEGATION_CB 0x02 | ||
37 | |||
59 | struct nfs4_minor_version_ops { | 38 | struct nfs4_minor_version_ops { |
60 | u32 minor_version; | 39 | u32 minor_version; |
61 | 40 | ||
@@ -225,7 +204,7 @@ struct nfs4_state_recovery_ops { | |||
225 | }; | 204 | }; |
226 | 205 | ||
227 | struct nfs4_state_maintenance_ops { | 206 | struct nfs4_state_maintenance_ops { |
228 | int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); | 207 | int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); |
229 | struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); | 208 | struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); |
230 | int (*renew_lease)(struct nfs_client *, struct rpc_cred *); | 209 | int (*renew_lease)(struct nfs_client *, struct rpc_cred *); |
231 | }; | 210 | }; |
@@ -237,8 +216,6 @@ extern const struct inode_operations nfs4_dir_inode_operations; | |||
237 | extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); | 216 | extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); |
238 | extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); | 217 | extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); |
239 | extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); | 218 | extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); |
240 | extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); | ||
241 | extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); | ||
242 | extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); | 219 | extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); |
243 | extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); | 220 | extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); |
244 | extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); | 221 | extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); |
@@ -349,6 +326,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t); | |||
349 | extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); | 326 | extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); |
350 | extern void nfs4_schedule_lease_recovery(struct nfs_client *); | 327 | extern void nfs4_schedule_lease_recovery(struct nfs_client *); |
351 | extern void nfs4_schedule_state_manager(struct nfs_client *); | 328 | extern void nfs4_schedule_state_manager(struct nfs_client *); |
329 | extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); | ||
352 | extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); | 330 | extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); |
353 | extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); | 331 | extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); |
354 | extern void nfs41_handle_recall_slot(struct nfs_client *clp); | 332 | extern void nfs41_handle_recall_slot(struct nfs_client *clp); |
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index e8915d4840ad..09119418402f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -77,19 +77,6 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) | |||
77 | BUG(); | 77 | BUG(); |
78 | } | 78 | } |
79 | 79 | ||
80 | /* For data server errors we don't recover from */ | ||
81 | static void | ||
82 | filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) | ||
83 | { | ||
84 | if (lseg->pls_range.iomode == IOMODE_RW) { | ||
85 | dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); | ||
86 | set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); | ||
87 | } else { | ||
88 | dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); | ||
89 | set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); | ||
90 | } | ||
91 | } | ||
92 | |||
93 | static int filelayout_async_handle_error(struct rpc_task *task, | 80 | static int filelayout_async_handle_error(struct rpc_task *task, |
94 | struct nfs4_state *state, | 81 | struct nfs4_state *state, |
95 | struct nfs_client *clp, | 82 | struct nfs_client *clp, |
@@ -135,7 +122,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, | |||
135 | static int filelayout_read_done_cb(struct rpc_task *task, | 122 | static int filelayout_read_done_cb(struct rpc_task *task, |
136 | struct nfs_read_data *data) | 123 | struct nfs_read_data *data) |
137 | { | 124 | { |
138 | struct nfs_client *clp = data->ds_clp; | ||
139 | int reset = 0; | 125 | int reset = 0; |
140 | 126 | ||
141 | dprintk("%s DS read\n", __func__); | 127 | dprintk("%s DS read\n", __func__); |
@@ -145,11 +131,10 @@ static int filelayout_read_done_cb(struct rpc_task *task, | |||
145 | dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", | 131 | dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", |
146 | __func__, data->ds_clp, data->ds_clp->cl_session); | 132 | __func__, data->ds_clp, data->ds_clp->cl_session); |
147 | if (reset) { | 133 | if (reset) { |
148 | filelayout_set_lo_fail(data->lseg); | 134 | pnfs_set_lo_fail(data->lseg); |
149 | nfs4_reset_read(task, data); | 135 | nfs4_reset_read(task, data); |
150 | clp = NFS_SERVER(data->inode)->nfs_client; | ||
151 | } | 136 | } |
152 | nfs_restart_rpc(task, clp); | 137 | rpc_restart_call_prepare(task); |
153 | return -EAGAIN; | 138 | return -EAGAIN; |
154 | } | 139 | } |
155 | 140 | ||
@@ -216,17 +201,13 @@ static int filelayout_write_done_cb(struct rpc_task *task, | |||
216 | 201 | ||
217 | if (filelayout_async_handle_error(task, data->args.context->state, | 202 | if (filelayout_async_handle_error(task, data->args.context->state, |
218 | data->ds_clp, &reset) == -EAGAIN) { | 203 | data->ds_clp, &reset) == -EAGAIN) { |
219 | struct nfs_client *clp; | ||
220 | |||
221 | dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", | 204 | dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", |
222 | __func__, data->ds_clp, data->ds_clp->cl_session); | 205 | __func__, data->ds_clp, data->ds_clp->cl_session); |
223 | if (reset) { | 206 | if (reset) { |
224 | filelayout_set_lo_fail(data->lseg); | 207 | pnfs_set_lo_fail(data->lseg); |
225 | nfs4_reset_write(task, data); | 208 | nfs4_reset_write(task, data); |
226 | clp = NFS_SERVER(data->inode)->nfs_client; | 209 | } |
227 | } else | 210 | rpc_restart_call_prepare(task); |
228 | clp = data->ds_clp; | ||
229 | nfs_restart_rpc(task, clp); | ||
230 | return -EAGAIN; | 211 | return -EAGAIN; |
231 | } | 212 | } |
232 | 213 | ||
@@ -256,9 +237,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, | |||
256 | __func__, data->ds_clp, data->ds_clp->cl_session); | 237 | __func__, data->ds_clp, data->ds_clp->cl_session); |
257 | if (reset) { | 238 | if (reset) { |
258 | prepare_to_resend_writes(data); | 239 | prepare_to_resend_writes(data); |
259 | filelayout_set_lo_fail(data->lseg); | 240 | pnfs_set_lo_fail(data->lseg); |
260 | } else | 241 | } else |
261 | nfs_restart_rpc(task, data->ds_clp); | 242 | rpc_restart_call_prepare(task); |
262 | return -EAGAIN; | 243 | return -EAGAIN; |
263 | } | 244 | } |
264 | 245 | ||
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039e7a81..d2ae413c986a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -73,9 +73,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); | |||
73 | static int _nfs4_recover_proc_open(struct nfs4_opendata *data); | 73 | static int _nfs4_recover_proc_open(struct nfs4_opendata *data); |
74 | static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); | 74 | static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); |
75 | static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); | 75 | static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); |
76 | static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, | ||
77 | const struct qstr *name, struct nfs_fh *fhandle, | ||
78 | struct nfs_fattr *fattr); | ||
79 | static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); | 76 | static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); |
80 | static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, | 77 | static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, |
81 | struct nfs_fattr *fattr, struct iattr *sattr, | 78 | struct nfs_fattr *fattr, struct iattr *sattr, |
@@ -753,9 +750,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) | |||
753 | 750 | ||
754 | spin_lock(&dir->i_lock); | 751 | spin_lock(&dir->i_lock); |
755 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; | 752 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; |
756 | if (!cinfo->atomic || cinfo->before != nfsi->change_attr) | 753 | if (!cinfo->atomic || cinfo->before != dir->i_version) |
757 | nfs_force_lookup_revalidate(dir); | 754 | nfs_force_lookup_revalidate(dir); |
758 | nfsi->change_attr = cinfo->after; | 755 | dir->i_version = cinfo->after; |
759 | spin_unlock(&dir->i_lock); | 756 | spin_unlock(&dir->i_lock); |
760 | } | 757 | } |
761 | 758 | ||
@@ -1596,8 +1593,14 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) | |||
1596 | int status; | 1593 | int status; |
1597 | 1594 | ||
1598 | status = nfs4_run_open_task(data, 0); | 1595 | status = nfs4_run_open_task(data, 0); |
1599 | if (status != 0 || !data->rpc_done) | 1596 | if (!data->rpc_done) |
1597 | return status; | ||
1598 | if (status != 0) { | ||
1599 | if (status == -NFS4ERR_BADNAME && | ||
1600 | !(o_arg->open_flags & O_CREAT)) | ||
1601 | return -ENOENT; | ||
1600 | return status; | 1602 | return status; |
1603 | } | ||
1601 | 1604 | ||
1602 | if (o_arg->open_flags & O_CREAT) { | 1605 | if (o_arg->open_flags & O_CREAT) { |
1603 | update_changeattr(dir, &o_res->cinfo); | 1606 | update_changeattr(dir, &o_res->cinfo); |
@@ -2408,14 +2411,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, | |||
2408 | return status; | 2411 | return status; |
2409 | } | 2412 | } |
2410 | 2413 | ||
2411 | static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, | 2414 | static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, |
2412 | const struct nfs_fh *dirfh, const struct qstr *name, | 2415 | const struct qstr *name, struct nfs_fh *fhandle, |
2413 | struct nfs_fh *fhandle, struct nfs_fattr *fattr) | 2416 | struct nfs_fattr *fattr) |
2414 | { | 2417 | { |
2418 | struct nfs_server *server = NFS_SERVER(dir); | ||
2415 | int status; | 2419 | int status; |
2416 | struct nfs4_lookup_arg args = { | 2420 | struct nfs4_lookup_arg args = { |
2417 | .bitmask = server->attr_bitmask, | 2421 | .bitmask = server->attr_bitmask, |
2418 | .dir_fh = dirfh, | 2422 | .dir_fh = NFS_FH(dir), |
2419 | .name = name, | 2423 | .name = name, |
2420 | }; | 2424 | }; |
2421 | struct nfs4_lookup_res res = { | 2425 | struct nfs4_lookup_res res = { |
@@ -2431,40 +2435,8 @@ static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, | |||
2431 | 2435 | ||
2432 | nfs_fattr_init(fattr); | 2436 | nfs_fattr_init(fattr); |
2433 | 2437 | ||
2434 | dprintk("NFS call lookupfh %s\n", name->name); | ||
2435 | status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); | ||
2436 | dprintk("NFS reply lookupfh: %d\n", status); | ||
2437 | return status; | ||
2438 | } | ||
2439 | |||
2440 | static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, | ||
2441 | struct qstr *name, struct nfs_fh *fhandle, | ||
2442 | struct nfs_fattr *fattr) | ||
2443 | { | ||
2444 | struct nfs4_exception exception = { }; | ||
2445 | int err; | ||
2446 | do { | ||
2447 | err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); | ||
2448 | /* FIXME: !!!! */ | ||
2449 | if (err == -NFS4ERR_MOVED) { | ||
2450 | err = -EREMOTE; | ||
2451 | break; | ||
2452 | } | ||
2453 | err = nfs4_handle_exception(server, err, &exception); | ||
2454 | } while (exception.retry); | ||
2455 | return err; | ||
2456 | } | ||
2457 | |||
2458 | static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, | ||
2459 | const struct qstr *name, struct nfs_fh *fhandle, | ||
2460 | struct nfs_fattr *fattr) | ||
2461 | { | ||
2462 | int status; | ||
2463 | |||
2464 | dprintk("NFS call lookup %s\n", name->name); | 2438 | dprintk("NFS call lookup %s\n", name->name); |
2465 | status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); | 2439 | status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); |
2466 | if (status == -NFS4ERR_MOVED) | ||
2467 | status = nfs4_get_referral(dir, name, fattr, fhandle); | ||
2468 | dprintk("NFS reply lookup: %d\n", status); | 2440 | dprintk("NFS reply lookup: %d\n", status); |
2469 | return status; | 2441 | return status; |
2470 | } | 2442 | } |
@@ -2485,11 +2457,20 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst | |||
2485 | struct nfs4_exception exception = { }; | 2457 | struct nfs4_exception exception = { }; |
2486 | int err; | 2458 | int err; |
2487 | do { | 2459 | do { |
2488 | err = nfs4_handle_exception(NFS_SERVER(dir), | 2460 | int status; |
2489 | _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), | 2461 | |
2490 | &exception); | 2462 | status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr); |
2491 | if (err == -EPERM) | 2463 | switch (status) { |
2464 | case -NFS4ERR_BADNAME: | ||
2465 | return -ENOENT; | ||
2466 | case -NFS4ERR_MOVED: | ||
2467 | err = nfs4_get_referral(dir, name, fattr, fhandle); | ||
2468 | break; | ||
2469 | case -NFS4ERR_WRONGSEC: | ||
2492 | nfs_fixup_secinfo_attributes(fattr, fhandle); | 2470 | nfs_fixup_secinfo_attributes(fattr, fhandle); |
2471 | } | ||
2472 | err = nfs4_handle_exception(NFS_SERVER(dir), | ||
2473 | status, &exception); | ||
2493 | } while (exception.retry); | 2474 | } while (exception.retry); |
2494 | return err; | 2475 | return err; |
2495 | } | 2476 | } |
@@ -3210,7 +3191,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) | |||
3210 | struct nfs_server *server = NFS_SERVER(data->inode); | 3191 | struct nfs_server *server = NFS_SERVER(data->inode); |
3211 | 3192 | ||
3212 | if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { | 3193 | if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { |
3213 | nfs_restart_rpc(task, server->nfs_client); | 3194 | rpc_restart_call_prepare(task); |
3214 | return -EAGAIN; | 3195 | return -EAGAIN; |
3215 | } | 3196 | } |
3216 | 3197 | ||
@@ -3260,7 +3241,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data | |||
3260 | struct inode *inode = data->inode; | 3241 | struct inode *inode = data->inode; |
3261 | 3242 | ||
3262 | if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { | 3243 | if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { |
3263 | nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); | 3244 | rpc_restart_call_prepare(task); |
3264 | return -EAGAIN; | 3245 | return -EAGAIN; |
3265 | } | 3246 | } |
3266 | if (task->tk_status >= 0) { | 3247 | if (task->tk_status >= 0) { |
@@ -3317,7 +3298,7 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat | |||
3317 | struct inode *inode = data->inode; | 3298 | struct inode *inode = data->inode; |
3318 | 3299 | ||
3319 | if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { | 3300 | if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { |
3320 | nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); | 3301 | rpc_restart_call_prepare(task); |
3321 | return -EAGAIN; | 3302 | return -EAGAIN; |
3322 | } | 3303 | } |
3323 | nfs_refresh_inode(inode, data->res.fattr); | 3304 | nfs_refresh_inode(inode, data->res.fattr); |
@@ -3374,9 +3355,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) | |||
3374 | 3355 | ||
3375 | if (task->tk_status < 0) { | 3356 | if (task->tk_status < 0) { |
3376 | /* Unless we're shutting down, schedule state recovery! */ | 3357 | /* Unless we're shutting down, schedule state recovery! */ |
3377 | if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) | 3358 | if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0) |
3359 | return; | ||
3360 | if (task->tk_status != NFS4ERR_CB_PATH_DOWN) { | ||
3378 | nfs4_schedule_lease_recovery(clp); | 3361 | nfs4_schedule_lease_recovery(clp); |
3379 | return; | 3362 | return; |
3363 | } | ||
3364 | nfs4_schedule_path_down_recovery(clp); | ||
3380 | } | 3365 | } |
3381 | do_renew_lease(clp, timestamp); | 3366 | do_renew_lease(clp, timestamp); |
3382 | } | 3367 | } |
@@ -3386,7 +3371,7 @@ static const struct rpc_call_ops nfs4_renew_ops = { | |||
3386 | .rpc_release = nfs4_renew_release, | 3371 | .rpc_release = nfs4_renew_release, |
3387 | }; | 3372 | }; |
3388 | 3373 | ||
3389 | int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) | 3374 | static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) |
3390 | { | 3375 | { |
3391 | struct rpc_message msg = { | 3376 | struct rpc_message msg = { |
3392 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], | 3377 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], |
@@ -3395,9 +3380,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) | |||
3395 | }; | 3380 | }; |
3396 | struct nfs4_renewdata *data; | 3381 | struct nfs4_renewdata *data; |
3397 | 3382 | ||
3383 | if (renew_flags == 0) | ||
3384 | return 0; | ||
3398 | if (!atomic_inc_not_zero(&clp->cl_count)) | 3385 | if (!atomic_inc_not_zero(&clp->cl_count)) |
3399 | return -EIO; | 3386 | return -EIO; |
3400 | data = kmalloc(sizeof(*data), GFP_KERNEL); | 3387 | data = kmalloc(sizeof(*data), GFP_NOFS); |
3401 | if (data == NULL) | 3388 | if (data == NULL) |
3402 | return -ENOMEM; | 3389 | return -ENOMEM; |
3403 | data->client = clp; | 3390 | data->client = clp; |
@@ -3406,7 +3393,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) | |||
3406 | &nfs4_renew_ops, data); | 3393 | &nfs4_renew_ops, data); |
3407 | } | 3394 | } |
3408 | 3395 | ||
3409 | int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) | 3396 | static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) |
3410 | { | 3397 | { |
3411 | struct rpc_message msg = { | 3398 | struct rpc_message msg = { |
3412 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], | 3399 | .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], |
@@ -3851,7 +3838,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) | |||
3851 | default: | 3838 | default: |
3852 | if (nfs4_async_handle_error(task, data->res.server, NULL) == | 3839 | if (nfs4_async_handle_error(task, data->res.server, NULL) == |
3853 | -EAGAIN) { | 3840 | -EAGAIN) { |
3854 | nfs_restart_rpc(task, data->res.server->nfs_client); | 3841 | rpc_restart_call_prepare(task); |
3855 | return; | 3842 | return; |
3856 | } | 3843 | } |
3857 | } | 3844 | } |
@@ -4105,8 +4092,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) | |||
4105 | break; | 4092 | break; |
4106 | default: | 4093 | default: |
4107 | if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) | 4094 | if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) |
4108 | nfs_restart_rpc(task, | 4095 | rpc_restart_call_prepare(task); |
4109 | calldata->server->nfs_client); | ||
4110 | } | 4096 | } |
4111 | } | 4097 | } |
4112 | 4098 | ||
@@ -4939,7 +4925,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) | |||
4939 | task->tk_status = 0; | 4925 | task->tk_status = 0; |
4940 | /* fall through */ | 4926 | /* fall through */ |
4941 | case -NFS4ERR_RETRY_UNCACHED_REP: | 4927 | case -NFS4ERR_RETRY_UNCACHED_REP: |
4942 | nfs_restart_rpc(task, data->clp); | 4928 | rpc_restart_call_prepare(task); |
4943 | return; | 4929 | return; |
4944 | } | 4930 | } |
4945 | dprintk("<-- %s\n", __func__); | 4931 | dprintk("<-- %s\n", __func__); |
@@ -5504,11 +5490,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_ | |||
5504 | return rpc_run_task(&task_setup_data); | 5490 | return rpc_run_task(&task_setup_data); |
5505 | } | 5491 | } |
5506 | 5492 | ||
5507 | static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) | 5493 | static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) |
5508 | { | 5494 | { |
5509 | struct rpc_task *task; | 5495 | struct rpc_task *task; |
5510 | int ret = 0; | 5496 | int ret = 0; |
5511 | 5497 | ||
5498 | if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) | ||
5499 | return 0; | ||
5512 | task = _nfs41_proc_sequence(clp, cred); | 5500 | task = _nfs41_proc_sequence(clp, cred); |
5513 | if (IS_ERR(task)) | 5501 | if (IS_ERR(task)) |
5514 | ret = PTR_ERR(task); | 5502 | ret = PTR_ERR(task); |
@@ -5778,7 +5766,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) | |||
5778 | 5766 | ||
5779 | server = NFS_SERVER(lrp->args.inode); | 5767 | server = NFS_SERVER(lrp->args.inode); |
5780 | if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { | 5768 | if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { |
5781 | nfs_restart_rpc(task, lrp->clp); | 5769 | rpc_restart_call_prepare(task); |
5782 | return; | 5770 | return; |
5783 | } | 5771 | } |
5784 | spin_lock(&lo->plh_inode->i_lock); | 5772 | spin_lock(&lo->plh_inode->i_lock); |
@@ -5949,7 +5937,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) | |||
5949 | } | 5937 | } |
5950 | 5938 | ||
5951 | if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { | 5939 | if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { |
5952 | nfs_restart_rpc(task, server->nfs_client); | 5940 | rpc_restart_call_prepare(task); |
5953 | return; | 5941 | return; |
5954 | } | 5942 | } |
5955 | 5943 | ||
@@ -6262,7 +6250,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { | |||
6262 | .getroot = nfs4_proc_get_root, | 6250 | .getroot = nfs4_proc_get_root, |
6263 | .getattr = nfs4_proc_getattr, | 6251 | .getattr = nfs4_proc_getattr, |
6264 | .setattr = nfs4_proc_setattr, | 6252 | .setattr = nfs4_proc_setattr, |
6265 | .lookupfh = nfs4_proc_lookupfh, | ||
6266 | .lookup = nfs4_proc_lookup, | 6253 | .lookup = nfs4_proc_lookup, |
6267 | .access = nfs4_proc_access, | 6254 | .access = nfs4_proc_access, |
6268 | .readlink = nfs4_proc_readlink, | 6255 | .readlink = nfs4_proc_readlink, |
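
Note: two behavioural fixes accompany the nfs4proc.c cleanups. NFS4ERR_BADNAME is now mapped to -ENOENT — in _nfs4_proc_open() for non-O_CREAT opens and in nfs4_proc_lookup() — so a server that rejects a name produces a clean lookup failure rather than an opaque error. And the lookupfh/lookup duplication collapses into a single _nfs4_proc_lookup(), with the retry loop dispatching on the raw status before handing it to the exception machinery, as the diff shows:

    status = _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr);
    switch (status) {
    case -NFS4ERR_BADNAME:
            return -ENOENT;         /* server refused the name: treat as absent */
    case -NFS4ERR_MOVED:
            err = nfs4_get_referral(dir, name, fattr, fhandle);
            break;
    case -NFS4ERR_WRONGSEC:
            nfs_fixup_secinfo_attributes(fattr, fhandle);
    }
    err = nfs4_handle_exception(NFS_SERVER(dir), status, &exception);
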
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56d..dc484c0eae7f 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work) | |||
60 | struct rpc_cred *cred; | 60 | struct rpc_cred *cred; |
61 | long lease; | 61 | long lease; |
62 | unsigned long last, now; | 62 | unsigned long last, now; |
63 | unsigned renew_flags = 0; | ||
63 | 64 | ||
64 | ops = clp->cl_mvops->state_renewal_ops; | 65 | ops = clp->cl_mvops->state_renewal_ops; |
65 | dprintk("%s: start\n", __func__); | 66 | dprintk("%s: start\n", __func__); |
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work) | |||
72 | last = clp->cl_last_renewal; | 73 | last = clp->cl_last_renewal; |
73 | now = jiffies; | 74 | now = jiffies; |
74 | /* Are we close to a lease timeout? */ | 75 | /* Are we close to a lease timeout? */ |
75 | if (time_after(now, last + lease/3)) { | 76 | if (time_after(now, last + lease/3)) |
77 | renew_flags |= NFS4_RENEW_TIMEOUT; | ||
78 | if (nfs_delegations_present(clp)) | ||
79 | renew_flags |= NFS4_RENEW_DELEGATION_CB; | ||
80 | |||
81 | if (renew_flags != 0) { | ||
76 | cred = ops->get_state_renewal_cred_locked(clp); | 82 | cred = ops->get_state_renewal_cred_locked(clp); |
77 | spin_unlock(&clp->cl_lock); | 83 | spin_unlock(&clp->cl_lock); |
78 | if (cred == NULL) { | 84 | if (cred == NULL) { |
79 | if (!nfs_delegations_present(clp)) { | 85 | if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) { |
80 | set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); | 86 | set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); |
81 | goto out; | 87 | goto out; |
82 | } | 88 | } |
83 | nfs_expire_all_delegations(clp); | 89 | nfs_expire_all_delegations(clp); |
84 | } else { | 90 | } else { |
85 | /* Queue an asynchronous RENEW. */ | 91 | /* Queue an asynchronous RENEW. */ |
86 | ops->sched_state_renewal(clp, cred); | 92 | ops->sched_state_renewal(clp, cred, renew_flags); |
87 | put_rpccred(cred); | 93 | put_rpccred(cred); |
88 | goto out_exp; | 94 | goto out_exp; |
89 | } | 95 | } |
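The renew path above now collects its reasons into a bitmask before acting once. A minimal user-space sketch of that control flow, with stand-in predicates for the lease check and the delegation check (the flag values are illustrative, not the kernel's):

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT       0x01	/* illustrative values */
#define NFS4_RENEW_DELEGATION_CB 0x02

static int lease_expiring(void)      { return 1; }	/* stand-in */
static int delegations_present(void) { return 0; }	/* stand-in */

int main(void)
{
	unsigned renew_flags = 0;

	/* Accumulate every reason to renew... */
	if (lease_expiring())
		renew_flags |= NFS4_RENEW_TIMEOUT;
	if (delegations_present())
		renew_flags |= NFS4_RENEW_DELEGATION_CB;

	/* ...then schedule a single asynchronous RENEW if any apply. */
	if (renew_flags != 0)
		printf("schedule RENEW, flags=0x%x\n", renew_flags);
	return 0;
}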
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 72ab97ef3d61..39914be40b03 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c | |||
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) | |||
1038 | nfs4_schedule_state_manager(clp); | 1038 | nfs4_schedule_state_manager(clp); |
1039 | } | 1039 | } |
1040 | 1040 | ||
1041 | void nfs4_schedule_path_down_recovery(struct nfs_client *clp) | ||
1042 | { | ||
1043 | nfs_handle_cb_pathdown(clp); | ||
1044 | nfs4_schedule_state_manager(clp); | ||
1045 | } | ||
1046 | |||
1041 | static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) | 1047 | static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) |
1042 | { | 1048 | { |
1043 | 1049 | ||
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9383ca7245bc..d0cda12fddc3 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c | |||
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write) | |||
479 | for (i = 0; i < ios->numdevs; i++) { | 479 | for (i = 0; i < ios->numdevs; i++) { |
480 | struct osd_sense_info osi; | 480 | struct osd_sense_info osi; |
481 | struct osd_request *or = ios->per_dev[i].or; | 481 | struct osd_request *or = ios->per_dev[i].or; |
482 | unsigned dev; | ||
483 | int ret; | 482 | int ret; |
484 | 483 | ||
485 | if (!or) | 484 | if (!or) |
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write) | |||
500 | 499 | ||
501 | continue; /* we recovered */ | 500 | continue; /* we recovered */ |
502 | } | 501 | } |
503 | dev = ios->per_dev[i].dev; | 502 | objlayout_io_set_result(&ios->ol_state, i, |
504 | objlayout_io_set_result(&ios->ol_state, dev, | 503 | &ios->layout->comps[i].oc_object_id, |
505 | &ios->layout->comps[dev].oc_object_id, | ||
506 | osd_pri_2_pnfs_err(osi.osd_err_pri), | 504 | osd_pri_2_pnfs_err(osi.osd_err_pri), |
507 | ios->per_dev[i].offset, | 505 | ios->per_dev[i].offset, |
508 | ios->per_dev[i].length, | 506 | ios->per_dev[i].length, |
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, | |||
589 | } | 587 | } |
590 | 588 | ||
591 | static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, | 589 | static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, |
592 | unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, | 590 | unsigned pgbase, struct _objio_per_comp *per_dev, int len, |
593 | gfp_t gfp_flags) | 591 | gfp_t gfp_flags) |
594 | { | 592 | { |
595 | unsigned pg = *cur_pg; | 593 | unsigned pg = *cur_pg; |
594 | int cur_len = len; | ||
596 | struct request_queue *q = | 595 | struct request_queue *q = |
597 | osd_request_queue(_io_od(ios, per_dev->dev)); | 596 | osd_request_queue(_io_od(ios, per_dev->dev)); |
598 | 597 | ||
599 | per_dev->length += cur_len; | ||
600 | |||
601 | if (per_dev->bio == NULL) { | 598 | if (per_dev->bio == NULL) { |
602 | unsigned stripes = ios->layout->num_comps / | 599 | unsigned pages_in_stripe = ios->layout->group_width * |
603 | ios->layout->mirrors_p1; | ||
604 | unsigned pages_in_stripe = stripes * | ||
605 | (ios->layout->stripe_unit / PAGE_SIZE); | 600 | (ios->layout->stripe_unit / PAGE_SIZE); |
606 | unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / | 601 | unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / |
607 | stripes; | 602 | ios->layout->group_width; |
608 | 603 | ||
609 | if (BIO_MAX_PAGES_KMALLOC < bio_size) | 604 | if (BIO_MAX_PAGES_KMALLOC < bio_size) |
610 | bio_size = BIO_MAX_PAGES_KMALLOC; | 605 | bio_size = BIO_MAX_PAGES_KMALLOC; |
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, | |||
632 | } | 627 | } |
633 | BUG_ON(cur_len); | 628 | BUG_ON(cur_len); |
634 | 629 | ||
630 | per_dev->length += len; | ||
635 | *cur_pg = pg; | 631 | *cur_pg = pg; |
636 | return 0; | 632 | return 0; |
637 | } | 633 | } |
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, | |||
650 | int ret = 0; | 646 | int ret = 0; |
651 | 647 | ||
652 | while (length) { | 648 | while (length) { |
653 | struct _objio_per_comp *per_dev = &ios->per_dev[dev]; | 649 | struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; |
654 | unsigned cur_len, page_off = 0; | 650 | unsigned cur_len, page_off = 0; |
655 | 651 | ||
656 | if (!per_dev->length) { | 652 | if (!per_dev->length) { |
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, | |||
670 | cur_len = stripe_unit; | 666 | cur_len = stripe_unit; |
671 | } | 667 | } |
672 | 668 | ||
673 | if (max_comp < dev) | 669 | if (max_comp < dev - first_dev) |
674 | max_comp = dev; | 670 | max_comp = dev - first_dev; |
675 | } else { | 671 | } else { |
676 | cur_len = stripe_unit; | 672 | cur_len = stripe_unit; |
677 | } | 673 | } |
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) | |||
806 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | 802 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; |
807 | unsigned dev = per_dev->dev; | 803 | unsigned dev = per_dev->dev; |
808 | struct pnfs_osd_object_cred *cred = | 804 | struct pnfs_osd_object_cred *cred = |
809 | &ios->layout->comps[dev]; | 805 | &ios->layout->comps[cur_comp]; |
810 | struct osd_obj_id obj = { | 806 | struct osd_obj_id obj = { |
811 | .partition = cred->oc_object_id.oid_partition_id, | 807 | .partition = cred->oc_object_id.oid_partition_id, |
812 | .id = cred->oc_object_id.oid_object_id, | 808 | .id = cred->oc_object_id.oid_object_id, |
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) | |||
904 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | 900 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { |
905 | struct osd_request *or = NULL; | 901 | struct osd_request *or = NULL; |
906 | struct pnfs_osd_object_cred *cred = | 902 | struct pnfs_osd_object_cred *cred = |
907 | &ios->layout->comps[dev]; | 903 | &ios->layout->comps[cur_comp]; |
908 | struct osd_obj_id obj = { | 904 | struct osd_obj_id obj = { |
909 | .partition = cred->oc_object_id.oid_partition_id, | 905 | .partition = cred->oc_object_id.oid_partition_id, |
910 | .id = cred->oc_object_id.oid_object_id, | 906 | .id = cred->oc_object_id.oid_object_id, |
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758e9123..b3918f7ac34d 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c | |||
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, | |||
170 | p = _osd_xdr_decode_data_map(p, &layout->olo_map); | 170 | p = _osd_xdr_decode_data_map(p, &layout->olo_map); |
171 | layout->olo_comps_index = be32_to_cpup(p++); | 171 | layout->olo_comps_index = be32_to_cpup(p++); |
172 | layout->olo_num_comps = be32_to_cpup(p++); | 172 | layout->olo_num_comps = be32_to_cpup(p++); |
173 | dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, | ||
174 | layout->olo_comps_index, layout->olo_num_comps); | ||
175 | |||
173 | iter->total_comps = layout->olo_num_comps; | 176 | iter->total_comps = layout->olo_num_comps; |
174 | return 0; | 177 | return 0; |
175 | } | 178 | } |
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e550e8836c37..ee73d9a4f700 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
@@ -1168,23 +1168,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); | |||
1168 | /* | 1168 | /* |
1169 | * Called by non rpc-based layout drivers | 1169 | * Called by non rpc-based layout drivers |
1170 | */ | 1170 | */ |
1171 | int | 1171 | void pnfs_ld_write_done(struct nfs_write_data *data) |
1172 | pnfs_ld_write_done(struct nfs_write_data *data) | ||
1173 | { | 1172 | { |
1174 | int status; | 1173 | if (likely(!data->pnfs_error)) { |
1175 | |||
1176 | if (!data->pnfs_error) { | ||
1177 | pnfs_set_layoutcommit(data); | 1174 | pnfs_set_layoutcommit(data); |
1178 | data->mds_ops->rpc_call_done(&data->task, data); | 1175 | data->mds_ops->rpc_call_done(&data->task, data); |
1179 | data->mds_ops->rpc_release(data); | 1176 | } else { |
1180 | return 0; | 1177 | put_lseg(data->lseg); |
1178 | data->lseg = NULL; | ||
1179 | dprintk("pnfs write error = %d\n", data->pnfs_error); | ||
1181 | } | 1180 | } |
1182 | 1181 | data->mds_ops->rpc_release(data); | |
1183 | dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, | ||
1184 | data->pnfs_error); | ||
1185 | status = nfs_initiate_write(data, NFS_CLIENT(data->inode), | ||
1186 | data->mds_ops, NFS_FILE_SYNC); | ||
1187 | return status ? : -EAGAIN; | ||
1188 | } | 1182 | } |
1189 | EXPORT_SYMBOL_GPL(pnfs_ld_write_done); | 1183 | EXPORT_SYMBOL_GPL(pnfs_ld_write_done); |
1190 | 1184 | ||
@@ -1268,23 +1262,17 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); | |||
1268 | /* | 1262 | /* |
1269 | * Called by non rpc-based layout drivers | 1263 | * Called by non rpc-based layout drivers |
1270 | */ | 1264 | */ |
1271 | int | 1265 | void pnfs_ld_read_done(struct nfs_read_data *data) |
1272 | pnfs_ld_read_done(struct nfs_read_data *data) | ||
1273 | { | 1266 | { |
1274 | int status; | 1267 | if (likely(!data->pnfs_error)) { |
1275 | |||
1276 | if (!data->pnfs_error) { | ||
1277 | __nfs4_read_done_cb(data); | 1268 | __nfs4_read_done_cb(data); |
1278 | data->mds_ops->rpc_call_done(&data->task, data); | 1269 | data->mds_ops->rpc_call_done(&data->task, data); |
1279 | data->mds_ops->rpc_release(data); | 1270 | } else { |
1280 | return 0; | 1271 | put_lseg(data->lseg); |
1272 | data->lseg = NULL; | ||
1273 | dprintk("pnfs read error = %d\n", data->pnfs_error); ||
1281 | } | 1274 | } |
1282 | 1275 | data->mds_ops->rpc_release(data); | |
1283 | dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__, | ||
1284 | data->pnfs_error); | ||
1285 | status = nfs_initiate_read(data, NFS_CLIENT(data->inode), | ||
1286 | data->mds_ops); | ||
1287 | return status ? : -EAGAIN; | ||
1288 | } | 1276 | } |
1289 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); | 1277 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); |
1290 | 1278 | ||
@@ -1381,6 +1369,18 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) | |||
1381 | } | 1369 | } |
1382 | } | 1370 | } |
1383 | 1371 | ||
1372 | void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) | ||
1373 | { | ||
1374 | if (lseg->pls_range.iomode == IOMODE_RW) { | ||
1375 | dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); | ||
1376 | set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); | ||
1377 | } else { | ||
1378 | dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); | ||
1379 | set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); | ||
1380 | } | ||
1381 | } | ||
1382 | EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); | ||
1383 | |||
1384 | void | 1384 | void |
1385 | pnfs_set_layoutcommit(struct nfs_write_data *wdata) | 1385 | pnfs_set_layoutcommit(struct nfs_write_data *wdata) |
1386 | { | 1386 | { |
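The pnfs_ld_write_done and pnfs_ld_read_done hunks above change the helpers from int to void: the MDS resend no longer happens inline, the lseg is dropped on error, and rpc_release runs unconditionally, leaving the retry to the release path. A minimal sketch of that shape, using simplified stand-in types rather than the real nfs_write_data/nfs_read_data:

#include <stdio.h>

/* Simplified stand-in for struct nfs_write_data / nfs_read_data. */
struct io_data {
	int   pnfs_error;
	void *lseg;
};

static void call_done(struct io_data *d) { (void)d; puts("completed OK"); }
static void release(struct io_data *d)   { (void)d; puts("rpc_release (MDS resend happens here on error)"); }

/* New shape: void return; on error the lseg is dropped and the
 * release callback always runs. */
static void ld_io_done(struct io_data *data)
{
	if (!data->pnfs_error)
		call_done(data);
	else
		data->lseg = NULL;	/* put_lseg() in the kernel */
	release(data);
}

int main(void)
{
	struct io_data d = { .pnfs_error = -5, .lseg = &d };
	ld_io_done(&d);
	return 0;
}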
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 01cbfd54f3cb..1509530cb111 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h | |||
@@ -178,6 +178,7 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); | |||
178 | void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); | 178 | void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); |
179 | int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); | 179 | int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); |
180 | bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); | 180 | bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); |
181 | void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); | ||
181 | int pnfs_layout_process(struct nfs4_layoutget *lgp); | 182 | int pnfs_layout_process(struct nfs4_layoutget *lgp); |
182 | void pnfs_free_lseg_list(struct list_head *tmp_list); | 183 | void pnfs_free_lseg_list(struct list_head *tmp_list); |
183 | void pnfs_destroy_layout(struct nfs_inode *); | 184 | void pnfs_destroy_layout(struct nfs_inode *); |
@@ -200,8 +201,8 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); | |||
200 | void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); | 201 | void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); |
201 | int pnfs_layoutcommit_inode(struct inode *inode, bool sync); | 202 | int pnfs_layoutcommit_inode(struct inode *inode, bool sync); |
202 | int _pnfs_return_layout(struct inode *); | 203 | int _pnfs_return_layout(struct inode *); |
203 | int pnfs_ld_write_done(struct nfs_write_data *); | 204 | void pnfs_ld_write_done(struct nfs_write_data *); |
204 | int pnfs_ld_read_done(struct nfs_read_data *); | 205 | void pnfs_ld_read_done(struct nfs_read_data *); |
205 | struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, | 206 | struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, |
206 | struct nfs_open_context *ctx, | 207 | struct nfs_open_context *ctx, |
207 | loff_t pos, | 208 | loff_t pos, |
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2171c043ab08..8b48ec63f722 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
@@ -35,16 +35,13 @@ static const struct rpc_call_ops nfs_read_partial_ops; | |||
35 | static const struct rpc_call_ops nfs_read_full_ops; | 35 | static const struct rpc_call_ops nfs_read_full_ops; |
36 | 36 | ||
37 | static struct kmem_cache *nfs_rdata_cachep; | 37 | static struct kmem_cache *nfs_rdata_cachep; |
38 | static mempool_t *nfs_rdata_mempool; | ||
39 | |||
40 | #define MIN_POOL_READ (32) | ||
41 | 38 | ||
42 | struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) | 39 | struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) |
43 | { | 40 | { |
44 | struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL); | 41 | struct nfs_read_data *p; |
45 | 42 | ||
43 | p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); | ||
46 | if (p) { | 44 | if (p) { |
47 | memset(p, 0, sizeof(*p)); | ||
48 | INIT_LIST_HEAD(&p->pages); | 45 | INIT_LIST_HEAD(&p->pages); |
49 | p->npages = pagecount; | 46 | p->npages = pagecount; |
50 | if (pagecount <= ARRAY_SIZE(p->page_array)) | 47 | if (pagecount <= ARRAY_SIZE(p->page_array)) |
@@ -52,7 +49,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) | |||
52 | else { | 49 | else { |
53 | p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); | 50 | p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); |
54 | if (!p->pagevec) { | 51 | if (!p->pagevec) { |
55 | mempool_free(p, nfs_rdata_mempool); | 52 | kmem_cache_free(nfs_rdata_cachep, p); |
56 | p = NULL; | 53 | p = NULL; |
57 | } | 54 | } |
58 | } | 55 | } |
@@ -64,7 +61,7 @@ void nfs_readdata_free(struct nfs_read_data *p) | |||
64 | { | 61 | { |
65 | if (p && (p->pagevec != &p->page_array[0])) | 62 | if (p && (p->pagevec != &p->page_array[0])) |
66 | kfree(p->pagevec); | 63 | kfree(p->pagevec); |
67 | mempool_free(p, nfs_rdata_mempool); | 64 | kmem_cache_free(nfs_rdata_cachep, p); |
68 | } | 65 | } |
69 | 66 | ||
70 | void nfs_readdata_release(struct nfs_read_data *rdata) | 67 | void nfs_readdata_release(struct nfs_read_data *rdata) |
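The read.c hunks above retire nfs_rdata_mempool and allocate read data straight from the slab cache, pre-zeroed. A user-space approximation of the new alloc/free pairing, with calloc and free standing in for kmem_cache_zalloc and kmem_cache_free and an invented inline-array size:

#include <stdlib.h>

struct read_data {
	struct page_ptr { void *p; } page_array[8];	/* size invented */
	struct page_ptr *pagevec;
	unsigned npages;
};

static struct read_data *readdata_alloc(unsigned pagecount)
{
	/* calloc ~ kmem_cache_zalloc: a zeroed object, no mempool,
	 * no explicit memset needed. */
	struct read_data *p = calloc(1, sizeof(*p));

	if (!p)
		return NULL;
	p->npages = pagecount;
	if (pagecount <= 8)
		p->pagevec = p->page_array;	/* small I/O: inline array */
	else {
		p->pagevec = calloc(pagecount, sizeof(struct page_ptr));
		if (!p->pagevec) {
			free(p);	/* ~ kmem_cache_free */
			p = NULL;
		}
	}
	return p;
}

int main(void)
{
	struct read_data *rd = readdata_alloc(4);

	if (rd && rd->pagevec != rd->page_array)
		free(rd->pagevec);
	free(rd);
	return 0;
}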
@@ -276,7 +273,6 @@ nfs_async_read_error(struct list_head *head) | |||
276 | while (!list_empty(head)) { | 273 | while (!list_empty(head)) { |
277 | req = nfs_list_entry(head->next); | 274 | req = nfs_list_entry(head->next); |
278 | nfs_list_remove_request(req); | 275 | nfs_list_remove_request(req); |
279 | SetPageError(req->wb_page); | ||
280 | nfs_readpage_release(req); | 276 | nfs_readpage_release(req); |
281 | } | 277 | } |
282 | } | 278 | } |
@@ -322,7 +318,6 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head | |||
322 | offset += len; | 318 | offset += len; |
323 | } while(nbytes != 0); | 319 | } while(nbytes != 0); |
324 | atomic_set(&req->wb_complete, requests); | 320 | atomic_set(&req->wb_complete, requests); |
325 | ClearPageError(page); | ||
326 | desc->pg_rpc_callops = &nfs_read_partial_ops; | 321 | desc->pg_rpc_callops = &nfs_read_partial_ops; |
327 | return ret; | 322 | return ret; |
328 | out_bad: | 323 | out_bad: |
@@ -331,7 +326,6 @@ out_bad: | |||
331 | list_del(&data->list); | 326 | list_del(&data->list); |
332 | nfs_readdata_free(data); | 327 | nfs_readdata_free(data); |
333 | } | 328 | } |
334 | SetPageError(page); | ||
335 | nfs_readpage_release(req); | 329 | nfs_readpage_release(req); |
336 | return -ENOMEM; | 330 | return -ENOMEM; |
337 | } | 331 | } |
@@ -357,7 +351,6 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head * | |||
357 | req = nfs_list_entry(head->next); | 351 | req = nfs_list_entry(head->next); |
358 | nfs_list_remove_request(req); | 352 | nfs_list_remove_request(req); |
359 | nfs_list_add_request(req, &data->pages); | 353 | nfs_list_add_request(req, &data->pages); |
360 | ClearPageError(req->wb_page); | ||
361 | *pages++ = req->wb_page; | 354 | *pages++ = req->wb_page; |
362 | } | 355 | } |
363 | req = nfs_list_entry(data->pages.next); | 356 | req = nfs_list_entry(data->pages.next); |
@@ -435,7 +428,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data | |||
435 | argp->offset += resp->count; | 428 | argp->offset += resp->count; |
436 | argp->pgbase += resp->count; | 429 | argp->pgbase += resp->count; |
437 | argp->count -= resp->count; | 430 | argp->count -= resp->count; |
438 | nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); | 431 | rpc_restart_call_prepare(task); |
439 | } | 432 | } |
440 | 433 | ||
441 | /* | 434 | /* |
@@ -462,10 +455,10 @@ static void nfs_readpage_release_partial(void *calldata) | |||
462 | int status = data->task.tk_status; | 455 | int status = data->task.tk_status; |
463 | 456 | ||
464 | if (status < 0) | 457 | if (status < 0) |
465 | SetPageError(page); | 458 | set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); |
466 | 459 | ||
467 | if (atomic_dec_and_test(&req->wb_complete)) { | 460 | if (atomic_dec_and_test(&req->wb_complete)) { |
468 | if (!PageError(page)) | 461 | if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) |
469 | SetPageUptodate(page); | 462 | SetPageUptodate(page); |
470 | nfs_readpage_release(req); | 463 | nfs_readpage_release(req); |
471 | } | 464 | } |
@@ -541,13 +534,23 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) | |||
541 | static void nfs_readpage_release_full(void *calldata) | 534 | static void nfs_readpage_release_full(void *calldata) |
542 | { | 535 | { |
543 | struct nfs_read_data *data = calldata; | 536 | struct nfs_read_data *data = calldata; |
537 | struct nfs_pageio_descriptor pgio; | ||
544 | 538 | ||
539 | if (data->pnfs_error) { | ||
540 | nfs_pageio_init_read_mds(&pgio, data->inode); | ||
541 | pgio.pg_recoalesce = 1; | ||
542 | } | ||
545 | while (!list_empty(&data->pages)) { | 543 | while (!list_empty(&data->pages)) { |
546 | struct nfs_page *req = nfs_list_entry(data->pages.next); | 544 | struct nfs_page *req = nfs_list_entry(data->pages.next); |
547 | 545 | ||
548 | nfs_list_remove_request(req); | 546 | nfs_list_remove_request(req); |
549 | nfs_readpage_release(req); | 547 | if (!data->pnfs_error) |
548 | nfs_readpage_release(req); | ||
549 | else | ||
550 | nfs_pageio_add_request(&pgio, req); | ||
550 | } | 551 | } |
552 | if (data->pnfs_error) | ||
553 | nfs_pageio_complete(&pgio); | ||
551 | nfs_readdata_release(calldata); | 554 | nfs_readdata_release(calldata); |
552 | } | 555 | } |
553 | 556 | ||
@@ -648,7 +651,6 @@ readpage_async_filler(void *data, struct page *page) | |||
648 | return 0; | 651 | return 0; |
649 | out_error: | 652 | out_error: |
650 | error = PTR_ERR(new); | 653 | error = PTR_ERR(new); |
651 | SetPageError(page); | ||
652 | out_unlock: | 654 | out_unlock: |
653 | unlock_page(page); | 655 | unlock_page(page); |
654 | return error; | 656 | return error; |
@@ -711,16 +713,10 @@ int __init nfs_init_readpagecache(void) | |||
711 | if (nfs_rdata_cachep == NULL) | 713 | if (nfs_rdata_cachep == NULL) |
712 | return -ENOMEM; | 714 | return -ENOMEM; |
713 | 715 | ||
714 | nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, | ||
715 | nfs_rdata_cachep); | ||
716 | if (nfs_rdata_mempool == NULL) | ||
717 | return -ENOMEM; | ||
718 | |||
719 | return 0; | 716 | return 0; |
720 | } | 717 | } |
721 | 718 | ||
722 | void nfs_destroy_readpagecache(void) | 719 | void nfs_destroy_readpagecache(void) |
723 | { | 720 | { |
724 | mempool_destroy(nfs_rdata_mempool); | ||
725 | kmem_cache_destroy(nfs_rdata_cachep); | 721 | kmem_cache_destroy(nfs_rdata_cachep); |
726 | } | 722 | } |
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b961ceac66b4..480b3b6bf71e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -733,18 +733,22 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
733 | 733 | ||
734 | return 0; | 734 | return 0; |
735 | } | 735 | } |
736 | |||
737 | #ifdef CONFIG_NFS_V4 | ||
736 | #ifdef CONFIG_NFS_V4_1 | 738 | #ifdef CONFIG_NFS_V4_1 |
737 | void show_sessions(struct seq_file *m, struct nfs_server *server) | 739 | static void show_sessions(struct seq_file *m, struct nfs_server *server) |
738 | { | 740 | { |
739 | if (nfs4_has_session(server->nfs_client)) | 741 | if (nfs4_has_session(server->nfs_client)) |
740 | seq_printf(m, ",sessions"); | 742 | seq_printf(m, ",sessions"); |
741 | } | 743 | } |
742 | #else | 744 | #else |
743 | void show_sessions(struct seq_file *m, struct nfs_server *server) {} | 745 | static void show_sessions(struct seq_file *m, struct nfs_server *server) {} |
746 | #endif | ||
744 | #endif | 747 | #endif |
745 | 748 | ||
749 | #ifdef CONFIG_NFS_V4 | ||
746 | #ifdef CONFIG_NFS_V4_1 | 750 | #ifdef CONFIG_NFS_V4_1 |
747 | void show_pnfs(struct seq_file *m, struct nfs_server *server) | 751 | static void show_pnfs(struct seq_file *m, struct nfs_server *server) |
748 | { | 752 | { |
749 | seq_printf(m, ",pnfs="); | 753 | seq_printf(m, ",pnfs="); |
750 | if (server->pnfs_curr_ld) | 754 | if (server->pnfs_curr_ld) |
@@ -752,9 +756,10 @@ void show_pnfs(struct seq_file *m, struct nfs_server *server) | |||
752 | else | 756 | else |
753 | seq_printf(m, "not configured"); | 757 | seq_printf(m, "not configured"); |
754 | } | 758 | } |
755 | #else /* CONFIG_NFS_V4_1 */ | 759 | #else |
756 | void show_pnfs(struct seq_file *m, struct nfs_server *server) {} | 760 | static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} |
757 | #endif /* CONFIG_NFS_V4_1 */ | 761 | #endif |
762 | #endif | ||
758 | 763 | ||
759 | static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) | 764 | static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) |
760 | { | 765 | { |
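Both helpers become static and gain an outer CONFIG_NFS_V4 guard around the existing CONFIG_NFS_V4_1 one, so they exist only when NFSv4 is configured at all and do real work only with v4.1. The guard nesting, reduced to a compilable skeleton (the config macros are set by hand here):

#include <stdio.h>

#define CONFIG_NFS_V4 1
/* #define CONFIG_NFS_V4_1 1 */

#ifdef CONFIG_NFS_V4
# ifdef CONFIG_NFS_V4_1
static void show_sessions(void) { printf(",sessions"); }
# else
static void show_sessions(void) {}	/* v4.0 only: nothing to print */
# endif
#endif

int main(void) { show_sessions(); return 0; }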
@@ -2035,9 +2040,6 @@ static inline void nfs_initialise_sb(struct super_block *sb) | |||
2035 | sb->s_blocksize = nfs_block_bits(server->wsize, | 2040 | sb->s_blocksize = nfs_block_bits(server->wsize, |
2036 | &sb->s_blocksize_bits); | 2041 | &sb->s_blocksize_bits); |
2037 | 2042 | ||
2038 | if (server->flags & NFS_MOUNT_NOAC) | ||
2039 | sb->s_flags |= MS_SYNCHRONOUS; | ||
2040 | |||
2041 | sb->s_bdi = &server->backing_dev_info; | 2043 | sb->s_bdi = &server->backing_dev_info; |
2042 | 2044 | ||
2043 | nfs_super_set_maxbytes(sb, server->maxfilesize); | 2045 | nfs_super_set_maxbytes(sb, server->maxfilesize); |
@@ -2249,6 +2251,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, | |||
2249 | if (server->flags & NFS_MOUNT_UNSHARED) | 2251 | if (server->flags & NFS_MOUNT_UNSHARED) |
2250 | compare_super = NULL; | 2252 | compare_super = NULL; |
2251 | 2253 | ||
2254 | /* -o noac implies -o sync */ | ||
2255 | if (server->flags & NFS_MOUNT_NOAC) | ||
2256 | sb_mntdata.mntflags |= MS_SYNCHRONOUS; | ||
2257 | |||
2252 | /* Get a superblock - note that we may end up sharing one that already exists */ | 2258 | /* Get a superblock - note that we may end up sharing one that already exists */ |
2253 | s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); | 2259 | s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); |
2254 | if (IS_ERR(s)) { | 2260 | if (IS_ERR(s)) { |
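Moving the noac handling from nfs_initialise_sb into the mount paths means MS_SYNCHRONOUS is part of the mount data that sget() compares when deciding whether to share a superblock; patching sb->s_flags after the fact could silently reuse a non-synchronous superblock. A toy sketch of the ordering (flag values are illustrative); the same two lines recur in nfs_xdev_mount, nfs4_remote_mount, nfs4_xdev_mount and nfs4_remote_referral_mount below:

#include <stdio.h>

#define NFS_MOUNT_NOAC 0x0020	/* illustrative values */
#define MS_SYNCHRONOUS 0x0010

struct mntdata { unsigned mntflags; };

/* sget() stand-in: superblock sharing keys off the mount data, so
 * MS_SYNCHRONOUS must already be set by the time this is called. */
static void sget_stub(const struct mntdata *d)
{
	printf("sget sees mntflags=0x%x\n", d->mntflags);
}

int main(void)
{
	unsigned server_flags = NFS_MOUNT_NOAC;
	struct mntdata sb_mntdata = { 0 };

	if (server_flags & NFS_MOUNT_NOAC)	/* -o noac implies -o sync */
		sb_mntdata.mntflags |= MS_SYNCHRONOUS;
	sget_stub(&sb_mntdata);
	return 0;
}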
@@ -2361,6 +2367,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, | |||
2361 | if (server->flags & NFS_MOUNT_UNSHARED) | 2367 | if (server->flags & NFS_MOUNT_UNSHARED) |
2362 | compare_super = NULL; | 2368 | compare_super = NULL; |
2363 | 2369 | ||
2370 | /* -o noac implies -o sync */ | ||
2371 | if (server->flags & NFS_MOUNT_NOAC) | ||
2372 | sb_mntdata.mntflags |= MS_SYNCHRONOUS; | ||
2373 | |||
2364 | /* Get a superblock - note that we may end up sharing one that already exists */ | 2374 | /* Get a superblock - note that we may end up sharing one that already exists */ |
2365 | s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); | 2375 | s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); |
2366 | if (IS_ERR(s)) { | 2376 | if (IS_ERR(s)) { |
@@ -2628,6 +2638,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags, | |||
2628 | if (server->flags & NFS4_MOUNT_UNSHARED) | 2638 | if (server->flags & NFS4_MOUNT_UNSHARED) |
2629 | compare_super = NULL; | 2639 | compare_super = NULL; |
2630 | 2640 | ||
2641 | /* -o noac implies -o sync */ | ||
2642 | if (server->flags & NFS_MOUNT_NOAC) | ||
2643 | sb_mntdata.mntflags |= MS_SYNCHRONOUS; | ||
2644 | |||
2631 | /* Get a superblock - note that we may end up sharing one that already exists */ | 2645 | /* Get a superblock - note that we may end up sharing one that already exists */ |
2632 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); | 2646 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); |
2633 | if (IS_ERR(s)) { | 2647 | if (IS_ERR(s)) { |
@@ -2789,7 +2803,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, | |||
2789 | goto out_put_mnt_ns; | 2803 | goto out_put_mnt_ns; |
2790 | 2804 | ||
2791 | ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, | 2805 | ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, |
2792 | export_path, LOOKUP_FOLLOW, &path); | 2806 | export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); |
2793 | 2807 | ||
2794 | nfs_referral_loop_unprotect(); | 2808 | nfs_referral_loop_unprotect(); |
2795 | put_mnt_ns(ns_private); | 2809 | put_mnt_ns(ns_private); |
@@ -2916,6 +2930,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags, | |||
2916 | if (server->flags & NFS4_MOUNT_UNSHARED) | 2930 | if (server->flags & NFS4_MOUNT_UNSHARED) |
2917 | compare_super = NULL; | 2931 | compare_super = NULL; |
2918 | 2932 | ||
2933 | /* -o noac implies -o sync */ | ||
2934 | if (server->flags & NFS_MOUNT_NOAC) | ||
2935 | sb_mntdata.mntflags |= MS_SYNCHRONOUS; | ||
2936 | |||
2919 | /* Get a superblock - note that we may end up sharing one that already exists */ | 2937 | /* Get a superblock - note that we may end up sharing one that already exists */ |
2920 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); | 2938 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); |
2921 | if (IS_ERR(s)) { | 2939 | if (IS_ERR(s)) { |
@@ -3003,6 +3021,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, | |||
3003 | if (server->flags & NFS4_MOUNT_UNSHARED) | 3021 | if (server->flags & NFS4_MOUNT_UNSHARED) |
3004 | compare_super = NULL; | 3022 | compare_super = NULL; |
3005 | 3023 | ||
3024 | /* -o noac implies -o sync */ | ||
3025 | if (server->flags & NFS_MOUNT_NOAC) | ||
3026 | sb_mntdata.mntflags |= MS_SYNCHRONOUS; | ||
3027 | |||
3006 | /* Get a superblock - note that we may end up sharing one that already exists */ | 3028 | /* Get a superblock - note that we may end up sharing one that already exists */ |
3007 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); | 3029 | s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); |
3008 | if (IS_ERR(s)) { | 3030 | if (IS_ERR(s)) { |
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b2fbbde58e44..4f9319a2e567 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c | |||
@@ -87,7 +87,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) | |||
87 | struct inode *dir = data->dir; | 87 | struct inode *dir = data->dir; |
88 | 88 | ||
89 | if (!NFS_PROTO(dir)->unlink_done(task, dir)) | 89 | if (!NFS_PROTO(dir)->unlink_done(task, dir)) |
90 | nfs_restart_rpc(task, NFS_SERVER(dir)->nfs_client); | 90 | rpc_restart_call_prepare(task); |
91 | } | 91 | } |
92 | 92 | ||
93 | /** | 93 | /** |
@@ -369,7 +369,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) | |||
369 | struct dentry *new_dentry = data->new_dentry; | 369 | struct dentry *new_dentry = data->new_dentry; |
370 | 370 | ||
371 | if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { | 371 | if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { |
372 | nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); | 372 | rpc_restart_call_prepare(task); |
373 | return; | 373 | return; |
374 | } | 374 | } |
375 | 375 | ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b39b37f80913..2219c88d96b2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -390,7 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) | |||
390 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); | 390 | error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); |
391 | BUG_ON(error); | 391 | BUG_ON(error); |
392 | if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) | 392 | if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) |
393 | nfsi->change_attr++; | 393 | inode->i_version++; |
394 | set_bit(PG_MAPPED, &req->wb_flags); | 394 | set_bit(PG_MAPPED, &req->wb_flags); |
395 | SetPagePrivate(req->wb_page); | 395 | SetPagePrivate(req->wb_page); |
396 | set_page_private(req->wb_page, (unsigned long)req); | 396 | set_page_private(req->wb_page, (unsigned long)req); |
@@ -428,7 +428,6 @@ static void | |||
428 | nfs_mark_request_dirty(struct nfs_page *req) | 428 | nfs_mark_request_dirty(struct nfs_page *req) |
429 | { | 429 | { |
430 | __set_page_dirty_nobuffers(req->wb_page); | 430 | __set_page_dirty_nobuffers(req->wb_page); |
431 | __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); | ||
432 | } | 431 | } |
433 | 432 | ||
434 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) | 433 | #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) |
@@ -762,6 +761,8 @@ int nfs_updatepage(struct file *file, struct page *page, | |||
762 | status = nfs_writepage_setup(ctx, page, offset, count); | 761 | status = nfs_writepage_setup(ctx, page, offset, count); |
763 | if (status < 0) | 762 | if (status < 0) |
764 | nfs_set_pageerror(page); | 763 | nfs_set_pageerror(page); |
764 | else | ||
765 | __set_page_dirty_nobuffers(page); | ||
765 | 766 | ||
766 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", | 767 | dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", |
767 | status, (long long)i_size_read(inode)); | 768 | status, (long long)i_size_read(inode)); |
@@ -958,7 +959,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head | |||
958 | if (!data) | 959 | if (!data) |
959 | goto out_bad; | 960 | goto out_bad; |
960 | data->pagevec[0] = page; | 961 | data->pagevec[0] = page; |
961 | nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); | 962 | nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags); |
962 | list_add(&data->list, res); | 963 | list_add(&data->list, res); |
963 | requests++; | 964 | requests++; |
964 | nbytes -= len; | 965 | nbytes -= len; |
@@ -1010,7 +1011,6 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *r | |||
1010 | req = nfs_list_entry(head->next); | 1011 | req = nfs_list_entry(head->next); |
1011 | nfs_list_remove_request(req); | 1012 | nfs_list_remove_request(req); |
1012 | nfs_list_add_request(req, &data->pages); | 1013 | nfs_list_add_request(req, &data->pages); |
1013 | ClearPageError(req->wb_page); | ||
1014 | *pages++ = req->wb_page; | 1014 | *pages++ = req->wb_page; |
1015 | } | 1015 | } |
1016 | req = nfs_list_entry(data->pages.next); | 1016 | req = nfs_list_entry(data->pages.next); |
@@ -1165,7 +1165,13 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) | |||
1165 | static void nfs_writeback_release_full(void *calldata) | 1165 | static void nfs_writeback_release_full(void *calldata) |
1166 | { | 1166 | { |
1167 | struct nfs_write_data *data = calldata; | 1167 | struct nfs_write_data *data = calldata; |
1168 | int status = data->task.tk_status; | 1168 | int ret, status = data->task.tk_status; |
1169 | struct nfs_pageio_descriptor pgio; | ||
1170 | |||
1171 | if (data->pnfs_error) { | ||
1172 | nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE); | ||
1173 | pgio.pg_recoalesce = 1; | ||
1174 | } | ||
1169 | 1175 | ||
1170 | /* Update attributes as result of writeback. */ | 1176 | /* Update attributes as result of writeback. */ |
1171 | while (!list_empty(&data->pages)) { | 1177 | while (!list_empty(&data->pages)) { |
@@ -1181,6 +1187,11 @@ static void nfs_writeback_release_full(void *calldata) | |||
1181 | req->wb_bytes, | 1187 | req->wb_bytes, |
1182 | (long long)req_offset(req)); | 1188 | (long long)req_offset(req)); |
1183 | 1189 | ||
1190 | if (data->pnfs_error) { | ||
1191 | dprintk(", pnfs error = %d\n", data->pnfs_error); | ||
1192 | goto next; | ||
1193 | } | ||
1194 | |||
1184 | if (status < 0) { | 1195 | if (status < 0) { |
1185 | nfs_set_pageerror(page); | 1196 | nfs_set_pageerror(page); |
1186 | nfs_context_set_write_error(req->wb_context, status); | 1197 | nfs_context_set_write_error(req->wb_context, status); |
@@ -1200,7 +1211,19 @@ remove_request: | |||
1200 | next: | 1211 | next: |
1201 | nfs_clear_page_tag_locked(req); | 1212 | nfs_clear_page_tag_locked(req); |
1202 | nfs_end_page_writeback(page); | 1213 | nfs_end_page_writeback(page); |
1214 | if (data->pnfs_error) { | ||
1215 | lock_page(page); | ||
1216 | nfs_pageio_cond_complete(&pgio, page->index); | ||
1217 | ret = nfs_page_async_flush(&pgio, page, 0); | ||
1218 | if (ret) { | ||
1219 | nfs_set_pageerror(page); | ||
1220 | dprintk("rewrite to MDS error = %d\n", ret); | ||
1221 | } | ||
1222 | unlock_page(page); | ||
1223 | } | ||
1203 | } | 1224 | } |
1225 | if (data->pnfs_error) | ||
1226 | nfs_pageio_complete(&pgio); | ||
1204 | nfs_writedata_release(calldata); | 1227 | nfs_writedata_release(calldata); |
1205 | } | 1228 | } |
1206 | 1229 | ||
@@ -1281,7 +1304,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) | |||
1281 | */ | 1304 | */ |
1282 | argp->stable = NFS_FILE_SYNC; | 1305 | argp->stable = NFS_FILE_SYNC; |
1283 | } | 1306 | } |
1284 | nfs_restart_rpc(task, server->nfs_client); | 1307 | rpc_restart_call_prepare(task); |
1285 | return; | 1308 | return; |
1286 | } | 1309 | } |
1287 | if (time_before(complain, jiffies)) { | 1310 | if (time_before(complain, jiffies)) { |
@@ -1553,6 +1576,10 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr | |||
1553 | int flags = FLUSH_SYNC; | 1576 | int flags = FLUSH_SYNC; |
1554 | int ret = 0; | 1577 | int ret = 0; |
1555 | 1578 | ||
1579 | /* no commits means nothing needs to be done */ | ||
1580 | if (!nfsi->ncommit) | ||
1581 | return ret; | ||
1582 | |||
1556 | if (wbc->sync_mode == WB_SYNC_NONE) { | 1583 | if (wbc->sync_mode == WB_SYNC_NONE) { |
1557 | /* Don't commit yet if this is a non-blocking flush and there | 1584 | /* Don't commit yet if this is a non-blocking flush and there |
1558 | * are a lot of outstanding writes for this mapping. | 1585 | * are a lot of outstanding writes for this mapping. |
@@ -1686,34 +1713,20 @@ out_error: | |||
1686 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, | 1713 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, |
1687 | struct page *page) | 1714 | struct page *page) |
1688 | { | 1715 | { |
1689 | struct nfs_page *req; | 1716 | /* |
1690 | int ret; | 1717 | * If PagePrivate is set, then the page is currently associated with |
1718 | * an in-progress read or write request. Don't try to migrate it. | ||
1719 | * | ||
1720 | * FIXME: we could do this in principle, but we'll need a way to ensure | ||
1721 | * that we can safely release the inode reference while holding | ||
1722 | * the page lock. | ||
1723 | */ | ||
1724 | if (PagePrivate(page)) | ||
1725 | return -EBUSY; | ||
1691 | 1726 | ||
1692 | nfs_fscache_release_page(page, GFP_KERNEL); | 1727 | nfs_fscache_release_page(page, GFP_KERNEL); |
1693 | 1728 | ||
1694 | req = nfs_find_and_lock_request(page, false); | 1729 | return migrate_page(mapping, newpage, page); |
1695 | ret = PTR_ERR(req); | ||
1696 | if (IS_ERR(req)) | ||
1697 | goto out; | ||
1698 | |||
1699 | ret = migrate_page(mapping, newpage, page); | ||
1700 | if (!req) | ||
1701 | goto out; | ||
1702 | if (ret) | ||
1703 | goto out_unlock; | ||
1704 | page_cache_get(newpage); | ||
1705 | spin_lock(&mapping->host->i_lock); | ||
1706 | req->wb_page = newpage; | ||
1707 | SetPagePrivate(newpage); | ||
1708 | set_page_private(newpage, (unsigned long)req); | ||
1709 | ClearPagePrivate(page); | ||
1710 | set_page_private(page, 0); | ||
1711 | spin_unlock(&mapping->host->i_lock); | ||
1712 | page_cache_release(page); | ||
1713 | out_unlock: | ||
1714 | nfs_clear_page_tag_locked(req); | ||
1715 | out: | ||
1716 | return ret; | ||
1717 | } | 1730 | } |
1718 | #endif | 1731 | #endif |
1719 | 1732 | ||
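The rewritten nfs_migrate_page gives up on pages with PagePrivate set rather than transplanting the nfs_page to the new page, per the FIXME in the hunk about releasing the inode reference under the page lock. A small stand-alone sketch of the early-return shape (types and helpers here are stand-ins, not kernel APIs):

#include <errno.h>
#include <stdio.h>

struct page { int has_private; };

static int page_has_private(const struct page *p) { return p->has_private; }
static int do_migrate(struct page *p) { (void)p; return 0; }	/* stand-in */

/* A page owned by an in-flight NFS request is simply not migrated,
 * sidestepping the lock-ordering problem noted in the FIXME. */
static int migrate_if_idle(struct page *page)
{
	if (page_has_private(page))
		return -EBUSY;
	return do_migrate(page);
}

int main(void)
{
	struct page busy = { 1 };
	printf("busy page -> %d (EBUSY=%d)\n", migrate_if_idle(&busy), EBUSY);
	return 0;
}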
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index f4cc1e2bfc54..62f3b9074e84 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/exportfs.h> | 17 | #include <linux/exportfs.h> |
18 | 18 | ||
19 | #include <linux/nfsd/syscall.h> | ||
20 | #include <net/ipv6.h> | 19 | #include <net/ipv6.h> |
21 | 20 | ||
22 | #include "nfsd.h" | 21 | #include "nfsd.h" |
@@ -318,7 +317,6 @@ static void svc_export_put(struct kref *ref) | |||
318 | struct svc_export *exp = container_of(ref, struct svc_export, h.ref); | 317 | struct svc_export *exp = container_of(ref, struct svc_export, h.ref); |
319 | path_put(&exp->ex_path); | 318 | path_put(&exp->ex_path); |
320 | auth_domain_put(exp->ex_client); | 319 | auth_domain_put(exp->ex_client); |
321 | kfree(exp->ex_pathname); | ||
322 | nfsd4_fslocs_free(&exp->ex_fslocs); | 320 | nfsd4_fslocs_free(&exp->ex_fslocs); |
323 | kfree(exp); | 321 | kfree(exp); |
324 | } | 322 | } |
@@ -528,11 +526,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) | |||
528 | 526 | ||
529 | exp.ex_client = dom; | 527 | exp.ex_client = dom; |
530 | 528 | ||
531 | err = -ENOMEM; | ||
532 | exp.ex_pathname = kstrdup(buf, GFP_KERNEL); | ||
533 | if (!exp.ex_pathname) | ||
534 | goto out2; | ||
535 | |||
536 | /* expiry */ | 529 | /* expiry */ |
537 | err = -EINVAL; | 530 | err = -EINVAL; |
538 | exp.h.expiry_time = get_expiry(&mesg); | 531 | exp.h.expiry_time = get_expiry(&mesg); |
@@ -613,8 +606,6 @@ out4: | |||
613 | nfsd4_fslocs_free(&exp.ex_fslocs); | 606 | nfsd4_fslocs_free(&exp.ex_fslocs); |
614 | kfree(exp.ex_uuid); | 607 | kfree(exp.ex_uuid); |
615 | out3: | 608 | out3: |
616 | kfree(exp.ex_pathname); | ||
617 | out2: | ||
618 | path_put(&exp.ex_path); | 609 | path_put(&exp.ex_path); |
619 | out1: | 610 | out1: |
620 | auth_domain_put(dom); | 611 | auth_domain_put(dom); |
@@ -678,7 +669,6 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) | |||
678 | new->ex_client = item->ex_client; | 669 | new->ex_client = item->ex_client; |
679 | new->ex_path.dentry = dget(item->ex_path.dentry); | 670 | new->ex_path.dentry = dget(item->ex_path.dentry); |
680 | new->ex_path.mnt = mntget(item->ex_path.mnt); | 671 | new->ex_path.mnt = mntget(item->ex_path.mnt); |
681 | new->ex_pathname = NULL; | ||
682 | new->ex_fslocs.locations = NULL; | 672 | new->ex_fslocs.locations = NULL; |
683 | new->ex_fslocs.locations_count = 0; | 673 | new->ex_fslocs.locations_count = 0; |
684 | new->ex_fslocs.migrated = 0; | 674 | new->ex_fslocs.migrated = 0; |
@@ -696,8 +686,6 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) | |||
696 | new->ex_fsid = item->ex_fsid; | 686 | new->ex_fsid = item->ex_fsid; |
697 | new->ex_uuid = item->ex_uuid; | 687 | new->ex_uuid = item->ex_uuid; |
698 | item->ex_uuid = NULL; | 688 | item->ex_uuid = NULL; |
699 | new->ex_pathname = item->ex_pathname; | ||
700 | item->ex_pathname = NULL; | ||
701 | new->ex_fslocs.locations = item->ex_fslocs.locations; | 689 | new->ex_fslocs.locations = item->ex_fslocs.locations; |
702 | item->ex_fslocs.locations = NULL; | 690 | item->ex_fslocs.locations = NULL; |
703 | new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; | 691 | new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; |
@@ -1010,7 +998,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) | |||
1010 | return exp; | 998 | return exp; |
1011 | } | 999 | } |
1012 | 1000 | ||
1013 | static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) | 1001 | struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) |
1014 | { | 1002 | { |
1015 | u32 fsidv[2]; | 1003 | u32 fsidv[2]; |
1016 | 1004 | ||
@@ -1030,7 +1018,7 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) | |||
1030 | struct svc_export *exp; | 1018 | struct svc_export *exp; |
1031 | __be32 rv; | 1019 | __be32 rv; |
1032 | 1020 | ||
1033 | exp = find_fsidzero_export(rqstp); | 1021 | exp = rqst_find_fsidzero_export(rqstp); |
1034 | if (IS_ERR(exp)) | 1022 | if (IS_ERR(exp)) |
1035 | return nfserrno(PTR_ERR(exp)); | 1023 | return nfserrno(PTR_ERR(exp)); |
1036 | rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); | 1024 | rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); |
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 02eb4edf0ece..7748d6a18d97 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c | |||
@@ -39,6 +39,8 @@ | |||
39 | 39 | ||
40 | #define NFSDDBG_FACILITY NFSDDBG_PROC | 40 | #define NFSDDBG_FACILITY NFSDDBG_PROC |
41 | 41 | ||
42 | static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); | ||
43 | |||
42 | #define NFSPROC4_CB_NULL 0 | 44 | #define NFSPROC4_CB_NULL 0 |
43 | #define NFSPROC4_CB_COMPOUND 1 | 45 | #define NFSPROC4_CB_COMPOUND 1 |
44 | 46 | ||
@@ -351,7 +353,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, | |||
351 | __be32 *p; | 353 | __be32 *p; |
352 | 354 | ||
353 | encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); | 355 | encode_nfs_cb_opnum4(xdr, OP_CB_RECALL); |
354 | encode_stateid4(xdr, &dp->dl_stateid); | 356 | encode_stateid4(xdr, &dp->dl_stid.sc_stateid); |
355 | 357 | ||
356 | p = xdr_reserve_space(xdr, 4); | 358 | p = xdr_reserve_space(xdr, 4); |
357 | *p++ = xdr_zero; /* truncate */ | 359 | *p++ = xdr_zero; /* truncate */ |
@@ -460,6 +462,8 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, | |||
460 | */ | 462 | */ |
461 | status = 0; | 463 | status = 0; |
462 | out: | 464 | out: |
465 | if (status) | ||
466 | nfsd4_mark_cb_fault(cb->cb_clp, status); | ||
463 | return status; | 467 | return status; |
464 | out_overflow: | 468 | out_overflow: |
465 | print_overflow_msg(__func__, xdr); | 469 | print_overflow_msg(__func__, xdr); |
@@ -686,6 +690,12 @@ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) | |||
686 | warn_no_callback_path(clp, reason); | 690 | warn_no_callback_path(clp, reason); |
687 | } | 691 | } |
688 | 692 | ||
693 | static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) | ||
694 | { | ||
695 | clp->cl_cb_state = NFSD4_CB_FAULT; | ||
696 | warn_no_callback_path(clp, reason); | ||
697 | } | ||
698 | |||
689 | static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) | 699 | static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) |
690 | { | 700 | { |
691 | struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); | 701 | struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); |
@@ -787,7 +797,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) | |||
787 | { | 797 | { |
788 | struct nfsd4_callback *cb = calldata; | 798 | struct nfsd4_callback *cb = calldata; |
789 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); | 799 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); |
790 | struct nfs4_client *clp = dp->dl_client; | 800 | struct nfs4_client *clp = dp->dl_stid.sc_client; |
791 | u32 minorversion = clp->cl_minorversion; | 801 | u32 minorversion = clp->cl_minorversion; |
792 | 802 | ||
793 | cb->cb_minorversion = minorversion; | 803 | cb->cb_minorversion = minorversion; |
@@ -809,7 +819,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) | |||
809 | { | 819 | { |
810 | struct nfsd4_callback *cb = calldata; | 820 | struct nfsd4_callback *cb = calldata; |
811 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); | 821 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); |
812 | struct nfs4_client *clp = dp->dl_client; | 822 | struct nfs4_client *clp = dp->dl_stid.sc_client; |
813 | 823 | ||
814 | dprintk("%s: minorversion=%d\n", __func__, | 824 | dprintk("%s: minorversion=%d\n", __func__, |
815 | clp->cl_minorversion); | 825 | clp->cl_minorversion); |
@@ -832,7 +842,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) | |||
832 | { | 842 | { |
833 | struct nfsd4_callback *cb = calldata; | 843 | struct nfsd4_callback *cb = calldata; |
834 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); | 844 | struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); |
835 | struct nfs4_client *clp = dp->dl_client; | 845 | struct nfs4_client *clp = dp->dl_stid.sc_client; |
836 | struct rpc_clnt *current_rpc_client = clp->cl_cb_client; | 846 | struct rpc_clnt *current_rpc_client = clp->cl_cb_client; |
837 | 847 | ||
838 | nfsd4_cb_done(task, calldata); | 848 | nfsd4_cb_done(task, calldata); |
@@ -1006,7 +1016,7 @@ void nfsd4_do_callback_rpc(struct work_struct *w) | |||
1006 | void nfsd4_cb_recall(struct nfs4_delegation *dp) | 1016 | void nfsd4_cb_recall(struct nfs4_delegation *dp) |
1007 | { | 1017 | { |
1008 | struct nfsd4_callback *cb = &dp->dl_recall; | 1018 | struct nfsd4_callback *cb = &dp->dl_recall; |
1009 | struct nfs4_client *clp = dp->dl_client; | 1019 | struct nfs4_client *clp = dp->dl_stid.sc_client; |
1010 | 1020 | ||
1011 | dp->dl_retries = 1; | 1021 | dp->dl_retries = 1; |
1012 | cb->cb_op = dp; | 1022 | cb->cb_op = dp; |
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e80777666618..fa383361bc61 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/file.h> | 35 | #include <linux/file.h> |
36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
37 | 37 | ||
38 | #include "idmap.h" | ||
38 | #include "cache.h" | 39 | #include "cache.h" |
39 | #include "xdr4.h" | 40 | #include "xdr4.h" |
40 | #include "vfs.h" | 41 | #include "vfs.h" |
@@ -156,6 +157,8 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs | |||
156 | !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) | 157 | !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) |
157 | return nfserr_inval; | 158 | return nfserr_inval; |
158 | 159 | ||
160 | accmode |= NFSD_MAY_READ_IF_EXEC; | ||
161 | |||
159 | if (open->op_share_access & NFS4_SHARE_ACCESS_READ) | 162 | if (open->op_share_access & NFS4_SHARE_ACCESS_READ) |
160 | accmode |= NFSD_MAY_READ; | 163 | accmode |= NFSD_MAY_READ; |
161 | if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) | 164 | if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) |
@@ -168,12 +171,29 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs | |||
168 | return status; | 171 | return status; |
169 | } | 172 | } |
170 | 173 | ||
174 | static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) | ||
175 | { | ||
176 | umode_t mode = fh->fh_dentry->d_inode->i_mode; | ||
177 | |||
178 | if (S_ISREG(mode)) | ||
179 | return nfs_ok; | ||
180 | if (S_ISDIR(mode)) | ||
181 | return nfserr_isdir; | ||
182 | /* | ||
183 | * Using err_symlink as our catch-all case may look odd; but | ||
184 | * there's no other obvious error for this case in 4.0, and we | ||
185 | * happen to know that it will cause the linux v4 client to do | ||
186 | * the right thing on attempts to open something other than a | ||
187 | * regular file. | ||
188 | */ | ||
189 | return nfserr_symlink; | ||
190 | } | ||
191 | |||
171 | static __be32 | 192 | static __be32 |
172 | do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) | 193 | do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) |
173 | { | 194 | { |
174 | struct svc_fh resfh; | 195 | struct svc_fh resfh; |
175 | __be32 status; | 196 | __be32 status; |
176 | int created = 0; | ||
177 | 197 | ||
178 | fh_init(&resfh, NFS4_FHSIZE); | 198 | fh_init(&resfh, NFS4_FHSIZE); |
179 | open->op_truncate = 0; | 199 | open->op_truncate = 0; |
@@ -202,7 +222,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o | |||
202 | open->op_fname.len, &open->op_iattr, | 222 | open->op_fname.len, &open->op_iattr, |
203 | &resfh, open->op_createmode, | 223 | &resfh, open->op_createmode, |
204 | (u32 *)open->op_verf.data, | 224 | (u32 *)open->op_verf.data, |
205 | &open->op_truncate, &created); | 225 | &open->op_truncate, &open->op_created); |
206 | 226 | ||
207 | /* | 227 | /* |
208 | * Following rfc 3530 14.2.16, use the returned bitmask | 228 | * Following rfc 3530 14.2.16, use the returned bitmask |
@@ -216,6 +236,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o | |||
216 | status = nfsd_lookup(rqstp, current_fh, | 236 | status = nfsd_lookup(rqstp, current_fh, |
217 | open->op_fname.data, open->op_fname.len, &resfh); | 237 | open->op_fname.data, open->op_fname.len, &resfh); |
218 | fh_unlock(current_fh); | 238 | fh_unlock(current_fh); |
239 | if (status) | ||
240 | goto out; | ||
241 | status = nfsd_check_obj_isreg(&resfh); | ||
219 | } | 242 | } |
220 | if (status) | 243 | if (status) |
221 | goto out; | 244 | goto out; |
@@ -227,9 +250,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o | |||
227 | fh_dup2(current_fh, &resfh); | 250 | fh_dup2(current_fh, &resfh); |
228 | 251 | ||
229 | /* set reply cache */ | 252 | /* set reply cache */ |
230 | fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, | 253 | fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, |
231 | &resfh.fh_handle); | 254 | &resfh.fh_handle); |
232 | if (!created) | 255 | if (!open->op_created) |
233 | status = do_open_permission(rqstp, current_fh, open, | 256 | status = do_open_permission(rqstp, current_fh, open, |
234 | NFSD_MAY_NOP); | 257 | NFSD_MAY_NOP); |
235 | 258 | ||
@@ -254,7 +277,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ | |||
254 | memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); | 277 | memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); |
255 | 278 | ||
256 | /* set replay cache */ | 279 | /* set replay cache */ |
257 | fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, | 280 | fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, |
258 | &current_fh->fh_handle); | 281 | &current_fh->fh_handle); |
259 | 282 | ||
260 | open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && | 283 | open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && |
@@ -283,14 +306,18 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
283 | __be32 status; | 306 | __be32 status; |
284 | struct nfsd4_compoundres *resp; | 307 | struct nfsd4_compoundres *resp; |
285 | 308 | ||
286 | dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", | 309 | dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", |
287 | (int)open->op_fname.len, open->op_fname.data, | 310 | (int)open->op_fname.len, open->op_fname.data, |
288 | open->op_stateowner); | 311 | open->op_openowner); |
289 | 312 | ||
290 | /* This check required by spec. */ | 313 | /* This check required by spec. */ |
291 | if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) | 314 | if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) |
292 | return nfserr_inval; | 315 | return nfserr_inval; |
293 | 316 | ||
317 | /* We don't yet support WANT bits: */ | ||
318 | open->op_share_access &= NFS4_SHARE_ACCESS_MASK; | ||
319 | |||
320 | open->op_created = 0; | ||
294 | /* | 321 | /* |
295 | * RFC5661 18.51.3 | 322 | * RFC5661 18.51.3 |
296 | * Before RECLAIM_COMPLETE done, server should deny new lock | 323 | * Before RECLAIM_COMPLETE done, server should deny new lock |
@@ -309,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
309 | resp = rqstp->rq_resp; | 336 | resp = rqstp->rq_resp; |
310 | status = nfsd4_process_open1(&resp->cstate, open); | 337 | status = nfsd4_process_open1(&resp->cstate, open); |
311 | if (status == nfserr_replay_me) { | 338 | if (status == nfserr_replay_me) { |
312 | struct nfs4_replay *rp = &open->op_stateowner->so_replay; | 339 | struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; |
313 | fh_put(&cstate->current_fh); | 340 | fh_put(&cstate->current_fh); |
314 | fh_copy_shallow(&cstate->current_fh.fh_handle, | 341 | fh_copy_shallow(&cstate->current_fh.fh_handle, |
315 | &rp->rp_openfh); | 342 | &rp->rp_openfh); |
@@ -339,32 +366,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
339 | switch (open->op_claim_type) { | 366 | switch (open->op_claim_type) { |
340 | case NFS4_OPEN_CLAIM_DELEGATE_CUR: | 367 | case NFS4_OPEN_CLAIM_DELEGATE_CUR: |
341 | case NFS4_OPEN_CLAIM_NULL: | 368 | case NFS4_OPEN_CLAIM_NULL: |
342 | /* | ||
343 | * (1) set CURRENT_FH to the file being opened, | ||
344 | * creating it if necessary, (2) set open->op_cinfo, | ||
345 | * (3) set open->op_truncate if the file is to be | ||
346 | * truncated after opening, (4) do permission checking. | ||
347 | */ | ||
348 | status = do_open_lookup(rqstp, &cstate->current_fh, | 369 | status = do_open_lookup(rqstp, &cstate->current_fh, |
349 | open); | 370 | open); |
350 | if (status) | 371 | if (status) |
351 | goto out; | 372 | goto out; |
352 | break; | 373 | break; |
353 | case NFS4_OPEN_CLAIM_PREVIOUS: | 374 | case NFS4_OPEN_CLAIM_PREVIOUS: |
354 | open->op_stateowner->so_confirmed = 1; | 375 | open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; |
355 | /* | 376 | case NFS4_OPEN_CLAIM_FH: |
356 | * The CURRENT_FH is already set to the file being | 377 | case NFS4_OPEN_CLAIM_DELEG_CUR_FH: |
357 | * opened. (1) set open->op_cinfo, (2) set | ||
358 | * open->op_truncate if the file is to be truncated | ||
359 | * after opening, (3) do permission checking. | ||
360 | */ | ||
361 | status = do_open_fhandle(rqstp, &cstate->current_fh, | 378 | status = do_open_fhandle(rqstp, &cstate->current_fh, |
362 | open); | 379 | open); |
363 | if (status) | 380 | if (status) |
364 | goto out; | 381 | goto out; |
365 | break; | 382 | break; |
383 | case NFS4_OPEN_CLAIM_DELEG_PREV_FH: | ||
366 | case NFS4_OPEN_CLAIM_DELEGATE_PREV: | 384 | case NFS4_OPEN_CLAIM_DELEGATE_PREV: |
367 | open->op_stateowner->so_confirmed = 1; | 385 | open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; |
368 | dprintk("NFSD: unsupported OPEN claim type %d\n", | 386 | dprintk("NFSD: unsupported OPEN claim type %d\n", |
369 | open->op_claim_type); | 387 | open->op_claim_type); |
370 | status = nfserr_notsupp; | 388 | status = nfserr_notsupp; |
@@ -381,12 +399,13 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
381 | * set, (2) sets open->op_stateid, (3) sets open->op_delegation. | 399 | * set, (2) sets open->op_stateid, (3) sets open->op_delegation. |
382 | */ | 400 | */ |
383 | status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); | 401 | status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); |
402 | WARN_ON(status && open->op_created); | ||
384 | out: | 403 | out: |
385 | if (open->op_stateowner) { | 404 | nfsd4_cleanup_open_state(open, status); |
386 | nfs4_get_stateowner(open->op_stateowner); | 405 | if (open->op_openowner) |
387 | cstate->replay_owner = open->op_stateowner; | 406 | cstate->replay_owner = &open->op_openowner->oo_owner; |
388 | } | 407 | else |
389 | nfs4_unlock_state(); | 408 | nfs4_unlock_state(); |
390 | return status; | 409 | return status; |
391 | } | 410 | } |
392 | 411 | ||
@@ -467,17 +486,12 @@ static __be32 | |||
467 | nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 486 | nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
468 | struct nfsd4_commit *commit) | 487 | struct nfsd4_commit *commit) |
469 | { | 488 | { |
470 | __be32 status; | ||
471 | |||
472 | u32 *p = (u32 *)commit->co_verf.data; | 489 | u32 *p = (u32 *)commit->co_verf.data; |
473 | *p++ = nfssvc_boot.tv_sec; | 490 | *p++ = nfssvc_boot.tv_sec; |
474 | *p++ = nfssvc_boot.tv_usec; | 491 | *p++ = nfssvc_boot.tv_usec; |
475 | 492 | ||
476 | status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, | 493 | return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, |
477 | commit->co_count); | 494 | commit->co_count); |
478 | if (status == nfserr_symlink) | ||
479 | status = nfserr_inval; | ||
480 | return status; | ||
481 | } | 495 | } |
482 | 496 | ||
483 | static __be32 | 497 | static __be32 |
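
The COMMIT reply verifier is just the server boot time packed into two 32-bit words, so a client that sees the verifier change knows the server rebooted and must resend any uncommitted writes. A minimal userspace sketch of that packing (struct name hypothetical):

    #include <stdint.h>
    #include <string.h>
    #include <sys/time.h>

    struct nfs4_verifier_sk { uint8_t data[8]; };  /* stand-in for the 8-byte verifier */

    /* Pack server boot time into the verifier, as nfsd4_commit does above. */
    static void make_boot_verifier(struct nfs4_verifier_sk *v, struct timeval boot)
    {
        uint32_t words[2] = { (uint32_t)boot.tv_sec, (uint32_t)boot.tv_usec };
        memcpy(v->data, words, sizeof(words));
    }
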
@@ -492,8 +506,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
492 | 506 | ||
493 | status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, | 507 | status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, |
494 | NFSD_MAY_CREATE); | 508 | NFSD_MAY_CREATE); |
495 | if (status == nfserr_symlink) | ||
496 | status = nfserr_notdir; | ||
497 | if (status) | 509 | if (status) |
498 | return status; | 510 | return status; |
499 | 511 | ||
@@ -691,7 +703,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
691 | readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); | 703 | readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); |
692 | readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); | 704 | readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); |
693 | 705 | ||
694 | if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || | 706 | if ((cookie == 1) || (cookie == 2) || |
695 | (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) | 707 | (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) |
696 | return nfserr_bad_cookie; | 708 | return nfserr_bad_cookie; |
697 | 709 | ||
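
Dropping the `cookie > ~(u32)0` test lets clients present full 64-bit cookies (presumably the rationale; the hunk itself gives none). The surviving checks still reject the reserved cookie values 1 and 2 and require an all-zero verifier when the cookie is 0. A self-contained sketch of the remaining validation:

    #include <stdint.h>
    #include <string.h>

    #define VERIFIER_SIZE 8

    /* Cookies 1 and 2 stay reserved; cookie 0 is only legal together
     * with an all-zero cookie verifier. */
    static int bad_readdir_cookie(uint64_t cookie, const uint8_t verf[VERIFIER_SIZE])
    {
        static const uint8_t zeroverf[VERIFIER_SIZE];  /* all zeroes */

        if (cookie == 1 || cookie == 2)
            return 1;
        return cookie == 0 && memcmp(verf, zeroverf, VERIFIER_SIZE) != 0;
    }
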
@@ -719,8 +731,6 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
719 | return nfserr_grace; | 731 | return nfserr_grace; |
720 | status = nfsd_unlink(rqstp, &cstate->current_fh, 0, | 732 | status = nfsd_unlink(rqstp, &cstate->current_fh, 0, |
721 | remove->rm_name, remove->rm_namelen); | 733 | remove->rm_name, remove->rm_namelen); |
722 | if (status == nfserr_symlink) | ||
723 | return nfserr_notdir; | ||
724 | if (!status) { | 734 | if (!status) { |
725 | fh_unlock(&cstate->current_fh); | 735 | fh_unlock(&cstate->current_fh); |
726 | set_change_info(&remove->rm_cinfo, &cstate->current_fh); | 736 | set_change_info(&remove->rm_cinfo, &cstate->current_fh); |
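
This hunk, like the COMMIT, CREATE, RENAME and WRITE hunks around it, deletes a per-op remapping of nfserr_symlink; presumably that translation now happens once in common lookup/verify code. A hypothetical sketch of such a centralized mapping:

    /* Hypothetical centralized mapping: translate the generic "it's a
     * symlink" failure once, based on what the caller expected, instead
     * of repeating the special case in every operation. */
    enum nfs_err_sk { NFS_OK_SK, NFS_SYMLINK_SK, NFS_NOTDIR_SK, NFS_INVAL_SK };

    static enum nfs_err_sk map_symlink_err(enum nfs_err_sk status, int wanted_dir)
    {
        if (status != NFS_SYMLINK_SK)
            return status;
        return wanted_dir ? NFS_NOTDIR_SK : NFS_INVAL_SK;  /* dir ops vs. I/O ops */
    }
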
@@ -751,8 +761,6 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
751 | (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && | 761 | (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && |
752 | S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) | 762 | S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) |
753 | status = nfserr_exist; | 763 | status = nfserr_exist; |
754 | else if (status == nfserr_symlink) | ||
755 | status = nfserr_notdir; | ||
756 | 764 | ||
757 | if (!status) { | 765 | if (!status) { |
758 | set_change_info(&rename->rn_sinfo, &cstate->current_fh); | 766 | set_change_info(&rename->rn_sinfo, &cstate->current_fh); |
@@ -892,8 +900,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
892 | 900 | ||
893 | write->wr_bytes_written = cnt; | 901 | write->wr_bytes_written = cnt; |
894 | 902 | ||
895 | if (status == nfserr_symlink) | ||
896 | status = nfserr_inval; | ||
897 | return status; | 903 | return status; |
898 | } | 904 | } |
899 | 905 | ||
@@ -930,7 +936,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
930 | count = 4 + (verify->ve_attrlen >> 2); | 936 | count = 4 + (verify->ve_attrlen >> 2); |
931 | buf = kmalloc(count << 2, GFP_KERNEL); | 937 | buf = kmalloc(count << 2, GFP_KERNEL); |
932 | if (!buf) | 938 | if (!buf) |
933 | return nfserr_resource; | 939 | return nfserr_jukebox; |
934 | 940 | ||
935 | status = nfsd4_encode_fattr(&cstate->current_fh, | 941 | status = nfsd4_encode_fattr(&cstate->current_fh, |
936 | cstate->current_fh.fh_export, | 942 | cstate->current_fh.fh_export, |
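
The VERIFY scratch buffer is sized in 4-byte XDR words: four words of headroom for the encoded bitmap and length fields plus `ve_attrlen >> 2` words of attribute data, converted back to bytes with `count << 2`. The allocation-failure error also changes from nfserr_resource to nfserr_jukebox, which tells the client to retry later. A small sketch of the word/byte arithmetic:

    #include <stddef.h>
    #include <stdint.h>

    /* Size the scratch buffer in 4-byte XDR words, then convert back to
     * bytes for the allocation, mirroring _nfsd4_verify above. */
    static size_t verify_scratch_bytes(uint32_t attrlen)
    {
        uint32_t words = 4 + (attrlen >> 2);  /* bytes -> words, plus headroom */
        return (size_t)words << 2;            /* words -> bytes */
    }
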
@@ -994,6 +1000,8 @@ static inline void nfsd4_increment_op_stats(u32 opnum) | |||
994 | 1000 | ||
995 | typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, | 1001 | typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, |
996 | void *); | 1002 | void *); |
1003 | typedef u32(*nfsd4op_rsize)(struct svc_rqst *, struct nfsd4_op *op); | ||
1004 | |||
997 | enum nfsd4_op_flags { | 1005 | enum nfsd4_op_flags { |
998 | ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ | 1006 | ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ |
999 | ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ | 1007 | ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ |
@@ -1001,13 +1009,15 @@ enum nfsd4_op_flags { | |||
1001 | /* For rfc 5661 section 2.6.3.1.1: */ | 1009 | /* For rfc 5661 section 2.6.3.1.1: */ |
1002 | OP_HANDLES_WRONGSEC = 1 << 3, | 1010 | OP_HANDLES_WRONGSEC = 1 << 3, |
1003 | OP_IS_PUTFH_LIKE = 1 << 4, | 1011 | OP_IS_PUTFH_LIKE = 1 << 4, |
1004 | }; | ||
1005 | |||
1006 | struct nfsd4_operation { | ||
1007 | nfsd4op_func op_func; | ||
1008 | u32 op_flags; | ||
1009 | char *op_name; | ||
1010 | /* | 1012 | /* |
1013 | * These are the ops whose result size we estimate before | ||
1014 | * encoding, to avoid performing an op then not being able to | ||
1015 | * respond or cache a response. This includes writes and setattrs | ||
1016 | * as well as the operations usually called "nonidempotent": | ||
1017 | */ | ||
1018 | OP_MODIFIES_SOMETHING = 1 << 5, | ||
1019 | /* | ||
1020 | * Cache compounds containing these ops in the xid-based drc: | ||
1011 | * We use the DRC for compounds containing non-idempotent | 1021 | * We use the DRC for compounds containing non-idempotent |
1012 | * operations, *except* those that are 4.1-specific (since | 1022 | * operations, *except* those that are 4.1-specific (since |
1013 | * sessions provide their own EOS), and except for stateful | 1023 | * sessions provide their own EOS), and except for stateful |
@@ -1015,7 +1025,15 @@ struct nfsd4_operation { | |||
1015 | * (since sequence numbers provide EOS for open, lock, etc in | 1025 | * (since sequence numbers provide EOS for open, lock, etc in |
1016 | * the v4.0 case). | 1026 | * the v4.0 case). |
1017 | */ | 1027 | */ |
1018 | bool op_cacheresult; | 1028 | OP_CACHEME = 1 << 6, |
1029 | }; | ||
1030 | |||
1031 | struct nfsd4_operation { | ||
1032 | nfsd4op_func op_func; | ||
1033 | u32 op_flags; | ||
1034 | char *op_name; | ||
1035 | /* Try to get response size before operation */ | ||
1036 | nfsd4op_rsize op_rsize_bop; | ||
1019 | }; | 1037 | }; |
1020 | 1038 | ||
1021 | static struct nfsd4_operation nfsd4_ops[]; | 1039 | static struct nfsd4_operation nfsd4_ops[]; |
@@ -1062,7 +1080,7 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) | |||
1062 | 1080 | ||
1063 | bool nfsd4_cache_this_op(struct nfsd4_op *op) | 1081 | bool nfsd4_cache_this_op(struct nfsd4_op *op) |
1064 | { | 1082 | { |
1065 | return OPDESC(op)->op_cacheresult; | 1083 | return OPDESC(op)->op_flags & OP_CACHEME; |
1066 | } | 1084 | } |
1067 | 1085 | ||
1068 | static bool need_wrongsec_check(struct svc_rqst *rqstp) | 1086 | static bool need_wrongsec_check(struct svc_rqst *rqstp) |
@@ -1110,6 +1128,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, | |||
1110 | struct nfsd4_operation *opdesc; | 1128 | struct nfsd4_operation *opdesc; |
1111 | struct nfsd4_compound_state *cstate = &resp->cstate; | 1129 | struct nfsd4_compound_state *cstate = &resp->cstate; |
1112 | int slack_bytes; | 1130 | int slack_bytes; |
1131 | u32 plen = 0; | ||
1113 | __be32 status; | 1132 | __be32 status; |
1114 | 1133 | ||
1115 | resp->xbuf = &rqstp->rq_res; | 1134 | resp->xbuf = &rqstp->rq_res; |
@@ -1188,6 +1207,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, | |||
1188 | goto encode_op; | 1207 | goto encode_op; |
1189 | } | 1208 | } |
1190 | 1209 | ||
1210 | /* If op is non-idempotent */ | ||
1211 | if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { | ||
1212 | plen = opdesc->op_rsize_bop(rqstp, op); | ||
1213 | op->status = nfsd4_check_resp_size(resp, plen); | ||
1214 | } | ||
1215 | |||
1216 | if (op->status) | ||
1217 | goto encode_op; | ||
1218 | |||
1191 | if (opdesc->op_func) | 1219 | if (opdesc->op_func) |
1192 | op->status = opdesc->op_func(rqstp, cstate, &op->u); | 1220 | op->status = opdesc->op_func(rqstp, cstate, &op->u); |
1193 | else | 1221 | else |
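
The new gate runs the size estimate only for ops flagged OP_MODIFIES_SOMETHING, failing the op up front if its worst-case reply would no longer fit, rather than mutating state and then being unable to encode or cache a response. A minimal sketch of that control flow (types hypothetical):

    #include <stdint.h>

    struct gated_op_sk {                      /* minimal slice of the descriptor */
        uint32_t flags;                       /* SK_MODIFIES etc. */
        uint32_t (*rsize)(void *op);          /* worst-case reply size, bytes */
        int      (*func)(void *cstate, void *op);
    };

    #define SK_MODIFIES (1u << 5)

    /* Estimate before executing: a state-mutating op whose reply could
     * no longer be encoded is failed here, before it changes anything. */
    static int dispatch_op(const struct gated_op_sk *d, void *cstate, void *op,
                           uint32_t space_left)
    {
        if ((d->flags & SK_MODIFIES) && d->rsize(op) > space_left)
            return -1;                        /* nfsd4_check_resp_size() analog */
        return d->func(cstate, op);
    }
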
@@ -1217,7 +1245,7 @@ encode_op: | |||
1217 | be32_to_cpu(status)); | 1245 | be32_to_cpu(status)); |
1218 | 1246 | ||
1219 | if (cstate->replay_owner) { | 1247 | if (cstate->replay_owner) { |
1220 | nfs4_put_stateowner(cstate->replay_owner); | 1248 | nfs4_unlock_state(); |
1221 | cstate->replay_owner = NULL; | 1249 | cstate->replay_owner = NULL; |
1222 | } | 1250 | } |
1223 | /* XXX Ugh, we need to get rid of this kind of special case: */ | 1251 | /* XXX Ugh, we need to get rid of this kind of special case: */ |
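
With stateowners no longer reference-counted here, a parked cstate->replay_owner now implies the state lock is still held from the op, and this encode-side hunk is the matching release: clearing replay_owner is what drops the lock. A pthread sketch of that invariant (an inference from the two hunks, not spelled out in the diff):

    #include <pthread.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;  /* nfs4_lock_state() analog */

    struct cstate_sk { void *replay_owner; };

    /* Op side: take the state lock for the op, and keep it held across
     * the return iff an owner is parked for replay processing. */
    static int run_stateful_op(struct cstate_sk *cs, int (*op)(void **owner))
    {
        void *owner = NULL;
        int status;

        pthread_mutex_lock(&state_lock);
        status = op(&owner);
        cs->replay_owner = owner;             /* may be NULL */
        if (!cs->replay_owner)
            pthread_mutex_unlock(&state_lock);
        return status;
    }

    /* Encode side (the hunk above): releasing the parked owner drops the lock. */
    static void finish_encode(struct cstate_sk *cs)
    {
        if (cs->replay_owner) {
            pthread_mutex_unlock(&state_lock);
            cs->replay_owner = NULL;
        }
    }
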
@@ -1238,6 +1266,144 @@ out: | |||
1238 | return status; | 1266 | return status; |
1239 | } | 1267 | } |
1240 | 1268 | ||
1269 | #define op_encode_hdr_size (2) | ||
1270 | #define op_encode_stateid_maxsz (XDR_QUADLEN(NFS4_STATEID_SIZE)) | ||
1271 | #define op_encode_verifier_maxsz (XDR_QUADLEN(NFS4_VERIFIER_SIZE)) | ||
1272 | #define op_encode_change_info_maxsz (5) | ||
1273 | #define nfs4_fattr_bitmap_maxsz (4) | ||
1274 | |||
1275 | #define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) | ||
1276 | #define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) | ||
1277 | |||
1278 | #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) | ||
1279 | |||
1280 | #define op_encode_ace_maxsz (3 + nfs4_owner_maxsz) | ||
1281 | #define op_encode_delegation_maxsz (1 + op_encode_stateid_maxsz + 1 + \ | ||
1282 | op_encode_ace_maxsz) | ||
1283 | |||
1284 | #define op_encode_channel_attrs_maxsz (6 + 1 + 1) | ||
1285 | |||
1286 | static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1287 | { | ||
1288 | return (op_encode_hdr_size) * sizeof(__be32); | ||
1289 | } | ||
1290 | |||
1291 | static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1292 | { | ||
1293 | return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); | ||
1294 | } | ||
1295 | |||
1296 | static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1297 | { | ||
1298 | return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); | ||
1299 | } | ||
1300 | |||
1301 | static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1302 | { | ||
1303 | return (op_encode_hdr_size + op_encode_change_info_maxsz | ||
1304 | + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); | ||
1305 | } | ||
1306 | |||
1307 | static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1308 | { | ||
1309 | return (op_encode_hdr_size + op_encode_change_info_maxsz) | ||
1310 | * sizeof(__be32); | ||
1311 | } | ||
1312 | |||
1313 | static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1314 | { | ||
1315 | return (op_encode_hdr_size + op_encode_lock_denied_maxsz) | ||
1316 | * sizeof(__be32); | ||
1317 | } | ||
1318 | |||
1319 | static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1320 | { | ||
1321 | return (op_encode_hdr_size + op_encode_stateid_maxsz | ||
1322 | + op_encode_change_info_maxsz + 1 | ||
1323 | + nfs4_fattr_bitmap_maxsz | ||
1324 | + op_encode_delegation_maxsz) * sizeof(__be32); | ||
1325 | } | ||
1326 | |||
1327 | static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1328 | { | ||
1329 | u32 maxcount = 0, rlen = 0; | ||
1330 | |||
1331 | maxcount = svc_max_payload(rqstp); | ||
1332 | rlen = op->u.read.rd_length; | ||
1333 | |||
1334 | if (rlen > maxcount) | ||
1335 | rlen = maxcount; | ||
1336 | |||
1337 | return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; | ||
1338 | } | ||
1339 | |||
1340 | static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1341 | { | ||
1342 | u32 rlen = op->u.readdir.rd_maxcount; | ||
1343 | |||
1344 | if (rlen > PAGE_SIZE) | ||
1345 | rlen = PAGE_SIZE; | ||
1346 | |||
1347 | return (op_encode_hdr_size + op_encode_verifier_maxsz) | ||
1348 | * sizeof(__be32) + rlen; | ||
1349 | } | ||
1350 | |||
1351 | static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1352 | { | ||
1353 | return (op_encode_hdr_size + op_encode_change_info_maxsz) | ||
1354 | * sizeof(__be32); | ||
1355 | } | ||
1356 | |||
1357 | static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1358 | { | ||
1359 | return (op_encode_hdr_size + op_encode_change_info_maxsz | ||
1360 | + op_encode_change_info_maxsz) * sizeof(__be32); | ||
1361 | } | ||
1362 | |||
1363 | static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1364 | { | ||
1365 | return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); | ||
1366 | } | ||
1367 | |||
1368 | static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1369 | { | ||
1370 | return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); | ||
1371 | } | ||
1372 | |||
1373 | static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1374 | { | ||
1375 | return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); | ||
1376 | } | ||
1377 | |||
1378 | static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1379 | { | ||
1380 | return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ | ||
1381 | 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ | ||
1382 | 2 + /*eir_server_owner.so_minor_id */\ | ||
1383 | /* eir_server_owner.so_major_id<> */\ | ||
1384 | XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ | ||
1385 | /* eir_server_scope<> */\ | ||
1386 | XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ | ||
1387 | 1 + /* eir_server_impl_id array length */\ | ||
1388 | 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); | ||
1389 | } | ||
1390 | |||
1391 | static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1392 | { | ||
1393 | return (op_encode_hdr_size + \ | ||
1394 | XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ | ||
1395 | 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); | ||
1396 | } | ||
1397 | |||
1398 | static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) | ||
1399 | { | ||
1400 | return (op_encode_hdr_size + \ | ||
1401 | XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ | ||
1402 | 2 + /* csr_sequence, csr_flags */\ | ||
1403 | op_encode_channel_attrs_maxsz + \ | ||
1404 | op_encode_channel_attrs_maxsz) * sizeof(__be32); | ||
1405 | } | ||
1406 | |||
1241 | static struct nfsd4_operation nfsd4_ops[] = { | 1407 | static struct nfsd4_operation nfsd4_ops[] = { |
1242 | [OP_ACCESS] = { | 1408 | [OP_ACCESS] = { |
1243 | .op_func = (nfsd4op_func)nfsd4_access, | 1409 | .op_func = (nfsd4op_func)nfsd4_access, |
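
All the estimators above return a worst-case reply size in bytes: fixed XDR overhead counted in 4-byte words times sizeof(__be32), plus any variable payload, which READ clamps to the transport's maximum payload and READDIR clamps to one page. A self-contained sketch of the READ estimate:

    #include <stdint.h>

    #define OP_ENCODE_HDR_WORDS 2  /* per-op header, in 4-byte XDR words */

    /* Worst-case READ reply: header words, eof + count words, then the
     * data itself, clamped to the transport's maximum payload, as
     * nfsd4_read_rsize does above. */
    static uint32_t read_reply_estimate(uint32_t requested, uint32_t max_payload)
    {
        uint32_t rlen = requested < max_payload ? requested : max_payload;
        return (OP_ENCODE_HDR_WORDS + 2) * sizeof(uint32_t) + rlen;
    }
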
@@ -1245,20 +1411,27 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1245 | }, | 1411 | }, |
1246 | [OP_CLOSE] = { | 1412 | [OP_CLOSE] = { |
1247 | .op_func = (nfsd4op_func)nfsd4_close, | 1413 | .op_func = (nfsd4op_func)nfsd4_close, |
1414 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1248 | .op_name = "OP_CLOSE", | 1415 | .op_name = "OP_CLOSE", |
1416 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, | ||
1249 | }, | 1417 | }, |
1250 | [OP_COMMIT] = { | 1418 | [OP_COMMIT] = { |
1251 | .op_func = (nfsd4op_func)nfsd4_commit, | 1419 | .op_func = (nfsd4op_func)nfsd4_commit, |
1420 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1252 | .op_name = "OP_COMMIT", | 1421 | .op_name = "OP_COMMIT", |
1422 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_commit_rsize, | ||
1253 | }, | 1423 | }, |
1254 | [OP_CREATE] = { | 1424 | [OP_CREATE] = { |
1255 | .op_func = (nfsd4op_func)nfsd4_create, | 1425 | .op_func = (nfsd4op_func)nfsd4_create, |
1426 | .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, | ||
1256 | .op_name = "OP_CREATE", | 1427 | .op_name = "OP_CREATE", |
1257 | .op_cacheresult = true, | 1428 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_rsize, |
1258 | }, | 1429 | }, |
1259 | [OP_DELEGRETURN] = { | 1430 | [OP_DELEGRETURN] = { |
1260 | .op_func = (nfsd4op_func)nfsd4_delegreturn, | 1431 | .op_func = (nfsd4op_func)nfsd4_delegreturn, |
1432 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1261 | .op_name = "OP_DELEGRETURN", | 1433 | .op_name = "OP_DELEGRETURN", |
1434 | .op_rsize_bop = nfsd4_only_status_rsize, | ||
1262 | }, | 1435 | }, |
1263 | [OP_GETATTR] = { | 1436 | [OP_GETATTR] = { |
1264 | .op_func = (nfsd4op_func)nfsd4_getattr, | 1437 | .op_func = (nfsd4op_func)nfsd4_getattr, |
@@ -1271,12 +1444,16 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1271 | }, | 1444 | }, |
1272 | [OP_LINK] = { | 1445 | [OP_LINK] = { |
1273 | .op_func = (nfsd4op_func)nfsd4_link, | 1446 | .op_func = (nfsd4op_func)nfsd4_link, |
1447 | .op_flags = ALLOWED_ON_ABSENT_FS | OP_MODIFIES_SOMETHING | ||
1448 | | OP_CACHEME, | ||
1274 | .op_name = "OP_LINK", | 1449 | .op_name = "OP_LINK", |
1275 | .op_cacheresult = true, | 1450 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_link_rsize, |
1276 | }, | 1451 | }, |
1277 | [OP_LOCK] = { | 1452 | [OP_LOCK] = { |
1278 | .op_func = (nfsd4op_func)nfsd4_lock, | 1453 | .op_func = (nfsd4op_func)nfsd4_lock, |
1454 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1279 | .op_name = "OP_LOCK", | 1455 | .op_name = "OP_LOCK", |
1456 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_lock_rsize, | ||
1280 | }, | 1457 | }, |
1281 | [OP_LOCKT] = { | 1458 | [OP_LOCKT] = { |
1282 | .op_func = (nfsd4op_func)nfsd4_lockt, | 1459 | .op_func = (nfsd4op_func)nfsd4_lockt, |
@@ -1284,7 +1461,9 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1284 | }, | 1461 | }, |
1285 | [OP_LOCKU] = { | 1462 | [OP_LOCKU] = { |
1286 | .op_func = (nfsd4op_func)nfsd4_locku, | 1463 | .op_func = (nfsd4op_func)nfsd4_locku, |
1464 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1287 | .op_name = "OP_LOCKU", | 1465 | .op_name = "OP_LOCKU", |
1466 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, | ||
1288 | }, | 1467 | }, |
1289 | [OP_LOOKUP] = { | 1468 | [OP_LOOKUP] = { |
1290 | .op_func = (nfsd4op_func)nfsd4_lookup, | 1469 | .op_func = (nfsd4op_func)nfsd4_lookup, |
@@ -1302,42 +1481,54 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1302 | }, | 1481 | }, |
1303 | [OP_OPEN] = { | 1482 | [OP_OPEN] = { |
1304 | .op_func = (nfsd4op_func)nfsd4_open, | 1483 | .op_func = (nfsd4op_func)nfsd4_open, |
1305 | .op_flags = OP_HANDLES_WRONGSEC, | 1484 | .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, |
1306 | .op_name = "OP_OPEN", | 1485 | .op_name = "OP_OPEN", |
1486 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_open_rsize, | ||
1307 | }, | 1487 | }, |
1308 | [OP_OPEN_CONFIRM] = { | 1488 | [OP_OPEN_CONFIRM] = { |
1309 | .op_func = (nfsd4op_func)nfsd4_open_confirm, | 1489 | .op_func = (nfsd4op_func)nfsd4_open_confirm, |
1490 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1310 | .op_name = "OP_OPEN_CONFIRM", | 1491 | .op_name = "OP_OPEN_CONFIRM", |
1492 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, | ||
1311 | }, | 1493 | }, |
1312 | [OP_OPEN_DOWNGRADE] = { | 1494 | [OP_OPEN_DOWNGRADE] = { |
1313 | .op_func = (nfsd4op_func)nfsd4_open_downgrade, | 1495 | .op_func = (nfsd4op_func)nfsd4_open_downgrade, |
1496 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1314 | .op_name = "OP_OPEN_DOWNGRADE", | 1497 | .op_name = "OP_OPEN_DOWNGRADE", |
1498 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_status_stateid_rsize, | ||
1315 | }, | 1499 | }, |
1316 | [OP_PUTFH] = { | 1500 | [OP_PUTFH] = { |
1317 | .op_func = (nfsd4op_func)nfsd4_putfh, | 1501 | .op_func = (nfsd4op_func)nfsd4_putfh, |
1318 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS | 1502 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1319 | | OP_IS_PUTFH_LIKE, | 1503 | | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, |
1320 | .op_name = "OP_PUTFH", | 1504 | .op_name = "OP_PUTFH", |
1505 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1321 | }, | 1506 | }, |
1322 | [OP_PUTPUBFH] = { | 1507 | [OP_PUTPUBFH] = { |
1323 | .op_func = (nfsd4op_func)nfsd4_putrootfh, | 1508 | .op_func = (nfsd4op_func)nfsd4_putrootfh, |
1324 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS | 1509 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1325 | | OP_IS_PUTFH_LIKE, | 1510 | | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, |
1326 | .op_name = "OP_PUTPUBFH", | 1511 | .op_name = "OP_PUTPUBFH", |
1512 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1327 | }, | 1513 | }, |
1328 | [OP_PUTROOTFH] = { | 1514 | [OP_PUTROOTFH] = { |
1329 | .op_func = (nfsd4op_func)nfsd4_putrootfh, | 1515 | .op_func = (nfsd4op_func)nfsd4_putrootfh, |
1330 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS | 1516 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1331 | | OP_IS_PUTFH_LIKE, | 1517 | | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, |
1332 | .op_name = "OP_PUTROOTFH", | 1518 | .op_name = "OP_PUTROOTFH", |
1519 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1333 | }, | 1520 | }, |
1334 | [OP_READ] = { | 1521 | [OP_READ] = { |
1335 | .op_func = (nfsd4op_func)nfsd4_read, | 1522 | .op_func = (nfsd4op_func)nfsd4_read, |
1523 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1336 | .op_name = "OP_READ", | 1524 | .op_name = "OP_READ", |
1525 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, | ||
1337 | }, | 1526 | }, |
1338 | [OP_READDIR] = { | 1527 | [OP_READDIR] = { |
1339 | .op_func = (nfsd4op_func)nfsd4_readdir, | 1528 | .op_func = (nfsd4op_func)nfsd4_readdir, |
1529 | .op_flags = OP_MODIFIES_SOMETHING, | ||
1340 | .op_name = "OP_READDIR", | 1530 | .op_name = "OP_READDIR", |
1531 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, | ||
1341 | }, | 1532 | }, |
1342 | [OP_READLINK] = { | 1533 | [OP_READLINK] = { |
1343 | .op_func = (nfsd4op_func)nfsd4_readlink, | 1534 | .op_func = (nfsd4op_func)nfsd4_readlink, |
@@ -1345,29 +1536,36 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1345 | }, | 1536 | }, |
1346 | [OP_REMOVE] = { | 1537 | [OP_REMOVE] = { |
1347 | .op_func = (nfsd4op_func)nfsd4_remove, | 1538 | .op_func = (nfsd4op_func)nfsd4_remove, |
1539 | .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, | ||
1348 | .op_name = "OP_REMOVE", | 1540 | .op_name = "OP_REMOVE", |
1349 | .op_cacheresult = true, | 1541 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_remove_rsize, |
1350 | }, | 1542 | }, |
1351 | [OP_RENAME] = { | 1543 | [OP_RENAME] = { |
1352 | .op_name = "OP_RENAME", | ||
1353 | .op_func = (nfsd4op_func)nfsd4_rename, | 1544 | .op_func = (nfsd4op_func)nfsd4_rename, |
1354 | .op_cacheresult = true, | 1545 | .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, |
1546 | .op_name = "OP_RENAME", | ||
1547 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_rename_rsize, | ||
1355 | }, | 1548 | }, |
1356 | [OP_RENEW] = { | 1549 | [OP_RENEW] = { |
1357 | .op_func = (nfsd4op_func)nfsd4_renew, | 1550 | .op_func = (nfsd4op_func)nfsd4_renew, |
1358 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, | 1551 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1552 | | OP_MODIFIES_SOMETHING, | ||
1359 | .op_name = "OP_RENEW", | 1553 | .op_name = "OP_RENEW", |
1554 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1555 | |||
1360 | }, | 1556 | }, |
1361 | [OP_RESTOREFH] = { | 1557 | [OP_RESTOREFH] = { |
1362 | .op_func = (nfsd4op_func)nfsd4_restorefh, | 1558 | .op_func = (nfsd4op_func)nfsd4_restorefh, |
1363 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS | 1559 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1364 | | OP_IS_PUTFH_LIKE, | 1560 | | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING, |
1365 | .op_name = "OP_RESTOREFH", | 1561 | .op_name = "OP_RESTOREFH", |
1562 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1366 | }, | 1563 | }, |
1367 | [OP_SAVEFH] = { | 1564 | [OP_SAVEFH] = { |
1368 | .op_func = (nfsd4op_func)nfsd4_savefh, | 1565 | .op_func = (nfsd4op_func)nfsd4_savefh, |
1369 | .op_flags = OP_HANDLES_WRONGSEC, | 1566 | .op_flags = OP_HANDLES_WRONGSEC | OP_MODIFIES_SOMETHING, |
1370 | .op_name = "OP_SAVEFH", | 1567 | .op_name = "OP_SAVEFH", |
1568 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1371 | }, | 1569 | }, |
1372 | [OP_SECINFO] = { | 1570 | [OP_SECINFO] = { |
1373 | .op_func = (nfsd4op_func)nfsd4_secinfo, | 1571 | .op_func = (nfsd4op_func)nfsd4_secinfo, |
@@ -1377,19 +1575,22 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1377 | [OP_SETATTR] = { | 1575 | [OP_SETATTR] = { |
1378 | .op_func = (nfsd4op_func)nfsd4_setattr, | 1576 | .op_func = (nfsd4op_func)nfsd4_setattr, |
1379 | .op_name = "OP_SETATTR", | 1577 | .op_name = "OP_SETATTR", |
1380 | .op_cacheresult = true, | 1578 | .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, |
1579 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_setattr_rsize, | ||
1381 | }, | 1580 | }, |
1382 | [OP_SETCLIENTID] = { | 1581 | [OP_SETCLIENTID] = { |
1383 | .op_func = (nfsd4op_func)nfsd4_setclientid, | 1582 | .op_func = (nfsd4op_func)nfsd4_setclientid, |
1384 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, | 1583 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1584 | | OP_MODIFIES_SOMETHING | OP_CACHEME, | ||
1385 | .op_name = "OP_SETCLIENTID", | 1585 | .op_name = "OP_SETCLIENTID", |
1386 | .op_cacheresult = true, | 1586 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_setclientid_rsize, |
1387 | }, | 1587 | }, |
1388 | [OP_SETCLIENTID_CONFIRM] = { | 1588 | [OP_SETCLIENTID_CONFIRM] = { |
1389 | .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, | 1589 | .op_func = (nfsd4op_func)nfsd4_setclientid_confirm, |
1390 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, | 1590 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1591 | | OP_MODIFIES_SOMETHING | OP_CACHEME, | ||
1391 | .op_name = "OP_SETCLIENTID_CONFIRM", | 1592 | .op_name = "OP_SETCLIENTID_CONFIRM", |
1392 | .op_cacheresult = true, | 1593 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, |
1393 | }, | 1594 | }, |
1394 | [OP_VERIFY] = { | 1595 | [OP_VERIFY] = { |
1395 | .op_func = (nfsd4op_func)nfsd4_verify, | 1596 | .op_func = (nfsd4op_func)nfsd4_verify, |
@@ -1397,35 +1598,46 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1397 | }, | 1598 | }, |
1398 | [OP_WRITE] = { | 1599 | [OP_WRITE] = { |
1399 | .op_func = (nfsd4op_func)nfsd4_write, | 1600 | .op_func = (nfsd4op_func)nfsd4_write, |
1601 | .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, | ||
1400 | .op_name = "OP_WRITE", | 1602 | .op_name = "OP_WRITE", |
1401 | .op_cacheresult = true, | 1603 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, |
1402 | }, | 1604 | }, |
1403 | [OP_RELEASE_LOCKOWNER] = { | 1605 | [OP_RELEASE_LOCKOWNER] = { |
1404 | .op_func = (nfsd4op_func)nfsd4_release_lockowner, | 1606 | .op_func = (nfsd4op_func)nfsd4_release_lockowner, |
1405 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, | 1607 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS |
1608 | | OP_MODIFIES_SOMETHING, | ||
1406 | .op_name = "OP_RELEASE_LOCKOWNER", | 1609 | .op_name = "OP_RELEASE_LOCKOWNER", |
1610 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1407 | }, | 1611 | }, |
1408 | 1612 | ||
1409 | /* NFSv4.1 operations */ | 1613 | /* NFSv4.1 operations */ |
1410 | [OP_EXCHANGE_ID] = { | 1614 | [OP_EXCHANGE_ID] = { |
1411 | .op_func = (nfsd4op_func)nfsd4_exchange_id, | 1615 | .op_func = (nfsd4op_func)nfsd4_exchange_id, |
1412 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1616 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP |
1617 | | OP_MODIFIES_SOMETHING, | ||
1413 | .op_name = "OP_EXCHANGE_ID", | 1618 | .op_name = "OP_EXCHANGE_ID", |
1619 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, | ||
1414 | }, | 1620 | }, |
1415 | [OP_BIND_CONN_TO_SESSION] = { | 1621 | [OP_BIND_CONN_TO_SESSION] = { |
1416 | .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, | 1622 | .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, |
1417 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1623 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP |
1624 | | OP_MODIFIES_SOMETHING, | ||
1418 | .op_name = "OP_BIND_CONN_TO_SESSION", | 1625 | .op_name = "OP_BIND_CONN_TO_SESSION", |
1626 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_bind_conn_to_session_rsize, | ||
1419 | }, | 1627 | }, |
1420 | [OP_CREATE_SESSION] = { | 1628 | [OP_CREATE_SESSION] = { |
1421 | .op_func = (nfsd4op_func)nfsd4_create_session, | 1629 | .op_func = (nfsd4op_func)nfsd4_create_session, |
1422 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1630 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP |
1631 | | OP_MODIFIES_SOMETHING, | ||
1423 | .op_name = "OP_CREATE_SESSION", | 1632 | .op_name = "OP_CREATE_SESSION", |
1633 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_create_session_rsize, | ||
1424 | }, | 1634 | }, |
1425 | [OP_DESTROY_SESSION] = { | 1635 | [OP_DESTROY_SESSION] = { |
1426 | .op_func = (nfsd4op_func)nfsd4_destroy_session, | 1636 | .op_func = (nfsd4op_func)nfsd4_destroy_session, |
1427 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1637 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP |
1638 | | OP_MODIFIES_SOMETHING, | ||
1428 | .op_name = "OP_DESTROY_SESSION", | 1639 | .op_name = "OP_DESTROY_SESSION", |
1640 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1429 | }, | 1641 | }, |
1430 | [OP_SEQUENCE] = { | 1642 | [OP_SEQUENCE] = { |
1431 | .op_func = (nfsd4op_func)nfsd4_sequence, | 1643 | .op_func = (nfsd4op_func)nfsd4_sequence, |
@@ -1433,14 +1645,17 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1433 | .op_name = "OP_SEQUENCE", | 1645 | .op_name = "OP_SEQUENCE", |
1434 | }, | 1646 | }, |
1435 | [OP_DESTROY_CLIENTID] = { | 1647 | [OP_DESTROY_CLIENTID] = { |
1436 | .op_func = NULL, | 1648 | .op_func = (nfsd4op_func)nfsd4_destroy_clientid, |
1437 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, | 1649 | .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP |
1650 | | OP_MODIFIES_SOMETHING, | ||
1438 | .op_name = "OP_DESTROY_CLIENTID", | 1651 | .op_name = "OP_DESTROY_CLIENTID", |
1652 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1439 | }, | 1653 | }, |
1440 | [OP_RECLAIM_COMPLETE] = { | 1654 | [OP_RECLAIM_COMPLETE] = { |
1441 | .op_func = (nfsd4op_func)nfsd4_reclaim_complete, | 1655 | .op_func = (nfsd4op_func)nfsd4_reclaim_complete, |
1442 | .op_flags = ALLOWED_WITHOUT_FH, | 1656 | .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, |
1443 | .op_name = "OP_RECLAIM_COMPLETE", | 1657 | .op_name = "OP_RECLAIM_COMPLETE", |
1658 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1444 | }, | 1659 | }, |
1445 | [OP_SECINFO_NO_NAME] = { | 1660 | [OP_SECINFO_NO_NAME] = { |
1446 | .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, | 1661 | .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, |
@@ -1454,8 +1669,9 @@ static struct nfsd4_operation nfsd4_ops[] = { | |||
1454 | }, | 1669 | }, |
1455 | [OP_FREE_STATEID] = { | 1670 | [OP_FREE_STATEID] = { |
1456 | .op_func = (nfsd4op_func)nfsd4_free_stateid, | 1671 | .op_func = (nfsd4op_func)nfsd4_free_stateid, |
1457 | .op_flags = ALLOWED_WITHOUT_FH, | 1672 | .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, |
1458 | .op_name = "OP_FREE_STATEID", | 1673 | .op_name = "OP_FREE_STATEID", |
1674 | .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, | ||
1459 | }, | 1675 | }, |
1460 | }; | 1676 | }; |
1461 | 1677 | ||
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 29d77f60585b..ed083b9a731b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -45,6 +45,7 @@ | |||
45 | 45 | ||
46 | /* Globals */ | 46 | /* Globals */ |
47 | static struct file *rec_file; | 47 | static struct file *rec_file; |
48 | static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; | ||
48 | 49 | ||
49 | static int | 50 | static int |
50 | nfs4_save_creds(const struct cred **original_creds) | 51 | nfs4_save_creds(const struct cred **original_creds) |
@@ -88,7 +89,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) | |||
88 | struct xdr_netobj cksum; | 89 | struct xdr_netobj cksum; |
89 | struct hash_desc desc; | 90 | struct hash_desc desc; |
90 | struct scatterlist sg; | 91 | struct scatterlist sg; |
91 | __be32 status = nfserr_resource; | 92 | __be32 status = nfserr_jukebox; |
92 | 93 | ||
93 | dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", | 94 | dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", |
94 | clname->len, clname->data); | 95 | clname->len, clname->data); |
@@ -129,6 +130,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) | |||
129 | if (!rec_file || clp->cl_firststate) | 130 | if (!rec_file || clp->cl_firststate) |
130 | return 0; | 131 | return 0; |
131 | 132 | ||
133 | clp->cl_firststate = 1; | ||
132 | status = nfs4_save_creds(&original_cred); | 134 | status = nfs4_save_creds(&original_cred); |
133 | if (status < 0) | 135 | if (status < 0) |
134 | return status; | 136 | return status; |
@@ -143,10 +145,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) | |||
143 | goto out_unlock; | 145 | goto out_unlock; |
144 | } | 146 | } |
145 | status = -EEXIST; | 147 | status = -EEXIST; |
146 | if (dentry->d_inode) { | 148 | if (dentry->d_inode) |
147 | dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); | ||
148 | goto out_put; | 149 | goto out_put; |
149 | } | ||
150 | status = mnt_want_write(rec_file->f_path.mnt); | 150 | status = mnt_want_write(rec_file->f_path.mnt); |
151 | if (status) | 151 | if (status) |
152 | goto out_put; | 152 | goto out_put; |
@@ -156,12 +156,14 @@ out_put: | |||
156 | dput(dentry); | 156 | dput(dentry); |
157 | out_unlock: | 157 | out_unlock: |
158 | mutex_unlock(&dir->d_inode->i_mutex); | 158 | mutex_unlock(&dir->d_inode->i_mutex); |
159 | if (status == 0) { | 159 | if (status == 0) |
160 | clp->cl_firststate = 1; | ||
161 | vfs_fsync(rec_file, 0); | 160 | vfs_fsync(rec_file, 0); |
162 | } | 161 | else |
162 | printk(KERN_ERR "NFSD: failed to write recovery record" | ||
163 | " (err %d); please check that %s exists" | ||
164 | " and is writeable", status, | ||
165 | user_recovery_dirname); | ||
163 | nfs4_reset_creds(original_cred); | 166 | nfs4_reset_creds(original_cred); |
164 | dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); | ||
165 | return status; | 167 | return status; |
166 | } | 168 | } |
167 | 169 | ||
@@ -354,13 +356,13 @@ nfsd4_recdir_load(void) { | |||
354 | */ | 356 | */ |
355 | 357 | ||
356 | void | 358 | void |
357 | nfsd4_init_recdir(char *rec_dirname) | 359 | nfsd4_init_recdir() |
358 | { | 360 | { |
359 | const struct cred *original_cred; | 361 | const struct cred *original_cred; |
360 | int status; | 362 | int status; |
361 | 363 | ||
362 | printk("NFSD: Using %s as the NFSv4 state recovery directory\n", | 364 | printk("NFSD: Using %s as the NFSv4 state recovery directory\n", |
363 | rec_dirname); | 365 | user_recovery_dirname); |
364 | 366 | ||
365 | BUG_ON(rec_file); | 367 | BUG_ON(rec_file); |
366 | 368 | ||
@@ -372,10 +374,10 @@ nfsd4_init_recdir(char *rec_dirname) | |||
372 | return; | 374 | return; |
373 | } | 375 | } |
374 | 376 | ||
375 | rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); | 377 | rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); |
376 | if (IS_ERR(rec_file)) { | 378 | if (IS_ERR(rec_file)) { |
377 | printk("NFSD: unable to find recovery directory %s\n", | 379 | printk("NFSD: unable to find recovery directory %s\n", |
378 | rec_dirname); | 380 | user_recovery_dirname); |
379 | rec_file = NULL; | 381 | rec_file = NULL; |
380 | } | 382 | } |
381 | 383 | ||
@@ -390,3 +392,30 @@ nfsd4_shutdown_recdir(void) | |||
390 | fput(rec_file); | 392 | fput(rec_file); |
391 | rec_file = NULL; | 393 | rec_file = NULL; |
392 | } | 394 | } |
395 | |||
396 | /* | ||
397 | * Change the NFSv4 recovery directory to recdir. | ||
398 | */ | ||
399 | int | ||
400 | nfs4_reset_recoverydir(char *recdir) | ||
401 | { | ||
402 | int status; | ||
403 | struct path path; | ||
404 | |||
405 | status = kern_path(recdir, LOOKUP_FOLLOW, &path); | ||
406 | if (status) | ||
407 | return status; | ||
408 | status = -ENOTDIR; | ||
409 | if (S_ISDIR(path.dentry->d_inode->i_mode)) { | ||
410 | strcpy(user_recovery_dirname, recdir); | ||
411 | status = 0; | ||
412 | } | ||
413 | path_put(&path); | ||
414 | return status; | ||
415 | } | ||
416 | |||
417 | char * | ||
418 | nfs4_recoverydir(void) | ||
419 | { | ||
420 | return user_recovery_dirname; | ||
421 | } | ||
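
nfs4_reset_recoverydir() accepts a new path only after kern_path() resolves it and S_ISDIR() confirms it is a directory; the strcpy() into the PATH_MAX buffer relies on the caller bounding the input, which the nfsd control-file write path is assumed to do. A userspace sketch of the same validate-then-commit pattern:

    #include <errno.h>
    #include <limits.h>
    #include <string.h>
    #include <sys/stat.h>

    static char recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";

    /* Accept a new recovery directory only after checking it exists and
     * really is a directory, mirroring nfs4_reset_recoverydir() above. */
    static int reset_recoverydir(const char *recdir)
    {
        struct stat st;

        if (strlen(recdir) >= sizeof(recovery_dirname))
            return -ENAMETOOLONG;          /* the kernel caller bounds this */
        if (stat(recdir, &st))             /* kern_path() analog */
            return -errno;
        if (!S_ISDIR(st.st_mode))
            return -ENOTDIR;
        strcpy(recovery_dirname, recdir);  /* commit only after validation */
        return 0;
    }
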
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3787ec117400..47e94e33a975 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,9 +49,6 @@ | |||
49 | time_t nfsd4_lease = 90; /* default lease time */ | 49 | time_t nfsd4_lease = 90; /* default lease time */ |
50 | time_t nfsd4_grace = 90; | 50 | time_t nfsd4_grace = 90; |
51 | static time_t boot_time; | 51 | static time_t boot_time; |
52 | static u32 current_ownerid = 1; | ||
53 | static u32 current_fileid = 1; | ||
54 | static u32 current_delegid = 1; | ||
55 | static stateid_t zerostateid; /* bits all 0 */ | 52 | static stateid_t zerostateid; /* bits all 0 */ |
56 | static stateid_t onestateid; /* bits all 1 */ | 53 | static stateid_t onestateid; /* bits all 1 */ |
57 | static u64 current_sessionid = 1; | 54 | static u64 current_sessionid = 1; |
@@ -60,13 +57,7 @@ static u64 current_sessionid = 1; | |||
60 | #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) | 57 | #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) |
61 | 58 | ||
62 | /* forward declarations */ | 59 | /* forward declarations */ |
63 | static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); | 60 | static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); |
64 | static struct nfs4_stateid * search_for_stateid(stateid_t *stid); | ||
65 | static struct nfs4_delegation * search_for_delegation(stateid_t *stid); | ||
66 | static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); | ||
67 | static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; | ||
68 | static void nfs4_set_recdir(char *recdir); | ||
69 | static int check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner); | ||
70 | 61 | ||
71 | /* Locking: */ | 62 | /* Locking: */ |
72 | 63 | ||
@@ -80,7 +71,8 @@ static DEFINE_MUTEX(client_mutex); | |||
80 | */ | 71 | */ |
81 | static DEFINE_SPINLOCK(recall_lock); | 72 | static DEFINE_SPINLOCK(recall_lock); |
82 | 73 | ||
83 | static struct kmem_cache *stateowner_slab = NULL; | 74 | static struct kmem_cache *openowner_slab = NULL; |
75 | static struct kmem_cache *lockowner_slab = NULL; | ||
84 | static struct kmem_cache *file_slab = NULL; | 76 | static struct kmem_cache *file_slab = NULL; |
85 | static struct kmem_cache *stateid_slab = NULL; | 77 | static struct kmem_cache *stateid_slab = NULL; |
86 | static struct kmem_cache *deleg_slab = NULL; | 78 | static struct kmem_cache *deleg_slab = NULL; |
@@ -112,6 +104,11 @@ opaque_hashval(const void *ptr, int nbytes) | |||
112 | 104 | ||
113 | static struct list_head del_recall_lru; | 105 | static struct list_head del_recall_lru; |
114 | 106 | ||
107 | static void nfsd4_free_file(struct nfs4_file *f) | ||
108 | { | ||
109 | kmem_cache_free(file_slab, f); | ||
110 | } | ||
111 | |||
115 | static inline void | 112 | static inline void |
116 | put_nfs4_file(struct nfs4_file *fi) | 113 | put_nfs4_file(struct nfs4_file *fi) |
117 | { | 114 | { |
@@ -119,7 +116,7 @@ put_nfs4_file(struct nfs4_file *fi) | |||
119 | list_del(&fi->fi_hash); | 116 | list_del(&fi->fi_hash); |
120 | spin_unlock(&recall_lock); | 117 | spin_unlock(&recall_lock); |
121 | iput(fi->fi_inode); | 118 | iput(fi->fi_inode); |
122 | kmem_cache_free(file_slab, fi); | 119 | nfsd4_free_file(fi); |
123 | } | 120 | } |
124 | } | 121 | } |
125 | 122 | ||
@@ -136,35 +133,33 @@ unsigned int max_delegations; | |||
136 | * Open owner state (share locks) | 133 | * Open owner state (share locks) |
137 | */ | 134 | */ |
138 | 135 | ||
139 | /* hash tables for nfs4_stateowner */ | 136 | /* hash tables for open owners */ |
140 | #define OWNER_HASH_BITS 8 | 137 | #define OPEN_OWNER_HASH_BITS 8 |
141 | #define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) | 138 | #define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) |
142 | #define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) | 139 | #define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) |
143 | 140 | ||
144 | #define ownerid_hashval(id) \ | 141 | static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) |
145 | ((id) & OWNER_HASH_MASK) | 142 | { |
146 | #define ownerstr_hashval(clientid, ownername) \ | 143 | unsigned int ret; |
147 | (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) | ||
148 | 144 | ||
149 | static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; | 145 | ret = opaque_hashval(ownername->data, ownername->len); |
150 | static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; | 146 | ret += clientid; |
147 | return ret & OPEN_OWNER_HASH_MASK; | ||
148 | } | ||
149 | |||
150 | static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; | ||
151 | 151 | ||
152 | /* hash table for nfs4_file */ | 152 | /* hash table for nfs4_file */ |
153 | #define FILE_HASH_BITS 8 | 153 | #define FILE_HASH_BITS 8 |
154 | #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) | 154 | #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) |
155 | 155 | ||
156 | /* hash table for (open)nfs4_stateid */ | 156 | static unsigned int file_hashval(struct inode *ino) |
157 | #define STATEID_HASH_BITS 10 | 157 | { |
158 | #define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) | 158 | /* XXX: why are we hashing on inode pointer, anyway? */ |
159 | #define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) | 159 | return hash_ptr(ino, FILE_HASH_BITS); |
160 | 160 | } | |
161 | #define file_hashval(x) \ | ||
162 | hash_ptr(x, FILE_HASH_BITS) | ||
163 | #define stateid_hashval(owner_id, file_id) \ | ||
164 | (((owner_id) + (file_id)) & STATEID_HASH_MASK) | ||
165 | 161 | ||
166 | static struct list_head file_hashtbl[FILE_HASH_SIZE]; | 162 | static struct list_head file_hashtbl[FILE_HASH_SIZE]; |
167 | static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; | ||
168 | 163 | ||
169 | static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) | 164 | static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) |
170 | { | 165 | { |
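
Turning the old ownerstr macro into open_ownerstr_hashval() makes the evaluation explicit: hash the opaque owner name, add the client id, and mask down to the table size. A standalone sketch with a simple byte hash standing in for opaque_hashval() (the kernel helper computes something different):

    #include <stddef.h>
    #include <stdint.h>

    #define OPEN_OWNER_HASH_BITS 8
    #define OPEN_OWNER_HASH_MASK ((1u << OPEN_OWNER_HASH_BITS) - 1)

    /* Toy byte hash standing in for the kernel's opaque_hashval(). */
    static unsigned int opaque_hash(const uint8_t *data, size_t len)
    {
        unsigned int h = 0;

        while (len--)
            h = h * 31 + *data++;
        return h;
    }

    /* Mirrors open_ownerstr_hashval(): name hash plus clientid, masked. */
    static unsigned int open_owner_bucket(uint32_t clientid,
                                          const uint8_t *name, size_t len)
    {
        return (opaque_hash(name, len) + clientid) & OPEN_OWNER_HASH_MASK;
    }
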
@@ -192,8 +187,15 @@ static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) | |||
192 | static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) | 187 | static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) |
193 | { | 188 | { |
194 | if (atomic_dec_and_test(&fp->fi_access[oflag])) { | 189 | if (atomic_dec_and_test(&fp->fi_access[oflag])) { |
195 | nfs4_file_put_fd(fp, O_RDWR); | ||
196 | nfs4_file_put_fd(fp, oflag); | 190 | nfs4_file_put_fd(fp, oflag); |
191 | /* | ||
192 | * It's also safe to get rid of the RDWR open *if* | ||
193 | * we no longer have need of the other kind of access | ||
194 | * or if we already have the other kind of open: | ||
195 | */ | ||
196 | if (fp->fi_fds[1-oflag] | ||
197 | || atomic_read(&fp->fi_access[1 - oflag]) == 0) | ||
198 | nfs4_file_put_fd(fp, O_RDWR); | ||
197 | } | 199 | } |
198 | } | 200 | } |
199 | 201 | ||
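
fi_access[] counts O_RDONLY and O_WRONLY grants separately, with a shared O_RDWR struct file backing whichever mode lacks a file of its own; the added condition drops that RDWR file on the last put of one mode only when the other mode either has a dedicated fd or has no remaining users. A small sketch of the decision:

    #include <stdbool.h>

    struct file_access_sk {
        int  users[2];     /* [0] = read grants, [1] = write grants */
        bool own_fd[2];    /* dedicated O_RDONLY / O_WRONLY file open */
        bool rdwr_fd;      /* shared O_RDWR file open */
    };

    /* On the last put of one access mode, drop that mode's fd; also drop
     * the shared RDWR fd unless the other mode still depends on it
     * (i.e. it has remaining users but no dedicated fd of its own). */
    static void put_access(struct file_access_sk *f, int mode)
    {
        if (--f->users[mode] == 0) {
            f->own_fd[mode] = false;           /* nfs4_file_put_fd(fp, oflag) */
            if (f->own_fd[1 - mode] || f->users[1 - mode] == 0)
                f->rdwr_fd = false;            /* nfs4_file_put_fd(fp, O_RDWR) */
        }
    }
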
@@ -206,8 +208,73 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) | |||
206 | __nfs4_file_put_access(fp, oflag); | 208 | __nfs4_file_put_access(fp, oflag); |
207 | } | 209 | } |
208 | 210 | ||
211 | static inline int get_new_stid(struct nfs4_stid *stid) | ||
212 | { | ||
213 | static int min_stateid = 0; | ||
214 | struct idr *stateids = &stid->sc_client->cl_stateids; | ||
215 | int new_stid; | ||
216 | int error; | ||
217 | |||
218 | error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); | ||
219 | /* | ||
220 | * Note: the necessary preallocation was done in | ||
221 | * nfs4_alloc_stateid(). The idr code caps the number of | ||
222 | * preallocations that can exist at a time, but the state lock | ||
223 | * prevents anyone from using ours before we get here: | ||
224 | */ | ||
225 | BUG_ON(error); | ||
226 | /* | ||
227 | * It shouldn't be a problem to reuse an opaque stateid value. | ||
228 | * I don't think it is for 4.1. But with 4.0 I worry that, for | ||
229 | * example, a stray write retransmission could be accepted by | ||
230 | * the server when it should have been rejected. Therefore, | ||
231 | * adopt a trick from the sctp code to attempt to maximize the | ||
232 | * amount of time until an id is reused, by ensuring they always | ||
233 | * "increase" (mod INT_MAX): | ||
234 | */ | ||
235 | |||
236 | min_stateid = new_stid+1; | ||
237 | if (min_stateid == INT_MAX) | ||
238 | min_stateid = 0; | ||
239 | return new_stid; | ||
240 | } | ||
241 | |||
242 | static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type) | ||
243 | { | ||
244 | stateid_t *s = &stid->sc_stateid; | ||
245 | int new_id; | ||
246 | |||
247 | stid->sc_type = type; | ||
248 | stid->sc_client = cl; | ||
249 | s->si_opaque.so_clid = cl->cl_clientid; | ||
250 | new_id = get_new_stid(stid); | ||
251 | s->si_opaque.so_id = (u32)new_id; | ||
252 | /* Will be incremented before return to client: */ | ||
253 | s->si_generation = 0; | ||
254 | } | ||
255 | |||
256 | static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) | ||
257 | { | ||
258 | struct idr *stateids = &cl->cl_stateids; | ||
259 | |||
260 | if (!idr_pre_get(stateids, GFP_KERNEL)) | ||
261 | return NULL; | ||
262 | /* | ||
263 | * Note: if we fail here (or any time between now and the time | ||
264 | * we actually get the new idr), we won't need to undo the idr | ||
265 | * preallocation, since the idr code caps the number of | ||
266 | * preallocated entries. | ||
267 | */ | ||
268 | return kmem_cache_alloc(slab, GFP_KERNEL); | ||
269 | } | ||
270 | |||
271 | static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) | ||
272 | { | ||
273 | return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); | ||
274 | } | ||
275 | |||
209 | static struct nfs4_delegation * | 276 | static struct nfs4_delegation * |
210 | alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) | 277 | alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) |
211 | { | 278 | { |
212 | struct nfs4_delegation *dp; | 279 | struct nfs4_delegation *dp; |
213 | struct nfs4_file *fp = stp->st_file; | 280 | struct nfs4_file *fp = stp->st_file; |
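
The wraparound trick in get_new_stid() keeps idr-allocated stateid values "increasing" mod INT_MAX so a stray v4.0 retransmission is unlikely to match a freshly reused id, and the in-line comments explain why the idr preallocation done in nfs4_alloc_stid() never needs undoing. A self-contained sketch of the monotonic-hint policy (plain ints standing in for the idr):

    #include <limits.h>

    static int next_id_hint;  /* plays the role of min_stateid above */

    /* Hand out ids that always "increase" mod INT_MAX, so a freed id is
     * not reused until the whole space has cycled; idr_alloc_above is a
     * stand-in for idr_get_new_above() on the client's stateid idr. */
    static int alloc_stateid_id(int (*idr_alloc_above)(int min))
    {
        int new_id = idr_alloc_above(next_id_hint);

        next_id_hint = new_id + 1;
        if (next_id_hint == INT_MAX)  /* wrap before overflow */
            next_id_hint = 0;
        return new_id;
    }
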
@@ -224,21 +291,23 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f | |||
224 | return NULL; | 291 | return NULL; |
225 | if (num_delegations > max_delegations) | 292 | if (num_delegations > max_delegations) |
226 | return NULL; | 293 | return NULL; |
227 | dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); | 294 | dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); |
228 | if (dp == NULL) | 295 | if (dp == NULL) |
229 | return dp; | 296 | return dp; |
297 | init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID); | ||
298 | /* | ||
299 | * delegation seqid's are never incremented. The 4.1 special | ||
300 | * meaning of seqid 0 isn't meaningful, really, but let's avoid | ||
301 | * 0 anyway just for consistency and use 1: | ||
302 | */ | ||
303 | dp->dl_stid.sc_stateid.si_generation = 1; | ||
230 | num_delegations++; | 304 | num_delegations++; |
231 | INIT_LIST_HEAD(&dp->dl_perfile); | 305 | INIT_LIST_HEAD(&dp->dl_perfile); |
232 | INIT_LIST_HEAD(&dp->dl_perclnt); | 306 | INIT_LIST_HEAD(&dp->dl_perclnt); |
233 | INIT_LIST_HEAD(&dp->dl_recall_lru); | 307 | INIT_LIST_HEAD(&dp->dl_recall_lru); |
234 | dp->dl_client = clp; | ||
235 | get_nfs4_file(fp); | 308 | get_nfs4_file(fp); |
236 | dp->dl_file = fp; | 309 | dp->dl_file = fp; |
237 | dp->dl_type = type; | 310 | dp->dl_type = type; |
238 | dp->dl_stateid.si_boot = boot_time; | ||
239 | dp->dl_stateid.si_stateownerid = current_delegid++; | ||
240 | dp->dl_stateid.si_fileid = 0; | ||
241 | dp->dl_stateid.si_generation = 0; | ||
242 | fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); | 311 | fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); |
243 | dp->dl_time = 0; | 312 | dp->dl_time = 0; |
244 | atomic_set(&dp->dl_count, 1); | 313 | atomic_set(&dp->dl_count, 1); |
@@ -267,10 +336,18 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) | |||
267 | } | 336 | } |
268 | } | 337 | } |
269 | 338 | ||
339 | static void unhash_stid(struct nfs4_stid *s) | ||
340 | { | ||
341 | struct idr *stateids = &s->sc_client->cl_stateids; | ||
342 | |||
343 | idr_remove(stateids, s->sc_stateid.si_opaque.so_id); | ||
344 | } | ||
345 | |||
270 | /* Called under the state lock. */ | 346 | /* Called under the state lock. */ |
271 | static void | 347 | static void |
272 | unhash_delegation(struct nfs4_delegation *dp) | 348 | unhash_delegation(struct nfs4_delegation *dp) |
273 | { | 349 | { |
350 | unhash_stid(&dp->dl_stid); | ||
274 | list_del_init(&dp->dl_perclnt); | 351 | list_del_init(&dp->dl_perclnt); |
275 | spin_lock(&recall_lock); | 352 | spin_lock(&recall_lock); |
276 | list_del_init(&dp->dl_perfile); | 353 | list_del_init(&dp->dl_perfile); |
@@ -292,10 +369,16 @@ static DEFINE_SPINLOCK(client_lock); | |||
292 | #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) | 369 | #define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) |
293 | #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) | 370 | #define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) |
294 | 371 | ||
295 | #define clientid_hashval(id) \ | 372 | static unsigned int clientid_hashval(u32 id) |
296 | ((id) & CLIENT_HASH_MASK) | 373 | { |
297 | #define clientstr_hashval(name) \ | 374 | return id & CLIENT_HASH_MASK; |
298 | (opaque_hashval((name), 8) & CLIENT_HASH_MASK) | 375 | } |
376 | |||
377 | static unsigned int clientstr_hashval(const char *name) | ||
378 | { | ||
379 | return opaque_hashval(name, 8) & CLIENT_HASH_MASK; | ||
380 | } | ||
381 | |||
299 | /* | 382 | /* |
300 | * reclaim_str_hashtbl[] holds known client info from previous reset/reboot | 383 | * reclaim_str_hashtbl[] holds known client info from previous reset/reboot |
301 | * used in reboot/reset lease grace period processing | 384 | * used in reboot/reset lease grace period processing |
@@ -362,7 +445,7 @@ set_deny(unsigned int *deny, unsigned long bmap) { | |||
362 | } | 445 | } |
363 | 446 | ||
364 | static int | 447 | static int |
365 | test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { | 448 | test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { |
366 | unsigned int access, deny; | 449 | unsigned int access, deny; |
367 | 450 | ||
368 | set_access(&access, stp->st_access_bmap); | 451 | set_access(&access, stp->st_access_bmap); |
@@ -385,14 +468,13 @@ static int nfs4_access_to_omode(u32 access) | |||
385 | BUG(); | 468 | BUG(); |
386 | } | 469 | } |
387 | 470 | ||
388 | static void unhash_generic_stateid(struct nfs4_stateid *stp) | 471 | static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) |
389 | { | 472 | { |
390 | list_del(&stp->st_hash); | ||
391 | list_del(&stp->st_perfile); | 473 | list_del(&stp->st_perfile); |
392 | list_del(&stp->st_perstateowner); | 474 | list_del(&stp->st_perstateowner); |
393 | } | 475 | } |
394 | 476 | ||
395 | static void free_generic_stateid(struct nfs4_stateid *stp) | 477 | static void close_generic_stateid(struct nfs4_ol_stateid *stp) |
396 | { | 478 | { |
397 | int i; | 479 | int i; |
398 | 480 | ||
@@ -401,84 +483,106 @@ static void free_generic_stateid(struct nfs4_stateid *stp) | |||
401 | if (test_bit(i, &stp->st_access_bmap)) | 483 | if (test_bit(i, &stp->st_access_bmap)) |
402 | nfs4_file_put_access(stp->st_file, | 484 | nfs4_file_put_access(stp->st_file, |
403 | nfs4_access_to_omode(i)); | 485 | nfs4_access_to_omode(i)); |
486 | __clear_bit(i, &stp->st_access_bmap); | ||
404 | } | 487 | } |
405 | } | 488 | } |
406 | put_nfs4_file(stp->st_file); | 489 | put_nfs4_file(stp->st_file); |
490 | stp->st_file = NULL; | ||
491 | } | ||
492 | |||
493 | static void free_generic_stateid(struct nfs4_ol_stateid *stp) | ||
494 | { | ||
407 | kmem_cache_free(stateid_slab, stp); | 495 | kmem_cache_free(stateid_slab, stp); |
408 | } | 496 | } |
409 | 497 | ||
410 | static void release_lock_stateid(struct nfs4_stateid *stp) | 498 | static void release_lock_stateid(struct nfs4_ol_stateid *stp) |
411 | { | 499 | { |
412 | struct file *file; | 500 | struct file *file; |
413 | 501 | ||
414 | unhash_generic_stateid(stp); | 502 | unhash_generic_stateid(stp); |
503 | unhash_stid(&stp->st_stid); | ||
415 | file = find_any_file(stp->st_file); | 504 | file = find_any_file(stp->st_file); |
416 | if (file) | 505 | if (file) |
417 | locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); | 506 | locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner)); |
507 | close_generic_stateid(stp); | ||
418 | free_generic_stateid(stp); | 508 | free_generic_stateid(stp); |
419 | } | 509 | } |
420 | 510 | ||
421 | static void unhash_lockowner(struct nfs4_stateowner *sop) | 511 | static void unhash_lockowner(struct nfs4_lockowner *lo) |
422 | { | 512 | { |
423 | struct nfs4_stateid *stp; | 513 | struct nfs4_ol_stateid *stp; |
424 | 514 | ||
425 | list_del(&sop->so_idhash); | 515 | list_del(&lo->lo_owner.so_strhash); |
426 | list_del(&sop->so_strhash); | 516 | list_del(&lo->lo_perstateid); |
427 | list_del(&sop->so_perstateid); | 517 | while (!list_empty(&lo->lo_owner.so_stateids)) { |
428 | while (!list_empty(&sop->so_stateids)) { | 518 | stp = list_first_entry(&lo->lo_owner.so_stateids, |
429 | stp = list_first_entry(&sop->so_stateids, | 519 | struct nfs4_ol_stateid, st_perstateowner); |
430 | struct nfs4_stateid, st_perstateowner); | ||
431 | release_lock_stateid(stp); | 520 | release_lock_stateid(stp); |
432 | } | 521 | } |
433 | } | 522 | } |
434 | 523 | ||
435 | static void release_lockowner(struct nfs4_stateowner *sop) | 524 | static void release_lockowner(struct nfs4_lockowner *lo) |
436 | { | 525 | { |
437 | unhash_lockowner(sop); | 526 | unhash_lockowner(lo); |
438 | nfs4_put_stateowner(sop); | 527 | nfs4_free_lockowner(lo); |
439 | } | 528 | } |
440 | 529 | ||
441 | static void | 530 | static void |
442 | release_stateid_lockowners(struct nfs4_stateid *open_stp) | 531 | release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) |
443 | { | 532 | { |
444 | struct nfs4_stateowner *lock_sop; | 533 | struct nfs4_lockowner *lo; |
445 | 534 | ||
446 | while (!list_empty(&open_stp->st_lockowners)) { | 535 | while (!list_empty(&open_stp->st_lockowners)) { |
447 | lock_sop = list_entry(open_stp->st_lockowners.next, | 536 | lo = list_entry(open_stp->st_lockowners.next, |
448 | struct nfs4_stateowner, so_perstateid); | 537 | struct nfs4_lockowner, lo_perstateid); |
449 | /* list_del(&open_stp->st_lockowners); */ | 538 | release_lockowner(lo); |
450 | BUG_ON(lock_sop->so_is_open_owner); | ||
451 | release_lockowner(lock_sop); | ||
452 | } | 539 | } |
453 | } | 540 | } |
454 | 541 | ||
455 | static void release_open_stateid(struct nfs4_stateid *stp) | 542 | static void unhash_open_stateid(struct nfs4_ol_stateid *stp) |
456 | { | 543 | { |
457 | unhash_generic_stateid(stp); | 544 | unhash_generic_stateid(stp); |
458 | release_stateid_lockowners(stp); | 545 | release_stateid_lockowners(stp); |
546 | close_generic_stateid(stp); | ||
547 | } | ||
548 | |||
549 | static void release_open_stateid(struct nfs4_ol_stateid *stp) | ||
550 | { | ||
551 | unhash_open_stateid(stp); | ||
552 | unhash_stid(&stp->st_stid); | ||
459 | free_generic_stateid(stp); | 553 | free_generic_stateid(stp); |
460 | } | 554 | } |
461 | 555 | ||
462 | static void unhash_openowner(struct nfs4_stateowner *sop) | 556 | static void unhash_openowner(struct nfs4_openowner *oo) |
463 | { | 557 | { |
464 | struct nfs4_stateid *stp; | 558 | struct nfs4_ol_stateid *stp; |
465 | 559 | ||
466 | list_del(&sop->so_idhash); | 560 | list_del(&oo->oo_owner.so_strhash); |
467 | list_del(&sop->so_strhash); | 561 | list_del(&oo->oo_perclient); |
468 | list_del(&sop->so_perclient); | 562 | while (!list_empty(&oo->oo_owner.so_stateids)) { |
469 | list_del(&sop->so_perstateid); /* XXX: necessary? */ | 563 | stp = list_first_entry(&oo->oo_owner.so_stateids, |
470 | while (!list_empty(&sop->so_stateids)) { | 564 | struct nfs4_ol_stateid, st_perstateowner); |
471 | stp = list_first_entry(&sop->so_stateids, | ||
472 | struct nfs4_stateid, st_perstateowner); | ||
473 | release_open_stateid(stp); | 565 | release_open_stateid(stp); |
474 | } | 566 | } |
475 | } | 567 | } |
476 | 568 | ||
477 | static void release_openowner(struct nfs4_stateowner *sop) | 569 | static void release_last_closed_stateid(struct nfs4_openowner *oo) |
478 | { | 570 | { |
479 | unhash_openowner(sop); | 571 | struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; |
480 | list_del(&sop->so_close_lru); | 572 | |
481 | nfs4_put_stateowner(sop); | 573 | if (s) { |
574 | unhash_stid(&s->st_stid); | ||
575 | free_generic_stateid(s); | ||
576 | oo->oo_last_closed_stid = NULL; | ||
577 | } | ||
578 | } | ||
579 | |||
580 | static void release_openowner(struct nfs4_openowner *oo) | ||
581 | { | ||
582 | unhash_openowner(oo); | ||
583 | list_del(&oo->oo_close_lru); | ||
584 | release_last_closed_stateid(oo); | ||
585 | nfs4_free_openowner(oo); | ||
482 | } | 586 | } |
483 | 587 | ||
484 | #define SESSION_HASH_SIZE 512 | 588 | #define SESSION_HASH_SIZE 512 |
@@ -843,9 +947,6 @@ renew_client_locked(struct nfs4_client *clp) | |||
843 | return; | 947 | return; |
844 | } | 948 | } |
845 | 949 | ||
846 | /* | ||
847 | * Move client to the end to the LRU list. | ||
848 | */ | ||
849 | dprintk("renewing client (clientid %08x/%08x)\n", | 950 | dprintk("renewing client (clientid %08x/%08x)\n", |
850 | clp->cl_clientid.cl_boot, | 951 | clp->cl_clientid.cl_boot, |
851 | clp->cl_clientid.cl_id); | 952 | clp->cl_clientid.cl_id); |
@@ -943,7 +1044,7 @@ unhash_client_locked(struct nfs4_client *clp) | |||
943 | static void | 1044 | static void |
944 | expire_client(struct nfs4_client *clp) | 1045 | expire_client(struct nfs4_client *clp) |
945 | { | 1046 | { |
946 | struct nfs4_stateowner *sop; | 1047 | struct nfs4_openowner *oo; |
947 | struct nfs4_delegation *dp; | 1048 | struct nfs4_delegation *dp; |
948 | struct list_head reaplist; | 1049 | struct list_head reaplist; |
949 | 1050 | ||
@@ -961,8 +1062,8 @@ expire_client(struct nfs4_client *clp) | |||
961 | unhash_delegation(dp); | 1062 | unhash_delegation(dp); |
962 | } | 1063 | } |
963 | while (!list_empty(&clp->cl_openowners)) { | 1064 | while (!list_empty(&clp->cl_openowners)) { |
964 | sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); | 1065 | oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); |
965 | release_openowner(sop); | 1066 | release_openowner(oo); |
966 | } | 1067 | } |
967 | nfsd4_shutdown_callback(clp); | 1068 | nfsd4_shutdown_callback(clp); |
968 | if (clp->cl_cb_conn.cb_xprt) | 1069 | if (clp->cl_cb_conn.cb_xprt) |
@@ -1038,6 +1139,23 @@ static void gen_confirm(struct nfs4_client *clp) | |||
1038 | *p++ = i++; | 1139 | *p++ = i++; |
1039 | } | 1140 | } |
1040 | 1141 | ||
1142 | static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) | ||
1143 | { | ||
1144 | return idr_find(&cl->cl_stateids, t->si_opaque.so_id); | ||
1145 | } | ||
1146 | |||
1147 | static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) | ||
1148 | { | ||
1149 | struct nfs4_stid *s; | ||
1150 | |||
1151 | s = find_stateid(cl, t); | ||
1152 | if (!s) | ||
1153 | return NULL; | ||
1154 | if (typemask & s->sc_type) | ||
1155 | return s; | ||
1156 | return NULL; | ||
1157 | } | ||
1158 | |||
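[Editorial aside, not part of the patch] find_stateid_by_type() takes a bitmask rather than a single type, so one lookup helper can accept any of several stateid classes in a single call. Below is a minimal user-space sketch of the same accept test; the type values, struct, and helper name are illustrative assumptions, not the kernel's definitions.

#include <stdio.h>

#define OPEN_STID  0x01	/* assumed values for illustration only */
#define LOCK_STID  0x02
#define DELEG_STID 0x04

struct stid { unsigned char sc_type; };

/* Same test as the patch: any overlap between mask and type accepts. */
static struct stid *filter_by_type(struct stid *s, unsigned char typemask)
{
	return (s && (typemask & s->sc_type)) ? s : NULL;
}

int main(void)
{
	struct stid lock = { .sc_type = LOCK_STID };

	/* accepted: the caller will take either an open or a lock stateid */
	printf("%d\n", filter_by_type(&lock, OPEN_STID | LOCK_STID) != NULL);
	/* rejected: the caller wants delegations only */
	printf("%d\n", filter_by_type(&lock, DELEG_STID) != NULL);
	return 0;
}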
1041 | static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, | 1159 | static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, |
1042 | struct svc_rqst *rqstp, nfs4_verifier *verf) | 1160 | struct svc_rqst *rqstp, nfs4_verifier *verf) |
1043 | { | 1161 | { |
@@ -1060,6 +1178,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, | |||
1060 | } | 1178 | } |
1061 | } | 1179 | } |
1062 | 1180 | ||
1181 | idr_init(&clp->cl_stateids); | ||
1063 | memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); | 1182 | memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); |
1064 | atomic_set(&clp->cl_refcount, 0); | 1183 | atomic_set(&clp->cl_refcount, 0); |
1065 | clp->cl_cb_state = NFSD4_CB_UNKNOWN; | 1184 | clp->cl_cb_state = NFSD4_CB_UNKNOWN; |
@@ -1083,17 +1202,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, | |||
1083 | return clp; | 1202 | return clp; |
1084 | } | 1203 | } |
1085 | 1204 | ||
1086 | static int check_name(struct xdr_netobj name) | ||
1087 | { | ||
1088 | if (name.len == 0) | ||
1089 | return 0; | ||
1090 | if (name.len > NFS4_OPAQUE_LIMIT) { | ||
1091 | dprintk("NFSD: check_name: name too long(%d)!\n", name.len); | ||
1092 | return 0; | ||
1093 | } | ||
1094 | return 1; | ||
1095 | } | ||
1096 | |||
1097 | static void | 1205 | static void |
1098 | add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) | 1206 | add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) |
1099 | { | 1207 | { |
@@ -1125,8 +1233,10 @@ find_confirmed_client(clientid_t *clid) | |||
1125 | unsigned int idhashval = clientid_hashval(clid->cl_id); | 1233 | unsigned int idhashval = clientid_hashval(clid->cl_id); |
1126 | 1234 | ||
1127 | list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { | 1235 | list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { |
1128 | if (same_clid(&clp->cl_clientid, clid)) | 1236 | if (same_clid(&clp->cl_clientid, clid)) { |
1237 | renew_client(clp); | ||
1129 | return clp; | 1238 | return clp; |
1239 | } | ||
1130 | } | 1240 | } |
1131 | return NULL; | 1241 | return NULL; |
1132 | } | 1242 | } |
@@ -1173,20 +1283,6 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) | |||
1173 | return NULL; | 1283 | return NULL; |
1174 | } | 1284 | } |
1175 | 1285 | ||
1176 | static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr) | ||
1177 | { | ||
1178 | switch (family) { | ||
1179 | case AF_INET: | ||
1180 | ((struct sockaddr_in *)sa)->sin_family = AF_INET; | ||
1181 | ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr; | ||
1182 | return; | ||
1183 | case AF_INET6: | ||
1184 | ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6; | ||
1185 | ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6; | ||
1186 | return; | ||
1187 | } | ||
1188 | } | ||
1189 | |||
1190 | static void | 1286 | static void |
1191 | gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) | 1287 | gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp) |
1192 | { | 1288 | { |
@@ -1218,7 +1314,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r | |||
1218 | 1314 | ||
1219 | conn->cb_prog = se->se_callback_prog; | 1315 | conn->cb_prog = se->se_callback_prog; |
1220 | conn->cb_ident = se->se_callback_ident; | 1316 | conn->cb_ident = se->se_callback_ident; |
1221 | rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr); | 1317 | memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); |
1222 | return; | 1318 | return; |
1223 | out_err: | 1319 | out_err: |
1224 | conn->cb_addr.ss_family = AF_UNSPEC; | 1320 | conn->cb_addr.ss_family = AF_UNSPEC; |
@@ -1350,7 +1446,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, | |||
1350 | __func__, rqstp, exid, exid->clname.len, exid->clname.data, | 1446 | __func__, rqstp, exid, exid->clname.len, exid->clname.data, |
1351 | addr_str, exid->flags, exid->spa_how); | 1447 | addr_str, exid->flags, exid->spa_how); |
1352 | 1448 | ||
1353 | if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) | 1449 | if (exid->flags & ~EXCHGID4_FLAG_MASK_A) |
1354 | return nfserr_inval; | 1450 | return nfserr_inval; |
1355 | 1451 | ||
1356 | /* Currently only support SP4_NONE */ | 1452 | /* Currently only support SP4_NONE */ |
@@ -1849,8 +1945,16 @@ out: | |||
1849 | 1945 | ||
1850 | nfsd4_get_session(cstate->session); | 1946 | nfsd4_get_session(cstate->session); |
1851 | atomic_inc(&clp->cl_refcount); | 1947 | atomic_inc(&clp->cl_refcount); |
1852 | if (clp->cl_cb_state == NFSD4_CB_DOWN) | 1948 | switch (clp->cl_cb_state) { |
1853 | seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN; | 1949 | case NFSD4_CB_DOWN: |
1950 | seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; | ||
1951 | break; | ||
1952 | case NFSD4_CB_FAULT: | ||
1953 | seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; | ||
1954 | break; | ||
1955 | default: | ||
1956 | seq->status_flags = 0; | ||
1957 | } | ||
1854 | } | 1958 | } |
1855 | kfree(conn); | 1959 | kfree(conn); |
1856 | spin_unlock(&client_lock); | 1960 | spin_unlock(&client_lock); |
@@ -1858,6 +1962,50 @@ out: | |||
1858 | return status; | 1962 | return status; |
1859 | } | 1963 | } |
1860 | 1964 | ||
1965 | static inline bool has_resources(struct nfs4_client *clp) | ||
1966 | { | ||
1967 | return !list_empty(&clp->cl_openowners) | ||
1968 | || !list_empty(&clp->cl_delegations) | ||
1969 | || !list_empty(&clp->cl_sessions); | ||
1970 | } | ||
1971 | |||
1972 | __be32 | ||
1973 | nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) | ||
1974 | { | ||
1975 | struct nfs4_client *conf, *unconf, *clp; | ||
1976 | int status = 0; | ||
1977 | |||
1978 | nfs4_lock_state(); | ||
1979 | unconf = find_unconfirmed_client(&dc->clientid); | ||
1980 | conf = find_confirmed_client(&dc->clientid); | ||
1981 | |||
1982 | if (conf) { | ||
1983 | clp = conf; | ||
1984 | |||
1985 | if (!is_client_expired(conf) && has_resources(conf)) { | ||
1986 | status = nfserr_clientid_busy; | ||
1987 | goto out; | ||
1988 | } | ||
1989 | |||
1990 | /* rfc5661 18.50.3 */ | ||
1991 | if (cstate->session && conf == cstate->session->se_client) { | ||
1992 | status = nfserr_clientid_busy; | ||
1993 | goto out; | ||
1994 | } | ||
1995 | } else if (unconf) | ||
1996 | clp = unconf; | ||
1997 | else { | ||
1998 | status = nfserr_stale_clientid; | ||
1999 | goto out; | ||
2000 | } | ||
2001 | |||
2002 | expire_client(clp); | ||
2003 | out: | ||
2004 | nfs4_unlock_state(); | ||
2005 | dprintk("%s return %d\n", __func__, ntohl(status)); | ||
2006 | return status; | ||
2007 | } | ||
2008 | |||
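[Editorial aside, not part of the patch] nfsd4_destroy_clientid() above resolves to one of three results: the clientid is busy (a confirmed client still holds openowners, delegations, or sessions, or it is the client whose session carries this request, per RFC 5661 section 18.50.3), it is unknown, or it is destroyed. A hypothetical user-space reduction of that precedence:

#include <stdbool.h>
#include <stdio.h>

enum dc_result { DC_DESTROYED, DC_BUSY, DC_STALE };

static enum dc_result destroy_clientid(bool confirmed, bool unconfirmed,
				       bool expired, bool has_resources,
				       bool is_current_session_client)
{
	if (confirmed) {
		/* a live confirmed client still holding state is busy */
		if (!expired && has_resources)
			return DC_BUSY;
		/* so is the client whose session carries this request */
		if (is_current_session_client)
			return DC_BUSY;
		return DC_DESTROYED;
	}
	if (unconfirmed)
		return DC_DESTROYED;
	return DC_STALE;	/* neither table knows this clientid */
}

int main(void)
{
	printf("%d\n", destroy_clientid(true, false, false, true, false));	/* DC_BUSY */
	printf("%d\n", destroy_clientid(false, true, false, false, false));	/* DC_DESTROYED */
	printf("%d\n", destroy_clientid(false, false, false, false, false));	/* DC_STALE */
	return 0;
}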
1861 | __be32 | 2009 | __be32 |
1862 | nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) | 2010 | nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) |
1863 | { | 2011 | { |
@@ -1900,19 +2048,13 @@ __be32 | |||
1900 | nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 2048 | nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
1901 | struct nfsd4_setclientid *setclid) | 2049 | struct nfsd4_setclientid *setclid) |
1902 | { | 2050 | { |
1903 | struct xdr_netobj clname = { | 2051 | struct xdr_netobj clname = setclid->se_name; |
1904 | .len = setclid->se_namelen, | ||
1905 | .data = setclid->se_name, | ||
1906 | }; | ||
1907 | nfs4_verifier clverifier = setclid->se_verf; | 2052 | nfs4_verifier clverifier = setclid->se_verf; |
1908 | unsigned int strhashval; | 2053 | unsigned int strhashval; |
1909 | struct nfs4_client *conf, *unconf, *new; | 2054 | struct nfs4_client *conf, *unconf, *new; |
1910 | __be32 status; | 2055 | __be32 status; |
1911 | char dname[HEXDIR_LEN]; | 2056 | char dname[HEXDIR_LEN]; |
1912 | 2057 | ||
1913 | if (!check_name(clname)) | ||
1914 | return nfserr_inval; | ||
1915 | |||
1916 | status = nfs4_make_rec_clidname(dname, &clname); | 2058 | status = nfs4_make_rec_clidname(dname, &clname); |
1917 | if (status) | 2059 | if (status) |
1918 | return status; | 2060 | return status; |
@@ -1946,7 +2088,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
1946 | * of 5 bullet points, labeled as CASE0 - CASE4 below. | 2088 | * of 5 bullet points, labeled as CASE0 - CASE4 below. |
1947 | */ | 2089 | */ |
1948 | unconf = find_unconfirmed_client_by_str(dname, strhashval); | 2090 | unconf = find_unconfirmed_client_by_str(dname, strhashval); |
1949 | status = nfserr_resource; | 2091 | status = nfserr_jukebox; |
1950 | if (!conf) { | 2092 | if (!conf) { |
1951 | /* | 2093 | /* |
1952 | * RFC 3530 14.2.33 CASE 4: | 2094 | * RFC 3530 14.2.33 CASE 4: |
@@ -2116,31 +2258,28 @@ out: | |||
2116 | return status; | 2258 | return status; |
2117 | } | 2259 | } |
2118 | 2260 | ||
2261 | static struct nfs4_file *nfsd4_alloc_file(void) | ||
2262 | { | ||
2263 | return kmem_cache_alloc(file_slab, GFP_KERNEL); | ||
2264 | } | ||
2265 | |||
2119 | /* OPEN Share state helper functions */ | 2266 | /* OPEN Share state helper functions */ |
2120 | static inline struct nfs4_file * | 2267 | static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) |
2121 | alloc_init_file(struct inode *ino) | ||
2122 | { | 2268 | { |
2123 | struct nfs4_file *fp; | ||
2124 | unsigned int hashval = file_hashval(ino); | 2269 | unsigned int hashval = file_hashval(ino); |
2125 | 2270 | ||
2126 | fp = kmem_cache_alloc(file_slab, GFP_KERNEL); | 2271 | atomic_set(&fp->fi_ref, 1); |
2127 | if (fp) { | 2272 | INIT_LIST_HEAD(&fp->fi_hash); |
2128 | atomic_set(&fp->fi_ref, 1); | 2273 | INIT_LIST_HEAD(&fp->fi_stateids); |
2129 | INIT_LIST_HEAD(&fp->fi_hash); | 2274 | INIT_LIST_HEAD(&fp->fi_delegations); |
2130 | INIT_LIST_HEAD(&fp->fi_stateids); | 2275 | fp->fi_inode = igrab(ino); |
2131 | INIT_LIST_HEAD(&fp->fi_delegations); | 2276 | fp->fi_had_conflict = false; |
2132 | fp->fi_inode = igrab(ino); | 2277 | fp->fi_lease = NULL; |
2133 | fp->fi_id = current_fileid++; | 2278 | memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); |
2134 | fp->fi_had_conflict = false; | 2279 | memset(fp->fi_access, 0, sizeof(fp->fi_access)); |
2135 | fp->fi_lease = NULL; | 2280 | spin_lock(&recall_lock); |
2136 | memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); | 2281 | list_add(&fp->fi_hash, &file_hashtbl[hashval]); |
2137 | memset(fp->fi_access, 0, sizeof(fp->fi_access)); | 2282 | spin_unlock(&recall_lock); |
2138 | spin_lock(&recall_lock); | ||
2139 | list_add(&fp->fi_hash, &file_hashtbl[hashval]); | ||
2140 | spin_unlock(&recall_lock); | ||
2141 | return fp; | ||
2142 | } | ||
2143 | return NULL; | ||
2144 | } | 2283 | } |
2145 | 2284 | ||
2146 | static void | 2285 | static void |
@@ -2155,7 +2294,8 @@ nfsd4_free_slab(struct kmem_cache **slab) | |||
2155 | void | 2294 | void |
2156 | nfsd4_free_slabs(void) | 2295 | nfsd4_free_slabs(void) |
2157 | { | 2296 | { |
2158 | nfsd4_free_slab(&stateowner_slab); | 2297 | nfsd4_free_slab(&openowner_slab); |
2298 | nfsd4_free_slab(&lockowner_slab); | ||
2159 | nfsd4_free_slab(&file_slab); | 2299 | nfsd4_free_slab(&file_slab); |
2160 | nfsd4_free_slab(&stateid_slab); | 2300 | nfsd4_free_slab(&stateid_slab); |
2161 | nfsd4_free_slab(&deleg_slab); | 2301 | nfsd4_free_slab(&deleg_slab); |
@@ -2164,16 +2304,20 @@ nfsd4_free_slabs(void) | |||
2164 | static int | 2304 | static int |
2165 | nfsd4_init_slabs(void) | 2305 | nfsd4_init_slabs(void) |
2166 | { | 2306 | { |
2167 | stateowner_slab = kmem_cache_create("nfsd4_stateowners", | 2307 | openowner_slab = kmem_cache_create("nfsd4_openowners", |
2168 | sizeof(struct nfs4_stateowner), 0, 0, NULL); | 2308 | sizeof(struct nfs4_openowner), 0, 0, NULL); |
2169 | if (stateowner_slab == NULL) | 2309 | if (openowner_slab == NULL) |
2310 | goto out_nomem; | ||
2311 | lockowner_slab = kmem_cache_create("nfsd4_lockowners", | ||
2312 | sizeof(struct nfs4_lockowner), 0, 0, NULL); | ||
2313 | if (lockowner_slab == NULL) | ||
2170 | goto out_nomem; | 2314 | goto out_nomem; |
2171 | file_slab = kmem_cache_create("nfsd4_files", | 2315 | file_slab = kmem_cache_create("nfsd4_files", |
2172 | sizeof(struct nfs4_file), 0, 0, NULL); | 2316 | sizeof(struct nfs4_file), 0, 0, NULL); |
2173 | if (file_slab == NULL) | 2317 | if (file_slab == NULL) |
2174 | goto out_nomem; | 2318 | goto out_nomem; |
2175 | stateid_slab = kmem_cache_create("nfsd4_stateids", | 2319 | stateid_slab = kmem_cache_create("nfsd4_stateids", |
2176 | sizeof(struct nfs4_stateid), 0, 0, NULL); | 2320 | sizeof(struct nfs4_ol_stateid), 0, 0, NULL); |
2177 | if (stateid_slab == NULL) | 2321 | if (stateid_slab == NULL) |
2178 | goto out_nomem; | 2322 | goto out_nomem; |
2179 | deleg_slab = kmem_cache_create("nfsd4_delegations", | 2323 | deleg_slab = kmem_cache_create("nfsd4_delegations", |
@@ -2187,97 +2331,94 @@ out_nomem: | |||
2187 | return -ENOMEM; | 2331 | return -ENOMEM; |
2188 | } | 2332 | } |
2189 | 2333 | ||
2190 | void | 2334 | void nfs4_free_openowner(struct nfs4_openowner *oo) |
2191 | nfs4_free_stateowner(struct kref *kref) | ||
2192 | { | 2335 | { |
2193 | struct nfs4_stateowner *sop = | 2336 | kfree(oo->oo_owner.so_owner.data); |
2194 | container_of(kref, struct nfs4_stateowner, so_ref); | 2337 | kmem_cache_free(openowner_slab, oo); |
2195 | kfree(sop->so_owner.data); | ||
2196 | kmem_cache_free(stateowner_slab, sop); | ||
2197 | } | 2338 | } |
2198 | 2339 | ||
2199 | static inline struct nfs4_stateowner * | 2340 | void nfs4_free_lockowner(struct nfs4_lockowner *lo) |
2200 | alloc_stateowner(struct xdr_netobj *owner) | ||
2201 | { | 2341 | { |
2202 | struct nfs4_stateowner *sop; | 2342 | kfree(lo->lo_owner.so_owner.data); |
2343 | kmem_cache_free(lockowner_slab, lo); | ||
2344 | } | ||
2203 | 2345 | ||
2204 | if ((sop = kmem_cache_alloc(stateowner_slab, GFP_KERNEL))) { | 2346 | static void init_nfs4_replay(struct nfs4_replay *rp) |
2205 | if ((sop->so_owner.data = kmalloc(owner->len, GFP_KERNEL))) { | 2347 | { |
2206 | memcpy(sop->so_owner.data, owner->data, owner->len); | 2348 | rp->rp_status = nfserr_serverfault; |
2207 | sop->so_owner.len = owner->len; | 2349 | rp->rp_buflen = 0; |
2208 | kref_init(&sop->so_ref); | 2350 | rp->rp_buf = rp->rp_ibuf; |
2209 | return sop; | ||
2210 | } | ||
2211 | kmem_cache_free(stateowner_slab, sop); | ||
2212 | } | ||
2213 | return NULL; | ||
2214 | } | 2351 | } |
2215 | 2352 | ||
2216 | static struct nfs4_stateowner * | 2353 | static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) |
2217 | alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { | 2354 | { |
2218 | struct nfs4_stateowner *sop; | 2355 | struct nfs4_stateowner *sop; |
2219 | struct nfs4_replay *rp; | ||
2220 | unsigned int idhashval; | ||
2221 | 2356 | ||
2222 | if (!(sop = alloc_stateowner(&open->op_owner))) | 2357 | sop = kmem_cache_alloc(slab, GFP_KERNEL); |
2358 | if (!sop) | ||
2359 | return NULL; | ||
2360 | |||
2361 | sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); | ||
2362 | if (!sop->so_owner.data) { | ||
2363 | kmem_cache_free(slab, sop); | ||
2223 | return NULL; | 2364 | return NULL; |
2224 | idhashval = ownerid_hashval(current_ownerid); | 2365 | } |
2225 | INIT_LIST_HEAD(&sop->so_idhash); | 2366 | sop->so_owner.len = owner->len; |
2226 | INIT_LIST_HEAD(&sop->so_strhash); | 2367 | |
2227 | INIT_LIST_HEAD(&sop->so_perclient); | ||
2228 | INIT_LIST_HEAD(&sop->so_stateids); | 2368 | INIT_LIST_HEAD(&sop->so_stateids); |
2229 | INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ | ||
2230 | INIT_LIST_HEAD(&sop->so_close_lru); | ||
2231 | sop->so_time = 0; | ||
2232 | list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); | ||
2233 | list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); | ||
2234 | list_add(&sop->so_perclient, &clp->cl_openowners); | ||
2235 | sop->so_is_open_owner = 1; | ||
2236 | sop->so_id = current_ownerid++; | ||
2237 | sop->so_client = clp; | 2369 | sop->so_client = clp; |
2238 | sop->so_seqid = open->op_seqid; | 2370 | init_nfs4_replay(&sop->so_replay); |
2239 | sop->so_confirmed = 0; | ||
2240 | rp = &sop->so_replay; | ||
2241 | rp->rp_status = nfserr_serverfault; | ||
2242 | rp->rp_buflen = 0; | ||
2243 | rp->rp_buf = rp->rp_ibuf; | ||
2244 | return sop; | 2371 | return sop; |
2245 | } | 2372 | } |
2246 | 2373 | ||
2247 | static inline void | 2374 | static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) |
2248 | init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { | 2375 | { |
2249 | struct nfs4_stateowner *sop = open->op_stateowner; | 2376 | list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); |
2250 | unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); | 2377 | list_add(&oo->oo_perclient, &clp->cl_openowners); |
2378 | } | ||
2251 | 2379 | ||
2252 | INIT_LIST_HEAD(&stp->st_hash); | 2380 | static struct nfs4_openowner * |
2253 | INIT_LIST_HEAD(&stp->st_perstateowner); | 2381 | alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { |
2382 | struct nfs4_openowner *oo; | ||
2383 | |||
2384 | oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); | ||
2385 | if (!oo) | ||
2386 | return NULL; | ||
2387 | oo->oo_owner.so_is_open_owner = 1; | ||
2388 | oo->oo_owner.so_seqid = open->op_seqid; | ||
2389 | oo->oo_flags = NFS4_OO_NEW; | ||
2390 | oo->oo_time = 0; | ||
2391 | oo->oo_last_closed_stid = NULL; | ||
2392 | INIT_LIST_HEAD(&oo->oo_close_lru); | ||
2393 | hash_openowner(oo, clp, strhashval); | ||
2394 | return oo; | ||
2395 | } | ||
2396 | |||
2397 | static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { | ||
2398 | struct nfs4_openowner *oo = open->op_openowner; | ||
2399 | struct nfs4_client *clp = oo->oo_owner.so_client; | ||
2400 | |||
2401 | init_stid(&stp->st_stid, clp, NFS4_OPEN_STID); | ||
2254 | INIT_LIST_HEAD(&stp->st_lockowners); | 2402 | INIT_LIST_HEAD(&stp->st_lockowners); |
2255 | INIT_LIST_HEAD(&stp->st_perfile); | 2403 | list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); |
2256 | list_add(&stp->st_hash, &stateid_hashtbl[hashval]); | ||
2257 | list_add(&stp->st_perstateowner, &sop->so_stateids); | ||
2258 | list_add(&stp->st_perfile, &fp->fi_stateids); | 2404 | list_add(&stp->st_perfile, &fp->fi_stateids); |
2259 | stp->st_stateowner = sop; | 2405 | stp->st_stateowner = &oo->oo_owner; |
2260 | get_nfs4_file(fp); | 2406 | get_nfs4_file(fp); |
2261 | stp->st_file = fp; | 2407 | stp->st_file = fp; |
2262 | stp->st_stateid.si_boot = boot_time; | ||
2263 | stp->st_stateid.si_stateownerid = sop->so_id; | ||
2264 | stp->st_stateid.si_fileid = fp->fi_id; | ||
2265 | stp->st_stateid.si_generation = 0; | ||
2266 | stp->st_access_bmap = 0; | 2408 | stp->st_access_bmap = 0; |
2267 | stp->st_deny_bmap = 0; | 2409 | stp->st_deny_bmap = 0; |
2268 | __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, | 2410 | __set_bit(open->op_share_access, &stp->st_access_bmap); |
2269 | &stp->st_access_bmap); | ||
2270 | __set_bit(open->op_share_deny, &stp->st_deny_bmap); | 2411 | __set_bit(open->op_share_deny, &stp->st_deny_bmap); |
2271 | stp->st_openstp = NULL; | 2412 | stp->st_openstp = NULL; |
2272 | } | 2413 | } |
2273 | 2414 | ||
2274 | static void | 2415 | static void |
2275 | move_to_close_lru(struct nfs4_stateowner *sop) | 2416 | move_to_close_lru(struct nfs4_openowner *oo) |
2276 | { | 2417 | { |
2277 | dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); | 2418 | dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); |
2278 | 2419 | ||
2279 | list_move_tail(&sop->so_close_lru, &close_lru); | 2420 | list_move_tail(&oo->oo_close_lru, &close_lru); |
2280 | sop->so_time = get_seconds(); | 2421 | oo->oo_time = get_seconds(); |
2281 | } | 2422 | } |
2282 | 2423 | ||
2283 | static int | 2424 | static int |
@@ -2289,14 +2430,18 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, | |||
2289 | (sop->so_client->cl_clientid.cl_id == clid->cl_id); | 2430 | (sop->so_client->cl_clientid.cl_id == clid->cl_id); |
2290 | } | 2431 | } |
2291 | 2432 | ||
2292 | static struct nfs4_stateowner * | 2433 | static struct nfs4_openowner * |
2293 | find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) | 2434 | find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open) |
2294 | { | 2435 | { |
2295 | struct nfs4_stateowner *so = NULL; | 2436 | struct nfs4_stateowner *so; |
2437 | struct nfs4_openowner *oo; | ||
2296 | 2438 | ||
2297 | list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { | 2439 | list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { |
2298 | if (same_owner_str(so, &open->op_owner, &open->op_clientid)) | 2440 | if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { |
2299 | return so; | 2441 | oo = openowner(so); |
2442 | renew_client(oo->oo_owner.so_client); | ||
2443 | return oo; | ||
2444 | } | ||
2300 | } | 2445 | } |
2301 | return NULL; | 2446 | return NULL; |
2302 | } | 2447 | } |
@@ -2320,31 +2465,6 @@ find_file(struct inode *ino) | |||
2320 | return NULL; | 2465 | return NULL; |
2321 | } | 2466 | } |
2322 | 2467 | ||
2323 | static inline int access_valid(u32 x, u32 minorversion) | ||
2324 | { | ||
2325 | if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) | ||
2326 | return 0; | ||
2327 | if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) | ||
2328 | return 0; | ||
2329 | x &= ~NFS4_SHARE_ACCESS_MASK; | ||
2330 | if (minorversion && x) { | ||
2331 | if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) | ||
2332 | return 0; | ||
2333 | if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) | ||
2334 | return 0; | ||
2335 | x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); | ||
2336 | } | ||
2337 | if (x) | ||
2338 | return 0; | ||
2339 | return 1; | ||
2340 | } | ||
2341 | |||
2342 | static inline int deny_valid(u32 x) | ||
2343 | { | ||
2344 | /* Note: unlike access bits, deny bits may be zero. */ | ||
2345 | return x <= NFS4_SHARE_DENY_BOTH; | ||
2346 | } | ||
2347 | |||
2348 | /* | 2468 | /* |
2349 | * Called to check deny when READ with all zero stateid or | 2469 | * Called to check deny when READ with all zero stateid or |
2350 | * WRITE with all zero or all one stateid | 2470 | * WRITE with all zero or all one stateid |
@@ -2354,7 +2474,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) | |||
2354 | { | 2474 | { |
2355 | struct inode *ino = current_fh->fh_dentry->d_inode; | 2475 | struct inode *ino = current_fh->fh_dentry->d_inode; |
2356 | struct nfs4_file *fp; | 2476 | struct nfs4_file *fp; |
2357 | struct nfs4_stateid *stp; | 2477 | struct nfs4_ol_stateid *stp; |
2358 | __be32 ret; | 2478 | __be32 ret; |
2359 | 2479 | ||
2360 | dprintk("NFSD: nfs4_share_conflict\n"); | 2480 | dprintk("NFSD: nfs4_share_conflict\n"); |
@@ -2429,6 +2549,16 @@ static const struct lock_manager_operations nfsd_lease_mng_ops = { | |||
2429 | .lm_change = nfsd_change_deleg_cb, | 2549 | .lm_change = nfsd_change_deleg_cb, |
2430 | }; | 2550 | }; |
2431 | 2551 | ||
2552 | static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) | ||
2553 | { | ||
2554 | if (nfsd4_has_session(cstate)) | ||
2555 | return nfs_ok; | ||
2556 | if (seqid == so->so_seqid - 1) | ||
2557 | return nfserr_replay_me; | ||
2558 | if (seqid == so->so_seqid) | ||
2559 | return nfs_ok; | ||
2560 | return nfserr_bad_seqid; | ||
2561 | } | ||
2432 | 2562 | ||
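[Editorial aside, not part of the patch] nfsd4_check_seqid() condenses the NFSv4.0 seqid protocol: the owner's previous seqid marks a retransmission to be answered from the replay cache, the current seqid is the expected next request, and anything else is out of sequence; with NFSv4.1 sessions the check is skipped because sessions do their own replay handling. A hedged sketch of the same rule, with hypothetical names:

#include <stdio.h>

enum seq_result { SEQ_OK, SEQ_REPLAY, SEQ_BAD };

static enum seq_result check_seqid(int has_session, unsigned int seqid,
				   unsigned int so_seqid)
{
	if (has_session)
		return SEQ_OK;		/* v4.1+: sessions handle replay */
	if (seqid == so_seqid - 1)
		return SEQ_REPLAY;	/* retransmission of the last request */
	if (seqid == so_seqid)
		return SEQ_OK;		/* the expected next request */
	return SEQ_BAD;			/* out of sequence */
}

int main(void)
{
	printf("%d %d %d\n",
	       check_seqid(0, 7, 7),	/* 0: SEQ_OK */
	       check_seqid(0, 6, 7),	/* 1: SEQ_REPLAY */
	       check_seqid(0, 9, 7));	/* 2: SEQ_BAD */
	return 0;
}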
2433 | __be32 | 2563 | __be32 |
2434 | nfsd4_process_open1(struct nfsd4_compound_state *cstate, | 2564 | nfsd4_process_open1(struct nfsd4_compound_state *cstate, |
@@ -2437,57 +2567,49 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, | |||
2437 | clientid_t *clientid = &open->op_clientid; | 2567 | clientid_t *clientid = &open->op_clientid; |
2438 | struct nfs4_client *clp = NULL; | 2568 | struct nfs4_client *clp = NULL; |
2439 | unsigned int strhashval; | 2569 | unsigned int strhashval; |
2440 | struct nfs4_stateowner *sop = NULL; | 2570 | struct nfs4_openowner *oo = NULL; |
2441 | 2571 | __be32 status; | |
2442 | if (!check_name(open->op_owner)) | ||
2443 | return nfserr_inval; | ||
2444 | 2572 | ||
2445 | if (STALE_CLIENTID(&open->op_clientid)) | 2573 | if (STALE_CLIENTID(&open->op_clientid)) |
2446 | return nfserr_stale_clientid; | 2574 | return nfserr_stale_clientid; |
2575 | /* | ||
2576 | * In case we need it later, after we've already created the | ||
2577 | * file and don't want to risk a further failure: | ||
2578 | */ | ||
2579 | open->op_file = nfsd4_alloc_file(); | ||
2580 | if (open->op_file == NULL) | ||
2581 | return nfserr_jukebox; | ||
2447 | 2582 | ||
2448 | strhashval = ownerstr_hashval(clientid->cl_id, open->op_owner); | 2583 | strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); |
2449 | sop = find_openstateowner_str(strhashval, open); | 2584 | oo = find_openstateowner_str(strhashval, open); |
2450 | open->op_stateowner = sop; | 2585 | open->op_openowner = oo; |
2451 | if (!sop) { | 2586 | if (!oo) { |
2452 | /* Make sure the client's lease hasn't expired. */ | ||
2453 | clp = find_confirmed_client(clientid); | 2587 | clp = find_confirmed_client(clientid); |
2454 | if (clp == NULL) | 2588 | if (clp == NULL) |
2455 | return nfserr_expired; | 2589 | return nfserr_expired; |
2456 | goto renew; | 2590 | goto new_owner; |
2457 | } | 2591 | } |
2458 | /* When sessions are used, skip open sequenceid processing */ | 2592 | if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { |
2459 | if (nfsd4_has_session(cstate)) | ||
2460 | goto renew; | ||
2461 | if (!sop->so_confirmed) { | ||
2462 | /* Replace unconfirmed owners without checking for replay. */ | 2593 | /* Replace unconfirmed owners without checking for replay. */ |
2463 | clp = sop->so_client; | 2594 | clp = oo->oo_owner.so_client; |
2464 | release_openowner(sop); | 2595 | release_openowner(oo); |
2465 | open->op_stateowner = NULL; | 2596 | open->op_openowner = NULL; |
2466 | goto renew; | 2597 | goto new_owner; |
2467 | } | ||
2468 | if (open->op_seqid == sop->so_seqid - 1) { | ||
2469 | if (sop->so_replay.rp_buflen) | ||
2470 | return nfserr_replay_me; | ||
2471 | /* The original OPEN failed so spectacularly | ||
2472 | * that we don't even have replay data saved! | ||
2473 | * Therefore, we have no choice but to continue | ||
2474 | * processing this OPEN; presumably, we'll | ||
2475 | * fail again for the same reason. | ||
2476 | */ | ||
2477 | dprintk("nfsd4_process_open1: replay with no replay cache\n"); | ||
2478 | goto renew; | ||
2479 | } | ||
2480 | if (open->op_seqid != sop->so_seqid) | ||
2481 | return nfserr_bad_seqid; | ||
2482 | renew: | ||
2483 | if (open->op_stateowner == NULL) { | ||
2484 | sop = alloc_init_open_stateowner(strhashval, clp, open); | ||
2485 | if (sop == NULL) | ||
2486 | return nfserr_resource; | ||
2487 | open->op_stateowner = sop; | ||
2488 | } | 2598 | } |
2489 | list_del_init(&sop->so_close_lru); | 2599 | status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); |
2490 | renew_client(sop->so_client); | 2600 | if (status) |
2601 | return status; | ||
2602 | clp = oo->oo_owner.so_client; | ||
2603 | goto alloc_stateid; | ||
2604 | new_owner: | ||
2605 | oo = alloc_init_open_stateowner(strhashval, clp, open); | ||
2606 | if (oo == NULL) | ||
2607 | return nfserr_jukebox; | ||
2608 | open->op_openowner = oo; | ||
2609 | alloc_stateid: | ||
2610 | open->op_stp = nfs4_alloc_stateid(clp); | ||
2611 | if (!open->op_stp) | ||
2612 | return nfserr_jukebox; | ||
2491 | return nfs_ok; | 2613 | return nfs_ok; |
2492 | } | 2614 | } |
2493 | 2615 | ||
@@ -2500,36 +2622,37 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) | |||
2500 | return nfs_ok; | 2622 | return nfs_ok; |
2501 | } | 2623 | } |
2502 | 2624 | ||
2503 | static struct nfs4_delegation * | 2625 | static int share_access_to_flags(u32 share_access) |
2504 | find_delegation_file(struct nfs4_file *fp, stateid_t *stid) | ||
2505 | { | 2626 | { |
2506 | struct nfs4_delegation *dp; | 2627 | share_access &= ~NFS4_SHARE_WANT_MASK; |
2507 | 2628 | ||
2508 | spin_lock(&recall_lock); | 2629 | return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; |
2509 | list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) | ||
2510 | if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { | ||
2511 | spin_unlock(&recall_lock); | ||
2512 | return dp; | ||
2513 | } | ||
2514 | spin_unlock(&recall_lock); | ||
2515 | return NULL; | ||
2516 | } | 2630 | } |
2517 | 2631 | ||
2518 | static int share_access_to_flags(u32 share_access) | 2632 | static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) |
2519 | { | 2633 | { |
2520 | share_access &= ~NFS4_SHARE_WANT_MASK; | 2634 | struct nfs4_stid *ret; |
2521 | 2635 | ||
2522 | return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; | 2636 | ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); |
2637 | if (!ret) | ||
2638 | return NULL; | ||
2639 | return delegstateid(ret); | ||
2640 | } | ||
2641 | |||
2642 | static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) | ||
2643 | { | ||
2644 | return open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR || | ||
2645 | open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH; | ||
2523 | } | 2646 | } |
2524 | 2647 | ||
2525 | static __be32 | 2648 | static __be32 |
2526 | nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, | 2649 | nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, |
2527 | struct nfs4_delegation **dp) | 2650 | struct nfs4_delegation **dp) |
2528 | { | 2651 | { |
2529 | int flags; | 2652 | int flags; |
2530 | __be32 status = nfserr_bad_stateid; | 2653 | __be32 status = nfserr_bad_stateid; |
2531 | 2654 | ||
2532 | *dp = find_delegation_file(fp, &open->op_delegate_stateid); | 2655 | *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); |
2533 | if (*dp == NULL) | 2656 | if (*dp == NULL) |
2534 | goto out; | 2657 | goto out; |
2535 | flags = share_access_to_flags(open->op_share_access); | 2658 | flags = share_access_to_flags(open->op_share_access); |
@@ -2537,41 +2660,37 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, | |||
2537 | if (status) | 2660 | if (status) |
2538 | *dp = NULL; | 2661 | *dp = NULL; |
2539 | out: | 2662 | out: |
2540 | if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) | 2663 | if (!nfsd4_is_deleg_cur(open)) |
2541 | return nfs_ok; | 2664 | return nfs_ok; |
2542 | if (status) | 2665 | if (status) |
2543 | return status; | 2666 | return status; |
2544 | open->op_stateowner->so_confirmed = 1; | 2667 | open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; |
2545 | return nfs_ok; | 2668 | return nfs_ok; |
2546 | } | 2669 | } |
2547 | 2670 | ||
2548 | static __be32 | 2671 | static __be32 |
2549 | nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) | 2672 | nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) |
2550 | { | 2673 | { |
2551 | struct nfs4_stateid *local; | 2674 | struct nfs4_ol_stateid *local; |
2552 | __be32 status = nfserr_share_denied; | 2675 | struct nfs4_openowner *oo = open->op_openowner; |
2553 | struct nfs4_stateowner *sop = open->op_stateowner; | ||
2554 | 2676 | ||
2555 | list_for_each_entry(local, &fp->fi_stateids, st_perfile) { | 2677 | list_for_each_entry(local, &fp->fi_stateids, st_perfile) { |
2556 | /* ignore lock owners */ | 2678 | /* ignore lock owners */ |
2557 | if (local->st_stateowner->so_is_open_owner == 0) | 2679 | if (local->st_stateowner->so_is_open_owner == 0) |
2558 | continue; | 2680 | continue; |
2559 | /* remember if we have seen this open owner */ | 2681 | /* remember if we have seen this open owner */ |
2560 | if (local->st_stateowner == sop) | 2682 | if (local->st_stateowner == &oo->oo_owner) |
2561 | *stpp = local; | 2683 | *stpp = local; |
2562 | /* check for conflicting share reservations */ | 2684 | /* check for conflicting share reservations */ |
2563 | if (!test_share(local, open)) | 2685 | if (!test_share(local, open)) |
2564 | goto out; | 2686 | return nfserr_share_denied; |
2565 | } | 2687 | } |
2566 | status = 0; | 2688 | return nfs_ok; |
2567 | out: | ||
2568 | return status; | ||
2569 | } | 2689 | } |
2570 | 2690 | ||
2571 | static inline struct nfs4_stateid * | 2691 | static void nfs4_free_stateid(struct nfs4_ol_stateid *s) |
2572 | nfs4_alloc_stateid(void) | ||
2573 | { | 2692 | { |
2574 | return kmem_cache_alloc(stateid_slab, GFP_KERNEL); | 2693 | kmem_cache_free(stateid_slab, s); |
2575 | } | 2694 | } |
2576 | 2695 | ||
2577 | static inline int nfs4_access_to_access(u32 nfs4_access) | 2696 | static inline int nfs4_access_to_access(u32 nfs4_access) |
@@ -2592,12 +2711,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, | |||
2592 | int oflag = nfs4_access_to_omode(open->op_share_access); | 2711 | int oflag = nfs4_access_to_omode(open->op_share_access); |
2593 | int access = nfs4_access_to_access(open->op_share_access); | 2712 | int access = nfs4_access_to_access(open->op_share_access); |
2594 | 2713 | ||
2595 | /* CLAIM_DELEGATE_CUR is used in response to a broken lease; | ||
2596 | * allowing it to break the lease and return EAGAIN leaves the | ||
2597 | * client unable to make progress in returning the delegation */ | ||
2598 | if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) | ||
2599 | access |= NFSD_MAY_NOT_BREAK_LEASE; | ||
2600 | |||
2601 | if (!fp->fi_fds[oflag]) { | 2714 | if (!fp->fi_fds[oflag]) { |
2602 | status = nfsd_open(rqstp, cur_fh, S_IFREG, access, | 2715 | status = nfsd_open(rqstp, cur_fh, S_IFREG, access, |
2603 | &fp->fi_fds[oflag]); | 2716 | &fp->fi_fds[oflag]); |
@@ -2609,27 +2722,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, | |||
2609 | return nfs_ok; | 2722 | return nfs_ok; |
2610 | } | 2723 | } |
2611 | 2724 | ||
2612 | static __be32 | ||
2613 | nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, | ||
2614 | struct nfs4_file *fp, struct svc_fh *cur_fh, | ||
2615 | struct nfsd4_open *open) | ||
2616 | { | ||
2617 | struct nfs4_stateid *stp; | ||
2618 | __be32 status; | ||
2619 | |||
2620 | stp = nfs4_alloc_stateid(); | ||
2621 | if (stp == NULL) | ||
2622 | return nfserr_resource; | ||
2623 | |||
2624 | status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); | ||
2625 | if (status) { | ||
2626 | kmem_cache_free(stateid_slab, stp); | ||
2627 | return status; | ||
2628 | } | ||
2629 | *stpp = stp; | ||
2630 | return 0; | ||
2631 | } | ||
2632 | |||
2633 | static inline __be32 | 2725 | static inline __be32 |
2634 | nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, | 2726 | nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, |
2635 | struct nfsd4_open *open) | 2727 | struct nfsd4_open *open) |
@@ -2646,9 +2738,9 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, | |||
2646 | } | 2738 | } |
2647 | 2739 | ||
2648 | static __be32 | 2740 | static __be32 |
2649 | nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) | 2741 | nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) |
2650 | { | 2742 | { |
2651 | u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; | 2743 | u32 op_share_access = open->op_share_access; |
2652 | bool new_access; | 2744 | bool new_access; |
2653 | __be32 status; | 2745 | __be32 status; |
2654 | 2746 | ||
@@ -2677,8 +2769,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c | |||
2677 | static void | 2769 | static void |
2678 | nfs4_set_claim_prev(struct nfsd4_open *open) | 2770 | nfs4_set_claim_prev(struct nfsd4_open *open) |
2679 | { | 2771 | { |
2680 | open->op_stateowner->so_confirmed = 1; | 2772 | open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; |
2681 | open->op_stateowner->so_client->cl_firststate = 1; | 2773 | open->op_openowner->oo_owner.so_client->cl_firststate = 1; |
2682 | } | 2774 | } |
2683 | 2775 | ||
2684 | /* Should we give out recallable state?: */ | 2776 | /* Should we give out recallable state?: */ |
@@ -2721,7 +2813,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag) | |||
2721 | if (!fl) | 2813 | if (!fl) |
2722 | return -ENOMEM; | 2814 | return -ENOMEM; |
2723 | fl->fl_file = find_readable_file(fp); | 2815 | fl->fl_file = find_readable_file(fp); |
2724 | list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); | 2816 | list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); |
2725 | status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); | 2817 | status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); |
2726 | if (status) { | 2818 | if (status) { |
2727 | list_del_init(&dp->dl_perclnt); | 2819 | list_del_init(&dp->dl_perclnt); |
@@ -2750,7 +2842,7 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) | |||
2750 | atomic_inc(&fp->fi_delegees); | 2842 | atomic_inc(&fp->fi_delegees); |
2751 | list_add(&dp->dl_perfile, &fp->fi_delegations); | 2843 | list_add(&dp->dl_perfile, &fp->fi_delegations); |
2752 | spin_unlock(&recall_lock); | 2844 | spin_unlock(&recall_lock); |
2753 | list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations); | 2845 | list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); |
2754 | return 0; | 2846 | return 0; |
2755 | } | 2847 | } |
2756 | 2848 | ||
@@ -2758,14 +2850,14 @@ static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) | |||
2758 | * Attempt to hand out a delegation. | 2850 | * Attempt to hand out a delegation. |
2759 | */ | 2851 | */ |
2760 | static void | 2852 | static void |
2761 | nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_stateid *stp) | 2853 | nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_ol_stateid *stp) |
2762 | { | 2854 | { |
2763 | struct nfs4_delegation *dp; | 2855 | struct nfs4_delegation *dp; |
2764 | struct nfs4_stateowner *sop = stp->st_stateowner; | 2856 | struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); |
2765 | int cb_up; | 2857 | int cb_up; |
2766 | int status, flag = 0; | 2858 | int status, flag = 0; |
2767 | 2859 | ||
2768 | cb_up = nfsd4_cb_channel_good(sop->so_client); | 2860 | cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); |
2769 | flag = NFS4_OPEN_DELEGATE_NONE; | 2861 | flag = NFS4_OPEN_DELEGATE_NONE; |
2770 | open->op_recall = 0; | 2862 | open->op_recall = 0; |
2771 | switch (open->op_claim_type) { | 2863 | switch (open->op_claim_type) { |
@@ -2781,7 +2873,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta | |||
2781 | * had the chance to reclaim theirs.... */ | 2873 | * had the chance to reclaim theirs.... */ |
2782 | if (locks_in_grace()) | 2874 | if (locks_in_grace()) |
2783 | goto out; | 2875 | goto out; |
2784 | if (!cb_up || !sop->so_confirmed) | 2876 | if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) |
2785 | goto out; | 2877 | goto out; |
2786 | if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) | 2878 | if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) |
2787 | flag = NFS4_OPEN_DELEGATE_WRITE; | 2879 | flag = NFS4_OPEN_DELEGATE_WRITE; |
@@ -2792,17 +2884,17 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta | |||
2792 | goto out; | 2884 | goto out; |
2793 | } | 2885 | } |
2794 | 2886 | ||
2795 | dp = alloc_init_deleg(sop->so_client, stp, fh, flag); | 2887 | dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag); |
2796 | if (dp == NULL) | 2888 | if (dp == NULL) |
2797 | goto out_no_deleg; | 2889 | goto out_no_deleg; |
2798 | status = nfs4_set_delegation(dp, flag); | 2890 | status = nfs4_set_delegation(dp, flag); |
2799 | if (status) | 2891 | if (status) |
2800 | goto out_free; | 2892 | goto out_free; |
2801 | 2893 | ||
2802 | memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); | 2894 | memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); |
2803 | 2895 | ||
2804 | dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", | 2896 | dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", |
2805 | STATEID_VAL(&dp->dl_stateid)); | 2897 | STATEID_VAL(&dp->dl_stid.sc_stateid)); |
2806 | out: | 2898 | out: |
2807 | if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS | 2899 | if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS |
2808 | && flag == NFS4_OPEN_DELEGATE_NONE | 2900 | && flag == NFS4_OPEN_DELEGATE_NONE |
@@ -2824,16 +2916,13 @@ __be32 | |||
2824 | nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) | 2916 | nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) |
2825 | { | 2917 | { |
2826 | struct nfsd4_compoundres *resp = rqstp->rq_resp; | 2918 | struct nfsd4_compoundres *resp = rqstp->rq_resp; |
2919 | struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; | ||
2827 | struct nfs4_file *fp = NULL; | 2920 | struct nfs4_file *fp = NULL; |
2828 | struct inode *ino = current_fh->fh_dentry->d_inode; | 2921 | struct inode *ino = current_fh->fh_dentry->d_inode; |
2829 | struct nfs4_stateid *stp = NULL; | 2922 | struct nfs4_ol_stateid *stp = NULL; |
2830 | struct nfs4_delegation *dp = NULL; | 2923 | struct nfs4_delegation *dp = NULL; |
2831 | __be32 status; | 2924 | __be32 status; |
2832 | 2925 | ||
2833 | status = nfserr_inval; | ||
2834 | if (!access_valid(open->op_share_access, resp->cstate.minorversion) | ||
2835 | || !deny_valid(open->op_share_deny)) | ||
2836 | goto out; | ||
2837 | /* | 2926 | /* |
2838 | * Lookup file; if found, lookup stateid and check open request, | 2927 | * Lookup file; if found, lookup stateid and check open request, |
2839 | * and check for delegations in the process of being recalled. | 2928 | * and check for delegations in the process of being recalled. |
@@ -2843,17 +2932,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf | |||
2843 | if (fp) { | 2932 | if (fp) { |
2844 | if ((status = nfs4_check_open(fp, open, &stp))) | 2933 | if ((status = nfs4_check_open(fp, open, &stp))) |
2845 | goto out; | 2934 | goto out; |
2846 | status = nfs4_check_deleg(fp, open, &dp); | 2935 | status = nfs4_check_deleg(cl, fp, open, &dp); |
2847 | if (status) | 2936 | if (status) |
2848 | goto out; | 2937 | goto out; |
2849 | } else { | 2938 | } else { |
2850 | status = nfserr_bad_stateid; | 2939 | status = nfserr_bad_stateid; |
2851 | if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) | 2940 | if (nfsd4_is_deleg_cur(open)) |
2852 | goto out; | ||
2853 | status = nfserr_resource; | ||
2854 | fp = alloc_init_file(ino); | ||
2855 | if (fp == NULL) | ||
2856 | goto out; | 2941 | goto out; |
2942 | status = nfserr_jukebox; | ||
2943 | fp = open->op_file; | ||
2944 | open->op_file = NULL; | ||
2945 | nfsd4_init_file(fp, ino); | ||
2857 | } | 2946 | } |
2858 | 2947 | ||
2859 | /* | 2948 | /* |
@@ -2865,24 +2954,24 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf | |||
2865 | status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); | 2954 | status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); |
2866 | if (status) | 2955 | if (status) |
2867 | goto out; | 2956 | goto out; |
2868 | update_stateid(&stp->st_stateid); | ||
2869 | } else { | 2957 | } else { |
2870 | status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); | 2958 | status = nfs4_get_vfs_file(rqstp, fp, current_fh, open); |
2871 | if (status) | 2959 | if (status) |
2872 | goto out; | 2960 | goto out; |
2873 | init_stateid(stp, fp, open); | 2961 | stp = open->op_stp; |
2962 | open->op_stp = NULL; | ||
2963 | init_open_stateid(stp, fp, open); | ||
2874 | status = nfsd4_truncate(rqstp, current_fh, open); | 2964 | status = nfsd4_truncate(rqstp, current_fh, open); |
2875 | if (status) { | 2965 | if (status) { |
2876 | release_open_stateid(stp); | 2966 | release_open_stateid(stp); |
2877 | goto out; | 2967 | goto out; |
2878 | } | 2968 | } |
2879 | if (nfsd4_has_session(&resp->cstate)) | ||
2880 | update_stateid(&stp->st_stateid); | ||
2881 | } | 2969 | } |
2882 | memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); | 2970 | update_stateid(&stp->st_stid.sc_stateid); |
2971 | memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); | ||
2883 | 2972 | ||
2884 | if (nfsd4_has_session(&resp->cstate)) | 2973 | if (nfsd4_has_session(&resp->cstate)) |
2885 | open->op_stateowner->so_confirmed = 1; | 2974 | open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; |
2886 | 2975 | ||
2887 | /* | 2976 | /* |
2888 | * Attempt to hand out a delegation. No error return, because the | 2977 | * Attempt to hand out a delegation. No error return, because the |
@@ -2893,7 +2982,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf | |||
2893 | status = nfs_ok; | 2982 | status = nfs_ok; |
2894 | 2983 | ||
2895 | dprintk("%s: stateid=" STATEID_FMT "\n", __func__, | 2984 | dprintk("%s: stateid=" STATEID_FMT "\n", __func__, |
2896 | STATEID_VAL(&stp->st_stateid)); | 2985 | STATEID_VAL(&stp->st_stid.sc_stateid)); |
2897 | out: | 2986 | out: |
2898 | if (fp) | 2987 | if (fp) |
2899 | put_nfs4_file(fp); | 2988 | put_nfs4_file(fp); |
@@ -2903,13 +2992,34 @@ out: | |||
2903 | * To finish the open response, we just need to set the rflags. | 2992 | * To finish the open response, we just need to set the rflags. |
2904 | */ | 2993 | */ |
2905 | open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; | 2994 | open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; |
2906 | if (!open->op_stateowner->so_confirmed && | 2995 | if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && |
2907 | !nfsd4_has_session(&resp->cstate)) | 2996 | !nfsd4_has_session(&resp->cstate)) |
2908 | open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; | 2997 | open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; |
2909 | 2998 | ||
2910 | return status; | 2999 | return status; |
2911 | } | 3000 | } |
2912 | 3001 | ||
3002 | void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) | ||
3003 | { | ||
3004 | if (open->op_openowner) { | ||
3005 | struct nfs4_openowner *oo = open->op_openowner; | ||
3006 | |||
3007 | if (!list_empty(&oo->oo_owner.so_stateids)) | ||
3008 | list_del_init(&oo->oo_close_lru); | ||
3009 | if (oo->oo_flags & NFS4_OO_NEW) { | ||
3010 | if (status) { | ||
3011 | release_openowner(oo); | ||
3012 | open->op_openowner = NULL; | ||
3013 | } else | ||
3014 | oo->oo_flags &= ~NFS4_OO_NEW; | ||
3015 | } | ||
3016 | } | ||
3017 | if (open->op_file) | ||
3018 | nfsd4_free_file(open->op_file); | ||
3019 | if (open->op_stp) | ||
3020 | nfs4_free_stateid(open->op_stp); | ||
3021 | } | ||
3022 | |||
2913 | __be32 | 3023 | __be32 |
2914 | nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 3024 | nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
2915 | clientid_t *clid) | 3025 | clientid_t *clid) |
@@ -2930,7 +3040,6 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2930 | dprintk("nfsd4_renew: clientid not found!\n"); | 3040 | dprintk("nfsd4_renew: clientid not found!\n"); |
2931 | goto out; | 3041 | goto out; |
2932 | } | 3042 | } |
2933 | renew_client(clp); | ||
2934 | status = nfserr_cb_path_down; | 3043 | status = nfserr_cb_path_down; |
2935 | if (!list_empty(&clp->cl_delegations) | 3044 | if (!list_empty(&clp->cl_delegations) |
2936 | && clp->cl_cb_state != NFSD4_CB_UP) | 3045 | && clp->cl_cb_state != NFSD4_CB_UP) |
@@ -2962,7 +3071,7 @@ static time_t | |||
2962 | nfs4_laundromat(void) | 3071 | nfs4_laundromat(void) |
2963 | { | 3072 | { |
2964 | struct nfs4_client *clp; | 3073 | struct nfs4_client *clp; |
2965 | struct nfs4_stateowner *sop; | 3074 | struct nfs4_openowner *oo; |
2966 | struct nfs4_delegation *dp; | 3075 | struct nfs4_delegation *dp; |
2967 | struct list_head *pos, *next, reaplist; | 3076 | struct list_head *pos, *next, reaplist; |
2968 | time_t cutoff = get_seconds() - nfsd4_lease; | 3077 | time_t cutoff = get_seconds() - nfsd4_lease; |
@@ -3019,16 +3128,14 @@ nfs4_laundromat(void) | |||
3019 | } | 3128 | } |
3020 | test_val = nfsd4_lease; | 3129 | test_val = nfsd4_lease; |
3021 | list_for_each_safe(pos, next, &close_lru) { | 3130 | list_for_each_safe(pos, next, &close_lru) { |
3022 | sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); | 3131 | oo = container_of(pos, struct nfs4_openowner, oo_close_lru); |
3023 | if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) { | 3132 | if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { |
3024 | u = sop->so_time - cutoff; | 3133 | u = oo->oo_time - cutoff; |
3025 | if (test_val > u) | 3134 | if (test_val > u) |
3026 | test_val = u; | 3135 | test_val = u; |
3027 | break; | 3136 | break; |
3028 | } | 3137 | } |
3029 | dprintk("NFSD: purging unused open stateowner (so_id %d)\n", | 3138 | release_openowner(oo); |
3030 | sop->so_id); | ||
3031 | release_openowner(sop); | ||
3032 | } | 3139 | } |
3033 | if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) | 3140 | if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) |
3034 | clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; | 3141 | clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; |
@@ -3050,30 +3157,17 @@ laundromat_main(struct work_struct *not_used) | |||
3050 | queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); | 3157 | queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); |
3051 | } | 3158 | } |
3052 | 3159 | ||
3053 | static struct nfs4_stateowner * | 3160 | static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) |
3054 | search_close_lru(u32 st_id, int flags) | ||
3055 | { | 3161 | { |
3056 | struct nfs4_stateowner *local = NULL; | 3162 | if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) |
3057 | 3163 | return nfserr_bad_stateid; | |
3058 | if (flags & CLOSE_STATE) { | 3164 | return nfs_ok; |
3059 | list_for_each_entry(local, &close_lru, so_close_lru) { | ||
3060 | if (local->so_id == st_id) | ||
3061 | return local; | ||
3062 | } | ||
3063 | } | ||
3064 | return NULL; | ||
3065 | } | ||
3066 | |||
3067 | static inline int | ||
3068 | nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) | ||
3069 | { | ||
3070 | return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; | ||
3071 | } | 3165 | } |
3072 | 3166 | ||
3073 | static int | 3167 | static int |
3074 | STALE_STATEID(stateid_t *stateid) | 3168 | STALE_STATEID(stateid_t *stateid) |
3075 | { | 3169 | { |
3076 | if (stateid->si_boot == boot_time) | 3170 | if (stateid->si_opaque.so_clid.cl_boot == boot_time) |
3077 | return 0; | 3171 | return 0; |
3078 | dprintk("NFSD: stale stateid " STATEID_FMT "!\n", | 3172 | dprintk("NFSD: stale stateid " STATEID_FMT "!\n", |
3079 | STATEID_VAL(stateid)); | 3173 | STATEID_VAL(stateid)); |
@@ -3096,7 +3190,7 @@ access_permit_write(unsigned long access_bmap) | |||
3096 | } | 3190 | } |
3097 | 3191 | ||
3098 | static | 3192 | static |
3099 | __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) | 3193 | __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) |
3100 | { | 3194 | { |
3101 | __be32 status = nfserr_openmode; | 3195 | __be32 status = nfserr_openmode; |
3102 | 3196 | ||
@@ -3139,68 +3233,80 @@ grace_disallows_io(struct inode *inode) | |||
3139 | return locks_in_grace() && mandatory_lock(inode); | 3233 | return locks_in_grace() && mandatory_lock(inode); |
3140 | } | 3234 | } |
3141 | 3235 | ||
3142 | static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) | 3236 | /* Returns true iff a is later than b: */ |
3237 | static bool stateid_generation_after(stateid_t *a, stateid_t *b) | ||
3238 | { | ||
3239 | return (s32)a->si_generation - (s32)b->si_generation > 0; | ||
3240 | } | ||
3241 | |||
3242 | static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) | ||
3143 | { | 3243 | { |
3144 | /* | 3244 | /* |
3145 | * When sessions are used the stateid generation number is ignored | 3245 | * When sessions are used the stateid generation number is ignored |
3146 | * when it is zero. | 3246 | * when it is zero. |
3147 | */ | 3247 | */ |
3148 | if ((flags & HAS_SESSION) && in->si_generation == 0) | 3248 | if (has_session && in->si_generation == 0) |
3149 | goto out; | 3249 | return nfs_ok; |
3250 | |||
3251 | if (in->si_generation == ref->si_generation) | ||
3252 | return nfs_ok; | ||
3150 | 3253 | ||
3151 | /* If the client sends us a stateid from the future, it's buggy: */ | 3254 | /* If the client sends us a stateid from the future, it's buggy: */ |
3152 | if (in->si_generation > ref->si_generation) | 3255 | if (stateid_generation_after(in, ref)) |
3153 | return nfserr_bad_stateid; | 3256 | return nfserr_bad_stateid; |
3154 | /* | 3257 | /* |
3155 | * The following, however, can happen. For example, if the | 3258 | * However, we could see a stateid from the past, even from a |
3156 | * client sends an open and some IO at the same time, the open | 3259 | * non-buggy client. For example, if the client sends a lock |
3157 | * may bump si_generation while the IO is still in flight. | 3260 | * while some IO is outstanding, the lock may bump si_generation |
3158 | * Thanks to hard links and renames, the client never knows what | 3261 | * while the IO is still in flight. The client could avoid that |
3159 | * file an open will affect. So it could avoid that situation | 3262 | * situation by waiting for responses on all the IO requests, |
3160 | * only by serializing all opens and IO from the same open | 3263 | * but better performance may result in retrying IO that |
3161 | * owner. To recover from the old_stateid error, the client | 3264 | * receives an old_stateid error if requests are rarely |
3162 | * will just have to retry the IO: | 3265 | * reordered in flight: |
3163 | */ | 3266 | */ |
3164 | if (in->si_generation < ref->si_generation) | 3267 | return nfserr_old_stateid; |
3165 | return nfserr_old_stateid; | ||
3166 | out: | ||
3167 | return nfs_ok; | ||
3168 | } | 3268 | } |
3169 | 3269 | ||
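The signed subtraction in stateid_generation_after() is what keeps the ordering test correct once si_generation wraps past 2^31; the old unsigned compare would misorder stateids around the wrap point. A minimal standalone sketch of the idiom, with hypothetical values (the kernel version casts each operand to s32 before subtracting, which is equivalent in effect):

#include <stdio.h>
#include <stdint.h>

/* Serial-number comparison, as in stateid_generation_after():
 * nonzero iff a is later than b, even across u32 wraparound. */
static int generation_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	printf("%d\n", generation_after(5, 3));           /* 1: ordinary case */
	printf("%d\n", generation_after(1, 0xfffffffeu)); /* 1: wrapped past 0 */
	printf("%d\n", generation_after(3, 5));           /* 0: a is older */
	return 0;
}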
3170 | static int is_delegation_stateid(stateid_t *stateid) | 3270 | __be32 nfs4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) |
3171 | { | 3271 | { |
3172 | return stateid->si_fileid == 0; | 3272 | struct nfs4_stid *s; |
3173 | } | 3273 | struct nfs4_ol_stateid *ols; |
3274 | __be32 status; | ||
3174 | 3275 | ||
3175 | static int is_open_stateid(struct nfs4_stateid *stateid) | 3276 | if (STALE_STATEID(stateid)) |
3176 | { | 3277 | return nfserr_stale_stateid; |
3177 | return stateid->st_openstp == NULL; | 3278 | |
3279 | s = find_stateid(cl, stateid); | ||
3280 | if (!s) | ||
3281 | return nfserr_stale_stateid; | ||
3282 | status = check_stateid_generation(stateid, &s->sc_stateid, 1); | ||
3283 | if (status) | ||
3284 | return status; | ||
3285 | if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) | ||
3286 | return nfs_ok; | ||
3287 | ols = openlockstateid(s); | ||
3288 | if (ols->st_stateowner->so_is_open_owner | ||
3289 | && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) | ||
3290 | return nfserr_bad_stateid; | ||
3291 | return nfs_ok; | ||
3178 | } | 3292 | } |
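Much of the renaming in this file follows from one structural move: the fields common to every stateid were pulled into struct nfs4_stid, which the open/lock and delegation variants embed. An abridged sketch of the layering this code assumes (full definitions live in the state header; fields beyond these are omitted):

struct nfs4_stid {
	unsigned char		sc_type;	/* NFS4_{OPEN,LOCK,DELEG,CLOSED}_STID */
	stateid_t		sc_stateid;
	struct nfs4_client	*sc_client;
};

struct nfs4_ol_stateid {	/* open and lock stateids */
	struct nfs4_stid	st_stid;
	/* ... st_file, st_stateowner, st_access_bmap, st_openstp ... */
};

/* openlockstateid() and delegstateid() are container_of()-style
 * downcasts, safe only after sc_type has been checked. */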
3179 | 3293 | ||
3180 | __be32 nfs4_validate_stateid(stateid_t *stateid, int flags) | 3294 | static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s) |
3181 | { | 3295 | { |
3182 | struct nfs4_stateid *stp = NULL; | 3296 | struct nfs4_client *cl; |
3183 | __be32 status = nfserr_stale_stateid; | ||
3184 | 3297 | ||
3298 | if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) | ||
3299 | return nfserr_bad_stateid; | ||
3185 | if (STALE_STATEID(stateid)) | 3300 | if (STALE_STATEID(stateid)) |
3186 | goto out; | 3301 | return nfserr_stale_stateid; |
3187 | 3302 | cl = find_confirmed_client(&stateid->si_opaque.so_clid); | |
3188 | status = nfserr_expired; | 3303 | if (!cl) |
3189 | stp = search_for_stateid(stateid); | 3304 | return nfserr_expired; |
3190 | if (!stp) | 3305 | *s = find_stateid_by_type(cl, stateid, typemask); |
3191 | goto out; | 3306 | if (!*s) |
3192 | status = nfserr_bad_stateid; | 3307 | return nfserr_bad_stateid; |
3193 | 3308 | return nfs_ok; | |
3194 | if (!stp->st_stateowner->so_confirmed) | ||
3195 | goto out; | ||
3196 | |||
3197 | status = check_stateid_generation(stateid, &stp->st_stateid, flags); | ||
3198 | if (status) | ||
3199 | goto out; | ||
3200 | 3309 | ||
3201 | status = nfs_ok; | ||
3202 | out: | ||
3203 | return status; | ||
3204 | } | 3310 | } |
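nfsd4_lookup_stateid() becomes the single place that tells the failure modes apart (special stateid, stale boot time, expired client, unknown id), while the typemask lets each operation declare up front which stid flavors it accepts. A sketch of a caller, modeled on the nfsd4_delegreturn() hunk further down:

struct nfs4_stid *s;
struct nfs4_delegation *dp;
__be32 status;

/* DELEGRETURN accepts only delegation stateids: */
status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s);
if (status)
	return status;		/* bad/stale/expired, already distinguished */
dp = delegstateid(s);		/* downcast is safe: the typemask matched */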
3205 | 3311 | ||
3206 | /* | 3312 | /* |
@@ -3210,7 +3316,8 @@ __be32 | |||
3210 | nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, | 3316 | nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, |
3211 | stateid_t *stateid, int flags, struct file **filpp) | 3317 | stateid_t *stateid, int flags, struct file **filpp) |
3212 | { | 3318 | { |
3213 | struct nfs4_stateid *stp = NULL; | 3319 | struct nfs4_stid *s; |
3320 | struct nfs4_ol_stateid *stp = NULL; | ||
3214 | struct nfs4_delegation *dp = NULL; | 3321 | struct nfs4_delegation *dp = NULL; |
3215 | struct svc_fh *current_fh = &cstate->current_fh; | 3322 | struct svc_fh *current_fh = &cstate->current_fh; |
3216 | struct inode *ino = current_fh->fh_dentry->d_inode; | 3323 | struct inode *ino = current_fh->fh_dentry->d_inode; |
@@ -3222,60 +3329,47 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, | |||
3222 | if (grace_disallows_io(ino)) | 3329 | if (grace_disallows_io(ino)) |
3223 | return nfserr_grace; | 3330 | return nfserr_grace; |
3224 | 3331 | ||
3225 | if (nfsd4_has_session(cstate)) | ||
3226 | flags |= HAS_SESSION; | ||
3227 | |||
3228 | if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) | 3332 | if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) |
3229 | return check_special_stateids(current_fh, stateid, flags); | 3333 | return check_special_stateids(current_fh, stateid, flags); |
3230 | 3334 | ||
3231 | status = nfserr_stale_stateid; | 3335 | status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s); |
3232 | if (STALE_STATEID(stateid)) | 3336 | if (status) |
3337 | return status; | ||
3338 | status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); | ||
3339 | if (status) | ||
3233 | goto out; | 3340 | goto out; |
3234 | 3341 | switch (s->sc_type) { | |
3235 | /* | 3342 | case NFS4_DELEG_STID: |
3236 | * We assume that any stateid that has the current boot time, | 3343 | dp = delegstateid(s); |
3237 | * but that we can't find, is expired: | ||
3238 | */ | ||
3239 | status = nfserr_expired; | ||
3240 | if (is_delegation_stateid(stateid)) { | ||
3241 | dp = find_delegation_stateid(ino, stateid); | ||
3242 | if (!dp) | ||
3243 | goto out; | ||
3244 | status = check_stateid_generation(stateid, &dp->dl_stateid, | ||
3245 | flags); | ||
3246 | if (status) | ||
3247 | goto out; | ||
3248 | status = nfs4_check_delegmode(dp, flags); | 3344 | status = nfs4_check_delegmode(dp, flags); |
3249 | if (status) | 3345 | if (status) |
3250 | goto out; | 3346 | goto out; |
3251 | renew_client(dp->dl_client); | ||
3252 | if (filpp) { | 3347 | if (filpp) { |
3253 | *filpp = dp->dl_file->fi_deleg_file; | 3348 | *filpp = dp->dl_file->fi_deleg_file; |
3254 | BUG_ON(!*filpp); | 3349 | BUG_ON(!*filpp); |
3255 | } | 3350 | } |
3256 | } else { /* open or lock stateid */ | 3351 | break; |
3257 | stp = find_stateid(stateid, flags); | 3352 | case NFS4_OPEN_STID: |
3258 | if (!stp) | 3353 | case NFS4_LOCK_STID: |
3259 | goto out; | 3354 | stp = openlockstateid(s); |
3260 | status = nfserr_bad_stateid; | 3355 | status = nfs4_check_fh(current_fh, stp); |
3261 | if (nfs4_check_fh(current_fh, stp)) | ||
3262 | goto out; | ||
3263 | if (!stp->st_stateowner->so_confirmed) | ||
3264 | goto out; | ||
3265 | status = check_stateid_generation(stateid, &stp->st_stateid, | ||
3266 | flags); | ||
3267 | if (status) | 3356 | if (status) |
3268 | goto out; | 3357 | goto out; |
3358 | if (stp->st_stateowner->so_is_open_owner | ||
3359 | && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) | ||
3360 | goto out; | ||
3269 | status = nfs4_check_openmode(stp, flags); | 3361 | status = nfs4_check_openmode(stp, flags); |
3270 | if (status) | 3362 | if (status) |
3271 | goto out; | 3363 | goto out; |
3272 | renew_client(stp->st_stateowner->so_client); | ||
3273 | if (filpp) { | 3364 | if (filpp) { |
3274 | if (flags & RD_STATE) | 3365 | if (flags & RD_STATE) |
3275 | *filpp = find_readable_file(stp->st_file); | 3366 | *filpp = find_readable_file(stp->st_file); |
3276 | else | 3367 | else |
3277 | *filpp = find_writeable_file(stp->st_file); | 3368 | *filpp = find_writeable_file(stp->st_file); |
3278 | } | 3369 | } |
3370 | break; | ||
3371 | default: | ||
3372 | return nfserr_bad_stateid; | ||
3279 | } | 3373 | } |
3280 | status = nfs_ok; | 3374 | status = nfs_ok; |
3281 | out: | 3375 | out: |
@@ -3283,18 +3377,9 @@ out: | |||
3283 | } | 3377 | } |
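For context, a sketch of how a read-side consumer calls into this (modeled on the READ op; the actual caller is outside this hunk). The flags select both the open mode to enforce and which struct file to hand back:

struct file *filp = NULL;
__be32 status;

status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
				    RD_STATE, &filp);
/* On nfs_ok, filp is a readable file regardless of whether the client
 * sent an open stateid, a lock stateid, or a delegation stateid. */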
3284 | 3378 | ||
3285 | static __be32 | 3379 | static __be32 |
3286 | nfsd4_free_delegation_stateid(stateid_t *stateid) | 3380 | nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) |
3287 | { | 3381 | { |
3288 | struct nfs4_delegation *dp = search_for_delegation(stateid); | 3382 | if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) |
3289 | if (dp) | ||
3290 | return nfserr_locks_held; | ||
3291 | return nfserr_bad_stateid; | ||
3292 | } | ||
3293 | |||
3294 | static __be32 | ||
3295 | nfsd4_free_lock_stateid(struct nfs4_stateid *stp) | ||
3296 | { | ||
3297 | if (check_for_locks(stp->st_file, stp->st_stateowner)) | ||
3298 | return nfserr_locks_held; | 3383 | return nfserr_locks_held; |
3299 | release_lock_stateid(stp); | 3384 | release_lock_stateid(stp); |
3300 | return nfs_ok; | 3385 | return nfs_ok; |
@@ -3307,51 +3392,40 @@ __be32 | |||
3307 | nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 3392 | nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
3308 | struct nfsd4_test_stateid *test_stateid) | 3393 | struct nfsd4_test_stateid *test_stateid) |
3309 | { | 3394 | { |
3310 | test_stateid->ts_has_session = nfsd4_has_session(cstate); | 3395 | /* real work is done during encoding */ |
3311 | return nfs_ok; | 3396 | return nfs_ok; |
3312 | } | 3397 | } |
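The TEST_STATEID proc body can be empty because each submitted stateid has to be checked while the reply is built; nfs4_validate_stateid() above is the per-stateid worker the XDR encoder is expected to call. Roughly (the loop variable and result-array names here are illustrative, not the actual XDR structs):

for (i = 0; i < test_stateid->ts_num_ids; i++)
	/* one of nfs_ok, nfserr_bad_stateid, nfserr_stale_stateid,
	 * or nfserr_old_stateid per tested id: */
	results[i] = nfs4_validate_stateid(cl, &ids[i]);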
3313 | 3398 | ||
3314 | /* | ||
3315 | * Free a state id | ||
3316 | */ | ||
3317 | __be32 | 3399 | __be32 |
3318 | nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 3400 | nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
3319 | struct nfsd4_free_stateid *free_stateid) | 3401 | struct nfsd4_free_stateid *free_stateid) |
3320 | { | 3402 | { |
3321 | stateid_t *stateid = &free_stateid->fr_stateid; | 3403 | stateid_t *stateid = &free_stateid->fr_stateid; |
3322 | struct nfs4_stateid *stp; | 3404 | struct nfs4_stid *s; |
3323 | __be32 ret; | 3405 | struct nfs4_client *cl = cstate->session->se_client; |
3406 | __be32 ret = nfserr_bad_stateid; | ||
3324 | 3407 | ||
3325 | nfs4_lock_state(); | 3408 | nfs4_lock_state(); |
3326 | if (is_delegation_stateid(stateid)) { | 3409 | s = find_stateid(cl, stateid); |
3327 | ret = nfsd4_free_delegation_stateid(stateid); | 3410 | if (!s) |
3328 | goto out; | ||
3329 | } | ||
3330 | |||
3331 | stp = search_for_stateid(stateid); | ||
3332 | if (!stp) { | ||
3333 | ret = nfserr_bad_stateid; | ||
3334 | goto out; | 3411 | goto out; |
3335 | } | 3412 | switch (s->sc_type) { |
3336 | if (stateid->si_generation != 0) { | 3413 | case NFS4_DELEG_STID: |
3337 | if (stateid->si_generation < stp->st_stateid.si_generation) { | ||
3338 | ret = nfserr_old_stateid; | ||
3339 | goto out; | ||
3340 | } | ||
3341 | if (stateid->si_generation > stp->st_stateid.si_generation) { | ||
3342 | ret = nfserr_bad_stateid; | ||
3343 | goto out; | ||
3344 | } | ||
3345 | } | ||
3346 | |||
3347 | if (is_open_stateid(stp)) { | ||
3348 | ret = nfserr_locks_held; | 3414 | ret = nfserr_locks_held; |
3349 | goto out; | 3415 | goto out; |
3350 | } else { | 3416 | case NFS4_OPEN_STID: |
3351 | ret = nfsd4_free_lock_stateid(stp); | 3417 | case NFS4_LOCK_STID: |
3352 | goto out; | 3418 | ret = check_stateid_generation(stateid, &s->sc_stateid, 1); |
3419 | if (ret) | ||
3420 | goto out; | ||
3421 | if (s->sc_type == NFS4_LOCK_STID) | ||
3422 | ret = nfsd4_free_lock_stateid(openlockstateid(s)); | ||
3423 | else | ||
3424 | ret = nfserr_locks_held; | ||
3425 | break; | ||
3426 | default: | ||
3427 | ret = nfserr_bad_stateid; | ||
3353 | } | 3428 | } |
3354 | |||
3355 | out: | 3429 | out: |
3356 | nfs4_unlock_state(); | 3430 | nfs4_unlock_state(); |
3357 | return ret; | 3431 | return ret; |
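Summarized, the FREE_STATEID outcomes after this rework (a reading aid; every case is visible in the switch above):

/*
 *   stateid not found for this client   -> nfserr_bad_stateid
 *   NFS4_DELEG_STID                     -> nfserr_locks_held (use DELEGRETURN)
 *   NFS4_OPEN_STID                      -> nfserr_locks_held (use CLOSE)
 *   NFS4_LOCK_STID, locks outstanding   -> nfserr_locks_held
 *   NFS4_LOCK_STID, no locks            -> stateid released, nfs_ok
 *   generation mismatch                 -> nfserr_old_stateid / nfserr_bad_stateid
 *   any other type                      -> nfserr_bad_stateid
 */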
@@ -3364,124 +3438,64 @@ setlkflg (int type) | |||
3364 | RD_STATE : WR_STATE; | 3438 | RD_STATE : WR_STATE; |
3365 | } | 3439 | } |
3366 | 3440 | ||
3441 | static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_t *stateid, u32 seqid, struct nfs4_ol_stateid *stp) | ||
3442 | { | ||
3443 | struct svc_fh *current_fh = &cstate->current_fh; | ||
3444 | struct nfs4_stateowner *sop = stp->st_stateowner; | ||
3445 | __be32 status; | ||
3446 | |||
3447 | status = nfsd4_check_seqid(cstate, sop, seqid); | ||
3448 | if (status) | ||
3449 | return status; | ||
3450 | if (stp->st_stid.sc_type == NFS4_CLOSED_STID) | ||
3451 | /* | ||
3452 | * "Closed" stateid's exist *only* to return | ||
3453 | * nfserr_replay_me from the previous step. | ||
3454 | */ | ||
3455 | return nfserr_bad_stateid; | ||
3456 | status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); | ||
3457 | if (status) | ||
3458 | return status; | ||
3459 | return nfs4_check_fh(current_fh, stp); | ||
3460 | } | ||
3461 | |||
3367 | /* | 3462 | /* |
3368 | * Checks for sequence id mutating operations. | 3463 | * Checks for sequence id mutating operations. |
3369 | */ | 3464 | */ |
3370 | static __be32 | 3465 | static __be32 |
3371 | nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, | 3466 | nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, |
3372 | stateid_t *stateid, int flags, | 3467 | stateid_t *stateid, char typemask, |
3373 | struct nfs4_stateowner **sopp, | 3468 | struct nfs4_ol_stateid **stpp) |
3374 | struct nfs4_stateid **stpp, struct nfsd4_lock *lock) | ||
3375 | { | 3469 | { |
3376 | struct nfs4_stateid *stp; | ||
3377 | struct nfs4_stateowner *sop; | ||
3378 | struct svc_fh *current_fh = &cstate->current_fh; | ||
3379 | __be32 status; | 3470 | __be32 status; |
3471 | struct nfs4_stid *s; | ||
3380 | 3472 | ||
3381 | dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, | 3473 | dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, |
3382 | seqid, STATEID_VAL(stateid)); | 3474 | seqid, STATEID_VAL(stateid)); |
3383 | 3475 | ||
3384 | *stpp = NULL; | 3476 | *stpp = NULL; |
3385 | *sopp = NULL; | 3477 | status = nfsd4_lookup_stateid(stateid, typemask, &s); |
3386 | 3478 | if (status) | |
3387 | if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { | 3479 | return status; |
3388 | dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); | 3480 | *stpp = openlockstateid(s); |
3389 | return nfserr_bad_stateid; | 3481 | cstate->replay_owner = (*stpp)->st_stateowner; |
3390 | } | ||
3391 | |||
3392 | if (STALE_STATEID(stateid)) | ||
3393 | return nfserr_stale_stateid; | ||
3394 | |||
3395 | if (nfsd4_has_session(cstate)) | ||
3396 | flags |= HAS_SESSION; | ||
3397 | |||
3398 | /* | ||
3399 | * We return BAD_STATEID if filehandle doesn't match stateid, | ||
3400 | * the confirmed flag is incorrectly set, or the generation | ||
3401 | * number is incorrect. | ||
3402 | */ | ||
3403 | stp = find_stateid(stateid, flags); | ||
3404 | if (stp == NULL) { | ||
3405 | /* | ||
3406 | * Also, we should make sure this isn't just the result of | ||
3407 | * a replayed close: | ||
3408 | */ | ||
3409 | sop = search_close_lru(stateid->si_stateownerid, flags); | ||
3410 | /* It's not stale; let's assume it's expired: */ | ||
3411 | if (sop == NULL) | ||
3412 | return nfserr_expired; | ||
3413 | *sopp = sop; | ||
3414 | goto check_replay; | ||
3415 | } | ||
3416 | |||
3417 | *stpp = stp; | ||
3418 | *sopp = sop = stp->st_stateowner; | ||
3419 | |||
3420 | if (lock) { | ||
3421 | clientid_t *lockclid = &lock->v.new.clientid; | ||
3422 | struct nfs4_client *clp = sop->so_client; | ||
3423 | int lkflg = 0; | ||
3424 | __be32 status; | ||
3425 | |||
3426 | lkflg = setlkflg(lock->lk_type); | ||
3427 | |||
3428 | if (lock->lk_is_new) { | ||
3429 | if (!sop->so_is_open_owner) | ||
3430 | return nfserr_bad_stateid; | ||
3431 | if (!(flags & HAS_SESSION) && | ||
3432 | !same_clid(&clp->cl_clientid, lockclid)) | ||
3433 | return nfserr_bad_stateid; | ||
3434 | /* stp is the open stateid */ | ||
3435 | status = nfs4_check_openmode(stp, lkflg); | ||
3436 | if (status) | ||
3437 | return status; | ||
3438 | } else { | ||
3439 | /* stp is the lock stateid */ | ||
3440 | status = nfs4_check_openmode(stp->st_openstp, lkflg); | ||
3441 | if (status) | ||
3442 | return status; | ||
3443 | } | ||
3444 | } | ||
3445 | 3482 | ||
3446 | if (nfs4_check_fh(current_fh, stp)) { | 3483 | return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); |
3447 | dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); | 3484 | } |
3448 | return nfserr_bad_stateid; | ||
3449 | } | ||
3450 | 3485 | ||
3451 | /* | 3486 | static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) |
3452 | * We now validate the seqid and stateid generation numbers. | 3487 | { |
3453 | * For the moment, we ignore the possibility of | 3488 | __be32 status; |
3454 | * generation number wraparound. | 3489 | struct nfs4_openowner *oo; |
3455 | */ | ||
3456 | if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) | ||
3457 | goto check_replay; | ||
3458 | 3490 | ||
3459 | if (sop->so_confirmed && flags & CONFIRM) { | 3491 | status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, |
3460 | dprintk("NFSD: preprocess_seqid_op: expected" | 3492 | NFS4_OPEN_STID, stpp); |
3461 | " unconfirmed stateowner!\n"); | ||
3462 | return nfserr_bad_stateid; | ||
3463 | } | ||
3464 | if (!sop->so_confirmed && !(flags & CONFIRM)) { | ||
3465 | dprintk("NFSD: preprocess_seqid_op: stateowner not" | ||
3466 | " confirmed yet!\n"); | ||
3467 | return nfserr_bad_stateid; | ||
3468 | } | ||
3469 | status = check_stateid_generation(stateid, &stp->st_stateid, flags); | ||
3470 | if (status) | 3493 | if (status) |
3471 | return status; | 3494 | return status; |
3472 | renew_client(sop->so_client); | 3495 | oo = openowner((*stpp)->st_stateowner); |
3496 | if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) | ||
3497 | return nfserr_bad_stateid; | ||
3473 | return nfs_ok; | 3498 | return nfs_ok; |
3474 | |||
3475 | check_replay: | ||
3476 | if (seqid == sop->so_seqid - 1) { | ||
3477 | dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); | ||
3478 | /* indicate replay to calling function */ | ||
3479 | return nfserr_replay_me; | ||
3480 | } | ||
3481 | dprintk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", | ||
3482 | sop->so_seqid, seqid); | ||
3483 | *sopp = NULL; | ||
3484 | return nfserr_bad_seqid; | ||
3485 | } | 3499 | } |
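The retransmission handling that used to live in the check_replay: tail is now centralized in nfsd4_check_seqid(), defined earlier in this series. Its assumed shape, for reference:

static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate,
				struct nfs4_stateowner *so, u32 seqid)
{
	if (nfsd4_has_session(cstate))
		return nfs_ok;		 /* sessions handle replay themselves */
	if (seqid == so->so_seqid - 1)
		return nfserr_replay_me; /* retransmission: replay the reply */
	if (seqid == so->so_seqid)
		return nfs_ok;		 /* expected next request */
	return nfserr_bad_seqid;
}

Note that cstate->replay_owner is set before the checks run, so an nfserr_replay_me return can be answered from the owner's replay buffer; the NFS4_CLOSED_STID test exists only so a replayed CLOSE takes that path instead of finding live state.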
3486 | 3500 | ||
3487 | __be32 | 3501 | __be32 |
@@ -3489,8 +3503,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
3489 | struct nfsd4_open_confirm *oc) | 3503 | struct nfsd4_open_confirm *oc) |
3490 | { | 3504 | { |
3491 | __be32 status; | 3505 | __be32 status; |
3492 | struct nfs4_stateowner *sop; | 3506 | struct nfs4_openowner *oo; |
3493 | struct nfs4_stateid *stp; | 3507 | struct nfs4_ol_stateid *stp; |
3494 | 3508 | ||
3495 | dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", | 3509 | dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", |
3496 | (int)cstate->current_fh.fh_dentry->d_name.len, | 3510 | (int)cstate->current_fh.fh_dentry->d_name.len, |
@@ -3502,38 +3516,52 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
3502 | 3516 | ||
3503 | nfs4_lock_state(); | 3517 | nfs4_lock_state(); |
3504 | 3518 | ||
3505 | if ((status = nfs4_preprocess_seqid_op(cstate, | 3519 | status = nfs4_preprocess_seqid_op(cstate, |
3506 | oc->oc_seqid, &oc->oc_req_stateid, | 3520 | oc->oc_seqid, &oc->oc_req_stateid, |
3507 | CONFIRM | OPEN_STATE, | 3521 | NFS4_OPEN_STID, &stp); |
3508 | &oc->oc_stateowner, &stp, NULL))) | 3522 | if (status) |
3509 | goto out; | 3523 | goto out; |
3510 | 3524 | oo = openowner(stp->st_stateowner); | |
3511 | sop = oc->oc_stateowner; | 3525 | status = nfserr_bad_stateid; |
3512 | sop->so_confirmed = 1; | 3526 | if (oo->oo_flags & NFS4_OO_CONFIRMED) |
3513 | update_stateid(&stp->st_stateid); | 3527 | goto out; |
3514 | memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t)); | 3528 | oo->oo_flags |= NFS4_OO_CONFIRMED; |
3529 | update_stateid(&stp->st_stid.sc_stateid); | ||
3530 | memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); | ||
3515 | dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", | 3531 | dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", |
3516 | __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid)); | 3532 | __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); |
3517 | 3533 | ||
3518 | nfsd4_create_clid_dir(sop->so_client); | 3534 | nfsd4_create_clid_dir(oo->oo_owner.so_client); |
3535 | status = nfs_ok; | ||
3519 | out: | 3536 | out: |
3520 | if (oc->oc_stateowner) { | 3537 | if (!cstate->replay_owner) |
3521 | nfs4_get_stateowner(oc->oc_stateowner); | 3538 | nfs4_unlock_state(); |
3522 | cstate->replay_owner = oc->oc_stateowner; | ||
3523 | } | ||
3524 | nfs4_unlock_state(); | ||
3525 | return status; | 3539 | return status; |
3526 | } | 3540 | } |
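Two behavioural points hide in this hunk: confirming an already-confirmed open now fails with nfserr_bad_stateid rather than silently succeeding, and the state lock is deliberately left held when a replay owner was recorded. The convention shared by all the seqid-mutating ops below is roughly:

out:
	/* If preprocessing stashed cstate->replay_owner, the reply encoder
	 * finishes the replay and drops the state lock afterwards: */
	if (!cstate->replay_owner)
		nfs4_unlock_state();
	return status;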
3527 | 3541 | ||
3528 | static inline void nfs4_file_downgrade(struct nfs4_stateid *stp, unsigned int to_access) | 3542 | static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) |
3529 | { | 3543 | { |
3530 | int i; | 3544 | if (!test_bit(access, &stp->st_access_bmap)) |
3545 | return; | ||
3546 | nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); | ||
3547 | __clear_bit(access, &stp->st_access_bmap); | ||
3548 | } | ||
3531 | 3549 | ||
3532 | for (i = 1; i < 4; i++) { | 3550 | static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) |
3533 | if (test_bit(i, &stp->st_access_bmap) && !(i & to_access)) { | 3551 | { |
3534 | nfs4_file_put_access(stp->st_file, i); | 3552 | switch (to_access) { |
3535 | __clear_bit(i, &stp->st_access_bmap); | 3553 | case NFS4_SHARE_ACCESS_READ: |
3536 | } | 3554 | nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_WRITE); |
3555 | nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); | ||
3556 | break; | ||
3557 | case NFS4_SHARE_ACCESS_WRITE: | ||
3558 | nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_READ); | ||
3559 | nfs4_stateid_downgrade_bit(stp, NFS4_SHARE_ACCESS_BOTH); | ||
3560 | break; | ||
3561 | case NFS4_SHARE_ACCESS_BOTH: | ||
3562 | break; | ||
3563 | default: | ||
3564 | BUG(); | ||
3537 | } | 3565 | } |
3538 | } | 3566 | } |
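Concretely, with the NFS4_SHARE_ACCESS_* values (READ=1, WRITE=2, BOTH=3) doubling as bit indices into st_access_bmap, a downgrade of a READ|WRITE open to READ-only plays out like this:

/* stp->st_access_bmap may have the READ, WRITE and/or BOTH bits set. */
nfs4_stateid_downgrade(stp, NFS4_SHARE_ACCESS_READ);
/* -> clears the WRITE and BOTH bits if set, dropping one file
 *    reference per cleared bit via nfs4_file_put_access();
 *    a set READ bit survives untouched. */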
3539 | 3567 | ||
@@ -3553,24 +3581,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, | |||
3553 | struct nfsd4_open_downgrade *od) | 3581 | struct nfsd4_open_downgrade *od) |
3554 | { | 3582 | { |
3555 | __be32 status; | 3583 | __be32 status; |
3556 | struct nfs4_stateid *stp; | 3584 | struct nfs4_ol_stateid *stp; |
3557 | 3585 | ||
3558 | dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", | 3586 | dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", |
3559 | (int)cstate->current_fh.fh_dentry->d_name.len, | 3587 | (int)cstate->current_fh.fh_dentry->d_name.len, |
3560 | cstate->current_fh.fh_dentry->d_name.name); | 3588 | cstate->current_fh.fh_dentry->d_name.name); |
3561 | 3589 | ||
3562 | if (!access_valid(od->od_share_access, cstate->minorversion) | 3590 | /* We don't yet support WANT bits: */ |
3563 | || !deny_valid(od->od_share_deny)) | 3591 | od->od_share_access &= NFS4_SHARE_ACCESS_MASK; |
3564 | return nfserr_inval; | ||
3565 | 3592 | ||
3566 | nfs4_lock_state(); | 3593 | nfs4_lock_state(); |
3567 | if ((status = nfs4_preprocess_seqid_op(cstate, | 3594 | status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, |
3568 | od->od_seqid, | 3595 | &od->od_stateid, &stp); |
3569 | &od->od_stateid, | 3596 | if (status) |
3570 | OPEN_STATE, | ||
3571 | &od->od_stateowner, &stp, NULL))) | ||
3572 | goto out; | 3597 | goto out; |
3573 | |||
3574 | status = nfserr_inval; | 3598 | status = nfserr_inval; |
3575 | if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { | 3599 | if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { |
3576 | dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", | 3600 | dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", |
@@ -3582,22 +3606,45 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, | |||
3582 | stp->st_deny_bmap, od->od_share_deny); | 3606 | stp->st_deny_bmap, od->od_share_deny); |
3583 | goto out; | 3607 | goto out; |
3584 | } | 3608 | } |
3585 | nfs4_file_downgrade(stp, od->od_share_access); | 3609 | nfs4_stateid_downgrade(stp, od->od_share_access); |
3586 | 3610 | ||
3587 | reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); | 3611 | reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); |
3588 | 3612 | ||
3589 | update_stateid(&stp->st_stateid); | 3613 | update_stateid(&stp->st_stid.sc_stateid); |
3590 | memcpy(&od->od_stateid, &stp->st_stateid, sizeof(stateid_t)); | 3614 | memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); |
3591 | status = nfs_ok; | 3615 | status = nfs_ok; |
3592 | out: | 3616 | out: |
3593 | if (od->od_stateowner) { | 3617 | if (!cstate->replay_owner) |
3594 | nfs4_get_stateowner(od->od_stateowner); | 3618 | nfs4_unlock_state(); |
3595 | cstate->replay_owner = od->od_stateowner; | ||
3596 | } | ||
3597 | nfs4_unlock_state(); | ||
3598 | return status; | 3619 | return status; |
3599 | } | 3620 | } |
3600 | 3621 | ||
3622 | void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) | ||
3623 | { | ||
3624 | struct nfs4_openowner *oo; | ||
3625 | struct nfs4_ol_stateid *s; | ||
3626 | |||
3627 | if (!so->so_is_open_owner) | ||
3628 | return; | ||
3629 | oo = openowner(so); | ||
3630 | s = oo->oo_last_closed_stid; | ||
3631 | if (!s) | ||
3632 | return; | ||
3633 | if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { | ||
3634 | /* Release the last_closed_stid on the next seqid bump: */ | ||
3635 | oo->oo_flags |= NFS4_OO_PURGE_CLOSE; | ||
3636 | return; | ||
3637 | } | ||
3638 | oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; | ||
3639 | release_last_closed_stateid(oo); | ||
3640 | } | ||
3641 | |||
3642 | static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) | ||
3643 | { | ||
3644 | unhash_open_stateid(s); | ||
3645 | s->st_stid.sc_type = NFS4_CLOSED_STID; | ||
3646 | } | ||
3647 | |||
3601 | /* | 3648 | /* |
3602 | * nfs4_unlock_state() called after encode | 3649 | * nfs4_unlock_state() called after encode |
3603 | */ | 3650 | */ |
@@ -3606,39 +3653,37 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
3606 | struct nfsd4_close *close) | 3653 | struct nfsd4_close *close) |
3607 | { | 3654 | { |
3608 | __be32 status; | 3655 | __be32 status; |
3609 | struct nfs4_stateid *stp; | 3656 | struct nfs4_openowner *oo; |
3657 | struct nfs4_ol_stateid *stp; | ||
3610 | 3658 | ||
3611 | dprintk("NFSD: nfsd4_close on file %.*s\n", | 3659 | dprintk("NFSD: nfsd4_close on file %.*s\n", |
3612 | (int)cstate->current_fh.fh_dentry->d_name.len, | 3660 | (int)cstate->current_fh.fh_dentry->d_name.len, |
3613 | cstate->current_fh.fh_dentry->d_name.name); | 3661 | cstate->current_fh.fh_dentry->d_name.name); |
3614 | 3662 | ||
3615 | nfs4_lock_state(); | 3663 | nfs4_lock_state(); |
3616 | /* check close_lru for replay */ | 3664 | status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, |
3617 | if ((status = nfs4_preprocess_seqid_op(cstate, | 3665 | &close->cl_stateid, |
3618 | close->cl_seqid, | 3666 | NFS4_OPEN_STID|NFS4_CLOSED_STID, |
3619 | &close->cl_stateid, | 3667 | &stp); |
3620 | OPEN_STATE | CLOSE_STATE, | 3668 | if (status) |
3621 | &close->cl_stateowner, &stp, NULL))) | ||
3622 | goto out; | 3669 | goto out; |
3670 | oo = openowner(stp->st_stateowner); | ||
3623 | status = nfs_ok; | 3671 | status = nfs_ok; |
3624 | update_stateid(&stp->st_stateid); | 3672 | update_stateid(&stp->st_stid.sc_stateid); |
3625 | memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); | 3673 | memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); |
3626 | 3674 | ||
3627 | /* release_stateid() calls nfsd_close() if needed */ | 3675 | nfsd4_close_open_stateid(stp); |
3628 | release_open_stateid(stp); | 3676 | oo->oo_last_closed_stid = stp; |
3629 | 3677 | ||
3630 | /* place unused nfs4_stateowners on so_close_lru list to be | 3678 | /* place unused nfs4_stateowners on so_close_lru list to be |
3631 | * released by the laundromat service after the lease period | 3679 | * released by the laundromat service after the lease period |
3632 | * to enable us to handle CLOSE replay | 3680 | * to enable us to handle CLOSE replay |
3633 | */ | 3681 | */ |
3634 | if (list_empty(&close->cl_stateowner->so_stateids)) | 3682 | if (list_empty(&oo->oo_owner.so_stateids)) |
3635 | move_to_close_lru(close->cl_stateowner); | 3683 | move_to_close_lru(oo); |
3636 | out: | 3684 | out: |
3637 | if (close->cl_stateowner) { | 3685 | if (!cstate->replay_owner) |
3638 | nfs4_get_stateowner(close->cl_stateowner); | 3686 | nfs4_unlock_state(); |
3639 | cstate->replay_owner = close->cl_stateowner; | ||
3640 | } | ||
3641 | nfs4_unlock_state(); | ||
3642 | return status; | 3687 | return status; |
3643 | } | 3688 | } |
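Taken together with nfsd4_purge_closed_stateid() above, the new CLOSE path keeps the stateid itself around just long enough to service a retransmission, where the old code could only find the openowner on the close_lru. The intended timeline, as a sketch:

/* CLOSE           -> oo_last_closed_stid = stp, sc_type = NFS4_CLOSED_STID */
/* next seqid bump -> purge pass 1: arm NFS4_OO_PURGE_CLOSE, keep stp       */
/* CLOSE replay    -> CLOSED stid found; the seqid check answers it with    */
/*                    nfserr_replay_me from the replay buffer               */
/* next seqid bump -> purge pass 2: release_last_closed_stateid(oo)         */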
3644 | 3689 | ||
@@ -3648,34 +3693,22 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
3648 | { | 3693 | { |
3649 | struct nfs4_delegation *dp; | 3694 | struct nfs4_delegation *dp; |
3650 | stateid_t *stateid = &dr->dr_stateid; | 3695 | stateid_t *stateid = &dr->dr_stateid; |
3696 | struct nfs4_stid *s; | ||
3651 | struct inode *inode; | 3697 | struct inode *inode; |
3652 | __be32 status; | 3698 | __be32 status; |
3653 | int flags = 0; | ||
3654 | 3699 | ||
3655 | if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) | 3700 | if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) |
3656 | return status; | 3701 | return status; |
3657 | inode = cstate->current_fh.fh_dentry->d_inode; | 3702 | inode = cstate->current_fh.fh_dentry->d_inode; |
3658 | 3703 | ||
3659 | if (nfsd4_has_session(cstate)) | ||
3660 | flags |= HAS_SESSION; | ||
3661 | nfs4_lock_state(); | 3704 | nfs4_lock_state(); |
3662 | status = nfserr_bad_stateid; | 3705 | status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s); |
3663 | if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) | 3706 | if (status) |
3664 | goto out; | ||
3665 | status = nfserr_stale_stateid; | ||
3666 | if (STALE_STATEID(stateid)) | ||
3667 | goto out; | ||
3668 | status = nfserr_bad_stateid; | ||
3669 | if (!is_delegation_stateid(stateid)) | ||
3670 | goto out; | ||
3671 | status = nfserr_expired; | ||
3672 | dp = find_delegation_stateid(inode, stateid); | ||
3673 | if (!dp) | ||
3674 | goto out; | 3707 | goto out; |
3675 | status = check_stateid_generation(stateid, &dp->dl_stateid, flags); | 3708 | dp = delegstateid(s); |
3709 | status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); | ||
3676 | if (status) | 3710 | if (status) |
3677 | goto out; | 3711 | goto out; |
3678 | renew_client(dp->dl_client); | ||
3679 | 3712 | ||
3680 | unhash_delegation(dp); | 3713 | unhash_delegation(dp); |
3681 | out: | 3714 | out: |
@@ -3713,9 +3746,6 @@ last_byte_offset(u64 start, u64 len) | |||
3713 | return end > start ? end - 1: NFS4_MAX_UINT64; | 3746 | return end > start ? end - 1: NFS4_MAX_UINT64; |
3714 | } | 3747 | } |
3715 | 3748 | ||
3716 | #define lockownerid_hashval(id) \ | ||
3717 | ((id) & LOCK_HASH_MASK) | ||
3718 | |||
3719 | static inline unsigned int | 3749 | static inline unsigned int |
3720 | lock_ownerstr_hashval(struct inode *inode, u32 cl_id, | 3750 | lock_ownerstr_hashval(struct inode *inode, u32 cl_id, |
3721 | struct xdr_netobj *ownername) | 3751 | struct xdr_netobj *ownername) |
@@ -3725,101 +3755,7 @@ lock_ownerstr_hashval(struct inode *inode, u32 cl_id, | |||
3725 | & LOCK_HASH_MASK; | 3755 | & LOCK_HASH_MASK; |
3726 | } | 3756 | } |
3727 | 3757 | ||
3728 | static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; | ||
3729 | static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; | 3758 | static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; |
3730 | static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; | ||
3731 | |||
3732 | static int | ||
3733 | same_stateid(stateid_t *id_one, stateid_t *id_two) | ||
3734 | { | ||
3735 | if (id_one->si_stateownerid != id_two->si_stateownerid) | ||
3736 | return 0; | ||
3737 | return id_one->si_fileid == id_two->si_fileid; | ||
3738 | } | ||
3739 | |||
3740 | static struct nfs4_stateid * | ||
3741 | find_stateid(stateid_t *stid, int flags) | ||
3742 | { | ||
3743 | struct nfs4_stateid *local; | ||
3744 | u32 st_id = stid->si_stateownerid; | ||
3745 | u32 f_id = stid->si_fileid; | ||
3746 | unsigned int hashval; | ||
3747 | |||
3748 | dprintk("NFSD: find_stateid flags 0x%x\n",flags); | ||
3749 | if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { | ||
3750 | hashval = stateid_hashval(st_id, f_id); | ||
3751 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { | ||
3752 | if ((local->st_stateid.si_stateownerid == st_id) && | ||
3753 | (local->st_stateid.si_fileid == f_id)) | ||
3754 | return local; | ||
3755 | } | ||
3756 | } | ||
3757 | |||
3758 | if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { | ||
3759 | hashval = stateid_hashval(st_id, f_id); | ||
3760 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { | ||
3761 | if ((local->st_stateid.si_stateownerid == st_id) && | ||
3762 | (local->st_stateid.si_fileid == f_id)) | ||
3763 | return local; | ||
3764 | } | ||
3765 | } | ||
3766 | return NULL; | ||
3767 | } | ||
3768 | |||
3769 | static struct nfs4_stateid * | ||
3770 | search_for_stateid(stateid_t *stid) | ||
3771 | { | ||
3772 | struct nfs4_stateid *local; | ||
3773 | unsigned int hashval = stateid_hashval(stid->si_stateownerid, stid->si_fileid); | ||
3774 | |||
3775 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { | ||
3776 | if (same_stateid(&local->st_stateid, stid)) | ||
3777 | return local; | ||
3778 | } | ||
3779 | |||
3780 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { | ||
3781 | if (same_stateid(&local->st_stateid, stid)) | ||
3782 | return local; | ||
3783 | } | ||
3784 | return NULL; | ||
3785 | } | ||
3786 | |||
3787 | static struct nfs4_delegation * | ||
3788 | search_for_delegation(stateid_t *stid) | ||
3789 | { | ||
3790 | struct nfs4_file *fp; | ||
3791 | struct nfs4_delegation *dp; | ||
3792 | struct list_head *pos; | ||
3793 | int i; | ||
3794 | |||
3795 | for (i = 0; i < FILE_HASH_SIZE; i++) { | ||
3796 | list_for_each_entry(fp, &file_hashtbl[i], fi_hash) { | ||
3797 | list_for_each(pos, &fp->fi_delegations) { | ||
3798 | dp = list_entry(pos, struct nfs4_delegation, dl_perfile); | ||
3799 | if (same_stateid(&dp->dl_stateid, stid)) | ||
3800 | return dp; | ||
3801 | } | ||
3802 | } | ||
3803 | } | ||
3804 | return NULL; | ||
3805 | } | ||
3806 | |||
3807 | static struct nfs4_delegation * | ||
3808 | find_delegation_stateid(struct inode *ino, stateid_t *stid) | ||
3809 | { | ||
3810 | struct nfs4_file *fp; | ||
3811 | struct nfs4_delegation *dl; | ||
3812 | |||
3813 | dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__, | ||
3814 | STATEID_VAL(stid)); | ||
3815 | |||
3816 | fp = find_file(ino); | ||
3817 | if (!fp) | ||
3818 | return NULL; | ||
3819 | dl = find_delegation_file(fp, stid); | ||
3820 | put_nfs4_file(fp); | ||
3821 | return dl; | ||
3822 | } | ||
3823 | 3759 | ||
3824 | /* | 3760 | /* |
3825 | * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that | 3761 | * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that |
@@ -3846,15 +3782,21 @@ static const struct lock_manager_operations nfsd_posix_mng_ops = { | |||
3846 | static inline void | 3782 | static inline void |
3847 | nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) | 3783 | nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) |
3848 | { | 3784 | { |
3849 | struct nfs4_stateowner *sop; | 3785 | struct nfs4_lockowner *lo; |
3850 | 3786 | ||
3851 | if (fl->fl_lmops == &nfsd_posix_mng_ops) { | 3787 | if (fl->fl_lmops == &nfsd_posix_mng_ops) { |
3852 | sop = (struct nfs4_stateowner *) fl->fl_owner; | 3788 | lo = (struct nfs4_lockowner *) fl->fl_owner; |
3853 | kref_get(&sop->so_ref); | 3789 | deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, |
3854 | deny->ld_sop = sop; | 3790 | lo->lo_owner.so_owner.len, GFP_KERNEL); |
3855 | deny->ld_clientid = sop->so_client->cl_clientid; | 3791 | if (!deny->ld_owner.data) |
3792 | /* We just don't care that much */ | ||
3793 | goto nevermind; | ||
3794 | deny->ld_owner.len = lo->lo_owner.so_owner.len; | ||
3795 | deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; | ||
3856 | } else { | 3796 | } else { |
3857 | deny->ld_sop = NULL; | 3797 | nevermind: |
3798 | deny->ld_owner.len = 0; | ||
3799 | deny->ld_owner.data = NULL; | ||
3858 | deny->ld_clientid.cl_boot = 0; | 3800 | deny->ld_clientid.cl_boot = 0; |
3859 | deny->ld_clientid.cl_id = 0; | 3801 | deny->ld_clientid.cl_id = 0; |
3860 | } | 3802 | } |
@@ -3867,8 +3809,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) | |||
3867 | deny->ld_type = NFS4_WRITE_LT; | 3809 | deny->ld_type = NFS4_WRITE_LT; |
3868 | } | 3810 | } |
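The denied-lock response used to pin the whole nfs4_stateowner with a kref just to carry its name; now only the name bytes are duplicated, and an allocation failure degrades to an anonymous owner instead of adding a failure mode to a LOCK that is already being denied. The corollary for the XDR layer (an assumption about code outside this hunk) is that the encoder owns the copy:

/* encoder-side cleanup this change implies: */
kfree(deny->ld_owner.data);	/* kfree(NULL) is a no-op */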
3869 | 3811 | ||
3870 | static struct nfs4_stateowner * | 3812 | static struct nfs4_lockowner * |
3871 | find_lockstateowner_str(struct inode *inode, clientid_t *clid, | 3813 | find_lockowner_str(struct inode *inode, clientid_t *clid, |
3872 | struct xdr_netobj *owner) | 3814 | struct xdr_netobj *owner) |
3873 | { | 3815 | { |
3874 | unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); | 3816 | unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); |
@@ -3876,11 +3818,17 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, | |||
3876 | 3818 | ||
3877 | list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { | 3819 | list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { |
3878 | if (same_owner_str(op, owner, clid)) | 3820 | if (same_owner_str(op, owner, clid)) |
3879 | return op; | 3821 | return lockowner(op); |
3880 | } | 3822 | } |
3881 | return NULL; | 3823 | return NULL; |
3882 | } | 3824 | } |
3883 | 3825 | ||
3826 | static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) | ||
3827 | { | ||
3828 | list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); | ||
3829 | list_add(&lo->lo_perstateid, &open_stp->st_lockowners); | ||
3830 | } | ||
3831 | |||
3884 | /* | 3832 | /* |
3885 | * Alloc a lock owner structure. | 3833 | * Alloc a lock owner structure. |
3886 | * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has | 3834 | * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has |
@@ -3889,67 +3837,40 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, | |||
3889 | * strhashval = lock_ownerstr_hashval | 3837 | * strhashval = lock_ownerstr_hashval |
3890 | */ | 3838 | */ |
3891 | 3839 | ||
3892 | static struct nfs4_stateowner * | 3840 | static struct nfs4_lockowner * |
3893 | alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_stateid *open_stp, struct nfsd4_lock *lock) { | 3841 | alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { |
3894 | struct nfs4_stateowner *sop; | 3842 | struct nfs4_lockowner *lo; |
3895 | struct nfs4_replay *rp; | ||
3896 | unsigned int idhashval; | ||
3897 | 3843 | ||
3898 | if (!(sop = alloc_stateowner(&lock->lk_new_owner))) | 3844 | lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); |
3845 | if (!lo) | ||
3899 | return NULL; | 3846 | return NULL; |
3900 | idhashval = lockownerid_hashval(current_ownerid); | 3847 | INIT_LIST_HEAD(&lo->lo_owner.so_stateids); |
3901 | INIT_LIST_HEAD(&sop->so_idhash); | 3848 | lo->lo_owner.so_is_open_owner = 0; |
3902 | INIT_LIST_HEAD(&sop->so_strhash); | ||
3903 | INIT_LIST_HEAD(&sop->so_perclient); | ||
3904 | INIT_LIST_HEAD(&sop->so_stateids); | ||
3905 | INIT_LIST_HEAD(&sop->so_perstateid); | ||
3906 | INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ | ||
3907 | sop->so_time = 0; | ||
3908 | list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); | ||
3909 | list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); | ||
3910 | list_add(&sop->so_perstateid, &open_stp->st_lockowners); | ||
3911 | sop->so_is_open_owner = 0; | ||
3912 | sop->so_id = current_ownerid++; | ||
3913 | sop->so_client = clp; | ||
3914 | /* It is the openowner seqid that will be incremented in encode in the | 3849 | /* It is the openowner seqid that will be incremented in encode in the |
3915 | * case of new lockowners; so increment the lock seqid manually: */ | 3850 | * case of new lockowners; so increment the lock seqid manually: */ |
3916 | sop->so_seqid = lock->lk_new_lock_seqid + 1; | 3851 | lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; |
3917 | sop->so_confirmed = 1; | 3852 | hash_lockowner(lo, strhashval, clp, open_stp); |
3918 | rp = &sop->so_replay; | 3853 | return lo; |
3919 | rp->rp_status = nfserr_serverfault; | ||
3920 | rp->rp_buflen = 0; | ||
3921 | rp->rp_buf = rp->rp_ibuf; | ||
3922 | return sop; | ||
3923 | } | 3854 | } |
3924 | 3855 | ||
3925 | static struct nfs4_stateid * | 3856 | static struct nfs4_ol_stateid * |
3926 | alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) | 3857 | alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) |
3927 | { | 3858 | { |
3928 | struct nfs4_stateid *stp; | 3859 | struct nfs4_ol_stateid *stp; |
3929 | unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); | 3860 | struct nfs4_client *clp = lo->lo_owner.so_client; |
3930 | 3861 | ||
3931 | stp = nfs4_alloc_stateid(); | 3862 | stp = nfs4_alloc_stateid(clp); |
3932 | if (stp == NULL) | 3863 | if (stp == NULL) |
3933 | goto out; | 3864 | return NULL; |
3934 | INIT_LIST_HEAD(&stp->st_hash); | 3865 | init_stid(&stp->st_stid, clp, NFS4_LOCK_STID); |
3935 | INIT_LIST_HEAD(&stp->st_perfile); | ||
3936 | INIT_LIST_HEAD(&stp->st_perstateowner); | ||
3937 | INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ | ||
3938 | list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); | ||
3939 | list_add(&stp->st_perfile, &fp->fi_stateids); | 3866 | list_add(&stp->st_perfile, &fp->fi_stateids); |
3940 | list_add(&stp->st_perstateowner, &sop->so_stateids); | 3867 | list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); |
3941 | stp->st_stateowner = sop; | 3868 | stp->st_stateowner = &lo->lo_owner; |
3942 | get_nfs4_file(fp); | 3869 | get_nfs4_file(fp); |
3943 | stp->st_file = fp; | 3870 | stp->st_file = fp; |
3944 | stp->st_stateid.si_boot = boot_time; | ||
3945 | stp->st_stateid.si_stateownerid = sop->so_id; | ||
3946 | stp->st_stateid.si_fileid = fp->fi_id; | ||
3947 | stp->st_stateid.si_generation = 0; | ||
3948 | stp->st_access_bmap = 0; | 3871 | stp->st_access_bmap = 0; |
3949 | stp->st_deny_bmap = open_stp->st_deny_bmap; | 3872 | stp->st_deny_bmap = open_stp->st_deny_bmap; |
3950 | stp->st_openstp = open_stp; | 3873 | stp->st_openstp = open_stp; |
3951 | |||
3952 | out: | ||
3953 | return stp; | 3874 | return stp; |
3954 | } | 3875 | } |
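alloc_init_lock_stateid() no longer hand-rolls the four stateid fields; init_stid(), added earlier in this patch, is assumed to stamp the common part from the client and hash the stid so find_stateid() can see it. A hedged sketch of its likely shape (the so_id allocation detail is a guess):

static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl,
		      unsigned char type)
{
	stid->sc_type = type;
	stid->sc_client = cl;
	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
	stid->sc_stateid.si_generation = 0;
	/* si_opaque.so_id presumably comes from a per-client counter;
	 * carrying so_clid is what makes STALE_STATEID()'s cl_boot
	 * check above work. */
}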
3955 | 3876 | ||
@@ -3960,7 +3881,7 @@ check_lock_length(u64 offset, u64 length) | |||
3960 | LOFF_OVERFLOW(offset, length))); | 3881 | LOFF_OVERFLOW(offset, length))); |
3961 | } | 3882 | } |
3962 | 3883 | ||
3963 | static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access) | 3884 | static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) |
3964 | { | 3885 | { |
3965 | struct nfs4_file *fp = lock_stp->st_file; | 3886 | struct nfs4_file *fp = lock_stp->st_file; |
3966 | int oflag = nfs4_access_to_omode(access); | 3887 | int oflag = nfs4_access_to_omode(access); |
@@ -3978,15 +3899,16 @@ __be32 | |||
3978 | nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 3899 | nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
3979 | struct nfsd4_lock *lock) | 3900 | struct nfsd4_lock *lock) |
3980 | { | 3901 | { |
3981 | struct nfs4_stateowner *open_sop = NULL; | 3902 | struct nfs4_openowner *open_sop = NULL; |
3982 | struct nfs4_stateowner *lock_sop = NULL; | 3903 | struct nfs4_lockowner *lock_sop = NULL; |
3983 | struct nfs4_stateid *lock_stp; | 3904 | struct nfs4_ol_stateid *lock_stp; |
3984 | struct nfs4_file *fp; | 3905 | struct nfs4_file *fp; |
3985 | struct file *filp = NULL; | 3906 | struct file *filp = NULL; |
3986 | struct file_lock file_lock; | 3907 | struct file_lock file_lock; |
3987 | struct file_lock conflock; | 3908 | struct file_lock conflock; |
3988 | __be32 status = 0; | 3909 | __be32 status = 0; |
3989 | unsigned int strhashval; | 3910 | unsigned int strhashval; |
3911 | int lkflg; | ||
3990 | int err; | 3912 | int err; |
3991 | 3913 | ||
3992 | dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", | 3914 | dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", |
@@ -4010,7 +3932,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4010 | * Use open owner and open stateid to create lock owner and | 3932 | * Use open owner and open stateid to create lock owner and |
4011 | * lock stateid. | 3933 | * lock stateid. |
4012 | */ | 3934 | */ |
4013 | struct nfs4_stateid *open_stp = NULL; | 3935 | struct nfs4_ol_stateid *open_stp = NULL; |
4014 | 3936 | ||
4015 | status = nfserr_stale_clientid; | 3937 | status = nfserr_stale_clientid; |
4016 | if (!nfsd4_has_session(cstate) && | 3938 | if (!nfsd4_has_session(cstate) && |
@@ -4018,26 +3940,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4018 | goto out; | 3940 | goto out; |
4019 | 3941 | ||
4020 | /* validate and update open stateid and open seqid */ | 3942 | /* validate and update open stateid and open seqid */ |
4021 | status = nfs4_preprocess_seqid_op(cstate, | 3943 | status = nfs4_preprocess_confirmed_seqid_op(cstate, |
4022 | lock->lk_new_open_seqid, | 3944 | lock->lk_new_open_seqid, |
4023 | &lock->lk_new_open_stateid, | 3945 | &lock->lk_new_open_stateid, |
4024 | OPEN_STATE, | 3946 | &open_stp); |
4025 | &lock->lk_replay_owner, &open_stp, | ||
4026 | lock); | ||
4027 | if (status) | 3947 | if (status) |
4028 | goto out; | 3948 | goto out; |
4029 | open_sop = lock->lk_replay_owner; | 3949 | open_sop = openowner(open_stp->st_stateowner); |
3950 | status = nfserr_bad_stateid; | ||
3951 | if (!nfsd4_has_session(cstate) && | ||
3952 | !same_clid(&open_sop->oo_owner.so_client->cl_clientid, | ||
3953 | &lock->v.new.clientid)) | ||
3954 | goto out; | ||
4030 | /* create lockowner and lock stateid */ | 3955 | /* create lockowner and lock stateid */ |
4031 | fp = open_stp->st_file; | 3956 | fp = open_stp->st_file; |
4032 | strhashval = lock_ownerstr_hashval(fp->fi_inode, | 3957 | strhashval = lock_ownerstr_hashval(fp->fi_inode, |
4033 | open_sop->so_client->cl_clientid.cl_id, | 3958 | open_sop->oo_owner.so_client->cl_clientid.cl_id, |
4034 | &lock->v.new.owner); | 3959 | &lock->v.new.owner); |
4035 | /* XXX: Do we need to check for duplicate stateowners on | 3960 | /* XXX: Do we need to check for duplicate stateowners on |
4036 | * the same file, or should they just be allowed (and | 3961 | * the same file, or should they just be allowed (and |
4037 | * create new stateids)? */ | 3962 | * create new stateids)? */ |
4038 | status = nfserr_resource; | 3963 | status = nfserr_jukebox; |
4039 | lock_sop = alloc_init_lock_stateowner(strhashval, | 3964 | lock_sop = alloc_init_lock_stateowner(strhashval, |
4040 | open_sop->so_client, open_stp, lock); | 3965 | open_sop->oo_owner.so_client, open_stp, lock); |
4041 | if (lock_sop == NULL) | 3966 | if (lock_sop == NULL) |
4042 | goto out; | 3967 | goto out; |
4043 | lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); | 3968 | lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp); |
@@ -4046,16 +3971,20 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4046 | } else { | 3971 | } else { |
4047 | /* lock (lock owner + lock stateid) already exists */ | 3972 | /* lock (lock owner + lock stateid) already exists */ |
4048 | status = nfs4_preprocess_seqid_op(cstate, | 3973 | status = nfs4_preprocess_seqid_op(cstate, |
4049 | lock->lk_old_lock_seqid, | 3974 | lock->lk_old_lock_seqid, |
4050 | &lock->lk_old_lock_stateid, | 3975 | &lock->lk_old_lock_stateid, |
4051 | LOCK_STATE, | 3976 | NFS4_LOCK_STID, &lock_stp); |
4052 | &lock->lk_replay_owner, &lock_stp, lock); | ||
4053 | if (status) | 3977 | if (status) |
4054 | goto out; | 3978 | goto out; |
4055 | lock_sop = lock->lk_replay_owner; | 3979 | lock_sop = lockowner(lock_stp->st_stateowner); |
4056 | fp = lock_stp->st_file; | 3980 | fp = lock_stp->st_file; |
4057 | } | 3981 | } |
4058 | /* lock->lk_replay_owner and lock_stp have been created or found */ | 3982 | /* lock_sop and lock_stp have been created or found */ |
3983 | |||
3984 | lkflg = setlkflg(lock->lk_type); | ||
3985 | status = nfs4_check_openmode(lock_stp, lkflg); | ||
3986 | if (status) | ||
3987 | goto out; | ||
4059 | 3988 | ||
4060 | status = nfserr_grace; | 3989 | status = nfserr_grace; |
4061 | if (locks_in_grace() && !lock->lk_reclaim) | 3990 | if (locks_in_grace() && !lock->lk_reclaim) |
@@ -4106,8 +4035,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4106 | err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); | 4035 | err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock); |
4107 | switch (-err) { | 4036 | switch (-err) { |
4108 | case 0: /* success! */ | 4037 | case 0: /* success! */ |
4109 | update_stateid(&lock_stp->st_stateid); | 4038 | update_stateid(&lock_stp->st_stid.sc_stateid); |
4110 | memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, | 4039 | memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid, |
4111 | sizeof(stateid_t)); | 4040 | sizeof(stateid_t)); |
4112 | status = 0; | 4041 | status = 0; |
4113 | break; | 4042 | break; |
@@ -4119,19 +4048,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4119 | case (EDEADLK): | 4048 | case (EDEADLK): |
4120 | status = nfserr_deadlock; | 4049 | status = nfserr_deadlock; |
4121 | break; | 4050 | break; |
4122 | default: | 4051 | default: |
4123 | dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); | 4052 | dprintk("NFSD: nfsd4_lock: vfs_lock_file() failed! status %d\n",err); |
4124 | status = nfserr_resource; | 4053 | status = nfserrno(err); |
4125 | break; | 4054 | break; |
4126 | } | 4055 | } |
4127 | out: | 4056 | out: |
4128 | if (status && lock->lk_is_new && lock_sop) | 4057 | if (status && lock->lk_is_new && lock_sop) |
4129 | release_lockowner(lock_sop); | 4058 | release_lockowner(lock_sop); |
4130 | if (lock->lk_replay_owner) { | 4059 | if (!cstate->replay_owner) |
4131 | nfs4_get_stateowner(lock->lk_replay_owner); | 4060 | nfs4_unlock_state(); |
4132 | cstate->replay_owner = lock->lk_replay_owner; | ||
4133 | } | ||
4134 | nfs4_unlock_state(); | ||
4135 | return status; | 4061 | return status; |
4136 | } | 4062 | } |
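The error plumbing changes in two small but client-visible ways: running out of memory now surfaces as the NFSv4 "try again later" error, and unexpected VFS failures are translated faithfully instead of being collapsed. In summary:

/*
 *   lockowner/stateid allocation failure -> nfserr_jukebox (retryable)
 *   vfs_lock_file() == -EDEADLK          -> nfserr_deadlock
 *   other vfs_lock_file() errors         -> nfserrno(err), replacing the
 *                                           old catch-all nfserr_resource
 */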
4137 | 4063 | ||
@@ -4163,6 +4089,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4163 | { | 4089 | { |
4164 | struct inode *inode; | 4090 | struct inode *inode; |
4165 | struct file_lock file_lock; | 4091 | struct file_lock file_lock; |
4092 | struct nfs4_lockowner *lo; | ||
4166 | int error; | 4093 | int error; |
4167 | __be32 status; | 4094 | __be32 status; |
4168 | 4095 | ||
@@ -4172,19 +4099,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4172 | if (check_lock_length(lockt->lt_offset, lockt->lt_length)) | 4099 | if (check_lock_length(lockt->lt_offset, lockt->lt_length)) |
4173 | return nfserr_inval; | 4100 | return nfserr_inval; |
4174 | 4101 | ||
4175 | lockt->lt_stateowner = NULL; | ||
4176 | nfs4_lock_state(); | 4102 | nfs4_lock_state(); |
4177 | 4103 | ||
4178 | status = nfserr_stale_clientid; | 4104 | status = nfserr_stale_clientid; |
4179 | if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) | 4105 | if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) |
4180 | goto out; | 4106 | goto out; |
4181 | 4107 | ||
4182 | if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { | 4108 | if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) |
4183 | dprintk("NFSD: nfsd4_lockt: fh_verify() failed!\n"); | ||
4184 | if (status == nfserr_symlink) | ||
4185 | status = nfserr_inval; | ||
4186 | goto out; | 4109 | goto out; |
4187 | } | ||
4188 | 4110 | ||
4189 | inode = cstate->current_fh.fh_dentry->d_inode; | 4111 | inode = cstate->current_fh.fh_dentry->d_inode; |
4190 | locks_init_lock(&file_lock); | 4112 | locks_init_lock(&file_lock); |
@@ -4203,10 +4125,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4203 | goto out; | 4125 | goto out; |
4204 | } | 4126 | } |
4205 | 4127 | ||
4206 | lockt->lt_stateowner = find_lockstateowner_str(inode, | 4128 | lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); |
4207 | &lockt->lt_clientid, &lockt->lt_owner); | 4129 | if (lo) |
4208 | if (lockt->lt_stateowner) | 4130 | file_lock.fl_owner = (fl_owner_t)lo; |
4209 | file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; | ||
4210 | file_lock.fl_pid = current->tgid; | 4131 | file_lock.fl_pid = current->tgid; |
4211 | file_lock.fl_flags = FL_POSIX; | 4132 | file_lock.fl_flags = FL_POSIX; |
4212 | 4133 | ||
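When no matching lockowner exists, LOCKT now simply tests with the NULL fl_owner left by locks_init_lock(), and the dropped fh_verify() special case lets symlink errors propagate unchanged rather than being rewritten to nfserr_inval. The test still reports conflicts correctly:

/* fl_owner semantics in LOCKT after this hunk:
 *   known lockowner -> its own locks are not reported as conflicts
 *   unknown owner   -> fl_owner == NULL, any overlapping lock conflicts
 */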
@@ -4234,7 +4155,7 @@ __be32 | |||
4234 | nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | 4155 | nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, |
4235 | struct nfsd4_locku *locku) | 4156 | struct nfsd4_locku *locku) |
4236 | { | 4157 | { |
4237 | struct nfs4_stateid *stp; | 4158 | struct nfs4_ol_stateid *stp; |
4238 | struct file *filp = NULL; | 4159 | struct file *filp = NULL; |
4239 | struct file_lock file_lock; | 4160 | struct file_lock file_lock; |
4240 | __be32 status; | 4161 | __be32 status; |
@@ -4249,13 +4170,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4249 | 4170 | ||
4250 | nfs4_lock_state(); | 4171 | nfs4_lock_state(); |
4251 | 4172 | ||
4252 | if ((status = nfs4_preprocess_seqid_op(cstate, | 4173 | status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, |
4253 | locku->lu_seqid, | 4174 | &locku->lu_stateid, NFS4_LOCK_STID, &stp); |
4254 | &locku->lu_stateid, | 4175 | if (status) |
4255 | LOCK_STATE, | ||
4256 | &locku->lu_stateowner, &stp, NULL))) | ||
4257 | goto out; | 4176 | goto out; |
4258 | |||
4259 | filp = find_any_file(stp->st_file); | 4177 | filp = find_any_file(stp->st_file); |
4260 | if (!filp) { | 4178 | if (!filp) { |
4261 | status = nfserr_lock_range; | 4179 | status = nfserr_lock_range; |
@@ -4264,7 +4182,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4264 | BUG_ON(!filp); | 4182 | BUG_ON(!filp); |
4265 | locks_init_lock(&file_lock); | 4183 | locks_init_lock(&file_lock); |
4266 | file_lock.fl_type = F_UNLCK; | 4184 | file_lock.fl_type = F_UNLCK; |
4267 | file_lock.fl_owner = (fl_owner_t) locku->lu_stateowner; | 4185 | file_lock.fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); |
4268 | file_lock.fl_pid = current->tgid; | 4186 | file_lock.fl_pid = current->tgid; |
4269 | file_lock.fl_file = filp; | 4187 | file_lock.fl_file = filp; |
4270 | file_lock.fl_flags = FL_POSIX; | 4188 | file_lock.fl_flags = FL_POSIX; |
@@ -4285,15 +4203,12 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
4285 | /* | 4203 | /* |
4286 | * OK, unlock succeeded; the only thing left to do is update the stateid. | 4204 | * OK, unlock succeeded; the only thing left to do is update the stateid. |
4287 | */ | 4205 | */ |
4288 | update_stateid(&stp->st_stateid); | 4206 | update_stateid(&stp->st_stid.sc_stateid); |
4289 | memcpy(&locku->lu_stateid, &stp->st_stateid, sizeof(stateid_t)); | 4207 | memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); |
4290 | 4208 | ||
4291 | out: | 4209 | out: |
4292 | if (locku->lu_stateowner) { | 4210 | if (!cstate->replay_owner) |
4293 | nfs4_get_stateowner(locku->lu_stateowner); | 4211 | nfs4_unlock_state(); |
4294 | cstate->replay_owner = locku->lu_stateowner; | ||
4295 | } | ||
4296 | nfs4_unlock_state(); | ||
4297 | return status; | 4212 | return status; |
4298 | 4213 | ||
4299 | out_nfserr: | 4214 | out_nfserr: |
@@ -4307,7 +4222,7 @@ out_nfserr: | |||
4307 | * 0: no locks held by lockowner | 4222 | * 0: no locks held by lockowner |
4308 | */ | 4223 | */ |
4309 | static int | 4224 | static int |
4310 | check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) | 4225 | check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) |
4311 | { | 4226 | { |
4312 | struct file_lock **flpp; | 4227 | struct file_lock **flpp; |
4313 | struct inode *inode = filp->fi_inode; | 4228 | struct inode *inode = filp->fi_inode; |
@@ -4332,7 +4247,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, | |||
4332 | { | 4247 | { |
4333 | clientid_t *clid = &rlockowner->rl_clientid; | 4248 | clientid_t *clid = &rlockowner->rl_clientid; |
4334 | struct nfs4_stateowner *sop; | 4249 | struct nfs4_stateowner *sop; |
4335 | struct nfs4_stateid *stp; | 4250 | struct nfs4_lockowner *lo; |
4251 | struct nfs4_ol_stateid *stp; | ||
4336 | struct xdr_netobj *owner = &rlockowner->rl_owner; | 4252 | struct xdr_netobj *owner = &rlockowner->rl_owner; |
4337 | struct list_head matches; | 4253 | struct list_head matches; |
4338 | int i; | 4254 | int i; |
@@ -4356,16 +4272,15 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, | |||
4356 | * data structures. */ | 4272 | * data structures. */ |
4357 | INIT_LIST_HEAD(&matches); | 4273 | INIT_LIST_HEAD(&matches); |
4358 | for (i = 0; i < LOCK_HASH_SIZE; i++) { | 4274 | for (i = 0; i < LOCK_HASH_SIZE; i++) { |
4359 | list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { | 4275 | list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { |
4360 | if (!same_owner_str(sop, owner, clid)) | 4276 | if (!same_owner_str(sop, owner, clid)) |
4361 | continue; | 4277 | continue; |
4362 | list_for_each_entry(stp, &sop->so_stateids, | 4278 | list_for_each_entry(stp, &sop->so_stateids, |
4363 | st_perstateowner) { | 4279 | st_perstateowner) { |
4364 | if (check_for_locks(stp->st_file, sop)) | 4280 | lo = lockowner(sop); |
4281 | if (check_for_locks(stp->st_file, lo)) | ||
4365 | goto out; | 4282 | goto out; |
4366 | /* Note: so_perclient unused for lockowners, | 4283 | list_add(&lo->lo_list, &matches); |
4367 | * so it's OK to fool with here. */ | ||
4368 | list_add(&sop->so_perclient, &matches); | ||
4369 | } | 4284 | } |
4370 | } | 4285 | } |
4371 | } | 4286 | } |
@@ -4374,12 +4289,12 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, | |||
4374 | * have been checked. */ | 4289 | * have been checked. */ |
4375 | status = nfs_ok; | 4290 | status = nfs_ok; |
4376 | while (!list_empty(&matches)) { | 4291 | while (!list_empty(&matches)) { |
4377 | sop = list_entry(matches.next, struct nfs4_stateowner, | 4292 | lo = list_entry(matches.next, struct nfs4_lockowner, |
4378 | so_perclient); | 4293 | lo_list); |
4379 | /* unhash_stateowner deletes so_perclient only | 4294 | /* unhash_stateowner deletes so_perclient only |
4380 | * for openowners. */ | 4295 | * for openowners. */ |
4381 | list_del(&sop->so_perclient); | 4296 | list_del(&lo->lo_list); |
4382 | release_lockowner(sop); | 4297 | release_lockowner(lo); |
4383 | } | 4298 | } |
4384 | out: | 4299 | out: |
4385 | nfs4_unlock_state(); | 4300 | nfs4_unlock_state(); |
@@ -4501,16 +4416,10 @@ nfs4_state_init(void) | |||
4501 | for (i = 0; i < FILE_HASH_SIZE; i++) { | 4416 | for (i = 0; i < FILE_HASH_SIZE; i++) { |
4502 | INIT_LIST_HEAD(&file_hashtbl[i]); | 4417 | INIT_LIST_HEAD(&file_hashtbl[i]); |
4503 | } | 4418 | } |
4504 | for (i = 0; i < OWNER_HASH_SIZE; i++) { | 4419 | for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) { |
4505 | INIT_LIST_HEAD(&ownerstr_hashtbl[i]); | 4420 | INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]); |
4506 | INIT_LIST_HEAD(&ownerid_hashtbl[i]); | ||
4507 | } | ||
4508 | for (i = 0; i < STATEID_HASH_SIZE; i++) { | ||
4509 | INIT_LIST_HEAD(&stateid_hashtbl[i]); | ||
4510 | INIT_LIST_HEAD(&lockstateid_hashtbl[i]); | ||
4511 | } | 4421 | } |
4512 | for (i = 0; i < LOCK_HASH_SIZE; i++) { | 4422 | for (i = 0; i < LOCK_HASH_SIZE; i++) { |
4513 | INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); | ||
4514 | INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); | 4423 | INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); |
4515 | } | 4424 | } |
4516 | memset(&onestateid, ~0, sizeof(stateid_t)); | 4425 | memset(&onestateid, ~0, sizeof(stateid_t)); |
@@ -4527,7 +4436,7 @@ nfsd4_load_reboot_recovery_data(void) | |||
4527 | int status; | 4436 | int status; |
4528 | 4437 | ||
4529 | nfs4_lock_state(); | 4438 | nfs4_lock_state(); |
4530 | nfsd4_init_recdir(user_recovery_dirname); | 4439 | nfsd4_init_recdir(); |
4531 | status = nfsd4_recdir_load(); | 4440 | status = nfsd4_recdir_load(); |
4532 | nfs4_unlock_state(); | 4441 | nfs4_unlock_state(); |
4533 | if (status) | 4442 | if (status) |
@@ -4636,40 +4545,3 @@ nfs4_state_shutdown(void) | |||
4636 | nfs4_unlock_state(); | 4545 | nfs4_unlock_state(); |
4637 | nfsd4_destroy_callback_queue(); | 4546 | nfsd4_destroy_callback_queue(); |
4638 | } | 4547 | } |
4639 | |||
4640 | /* | ||
4641 | * user_recovery_dirname is protected by the nfsd_mutex since it's only | ||
4642 | * accessed when nfsd is starting. | ||
4643 | */ | ||
4644 | static void | ||
4645 | nfs4_set_recdir(char *recdir) | ||
4646 | { | ||
4647 | strcpy(user_recovery_dirname, recdir); | ||
4648 | } | ||
4649 | |||
4650 | /* | ||
4651 | * Change the NFSv4 recovery directory to recdir. | ||
4652 | */ | ||
4653 | int | ||
4654 | nfs4_reset_recoverydir(char *recdir) | ||
4655 | { | ||
4656 | int status; | ||
4657 | struct path path; | ||
4658 | |||
4659 | status = kern_path(recdir, LOOKUP_FOLLOW, &path); | ||
4660 | if (status) | ||
4661 | return status; | ||
4662 | status = -ENOTDIR; | ||
4663 | if (S_ISDIR(path.dentry->d_inode->i_mode)) { | ||
4664 | nfs4_set_recdir(recdir); | ||
4665 | status = 0; | ||
4666 | } | ||
4667 | path_put(&path); | ||
4668 | return status; | ||
4669 | } | ||
4670 | |||
4671 | char * | ||
4672 | nfs4_recoverydir(void) | ||
4673 | { | ||
4674 | return user_recovery_dirname; | ||
4675 | } | ||
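
The nfs4state.c rework above splits the old multi-purpose nfs4_stateowner into typed nfs4_openowner and nfs4_lockowner wrappers (see the state.h hunks later in this diff); the common struct must sit first so a generic pointer can be downcast with container_of(). Below is a minimal userspace sketch of that embedding pattern; the names echo the kernel structs, but the program is illustrative, not kernel code.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct stateowner {                  /* common part, like nfs4_stateowner */
	int so_is_open_owner;        /* discriminator */
	unsigned int so_seqid;
};

struct lockowner {                   /* like nfs4_lockowner */
	struct stateowner lo_owner;  /* must be first field */
	int lo_extra;
};

static struct lockowner *to_lockowner(struct stateowner *so)
{
	return container_of(so, struct lockowner, lo_owner);
}

int main(void)
{
	struct lockowner lo = { .lo_owner = { 0, 7 }, .lo_extra = 42 };
	struct stateowner *so = &lo.lo_owner;  /* generic view */

	/* Because lo_owner is the first member, the round trip is exact. */
	printf("seqid=%u extra=%d\n", so->so_seqid, to_lockowner(so)->lo_extra);
	return 0;
}
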
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c8bf405d19de..66d095d7955e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -456,7 +456,6 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) | |||
456 | { | 456 | { |
457 | DECODE_HEAD; | 457 | DECODE_HEAD; |
458 | 458 | ||
459 | close->cl_stateowner = NULL; | ||
460 | READ_BUF(4); | 459 | READ_BUF(4); |
461 | READ32(close->cl_seqid); | 460 | READ32(close->cl_seqid); |
462 | return nfsd4_decode_stateid(argp, &close->cl_stateid); | 461 | return nfsd4_decode_stateid(argp, &close->cl_stateid); |
@@ -551,7 +550,6 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) | |||
551 | { | 550 | { |
552 | DECODE_HEAD; | 551 | DECODE_HEAD; |
553 | 552 | ||
554 | lock->lk_replay_owner = NULL; | ||
555 | /* | 553 | /* |
556 | * type, reclaim(boolean), offset, length, new_lock_owner(boolean) | 554 | * type, reclaim(boolean), offset, length, new_lock_owner(boolean) |
557 | */ | 555 | */ |
@@ -611,7 +609,6 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) | |||
611 | { | 609 | { |
612 | DECODE_HEAD; | 610 | DECODE_HEAD; |
613 | 611 | ||
614 | locku->lu_stateowner = NULL; | ||
615 | READ_BUF(8); | 612 | READ_BUF(8); |
616 | READ32(locku->lu_type); | 613 | READ32(locku->lu_type); |
617 | if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) | 614 | if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) |
@@ -642,6 +639,83 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup | |||
642 | DECODE_TAIL; | 639 | DECODE_TAIL; |
643 | } | 640 | } |
644 | 641 | ||
642 | static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *x) | ||
643 | { | ||
644 | __be32 *p; | ||
645 | u32 w; | ||
646 | |||
647 | READ_BUF(4); | ||
648 | READ32(w); | ||
649 | *x = w; | ||
650 | switch (w & NFS4_SHARE_ACCESS_MASK) { | ||
651 | case NFS4_SHARE_ACCESS_READ: | ||
652 | case NFS4_SHARE_ACCESS_WRITE: | ||
653 | case NFS4_SHARE_ACCESS_BOTH: | ||
654 | break; | ||
655 | default: | ||
656 | return nfserr_bad_xdr; | ||
657 | } | ||
658 | w &= ~NFS4_SHARE_ACCESS_MASK; | ||
659 | if (!w) | ||
660 | return nfs_ok; | ||
661 | if (!argp->minorversion) | ||
662 | return nfserr_bad_xdr; | ||
663 | switch (w & NFS4_SHARE_WANT_MASK) { | ||
664 | case NFS4_SHARE_WANT_NO_PREFERENCE: | ||
665 | case NFS4_SHARE_WANT_READ_DELEG: | ||
666 | case NFS4_SHARE_WANT_WRITE_DELEG: | ||
667 | case NFS4_SHARE_WANT_ANY_DELEG: | ||
668 | case NFS4_SHARE_WANT_NO_DELEG: | ||
669 | case NFS4_SHARE_WANT_CANCEL: | ||
670 | break; | ||
671 | default: | ||
672 | return nfserr_bad_xdr; | ||
673 | } | ||
674 | w &= ~NFS4_SHARE_WANT_MASK; | ||
675 | if (!w) | ||
676 | return nfs_ok; | ||
677 | switch (w) { | ||
678 | case NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL: | ||
679 | case NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED: | ||
680 | case (NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL | | ||
681 | NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): | ||
682 | return nfs_ok; | ||
683 | } | ||
684 | xdr_error: | ||
685 | return nfserr_bad_xdr; | ||
686 | } | ||
687 | |||
688 | static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) | ||
689 | { | ||
690 | __be32 *p; | ||
691 | |||
692 | READ_BUF(4); | ||
693 | READ32(*x); | ||
694 | /* Note: unlike access bits, deny bits may be zero. */ | ||
695 | if (*x & ~NFS4_SHARE_DENY_BOTH) | ||
696 | return nfserr_bad_xdr; | ||
697 | return nfs_ok; | ||
698 | xdr_error: | ||
699 | return nfserr_bad_xdr; | ||
700 | } | ||
701 | |||
702 | static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) | ||
703 | { | ||
704 | __be32 *p; | ||
705 | |||
706 | READ_BUF(4); | ||
707 | READ32(o->len); | ||
708 | |||
709 | if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) | ||
710 | return nfserr_bad_xdr; | ||
711 | |||
712 | READ_BUF(o->len); | ||
713 | SAVEMEM(o->data, o->len); | ||
714 | return nfs_ok; | ||
715 | xdr_error: | ||
716 | return nfserr_bad_xdr; | ||
717 | } | ||
718 | |||
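
The new share_access decoder validates in layers: the mandatory low access bits, then the v4.1-only "want" bits, then the two delegation-signalling flags, rejecting anything left over. Here is a hedged standalone sketch of the same masking discipline, with made-up bit values standing in for the real NFS4_SHARE_* constants; it also shows why the clearing step needs the bitwise ~ operator rather than logical negation.

#include <stdio.h>

/* Illustrative bit layout only; the real NFS4_SHARE_* values live in
 * include/linux/nfs4.h and differ from these. */
#define ACCESS_MASK  0x0003u  /* stands in for NFS4_SHARE_ACCESS_MASK */
#define WANT_MASK    0x0070u  /* stands in for NFS4_SHARE_WANT_MASK */
#define SIGNAL_BITS  0x0180u  /* the two WHEN_* delegation flags */

static int validate(unsigned int w, int minorversion)
{
	unsigned int access = w & ACCESS_MASK;

	if (access < 1 || access > 3)  /* READ=1, WRITE=2, BOTH=3 */
		return -1;
	w &= ~ACCESS_MASK;             /* bitwise ~; logical ! would zero w */
	if (!w)
		return 0;
	if (!minorversion)             /* extra bits are v4.1+ only */
		return -1;
	/* The kernel additionally whitelists the specific want values;
	 * this sketch just strips them. */
	w &= ~WANT_MASK;
	if (w & ~SIGNAL_BITS)          /* unknown bits remain: reject */
		return -1;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       validate(0x1, 0),          /* plain READ under v4.0: ok */
	       validate(0x1 | 0x10, 0),   /* want bit under v4.0: rejected */
	       validate(0x1 | 0x10, 1));  /* same word under v4.1: ok */
	return 0;
}
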
645 | static __be32 | 719 | static __be32 |
646 | nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) | 720 | nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) |
647 | { | 721 | { |
@@ -649,19 +723,23 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) | |||
649 | 723 | ||
650 | memset(open->op_bmval, 0, sizeof(open->op_bmval)); | 724 | memset(open->op_bmval, 0, sizeof(open->op_bmval)); |
651 | open->op_iattr.ia_valid = 0; | 725 | open->op_iattr.ia_valid = 0; |
652 | open->op_stateowner = NULL; | 726 | open->op_openowner = NULL; |
653 | 727 | ||
654 | /* seqid, share_access, share_deny, clientid, ownerlen */ | 728 | /* seqid, share_access, share_deny, clientid, ownerlen */ |
655 | READ_BUF(16 + sizeof(clientid_t)); | 729 | READ_BUF(4); |
656 | READ32(open->op_seqid); | 730 | READ32(open->op_seqid); |
657 | READ32(open->op_share_access); | 731 | status = nfsd4_decode_share_access(argp, &open->op_share_access); |
658 | READ32(open->op_share_deny); | 732 | if (status) |
733 | goto xdr_error; | ||
734 | status = nfsd4_decode_share_deny(argp, &open->op_share_deny); | ||
735 | if (status) | ||
736 | goto xdr_error; | ||
737 | READ_BUF(sizeof(clientid_t)); | ||
659 | COPYMEM(&open->op_clientid, sizeof(clientid_t)); | 738 | COPYMEM(&open->op_clientid, sizeof(clientid_t)); |
660 | READ32(open->op_owner.len); | 739 | status = nfsd4_decode_opaque(argp, &open->op_owner); |
661 | 740 | if (status) | |
662 | /* owner, open_flag */ | 741 | goto xdr_error; |
663 | READ_BUF(open->op_owner.len + 4); | 742 | READ_BUF(4); |
664 | SAVEMEM(open->op_owner.data, open->op_owner.len); | ||
665 | READ32(open->op_create); | 743 | READ32(open->op_create); |
666 | switch (open->op_create) { | 744 | switch (open->op_create) { |
667 | case NFS4_OPEN_NOCREATE: | 745 | case NFS4_OPEN_NOCREATE: |
@@ -727,6 +805,19 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) | |||
727 | if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) | 805 | if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) |
728 | return status; | 806 | return status; |
729 | break; | 807 | break; |
808 | case NFS4_OPEN_CLAIM_FH: | ||
809 | case NFS4_OPEN_CLAIM_DELEG_PREV_FH: | ||
810 | if (argp->minorversion < 1) | ||
811 | goto xdr_error; | ||
812 | /* void */ | ||
813 | break; | ||
814 | case NFS4_OPEN_CLAIM_DELEG_CUR_FH: | ||
815 | if (argp->minorversion < 1) | ||
816 | goto xdr_error; | ||
817 | status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); | ||
818 | if (status) | ||
819 | return status; | ||
820 | break; | ||
730 | default: | 821 | default: |
731 | goto xdr_error; | 822 | goto xdr_error; |
732 | } | 823 | } |
@@ -739,7 +830,6 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con | |||
739 | { | 830 | { |
740 | DECODE_HEAD; | 831 | DECODE_HEAD; |
741 | 832 | ||
742 | open_conf->oc_stateowner = NULL; | ||
743 | status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); | 833 | status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); |
744 | if (status) | 834 | if (status) |
745 | return status; | 835 | return status; |
@@ -754,15 +844,17 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d | |||
754 | { | 844 | { |
755 | DECODE_HEAD; | 845 | DECODE_HEAD; |
756 | 846 | ||
757 | open_down->od_stateowner = NULL; | ||
758 | status = nfsd4_decode_stateid(argp, &open_down->od_stateid); | 847 | status = nfsd4_decode_stateid(argp, &open_down->od_stateid); |
759 | if (status) | 848 | if (status) |
760 | return status; | 849 | return status; |
761 | READ_BUF(12); | 850 | READ_BUF(4); |
762 | READ32(open_down->od_seqid); | 851 | READ32(open_down->od_seqid); |
763 | READ32(open_down->od_share_access); | 852 | status = nfsd4_decode_share_access(argp, &open_down->od_share_access); |
764 | READ32(open_down->od_share_deny); | 853 | if (status) |
765 | 854 | return status; | |
855 | status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); | ||
856 | if (status) | ||
857 | return status; | ||
766 | DECODE_TAIL; | 858 | DECODE_TAIL; |
767 | } | 859 | } |
768 | 860 | ||
@@ -903,12 +995,13 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient | |||
903 | { | 995 | { |
904 | DECODE_HEAD; | 996 | DECODE_HEAD; |
905 | 997 | ||
906 | READ_BUF(12); | 998 | READ_BUF(8); |
907 | COPYMEM(setclientid->se_verf.data, 8); | 999 | COPYMEM(setclientid->se_verf.data, 8); |
908 | READ32(setclientid->se_namelen); | ||
909 | 1000 | ||
910 | READ_BUF(setclientid->se_namelen + 8); | 1001 | status = nfsd4_decode_opaque(argp, &setclientid->se_name); |
911 | SAVEMEM(setclientid->se_name, setclientid->se_namelen); | 1002 | if (status) |
1003 | return nfserr_bad_xdr; | ||
1004 | READ_BUF(8); | ||
912 | READ32(setclientid->se_callback_prog); | 1005 | READ32(setclientid->se_callback_prog); |
913 | READ32(setclientid->se_callback_netid_len); | 1006 | READ32(setclientid->se_callback_netid_len); |
914 | 1007 | ||
@@ -1051,11 +1144,9 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, | |||
1051 | READ_BUF(NFS4_VERIFIER_SIZE); | 1144 | READ_BUF(NFS4_VERIFIER_SIZE); |
1052 | COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); | 1145 | COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); |
1053 | 1146 | ||
1054 | READ_BUF(4); | 1147 | status = nfsd4_decode_opaque(argp, &exid->clname); |
1055 | READ32(exid->clname.len); | 1148 | if (status) |
1056 | 1149 | return nfserr_bad_xdr; | |
1057 | READ_BUF(exid->clname.len); | ||
1058 | SAVEMEM(exid->clname.data, exid->clname.len); | ||
1059 | 1150 | ||
1060 | READ_BUF(4); | 1151 | READ_BUF(4); |
1061 | READ32(exid->flags); | 1152 | READ32(exid->flags); |
@@ -1326,6 +1417,16 @@ xdr_error: | |||
1326 | goto out; | 1417 | goto out; |
1327 | } | 1418 | } |
1328 | 1419 | ||
1420 | static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) | ||
1421 | { | ||
1422 | DECODE_HEAD; | ||
1423 | |||
1424 | READ_BUF(8); | ||
1425 | COPYMEM(&dc->clientid, 8); | ||
1426 | |||
1427 | DECODE_TAIL; | ||
1428 | } | ||
1429 | |||
1329 | static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) | 1430 | static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) |
1330 | { | 1431 | { |
1331 | DECODE_HEAD; | 1432 | DECODE_HEAD; |
@@ -1447,7 +1548,7 @@ static nfsd4_dec nfsd41_dec_ops[] = { | |||
1447 | [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, | 1548 | [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, |
1448 | [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, | 1549 | [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, |
1449 | [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, | 1550 | [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, |
1450 | [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp, | 1551 | [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, |
1451 | [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, | 1552 | [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, |
1452 | }; | 1553 | }; |
1453 | 1554 | ||
@@ -1630,15 +1731,20 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) | |||
1630 | * we know whether the error to be returned is a sequence id mutating error. | 1731 | * we know whether the error to be returned is a sequence id mutating error. |
1631 | */ | 1732 | */ |
1632 | 1733 | ||
1633 | #define ENCODE_SEQID_OP_TAIL(stateowner) do { \ | 1734 | static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) |
1634 | if (seqid_mutating_err(nfserr) && stateowner) { \ | 1735 | { |
1635 | stateowner->so_seqid++; \ | 1736 | struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; |
1636 | stateowner->so_replay.rp_status = nfserr; \ | 1737 | |
1637 | stateowner->so_replay.rp_buflen = \ | 1738 | if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { |
1638 | (((char *)(resp)->p - (char *)save)); \ | 1739 | stateowner->so_seqid++; |
1639 | memcpy(stateowner->so_replay.rp_buf, save, \ | 1740 | stateowner->so_replay.rp_status = nfserr; |
1640 | stateowner->so_replay.rp_buflen); \ | 1741 | stateowner->so_replay.rp_buflen = |
1641 | } } while (0); | 1742 | (char *)resp->p - (char *)save; |
1743 | memcpy(stateowner->so_replay.rp_buf, save, | ||
1744 | stateowner->so_replay.rp_buflen); | ||
1745 | nfsd4_purge_closed_stateid(stateowner); | ||
1746 | } | ||
1747 | } | ||
1642 | 1748 | ||
1643 | /* Encode as an array of strings the string given with components | 1749 | /* Encode as an array of strings the string given with components |
1644 | * separated @sep. | 1750 | * separated @sep. |
@@ -1697,36 +1803,89 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, | |||
1697 | } | 1803 | } |
1698 | 1804 | ||
1699 | /* | 1805 | /* |
1700 | * Return the path to an export point in the pseudo filesystem namespace | 1806 | * Encode a path in RFC3530 'pathname4' format |
1701 | * Returned string is safe to use as long as the caller holds a reference | ||
1702 | * to @exp. | ||
1703 | */ | 1807 | */ |
1704 | static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) | 1808 | static __be32 nfsd4_encode_path(const struct path *root, |
1809 | const struct path *path, __be32 **pp, int *buflen) | ||
1705 | { | 1810 | { |
1706 | struct svc_fh tmp_fh; | 1811 | struct path cur = { |
1707 | char *path = NULL, *rootpath; | 1812 | .mnt = path->mnt, |
1708 | size_t rootlen; | 1813 | .dentry = path->dentry, |
1814 | }; | ||
1815 | __be32 *p = *pp; | ||
1816 | struct dentry **components = NULL; | ||
1817 | unsigned int ncomponents = 0; | ||
1818 | __be32 err = nfserr_jukebox; | ||
1709 | 1819 | ||
1710 | fh_init(&tmp_fh, NFS4_FHSIZE); | 1820 | dprintk("nfsd4_encode_path(");
1711 | *stat = exp_pseudoroot(rqstp, &tmp_fh); | ||
1712 | if (*stat) | ||
1713 | return NULL; | ||
1714 | rootpath = tmp_fh.fh_export->ex_pathname; | ||
1715 | 1821 | ||
1716 | path = exp->ex_pathname; | 1822 | path_get(&cur); |
1823 | /* First walk the path up to the nfsd root, and store the | ||
1824 | * dentries/path components in an array. | ||
1825 | */ | ||
1826 | for (;;) { | ||
1827 | if (cur.dentry == root->dentry && cur.mnt == root->mnt) | ||
1828 | break; | ||
1829 | if (cur.dentry == cur.mnt->mnt_root) { | ||
1830 | if (follow_up(&cur)) | ||
1831 | continue; | ||
1832 | goto out_free; | ||
1833 | } | ||
1834 | if ((ncomponents & 15) == 0) { | ||
1835 | struct dentry **new; | ||
1836 | new = krealloc(components, | ||
1837 | sizeof(*new) * (ncomponents + 16), | ||
1838 | GFP_KERNEL); | ||
1839 | if (!new) | ||
1840 | goto out_free; | ||
1841 | components = new; | ||
1842 | } | ||
1843 | components[ncomponents++] = cur.dentry; | ||
1844 | cur.dentry = dget_parent(cur.dentry); | ||
1845 | } | ||
1717 | 1846 | ||
1718 | rootlen = strlen(rootpath); | 1847 | *buflen -= 4; |
1719 | if (strncmp(path, rootpath, rootlen)) { | 1848 | if (*buflen < 0) |
1720 | dprintk("nfsd: fs_locations failed;" | 1849 | goto out_free; |
1721 | "%s is not contained in %s\n", path, rootpath); | 1850 | WRITE32(ncomponents); |
1722 | *stat = nfserr_notsupp; | 1851 | |
1723 | path = NULL; | 1852 | while (ncomponents) { |
1724 | goto out; | 1853 | struct dentry *dentry = components[ncomponents - 1]; |
1854 | unsigned int len = dentry->d_name.len; | ||
1855 | |||
1856 | *buflen -= 4 + (XDR_QUADLEN(len) << 2); | ||
1857 | if (*buflen < 0) | ||
1858 | goto out_free; | ||
1859 | WRITE32(len); | ||
1860 | WRITEMEM(dentry->d_name.name, len); | ||
1861 | dprintk("/%s", dentry->d_name.name); | ||
1862 | dput(dentry); | ||
1863 | ncomponents--; | ||
1725 | } | 1864 | } |
1726 | path += rootlen; | 1865 | |
1727 | out: | 1866 | *pp = p; |
1728 | fh_put(&tmp_fh); | 1867 | err = 0; |
1729 | return path; | 1868 | out_free: |
1869 | dprintk(")\n"); | ||
1870 | while (ncomponents) | ||
1871 | dput(components[--ncomponents]); | ||
1872 | kfree(components); | ||
1873 | path_put(&cur); | ||
1874 | return err; | ||
1875 | } | ||
1876 | |||
1877 | static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, | ||
1878 | const struct path *path, __be32 **pp, int *buflen) | ||
1879 | { | ||
1880 | struct svc_export *exp_ps; | ||
1881 | __be32 res; | ||
1882 | |||
1883 | exp_ps = rqst_find_fsidzero_export(rqstp); | ||
1884 | if (IS_ERR(exp_ps)) | ||
1885 | return nfserrno(PTR_ERR(exp_ps)); | ||
1886 | res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); | ||
1887 | exp_put(exp_ps); | ||
1888 | return res; | ||
1730 | } | 1889 | } |
1731 | 1890 | ||
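
nfsd4_encode_path() walks dget_parent() up to the export root, stashing the dentries, then emits them root-first in RFC 3530 'pathname4' form: a component count followed by length-prefixed, 4-byte-padded names. For /export/a/b relative to a root of /export, the wire form is count=2, "a", "b". A small hedged userspace sketch of that XDR layout (not the kernel encoder):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode {"a", "b"} as pathname4: count, then per-component
 * length + name bytes padded to a 4-byte boundary. */
int main(void)
{
	const char *comps[] = { "a", "b" };
	unsigned char buf[64] = {0};  /* zero-filled, so pad bytes stay 0 */
	size_t off = 0;
	uint32_t n = htonl(2);

	memcpy(buf + off, &n, 4); off += 4;           /* component count */
	for (int i = 0; i < 2; i++) {
		uint32_t len = strlen(comps[i]);
		uint32_t be = htonl(len);
		memcpy(buf + off, &be, 4); off += 4;  /* length */
		memcpy(buf + off, comps[i], len);     /* name bytes */
		off += (len + 3) & ~3u;               /* XDR pad to 4 */
	}
	printf("encoded %zu bytes\n", off);           /* 4 + 8 + 8 = 20 */
	return 0;
}
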
1732 | /* | 1891 | /* |
@@ -1740,11 +1899,8 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, | |||
1740 | int i; | 1899 | int i; |
1741 | __be32 *p = *pp; | 1900 | __be32 *p = *pp; |
1742 | struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; | 1901 | struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; |
1743 | char *root = nfsd4_path(rqstp, exp, &status); | ||
1744 | 1902 | ||
1745 | if (status) | 1903 | status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); |
1746 | return status; | ||
1747 | status = nfsd4_encode_components('/', root, &p, buflen); | ||
1748 | if (status) | 1904 | if (status) |
1749 | return status; | 1905 | return status; |
1750 | if ((*buflen -= 4) < 0) | 1906 | if ((*buflen -= 4) < 0) |
@@ -1760,12 +1916,19 @@ static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, | |||
1760 | return 0; | 1916 | return 0; |
1761 | } | 1917 | } |
1762 | 1918 | ||
1763 | static u32 nfs4_ftypes[16] = { | 1919 | static u32 nfs4_file_type(umode_t mode) |
1764 | NF4BAD, NF4FIFO, NF4CHR, NF4BAD, | 1920 | { |
1765 | NF4DIR, NF4BAD, NF4BLK, NF4BAD, | 1921 | switch (mode & S_IFMT) { |
1766 | NF4REG, NF4BAD, NF4LNK, NF4BAD, | 1922 | case S_IFIFO: return NF4FIFO; |
1767 | NF4SOCK, NF4BAD, NF4LNK, NF4BAD, | 1923 | case S_IFCHR: return NF4CHR; |
1768 | }; | 1924 | case S_IFDIR: return NF4DIR; |
1925 | case S_IFBLK: return NF4BLK; | ||
1926 | case S_IFLNK: return NF4LNK; | ||
1927 | case S_IFREG: return NF4REG; | ||
1928 | case S_IFSOCK: return NF4SOCK; | ||
1929 | default: return NF4BAD; | ||
1930 | } | ||
1931 | } | ||
1769 | 1932 | ||
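
The old nfs4_ftypes[] table was indexed by (stat.mode & S_IFMT) >> 12, leaning on the historical encoding of the S_IF* constants, and it even listed NF4LNK at index 14, which no valid mode produces. The switch keys on the constants directly. A quick check of the shift trick the table depended on, assuming the usual Linux S_IF* values:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* The S_IF* constants occupy the top nibble of a 16-bit mode,
	 * so (mode & S_IFMT) >> 12 yields a small table index. */
	printf("FIFO=%lu CHR=%lu DIR=%lu BLK=%lu REG=%lu LNK=%lu SOCK=%lu\n",
	       (unsigned long)(S_IFIFO >> 12), (unsigned long)(S_IFCHR >> 12),
	       (unsigned long)(S_IFDIR >> 12), (unsigned long)(S_IFBLK >> 12),
	       (unsigned long)(S_IFREG >> 12), (unsigned long)(S_IFLNK >> 12),
	       (unsigned long)(S_IFSOCK >> 12));
	/* Prints 1 2 4 6 8 10 12 on Linux, matching the old table slots;
	 * the switch removes the dependence on this encoding. */
	return 0;
}
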
1770 | static __be32 | 1933 | static __be32 |
1771 | nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, | 1934 | nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, |
@@ -1954,7 +2117,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, | |||
1954 | if (bmval0 & FATTR4_WORD0_TYPE) { | 2117 | if (bmval0 & FATTR4_WORD0_TYPE) { |
1955 | if ((buflen -= 4) < 0) | 2118 | if ((buflen -= 4) < 0) |
1956 | goto out_resource; | 2119 | goto out_resource; |
1957 | dummy = nfs4_ftypes[(stat.mode & S_IFMT) >> 12]; | 2120 | dummy = nfs4_file_type(stat.mode); |
1958 | if (dummy == NF4BAD) | 2121 | if (dummy == NF4BAD) |
1959 | goto out_serverfault; | 2122 | goto out_serverfault; |
1960 | WRITE32(dummy); | 2123 | WRITE32(dummy); |
@@ -2488,7 +2651,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c | |||
2488 | if (!nfserr) | 2651 | if (!nfserr) |
2489 | nfsd4_encode_stateid(resp, &close->cl_stateid); | 2652 | nfsd4_encode_stateid(resp, &close->cl_stateid); |
2490 | 2653 | ||
2491 | ENCODE_SEQID_OP_TAIL(close->cl_stateowner); | 2654 | encode_seqid_op_tail(resp, save, nfserr); |
2492 | return nfserr; | 2655 | return nfserr; |
2493 | } | 2656 | } |
2494 | 2657 | ||
@@ -2564,17 +2727,18 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh | |||
2564 | static void | 2727 | static void |
2565 | nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) | 2728 | nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) |
2566 | { | 2729 | { |
2730 | struct xdr_netobj *conf = &ld->ld_owner; | ||
2567 | __be32 *p; | 2731 | __be32 *p; |
2568 | 2732 | ||
2569 | RESERVE_SPACE(32 + XDR_LEN(ld->ld_sop ? ld->ld_sop->so_owner.len : 0)); | 2733 | RESERVE_SPACE(32 + XDR_LEN(conf->len)); |
2570 | WRITE64(ld->ld_start); | 2734 | WRITE64(ld->ld_start); |
2571 | WRITE64(ld->ld_length); | 2735 | WRITE64(ld->ld_length); |
2572 | WRITE32(ld->ld_type); | 2736 | WRITE32(ld->ld_type); |
2573 | if (ld->ld_sop) { | 2737 | if (conf->len) { |
2574 | WRITEMEM(&ld->ld_clientid, 8); | 2738 | WRITEMEM(&ld->ld_clientid, 8); |
2575 | WRITE32(ld->ld_sop->so_owner.len); | 2739 | WRITE32(conf->len); |
2576 | WRITEMEM(ld->ld_sop->so_owner.data, ld->ld_sop->so_owner.len); | 2740 | WRITEMEM(conf->data, conf->len); |
2577 | kref_put(&ld->ld_sop->so_ref, nfs4_free_stateowner); | 2741 | kfree(conf->data); |
2578 | } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ | 2742 | } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ |
2579 | WRITE64((u64)0); /* clientid */ | 2743 | WRITE64((u64)0); /* clientid */ |
2580 | WRITE32(0); /* length of owner name */ | 2744 | WRITE32(0); /* length of owner name */ |
@@ -2592,7 +2756,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo | |||
2592 | else if (nfserr == nfserr_denied) | 2756 | else if (nfserr == nfserr_denied) |
2593 | nfsd4_encode_lock_denied(resp, &lock->lk_denied); | 2757 | nfsd4_encode_lock_denied(resp, &lock->lk_denied); |
2594 | 2758 | ||
2595 | ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); | 2759 | encode_seqid_op_tail(resp, save, nfserr); |
2596 | return nfserr; | 2760 | return nfserr; |
2597 | } | 2761 | } |
2598 | 2762 | ||
@@ -2612,7 +2776,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l | |||
2612 | if (!nfserr) | 2776 | if (!nfserr) |
2613 | nfsd4_encode_stateid(resp, &locku->lu_stateid); | 2777 | nfsd4_encode_stateid(resp, &locku->lu_stateid); |
2614 | 2778 | ||
2615 | ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); | 2779 | encode_seqid_op_tail(resp, save, nfserr); |
2616 | return nfserr; | 2780 | return nfserr; |
2617 | } | 2781 | } |
2618 | 2782 | ||
@@ -2693,7 +2857,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op | |||
2693 | } | 2857 | } |
2694 | /* XXX save filehandle here */ | 2858 | /* XXX save filehandle here */ |
2695 | out: | 2859 | out: |
2696 | ENCODE_SEQID_OP_TAIL(open->op_stateowner); | 2860 | encode_seqid_op_tail(resp, save, nfserr); |
2697 | return nfserr; | 2861 | return nfserr; |
2698 | } | 2862 | } |
2699 | 2863 | ||
@@ -2705,7 +2869,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct | |||
2705 | if (!nfserr) | 2869 | if (!nfserr) |
2706 | nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); | 2870 | nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); |
2707 | 2871 | ||
2708 | ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); | 2872 | encode_seqid_op_tail(resp, save, nfserr); |
2709 | return nfserr; | 2873 | return nfserr; |
2710 | } | 2874 | } |
2711 | 2875 | ||
@@ -2717,7 +2881,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc | |||
2717 | if (!nfserr) | 2881 | if (!nfserr) |
2718 | nfsd4_encode_stateid(resp, &od->od_stateid); | 2882 | nfsd4_encode_stateid(resp, &od->od_stateid); |
2719 | 2883 | ||
2720 | ENCODE_SEQID_OP_TAIL(od->od_stateowner); | 2884 | encode_seqid_op_tail(resp, save, nfserr); |
2721 | return nfserr; | 2885 | return nfserr; |
2722 | } | 2886 | } |
2723 | 2887 | ||
@@ -2759,8 +2923,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, | |||
2759 | read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, | 2923 | read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, |
2760 | &maxcount); | 2924 | &maxcount); |
2761 | 2925 | ||
2762 | if (nfserr == nfserr_symlink) | ||
2763 | nfserr = nfserr_inval; | ||
2764 | if (nfserr) | 2926 | if (nfserr) |
2765 | return nfserr; | 2927 | return nfserr; |
2766 | eof = (read->rd_offset + maxcount >= | 2928 | eof = (read->rd_offset + maxcount >= |
@@ -2886,8 +3048,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 | |||
2886 | readdir->common.err == nfserr_toosmall && | 3048 | readdir->common.err == nfserr_toosmall && |
2887 | readdir->buffer == page) | 3049 | readdir->buffer == page) |
2888 | nfserr = nfserr_toosmall; | 3050 | nfserr = nfserr_toosmall; |
2889 | if (nfserr == nfserr_symlink) | ||
2890 | nfserr = nfserr_notdir; | ||
2891 | if (nfserr) | 3051 | if (nfserr) |
2892 | goto err_no_verf; | 3052 | goto err_no_verf; |
2893 | 3053 | ||
@@ -3218,9 +3378,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, | |||
3218 | WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); | 3378 | WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); |
3219 | WRITE32(seq->seqid); | 3379 | WRITE32(seq->seqid); |
3220 | WRITE32(seq->slotid); | 3380 | WRITE32(seq->slotid); |
3221 | WRITE32(seq->maxslots); | 3381 | /* Note slotids are numbered from zero: */ |
3222 | /* For now: target_maxslots = maxslots */ | 3382 | WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ |
3223 | WRITE32(seq->maxslots); | 3383 | WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ |
3224 | WRITE32(seq->status_flags); | 3384 | WRITE32(seq->status_flags); |
3225 | 3385 | ||
3226 | ADJUST_ARGS(); | 3386 | ADJUST_ARGS(); |
@@ -3233,6 +3393,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, | |||
3233 | struct nfsd4_test_stateid *test_stateid) | 3393 | struct nfsd4_test_stateid *test_stateid) |
3234 | { | 3394 | { |
3235 | struct nfsd4_compoundargs *argp; | 3395 | struct nfsd4_compoundargs *argp; |
3396 | struct nfs4_client *cl = resp->cstate.session->se_client; | ||
3236 | stateid_t si; | 3397 | stateid_t si; |
3237 | __be32 *p; | 3398 | __be32 *p; |
3238 | int i; | 3399 | int i; |
@@ -3248,7 +3409,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, | |||
3248 | nfs4_lock_state(); | 3409 | nfs4_lock_state(); |
3249 | for (i = 0; i < test_stateid->ts_num_ids; i++) { | 3410 | for (i = 0; i < test_stateid->ts_num_ids; i++) { |
3250 | nfsd4_decode_stateid(argp, &si); | 3411 | nfsd4_decode_stateid(argp, &si); |
3251 | valid = nfs4_validate_stateid(&si, test_stateid->ts_has_session); | 3412 | valid = nfs4_validate_stateid(cl, &si); |
3252 | RESERVE_SPACE(4); | 3413 | RESERVE_SPACE(4); |
3253 | *p++ = htonl(valid); | 3414 | *p++ = htonl(valid); |
3254 | resp->p = p; | 3415 | resp->p = p; |
@@ -3334,34 +3495,29 @@ static nfsd4_enc nfsd4_enc_ops[] = { | |||
3334 | 3495 | ||
3335 | /* | 3496 | /* |
3336 | * Calculate the total amount of memory that the compound response has taken | 3497 | * Calculate the total amount of memory that the compound response has taken |
3337 | * after encoding the current operation. | 3498 | * after encoding the current operation, plus pad. |
3338 | * | 3499 | * |
3339 | * pad: add on 8 bytes for the next operation's op_code and status so that | 3500 | * pad: if the operation is non-idempotent, pad is calculated by op_rsize_bop() |
3340 | * there is room to cache a failure on the next operation. | 3501 | * as specified in nfsd4_operation; otherwise pad is zero. |
3341 | * | 3502 | * |
3342 | * Compare this length to the session se_fmaxresp_cached. | 3503 | * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. |
3343 | * | 3504 | * |
3344 | * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so | 3505 | * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so |
3345 | * will be at least a page and will therefore hold the xdr_buf head. | 3506 | * will be at least a page and will therefore hold the xdr_buf head. |
3346 | */ | 3507 | */ |
3347 | static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) | 3508 | int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) |
3348 | { | 3509 | { |
3349 | int status = 0; | ||
3350 | struct xdr_buf *xb = &resp->rqstp->rq_res; | 3510 | struct xdr_buf *xb = &resp->rqstp->rq_res; |
3351 | struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; | ||
3352 | struct nfsd4_session *session = NULL; | 3511 | struct nfsd4_session *session = NULL; |
3353 | struct nfsd4_slot *slot = resp->cstate.slot; | 3512 | struct nfsd4_slot *slot = resp->cstate.slot; |
3354 | u32 length, tlen = 0, pad = 8; | 3513 | u32 length, tlen = 0; |
3355 | 3514 | ||
3356 | if (!nfsd4_has_session(&resp->cstate)) | 3515 | if (!nfsd4_has_session(&resp->cstate)) |
3357 | return status; | 3516 | return 0; |
3358 | 3517 | ||
3359 | session = resp->cstate.session; | 3518 | session = resp->cstate.session; |
3360 | if (session == NULL || slot->sl_cachethis == 0) | 3519 | if (session == NULL) |
3361 | return status; | 3520 | return 0; |
3362 | |||
3363 | if (resp->opcnt >= args->opcnt) | ||
3364 | pad = 0; /* this is the last operation */ | ||
3365 | 3521 | ||
3366 | if (xb->page_len == 0) { | 3522 | if (xb->page_len == 0) { |
3367 | length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; | 3523 | length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; |
@@ -3374,10 +3530,14 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) | |||
3374 | dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, | 3530 | dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, |
3375 | length, xb->page_len, tlen, pad); | 3531 | length, xb->page_len, tlen, pad); |
3376 | 3532 | ||
3377 | if (length <= session->se_fchannel.maxresp_cached) | 3533 | if (length > session->se_fchannel.maxresp_sz) |
3378 | return status; | 3534 | return nfserr_rep_too_big; |
3379 | else | 3535 | |
3536 | if (slot->sl_cachethis == 1 && | ||
3537 | length > session->se_fchannel.maxresp_cached) | ||
3380 | return nfserr_rep_too_big_to_cache; | 3538 | return nfserr_rep_too_big_to_cache; |
3539 | |||
3540 | return 0; | ||
3381 | } | 3541 | } |
3382 | 3542 | ||
3383 | void | 3543 | void |
@@ -3397,8 +3557,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) | |||
3397 | !nfsd4_enc_ops[op->opnum]); | 3557 | !nfsd4_enc_ops[op->opnum]); |
3398 | op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); | 3558 | op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); |
3399 | /* nfsd4_check_drc_limit guarantees enough room for error status */ | 3559 | /* nfsd4_check_drc_limit guarantees enough room for error status */ |
3400 | if (!op->status && nfsd4_check_drc_limit(resp)) | 3560 | if (!op->status) |
3401 | op->status = nfserr_rep_too_big_to_cache; | 3561 | op->status = nfsd4_check_resp_size(resp, 0); |
3402 | status: | 3562 | status: |
3403 | /* | 3563 | /* |
3404 | * Note: We write the status directly, instead of using WRITE32(), | 3564 | * Note: We write the status directly, instead of using WRITE32(), |
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c7716143cbd1..db34a585e112 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/ctype.h> | 9 | #include <linux/ctype.h> |
10 | 10 | ||
11 | #include <linux/sunrpc/svcsock.h> | 11 | #include <linux/sunrpc/svcsock.h> |
12 | #include <linux/nfsd/syscall.h> | ||
13 | #include <linux/lockd/lockd.h> | 12 | #include <linux/lockd/lockd.h> |
14 | #include <linux/sunrpc/clnt.h> | 13 | #include <linux/sunrpc/clnt.h> |
15 | #include <linux/sunrpc/gss_api.h> | 14 | #include <linux/sunrpc/gss_api.h> |
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 7ecfa2420307..58134a23fdfb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -11,13 +11,39 @@ | |||
11 | #include <linux/types.h> | 11 | #include <linux/types.h> |
12 | #include <linux/mount.h> | 12 | #include <linux/mount.h> |
13 | 13 | ||
14 | #include <linux/nfs.h> | ||
15 | #include <linux/nfs2.h> | ||
16 | #include <linux/nfs3.h> | ||
17 | #include <linux/nfs4.h> | ||
18 | #include <linux/sunrpc/msg_prot.h> | ||
19 | |||
14 | #include <linux/nfsd/debug.h> | 20 | #include <linux/nfsd/debug.h> |
15 | #include <linux/nfsd/export.h> | 21 | #include <linux/nfsd/export.h> |
16 | #include <linux/nfsd/stats.h> | 22 | #include <linux/nfsd/stats.h> |
23 | |||
17 | /* | 24 | /* |
18 | * nfsd version | 25 | * nfsd version |
19 | */ | 26 | */ |
20 | #define NFSD_SUPPORTED_MINOR_VERSION 1 | 27 | #define NFSD_SUPPORTED_MINOR_VERSION 1 |
28 | /* | ||
29 | * Maximum blocksizes supported by daemon under various circumstances. | ||
30 | */ | ||
31 | #define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD | ||
32 | /* NFSv2 is limited by the protocol specification, see RFC 1094 */ | ||
33 | #define NFSSVC_MAXBLKSIZE_V2 (8*1024) | ||
34 | |||
35 | |||
36 | /* | ||
37 | * Largest number of bytes we need to allocate for an NFS | ||
38 | * call or reply. Used to control buffer sizes. We use | ||
39 | * the length of v3 WRITE, READ and READDIR replies | ||
40 | * which are an RPC header, up to 26 XDR units of reply | ||
41 | * data, and some page data. | ||
42 | * | ||
43 | * Note that accuracy here doesn't matter too much as the | ||
44 | * size is rounded up to a page size when allocating space. | ||
45 | */ | ||
46 | #define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) | ||
21 | 47 | ||
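
NFSD_BUFSIZE is a header allowance plus the maximum payload, and accuracy only matters up to page rounding, as the comment above notes. A rough worked example, with XDR_UNIT taken as 4 and RPC_MAX_HEADER_WITH_AUTH given an assumed placeholder value since the real constant comes from the sunrpc headers:

#include <stdio.h>

int main(void)
{
	/* Assumed stand-ins; the real constants come from the sunrpc
	 * headers and may differ. */
	const unsigned XDR_UNIT = 4;
	const unsigned RPC_MAX_HEADER_WITH_AUTH = 103;   /* assumption */
	const unsigned NFSSVC_MAXBLKSIZE = 1024 * 1024;  /* RPCSVC_MAXPAYLOAD */

	unsigned bufsize = (RPC_MAX_HEADER_WITH_AUTH + 26) * XDR_UNIT
			   + NFSSVC_MAXBLKSIZE;
	/* Rounded up to pages when actually allocated, so the header
	 * term only needs to be an upper bound. */
	printf("NFSD_BUFSIZE ~= %u bytes\n", bufsize);
	return 0;
}
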
22 | struct readdir_cd { | 48 | struct readdir_cd { |
23 | __be32 err; /* 0, nfserr, or nfserr_eof */ | 49 | __be32 err; /* 0, nfserr, or nfserr_eof */ |
@@ -335,6 +361,13 @@ static inline u32 nfsd_suppattrs2(u32 minorversion) | |||
335 | #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ | 361 | #define NFSD_SUPPATTR_EXCLCREAT_WORD2 \ |
336 | NFSD_WRITEABLE_ATTRS_WORD2 | 362 | NFSD_WRITEABLE_ATTRS_WORD2 |
337 | 363 | ||
364 | extern int nfsd4_is_junction(struct dentry *dentry); | ||
365 | #else | ||
366 | static inline int nfsd4_is_junction(struct dentry *dentry) | ||
367 | { | ||
368 | return 0; | ||
369 | } | ||
370 | |||
338 | #endif /* CONFIG_NFSD_V4 */ | 371 | #endif /* CONFIG_NFSD_V4 */ |
339 | 372 | ||
340 | #endif /* LINUX_NFSD_NFSD_H */ | 373 | #endif /* LINUX_NFSD_NFSD_H */ |
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 90c6aa6d5e0f..c763de5c1157 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,28 +59,25 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) | |||
59 | * the write call). | 59 | * the write call). |
60 | */ | 60 | */ |
61 | static inline __be32 | 61 | static inline __be32 |
62 | nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type) | 62 | nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested) |
63 | { | 63 | { |
64 | /* Type can be negative when creating hardlinks - not to a dir */ | 64 | mode &= S_IFMT; |
65 | if (type > 0 && (mode & S_IFMT) != type) { | 65 | |
66 | if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) | 66 | if (requested == 0) /* the caller doesn't care */ |
67 | return nfserr_symlink; | 67 | return nfs_ok; |
68 | else if (type == S_IFDIR) | 68 | if (mode == requested) |
69 | return nfserr_notdir; | 69 | return nfs_ok; |
70 | else if ((mode & S_IFMT) == S_IFDIR) | 70 | /* |
71 | return nfserr_isdir; | 71 | * v4 has an error more specific than err_notdir which we should |
72 | else | 72 | * return in preference to err_notdir: |
73 | return nfserr_inval; | 73 | */ |
74 | } | 74 | if (rqstp->rq_vers == 4 && mode == S_IFLNK) |
75 | if (type < 0 && (mode & S_IFMT) == -type) { | 75 | return nfserr_symlink; |
76 | if (rqstp->rq_vers == 4 && (mode & S_IFMT) == S_IFLNK) | 76 | if (requested == S_IFDIR) |
77 | return nfserr_symlink; | 77 | return nfserr_notdir; |
78 | else if (type == -S_IFDIR) | 78 | if (mode == S_IFDIR) |
79 | return nfserr_isdir; | 79 | return nfserr_isdir; |
80 | else | 80 | return nfserr_inval; |
81 | return nfserr_notdir; | ||
82 | } | ||
83 | return 0; | ||
84 | } | 81 | } |
85 | 82 | ||
86 | static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, | 83 | static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, |
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4eefaf1b42e8..a3cf38476a1b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -35,6 +35,7 @@ | |||
35 | #ifndef _NFSD4_STATE_H | 35 | #ifndef _NFSD4_STATE_H |
36 | #define _NFSD4_STATE_H | 36 | #define _NFSD4_STATE_H |
37 | 37 | ||
38 | #include <linux/idr.h> | ||
38 | #include <linux/sunrpc/svc_xprt.h> | 39 | #include <linux/sunrpc/svc_xprt.h> |
39 | #include <linux/nfsd/nfsfh.h> | 40 | #include <linux/nfsd/nfsfh.h> |
40 | #include "nfsfh.h" | 41 | #include "nfsfh.h" |
@@ -45,24 +46,20 @@ typedef struct { | |||
45 | } clientid_t; | 46 | } clientid_t; |
46 | 47 | ||
47 | typedef struct { | 48 | typedef struct { |
48 | u32 so_boot; | 49 | clientid_t so_clid; |
49 | u32 so_stateownerid; | 50 | u32 so_id; |
50 | u32 so_fileid; | ||
51 | } stateid_opaque_t; | 51 | } stateid_opaque_t; |
52 | 52 | ||
53 | typedef struct { | 53 | typedef struct { |
54 | u32 si_generation; | 54 | u32 si_generation; |
55 | stateid_opaque_t si_opaque; | 55 | stateid_opaque_t si_opaque; |
56 | } stateid_t; | 56 | } stateid_t; |
57 | #define si_boot si_opaque.so_boot | ||
58 | #define si_stateownerid si_opaque.so_stateownerid | ||
59 | #define si_fileid si_opaque.so_fileid | ||
60 | 57 | ||
61 | #define STATEID_FMT "(%08x/%08x/%08x/%08x)" | 58 | #define STATEID_FMT "(%08x/%08x/%08x/%08x)" |
62 | #define STATEID_VAL(s) \ | 59 | #define STATEID_VAL(s) \ |
63 | (s)->si_boot, \ | 60 | (s)->si_opaque.so_clid.cl_boot, \ |
64 | (s)->si_stateownerid, \ | 61 | (s)->si_opaque.so_clid.cl_id, \ |
65 | (s)->si_fileid, \ | 62 | (s)->si_opaque.so_id, \ |
66 | (s)->si_generation | 63 | (s)->si_generation |
67 | 64 | ||
68 | struct nfsd4_callback { | 65 | struct nfsd4_callback { |
@@ -76,17 +73,27 @@ struct nfsd4_callback { | |||
76 | bool cb_done; | 73 | bool cb_done; |
77 | }; | 74 | }; |
78 | 75 | ||
76 | struct nfs4_stid { | ||
77 | #define NFS4_OPEN_STID 1 | ||
78 | #define NFS4_LOCK_STID 2 | ||
79 | #define NFS4_DELEG_STID 4 | ||
80 | /* For an open stateid kept around *only* to process close replays: */ | ||
81 | #define NFS4_CLOSED_STID 8 | ||
82 | unsigned char sc_type; | ||
83 | stateid_t sc_stateid; | ||
84 | struct nfs4_client *sc_client; | ||
85 | }; | ||
86 | |||
79 | struct nfs4_delegation { | 87 | struct nfs4_delegation { |
88 | struct nfs4_stid dl_stid; /* must be first field */ | ||
80 | struct list_head dl_perfile; | 89 | struct list_head dl_perfile; |
81 | struct list_head dl_perclnt; | 90 | struct list_head dl_perclnt; |
82 | struct list_head dl_recall_lru; /* delegation recalled */ | 91 | struct list_head dl_recall_lru; /* delegation recalled */ |
83 | atomic_t dl_count; /* ref count */ | 92 | atomic_t dl_count; /* ref count */ |
84 | struct nfs4_client *dl_client; | ||
85 | struct nfs4_file *dl_file; | 93 | struct nfs4_file *dl_file; |
86 | u32 dl_type; | 94 | u32 dl_type; |
87 | time_t dl_time; | 95 | time_t dl_time; |
88 | /* For recall: */ | 96 | /* For recall: */ |
89 | stateid_t dl_stateid; | ||
90 | struct knfsd_fh dl_fh; | 97 | struct knfsd_fh dl_fh; |
91 | int dl_retries; | 98 | int dl_retries; |
92 | struct nfsd4_callback dl_recall; | 99 | struct nfsd4_callback dl_recall; |
@@ -104,6 +111,11 @@ struct nfs4_cb_conn { | |||
104 | struct svc_xprt *cb_xprt; /* minorversion 1 only */ | 111 | struct svc_xprt *cb_xprt; /* minorversion 1 only */ |
105 | }; | 112 | }; |
106 | 113 | ||
114 | static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) | ||
115 | { | ||
116 | return container_of(s, struct nfs4_delegation, dl_stid); | ||
117 | } | ||
118 | |||
107 | /* Maximum number of slots per session. 160 is useful for long haul TCP */ | 119 | /* Maximum number of slots per session. 160 is useful for long haul TCP */ |
108 | #define NFSD_MAX_SLOTS_PER_SESSION 160 | 120 | #define NFSD_MAX_SLOTS_PER_SESSION 160 |
109 | /* Maximum number of operations per session compound */ | 121 | /* Maximum number of operations per session compound */ |
@@ -220,6 +232,7 @@ struct nfs4_client { | |||
220 | struct list_head cl_idhash; /* hash by cl_clientid.id */ | 232 | struct list_head cl_idhash; /* hash by cl_clientid.id */ |
221 | struct list_head cl_strhash; /* hash by cl_name */ | 233 | struct list_head cl_strhash; /* hash by cl_name */ |
222 | struct list_head cl_openowners; | 234 | struct list_head cl_openowners; |
235 | struct idr cl_stateids; /* stateid lookup */ | ||
223 | struct list_head cl_delegations; | 236 | struct list_head cl_delegations; |
224 | struct list_head cl_lru; /* tail queue */ | 237 | struct list_head cl_lru; /* tail queue */ |
225 | struct xdr_netobj cl_name; /* id generated by client */ | 238 | struct xdr_netobj cl_name; /* id generated by client */ |
@@ -245,6 +258,7 @@ struct nfs4_client { | |||
245 | #define NFSD4_CB_UP 0 | 258 | #define NFSD4_CB_UP 0 |
246 | #define NFSD4_CB_UNKNOWN 1 | 259 | #define NFSD4_CB_UNKNOWN 1 |
247 | #define NFSD4_CB_DOWN 2 | 260 | #define NFSD4_CB_DOWN 2 |
261 | #define NFSD4_CB_FAULT 3 | ||
248 | int cl_cb_state; | 262 | int cl_cb_state; |
249 | struct nfsd4_callback cl_cb_null; | 263 | struct nfsd4_callback cl_cb_null; |
250 | struct nfsd4_session *cl_cb_session; | 264 | struct nfsd4_session *cl_cb_session; |
@@ -293,6 +307,9 @@ static inline void | |||
293 | update_stateid(stateid_t *stateid) | 307 | update_stateid(stateid_t *stateid) |
294 | { | 308 | { |
295 | stateid->si_generation++; | 309 | stateid->si_generation++; |
310 | /* Wraparound recommendation from 3530bis-13 9.1.3.2: */ | ||
311 | if (stateid->si_generation == 0) | ||
312 | stateid->si_generation = 1; | ||
296 | } | 313 | } |
297 | 314 | ||
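
update_stateid() now skips generation zero on wraparound, following the 3530bis recommendation: a si_generation of zero is special on the wire (clients may present it to bypass generation checking), so a wrapped counter must land on one. A trivial sketch of the same increment:

#include <stdio.h>

static unsigned int bump_generation(unsigned int gen)
{
	gen++;
	if (gen == 0)  /* zero is reserved as the "special" seqid */
		gen = 1;
	return gen;
}

int main(void)
{
	/* Wrapping from the maximum u32 lands on 1, never 0. */
	printf("%u -> %u\n", 0xffffffffu, bump_generation(0xffffffffu));
	return 0;
}
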
298 | /* A reasonable value for REPLAY_ISIZE was estimated as follows: | 315 | /* A reasonable value for REPLAY_ISIZE was estimated as follows: |
@@ -312,49 +329,57 @@ struct nfs4_replay { | |||
312 | __be32 rp_status; | 329 | __be32 rp_status; |
313 | unsigned int rp_buflen; | 330 | unsigned int rp_buflen; |
314 | char *rp_buf; | 331 | char *rp_buf; |
315 | unsigned int rp_allocated; | ||
316 | struct knfsd_fh rp_openfh; | 332 | struct knfsd_fh rp_openfh; |
317 | char rp_ibuf[NFSD4_REPLAY_ISIZE]; | 333 | char rp_ibuf[NFSD4_REPLAY_ISIZE]; |
318 | }; | 334 | }; |
319 | 335 | ||
320 | /* | ||
321 | * nfs4_stateowner can either be an open_owner, or a lock_owner | ||
322 | * | ||
323 | * so_idhash: stateid_hashtbl[] for open owner, lockstateid_hashtbl[] | ||
324 | * for lock_owner | ||
325 | * so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[] | ||
326 | * for lock_owner | ||
327 | * so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client | ||
328 | * struct is reaped. | ||
329 | * so_perfilestate: heads the list of nfs4_stateid (either open or lock) | ||
330 | * and is used to ensure no dangling nfs4_stateid references when we | ||
331 | * release a stateowner. | ||
332 | * so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when | ||
333 | * close is called to reap associated byte-range locks | ||
334 | * so_close_lru: (open) stateowner is placed on this list instead of being | ||
335 | * reaped (when so_perfilestate is empty) to hold the last close replay. | ||
336 | * reaped by laundramat thread after lease period. | ||
337 | */ | ||
338 | struct nfs4_stateowner { | 336 | struct nfs4_stateowner { |
339 | struct kref so_ref; | ||
340 | struct list_head so_idhash; /* hash by so_id */ | ||
341 | struct list_head so_strhash; /* hash by op_name */ | 337 | struct list_head so_strhash; /* hash by op_name */ |
342 | struct list_head so_perclient; | ||
343 | struct list_head so_stateids; | 338 | struct list_head so_stateids; |
344 | struct list_head so_perstateid; /* for lockowners only */ | ||
345 | struct list_head so_close_lru; /* tail queue */ | ||
346 | time_t so_time; /* time of placement on so_close_lru */ | ||
347 | int so_is_open_owner; /* 1=openowner,0=lockowner */ | ||
348 | u32 so_id; | ||
349 | struct nfs4_client * so_client; | 339 | struct nfs4_client * so_client; |
350 | /* after increment in ENCODE_SEQID_OP_TAIL, represents the next | 340 | /* after increment in ENCODE_SEQID_OP_TAIL, represents the next |
351 | * sequence id expected from the client: */ | 341 | * sequence id expected from the client: */ |
352 | u32 so_seqid; | 342 | u32 so_seqid; |
353 | struct xdr_netobj so_owner; /* open owner name */ | 343 | struct xdr_netobj so_owner; /* open owner name */ |
354 | int so_confirmed; /* successful OPEN_CONFIRM? */ | ||
355 | struct nfs4_replay so_replay; | 344 | struct nfs4_replay so_replay; |
345 | bool so_is_open_owner; | ||
356 | }; | 346 | }; |
357 | 347 | ||
348 | struct nfs4_openowner { | ||
349 | struct nfs4_stateowner oo_owner; /* must be first field */ | ||
350 | struct list_head oo_perclient; | ||
351 | /* | ||
352 | * We keep around openowners a little while after last close, | ||
353 | * which saves clients from having to confirm, and allows us to | ||
354 | * handle close replays if they come soon enough. The close_lru | ||
355 | * is a list of such openowners, to be reaped by the laundromat | ||
356 | * thread eventually if they remain unused: | ||
357 | */ | ||
358 | struct list_head oo_close_lru; | ||
359 | struct nfs4_ol_stateid *oo_last_closed_stid; | ||
360 | time_t oo_time; /* time of placement on oo_close_lru */ | ||
361 | #define NFS4_OO_CONFIRMED 1 | ||
362 | #define NFS4_OO_PURGE_CLOSE 2 | ||
363 | #define NFS4_OO_NEW 4 | ||
364 | unsigned char oo_flags; | ||
365 | }; | ||
366 | |||
367 | struct nfs4_lockowner { | ||
368 | struct nfs4_stateowner lo_owner; /* must be first element */ | ||
369 | struct list_head lo_perstateid; /* for lockowners only */ | ||
370 | struct list_head lo_list; /* for temporary uses */ | ||
371 | }; | ||
372 | |||
373 | static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) | ||
374 | { | ||
375 | return container_of(so, struct nfs4_openowner, oo_owner); | ||
376 | } | ||
377 | |||
378 | static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) | ||
379 | { | ||
380 | return container_of(so, struct nfs4_lockowner, lo_owner); | ||
381 | } | ||
382 | |||
358 | /* | 383 | /* |
359 | * nfs4_file: a file opened by some number of (open) nfs4_stateowners. | 384 | * nfs4_file: a file opened by some number of (open) nfs4_stateowners. |
360 | * o fi_perfile list is used to search for conflicting | 385 | * o fi_perfile list is used to search for conflicting |
@@ -368,17 +393,17 @@ struct nfs4_file { | |||
368 | /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ | 393 | /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ |
369 | struct file * fi_fds[3]; | 394 | struct file * fi_fds[3]; |
370 | /* | 395 | /* |
371 | * Each open or lock stateid contributes 1 to either | 396 | * Each open or lock stateid contributes 0-4 to the counts |
372 | * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending | 397 | * below depending on which bits are set in st_access_bitmap: |
373 | * on open or lock mode: | 398 | * 1 to fi_access[O_RDONLY] if NFS4_SHARE_ACCESS_READ is set |
399 | * + 1 to fi_access[O_WRONLY] if NFS4_SHARE_ACCESS_WRITE is set | ||
400 | * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. | ||
374 | */ | 401 | */ |
375 | atomic_t fi_access[2]; | 402 | atomic_t fi_access[2]; |
376 | struct file *fi_deleg_file; | 403 | struct file *fi_deleg_file; |
377 | struct file_lock *fi_lease; | 404 | struct file_lock *fi_lease; |
378 | atomic_t fi_delegees; | 405 | atomic_t fi_delegees; |
379 | struct inode *fi_inode; | 406 | struct inode *fi_inode; |
380 | u32 fi_id; /* used with stateowner->so_id | ||
381 | * for stateid_hashtbl hash */ | ||
382 | bool fi_had_conflict; | 407 | bool fi_had_conflict; |
383 | }; | 408 | }; |
384 | 409 | ||
@@ -408,50 +433,27 @@ static inline struct file *find_any_file(struct nfs4_file *f) | |||
408 | return f->fi_fds[O_RDONLY]; | 433 | return f->fi_fds[O_RDONLY]; |
409 | } | 434 | } |
410 | 435 | ||
411 | /* | 436 | /* "ol" stands for "Open or Lock". Better suggestions welcome. */ |
412 | * nfs4_stateid can either be an open stateid or (eventually) a lock stateid | 437 | struct nfs4_ol_stateid { |
413 | * | 438 | struct nfs4_stid st_stid; /* must be first field */ |
414 | * (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file | ||
415 | * | ||
416 | * st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry | ||
417 | * st_perfile: file_hashtbl[] entry. | ||
418 | * st_perfile_state: nfs4_stateowner->so_perfilestate | ||
419 | * st_perlockowner: (open stateid) list of lock nfs4_stateowners | ||
420 | * st_access_bmap: used only for open stateid | ||
421 | * st_deny_bmap: used only for open stateid | ||
422 | * st_openstp: open stateid lock stateid was derived from | ||
423 | * | ||
424 | * XXX: open stateids and lock stateids have diverged sufficiently that | ||
425 | * we should consider defining separate structs for the two cases. | ||
426 | */ | ||
427 | |||
428 | struct nfs4_stateid { | ||
429 | struct list_head st_hash; | ||
430 | struct list_head st_perfile; | 439 | struct list_head st_perfile; |
431 | struct list_head st_perstateowner; | 440 | struct list_head st_perstateowner; |
432 | struct list_head st_lockowners; | 441 | struct list_head st_lockowners; |
433 | struct nfs4_stateowner * st_stateowner; | 442 | struct nfs4_stateowner * st_stateowner; |
434 | struct nfs4_file * st_file; | 443 | struct nfs4_file * st_file; |
435 | stateid_t st_stateid; | ||
436 | unsigned long st_access_bmap; | 444 | unsigned long st_access_bmap; |
437 | unsigned long st_deny_bmap; | 445 | unsigned long st_deny_bmap; |
438 | struct nfs4_stateid * st_openstp; | 446 | struct nfs4_ol_stateid * st_openstp; |
439 | }; | 447 | }; |
440 | 448 | ||
449 | static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) | ||
450 | { | ||
451 | return container_of(s, struct nfs4_ol_stateid, st_stid); | ||
452 | } | ||
453 | |||
441 | /* flags for preprocess_seqid_op() */ | 454 | /* flags for preprocess_seqid_op() */ |
442 | #define HAS_SESSION 0x00000001 | ||
443 | #define CONFIRM 0x00000002 | ||
444 | #define OPEN_STATE 0x00000004 | ||
445 | #define LOCK_STATE 0x00000008 | ||
446 | #define RD_STATE 0x00000010 | 455 | #define RD_STATE 0x00000010 |
447 | #define WR_STATE 0x00000020 | 456 | #define WR_STATE 0x00000020 |
448 | #define CLOSE_STATE 0x00000040 | ||
449 | |||
450 | #define seqid_mutating_err(err) \ | ||
451 | (((err) != nfserr_stale_clientid) && \ | ||
452 | ((err) != nfserr_bad_seqid) && \ | ||
453 | ((err) != nfserr_stale_stateid) && \ | ||
454 | ((err) != nfserr_bad_stateid)) | ||
455 | 457 | ||
456 | struct nfsd4_compound_state; | 458 | struct nfsd4_compound_state; |
457 | 459 | ||
@@ -461,7 +463,8 @@ extern void nfs4_lock_state(void); | |||
461 | extern void nfs4_unlock_state(void); | 463 | extern void nfs4_unlock_state(void); |
462 | extern int nfs4_in_grace(void); | 464 | extern int nfs4_in_grace(void); |
463 | extern __be32 nfs4_check_open_reclaim(clientid_t *clid); | 465 | extern __be32 nfs4_check_open_reclaim(clientid_t *clid); |
464 | extern void nfs4_free_stateowner(struct kref *kref); | 466 | extern void nfs4_free_openowner(struct nfs4_openowner *); |
467 | extern void nfs4_free_lockowner(struct nfs4_lockowner *); | ||
465 | extern int set_callback_cred(void); | 468 | extern int set_callback_cred(void); |
466 | extern void nfsd4_probe_callback(struct nfs4_client *clp); | 469 | extern void nfsd4_probe_callback(struct nfs4_client *clp); |
467 | extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); | 470 | extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); |
@@ -473,7 +476,7 @@ extern void nfsd4_destroy_callback_queue(void); | |||
473 | extern void nfsd4_shutdown_callback(struct nfs4_client *); | 476 | extern void nfsd4_shutdown_callback(struct nfs4_client *); |
474 | extern void nfs4_put_delegation(struct nfs4_delegation *dp); | 477 | extern void nfs4_put_delegation(struct nfs4_delegation *dp); |
475 | extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); | 478 | extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); |
476 | extern void nfsd4_init_recdir(char *recdir_name); | 479 | extern void nfsd4_init_recdir(void); |
477 | extern int nfsd4_recdir_load(void); | 480 | extern int nfsd4_recdir_load(void); |
478 | extern void nfsd4_shutdown_recdir(void); | 481 | extern void nfsd4_shutdown_recdir(void); |
479 | extern int nfs4_client_to_reclaim(const char *name); | 482 | extern int nfs4_client_to_reclaim(const char *name); |
@@ -482,18 +485,7 @@ extern void nfsd4_recdir_purge_old(void); | |||
482 | extern int nfsd4_create_clid_dir(struct nfs4_client *clp); | 485 | extern int nfsd4_create_clid_dir(struct nfs4_client *clp); |
483 | extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); | 486 | extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); |
484 | extern void release_session_client(struct nfsd4_session *); | 487 | extern void release_session_client(struct nfsd4_session *); |
485 | extern __be32 nfs4_validate_stateid(stateid_t *, int); | 488 | extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *); |
486 | 489 | extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); | |
487 | static inline void | ||
488 | nfs4_put_stateowner(struct nfs4_stateowner *so) | ||
489 | { | ||
490 | kref_put(&so->so_ref, nfs4_free_stateowner); | ||
491 | } | ||
492 | |||
493 | static inline void | ||
494 | nfs4_get_stateowner(struct nfs4_stateowner *so) | ||
495 | { | ||
496 | kref_get(&so->so_ref); | ||
497 | } | ||
498 | 490 | ||
499 | #endif /* NFSD4_STATE_H */ | 491 | #endif /* NFSD4_STATE_H */ |
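The state.h portion of this change splits the old kref-counted nfs4_stateowner into distinct open-owner and lock-owner types (hence the two dedicated free routines replacing nfs4_free_stateowner, and the removal of the kref get/put helpers), and introduces struct nfs4_stid as a common base embedded as the first field of nfs4_ol_stateid, so openlockstateid() can recover the outer object with container_of(). A minimal sketch of that embedded-base pattern; "base" and "outer" are placeholder names, not nfsd identifiers:

    #include <linux/kernel.h>       /* container_of() */

    struct base {
            int kind;
    };

    struct outer {
            struct base b;          /* must be first field */
            int extra;
    };

    static inline struct outer *to_outer(struct base *b)
    {
            return container_of(b, struct outer, b);
    }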
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd0acca5370a..7a2e442623c8 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -168,6 +168,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) | |||
168 | { | 168 | { |
169 | if (d_mountpoint(dentry)) | 169 | if (d_mountpoint(dentry)) |
170 | return 1; | 170 | return 1; |
171 | if (nfsd4_is_junction(dentry)) | ||
172 | return 1; | ||
171 | if (!(exp->ex_flags & NFSEXP_V4ROOT)) | 173 | if (!(exp->ex_flags & NFSEXP_V4ROOT)) |
172 | return 0; | 174 | return 0; |
173 | return dentry->d_inode != NULL; | 175 | return dentry->d_inode != NULL; |
@@ -502,7 +504,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
502 | unsigned int flags = 0; | 504 | unsigned int flags = 0; |
503 | 505 | ||
504 | /* Get inode */ | 506 | /* Get inode */ |
505 | error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); | 507 | error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); |
506 | if (error) | 508 | if (error) |
507 | return error; | 509 | return error; |
508 | 510 | ||
@@ -592,6 +594,22 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac | |||
592 | return error; | 594 | return error; |
593 | } | 595 | } |
594 | 596 | ||
597 | #define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction." | ||
598 | #define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type" | ||
599 | int nfsd4_is_junction(struct dentry *dentry) | ||
600 | { | ||
601 | struct inode *inode = dentry->d_inode; | ||
602 | |||
603 | if (inode == NULL) | ||
604 | return 0; | ||
605 | if (inode->i_mode & S_IXUGO) | ||
606 | return 0; | ||
607 | if (!(inode->i_mode & S_ISVTX)) | ||
608 | return 0; | ||
609 | if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0) | ||
610 | return 0; | ||
611 | return 1; | ||
612 | } | ||
595 | #endif /* defined(CONFIG_NFSD_V4) */ | 613 | #endif /* defined(CONFIG_NFSD_V4) */ |
596 | 614 | ||
597 | #ifdef CONFIG_NFSD_V3 | 615 | #ifdef CONFIG_NFSD_V3 |
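nfsd4_is_junction() above classifies a dentry as an NFSv4 referral point when its mode has no execute bits but does have the sticky bit, and it carries a trusted.junction.type xattr; nfsd_mountpoint() then reports it like a mount point. A hypothetical userspace sketch of tagging a directory this way (the xattr value is purely illustrative, since only its presence matters to the kernel check):

    #include <sys/stat.h>
    #include <sys/xattr.h>

    /* Make "path" match the nfsd4_is_junction() test above:
     * sticky bit on, no rwx bits, trusted.junction.type present. */
    static int mark_junction(const char *path)
    {
            if (chmod(path, S_ISVTX))
                    return -1;
            return setxattr(path, "trusted.junction.type",
                            "nfs-basic", 9, 0); /* value is illustrative */
    }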
@@ -1352,7 +1370,7 @@ __be32 | |||
1352 | do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, | 1370 | do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, |
1353 | char *fname, int flen, struct iattr *iap, | 1371 | char *fname, int flen, struct iattr *iap, |
1354 | struct svc_fh *resfhp, int createmode, u32 *verifier, | 1372 | struct svc_fh *resfhp, int createmode, u32 *verifier, |
1355 | int *truncp, int *created) | 1373 | bool *truncp, bool *created) |
1356 | { | 1374 | { |
1357 | struct dentry *dentry, *dchild = NULL; | 1375 | struct dentry *dentry, *dchild = NULL; |
1358 | struct inode *dirp; | 1376 | struct inode *dirp; |
@@ -1632,10 +1650,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, | |||
1632 | err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); | 1650 | err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); |
1633 | if (err) | 1651 | if (err) |
1634 | goto out; | 1652 | goto out; |
1635 | err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP); | 1653 | err = fh_verify(rqstp, tfhp, 0, NFSD_MAY_NOP); |
1636 | if (err) | 1654 | if (err) |
1637 | goto out; | 1655 | goto out; |
1638 | 1656 | err = nfserr_isdir; | |
1657 | if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) | ||
1658 | goto out; | ||
1639 | err = nfserr_perm; | 1659 | err = nfserr_perm; |
1640 | if (!len) | 1660 | if (!len) |
1641 | goto out; | 1661 | goto out; |
@@ -2114,7 +2134,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
2114 | 2134 | ||
2115 | /* Allow read access to binaries even when mode 111 */ | 2135 | /* Allow read access to binaries even when mode 111 */ |
2116 | if (err == -EACCES && S_ISREG(inode->i_mode) && | 2136 | if (err == -EACCES && S_ISREG(inode->i_mode) && |
2117 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) | 2137 | (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || |
2138 | acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) | ||
2118 | err = inode_permission(inode, MAY_EXEC); | 2139 | err = inode_permission(inode, MAY_EXEC); |
2119 | 2140 | ||
2120 | return err? nfserrno(err) : 0; | 2141 | return err? nfserrno(err) : 0; |
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e0bbac04d1dd..3f54ad03bb2b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -10,21 +10,22 @@ | |||
10 | /* | 10 | /* |
11 | * Flags for nfsd_permission | 11 | * Flags for nfsd_permission |
12 | */ | 12 | */ |
13 | #define NFSD_MAY_NOP 0 | 13 | #define NFSD_MAY_NOP 0 |
14 | #define NFSD_MAY_EXEC 1 /* == MAY_EXEC */ | 14 | #define NFSD_MAY_EXEC 0x001 /* == MAY_EXEC */ |
15 | #define NFSD_MAY_WRITE 2 /* == MAY_WRITE */ | 15 | #define NFSD_MAY_WRITE 0x002 /* == MAY_WRITE */ |
16 | #define NFSD_MAY_READ 4 /* == MAY_READ */ | 16 | #define NFSD_MAY_READ 0x004 /* == MAY_READ */ |
17 | #define NFSD_MAY_SATTR 8 | 17 | #define NFSD_MAY_SATTR 0x008 |
18 | #define NFSD_MAY_TRUNC 16 | 18 | #define NFSD_MAY_TRUNC 0x010 |
19 | #define NFSD_MAY_LOCK 32 | 19 | #define NFSD_MAY_LOCK 0x020 |
20 | #define NFSD_MAY_MASK 63 | 20 | #define NFSD_MAY_MASK 0x03f |
21 | 21 | ||
22 | /* extra hints to permission and open routines: */ | 22 | /* extra hints to permission and open routines: */ |
23 | #define NFSD_MAY_OWNER_OVERRIDE 64 | 23 | #define NFSD_MAY_OWNER_OVERRIDE 0x040 |
24 | #define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ | 24 | #define NFSD_MAY_LOCAL_ACCESS 0x080 /* for device special files */ |
25 | #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 | 25 | #define NFSD_MAY_BYPASS_GSS_ON_ROOT 0x100 |
26 | #define NFSD_MAY_NOT_BREAK_LEASE 512 | 26 | #define NFSD_MAY_NOT_BREAK_LEASE 0x200 |
27 | #define NFSD_MAY_BYPASS_GSS 1024 | 27 | #define NFSD_MAY_BYPASS_GSS 0x400 |
28 | #define NFSD_MAY_READ_IF_EXEC 0x800 | ||
28 | 29 | ||
29 | #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) | 30 | #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) |
30 | #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) | 31 | #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) |
@@ -61,7 +62,7 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); | |||
61 | __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, | 62 | __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, |
62 | char *name, int len, struct iattr *attrs, | 63 | char *name, int len, struct iattr *attrs, |
63 | struct svc_fh *res, int createmode, | 64 | struct svc_fh *res, int createmode, |
64 | u32 *verifier, int *truncp, int *created); | 65 | u32 *verifier, bool *truncp, bool *created); |
65 | __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, | 66 | __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, |
66 | loff_t, unsigned long); | 67 | loff_t, unsigned long); |
67 | #endif /* CONFIG_NFSD_V3 */ | 68 | #endif /* CONFIG_NFSD_V3 */ |
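Rewriting the NFSD_MAY_* constants in hex makes it clear they are single bits of one mask, and makes room for the new NFSD_MAY_READ_IF_EXEC hint, which pairs with the nfsd_permission() hunk above: a read denied with -EACCES on a mode-111 regular file is retried as an execute-permission check. Illustrative caller shape, not a call site from this patch:

    /* A read-path caller opting in to the exec fallback: */
    err = nfsd_permission(rqstp, exp, dentry,
                          NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC);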
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d2a8d04428c7..2364747ee97d 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -81,7 +81,6 @@ struct nfsd4_access { | |||
81 | struct nfsd4_close { | 81 | struct nfsd4_close { |
82 | u32 cl_seqid; /* request */ | 82 | u32 cl_seqid; /* request */ |
83 | stateid_t cl_stateid; /* request+response */ | 83 | stateid_t cl_stateid; /* request+response */ |
84 | struct nfs4_stateowner * cl_stateowner; /* response */ | ||
85 | }; | 84 | }; |
86 | 85 | ||
87 | struct nfsd4_commit { | 86 | struct nfsd4_commit { |
@@ -131,7 +130,7 @@ struct nfsd4_link { | |||
131 | 130 | ||
132 | struct nfsd4_lock_denied { | 131 | struct nfsd4_lock_denied { |
133 | clientid_t ld_clientid; | 132 | clientid_t ld_clientid; |
134 | struct nfs4_stateowner *ld_sop; | 133 | struct xdr_netobj ld_owner; |
135 | u64 ld_start; | 134 | u64 ld_start; |
136 | u64 ld_length; | 135 | u64 ld_length; |
137 | u32 ld_type; | 136 | u32 ld_type; |
@@ -165,9 +164,6 @@ struct nfsd4_lock { | |||
165 | } ok; | 164 | } ok; |
166 | struct nfsd4_lock_denied denied; | 165 | struct nfsd4_lock_denied denied; |
167 | } u; | 166 | } u; |
168 | /* The lk_replay_owner is the open owner in the open_to_lock_owner | ||
169 | * case and the lock owner otherwise: */ | ||
170 | struct nfs4_stateowner *lk_replay_owner; | ||
171 | }; | 167 | }; |
172 | #define lk_new_open_seqid v.new.open_seqid | 168 | #define lk_new_open_seqid v.new.open_seqid |
173 | #define lk_new_open_stateid v.new.open_stateid | 169 | #define lk_new_open_stateid v.new.open_stateid |
@@ -188,7 +184,6 @@ struct nfsd4_lockt { | |||
188 | struct xdr_netobj lt_owner; | 184 | struct xdr_netobj lt_owner; |
189 | u64 lt_offset; | 185 | u64 lt_offset; |
190 | u64 lt_length; | 186 | u64 lt_length; |
191 | struct nfs4_stateowner * lt_stateowner; | ||
192 | struct nfsd4_lock_denied lt_denied; | 187 | struct nfsd4_lock_denied lt_denied; |
193 | }; | 188 | }; |
194 | 189 | ||
@@ -199,7 +194,6 @@ struct nfsd4_locku { | |||
199 | stateid_t lu_stateid; | 194 | stateid_t lu_stateid; |
200 | u64 lu_offset; | 195 | u64 lu_offset; |
201 | u64 lu_length; | 196 | u64 lu_length; |
202 | struct nfs4_stateowner *lu_stateowner; | ||
203 | }; | 197 | }; |
204 | 198 | ||
205 | 199 | ||
@@ -232,8 +226,11 @@ struct nfsd4_open { | |||
232 | u32 op_recall; /* recall */ | 226 | u32 op_recall; /* recall */ |
233 | struct nfsd4_change_info op_cinfo; /* response */ | 227 | struct nfsd4_change_info op_cinfo; /* response */ |
234 | u32 op_rflags; /* response */ | 228 | u32 op_rflags; /* response */ |
235 | int op_truncate; /* used during processing */ | 229 | bool op_truncate; /* used during processing */ |
236 | struct nfs4_stateowner *op_stateowner; /* used during processing */ | 230 | bool op_created; /* used during processing */ |
231 | struct nfs4_openowner *op_openowner; /* used during processing */ | ||
232 | struct nfs4_file *op_file; /* used during processing */ | ||
233 | struct nfs4_ol_stateid *op_stp; /* used during processing */ | ||
237 | struct nfs4_acl *op_acl; | 234 | struct nfs4_acl *op_acl; |
238 | }; | 235 | }; |
239 | #define op_iattr iattr | 236 | #define op_iattr iattr |
@@ -243,7 +240,6 @@ struct nfsd4_open_confirm { | |||
243 | stateid_t oc_req_stateid /* request */; | 240 | stateid_t oc_req_stateid /* request */; |
244 | u32 oc_seqid /* request */; | 241 | u32 oc_seqid /* request */; |
245 | stateid_t oc_resp_stateid /* response */; | 242 | stateid_t oc_resp_stateid /* response */; |
246 | struct nfs4_stateowner * oc_stateowner; /* response */ | ||
247 | }; | 243 | }; |
248 | 244 | ||
249 | struct nfsd4_open_downgrade { | 245 | struct nfsd4_open_downgrade { |
@@ -251,7 +247,6 @@ struct nfsd4_open_downgrade { | |||
251 | u32 od_seqid; | 247 | u32 od_seqid; |
252 | u32 od_share_access; | 248 | u32 od_share_access; |
253 | u32 od_share_deny; | 249 | u32 od_share_deny; |
254 | struct nfs4_stateowner *od_stateowner; | ||
255 | }; | 250 | }; |
256 | 251 | ||
257 | 252 | ||
@@ -325,8 +320,7 @@ struct nfsd4_setattr { | |||
325 | 320 | ||
326 | struct nfsd4_setclientid { | 321 | struct nfsd4_setclientid { |
327 | nfs4_verifier se_verf; /* request */ | 322 | nfs4_verifier se_verf; /* request */ |
328 | u32 se_namelen; /* request */ | 323 | struct xdr_netobj se_name; |
329 | char * se_name; /* request */ | ||
330 | u32 se_callback_prog; /* request */ | 324 | u32 se_callback_prog; /* request */ |
331 | u32 se_callback_netid_len; /* request */ | 325 | u32 se_callback_netid_len; /* request */ |
332 | char * se_callback_netid_val; /* request */ | 326 | char * se_callback_netid_val; /* request */ |
@@ -351,7 +345,6 @@ struct nfsd4_saved_compoundargs { | |||
351 | 345 | ||
352 | struct nfsd4_test_stateid { | 346 | struct nfsd4_test_stateid { |
353 | __be32 ts_num_ids; | 347 | __be32 ts_num_ids; |
354 | __be32 ts_has_session; | ||
355 | struct nfsd4_compoundargs *ts_saved_args; | 348 | struct nfsd4_compoundargs *ts_saved_args; |
356 | struct nfsd4_saved_compoundargs ts_savedp; | 349 | struct nfsd4_saved_compoundargs ts_savedp; |
357 | }; | 350 | }; |
@@ -405,6 +398,10 @@ struct nfsd4_destroy_session { | |||
405 | struct nfs4_sessionid sessionid; | 398 | struct nfs4_sessionid sessionid; |
406 | }; | 399 | }; |
407 | 400 | ||
401 | struct nfsd4_destroy_clientid { | ||
402 | clientid_t clientid; | ||
403 | }; | ||
404 | |||
408 | struct nfsd4_reclaim_complete { | 405 | struct nfsd4_reclaim_complete { |
409 | u32 rca_one_fs; | 406 | u32 rca_one_fs; |
410 | }; | 407 | }; |
@@ -532,6 +529,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, | |||
532 | struct nfsd4_compoundargs *); | 529 | struct nfsd4_compoundargs *); |
533 | int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, | 530 | int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, |
534 | struct nfsd4_compoundres *); | 531 | struct nfsd4_compoundres *); |
532 | int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); | ||
535 | void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); | 533 | void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); |
536 | void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); | 534 | void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); |
537 | __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, | 535 | __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, |
@@ -558,11 +556,13 @@ extern __be32 nfsd4_sequence(struct svc_rqst *, | |||
558 | extern __be32 nfsd4_destroy_session(struct svc_rqst *, | 556 | extern __be32 nfsd4_destroy_session(struct svc_rqst *, |
559 | struct nfsd4_compound_state *, | 557 | struct nfsd4_compound_state *, |
560 | struct nfsd4_destroy_session *); | 558 | struct nfsd4_destroy_session *); |
559 | extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); | ||
561 | __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); | 560 | __be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); |
562 | extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, | 561 | extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, |
563 | struct nfsd4_open *open); | 562 | struct nfsd4_open *open); |
564 | extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, | 563 | extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, |
565 | struct svc_fh *current_fh, struct nfsd4_open *open); | 564 | struct svc_fh *current_fh, struct nfsd4_open *open); |
565 | extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); | ||
566 | extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, | 566 | extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, |
567 | struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); | 567 | struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); |
568 | extern __be32 nfsd4_close(struct svc_rqst *rqstp, | 568 | extern __be32 nfsd4_close(struct svc_rqst *rqstp, |
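struct nfsd4_open now carries its processing state directly (op_openowner, op_file, and op_stp are looked up or preallocated up front) instead of a generic stateowner pointer, and nfsd4_cleanup_open_state() releases whatever process_open1/open2 did not consume. A rough sketch of the intended calling pattern, with the control flow simplified rather than copied from nfs4proc.c:

    __be32 status;

    status = nfsd4_process_open1(cstate, open);
    if (status)
            goto out;
    /* ... look up or create the target into cstate->current_fh ... */
    status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
    out:
    nfsd4_cleanup_open_state(open, status);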
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 81ecf9c0bf0a..194fb22ef79d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,20 +7185,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, | |||
7185 | { | 7185 | { |
7186 | int ret = 0; | 7186 | int ret = 0; |
7187 | struct buffer_head *dir_bh = NULL; | 7187 | struct buffer_head *dir_bh = NULL; |
7188 | struct ocfs2_security_xattr_info si = { | ||
7189 | .enable = 1, | ||
7190 | }; | ||
7191 | 7188 | ||
7192 | ret = ocfs2_init_security_get(inode, dir, qstr, &si); | 7189 | ret = ocfs2_init_security_get(inode, dir, qstr, NULL); |
7193 | if (!ret) { | 7190 | if (!ret) { |
7194 | ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, | ||
7195 | si.name, si.value, si.value_len, | ||
7196 | XATTR_CREATE); | ||
7197 | if (ret) { | ||
7198 | mlog_errno(ret); | ||
7199 | goto leave; | ||
7200 | } | ||
7201 | } else if (ret != -EOPNOTSUPP) { | ||
7202 | mlog_errno(ret); | 7191 | mlog_errno(ret); |
7203 | goto leave; | 7192 | goto leave; |
7204 | } | 7193 | } |
@@ -7255,6 +7244,22 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, | |||
7255 | name, value, size, flags); | 7244 | name, value, size, flags); |
7256 | } | 7245 | } |
7257 | 7246 | ||
7247 | int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, | ||
7248 | void *fs_info) | ||
7249 | { | ||
7250 | const struct xattr *xattr; | ||
7251 | int err = 0; | ||
7252 | |||
7253 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
7254 | err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, | ||
7255 | xattr->name, xattr->value, | ||
7256 | xattr->value_len, XATTR_CREATE); | ||
7257 | if (err) | ||
7258 | break; | ||
7259 | } | ||
7260 | return err; | ||
7261 | } | ||
7262 | |||
7258 | int ocfs2_init_security_get(struct inode *inode, | 7263 | int ocfs2_init_security_get(struct inode *inode, |
7259 | struct inode *dir, | 7264 | struct inode *dir, |
7260 | const struct qstr *qstr, | 7265 | const struct qstr *qstr, |
@@ -7263,8 +7268,13 @@ int ocfs2_init_security_get(struct inode *inode, | |||
7263 | /* check whether ocfs2 support feature xattr */ | 7268 | /* check whether ocfs2 support feature xattr */ |
7264 | if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) | 7269 | if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) |
7265 | return -EOPNOTSUPP; | 7270 | return -EOPNOTSUPP; |
7266 | return security_inode_init_security(inode, dir, qstr, &si->name, | 7271 | if (si) |
7267 | &si->value, &si->value_len); | 7272 | return security_old_inode_init_security(inode, dir, qstr, |
7273 | &si->name, &si->value, | ||
7274 | &si->value_len); | ||
7275 | |||
7276 | return security_inode_init_security(inode, dir, qstr, | ||
7277 | &ocfs2_initxattrs, NULL); | ||
7268 | } | 7278 | } |
7269 | 7279 | ||
7270 | int ocfs2_init_security_set(handle_t *handle, | 7280 | int ocfs2_init_security_set(handle_t *handle, |
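ocfs2 is converted here to the callback form of security_inode_init_security(): rather than receiving a single (name, value) pair to write itself, the filesystem supplies an initxattrs callback (ocfs2_initxattrs above) that persists every security xattr the LSMs produce, which also lets a filesystem batch them into one transaction. The single-pair contract survives as security_old_inode_init_security(), still used by reiserfs further down; its shape, for contrast (myfs_xattr_set() is a placeholder):

    char *name = NULL;
    void *value = NULL;
    size_t len = 0;
    int err;

    err = security_old_inode_init_security(inode, dir, qstr,
                                           &name, &value, &len);
    if (!err) {
            err = myfs_xattr_set(inode, name, value, len);
            kfree(name);
            kfree(value);
    }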
diff --git a/fs/open.c b/fs/open.c
--- a/fs/open.c
+++ b/fs/open.c
@@ -685,6 +685,10 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
685 | if (error) | 685 | if (error) |
686 | goto cleanup_all; | 686 | goto cleanup_all; |
687 | 687 | ||
688 | error = break_lease(inode, f->f_flags); | ||
689 | if (error) | ||
690 | goto cleanup_all; | ||
691 | |||
688 | if (!open && f->f_op) | 692 | if (!open && f->f_op) |
689 | open = f->f_op->open; | 693 | open = f->f_op->open; |
690 | if (open) { | 694 | if (open) { |
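Moving break_lease() into __dentry_open() makes every open path honor outstanding leases: a conflicting open now blocks (or fails, for nonblocking opens) until the lease holder has been notified and released its lease, or the lease-break timeout fires. An illustrative userspace view of what this guards:

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("data", O_RDONLY);

            if (fd < 0 || fcntl(fd, F_SETLEASE, F_RDLCK) < 0)
                    return 1;
            /* a conflicting open() in another process now raises SIGIO
             * here and waits until we do fcntl(fd, F_SETLEASE, F_UNLCK),
             * or until /proc/sys/fs/lease-break-time expires */
            pause();
            return 0;
    }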
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 10027b42b7e2..cea4623f1ed6 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -218,6 +218,8 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) | |||
218 | const struct posix_acl_entry *pa, *pe, *mask_obj; | 218 | const struct posix_acl_entry *pa, *pe, *mask_obj; |
219 | int found = 0; | 219 | int found = 0; |
220 | 220 | ||
221 | want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; | ||
222 | |||
221 | FOREACH_ACL_ENTRY(pa, acl, pe) { | 223 | FOREACH_ACL_ENTRY(pa, acl, pe) { |
222 | switch(pa->e_tag) { | 224 | switch(pa->e_tag) { |
223 | case ACL_USER_OBJ: | 225 | case ACL_USER_OBJ: |
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b654a1bc..42b274da92c3 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/time.h> | 10 | #include <linux/time.h> |
11 | #include <linux/irqnr.h> | 11 | #include <linux/irqnr.h> |
12 | #include <asm/cputime.h> | 12 | #include <asm/cputime.h> |
13 | #include <linux/tick.h> | ||
13 | 14 | ||
14 | #ifndef arch_irq_stat_cpu | 15 | #ifndef arch_irq_stat_cpu |
15 | #define arch_irq_stat_cpu(cpu) 0 | 16 | #define arch_irq_stat_cpu(cpu) 0 |
@@ -21,6 +22,35 @@ | |||
21 | #define arch_idle_time(cpu) 0 | 22 | #define arch_idle_time(cpu) 0 |
22 | #endif | 23 | #endif |
23 | 24 | ||
25 | static cputime64_t get_idle_time(int cpu) | ||
26 | { | ||
27 | u64 idle_time = get_cpu_idle_time_us(cpu, NULL); | ||
28 | cputime64_t idle; | ||
29 | |||
30 | if (idle_time == -1ULL) { | ||
31 | /* !NO_HZ so we can rely on cpustat.idle */ | ||
32 | idle = kstat_cpu(cpu).cpustat.idle; | ||
33 | idle = cputime64_add(idle, arch_idle_time(cpu)); | ||
34 | } else | ||
35 | idle = usecs_to_cputime(idle_time); | ||
36 | |||
37 | return idle; | ||
38 | } | ||
39 | |||
40 | static cputime64_t get_iowait_time(int cpu) | ||
41 | { | ||
42 | u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); | ||
43 | cputime64_t iowait; | ||
44 | |||
45 | if (iowait_time == -1ULL) | ||
46 | /* !NO_HZ so we can rely on cpustat.iowait */ | ||
47 | iowait = kstat_cpu(cpu).cpustat.iowait; | ||
48 | else | ||
49 | iowait = usecs_to_cputime(iowait_time); | ||
50 | |||
51 | return iowait; | ||
52 | } | ||
53 | |||
24 | static int show_stat(struct seq_file *p, void *v) | 54 | static int show_stat(struct seq_file *p, void *v) |
25 | { | 55 | { |
26 | int i, j; | 56 | int i, j; |
@@ -42,9 +72,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
42 | user = cputime64_add(user, kstat_cpu(i).cpustat.user); | 72 | user = cputime64_add(user, kstat_cpu(i).cpustat.user); |
43 | nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); | 73 | nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); |
44 | system = cputime64_add(system, kstat_cpu(i).cpustat.system); | 74 | system = cputime64_add(system, kstat_cpu(i).cpustat.system); |
45 | idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); | 75 | idle = cputime64_add(idle, get_idle_time(i)); |
46 | idle = cputime64_add(idle, arch_idle_time(i)); | 76 | iowait = cputime64_add(iowait, get_iowait_time(i)); |
47 | iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); | ||
48 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); | 77 | irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); |
49 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); | 78 | softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); |
50 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); | 79 | steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); |
@@ -76,14 +105,12 @@ static int show_stat(struct seq_file *p, void *v) | |||
76 | (unsigned long long)cputime64_to_clock_t(guest), | 105 | (unsigned long long)cputime64_to_clock_t(guest), |
77 | (unsigned long long)cputime64_to_clock_t(guest_nice)); | 106 | (unsigned long long)cputime64_to_clock_t(guest_nice)); |
78 | for_each_online_cpu(i) { | 107 | for_each_online_cpu(i) { |
79 | |||
80 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ | 108 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ |
81 | user = kstat_cpu(i).cpustat.user; | 109 | user = kstat_cpu(i).cpustat.user; |
82 | nice = kstat_cpu(i).cpustat.nice; | 110 | nice = kstat_cpu(i).cpustat.nice; |
83 | system = kstat_cpu(i).cpustat.system; | 111 | system = kstat_cpu(i).cpustat.system; |
84 | idle = kstat_cpu(i).cpustat.idle; | 112 | idle = get_idle_time(i); |
85 | idle = cputime64_add(idle, arch_idle_time(i)); | 113 | iowait = get_iowait_time(i); |
86 | iowait = kstat_cpu(i).cpustat.iowait; | ||
87 | irq = kstat_cpu(i).cpustat.irq; | 114 | irq = kstat_cpu(i).cpustat.irq; |
88 | softirq = kstat_cpu(i).cpustat.softirq; | 115 | softirq = kstat_cpu(i).cpustat.softirq; |
89 | steal = kstat_cpu(i).cpustat.steal; | 116 | steal = kstat_cpu(i).cpustat.steal; |
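Under NO_HZ the timer tick stops on idle CPUs, so the tick-driven cpustat.idle/iowait counters under-count idle time; get_idle_time()/get_iowait_time() therefore prefer the nohz bookkeeping from get_cpu_idle_time_us()/get_cpu_iowait_time_us() and fall back to the tick counters only when those return -1ULL. The /proc/stat format is unchanged; a minimal userspace sampler, assuming the usual field order (user nice system idle iowait irq softirq steal):

    #include <stdio.h>
    #include <unistd.h>

    static int read_cpu(unsigned long long v[8])
    {
            FILE *f = fopen("/proc/stat", "r");
            int n;

            if (!f)
                    return -1;
            n = fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
                       &v[0], &v[1], &v[2], &v[3],
                       &v[4], &v[5], &v[6], &v[7]);
            fclose(f);
            return n >= 5 ? 0 : -1;
    }

    int main(void)
    {
            unsigned long long a[8] = {0}, b[8] = {0}, total = 0, idle;
            int i;

            if (read_cpu(a))
                    return 1;
            sleep(1);
            if (read_cpu(b))
                    return 1;
            for (i = 0; i < 8; i++)
                    total += b[i] - a[i];
            idle = (b[3] - a[3]) + (b[4] - a[4]);   /* idle + iowait */
            printf("busy: %.1f%%\n",
                   total ? 100.0 * (total - idle) / total : 0.0);
            return 0;
    }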
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 25b6a887adb9..5afaa58a8630 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -877,30 +877,54 @@ struct numa_maps_private { | |||
877 | struct numa_maps md; | 877 | struct numa_maps md; |
878 | }; | 878 | }; |
879 | 879 | ||
880 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) | 880 | static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, |
881 | unsigned long nr_pages) | ||
881 | { | 882 | { |
882 | int count = page_mapcount(page); | 883 | int count = page_mapcount(page); |
883 | 884 | ||
884 | md->pages++; | 885 | md->pages += nr_pages; |
885 | if (pte_dirty || PageDirty(page)) | 886 | if (pte_dirty || PageDirty(page)) |
886 | md->dirty++; | 887 | md->dirty += nr_pages; |
887 | 888 | ||
888 | if (PageSwapCache(page)) | 889 | if (PageSwapCache(page)) |
889 | md->swapcache++; | 890 | md->swapcache += nr_pages; |
890 | 891 | ||
891 | if (PageActive(page) || PageUnevictable(page)) | 892 | if (PageActive(page) || PageUnevictable(page)) |
892 | md->active++; | 893 | md->active += nr_pages; |
893 | 894 | ||
894 | if (PageWriteback(page)) | 895 | if (PageWriteback(page)) |
895 | md->writeback++; | 896 | md->writeback += nr_pages; |
896 | 897 | ||
897 | if (PageAnon(page)) | 898 | if (PageAnon(page)) |
898 | md->anon++; | 899 | md->anon += nr_pages; |
899 | 900 | ||
900 | if (count > md->mapcount_max) | 901 | if (count > md->mapcount_max) |
901 | md->mapcount_max = count; | 902 | md->mapcount_max = count; |
902 | 903 | ||
903 | md->node[page_to_nid(page)]++; | 904 | md->node[page_to_nid(page)] += nr_pages; |
905 | } | ||
906 | |||
907 | static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | ||
908 | unsigned long addr) | ||
909 | { | ||
910 | struct page *page; | ||
911 | int nid; | ||
912 | |||
913 | if (!pte_present(pte)) | ||
914 | return NULL; | ||
915 | |||
916 | page = vm_normal_page(vma, addr, pte); | ||
917 | if (!page) | ||
918 | return NULL; | ||
919 | |||
920 | if (PageReserved(page)) | ||
921 | return NULL; | ||
922 | |||
923 | nid = page_to_nid(page); | ||
924 | if (!node_isset(nid, node_states[N_HIGH_MEMORY])) | ||
925 | return NULL; | ||
926 | |||
927 | return page; | ||
904 | } | 928 | } |
905 | 929 | ||
906 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | 930 | static int gather_pte_stats(pmd_t *pmd, unsigned long addr, |
@@ -912,26 +936,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | |||
912 | pte_t *pte; | 936 | pte_t *pte; |
913 | 937 | ||
914 | md = walk->private; | 938 | md = walk->private; |
915 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 939 | spin_lock(&walk->mm->page_table_lock); |
916 | do { | 940 | if (pmd_trans_huge(*pmd)) { |
917 | struct page *page; | 941 | if (pmd_trans_splitting(*pmd)) { |
918 | int nid; | 942 | spin_unlock(&walk->mm->page_table_lock); |
943 | wait_split_huge_page(md->vma->anon_vma, pmd); | ||
944 | } else { | ||
945 | pte_t huge_pte = *(pte_t *)pmd; | ||
946 | struct page *page; | ||
919 | 947 | ||
920 | if (!pte_present(*pte)) | 948 | page = can_gather_numa_stats(huge_pte, md->vma, addr); |
921 | continue; | 949 | if (page) |
950 | gather_stats(page, md, pte_dirty(huge_pte), | ||
951 | HPAGE_PMD_SIZE/PAGE_SIZE); | ||
952 | spin_unlock(&walk->mm->page_table_lock); | ||
953 | return 0; | ||
954 | } | ||
955 | } else { | ||
956 | spin_unlock(&walk->mm->page_table_lock); | ||
957 | } | ||
922 | 958 | ||
923 | page = vm_normal_page(md->vma, addr, *pte); | 959 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
960 | do { | ||
961 | struct page *page = can_gather_numa_stats(*pte, md->vma, addr); | ||
924 | if (!page) | 962 | if (!page) |
925 | continue; | 963 | continue; |
926 | 964 | gather_stats(page, md, pte_dirty(*pte), 1); | |
927 | if (PageReserved(page)) | ||
928 | continue; | ||
929 | |||
930 | nid = page_to_nid(page); | ||
931 | if (!node_isset(nid, node_states[N_HIGH_MEMORY])) | ||
932 | continue; | ||
933 | |||
934 | gather_stats(page, md, pte_dirty(*pte)); | ||
935 | 965 | ||
936 | } while (pte++, addr += PAGE_SIZE, addr != end); | 966 | } while (pte++, addr += PAGE_SIZE, addr != end); |
937 | pte_unmap_unlock(orig_pte, ptl); | 967 | pte_unmap_unlock(orig_pte, ptl); |
@@ -952,7 +982,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, | |||
952 | return 0; | 982 | return 0; |
953 | 983 | ||
954 | md = walk->private; | 984 | md = walk->private; |
955 | gather_stats(page, md, pte_dirty(*pte)); | 985 | gather_stats(page, md, pte_dirty(*pte), 1); |
956 | return 0; | 986 | return 0; |
957 | } | 987 | } |
958 | 988 | ||
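gather_pte_stats() now accounts transparent huge pages explicitly: a stable huge pmd is checked once through can_gather_numa_stats() and credited with HPAGE_PMD_SIZE/PAGE_SIZE base pages in the numa_maps counters, while a splitting pmd is waited on before falling back to the per-pte loop. On x86-64 that multiplier works out to:

    /* x86-64 defaults: PAGE_SHIFT = 12, HPAGE_PMD_SHIFT = 21 */
    unsigned long nr_pages = 1UL << (21 - 12);  /* 512 base pages per THP */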
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb25490c..10b6be3ca280 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, | |||
355 | * resolution (think about autofs) and thus deadlocks could arise. | 355 | * resolution (think about autofs) and thus deadlocks could arise. |
356 | */ | 356 | */ |
357 | if (cmds == Q_QUOTAON) { | 357 | if (cmds == Q_QUOTAON) { |
358 | ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); | 358 | ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); |
359 | if (ret) | 359 | if (ret) |
360 | pathp = ERR_PTR(ret); | 360 | pathp = ERR_PTR(ret); |
361 | else | 361 | else |
diff --git a/fs/read_write.c b/fs/read_write.c
index 179f1c33ea57..dfd125798791 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -35,23 +35,45 @@ static inline int unsigned_offsets(struct file *file) | |||
35 | return file->f_mode & FMODE_UNSIGNED_OFFSET; | 35 | return file->f_mode & FMODE_UNSIGNED_OFFSET; |
36 | } | 36 | } |
37 | 37 | ||
38 | static loff_t lseek_execute(struct file *file, struct inode *inode, | ||
39 | loff_t offset, loff_t maxsize) | ||
40 | { | ||
41 | if (offset < 0 && !unsigned_offsets(file)) | ||
42 | return -EINVAL; | ||
43 | if (offset > maxsize) | ||
44 | return -EINVAL; | ||
45 | |||
46 | if (offset != file->f_pos) { | ||
47 | file->f_pos = offset; | ||
48 | file->f_version = 0; | ||
49 | } | ||
50 | return offset; | ||
51 | } | ||
52 | |||
38 | /** | 53 | /** |
39 | * generic_file_llseek_unlocked - lockless generic llseek implementation | 54 | * generic_file_llseek_size - generic llseek implementation for regular files |
40 | * @file: file structure to seek on | 55 | * @file: file structure to seek on |
41 | * @offset: file offset to seek to | 56 | * @offset: file offset to seek to |
42 | * @origin: type of seek | 57 | * @origin: type of seek |
58 | * @size: max size of file system | ||
59 | * | ||
60 | * This is a variant of generic_file_llseek that allows passing in a custom | ||
61 | * file size. | ||
43 | * | 62 | * |
44 | * Updates the file offset to the value specified by @offset and @origin. | 63 | * Synchronization: |
45 | * Locking must be provided by the caller. | 64 | * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) |
65 | * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. | ||
66 | * read/writes behave like SEEK_SET against seeks. | ||
46 | */ | 67 | */ |
47 | loff_t | 68 | loff_t |
48 | generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | 69 | generic_file_llseek_size(struct file *file, loff_t offset, int origin, |
70 | loff_t maxsize) | ||
49 | { | 71 | { |
50 | struct inode *inode = file->f_mapping->host; | 72 | struct inode *inode = file->f_mapping->host; |
51 | 73 | ||
52 | switch (origin) { | 74 | switch (origin) { |
53 | case SEEK_END: | 75 | case SEEK_END: |
54 | offset += inode->i_size; | 76 | offset += i_size_read(inode); |
55 | break; | 77 | break; |
56 | case SEEK_CUR: | 78 | case SEEK_CUR: |
57 | /* | 79 | /* |
@@ -62,14 +84,22 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | |||
62 | */ | 84 | */ |
63 | if (offset == 0) | 85 | if (offset == 0) |
64 | return file->f_pos; | 86 | return file->f_pos; |
65 | offset += file->f_pos; | 87 | /* |
66 | break; | 88 | * f_lock protects against read/modify/write race with other |
89 | * SEEK_CURs. Note that parallel writes and reads behave | ||
90 | * like SEEK_SET. | ||
91 | */ | ||
92 | spin_lock(&file->f_lock); | ||
93 | offset = lseek_execute(file, inode, file->f_pos + offset, | ||
94 | maxsize); | ||
95 | spin_unlock(&file->f_lock); | ||
96 | return offset; | ||
67 | case SEEK_DATA: | 97 | case SEEK_DATA: |
68 | /* | 98 | /* |
69 | * In the generic case the entire file is data, so as long as | 99 | * In the generic case the entire file is data, so as long as |
70 | * offset isn't at the end of the file then the offset is data. | 100 | * offset isn't at the end of the file then the offset is data. |
71 | */ | 101 | */ |
72 | if (offset >= inode->i_size) | 102 | if (offset >= i_size_read(inode)) |
73 | return -ENXIO; | 103 | return -ENXIO; |
74 | break; | 104 | break; |
75 | case SEEK_HOLE: | 105 | case SEEK_HOLE: |
@@ -77,26 +107,15 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | |||
77 | * There is a virtual hole at the end of the file, so as long as | 107 | * There is a virtual hole at the end of the file, so as long as |
78 | * offset isn't i_size or larger, return i_size. | 108 | * offset isn't i_size or larger, return i_size. |
79 | */ | 109 | */ |
80 | if (offset >= inode->i_size) | 110 | if (offset >= i_size_read(inode)) |
81 | return -ENXIO; | 111 | return -ENXIO; |
82 | offset = inode->i_size; | 112 | offset = i_size_read(inode); |
83 | break; | 113 | break; |
84 | } | 114 | } |
85 | 115 | ||
86 | if (offset < 0 && !unsigned_offsets(file)) | 116 | return lseek_execute(file, inode, offset, maxsize); |
87 | return -EINVAL; | ||
88 | if (offset > inode->i_sb->s_maxbytes) | ||
89 | return -EINVAL; | ||
90 | |||
91 | /* Special lock needed here? */ | ||
92 | if (offset != file->f_pos) { | ||
93 | file->f_pos = offset; | ||
94 | file->f_version = 0; | ||
95 | } | ||
96 | |||
97 | return offset; | ||
98 | } | 117 | } |
99 | EXPORT_SYMBOL(generic_file_llseek_unlocked); | 118 | EXPORT_SYMBOL(generic_file_llseek_size); |
100 | 119 | ||
101 | /** | 120 | /** |
102 | * generic_file_llseek - generic llseek implementation for regular files | 121 | * generic_file_llseek - generic llseek implementation for regular files |
@@ -110,13 +129,10 @@ EXPORT_SYMBOL(generic_file_llseek_unlocked); | |||
110 | */ | 129 | */ |
111 | loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) | 130 | loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) |
112 | { | 131 | { |
113 | loff_t rval; | 132 | struct inode *inode = file->f_mapping->host; |
114 | |||
115 | mutex_lock(&file->f_dentry->d_inode->i_mutex); | ||
116 | rval = generic_file_llseek_unlocked(file, offset, origin); | ||
117 | mutex_unlock(&file->f_dentry->d_inode->i_mutex); | ||
118 | 133 | ||
119 | return rval; | 134 | return generic_file_llseek_size(file, offset, origin, |
135 | inode->i_sb->s_maxbytes); | ||
120 | } | 136 | } |
121 | EXPORT_SYMBOL(generic_file_llseek); | 137 | EXPORT_SYMBOL(generic_file_llseek); |
122 | 138 | ||
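generic_file_llseek_size() turns the old i_mutex-protected llseek into a mostly lockless one: SEEK_SET/SEEK_END read the size with i_size_read() and update f_pos directly (atomic on 64-bit), and SEEK_CUR takes f_lock only to keep its read-modify-write of f_pos atomic against other SEEK_CURs, per the synchronization comment above. Filesystems whose maximum offset differs from sb->s_maxbytes can now call the helper directly; a hypothetical adopter, where myfs_max_bytes() is a placeholder for a per-superblock limit:

    static loff_t myfs_llseek(struct file *file, loff_t offset, int origin)
    {
            struct inode *inode = file->f_mapping->host;

            return generic_file_llseek_size(file, offset, origin,
                                            myfs_max_bytes(inode->i_sb));
    }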
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a159ba5a35e7..eb711060a6f2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -291,14 +291,13 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, | |||
291 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { | 291 | for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { |
292 | jb = jb_array + i; | 292 | jb = jb_array + i; |
293 | jb->journal_list = NULL; | 293 | jb->journal_list = NULL; |
294 | jb->bitmaps = vmalloc(mem); | 294 | jb->bitmaps = vzalloc(mem); |
295 | if (!jb->bitmaps) { | 295 | if (!jb->bitmaps) { |
296 | reiserfs_warning(sb, "clm-2000", "unable to " | 296 | reiserfs_warning(sb, "clm-2000", "unable to " |
297 | "allocate bitmaps for journal lists"); | 297 | "allocate bitmaps for journal lists"); |
298 | failed = 1; | 298 | failed = 1; |
299 | break; | 299 | break; |
300 | } | 300 | } |
301 | memset(jb->bitmaps, 0, mem); | ||
302 | } | 301 | } |
303 | if (failed) { | 302 | if (failed) { |
304 | free_list_bitmaps(sb, jb_array); | 303 | free_list_bitmaps(sb, jb_array); |
@@ -353,11 +352,10 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) | |||
353 | if (num_cnodes <= 0) { | 352 | if (num_cnodes <= 0) { |
354 | return NULL; | 353 | return NULL; |
355 | } | 354 | } |
356 | head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); | 355 | head = vzalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); |
357 | if (!head) { | 356 | if (!head) { |
358 | return NULL; | 357 | return NULL; |
359 | } | 358 | } |
360 | memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); | ||
361 | head[0].prev = NULL; | 359 | head[0].prev = NULL; |
362 | head[0].next = head + 1; | 360 | head[0].next = head + 1; |
363 | for (i = 1; i < num_cnodes; i++) { | 361 | for (i = 1; i < num_cnodes; i++) { |
@@ -2685,14 +2683,13 @@ int journal_init(struct super_block *sb, const char *j_dev_name, | |||
2685 | * dependency inversion warnings. | 2683 | * dependency inversion warnings. |
2686 | */ | 2684 | */ |
2687 | reiserfs_write_unlock(sb); | 2685 | reiserfs_write_unlock(sb); |
2688 | journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); | 2686 | journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); |
2689 | if (!journal) { | 2687 | if (!journal) { |
2690 | reiserfs_warning(sb, "journal-1256", | 2688 | reiserfs_warning(sb, "journal-1256", |
2691 | "unable to get memory for journal structure"); | 2689 | "unable to get memory for journal structure"); |
2692 | reiserfs_write_lock(sb); | 2690 | reiserfs_write_lock(sb); |
2693 | return 1; | 2691 | return 1; |
2694 | } | 2692 | } |
2695 | memset(journal, 0, sizeof(struct reiserfs_journal)); | ||
2696 | INIT_LIST_HEAD(&journal->j_bitmap_nodes); | 2693 | INIT_LIST_HEAD(&journal->j_bitmap_nodes); |
2697 | INIT_LIST_HEAD(&journal->j_prealloc_list); | 2694 | INIT_LIST_HEAD(&journal->j_prealloc_list); |
2698 | INIT_LIST_HEAD(&journal->j_working_list); | 2695 | INIT_LIST_HEAD(&journal->j_working_list); |
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index b6b9b1fe33b0..7483279b482d 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -111,15 +111,13 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) | |||
111 | /* allocate additional bitmap blocks, reallocate array of bitmap | 111 | /* allocate additional bitmap blocks, reallocate array of bitmap |
112 | * block pointers */ | 112 | * block pointers */ |
113 | bitmap = | 113 | bitmap = |
114 | vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); | 114 | vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); |
115 | if (!bitmap) { | 115 | if (!bitmap) { |
116 | /* Journal bitmaps are still supersized, but the memory isn't | 116 | /* Journal bitmaps are still supersized, but the memory isn't |
117 | * leaked, so I guess it's ok */ | 117 | * leaked, so I guess it's ok */ |
118 | printk("reiserfs_resize: unable to allocate memory.\n"); | 118 | printk("reiserfs_resize: unable to allocate memory.\n"); |
119 | return -ENOMEM; | 119 | return -ENOMEM; |
120 | } | 120 | } |
121 | memset(bitmap, 0, | ||
122 | sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); | ||
123 | for (i = 0; i < bmap_nr; i++) | 121 | for (i = 0; i < bmap_nr; i++) |
124 | bitmap[i] = old_bitmap[i]; | 122 | bitmap[i] = old_bitmap[i]; |
125 | 123 | ||
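The reiserfs hunks here and in journal.c are a mechanical conversion: vzalloc(n) replaces the vmalloc-then-memset pair with identical semantics:

    /* before */
    p = vmalloc(size);
    if (p)
            memset(p, 0, size);

    /* after: one call, same result */
    p = vzalloc(size);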
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ef66c18a9332..534668fa41be 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -66,8 +66,8 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, | |||
66 | if (IS_PRIVATE(dir)) | 66 | if (IS_PRIVATE(dir)) |
67 | return 0; | 67 | return 0; |
68 | 68 | ||
69 | error = security_inode_init_security(inode, dir, qstr, &sec->name, | 69 | error = security_old_inode_init_security(inode, dir, qstr, &sec->name, |
70 | &sec->value, &sec->length); | 70 | &sec->value, &sec->length); |
71 | if (error) { | 71 | if (error) { |
72 | if (error == -EOPNOTSUPP) | 72 | if (error == -EOPNOTSUPP) |
73 | error = 0; | 73 | error = 0; |
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 1360d4f88f41..048b59d5b2f0 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -19,9 +19,9 @@ config SQUASHFS | |||
19 | 19 | ||
20 | If you want to compile this as a module ( = code which can be | 20 | If you want to compile this as a module ( = code which can be |
21 | inserted in and removed from the running kernel whenever you want), | 21 | inserted in and removed from the running kernel whenever you want), |
22 | say M here and read <file:Documentation/modules.txt>. The module | 22 | say M here. The module will be called squashfs. Note that the root |
23 | will be called squashfs. Note that the root file system (the one | 23 | file system (the one containing the directory /) cannot be compiled |
24 | containing the directory /) cannot be compiled as a module. | 24 | as a module. |
25 | 25 | ||
26 | If unsure, say N. | 26 | If unsure, say N. |
27 | 27 | ||
diff --git a/fs/stat.c b/fs/stat.c
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
81 | 81 | ||
82 | if (!(flag & AT_SYMLINK_NOFOLLOW)) | 82 | if (!(flag & AT_SYMLINK_NOFOLLOW)) |
83 | lookup_flags |= LOOKUP_FOLLOW; | 83 | lookup_flags |= LOOKUP_FOLLOW; |
84 | if (flag & AT_NO_AUTOMOUNT) | ||
85 | lookup_flags |= LOOKUP_NO_AUTOMOUNT; | ||
86 | if (flag & AT_EMPTY_PATH) | 84 | if (flag & AT_EMPTY_PATH) |
87 | lookup_flags |= LOOKUP_EMPTY; | 85 | lookup_flags |= LOOKUP_EMPTY; |
88 | 86 | ||
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a830d8..48ffbdf0d017 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,20 +43,48 @@ static DEFINE_IDA(sysfs_ino_ida); | |||
43 | static void sysfs_link_sibling(struct sysfs_dirent *sd) | 43 | static void sysfs_link_sibling(struct sysfs_dirent *sd) |
44 | { | 44 | { |
45 | struct sysfs_dirent *parent_sd = sd->s_parent; | 45 | struct sysfs_dirent *parent_sd = sd->s_parent; |
46 | struct sysfs_dirent **pos; | ||
47 | 46 | ||
48 | BUG_ON(sd->s_sibling); | 47 | struct rb_node **p; |
49 | 48 | struct rb_node *parent; | |
50 | /* Store directory entries in order by ino. This allows | 49 | |
51 | * readdir to properly restart without having to add a | 50 | if (sysfs_type(sd) == SYSFS_DIR) |
52 | * cursor into the s_dir.children list. | 51 | parent_sd->s_dir.subdirs++; |
53 | */ | 52 | |
54 | for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) { | 53 | p = &parent_sd->s_dir.inode_tree.rb_node; |
55 | if (sd->s_ino < (*pos)->s_ino) | 54 | parent = NULL; |
56 | break; | 55 | while (*p) { |
56 | parent = *p; | ||
57 | #define node rb_entry(parent, struct sysfs_dirent, inode_node) | ||
58 | if (sd->s_ino < node->s_ino) { | ||
59 | p = &node->inode_node.rb_left; | ||
60 | } else if (sd->s_ino > node->s_ino) { | ||
61 | p = &node->inode_node.rb_right; | ||
62 | } else { | ||
63 | printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", | ||
64 | (unsigned long) sd->s_ino); | ||
65 | BUG(); | ||
66 | } | ||
67 | #undef node | ||
57 | } | 68 | } |
58 | sd->s_sibling = *pos; | 69 | rb_link_node(&sd->inode_node, parent, p); |
59 | *pos = sd; | 70 | rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); |
71 | |||
72 | p = &parent_sd->s_dir.name_tree.rb_node; | ||
73 | parent = NULL; | ||
74 | while (*p) { | ||
75 | int c; | ||
76 | parent = *p; | ||
77 | #define node rb_entry(parent, struct sysfs_dirent, name_node) | ||
78 | c = strcmp(sd->s_name, node->s_name); | ||
79 | if (c < 0) { | ||
80 | p = &node->name_node.rb_left; | ||
81 | } else { | ||
82 | p = &node->name_node.rb_right; | ||
83 | } | ||
84 | #undef node | ||
85 | } | ||
86 | rb_link_node(&sd->name_node, parent, p); | ||
87 | rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); | ||
60 | } | 88 | } |
61 | 89 | ||
62 | /** | 90 | /** |
@@ -71,16 +99,11 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd) | |||
71 | */ | 99 | */ |
72 | static void sysfs_unlink_sibling(struct sysfs_dirent *sd) | 100 | static void sysfs_unlink_sibling(struct sysfs_dirent *sd) |
73 | { | 101 | { |
74 | struct sysfs_dirent **pos; | 102 | if (sysfs_type(sd) == SYSFS_DIR) |
103 | sd->s_parent->s_dir.subdirs--; | ||
75 | 104 | ||
76 | for (pos = &sd->s_parent->s_dir.children; *pos; | 105 | rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); |
77 | pos = &(*pos)->s_sibling) { | 106 | rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); |
78 | if (*pos == sd) { | ||
79 | *pos = sd->s_sibling; | ||
80 | sd->s_sibling = NULL; | ||
81 | break; | ||
82 | } | ||
83 | } | ||
84 | } | 107 | } |
85 | 108 | ||
86 | /** | 109 | /** |
@@ -126,7 +149,6 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) | |||
126 | */ | 149 | */ |
127 | void sysfs_put_active(struct sysfs_dirent *sd) | 150 | void sysfs_put_active(struct sysfs_dirent *sd) |
128 | { | 151 | { |
129 | struct completion *cmpl; | ||
130 | int v; | 152 | int v; |
131 | 153 | ||
132 | if (unlikely(!sd)) | 154 | if (unlikely(!sd)) |
@@ -138,10 +160,9 @@ void sysfs_put_active(struct sysfs_dirent *sd) | |||
138 | return; | 160 | return; |
139 | 161 | ||
140 | /* atomic_dec_return() is a mb(), we'll always see the updated | 162 | /* atomic_dec_return() is a mb(), we'll always see the updated |
141 | * sd->s_sibling. | 163 | * sd->u.completion. |
142 | */ | 164 | */ |
143 | cmpl = (void *)sd->s_sibling; | 165 | complete(sd->u.completion); |
144 | complete(cmpl); | ||
145 | } | 166 | } |
146 | 167 | ||
147 | /** | 168 | /** |
@@ -155,16 +176,16 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) | |||
155 | DECLARE_COMPLETION_ONSTACK(wait); | 176 | DECLARE_COMPLETION_ONSTACK(wait); |
156 | int v; | 177 | int v; |
157 | 178 | ||
158 | BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); | 179 | BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); |
159 | 180 | ||
160 | if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) | 181 | if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) |
161 | return; | 182 | return; |
162 | 183 | ||
163 | sd->s_sibling = (void *)&wait; | 184 | sd->u.completion = (void *)&wait; |
164 | 185 | ||
165 | rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); | 186 | rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); |
166 | /* atomic_add_return() is a mb(), put_active() will always see | 187 | /* atomic_add_return() is a mb(), put_active() will always see |
167 | * the updated sd->s_sibling. | 188 | * the updated sd->u.completion. |
168 | */ | 189 | */ |
169 | v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); | 190 | v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); |
170 | 191 | ||
@@ -173,8 +194,6 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) | |||
173 | wait_for_completion(&wait); | 194 | wait_for_completion(&wait); |
174 | } | 195 | } |
175 | 196 | ||
176 | sd->s_sibling = NULL; | ||
177 | |||
178 | lock_acquired(&sd->dep_map, _RET_IP_); | 197 | lock_acquired(&sd->dep_map, _RET_IP_); |
179 | rwsem_release(&sd->dep_map, 1, _RET_IP_); | 198 | rwsem_release(&sd->dep_map, 1, _RET_IP_); |
180 | } | 199 | } |
@@ -384,6 +403,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) | |||
384 | { | 403 | { |
385 | struct sysfs_inode_attrs *ps_iattr; | 404 | struct sysfs_inode_attrs *ps_iattr; |
386 | 405 | ||
406 | if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) { | ||
407 | WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", | ||
408 | sysfs_ns_type(acxt->parent_sd)? "required": "invalid", | ||
409 | acxt->parent_sd->s_name, sd->s_name); | ||
410 | return -EINVAL; | ||
411 | } | ||
412 | |||
387 | if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) | 413 | if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) |
388 | return -EEXIST; | 414 | return -EEXIST; |
389 | 415 | ||
@@ -490,7 +516,7 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) | |||
490 | } | 516 | } |
491 | 517 | ||
492 | sd->s_flags |= SYSFS_FLAG_REMOVED; | 518 | sd->s_flags |= SYSFS_FLAG_REMOVED; |
493 | sd->s_sibling = acxt->removed; | 519 | sd->u.removed_list = acxt->removed; |
494 | acxt->removed = sd; | 520 | acxt->removed = sd; |
495 | } | 521 | } |
496 | 522 | ||
@@ -514,8 +540,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) | |||
514 | while (acxt->removed) { | 540 | while (acxt->removed) { |
515 | struct sysfs_dirent *sd = acxt->removed; | 541 | struct sysfs_dirent *sd = acxt->removed; |
516 | 542 | ||
517 | acxt->removed = sd->s_sibling; | 543 | acxt->removed = sd->u.removed_list; |
518 | sd->s_sibling = NULL; | ||
519 | 544 | ||
520 | sysfs_deactivate(sd); | 545 | sysfs_deactivate(sd); |
521 | unmap_bin_file(sd); | 546 | unmap_bin_file(sd); |
@@ -540,15 +565,43 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, | |||
540 | const void *ns, | 565 | const void *ns, |
541 | const unsigned char *name) | 566 | const unsigned char *name) |
542 | { | 567 | { |
543 | struct sysfs_dirent *sd; | 568 | struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; |
569 | struct sysfs_dirent *found = NULL; | ||
544 | 570 | ||
545 | for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) { | 571 | if (!!sysfs_ns_type(parent_sd) != !!ns) { |
546 | if (ns && sd->s_ns && (sd->s_ns != ns)) | 572 | WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", |
547 | continue; | 573 | sysfs_ns_type(parent_sd)? "required": "invalid", |
548 | if (!strcmp(sd->s_name, name)) | 574 | parent_sd->s_name, name); |
549 | return sd; | 575 | return NULL; |
550 | } | 576 | } |
551 | return NULL; | 577 | |
578 | while (p) { | ||
579 | int c; | ||
580 | #define node rb_entry(p, struct sysfs_dirent, name_node) | ||
581 | c = strcmp(name, node->s_name); | ||
582 | if (c < 0) { | ||
583 | p = node->name_node.rb_left; | ||
584 | } else if (c > 0) { | ||
585 | p = node->name_node.rb_right; | ||
586 | } else { | ||
587 | found = node; | ||
588 | p = node->name_node.rb_left; | ||
589 | } | ||
590 | #undef node | ||
591 | } | ||
592 | |||
593 | if (found) { | ||
594 | while (found->s_ns != ns) { | ||
595 | p = rb_next(&found->name_node); | ||
596 | if (!p) | ||
597 | return NULL; | ||
598 | found = rb_entry(p, struct sysfs_dirent, name_node); | ||
599 | if (strcmp(name, found->s_name)) | ||
600 | return NULL; | ||
601 | } | ||
602 | } | ||
603 | |||
604 | return found; | ||
552 | } | 605 | } |
553 | 606 | ||
554 | /** | 607 | /** |
@@ -744,21 +797,19 @@ void sysfs_remove_subdir(struct sysfs_dirent *sd) | |||
744 | static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) | 797 | static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd) |
745 | { | 798 | { |
746 | struct sysfs_addrm_cxt acxt; | 799 | struct sysfs_addrm_cxt acxt; |
747 | struct sysfs_dirent **pos; | 800 | struct rb_node *pos; |
748 | 801 | ||
749 | if (!dir_sd) | 802 | if (!dir_sd) |
750 | return; | 803 | return; |
751 | 804 | ||
752 | pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); | 805 | pr_debug("sysfs %s: removing dir\n", dir_sd->s_name); |
753 | sysfs_addrm_start(&acxt, dir_sd); | 806 | sysfs_addrm_start(&acxt, dir_sd); |
754 | pos = &dir_sd->s_dir.children; | 807 | pos = rb_first(&dir_sd->s_dir.inode_tree); |
755 | while (*pos) { | 808 | while (pos) { |
756 | struct sysfs_dirent *sd = *pos; | 809 | struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); |
757 | 810 | pos = rb_next(pos); | |
758 | if (sysfs_type(sd) != SYSFS_DIR) | 811 | if (sysfs_type(sd) != SYSFS_DIR) |
759 | sysfs_remove_one(&acxt, sd); | 812 | sysfs_remove_one(&acxt, sd); |
760 | else | ||
761 | pos = &(*pos)->s_sibling; | ||
762 | } | 813 | } |
763 | sysfs_addrm_finish(&acxt); | 814 | sysfs_addrm_finish(&acxt); |
764 | 815 | ||
@@ -881,12 +932,28 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns, | |||
881 | pos = NULL; | 932 | pos = NULL; |
882 | } | 933 | } |
883 | if (!pos && (ino > 1) && (ino < INT_MAX)) { | 934 | if (!pos && (ino > 1) && (ino < INT_MAX)) { |
884 | pos = parent_sd->s_dir.children; | 935 | struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; |
885 | while (pos && (ino > pos->s_ino)) | 936 | while (p) { |
886 | pos = pos->s_sibling; | 937 | #define node rb_entry(p, struct sysfs_dirent, inode_node) |
938 | if (ino < node->s_ino) { | ||
939 | pos = node; | ||
940 | p = node->inode_node.rb_left; | ||
941 | } else if (ino > node->s_ino) { | ||
942 | p = node->inode_node.rb_right; | ||
943 | } else { | ||
944 | pos = node; | ||
945 | break; | ||
946 | } | ||
947 | #undef node | ||
948 | } | ||
949 | } | ||
950 | while (pos && pos->s_ns != ns) { | ||
951 | struct rb_node *p = rb_next(&pos->inode_node); | ||
952 | if (!p) | ||
953 | pos = NULL; | ||
954 | else | ||
955 | pos = rb_entry(p, struct sysfs_dirent, inode_node); | ||
887 | } | 956 | } |
888 | while (pos && pos->s_ns && pos->s_ns != ns) | ||
889 | pos = pos->s_sibling; | ||
890 | return pos; | 957 | return pos; |
891 | } | 958 | } |
892 | 959 | ||
@@ -894,10 +961,13 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, | |||
894 | struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) | 961 | struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) |
895 | { | 962 | { |
896 | pos = sysfs_dir_pos(ns, parent_sd, ino, pos); | 963 | pos = sysfs_dir_pos(ns, parent_sd, ino, pos); |
897 | if (pos) | 964 | if (pos) do { |
898 | pos = pos->s_sibling; | 965 | struct rb_node *p = rb_next(&pos->inode_node); |
899 | while (pos && pos->s_ns && pos->s_ns != ns) | 966 | if (!p) |
900 | pos = pos->s_sibling; | 967 | pos = NULL; |
968 | else | ||
969 | pos = rb_entry(p, struct sysfs_dirent, inode_node); | ||
970 | } while (pos && pos->s_ns != ns); | ||
901 | return pos; | 971 | return pos; |
902 | } | 972 | } |
903 | 973 | ||
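dir.c replaces the single s_sibling list with two per-directory rbtrees, keyed by inode number (so readdir can restart at a stable position) and by name (so sysfs_find_dirent() is O(log n) instead of a linear scan); with s_sibling gone, its overloaded uses move into a union (u.completion during deactivation, u.removed_list on the addrm removal list). The insertions above follow the standard <linux/rbtree.h> pattern; its generic shape, with placeholder types:

    #include <linux/rbtree.h>

    struct item {
            struct rb_node node;
            unsigned long key;
    };

    static void item_insert(struct rb_root *root, struct item *new)
    {
            struct rb_node **p = &root->rb_node, *parent = NULL;

            while (*p) {
                    struct item *it = rb_entry(*p, struct item, node);

                    parent = *p;
                    if (new->key < it->key)
                            p = &(*p)->rb_left;
                    else
                            p = &(*p)->rb_right;
            }
            rb_link_node(&new->node, parent, p);
            rb_insert_color(&new->node, root);
    }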
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1ad8c93c1b85..d4e6080b4b20 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr) | |||
466 | mutex_lock(&sysfs_mutex); | 466 | mutex_lock(&sysfs_mutex); |
467 | 467 | ||
468 | if (sd && dir) | 468 | if (sd && dir) |
469 | /* Only directories are tagged, so no need to pass | ||
470 | * a tag explicitly. | ||
471 | */ | ||
472 | sd = sysfs_find_dirent(sd, NULL, dir); | 469 | sd = sysfs_find_dirent(sd, NULL, dir); |
473 | if (sd && attr) | 470 | if (sd && attr) |
474 | sd = sysfs_find_dirent(sd, NULL, attr); | 471 | sd = sysfs_find_dirent(sd, NULL, attr); |
@@ -488,17 +485,56 @@ const struct file_operations sysfs_file_operations = { | |||
488 | .poll = sysfs_poll, | 485 | .poll = sysfs_poll, |
489 | }; | 486 | }; |
490 | 487 | ||
488 | int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, | ||
489 | const void **pns) | ||
490 | { | ||
491 | struct sysfs_dirent *dir_sd = kobj->sd; | ||
492 | const struct sysfs_ops *ops; | ||
493 | const void *ns = NULL; | ||
494 | int err; | ||
495 | |||
496 | err = 0; | ||
497 | if (!sysfs_ns_type(dir_sd)) | ||
498 | goto out; | ||
499 | |||
500 | err = -EINVAL; | ||
501 | if (!kobj->ktype) | ||
502 | goto out; | ||
503 | ops = kobj->ktype->sysfs_ops; | ||
504 | if (!ops) | ||
505 | goto out; | ||
506 | if (!ops->namespace) | ||
507 | goto out; | ||
508 | |||
509 | err = 0; | ||
510 | ns = ops->namespace(kobj, attr); | ||
511 | out: | ||
512 | if (err) { | ||
513 | WARN(1, KERN_ERR "missing sysfs namespace attribute operation for " | ||
514 | "kobject: %s\n", kobject_name(kobj)); | ||
515 | } | ||
516 | *pns = ns; | ||
517 | return err; | ||
518 | } | ||
519 | |||
491 | int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, | 520 | int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, |
492 | const struct attribute *attr, int type, mode_t amode) | 521 | const struct attribute *attr, int type, mode_t amode) |
493 | { | 522 | { |
494 | umode_t mode = (amode & S_IALLUGO) | S_IFREG; | 523 | umode_t mode = (amode & S_IALLUGO) | S_IFREG; |
495 | struct sysfs_addrm_cxt acxt; | 524 | struct sysfs_addrm_cxt acxt; |
496 | struct sysfs_dirent *sd; | 525 | struct sysfs_dirent *sd; |
526 | const void *ns; | ||
497 | int rc; | 527 | int rc; |
498 | 528 | ||
529 | rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns); | ||
530 | if (rc) | ||
531 | return rc; | ||
532 | |||
499 | sd = sysfs_new_dirent(attr->name, mode, type); | 533 | sd = sysfs_new_dirent(attr->name, mode, type); |
500 | if (!sd) | 534 | if (!sd) |
501 | return -ENOMEM; | 535 | return -ENOMEM; |
536 | |||
537 | sd->s_ns = ns; | ||
502 | sd->s_attr.attr = (void *)attr; | 538 | sd->s_attr.attr = (void *)attr; |
503 | sysfs_dirent_init_lockdep(sd); | 539 | sysfs_dirent_init_lockdep(sd); |
504 | 540 | ||
@@ -586,12 +622,17 @@ int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, | |||
586 | { | 622 | { |
587 | struct sysfs_dirent *sd; | 623 | struct sysfs_dirent *sd; |
588 | struct iattr newattrs; | 624 | struct iattr newattrs; |
625 | const void *ns; | ||
589 | int rc; | 626 | int rc; |
590 | 627 | ||
628 | rc = sysfs_attr_ns(kobj, attr, &ns); | ||
629 | if (rc) | ||
630 | return rc; | ||
631 | |||
591 | mutex_lock(&sysfs_mutex); | 632 | mutex_lock(&sysfs_mutex); |
592 | 633 | ||
593 | rc = -ENOENT; | 634 | rc = -ENOENT; |
594 | sd = sysfs_find_dirent(kobj->sd, NULL, attr->name); | 635 | sd = sysfs_find_dirent(kobj->sd, ns, attr->name); |
595 | if (!sd) | 636 | if (!sd) |
596 | goto out; | 637 | goto out; |
597 | 638 | ||
@@ -616,7 +657,12 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); | |||
616 | 657 | ||
617 | void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) | 658 | void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) |
618 | { | 659 | { |
619 | sysfs_hash_and_remove(kobj->sd, NULL, attr->name); | 660 | const void *ns; |
661 | |||
662 | if (sysfs_attr_ns(kobj, attr, &ns)) | ||
663 | return; | ||
664 | |||
665 | sysfs_hash_and_remove(kobj->sd, ns, attr->name); | ||
620 | } | 666 | } |
621 | 667 | ||
622 | void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) | 668 | void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) |
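sysfs_attr_ns() above resolves an attribute's tag by calling ops->namespace(kobj, attr), so any ktype that lives under a namespace-tagged directory must supply that callback. A hedged sketch of what such a ktype's sysfs_ops could look like — the demo_obj type and its ns_tag field are hypothetical; only the .namespace signature is taken from the calls above:

	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	struct demo_obj {
		struct kobject	kobj;
		const void	*ns_tag;	/* hypothetical per-object tag */
	};

	static ssize_t demo_show(struct kobject *kobj, struct attribute *attr,
				 char *buf)
	{
		return 0;	/* stub */
	}

	static const void *demo_namespace(struct kobject *kobj,
					  const struct attribute *attr)
	{
		return container_of(kobj, struct demo_obj, kobj)->ns_tag;
	}

	static const struct sysfs_ops demo_sysfs_ops = {
		.show		= demo_show,
		.namespace	= demo_namespace,
	};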
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a81c72..e23f28894a3a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -202,18 +202,6 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) | |||
202 | inode->i_ctime = iattr->ia_ctime; | 202 | inode->i_ctime = iattr->ia_ctime; |
203 | } | 203 | } |
204 | 204 | ||
205 | static int sysfs_count_nlink(struct sysfs_dirent *sd) | ||
206 | { | ||
207 | struct sysfs_dirent *child; | ||
208 | int nr = 0; | ||
209 | |||
210 | for (child = sd->s_dir.children; child; child = child->s_sibling) | ||
211 | if (sysfs_type(child) == SYSFS_DIR) | ||
212 | nr++; | ||
213 | |||
214 | return nr + 2; | ||
215 | } | ||
216 | |||
217 | static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) | 205 | static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) |
218 | { | 206 | { |
219 | struct sysfs_inode_attrs *iattrs = sd->s_iattr; | 207 | struct sysfs_inode_attrs *iattrs = sd->s_iattr; |
@@ -230,7 +218,7 @@ static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) | |||
230 | } | 218 | } |
231 | 219 | ||
232 | if (sysfs_type(sd) == SYSFS_DIR) | 220 | if (sysfs_type(sd) == SYSFS_DIR) |
233 | inode->i_nlink = sysfs_count_nlink(sd); | 221 | inode->i_nlink = sd->s_dir.subdirs + 2; |
234 | } | 222 | } |
235 | 223 | ||
236 | int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | 224 | int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) |
@@ -336,8 +324,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha | |||
336 | sysfs_addrm_start(&acxt, dir_sd); | 324 | sysfs_addrm_start(&acxt, dir_sd); |
337 | 325 | ||
338 | sd = sysfs_find_dirent(dir_sd, ns, name); | 326 | sd = sysfs_find_dirent(dir_sd, ns, name); |
339 | if (sd && (sd->s_ns != ns)) | ||
340 | sd = NULL; | ||
341 | if (sd) | 327 | if (sd) |
342 | sysfs_remove_one(&acxt, sd); | 328 | sysfs_remove_one(&acxt, sd); |
343 | 329 | ||
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 845ab3ad229d..ce29e28b766d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,14 +11,18 @@ | |||
11 | #include <linux/lockdep.h> | 11 | #include <linux/lockdep.h> |
12 | #include <linux/kobject_ns.h> | 12 | #include <linux/kobject_ns.h> |
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/rbtree.h> | ||
14 | 15 | ||
15 | struct sysfs_open_dirent; | 16 | struct sysfs_open_dirent; |
16 | 17 | ||
17 | /* type-specific structures for sysfs_dirent->s_* union members */ | 18 | /* type-specific structures for sysfs_dirent->s_* union members */ |
18 | struct sysfs_elem_dir { | 19 | struct sysfs_elem_dir { |
19 | struct kobject *kobj; | 20 | struct kobject *kobj; |
20 | /* children list starts here and goes through sd->s_sibling */ | 21 | |
21 | struct sysfs_dirent *children; | 22 | unsigned long subdirs; |
23 | |||
24 | struct rb_root inode_tree; | ||
25 | struct rb_root name_tree; | ||
22 | }; | 26 | }; |
23 | 27 | ||
24 | struct sysfs_elem_symlink { | 28 | struct sysfs_elem_symlink { |
@@ -56,9 +60,16 @@ struct sysfs_dirent { | |||
56 | struct lockdep_map dep_map; | 60 | struct lockdep_map dep_map; |
57 | #endif | 61 | #endif |
58 | struct sysfs_dirent *s_parent; | 62 | struct sysfs_dirent *s_parent; |
59 | struct sysfs_dirent *s_sibling; | ||
60 | const char *s_name; | 63 | const char *s_name; |
61 | 64 | ||
65 | struct rb_node inode_node; | ||
66 | struct rb_node name_node; | ||
67 | |||
68 | union { | ||
69 | struct completion *completion; | ||
70 | struct sysfs_dirent *removed_list; | ||
71 | } u; | ||
72 | |||
62 | const void *s_ns; /* namespace tag */ | 73 | const void *s_ns; /* namespace tag */ |
63 | union { | 74 | union { |
64 | struct sysfs_elem_dir s_dir; | 75 | struct sysfs_elem_dir s_dir; |
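Each sysfs_dirent now carries two rb_node members, so the same dirent sits in both of the parent's trees at once: inode_tree ordered by s_ino and name_tree ordered by s_name. Insertion into either tree follows the usual rb_link_node()/rb_insert_color() pattern; a sketch reusing the hypothetical 'item' type from the earlier example:

	static void item_insert(struct rb_root *root, struct item *new)
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		while (*p) {
			struct item *it = rb_entry(*p, struct item, node);

			parent = *p;
			if (new->key < it->key)
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}
		rb_link_node(&new->node, parent, p);
		rb_insert_color(&new->node, root);
	}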
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 45174b534377..feb361e252ac 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -335,9 +335,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); | |||
335 | #define DBGKEY(key) ((char *)(key)) | 335 | #define DBGKEY(key) ((char *)(key)) |
336 | #define DBGKEY1(key) ((char *)(key)) | 336 | #define DBGKEY1(key) ((char *)(key)) |
337 | 337 | ||
338 | #define ubifs_dbg_msg(fmt, ...) do { \ | 338 | #define ubifs_dbg_msg(fmt, ...) do { \ |
339 | if (0) \ | 339 | if (0) \ |
340 | pr_debug(fmt "\n", ##__VA_ARGS__); \ | 340 | printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \ |
341 | } while (0) | 341 | } while (0) |
342 | 342 | ||
343 | #define dbg_dump_stack() | 343 | #define dbg_dump_stack() |
diff --git a/fs/xattr.c b/fs/xattr.c
index f060663ab70c..67583de8218c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/mount.h> | 14 | #include <linux/mount.h> |
15 | #include <linux/namei.h> | 15 | #include <linux/namei.h> |
16 | #include <linux/security.h> | 16 | #include <linux/security.h> |
17 | #include <linux/evm.h> | ||
17 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
18 | #include <linux/module.h> | 19 | #include <linux/module.h> |
19 | #include <linux/fsnotify.h> | 20 | #include <linux/fsnotify.h> |
@@ -166,6 +167,64 @@ out_noalloc: | |||
166 | } | 167 | } |
167 | EXPORT_SYMBOL_GPL(xattr_getsecurity); | 168 | EXPORT_SYMBOL_GPL(xattr_getsecurity); |
168 | 169 | ||
170 | /* | ||
171 | * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr | ||
172 | * | ||
173 | * Allocate memory, if not already allocated, or re-allocate correct size, | ||
174 | * before retrieving the extended attribute. | ||
175 | * | ||
176 | * Returns the result of alloc, if failed, or the getxattr operation. | ||
177 | */ | ||
178 | ssize_t | ||
179 | vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, | ||
180 | size_t xattr_size, gfp_t flags) | ||
181 | { | ||
182 | struct inode *inode = dentry->d_inode; | ||
183 | char *value = *xattr_value; | ||
184 | int error; | ||
185 | |||
186 | error = xattr_permission(inode, name, MAY_READ); | ||
187 | if (error) | ||
188 | return error; | ||
189 | |||
190 | if (!inode->i_op->getxattr) | ||
191 | return -EOPNOTSUPP; | ||
192 | |||
193 | error = inode->i_op->getxattr(dentry, name, NULL, 0); | ||
194 | if (error < 0) | ||
195 | return error; | ||
196 | |||
197 | if (!value || (error > xattr_size)) { | ||
198 | value = krealloc(*xattr_value, error + 1, flags); | ||
199 | if (!value) | ||
200 | return -ENOMEM; | ||
201 | memset(value, 0, error + 1); | ||
202 | } | ||
203 | |||
204 | error = inode->i_op->getxattr(dentry, name, value, error); | ||
205 | *xattr_value = value; | ||
206 | return error; | ||
207 | } | ||
208 | |||
209 | /* Compare an extended attribute value with the given value */ | ||
210 | int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, | ||
211 | const char *value, size_t size, gfp_t flags) | ||
212 | { | ||
213 | char *xattr_value = NULL; | ||
214 | int rc; | ||
215 | |||
216 | rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); | ||
217 | if (rc < 0) | ||
218 | return rc; | ||
219 | |||
220 | if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) | ||
221 | rc = -EINVAL; | ||
222 | else | ||
223 | rc = 0; | ||
224 | kfree(xattr_value); | ||
225 | return rc; | ||
226 | } | ||
227 | |||
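A short usage sketch of the two helpers added above (the caller and the "security.example" name are hypothetical; GFP_NOFS is just one plausible allocation mode): vfs_getxattr_alloc() sizes the buffer itself and hands ownership to the caller, which must kfree() it, while vfs_xattr_cmp() hides the allocation entirely.

	#include <linux/xattr.h>
	#include <linux/printk.h>
	#include <linux/slab.h>

	static int check_label(struct dentry *dentry)
	{
		char *value = NULL;
		ssize_t len;

		len = vfs_getxattr_alloc(dentry, "security.example",
					 &value, 0, GFP_NOFS);
		if (len < 0)
			return len;		/* missing xattr or I/O error */

		pr_info("xattr is %zd bytes\n", len);
		kfree(value);

		/* Or compare against an expected value in one call: */
		return vfs_xattr_cmp(dentry, "security.example",
				     "expected", 8, GFP_NOFS);
	}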
169 | ssize_t | 228 | ssize_t |
170 | vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) | 229 | vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) |
171 | { | 230 | { |
@@ -243,8 +302,10 @@ vfs_removexattr(struct dentry *dentry, const char *name) | |||
243 | error = inode->i_op->removexattr(dentry, name); | 302 | error = inode->i_op->removexattr(dentry, name); |
244 | mutex_unlock(&inode->i_mutex); | 303 | mutex_unlock(&inode->i_mutex); |
245 | 304 | ||
246 | if (!error) | 305 | if (!error) { |
247 | fsnotify_xattr(dentry); | 306 | fsnotify_xattr(dentry); |
307 | evm_inode_post_removexattr(dentry, name); | ||
308 | } | ||
248 | return error; | 309 | return error; |
249 | } | 310 | } |
250 | EXPORT_SYMBOL_GPL(vfs_removexattr); | 311 | EXPORT_SYMBOL_GPL(vfs_removexattr); |
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 75bb316529dd..427a4e82a588 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,44 +16,53 @@ | |||
16 | # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 16 | # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
17 | # | 17 | # |
18 | 18 | ||
19 | ccflags-y := -I$(src) -I$(src)/linux-2.6 | 19 | ccflags-y += -I$(src) # needed for trace events |
20 | ccflags-$(CONFIG_XFS_DEBUG) += -g | ||
21 | 20 | ||
22 | XFS_LINUX := linux-2.6 | 21 | ccflags-$(CONFIG_XFS_DEBUG) += -g |
23 | 22 | ||
24 | obj-$(CONFIG_XFS_FS) += xfs.o | 23 | obj-$(CONFIG_XFS_FS) += xfs.o |
25 | 24 | ||
26 | xfs-y += linux-2.6/xfs_trace.o | 25 | # this one should be compiled first, as the tracing macros can easily blow up |
27 | 26 | xfs-y += xfs_trace.o | |
28 | xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ | ||
29 | xfs_dquot.o \ | ||
30 | xfs_dquot_item.o \ | ||
31 | xfs_trans_dquot.o \ | ||
32 | xfs_qm_syscalls.o \ | ||
33 | xfs_qm_bhv.o \ | ||
34 | xfs_qm.o) | ||
35 | xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o | ||
36 | |||
37 | ifeq ($(CONFIG_XFS_QUOTA),y) | ||
38 | xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o | ||
39 | endif | ||
40 | |||
41 | xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o | ||
42 | xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o | ||
43 | xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o | ||
44 | xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o | ||
45 | xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o | ||
46 | 27 | ||
28 | # highlevel code | ||
29 | xfs-y += xfs_aops.o \ | ||
30 | xfs_bit.o \ | ||
31 | xfs_buf.o \ | ||
32 | xfs_dfrag.o \ | ||
33 | xfs_discard.o \ | ||
34 | xfs_error.o \ | ||
35 | xfs_export.o \ | ||
36 | xfs_file.o \ | ||
37 | xfs_filestream.o \ | ||
38 | xfs_fsops.o \ | ||
39 | xfs_fs_subr.o \ | ||
40 | xfs_globals.o \ | ||
41 | xfs_iget.o \ | ||
42 | xfs_ioctl.o \ | ||
43 | xfs_iomap.o \ | ||
44 | xfs_iops.o \ | ||
45 | xfs_itable.o \ | ||
46 | xfs_message.o \ | ||
47 | xfs_mru_cache.o \ | ||
48 | xfs_super.o \ | ||
49 | xfs_sync.o \ | ||
50 | xfs_xattr.o \ | ||
51 | xfs_rename.o \ | ||
52 | xfs_rw.o \ | ||
53 | xfs_utils.o \ | ||
54 | xfs_vnodeops.o \ | ||
55 | kmem.o \ | ||
56 | uuid.o | ||
47 | 57 | ||
58 | # code shared with libxfs | ||
48 | xfs-y += xfs_alloc.o \ | 59 | xfs-y += xfs_alloc.o \ |
49 | xfs_alloc_btree.o \ | 60 | xfs_alloc_btree.o \ |
50 | xfs_attr.o \ | 61 | xfs_attr.o \ |
51 | xfs_attr_leaf.o \ | 62 | xfs_attr_leaf.o \ |
52 | xfs_bit.o \ | ||
53 | xfs_bmap.o \ | 63 | xfs_bmap.o \ |
54 | xfs_bmap_btree.o \ | 64 | xfs_bmap_btree.o \ |
55 | xfs_btree.o \ | 65 | xfs_btree.o \ |
56 | xfs_buf_item.o \ | ||
57 | xfs_da_btree.o \ | 66 | xfs_da_btree.o \ |
58 | xfs_dir2.o \ | 67 | xfs_dir2.o \ |
59 | xfs_dir2_block.o \ | 68 | xfs_dir2_block.o \ |
@@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \ | |||
61 | xfs_dir2_leaf.o \ | 70 | xfs_dir2_leaf.o \ |
62 | xfs_dir2_node.o \ | 71 | xfs_dir2_node.o \ |
63 | xfs_dir2_sf.o \ | 72 | xfs_dir2_sf.o \ |
64 | xfs_error.o \ | ||
65 | xfs_extfree_item.o \ | ||
66 | xfs_filestream.o \ | ||
67 | xfs_fsops.o \ | ||
68 | xfs_ialloc.o \ | 73 | xfs_ialloc.o \ |
69 | xfs_ialloc_btree.o \ | 74 | xfs_ialloc_btree.o \ |
70 | xfs_iget.o \ | ||
71 | xfs_inode.o \ | 75 | xfs_inode.o \ |
72 | xfs_inode_item.o \ | ||
73 | xfs_iomap.o \ | ||
74 | xfs_itable.o \ | ||
75 | xfs_dfrag.o \ | ||
76 | xfs_log.o \ | ||
77 | xfs_log_cil.o \ | ||
78 | xfs_log_recover.o \ | 76 | xfs_log_recover.o \ |
79 | xfs_mount.o \ | 77 | xfs_mount.o \ |
80 | xfs_mru_cache.o \ | 78 | xfs_trans.o |
81 | xfs_rename.o \ | 79 | |
82 | xfs_trans.o \ | 80 | # low-level transaction/log code |
81 | xfs-y += xfs_log.o \ | ||
82 | xfs_log_cil.o \ | ||
83 | xfs_buf_item.o \ | ||
84 | xfs_extfree_item.o \ | ||
85 | xfs_inode_item.o \ | ||
83 | xfs_trans_ail.o \ | 86 | xfs_trans_ail.o \ |
84 | xfs_trans_buf.o \ | 87 | xfs_trans_buf.o \ |
85 | xfs_trans_extfree.o \ | 88 | xfs_trans_extfree.o \ |
86 | xfs_trans_inode.o \ | 89 | xfs_trans_inode.o \ |
87 | xfs_utils.o \ | ||
88 | xfs_vnodeops.o \ | ||
89 | xfs_rw.o | ||
90 | |||
91 | # Objects in linux/ | ||
92 | xfs-y += $(addprefix $(XFS_LINUX)/, \ | ||
93 | kmem.o \ | ||
94 | xfs_aops.o \ | ||
95 | xfs_buf.o \ | ||
96 | xfs_discard.o \ | ||
97 | xfs_export.o \ | ||
98 | xfs_file.o \ | ||
99 | xfs_fs_subr.o \ | ||
100 | xfs_globals.o \ | ||
101 | xfs_ioctl.o \ | ||
102 | xfs_iops.o \ | ||
103 | xfs_message.o \ | ||
104 | xfs_super.o \ | ||
105 | xfs_sync.o \ | ||
106 | xfs_xattr.o) | ||
107 | 90 | ||
108 | # Objects in support/ | 91 | # optional features |
109 | xfs-y += support/uuid.o | 92 | xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ |
93 | xfs_dquot_item.o \ | ||
94 | xfs_trans_dquot.o \ | ||
95 | xfs_qm_syscalls.o \ | ||
96 | xfs_qm_bhv.o \ | ||
97 | xfs_qm.o \ | ||
98 | xfs_quotaops.o | ||
99 | ifeq ($(CONFIG_XFS_QUOTA),y) | ||
100 | xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o | ||
101 | endif | ||
102 | xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o | ||
103 | xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o | ||
104 | xfs-$(CONFIG_PROC_FS) += xfs_stats.o | ||
105 | xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o | ||
106 | xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o | ||
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c
index a907de565db3..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/kmem.c
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h
index f7c8f7a9ea6d..292eff198030 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/kmem.h
@@ -61,12 +61,7 @@ extern void kmem_free(const void *); | |||
61 | 61 | ||
62 | static inline void *kmem_zalloc_large(size_t size) | 62 | static inline void *kmem_zalloc_large(size_t size) |
63 | { | 63 | { |
64 | void *ptr; | 64 | return vzalloc(size); |
65 | |||
66 | ptr = vmalloc(size); | ||
67 | if (ptr) | ||
68 | memset(ptr, 0, size); | ||
69 | return ptr; | ||
70 | } | 65 | } |
71 | static inline void kmem_free_large(void *ptr) | 66 | static inline void kmem_free_large(void *ptr) |
72 | { | 67 | { |
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5c..ff6a19873e5c 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184c..387e695a184c 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index b83f76b6d410..b83f76b6d410 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..4732d71262cc 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 53ec3ea9a625..d8b11b7f94aa 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,5 +24,6 @@ | |||
24 | #define XFS_BUF_LOCK_TRACKING 1 | 24 | #define XFS_BUF_LOCK_TRACKING 1 |
25 | #endif | 25 | #endif |
26 | 26 | ||
27 | #include <linux-2.6/xfs_linux.h> | 27 | #include "xfs_linux.h" |
28 | |||
28 | #endif /* __XFS_H__ */ | 29 | #endif /* __XFS_H__ */ |
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c
index b6c4b3795c4a..b6c4b3795c4a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6530769a999b..4805f009f923 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -103,7 +103,7 @@ typedef struct xfs_agf { | |||
103 | /* disk block (xfs_daddr_t) in the AG */ | 103 | /* disk block (xfs_daddr_t) in the AG */ |
104 | #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) | 104 | #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) |
105 | #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) | 105 | #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) |
106 | #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) | 106 | #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) |
107 | 107 | ||
108 | extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, | 108 | extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, |
109 | xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); | 109 | xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); |
@@ -156,7 +156,7 @@ typedef struct xfs_agi { | |||
156 | /* disk block (xfs_daddr_t) in the AG */ | 156 | /* disk block (xfs_daddr_t) in the AG */ |
157 | #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) | 157 | #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) |
158 | #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) | 158 | #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) |
159 | #define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) | 159 | #define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) |
160 | 160 | ||
161 | extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, | 161 | extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, |
162 | xfs_agnumber_t agno, struct xfs_buf **bpp); | 162 | xfs_agnumber_t agno, struct xfs_buf **bpp); |
@@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, | |||
168 | #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) | 168 | #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) |
169 | #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) | 169 | #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) |
170 | #define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) | 170 | #define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) |
171 | #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) | 171 | #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) |
172 | 172 | ||
173 | typedef struct xfs_agfl { | 173 | typedef struct xfs_agfl { |
174 | __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ | 174 | __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1e00b3ef6274..ce84ffd0264c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -451,9 +451,8 @@ xfs_alloc_read_agfl( | |||
451 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 451 | XFS_FSS_TO_BB(mp, 1), 0, &bp); |
452 | if (error) | 452 | if (error) |
453 | return error; | 453 | return error; |
454 | ASSERT(bp); | 454 | ASSERT(!xfs_buf_geterror(bp)); |
455 | ASSERT(!XFS_BUF_GETERROR(bp)); | 455 | xfs_buf_set_ref(bp, XFS_AGFL_REF); |
456 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); | ||
457 | *bpp = bp; | 456 | *bpp = bp; |
458 | return 0; | 457 | return 0; |
459 | } | 458 | } |
@@ -2116,7 +2115,7 @@ xfs_read_agf( | |||
2116 | if (!*bpp) | 2115 | if (!*bpp) |
2117 | return 0; | 2116 | return 0; |
2118 | 2117 | ||
2119 | ASSERT(!XFS_BUF_GETERROR(*bpp)); | 2118 | ASSERT(!(*bpp)->b_error); |
2120 | agf = XFS_BUF_TO_AGF(*bpp); | 2119 | agf = XFS_BUF_TO_AGF(*bpp); |
2121 | 2120 | ||
2122 | /* | 2121 | /* |
@@ -2140,7 +2139,7 @@ xfs_read_agf( | |||
2140 | xfs_trans_brelse(tp, *bpp); | 2139 | xfs_trans_brelse(tp, *bpp); |
2141 | return XFS_ERROR(EFSCORRUPTED); | 2140 | return XFS_ERROR(EFSCORRUPTED); |
2142 | } | 2141 | } |
2143 | XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); | 2142 | xfs_buf_set_ref(*bpp, XFS_AGF_REF); |
2144 | return 0; | 2143 | return 0; |
2145 | } | 2144 | } |
2146 | 2145 | ||
@@ -2168,7 +2167,7 @@ xfs_alloc_read_agf( | |||
2168 | return error; | 2167 | return error; |
2169 | if (!*bpp) | 2168 | if (!*bpp) |
2170 | return 0; | 2169 | return 0; |
2171 | ASSERT(!XFS_BUF_GETERROR(*bpp)); | 2170 | ASSERT(!(*bpp)->b_error); |
2172 | 2171 | ||
2173 | agf = XFS_BUF_TO_AGF(*bpp); | 2172 | agf = XFS_BUF_TO_AGF(*bpp); |
2174 | pag = xfs_perag_get(mp, agno); | 2173 | pag = xfs_perag_get(mp, agno); |
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..11b2aad982d4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -38,40 +38,6 @@ | |||
38 | #include <linux/pagevec.h> | 38 | #include <linux/pagevec.h> |
39 | #include <linux/writeback.h> | 39 | #include <linux/writeback.h> |
40 | 40 | ||
41 | |||
42 | /* | ||
43 | * Prime number of hash buckets since address is used as the key. | ||
44 | */ | ||
45 | #define NVSYNC 37 | ||
46 | #define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) | ||
47 | static wait_queue_head_t xfs_ioend_wq[NVSYNC]; | ||
48 | |||
49 | void __init | ||
50 | xfs_ioend_init(void) | ||
51 | { | ||
52 | int i; | ||
53 | |||
54 | for (i = 0; i < NVSYNC; i++) | ||
55 | init_waitqueue_head(&xfs_ioend_wq[i]); | ||
56 | } | ||
57 | |||
58 | void | ||
59 | xfs_ioend_wait( | ||
60 | xfs_inode_t *ip) | ||
61 | { | ||
62 | wait_queue_head_t *wq = to_ioend_wq(ip); | ||
63 | |||
64 | wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); | ||
65 | } | ||
66 | |||
67 | STATIC void | ||
68 | xfs_ioend_wake( | ||
69 | xfs_inode_t *ip) | ||
70 | { | ||
71 | if (atomic_dec_and_test(&ip->i_iocount)) | ||
72 | wake_up(to_ioend_wq(ip)); | ||
73 | } | ||
74 | |||
75 | void | 41 | void |
76 | xfs_count_page_state( | 42 | xfs_count_page_state( |
77 | struct page *page, | 43 | struct page *page, |
@@ -115,25 +81,20 @@ xfs_destroy_ioend( | |||
115 | xfs_ioend_t *ioend) | 81 | xfs_ioend_t *ioend) |
116 | { | 82 | { |
117 | struct buffer_head *bh, *next; | 83 | struct buffer_head *bh, *next; |
118 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | ||
119 | 84 | ||
120 | for (bh = ioend->io_buffer_head; bh; bh = next) { | 85 | for (bh = ioend->io_buffer_head; bh; bh = next) { |
121 | next = bh->b_private; | 86 | next = bh->b_private; |
122 | bh->b_end_io(bh, !ioend->io_error); | 87 | bh->b_end_io(bh, !ioend->io_error); |
123 | } | 88 | } |
124 | 89 | ||
125 | /* | 90 | if (ioend->io_iocb) { |
126 | * Volume managers supporting multiple paths can send back ENODEV | 91 | if (ioend->io_isasync) { |
127 | * when the final path disappears. In this case continuing to fill | 92 | aio_complete(ioend->io_iocb, ioend->io_error ? |
128 | * the page cache with dirty data which cannot be written out is | 93 | ioend->io_error : ioend->io_result, 0); |
129 | * evil, so prevent that. | 94 | } |
130 | */ | 95 | inode_dio_done(ioend->io_inode); |
131 | if (unlikely(ioend->io_error == -ENODEV)) { | ||
132 | xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, | ||
133 | __FILE__, __LINE__); | ||
134 | } | 96 | } |
135 | 97 | ||
136 | xfs_ioend_wake(ip); | ||
137 | mempool_free(ioend, xfs_ioend_pool); | 98 | mempool_free(ioend, xfs_ioend_pool); |
138 | } | 99 | } |
139 | 100 | ||
@@ -156,6 +117,15 @@ xfs_ioend_new_eof( | |||
156 | } | 117 | } |
157 | 118 | ||
158 | /* | 119 | /* |
120 | * Fast and loose check if this write could update the on-disk inode size. | ||
121 | */ | ||
122 | static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) | ||
123 | { | ||
124 | return ioend->io_offset + ioend->io_size > | ||
125 | XFS_I(ioend->io_inode)->i_d.di_size; | ||
126 | } | ||
127 | |||
128 | /* | ||
159 | * Update on-disk file size now that data has been written to disk. The | 129 | * Update on-disk file size now that data has been written to disk. The |
160 | * current in-memory file size is i_size. If a write is beyond eof i_new_size | 130 | * current in-memory file size is i_size. If a write is beyond eof i_new_size |
161 | * will be the intended file size until i_size is updated. If this write does | 131 | * will be the intended file size until i_size is updated. If this write does |
@@ -173,9 +143,6 @@ xfs_setfilesize( | |||
173 | xfs_inode_t *ip = XFS_I(ioend->io_inode); | 143 | xfs_inode_t *ip = XFS_I(ioend->io_inode); |
174 | xfs_fsize_t isize; | 144 | xfs_fsize_t isize; |
175 | 145 | ||
176 | if (unlikely(ioend->io_error)) | ||
177 | return 0; | ||
178 | |||
179 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) | 146 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) |
180 | return EAGAIN; | 147 | return EAGAIN; |
181 | 148 | ||
@@ -192,6 +159,9 @@ xfs_setfilesize( | |||
192 | 159 | ||
193 | /* | 160 | /* |
194 | * Schedule IO completion handling on the final put of an ioend. | 161 | * Schedule IO completion handling on the final put of an ioend. |
162 | * | ||
163 | * If there is no work to do we might as well call it a day and free the | ||
164 | * ioend right now. | ||
195 | */ | 165 | */ |
196 | STATIC void | 166 | STATIC void |
197 | xfs_finish_ioend( | 167 | xfs_finish_ioend( |
@@ -200,8 +170,10 @@ xfs_finish_ioend( | |||
200 | if (atomic_dec_and_test(&ioend->io_remaining)) { | 170 | if (atomic_dec_and_test(&ioend->io_remaining)) { |
201 | if (ioend->io_type == IO_UNWRITTEN) | 171 | if (ioend->io_type == IO_UNWRITTEN) |
202 | queue_work(xfsconvertd_workqueue, &ioend->io_work); | 172 | queue_work(xfsconvertd_workqueue, &ioend->io_work); |
203 | else | 173 | else if (xfs_ioend_is_append(ioend)) |
204 | queue_work(xfsdatad_workqueue, &ioend->io_work); | 174 | queue_work(xfsdatad_workqueue, &ioend->io_work); |
175 | else | ||
176 | xfs_destroy_ioend(ioend); | ||
205 | } | 177 | } |
206 | } | 178 | } |
207 | 179 | ||
@@ -216,17 +188,24 @@ xfs_end_io( | |||
216 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 188 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
217 | int error = 0; | 189 | int error = 0; |
218 | 190 | ||
191 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | ||
192 | error = -EIO; | ||
193 | goto done; | ||
194 | } | ||
195 | if (ioend->io_error) | ||
196 | goto done; | ||
197 | |||
219 | /* | 198 | /* |
220 | * For unwritten extents we need to issue transactions to convert a | 199 | * For unwritten extents we need to issue transactions to convert a |
221 | * range to normal written extents after the data I/O has finished. | 200 | * range to normal written extents after the data I/O has finished. |
222 | */ | 201 | */ |
223 | if (ioend->io_type == IO_UNWRITTEN && | 202 | if (ioend->io_type == IO_UNWRITTEN) { |
224 | likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { | ||
225 | |||
226 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | 203 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
227 | ioend->io_size); | 204 | ioend->io_size); |
228 | if (error) | 205 | if (error) { |
229 | ioend->io_error = error; | 206 | ioend->io_error = -error; |
207 | goto done; | ||
208 | } | ||
230 | } | 209 | } |
231 | 210 | ||
232 | /* | 211 | /* |
@@ -236,6 +215,7 @@ xfs_end_io( | |||
236 | error = xfs_setfilesize(ioend); | 215 | error = xfs_setfilesize(ioend); |
237 | ASSERT(!error || error == EAGAIN); | 216 | ASSERT(!error || error == EAGAIN); |
238 | 217 | ||
218 | done: | ||
239 | /* | 219 | /* |
240 | * If we didn't complete processing of the ioend, requeue it to the | 220 | * If we didn't complete processing of the ioend, requeue it to the |
241 | * tail of the workqueue for another attempt later. Otherwise destroy | 221 | * tail of the workqueue for another attempt later. Otherwise destroy |
@@ -247,8 +227,6 @@ xfs_end_io( | |||
247 | /* ensure we don't spin on blocked ioends */ | 227 | /* ensure we don't spin on blocked ioends */ |
248 | delay(1); | 228 | delay(1); |
249 | } else { | 229 | } else { |
250 | if (ioend->io_iocb) | ||
251 | aio_complete(ioend->io_iocb, ioend->io_result, 0); | ||
252 | xfs_destroy_ioend(ioend); | 230 | xfs_destroy_ioend(ioend); |
253 | } | 231 | } |
254 | } | 232 | } |
@@ -285,13 +263,13 @@ xfs_alloc_ioend( | |||
285 | * all the I/O from calling the completion routine too early. | 263 | * all the I/O from calling the completion routine too early. |
286 | */ | 264 | */ |
287 | atomic_set(&ioend->io_remaining, 1); | 265 | atomic_set(&ioend->io_remaining, 1); |
266 | ioend->io_isasync = 0; | ||
288 | ioend->io_error = 0; | 267 | ioend->io_error = 0; |
289 | ioend->io_list = NULL; | 268 | ioend->io_list = NULL; |
290 | ioend->io_type = type; | 269 | ioend->io_type = type; |
291 | ioend->io_inode = inode; | 270 | ioend->io_inode = inode; |
292 | ioend->io_buffer_head = NULL; | 271 | ioend->io_buffer_head = NULL; |
293 | ioend->io_buffer_tail = NULL; | 272 | ioend->io_buffer_tail = NULL; |
294 | atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); | ||
295 | ioend->io_offset = 0; | 273 | ioend->io_offset = 0; |
296 | ioend->io_size = 0; | 274 | ioend->io_size = 0; |
297 | ioend->io_iocb = NULL; | 275 | ioend->io_iocb = NULL; |
@@ -337,8 +315,8 @@ xfs_map_blocks( | |||
337 | count = mp->m_maxioffset - offset; | 315 | count = mp->m_maxioffset - offset; |
338 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); | 316 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); |
339 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 317 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
340 | error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, | 318 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, |
341 | bmapi_flags, NULL, 0, imap, &nimaps, NULL); | 319 | imap, &nimaps, bmapi_flags); |
342 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 320 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
343 | 321 | ||
344 | if (error) | 322 | if (error) |
@@ -551,7 +529,6 @@ xfs_cancel_ioend( | |||
551 | unlock_buffer(bh); | 529 | unlock_buffer(bh); |
552 | } while ((bh = next_bh) != NULL); | 530 | } while ((bh = next_bh) != NULL); |
553 | 531 | ||
554 | xfs_ioend_wake(XFS_I(ioend->io_inode)); | ||
555 | mempool_free(ioend, xfs_ioend_pool); | 532 | mempool_free(ioend, xfs_ioend_pool); |
556 | } while ((ioend = next) != NULL); | 533 | } while ((ioend = next) != NULL); |
557 | } | 534 | } |
@@ -1161,8 +1138,8 @@ __xfs_get_blocks( | |||
1161 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); | 1138 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); |
1162 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 1139 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
1163 | 1140 | ||
1164 | error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb, | 1141 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, |
1165 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL); | 1142 | &imap, &nimaps, XFS_BMAPI_ENTIRE); |
1166 | if (error) | 1143 | if (error) |
1167 | goto out_unlock; | 1144 | goto out_unlock; |
1168 | 1145 | ||
@@ -1310,28 +1287,17 @@ xfs_end_io_direct_write( | |||
1310 | 1287 | ||
1311 | ioend->io_offset = offset; | 1288 | ioend->io_offset = offset; |
1312 | ioend->io_size = size; | 1289 | ioend->io_size = size; |
1290 | ioend->io_iocb = iocb; | ||
1291 | ioend->io_result = ret; | ||
1313 | if (private && size > 0) | 1292 | if (private && size > 0) |
1314 | ioend->io_type = IO_UNWRITTEN; | 1293 | ioend->io_type = IO_UNWRITTEN; |
1315 | 1294 | ||
1316 | if (is_async) { | 1295 | if (is_async) { |
1317 | /* | 1296 | ioend->io_isasync = 1; |
1318 | * If we are converting an unwritten extent we need to delay | ||
1319 | * the AIO completion until after the unwritten extent | ||
1320 | * conversion has completed, otherwise do it ASAP. | ||
1321 | */ | ||
1322 | if (ioend->io_type == IO_UNWRITTEN) { | ||
1323 | ioend->io_iocb = iocb; | ||
1324 | ioend->io_result = ret; | ||
1325 | } else { | ||
1326 | aio_complete(iocb, ret, 0); | ||
1327 | } | ||
1328 | xfs_finish_ioend(ioend); | 1297 | xfs_finish_ioend(ioend); |
1329 | } else { | 1298 | } else { |
1330 | xfs_finish_ioend_sync(ioend); | 1299 | xfs_finish_ioend_sync(ioend); |
1331 | } | 1300 | } |
1332 | |||
1333 | /* XXX: probably should move into the real I/O completion handler */ | ||
1334 | inode_dio_done(ioend->io_inode); | ||
1335 | } | 1301 | } |
1336 | 1302 | ||
1337 | STATIC ssize_t | 1303 | STATIC ssize_t |
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h
index 71f721e1a71f..116dd5c37034 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -47,6 +47,7 @@ typedef struct xfs_ioend { | |||
47 | unsigned int io_type; /* delalloc / unwritten */ | 47 | unsigned int io_type; /* delalloc / unwritten */ |
48 | int io_error; /* I/O error code */ | 48 | int io_error; /* I/O error code */ |
49 | atomic_t io_remaining; /* hold count */ | 49 | atomic_t io_remaining; /* hold count */ |
50 | unsigned int io_isasync : 1; /* needs aio_complete */ | ||
50 | struct inode *io_inode; /* file being written to */ | 51 | struct inode *io_inode; /* file being written to */ |
51 | struct buffer_head *io_buffer_head;/* buffer linked list head */ | 52 | struct buffer_head *io_buffer_head;/* buffer linked list head */ |
52 | struct buffer_head *io_buffer_tail;/* buffer linked list tail */ | 53 | struct buffer_head *io_buffer_tail;/* buffer linked list tail */ |
@@ -60,9 +61,6 @@ typedef struct xfs_ioend { | |||
60 | extern const struct address_space_operations xfs_address_space_operations; | 61 | extern const struct address_space_operations xfs_address_space_operations; |
61 | extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); | 62 | extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); |
62 | 63 | ||
63 | extern void xfs_ioend_init(void); | ||
64 | extern void xfs_ioend_wait(struct xfs_inode *); | ||
65 | |||
66 | extern void xfs_count_page_state(struct page *, int *, int *); | 64 | extern void xfs_count_page_state(struct page *, int *, int *); |
67 | 65 | ||
68 | #endif /* __XFS_AOPS_H__ */ | 66 | #endif /* __XFS_AOPS_H__ */ |
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cbae424fe1ba..1e5d97f86ea8 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -319,7 +319,7 @@ xfs_attr_set_int( | |||
319 | return (error); | 319 | return (error); |
320 | } | 320 | } |
321 | 321 | ||
322 | xfs_trans_ijoin(args.trans, dp); | 322 | xfs_trans_ijoin(args.trans, dp, 0); |
323 | 323 | ||
324 | /* | 324 | /* |
325 | * If the attribute list is non-existent or a shortform list, | 325 | * If the attribute list is non-existent or a shortform list, |
@@ -389,7 +389,7 @@ xfs_attr_set_int( | |||
389 | * a new one. We need the inode to be in all transactions. | 389 | * a new one. We need the inode to be in all transactions. |
390 | */ | 390 | */ |
391 | if (committed) | 391 | if (committed) |
392 | xfs_trans_ijoin(args.trans, dp); | 392 | xfs_trans_ijoin(args.trans, dp, 0); |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Commit the leaf transformation. We'll need another (linked) | 395 | * Commit the leaf transformation. We'll need another (linked) |
@@ -537,7 +537,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) | |||
537 | * No need to make quota reservations here. We expect to release some | 537 | * No need to make quota reservations here. We expect to release some |
538 | * blocks not allocate in the common case. | 538 | * blocks not allocate in the common case. |
539 | */ | 539 | */ |
540 | xfs_trans_ijoin(args.trans, dp); | 540 | xfs_trans_ijoin(args.trans, dp, 0); |
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Decide on what work routines to call based on the inode size. | 543 | * Decide on what work routines to call based on the inode size. |
@@ -809,7 +809,7 @@ xfs_attr_inactive(xfs_inode_t *dp) | |||
809 | * No need to make quota reservations here. We expect to release some | 809 | * No need to make quota reservations here. We expect to release some |
810 | * blocks, not allocate, in the common case. | 810 | * blocks, not allocate, in the common case. |
811 | */ | 811 | */ |
812 | xfs_trans_ijoin(trans, dp); | 812 | xfs_trans_ijoin(trans, dp, 0); |
813 | 813 | ||
814 | /* | 814 | /* |
815 | * Decide on what work routines to call based on the inode size. | 815 | * Decide on what work routines to call based on the inode size. |
@@ -823,18 +823,6 @@ xfs_attr_inactive(xfs_inode_t *dp) | |||
823 | if (error) | 823 | if (error) |
824 | goto out; | 824 | goto out; |
825 | 825 | ||
826 | /* | ||
827 | * Signal synchronous inactive transactions unless this is a | ||
828 | * synchronous mount filesystem in which case we know that we're here | ||
829 | * because we've been called out of xfs_inactive which means that the | ||
830 | * last reference is gone and the unlink transaction has already hit | ||
831 | * the disk so async inactive transactions are safe. | ||
832 | */ | ||
833 | if (!(mp->m_flags & XFS_MOUNT_WSYNC)) { | ||
834 | if (dp->i_d.di_anextents > 0) | ||
835 | xfs_trans_set_sync(trans); | ||
836 | } | ||
837 | |||
838 | error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); | 826 | error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); |
839 | if (error) | 827 | if (error) |
840 | goto out; | 828 | goto out; |
@@ -973,7 +961,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
973 | * a new one. We need the inode to be in all transactions. | 961 | * a new one. We need the inode to be in all transactions. |
974 | */ | 962 | */ |
975 | if (committed) | 963 | if (committed) |
976 | xfs_trans_ijoin(args->trans, dp); | 964 | xfs_trans_ijoin(args->trans, dp, 0); |
977 | 965 | ||
978 | /* | 966 | /* |
979 | * Commit the current trans (including the inode) and start | 967 | * Commit the current trans (including the inode) and start |
@@ -1075,7 +1063,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
1075 | * in all transactions. | 1063 | * in all transactions. |
1076 | */ | 1064 | */ |
1077 | if (committed) | 1065 | if (committed) |
1078 | xfs_trans_ijoin(args->trans, dp); | 1066 | xfs_trans_ijoin(args->trans, dp, 0); |
1079 | } else | 1067 | } else |
1080 | xfs_da_buf_done(bp); | 1068 | xfs_da_buf_done(bp); |
1081 | 1069 | ||
@@ -1149,7 +1137,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) | |||
1149 | * a new one. We need the inode to be in all transactions. | 1137 | * a new one. We need the inode to be in all transactions. |
1150 | */ | 1138 | */ |
1151 | if (committed) | 1139 | if (committed) |
1152 | xfs_trans_ijoin(args->trans, dp); | 1140 | xfs_trans_ijoin(args->trans, dp, 0); |
1153 | } else | 1141 | } else |
1154 | xfs_da_buf_done(bp); | 1142 | xfs_da_buf_done(bp); |
1155 | return(0); | 1143 | return(0); |
@@ -1303,7 +1291,7 @@ restart: | |||
1303 | * in all transactions. | 1291 | * in all transactions. |
1304 | */ | 1292 | */ |
1305 | if (committed) | 1293 | if (committed) |
1306 | xfs_trans_ijoin(args->trans, dp); | 1294 | xfs_trans_ijoin(args->trans, dp, 0); |
1307 | 1295 | ||
1308 | /* | 1296 | /* |
1309 | * Commit the node conversion and start the next | 1297 | * Commit the node conversion and start the next |
@@ -1340,7 +1328,7 @@ restart: | |||
1340 | * a new one. We need the inode to be in all transactions. | 1328 | * a new one. We need the inode to be in all transactions. |
1341 | */ | 1329 | */ |
1342 | if (committed) | 1330 | if (committed) |
1343 | xfs_trans_ijoin(args->trans, dp); | 1331 | xfs_trans_ijoin(args->trans, dp, 0); |
1344 | } else { | 1332 | } else { |
1345 | /* | 1333 | /* |
1346 | * Addition succeeded, update Btree hashvals. | 1334 | * Addition succeeded, update Btree hashvals. |
@@ -1452,7 +1440,7 @@ restart: | |||
1452 | * in all transactions. | 1440 | * in all transactions. |
1453 | */ | 1441 | */ |
1454 | if (committed) | 1442 | if (committed) |
1455 | xfs_trans_ijoin(args->trans, dp); | 1443 | xfs_trans_ijoin(args->trans, dp, 0); |
1456 | } | 1444 | } |
1457 | 1445 | ||
1458 | /* | 1446 | /* |
@@ -1584,7 +1572,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) | |||
1584 | * a new one. We need the inode to be in all transactions. | 1572 | * a new one. We need the inode to be in all transactions. |
1585 | */ | 1573 | */ |
1586 | if (committed) | 1574 | if (committed) |
1587 | xfs_trans_ijoin(args->trans, dp); | 1575 | xfs_trans_ijoin(args->trans, dp, 0); |
1588 | 1576 | ||
1589 | /* | 1577 | /* |
1590 | * Commit the Btree join operation and start a new trans. | 1578 | * Commit the Btree join operation and start a new trans. |
@@ -1635,7 +1623,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) | |||
1635 | * in all transactions. | 1623 | * in all transactions. |
1636 | */ | 1624 | */ |
1637 | if (committed) | 1625 | if (committed) |
1638 | xfs_trans_ijoin(args->trans, dp); | 1626 | xfs_trans_ijoin(args->trans, dp, 0); |
1639 | } else | 1627 | } else |
1640 | xfs_da_brelse(args->trans, bp); | 1628 | xfs_da_brelse(args->trans, bp); |
1641 | } | 1629 | } |
@@ -1975,10 +1963,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) | |||
1975 | lblkno = args->rmtblkno; | 1963 | lblkno = args->rmtblkno; |
1976 | while (valuelen > 0) { | 1964 | while (valuelen > 0) { |
1977 | nmap = ATTR_RMTVALUE_MAPSIZE; | 1965 | nmap = ATTR_RMTVALUE_MAPSIZE; |
1978 | error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, | 1966 | error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, |
1979 | args->rmtblkcnt, | 1967 | args->rmtblkcnt, map, &nmap, |
1980 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, | 1968 | XFS_BMAPI_ATTRFORK); |
1981 | NULL, 0, map, &nmap, NULL); | ||
1982 | if (error) | 1969 | if (error) |
1983 | return(error); | 1970 | return(error); |
1984 | ASSERT(nmap >= 1); | 1971 | ASSERT(nmap >= 1); |
@@ -2052,10 +2039,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2052 | */ | 2039 | */ |
2053 | xfs_bmap_init(args->flist, args->firstblock); | 2040 | xfs_bmap_init(args->flist, args->firstblock); |
2054 | nmap = 1; | 2041 | nmap = 1; |
2055 | error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, | 2042 | error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, |
2056 | blkcnt, | 2043 | blkcnt, |
2057 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | | 2044 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, |
2058 | XFS_BMAPI_WRITE, | ||
2059 | args->firstblock, args->total, &map, &nmap, | 2045 | args->firstblock, args->total, &map, &nmap, |
2060 | args->flist); | 2046 | args->flist); |
2061 | if (!error) { | 2047 | if (!error) { |
@@ -2074,7 +2060,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2074 | * a new one. We need the inode to be in all transactions. | 2060 | * a new one. We need the inode to be in all transactions. |
2075 | */ | 2061 | */ |
2076 | if (committed) | 2062 | if (committed) |
2077 | xfs_trans_ijoin(args->trans, dp); | 2063 | xfs_trans_ijoin(args->trans, dp, 0); |
2078 | 2064 | ||
2079 | ASSERT(nmap == 1); | 2065 | ASSERT(nmap == 1); |
2080 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && | 2066 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && |
@@ -2104,14 +2090,11 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2104 | */ | 2090 | */ |
2105 | xfs_bmap_init(args->flist, args->firstblock); | 2091 | xfs_bmap_init(args->flist, args->firstblock); |
2106 | nmap = 1; | 2092 | nmap = 1; |
2107 | error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, | 2093 | error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, |
2108 | args->rmtblkcnt, | 2094 | args->rmtblkcnt, &map, &nmap, |
2109 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, | 2095 | XFS_BMAPI_ATTRFORK); |
2110 | args->firstblock, 0, &map, &nmap, | 2096 | if (error) |
2111 | NULL); | ||
2112 | if (error) { | ||
2113 | return(error); | 2097 | return(error); |
2114 | } | ||
2115 | ASSERT(nmap == 1); | 2098 | ASSERT(nmap == 1); |
2116 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && | 2099 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && |
2117 | (map.br_startblock != HOLESTARTBLOCK)); | 2100 | (map.br_startblock != HOLESTARTBLOCK)); |
@@ -2121,17 +2104,17 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2121 | 2104 | ||
2122 | bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, | 2105 | bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, |
2123 | XBF_LOCK | XBF_DONT_BLOCK); | 2106 | XBF_LOCK | XBF_DONT_BLOCK); |
2124 | ASSERT(bp); | 2107 | if (!bp) |
2125 | ASSERT(!XFS_BUF_GETERROR(bp)); | 2108 | return ENOMEM; |
2126 | |||
2127 | tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : | 2109 | tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : |
2128 | XFS_BUF_SIZE(bp); | 2110 | XFS_BUF_SIZE(bp); |
2129 | xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); | 2111 | xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); |
2130 | if (tmp < XFS_BUF_SIZE(bp)) | 2112 | if (tmp < XFS_BUF_SIZE(bp)) |
2131 | xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); | 2113 | xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); |
2132 | if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ | 2114 | error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ |
2133 | return (error); | 2115 | xfs_buf_relse(bp); |
2134 | } | 2116 | if (error) |
2117 | return error; | ||
2135 | src += tmp; | 2118 | src += tmp; |
2136 | valuelen -= tmp; | 2119 | valuelen -= tmp; |
2137 | 2120 | ||
@@ -2167,16 +2150,12 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
2167 | /* | 2150 | /* |
2168 | * Try to remember where we decided to put the value. | 2151 | * Try to remember where we decided to put the value. |
2169 | */ | 2152 | */ |
2170 | xfs_bmap_init(args->flist, args->firstblock); | ||
2171 | nmap = 1; | 2153 | nmap = 1; |
2172 | error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, | 2154 | error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, |
2173 | args->rmtblkcnt, | 2155 | args->rmtblkcnt, &map, &nmap, |
2174 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, | 2156 | XFS_BMAPI_ATTRFORK); |
2175 | args->firstblock, 0, &map, &nmap, | 2157 | if (error) |
2176 | args->flist); | ||
2177 | if (error) { | ||
2178 | return(error); | 2158 | return(error); |
2179 | } | ||
2180 | ASSERT(nmap == 1); | 2159 | ASSERT(nmap == 1); |
2181 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && | 2160 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && |
2182 | (map.br_startblock != HOLESTARTBLOCK)); | 2161 | (map.br_startblock != HOLESTARTBLOCK)); |
@@ -2189,8 +2168,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
2189 | */ | 2168 | */ |
2190 | bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); | 2169 | bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); |
2191 | if (bp) { | 2170 | if (bp) { |
2192 | XFS_BUF_STALE(bp); | 2171 | xfs_buf_stale(bp); |
2193 | XFS_BUF_UNDELAYWRITE(bp); | ||
2194 | xfs_buf_relse(bp); | 2172 | xfs_buf_relse(bp); |
2195 | bp = NULL; | 2173 | bp = NULL; |
2196 | } | 2174 | } |
@@ -2228,7 +2206,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
2228 | * a new one. We need the inode to be in all transactions. | 2206 | * a new one. We need the inode to be in all transactions. |
2229 | */ | 2207 | */ |
2230 | if (committed) | 2208 | if (committed) |
2231 | xfs_trans_ijoin(args->trans, args->dp); | 2209 | xfs_trans_ijoin(args->trans, args->dp, 0); |
2232 | 2210 | ||
2233 | /* | 2211 | /* |
2234 | * Close out trans and start the next one in the chain. | 2212 | * Close out trans and start the next one in the chain. |
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 8fad9602542b..d4906e7c9787 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2926,9 +2926,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, | |||
2926 | * Try to remember where we decided to put the value. | 2926 | * Try to remember where we decided to put the value. |
2927 | */ | 2927 | */ |
2928 | nmap = 1; | 2928 | nmap = 1; |
2929 | error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, | 2929 | error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, |
2930 | XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, | 2930 | &map, &nmap, XFS_BMAPI_ATTRFORK); |
2931 | NULL, 0, &map, &nmap, NULL); | ||
2932 | if (error) { | 2931 | if (error) { |
2933 | return(error); | 2932 | return(error); |
2934 | } | 2933 | } |
@@ -2948,6 +2947,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, | |||
2948 | bp = xfs_trans_get_buf(*trans, | 2947 | bp = xfs_trans_get_buf(*trans, |
2949 | dp->i_mount->m_ddev_targp, | 2948 | dp->i_mount->m_ddev_targp, |
2950 | dblkno, dblkcnt, XBF_LOCK); | 2949 | dblkno, dblkcnt, XBF_LOCK); |
2950 | if (!bp) | ||
2951 | return ENOMEM; | ||
2951 | xfs_trans_binval(*trans, bp); | 2952 | xfs_trans_binval(*trans, bp); |
2952 | /* | 2953 | /* |
2953 | * Roll to next transaction. | 2954 | * Roll to next transaction. |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ab3e5c6c4642..c68baeb0974a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -50,17 +50,22 @@ | |||
50 | #include "xfs_trace.h" | 50 | #include "xfs_trace.h" |
51 | 51 | ||
52 | 52 | ||
53 | #ifdef DEBUG | ||
54 | STATIC void | ||
55 | xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); | ||
56 | #endif | ||
57 | |||
58 | kmem_zone_t *xfs_bmap_free_item_zone; | 53 | kmem_zone_t *xfs_bmap_free_item_zone; |
59 | 54 | ||
60 | /* | 55 | /* |
61 | * Prototypes for internal bmap routines. | 56 | * Prototypes for internal bmap routines. |
62 | */ | 57 | */ |
63 | 58 | ||
59 | #ifdef DEBUG | ||
60 | STATIC void | ||
61 | xfs_bmap_check_leaf_extents( | ||
62 | struct xfs_btree_cur *cur, | ||
63 | struct xfs_inode *ip, | ||
64 | int whichfork); | ||
65 | #else | ||
66 | #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) | ||
67 | #endif | ||
68 | |||
64 | 69 | ||
65 | /* | 70 | /* |
66 | * Called from xfs_bmap_add_attrfork to handle extents format files. | 71 | * Called from xfs_bmap_add_attrfork to handle extents format files. |
@@ -85,58 +90,6 @@ xfs_bmap_add_attrfork_local( | |||
85 | int *flags); /* inode logging flags */ | 90 | int *flags); /* inode logging flags */ |
86 | 91 | ||
87 | /* | 92 | /* |
88 | * Called by xfs_bmap_add_extent to handle cases converting a delayed | ||
89 | * allocation to a real allocation. | ||
90 | */ | ||
91 | STATIC int /* error */ | ||
92 | xfs_bmap_add_extent_delay_real( | ||
93 | struct xfs_trans *tp, /* transaction pointer */ | ||
94 | xfs_inode_t *ip, /* incore inode pointer */ | ||
95 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
96 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | ||
97 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
98 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ | ||
99 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | ||
100 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
101 | int *logflagsp); /* inode logging flags */ | ||
102 | |||
103 | /* | ||
104 | * Called by xfs_bmap_add_extent to handle cases converting a hole | ||
105 | * to a delayed allocation. | ||
106 | */ | ||
107 | STATIC int /* error */ | ||
108 | xfs_bmap_add_extent_hole_delay( | ||
109 | xfs_inode_t *ip, /* incore inode pointer */ | ||
110 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
111 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
112 | int *logflagsp); /* inode logging flags */ | ||
113 | |||
114 | /* | ||
115 | * Called by xfs_bmap_add_extent to handle cases converting a hole | ||
116 | * to a real allocation. | ||
117 | */ | ||
118 | STATIC int /* error */ | ||
119 | xfs_bmap_add_extent_hole_real( | ||
120 | xfs_inode_t *ip, /* incore inode pointer */ | ||
121 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
122 | xfs_btree_cur_t *cur, /* if null, not a btree */ | ||
123 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
124 | int *logflagsp, /* inode logging flags */ | ||
125 | int whichfork); /* data or attr fork */ | ||
126 | |||
127 | /* | ||
128 | * Called by xfs_bmap_add_extent to handle cases converting an unwritten | ||
129 | * allocation to a real allocation or vice versa. | ||
130 | */ | ||
131 | STATIC int /* error */ | ||
132 | xfs_bmap_add_extent_unwritten_real( | ||
133 | xfs_inode_t *ip, /* incore inode pointer */ | ||
134 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
135 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | ||
136 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
137 | int *logflagsp); /* inode logging flags */ | ||
138 | |||
139 | /* | ||
140 | * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. | 93 | * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. |
141 | * It figures out where to ask the underlying allocator to put the new extent. | 94 | * It figures out where to ask the underlying allocator to put the new extent. |
142 | */ | 95 | */ |
@@ -215,19 +168,6 @@ xfs_bmap_search_extents( | |||
215 | xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ | 168 | xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ |
216 | 169 | ||
217 | /* | 170 | /* |
218 | * Check the last inode extent to determine whether this allocation will result | ||
219 | * in blocks being allocated at the end of the file. When we allocate new data | ||
220 | * blocks at the end of the file which do not start at the previous data block, | ||
221 | * we will try to align the new blocks at stripe unit boundaries. | ||
222 | */ | ||
223 | STATIC int /* error */ | ||
224 | xfs_bmap_isaeof( | ||
225 | xfs_inode_t *ip, /* incore inode pointer */ | ||
226 | xfs_fileoff_t off, /* file offset in fsblocks */ | ||
227 | int whichfork, /* data or attribute fork */ | ||
228 | char *aeof); /* return value */ | ||
229 | |||
230 | /* | ||
231 | * Compute the worst-case number of indirect blocks that will be used | 171 | * Compute the worst-case number of indirect blocks that will be used |
232 | * for ip's delayed extent of length "len". | 172 | * for ip's delayed extent of length "len". |
233 | */ | 173 | */ |
@@ -431,188 +371,13 @@ xfs_bmap_add_attrfork_local( | |||
431 | } | 371 | } |
432 | 372 | ||
433 | /* | 373 | /* |
434 | * Called by xfs_bmapi to update file extent records and the btree | 374 | * Convert a delayed allocation to a real allocation. |
435 | * after allocating space (or doing a delayed allocation). | ||
436 | */ | ||
437 | STATIC int /* error */ | ||
438 | xfs_bmap_add_extent( | ||
439 | struct xfs_trans *tp, /* transaction pointer */ | ||
440 | xfs_inode_t *ip, /* incore inode pointer */ | ||
441 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
442 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | ||
443 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
444 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | ||
445 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
446 | int *logflagsp, /* inode logging flags */ | ||
447 | int whichfork) /* data or attr fork */ | ||
448 | { | ||
449 | xfs_btree_cur_t *cur; /* btree cursor or null */ | ||
450 | xfs_filblks_t da_new; /* new count del alloc blocks used */ | ||
451 | xfs_filblks_t da_old; /* old count del alloc blocks used */ | ||
452 | int error; /* error return value */ | ||
453 | xfs_ifork_t *ifp; /* inode fork ptr */ | ||
454 | int logflags; /* returned value */ | ||
455 | xfs_extnum_t nextents; /* number of extents in file now */ | ||
456 | |||
457 | XFS_STATS_INC(xs_add_exlist); | ||
458 | |||
459 | cur = *curp; | ||
460 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
461 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | ||
462 | da_old = da_new = 0; | ||
463 | error = 0; | ||
464 | |||
465 | ASSERT(*idx >= 0); | ||
466 | ASSERT(*idx <= nextents); | ||
467 | |||
468 | /* | ||
469 | * This is the first extent added to a new/empty file. | ||
470 | * Special case this one, so other routines get to assume there are | ||
471 | * already extents in the list. | ||
472 | */ | ||
473 | if (nextents == 0) { | ||
474 | xfs_iext_insert(ip, *idx, 1, new, | ||
475 | whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); | ||
476 | |||
477 | ASSERT(cur == NULL); | ||
478 | |||
479 | if (!isnullstartblock(new->br_startblock)) { | ||
480 | XFS_IFORK_NEXT_SET(ip, whichfork, 1); | ||
481 | logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); | ||
482 | } else | ||
483 | logflags = 0; | ||
484 | } | ||
485 | /* | ||
486 | * Any kind of new delayed allocation goes here. | ||
487 | */ | ||
488 | else if (isnullstartblock(new->br_startblock)) { | ||
489 | if (cur) | ||
490 | ASSERT((cur->bc_private.b.flags & | ||
491 | XFS_BTCUR_BPRV_WASDEL) == 0); | ||
492 | error = xfs_bmap_add_extent_hole_delay(ip, idx, new, | ||
493 | &logflags); | ||
494 | } | ||
495 | /* | ||
496 | * Real allocation off the end of the file. | ||
497 | */ | ||
498 | else if (*idx == nextents) { | ||
499 | if (cur) | ||
500 | ASSERT((cur->bc_private.b.flags & | ||
501 | XFS_BTCUR_BPRV_WASDEL) == 0); | ||
502 | error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, | ||
503 | &logflags, whichfork); | ||
504 | } else { | ||
505 | xfs_bmbt_irec_t prev; /* old extent at offset idx */ | ||
506 | |||
507 | /* | ||
508 | * Get the record referred to by idx. | ||
509 | */ | ||
510 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev); | ||
511 | /* | ||
512 | * If it's a real allocation record, and the new allocation ends | ||
513 | * after the start of the referred to record, then we're filling | ||
514 | * in a delayed or unwritten allocation with a real one, or | ||
515 | * converting real back to unwritten. | ||
516 | */ | ||
517 | if (!isnullstartblock(new->br_startblock) && | ||
518 | new->br_startoff + new->br_blockcount > prev.br_startoff) { | ||
519 | if (prev.br_state != XFS_EXT_UNWRITTEN && | ||
520 | isnullstartblock(prev.br_startblock)) { | ||
521 | da_old = startblockval(prev.br_startblock); | ||
522 | if (cur) | ||
523 | ASSERT(cur->bc_private.b.flags & | ||
524 | XFS_BTCUR_BPRV_WASDEL); | ||
525 | error = xfs_bmap_add_extent_delay_real(tp, ip, | ||
526 | idx, &cur, new, &da_new, | ||
527 | first, flist, &logflags); | ||
528 | } else { | ||
529 | ASSERT(new->br_state == XFS_EXT_NORM || | ||
530 | new->br_state == XFS_EXT_UNWRITTEN); | ||
531 | |||
532 | error = xfs_bmap_add_extent_unwritten_real(ip, | ||
533 | idx, &cur, new, &logflags); | ||
534 | if (error) | ||
535 | goto done; | ||
536 | } | ||
537 | } | ||
538 | /* | ||
539 | * Otherwise we're filling in a hole with an allocation. | ||
540 | */ | ||
541 | else { | ||
542 | if (cur) | ||
543 | ASSERT((cur->bc_private.b.flags & | ||
544 | XFS_BTCUR_BPRV_WASDEL) == 0); | ||
545 | error = xfs_bmap_add_extent_hole_real(ip, idx, cur, | ||
546 | new, &logflags, whichfork); | ||
547 | } | ||
548 | } | ||
549 | |||
550 | if (error) | ||
551 | goto done; | ||
552 | ASSERT(*curp == cur || *curp == NULL); | ||
553 | |||
554 | /* | ||
555 | * Convert to a btree if necessary. | ||
556 | */ | ||
557 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | ||
558 | XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { | ||
559 | int tmp_logflags; /* partial log flag return val */ | ||
560 | |||
561 | ASSERT(cur == NULL); | ||
562 | error = xfs_bmap_extents_to_btree(tp, ip, first, | ||
563 | flist, &cur, da_old > 0, &tmp_logflags, whichfork); | ||
564 | logflags |= tmp_logflags; | ||
565 | if (error) | ||
566 | goto done; | ||
567 | } | ||
568 | /* | ||
569 | * Adjust for changes in reserved delayed indirect blocks. | ||
570 | * Nothing to do for disk quotas here. | ||
571 | */ | ||
572 | if (da_old || da_new) { | ||
573 | xfs_filblks_t nblks; | ||
574 | |||
575 | nblks = da_new; | ||
576 | if (cur) | ||
577 | nblks += cur->bc_private.b.allocated; | ||
578 | ASSERT(nblks <= da_old); | ||
579 | if (nblks < da_old) | ||
580 | xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, | ||
581 | (int64_t)(da_old - nblks), 0); | ||
582 | } | ||
583 | /* | ||
584 | * Clear out the allocated field, done with it now in any case. | ||
585 | */ | ||
586 | if (cur) { | ||
587 | cur->bc_private.b.allocated = 0; | ||
588 | *curp = cur; | ||
589 | } | ||
590 | done: | ||
591 | #ifdef DEBUG | ||
592 | if (!error) | ||
593 | xfs_bmap_check_leaf_extents(*curp, ip, whichfork); | ||
594 | #endif | ||
595 | *logflagsp = logflags; | ||
596 | return error; | ||
597 | } | ||
598 | |||
599 | /* | ||
600 | * Called by xfs_bmap_add_extent to handle cases converting a delayed | ||
601 | * allocation to a real allocation. | ||
602 | */ | 375 | */ |
603 | STATIC int /* error */ | 376 | STATIC int /* error */ |
604 | xfs_bmap_add_extent_delay_real( | 377 | xfs_bmap_add_extent_delay_real( |
605 | struct xfs_trans *tp, /* transaction pointer */ | 378 | struct xfs_bmalloca *bma) |
606 | xfs_inode_t *ip, /* incore inode pointer */ | ||
607 | xfs_extnum_t *idx, /* extent number to update/insert */ | ||
608 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | ||
609 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
610 | xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ | ||
611 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | ||
612 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
613 | int *logflagsp) /* inode logging flags */ | ||
614 | { | 379 | { |
615 | xfs_btree_cur_t *cur; /* btree cursor */ | 380 | struct xfs_bmbt_irec *new = &bma->got; |
616 | int diff; /* temp value */ | 381 | int diff; /* temp value */ |
617 | xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ | 382 | xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ |
618 | int error; /* error return value */ | 383 | int error; /* error return value */ |
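The eight parameters the deleted dispatcher threaded into every helper now travel as one struct xfs_bmalloca argument. A sketch limited to the fields these hunks actually touch, with comments taken from the old parameter list (the full definition in fs/xfs/xfs_bmap.h also carries the allocation request itself):

	struct xfs_bmalloca {
		xfs_fsblock_t		*firstblock;	/* pointer to firstblock variable */
		struct xfs_bmap_free	*flist;		/* list of extents to be freed */
		struct xfs_trans	*tp;		/* transaction pointer */
		struct xfs_inode	*ip;		/* incore inode pointer */
		struct xfs_bmbt_irec	got;		/* the new extent (new = &bma->got) */
		xfs_extnum_t		idx;		/* extent number to update/insert */
		struct xfs_btree_cur	*cur;		/* if NULL, not a btree */
		int			logflags;	/* inode logging flags */
		/* ... allocation request fields elided ... */
	};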
@@ -623,10 +388,22 @@ xfs_bmap_add_extent_delay_real( | |||
623 | /* left is 0, right is 1, prev is 2 */ | 388 | /* left is 0, right is 1, prev is 2 */ |
624 | int rval=0; /* return value (logging flags) */ | 389 | int rval=0; /* return value (logging flags) */ |
625 | int state = 0;/* state bits, accessed thru macros */ | 390 | int state = 0;/* state bits, accessed thru macros */ |
626 | xfs_filblks_t temp=0; /* value for dnew calculations */ | 391 | xfs_filblks_t da_new; /* new count del alloc blocks used */ |
627 | xfs_filblks_t temp2=0;/* value for dnew calculations */ | 392 | xfs_filblks_t da_old; /* old count del alloc blocks used */ |
393 | xfs_filblks_t temp=0; /* value for da_new calculations */ | ||
394 | xfs_filblks_t temp2=0;/* value for da_new calculations */ | ||
628 | int tmp_rval; /* partial logging flags */ | 395 | int tmp_rval; /* partial logging flags */ |
629 | 396 | ||
397 | ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); | ||
398 | |||
399 | ASSERT(bma->idx >= 0); | ||
400 | ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); | ||
401 | ASSERT(!isnullstartblock(new->br_startblock)); | ||
402 | ASSERT(!bma->cur || | ||
403 | (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); | ||
404 | |||
405 | XFS_STATS_INC(xs_add_exlist); | ||
406 | |||
630 | #define LEFT r[0] | 407 | #define LEFT r[0] |
631 | #define RIGHT r[1] | 408 | #define RIGHT r[1] |
632 | #define PREV r[2] | 409 | #define PREV r[2] |
@@ -634,14 +411,15 @@ xfs_bmap_add_extent_delay_real( | |||
634 | /* | 411 | /* |
635 | * Set up a bunch of variables to make the tests simpler. | 412 | * Set up a bunch of variables to make the tests simpler. |
636 | */ | 413 | */ |
637 | cur = *curp; | 414 | ep = xfs_iext_get_ext(ifp, bma->idx); |
638 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | ||
639 | ep = xfs_iext_get_ext(ifp, *idx); | ||
640 | xfs_bmbt_get_all(ep, &PREV); | 415 | xfs_bmbt_get_all(ep, &PREV); |
641 | new_endoff = new->br_startoff + new->br_blockcount; | 416 | new_endoff = new->br_startoff + new->br_blockcount; |
642 | ASSERT(PREV.br_startoff <= new->br_startoff); | 417 | ASSERT(PREV.br_startoff <= new->br_startoff); |
643 | ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); | 418 | ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); |
644 | 419 | ||
420 | da_old = startblockval(PREV.br_startblock); | ||
421 | da_new = 0; | ||
422 | |||
645 | /* | 423 | /* |
646 | * Set flags determining what part of the previous delayed allocation | 424 | * Set flags determining what part of the previous delayed allocation |
647 | * extent is being replaced by a real allocation. | 425 | * extent is being replaced by a real allocation. |
@@ -655,9 +433,9 @@ xfs_bmap_add_extent_delay_real( | |||
655 | * Check and set flags if this segment has a left neighbor. | 433 | * Check and set flags if this segment has a left neighbor. |
656 | * Don't set contiguous if the combined extent would be too large. | 434 | * Don't set contiguous if the combined extent would be too large. |
657 | */ | 435 | */ |
658 | if (*idx > 0) { | 436 | if (bma->idx > 0) { |
659 | state |= BMAP_LEFT_VALID; | 437 | state |= BMAP_LEFT_VALID; |
660 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); | 438 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); |
661 | 439 | ||
662 | if (isnullstartblock(LEFT.br_startblock)) | 440 | if (isnullstartblock(LEFT.br_startblock)) |
663 | state |= BMAP_LEFT_DELAY; | 441 | state |= BMAP_LEFT_DELAY; |
@@ -675,9 +453,9 @@ xfs_bmap_add_extent_delay_real( | |||
675 | * Don't set contiguous if the combined extent would be too large. | 453 | * Don't set contiguous if the combined extent would be too large. |
676 | * Also check for all-three-contiguous being too large. | 454 | * Also check for all-three-contiguous being too large. |
677 | */ | 455 | */ |
678 | if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { | 456 | if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { |
679 | state |= BMAP_RIGHT_VALID; | 457 | state |= BMAP_RIGHT_VALID; |
680 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); | 458 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); |
681 | 459 | ||
682 | if (isnullstartblock(RIGHT.br_startblock)) | 460 | if (isnullstartblock(RIGHT.br_startblock)) |
683 | state |= BMAP_RIGHT_DELAY; | 461 | state |= BMAP_RIGHT_DELAY; |
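Four independent bits drive the switch below, so sixteen states are possible on paper; the combinations that cannot arise fall through to the ASSERT(0) at the bottom. The derivation of the two FILLING bits sits in the elided context between these hunks; a sketch consistent with the comment above:

	if (PREV.br_startoff == new->br_startoff)
		state |= BMAP_LEFT_FILLING;	/* new starts where PREV starts */
	if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
		state |= BMAP_RIGHT_FILLING;	/* new ends where PREV ends */

The CONTIG bits set above then say whether the filled-in piece can merge with the real neighbors LEFT and RIGHT.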
@@ -708,38 +486,41 @@ xfs_bmap_add_extent_delay_real( | |||
708 | * Filling in all of a previously delayed allocation extent. | 486 | * Filling in all of a previously delayed allocation extent. |
709 | * The left and right neighbors are both contiguous with new. | 487 | * The left and right neighbors are both contiguous with new. |
710 | */ | 488 | */ |
711 | --*idx; | 489 | bma->idx--; |
712 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 490 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
713 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | 491 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), |
714 | LEFT.br_blockcount + PREV.br_blockcount + | 492 | LEFT.br_blockcount + PREV.br_blockcount + |
715 | RIGHT.br_blockcount); | 493 | RIGHT.br_blockcount); |
716 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 494 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
717 | 495 | ||
718 | xfs_iext_remove(ip, *idx + 1, 2, state); | 496 | xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); |
719 | ip->i_d.di_nextents--; | 497 | bma->ip->i_d.di_nextents--; |
720 | if (cur == NULL) | 498 | if (bma->cur == NULL) |
721 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 499 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
722 | else { | 500 | else { |
723 | rval = XFS_ILOG_CORE; | 501 | rval = XFS_ILOG_CORE; |
724 | if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, | 502 | error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, |
725 | RIGHT.br_startblock, | 503 | RIGHT.br_startblock, |
726 | RIGHT.br_blockcount, &i))) | 504 | RIGHT.br_blockcount, &i); |
505 | if (error) | ||
727 | goto done; | 506 | goto done; |
728 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 507 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
729 | if ((error = xfs_btree_delete(cur, &i))) | 508 | error = xfs_btree_delete(bma->cur, &i); |
509 | if (error) | ||
730 | goto done; | 510 | goto done; |
731 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 511 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
732 | if ((error = xfs_btree_decrement(cur, 0, &i))) | 512 | error = xfs_btree_decrement(bma->cur, 0, &i); |
513 | if (error) | ||
733 | goto done; | 514 | goto done; |
734 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 515 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
735 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 516 | error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, |
736 | LEFT.br_startblock, | 517 | LEFT.br_startblock, |
737 | LEFT.br_blockcount + | 518 | LEFT.br_blockcount + |
738 | PREV.br_blockcount + | 519 | PREV.br_blockcount + |
739 | RIGHT.br_blockcount, LEFT.br_state))) | 520 | RIGHT.br_blockcount, LEFT.br_state); |
521 | if (error) | ||
740 | goto done; | 522 | goto done; |
741 | } | 523 | } |
742 | *dnew = 0; | ||
743 | break; | 524 | break; |
744 | 525 | ||
745 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: | 526 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: |
@@ -747,30 +528,31 @@ xfs_bmap_add_extent_delay_real( | |||
747 | * Filling in all of a previously delayed allocation extent. | 528 | * Filling in all of a previously delayed allocation extent. |
748 | * The left neighbor is contiguous, the right is not. | 529 | * The left neighbor is contiguous, the right is not. |
749 | */ | 530 | */ |
750 | --*idx; | 531 | bma->idx--; |
751 | 532 | ||
752 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 533 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
753 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | 534 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), |
754 | LEFT.br_blockcount + PREV.br_blockcount); | 535 | LEFT.br_blockcount + PREV.br_blockcount); |
755 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 536 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
756 | 537 | ||
757 | xfs_iext_remove(ip, *idx + 1, 1, state); | 538 | xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); |
758 | if (cur == NULL) | 539 | if (bma->cur == NULL) |
759 | rval = XFS_ILOG_DEXT; | 540 | rval = XFS_ILOG_DEXT; |
760 | else { | 541 | else { |
761 | rval = 0; | 542 | rval = 0; |
762 | if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, | 543 | error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, |
763 | LEFT.br_startblock, LEFT.br_blockcount, | 544 | LEFT.br_startblock, LEFT.br_blockcount, |
764 | &i))) | 545 | &i); |
546 | if (error) | ||
765 | goto done; | 547 | goto done; |
766 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 548 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
767 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 549 | error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, |
768 | LEFT.br_startblock, | 550 | LEFT.br_startblock, |
769 | LEFT.br_blockcount + | 551 | LEFT.br_blockcount + |
770 | PREV.br_blockcount, LEFT.br_state))) | 552 | PREV.br_blockcount, LEFT.br_state); |
553 | if (error) | ||
771 | goto done; | 554 | goto done; |
772 | } | 555 | } |
773 | *dnew = 0; | ||
774 | break; | 556 | break; |
775 | 557 | ||
776 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: | 558 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: |
@@ -778,30 +560,30 @@ xfs_bmap_add_extent_delay_real( | |||
778 | * Filling in all of a previously delayed allocation extent. | 560 | * Filling in all of a previously delayed allocation extent. |
779 | * The right neighbor is contiguous, the left is not. | 561 | * The right neighbor is contiguous, the left is not. |
780 | */ | 562 | */ |
781 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 563 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
782 | xfs_bmbt_set_startblock(ep, new->br_startblock); | 564 | xfs_bmbt_set_startblock(ep, new->br_startblock); |
783 | xfs_bmbt_set_blockcount(ep, | 565 | xfs_bmbt_set_blockcount(ep, |
784 | PREV.br_blockcount + RIGHT.br_blockcount); | 566 | PREV.br_blockcount + RIGHT.br_blockcount); |
785 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 567 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
786 | 568 | ||
787 | xfs_iext_remove(ip, *idx + 1, 1, state); | 569 | xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); |
788 | if (cur == NULL) | 570 | if (bma->cur == NULL) |
789 | rval = XFS_ILOG_DEXT; | 571 | rval = XFS_ILOG_DEXT; |
790 | else { | 572 | else { |
791 | rval = 0; | 573 | rval = 0; |
792 | if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, | 574 | error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, |
793 | RIGHT.br_startblock, | 575 | RIGHT.br_startblock, |
794 | RIGHT.br_blockcount, &i))) | 576 | RIGHT.br_blockcount, &i); |
577 | if (error) | ||
795 | goto done; | 578 | goto done; |
796 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 579 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
797 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, | 580 | error = xfs_bmbt_update(bma->cur, PREV.br_startoff, |
798 | new->br_startblock, | 581 | new->br_startblock, |
799 | PREV.br_blockcount + | 582 | PREV.br_blockcount + |
800 | RIGHT.br_blockcount, PREV.br_state))) | 583 | RIGHT.br_blockcount, PREV.br_state); |
584 | if (error) | ||
801 | goto done; | 585 | goto done; |
802 | } | 586 | } |
803 | |||
804 | *dnew = 0; | ||
805 | break; | 587 | break; |
806 | 588 | ||
807 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: | 589 | case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: |
@@ -810,27 +592,27 @@ xfs_bmap_add_extent_delay_real( | |||
810 | * Neither the left nor right neighbors are contiguous with | 592 | * Neither the left nor right neighbors are contiguous with |
811 | * the new one. | 593 | * the new one. |
812 | */ | 594 | */ |
813 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 595 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
814 | xfs_bmbt_set_startblock(ep, new->br_startblock); | 596 | xfs_bmbt_set_startblock(ep, new->br_startblock); |
815 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 597 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
816 | 598 | ||
817 | ip->i_d.di_nextents++; | 599 | bma->ip->i_d.di_nextents++; |
818 | if (cur == NULL) | 600 | if (bma->cur == NULL) |
819 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 601 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
820 | else { | 602 | else { |
821 | rval = XFS_ILOG_CORE; | 603 | rval = XFS_ILOG_CORE; |
822 | if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, | 604 | error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, |
823 | new->br_startblock, new->br_blockcount, | 605 | new->br_startblock, new->br_blockcount, |
824 | &i))) | 606 | &i); |
607 | if (error) | ||
825 | goto done; | 608 | goto done; |
826 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); | 609 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
827 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 610 | bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; |
828 | if ((error = xfs_btree_insert(cur, &i))) | 611 | error = xfs_btree_insert(bma->cur, &i); |
612 | if (error) | ||
829 | goto done; | 613 | goto done; |
830 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 614 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
831 | } | 615 | } |
832 | |||
833 | *dnew = 0; | ||
834 | break; | 616 | break; |
835 | 617 | ||
836 | case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: | 618 | case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: |
@@ -838,39 +620,40 @@ xfs_bmap_add_extent_delay_real( | |||
838 | * Filling in the first part of a previous delayed allocation. | 620 | * Filling in the first part of a previous delayed allocation. |
839 | * The left neighbor is contiguous. | 621 | * The left neighbor is contiguous. |
840 | */ | 622 | */ |
841 | trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); | 623 | trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); |
842 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), | 624 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), |
843 | LEFT.br_blockcount + new->br_blockcount); | 625 | LEFT.br_blockcount + new->br_blockcount); |
844 | xfs_bmbt_set_startoff(ep, | 626 | xfs_bmbt_set_startoff(ep, |
845 | PREV.br_startoff + new->br_blockcount); | 627 | PREV.br_startoff + new->br_blockcount); |
846 | trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); | 628 | trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); |
847 | 629 | ||
848 | temp = PREV.br_blockcount - new->br_blockcount; | 630 | temp = PREV.br_blockcount - new->br_blockcount; |
849 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 631 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
850 | xfs_bmbt_set_blockcount(ep, temp); | 632 | xfs_bmbt_set_blockcount(ep, temp); |
851 | if (cur == NULL) | 633 | if (bma->cur == NULL) |
852 | rval = XFS_ILOG_DEXT; | 634 | rval = XFS_ILOG_DEXT; |
853 | else { | 635 | else { |
854 | rval = 0; | 636 | rval = 0; |
855 | if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, | 637 | error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, |
856 | LEFT.br_startblock, LEFT.br_blockcount, | 638 | LEFT.br_startblock, LEFT.br_blockcount, |
857 | &i))) | 639 | &i); |
640 | if (error) | ||
858 | goto done; | 641 | goto done; |
859 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 642 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
860 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 643 | error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, |
861 | LEFT.br_startblock, | 644 | LEFT.br_startblock, |
862 | LEFT.br_blockcount + | 645 | LEFT.br_blockcount + |
863 | new->br_blockcount, | 646 | new->br_blockcount, |
864 | LEFT.br_state))) | 647 | LEFT.br_state); |
648 | if (error) | ||
865 | goto done; | 649 | goto done; |
866 | } | 650 | } |
867 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 651 | da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), |
868 | startblockval(PREV.br_startblock)); | 652 | startblockval(PREV.br_startblock)); |
869 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 653 | xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); |
870 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 654 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
871 | 655 | ||
872 | --*idx; | 656 | bma->idx--; |
873 | *dnew = temp; | ||
874 | break; | 657 | break; |
875 | 658 | ||
876 | case BMAP_LEFT_FILLING: | 659 | case BMAP_LEFT_FILLING: |
@@ -878,43 +661,43 @@ xfs_bmap_add_extent_delay_real( | |||
878 | * Filling in the first part of a previous delayed allocation. | 661 | * Filling in the first part of a previous delayed allocation. |
879 | * The left neighbor is not contiguous. | 662 | * The left neighbor is not contiguous. |
880 | */ | 663 | */ |
881 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 664 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
882 | xfs_bmbt_set_startoff(ep, new_endoff); | 665 | xfs_bmbt_set_startoff(ep, new_endoff); |
883 | temp = PREV.br_blockcount - new->br_blockcount; | 666 | temp = PREV.br_blockcount - new->br_blockcount; |
884 | xfs_bmbt_set_blockcount(ep, temp); | 667 | xfs_bmbt_set_blockcount(ep, temp); |
885 | xfs_iext_insert(ip, *idx, 1, new, state); | 668 | xfs_iext_insert(bma->ip, bma->idx, 1, new, state); |
886 | ip->i_d.di_nextents++; | 669 | bma->ip->i_d.di_nextents++; |
887 | if (cur == NULL) | 670 | if (bma->cur == NULL) |
888 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 671 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
889 | else { | 672 | else { |
890 | rval = XFS_ILOG_CORE; | 673 | rval = XFS_ILOG_CORE; |
891 | if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, | 674 | error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, |
892 | new->br_startblock, new->br_blockcount, | 675 | new->br_startblock, new->br_blockcount, |
893 | &i))) | 676 | &i); |
677 | if (error) | ||
894 | goto done; | 678 | goto done; |
895 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); | 679 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
896 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 680 | bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; |
897 | if ((error = xfs_btree_insert(cur, &i))) | 681 | error = xfs_btree_insert(bma->cur, &i); |
682 | if (error) | ||
898 | goto done; | 683 | goto done; |
899 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 684 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
900 | } | 685 | } |
901 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 686 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
902 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 687 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { |
903 | error = xfs_bmap_extents_to_btree(tp, ip, | 688 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
904 | first, flist, &cur, 1, &tmp_rval, | 689 | bma->firstblock, bma->flist, |
905 | XFS_DATA_FORK); | 690 | &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); |
906 | rval |= tmp_rval; | 691 | rval |= tmp_rval; |
907 | if (error) | 692 | if (error) |
908 | goto done; | 693 | goto done; |
909 | } | 694 | } |
910 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 695 | da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), |
911 | startblockval(PREV.br_startblock) - | 696 | startblockval(PREV.br_startblock) - |
912 | (cur ? cur->bc_private.b.allocated : 0)); | 697 | (bma->cur ? bma->cur->bc_private.b.allocated : 0)); |
913 | ep = xfs_iext_get_ext(ifp, *idx + 1); | 698 | ep = xfs_iext_get_ext(ifp, bma->idx + 1); |
914 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 699 | xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); |
915 | trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); | 700 | trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); |
916 | |||
917 | *dnew = temp; | ||
918 | break; | 701 | break; |
919 | 702 | ||
920 | case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: | 703 | case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: |
@@ -923,38 +706,39 @@ xfs_bmap_add_extent_delay_real( | |||
923 | * The right neighbor is contiguous with the new allocation. | 706 | * The right neighbor is contiguous with the new allocation. |
924 | */ | 707 | */ |
925 | temp = PREV.br_blockcount - new->br_blockcount; | 708 | temp = PREV.br_blockcount - new->br_blockcount; |
926 | trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_); | 709 | trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); |
927 | xfs_bmbt_set_blockcount(ep, temp); | 710 | xfs_bmbt_set_blockcount(ep, temp); |
928 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1), | 711 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), |
929 | new->br_startoff, new->br_startblock, | 712 | new->br_startoff, new->br_startblock, |
930 | new->br_blockcount + RIGHT.br_blockcount, | 713 | new->br_blockcount + RIGHT.br_blockcount, |
931 | RIGHT.br_state); | 714 | RIGHT.br_state); |
932 | trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_); | 715 | trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); |
933 | if (cur == NULL) | 716 | if (bma->cur == NULL) |
934 | rval = XFS_ILOG_DEXT; | 717 | rval = XFS_ILOG_DEXT; |
935 | else { | 718 | else { |
936 | rval = 0; | 719 | rval = 0; |
937 | if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, | 720 | error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, |
938 | RIGHT.br_startblock, | 721 | RIGHT.br_startblock, |
939 | RIGHT.br_blockcount, &i))) | 722 | RIGHT.br_blockcount, &i); |
723 | if (error) | ||
940 | goto done; | 724 | goto done; |
941 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 725 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
942 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 726 | error = xfs_bmbt_update(bma->cur, new->br_startoff, |
943 | new->br_startblock, | 727 | new->br_startblock, |
944 | new->br_blockcount + | 728 | new->br_blockcount + |
945 | RIGHT.br_blockcount, | 729 | RIGHT.br_blockcount, |
946 | RIGHT.br_state))) | 730 | RIGHT.br_state); |
731 | if (error) | ||
947 | goto done; | 732 | goto done; |
948 | } | 733 | } |
949 | 734 | ||
950 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 735 | da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), |
951 | startblockval(PREV.br_startblock)); | 736 | startblockval(PREV.br_startblock)); |
952 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 737 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
953 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 738 | xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); |
954 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 739 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
955 | 740 | ||
956 | ++*idx; | 741 | bma->idx++; |
957 | *dnew = temp; | ||
958 | break; | 742 | break; |
959 | 743 | ||
960 | case BMAP_RIGHT_FILLING: | 744 | case BMAP_RIGHT_FILLING: |
@@ -963,42 +747,43 @@ xfs_bmap_add_extent_delay_real( | |||
963 | * The right neighbor is not contiguous. | 747 | * The right neighbor is not contiguous. |
964 | */ | 748 | */ |
965 | temp = PREV.br_blockcount - new->br_blockcount; | 749 | temp = PREV.br_blockcount - new->br_blockcount; |
966 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 750 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
967 | xfs_bmbt_set_blockcount(ep, temp); | 751 | xfs_bmbt_set_blockcount(ep, temp); |
968 | xfs_iext_insert(ip, *idx + 1, 1, new, state); | 752 | xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); |
969 | ip->i_d.di_nextents++; | 753 | bma->ip->i_d.di_nextents++; |
970 | if (cur == NULL) | 754 | if (bma->cur == NULL) |
971 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 755 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
972 | else { | 756 | else { |
973 | rval = XFS_ILOG_CORE; | 757 | rval = XFS_ILOG_CORE; |
974 | if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, | 758 | error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, |
975 | new->br_startblock, new->br_blockcount, | 759 | new->br_startblock, new->br_blockcount, |
976 | &i))) | 760 | &i); |
761 | if (error) | ||
977 | goto done; | 762 | goto done; |
978 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); | 763 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
979 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 764 | bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; |
980 | if ((error = xfs_btree_insert(cur, &i))) | 765 | error = xfs_btree_insert(bma->cur, &i); |
766 | if (error) | ||
981 | goto done; | 767 | goto done; |
982 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 768 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
983 | } | 769 | } |
984 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 770 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
985 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 771 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { |
986 | error = xfs_bmap_extents_to_btree(tp, ip, | 772 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
987 | first, flist, &cur, 1, &tmp_rval, | 773 | bma->firstblock, bma->flist, &bma->cur, 1, |
988 | XFS_DATA_FORK); | 774 | &tmp_rval, XFS_DATA_FORK); |
989 | rval |= tmp_rval; | 775 | rval |= tmp_rval; |
990 | if (error) | 776 | if (error) |
991 | goto done; | 777 | goto done; |
992 | } | 778 | } |
993 | temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), | 779 | da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), |
994 | startblockval(PREV.br_startblock) - | 780 | startblockval(PREV.br_startblock) - |
995 | (cur ? cur->bc_private.b.allocated : 0)); | 781 | (bma->cur ? bma->cur->bc_private.b.allocated : 0)); |
996 | ep = xfs_iext_get_ext(ifp, *idx); | 782 | ep = xfs_iext_get_ext(ifp, bma->idx); |
997 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 783 | xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); |
998 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 784 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
999 | 785 | ||
1000 | ++*idx; | 786 | bma->idx++; |
1001 | *dnew = temp; | ||
1002 | break; | 787 | break; |
1003 | 788 | ||
1004 | case 0: | 789 | case 0: |
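The "case 0" geometry handled in the next hunk, with hypothetical offsets:

	/*
	 * None of the four state bits is set: the new real extent lands
	 * strictly inside the delayed extent, touching neither end, so
	 * PREV splits into three pieces.
	 *
	 *	PREV (delayed):	[0 ........................... 100)
	 *	new  (real):	            [40 ..... 60)
	 *	result:		[0..40) delayed, [40..60) real, [60..100) delayed
	 *
	 * temp and temp2 below are the two remainder lengths (40 each
	 * here); both remainders need fresh worst-case reservations, and
	 * the diff > 0 path charges the free-space counter when together
	 * they exceed what PREV had reserved.
	 */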
@@ -1024,82 +809,65 @@ xfs_bmap_add_extent_delay_real( | |||
1024 | */ | 809 | */ |
1025 | temp = new->br_startoff - PREV.br_startoff; | 810 | temp = new->br_startoff - PREV.br_startoff; |
1026 | temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; | 811 | temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; |
1027 | trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_); | 812 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); |
1028 | xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ | 813 | xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ |
1029 | LEFT = *new; | 814 | LEFT = *new; |
1030 | RIGHT.br_state = PREV.br_state; | 815 | RIGHT.br_state = PREV.br_state; |
1031 | RIGHT.br_startblock = nullstartblock( | 816 | RIGHT.br_startblock = nullstartblock( |
1032 | (int)xfs_bmap_worst_indlen(ip, temp2)); | 817 | (int)xfs_bmap_worst_indlen(bma->ip, temp2)); |
1033 | RIGHT.br_startoff = new_endoff; | 818 | RIGHT.br_startoff = new_endoff; |
1034 | RIGHT.br_blockcount = temp2; | 819 | RIGHT.br_blockcount = temp2; |
1035 | /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ | 820 | /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ |
1036 | xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state); | 821 | xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); |
1037 | ip->i_d.di_nextents++; | 822 | bma->ip->i_d.di_nextents++; |
1038 | if (cur == NULL) | 823 | if (bma->cur == NULL) |
1039 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; | 824 | rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; |
1040 | else { | 825 | else { |
1041 | rval = XFS_ILOG_CORE; | 826 | rval = XFS_ILOG_CORE; |
1042 | if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, | 827 | error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, |
1043 | new->br_startblock, new->br_blockcount, | 828 | new->br_startblock, new->br_blockcount, |
1044 | &i))) | 829 | &i); |
830 | if (error) | ||
1045 | goto done; | 831 | goto done; |
1046 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); | 832 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
1047 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 833 | bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; |
1048 | if ((error = xfs_btree_insert(cur, &i))) | 834 | error = xfs_btree_insert(bma->cur, &i); |
835 | if (error) | ||
1049 | goto done; | 836 | goto done; |
1050 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 837 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1051 | } | 838 | } |
1052 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 839 | if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
1053 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 840 | bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { |
1054 | error = xfs_bmap_extents_to_btree(tp, ip, | 841 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, |
1055 | first, flist, &cur, 1, &tmp_rval, | 842 | bma->firstblock, bma->flist, &bma->cur, |
1056 | XFS_DATA_FORK); | 843 | 1, &tmp_rval, XFS_DATA_FORK); |
1057 | rval |= tmp_rval; | 844 | rval |= tmp_rval; |
1058 | if (error) | 845 | if (error) |
1059 | goto done; | 846 | goto done; |
1060 | } | 847 | } |
1061 | temp = xfs_bmap_worst_indlen(ip, temp); | 848 | temp = xfs_bmap_worst_indlen(bma->ip, temp); |
1062 | temp2 = xfs_bmap_worst_indlen(ip, temp2); | 849 | temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); |
1063 | diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - | 850 | diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - |
1064 | (cur ? cur->bc_private.b.allocated : 0)); | 851 | (bma->cur ? bma->cur->bc_private.b.allocated : 0)); |
1065 | if (diff > 0 && | 852 | if (diff > 0) { |
1066 | xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, | 853 | error = xfs_icsb_modify_counters(bma->ip->i_mount, |
1067 | -((int64_t)diff), 0)) { | 854 | XFS_SBS_FDBLOCKS, |
1068 | /* | 855 | -((int64_t)diff), 0); |
1069 | * Ick gross gag me with a spoon. | 856 | ASSERT(!error); |
1070 | */ | 857 | if (error) |
1071 | ASSERT(0); /* want to see if this ever happens! */ | 858 | goto done; |
1072 | while (diff > 0) { | ||
1073 | if (temp) { | ||
1074 | temp--; | ||
1075 | diff--; | ||
1076 | if (!diff || | ||
1077 | !xfs_icsb_modify_counters(ip->i_mount, | ||
1078 | XFS_SBS_FDBLOCKS, | ||
1079 | -((int64_t)diff), 0)) | ||
1080 | break; | ||
1081 | } | ||
1082 | if (temp2) { | ||
1083 | temp2--; | ||
1084 | diff--; | ||
1085 | if (!diff || | ||
1086 | !xfs_icsb_modify_counters(ip->i_mount, | ||
1087 | XFS_SBS_FDBLOCKS, | ||
1088 | -((int64_t)diff), 0)) | ||
1089 | break; | ||
1090 | } | ||
1091 | } | ||
1092 | } | 859 | } |
1093 | ep = xfs_iext_get_ext(ifp, *idx); | 860 | |
861 | ep = xfs_iext_get_ext(ifp, bma->idx); | ||
1094 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); | 862 | xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); |
1095 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 863 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
1096 | trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_); | 864 | trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); |
1097 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2), | 865 | xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), |
1098 | nullstartblock((int)temp2)); | 866 | nullstartblock((int)temp2)); |
1099 | trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_); | 867 | trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); |
1100 | 868 | ||
1101 | ++*idx; | 869 | bma->idx++; |
1102 | *dnew = temp + temp2; | 870 | da_new = temp + temp2; |
1103 | break; | 871 | break; |
1104 | 872 | ||
1105 | case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: | 873 | case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: |
@@ -1114,9 +882,40 @@ xfs_bmap_add_extent_delay_real( | |||
1114 | */ | 882 | */ |
1115 | ASSERT(0); | 883 | ASSERT(0); |
1116 | } | 884 | } |
1117 | *curp = cur; | 885 | |
886 | /* convert to a btree if necessary */ | ||
887 | if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && | ||
888 | XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) { | ||
889 | int tmp_logflags; /* partial log flag return val */ | ||
890 | |||
891 | ASSERT(bma->cur == NULL); | ||
892 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, | ||
893 | bma->firstblock, bma->flist, &bma->cur, | ||
894 | da_old > 0, &tmp_logflags, XFS_DATA_FORK); | ||
895 | bma->logflags |= tmp_logflags; | ||
896 | if (error) | ||
897 | goto done; | ||
898 | } | ||
899 | |||
900 | /* adjust for changes in reserved delayed indirect blocks */ | ||
901 | if (da_old || da_new) { | ||
902 | temp = da_new; | ||
903 | if (bma->cur) | ||
904 | temp += bma->cur->bc_private.b.allocated; | ||
905 | ASSERT(temp <= da_old); | ||
906 | if (temp < da_old) | ||
907 | xfs_icsb_modify_counters(bma->ip->i_mount, | ||
908 | XFS_SBS_FDBLOCKS, | ||
909 | (int64_t)(da_old - temp), 0); | ||
910 | } | ||
911 | |||
912 | /* clear out the allocated field, done with it now in any case. */ | ||
913 | if (bma->cur) | ||
914 | bma->cur->bc_private.b.allocated = 0; | ||
915 | |||
916 | xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); | ||
1118 | done: | 917 | done: |
1119 | *logflagsp = rval; | 918 | bma->logflags |= rval; |
1120 | return error; | 919 | return error; |
1121 | #undef LEFT | 920 | #undef LEFT |
1122 | #undef RIGHT | 921 | #undef RIGHT |
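The reservation settlement that used to run once in the dispatcher now closes out this helper. With hypothetical numbers: PREV had reserved da_old = 4 indirect blocks, the surviving delayed pieces need da_new = 2, and the btree conversion consumed 1 (bc_private.b.allocated), so

	temp = 2 + 1;				/* blocks still spoken for */
	ASSERT(temp <= da_old);			/* never hand back more than reserved */
	xfs_icsb_modify_counters(bma->ip->i_mount, XFS_SBS_FDBLOCKS,
				 (int64_t)(4 - 3), 0);	/* return the spare block */

returns exactly one block to the global free-space counter. Quota counters need no adjustment here, as the old dispatcher's comment also noted.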
@@ -1124,15 +923,17 @@ done: | |||
1124 | } | 923 | } |
1125 | 924 | ||
1126 | /* | 925 | /* |
1127 | * Called by xfs_bmap_add_extent to handle cases converting an unwritten | 926 | * Convert an unwritten allocation to a real allocation or vice versa. |
1128 | * allocation to a real allocation or vice versa. | ||
1129 | */ | 927 | */ |
1130 | STATIC int /* error */ | 928 | STATIC int /* error */ |
1131 | xfs_bmap_add_extent_unwritten_real( | 929 | xfs_bmap_add_extent_unwritten_real( |
930 | struct xfs_trans *tp, | ||
1132 | xfs_inode_t *ip, /* incore inode pointer */ | 931 | xfs_inode_t *ip, /* incore inode pointer */ |
1133 | xfs_extnum_t *idx, /* extent number to update/insert */ | 932 | xfs_extnum_t *idx, /* extent number to update/insert */ |
1134 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ | 933 | xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ |
1135 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 934 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ |
935 | xfs_fsblock_t *first, /* pointer to firstblock variable */ | ||
936 | xfs_bmap_free_t *flist, /* list of extents to be freed */ | ||
1136 | int *logflagsp) /* inode logging flags */ | 937 | int *logflagsp) /* inode logging flags */ |
1137 | { | 938 | { |
1138 | xfs_btree_cur_t *cur; /* btree cursor */ | 939 | xfs_btree_cur_t *cur; /* btree cursor */ |
@@ -1148,15 +949,25 @@ xfs_bmap_add_extent_unwritten_real( | |||
1148 | int rval=0; /* return value (logging flags) */ | 949 | int rval=0; /* return value (logging flags) */ |
1149 | int state = 0;/* state bits, accessed thru macros */ | 950 | int state = 0;/* state bits, accessed thru macros */ |
1150 | 951 | ||
952 | *logflagsp = 0; | ||
953 | |||
954 | cur = *curp; | ||
955 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | ||
956 | |||
957 | ASSERT(*idx >= 0); | ||
958 | ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); | ||
959 | ASSERT(!isnullstartblock(new->br_startblock)); | ||
960 | |||
961 | XFS_STATS_INC(xs_add_exlist); | ||
962 | |||
1151 | #define LEFT r[0] | 963 | #define LEFT r[0] |
1152 | #define RIGHT r[1] | 964 | #define RIGHT r[1] |
1153 | #define PREV r[2] | 965 | #define PREV r[2] |
966 | |||
1154 | /* | 967 | /* |
1155 | * Set up a bunch of variables to make the tests simpler. | 968 | * Set up a bunch of variables to make the tests simpler. |
1156 | */ | 969 | */ |
1157 | error = 0; | 970 | error = 0; |
1158 | cur = *curp; | ||
1159 | ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | ||
1160 | ep = xfs_iext_get_ext(ifp, *idx); | 971 | ep = xfs_iext_get_ext(ifp, *idx); |
1161 | xfs_bmbt_get_all(ep, &PREV); | 972 | xfs_bmbt_get_all(ep, &PREV); |
1162 | newext = new->br_state; | 973 | newext = new->br_state; |
@@ -1406,10 +1217,11 @@ xfs_bmap_add_extent_unwritten_real( | |||
1406 | goto done; | 1217 | goto done; |
1407 | if ((error = xfs_btree_decrement(cur, 0, &i))) | 1218 | if ((error = xfs_btree_decrement(cur, 0, &i))) |
1408 | goto done; | 1219 | goto done; |
1409 | if (xfs_bmbt_update(cur, LEFT.br_startoff, | 1220 | error = xfs_bmbt_update(cur, LEFT.br_startoff, |
1410 | LEFT.br_startblock, | 1221 | LEFT.br_startblock, |
1411 | LEFT.br_blockcount + new->br_blockcount, | 1222 | LEFT.br_blockcount + new->br_blockcount, |
1412 | LEFT.br_state)) | 1223 | LEFT.br_state); |
1224 | if (error) | ||
1413 | goto done; | 1225 | goto done; |
1414 | } | 1226 | } |
1415 | break; | 1227 | break; |
@@ -1607,9 +1419,29 @@ xfs_bmap_add_extent_unwritten_real( | |||
1607 | */ | 1419 | */ |
1608 | ASSERT(0); | 1420 | ASSERT(0); |
1609 | } | 1421 | } |
1610 | *curp = cur; | 1422 | |
1423 | /* convert to a btree if necessary */ | ||
1424 | if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && | ||
1425 | XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) { | ||
1426 | int tmp_logflags; /* partial log flag return val */ | ||
1427 | |||
1428 | ASSERT(cur == NULL); | ||
1429 | error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, | ||
1430 | 0, &tmp_logflags, XFS_DATA_FORK); | ||
1431 | *logflagsp |= tmp_logflags; | ||
1432 | if (error) | ||
1433 | goto done; | ||
1434 | } | ||
1435 | |||
1436 | /* clear out the allocated field, done with it now in any case. */ | ||
1437 | if (cur) { | ||
1438 | cur->bc_private.b.allocated = 0; | ||
1439 | *curp = cur; | ||
1440 | } | ||
1441 | |||
1442 | xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); | ||
1611 | done: | 1443 | done: |
1612 | *logflagsp = rval; | 1444 | *logflagsp |= rval; |
1613 | return error; | 1445 | return error; |
1614 | #undef LEFT | 1446 | #undef LEFT |
1615 | #undef RIGHT | 1447 | #undef RIGHT |
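One subtlety of the reworked tail: *logflagsp is now cleared on entry and only ever OR-ed afterwards, because two producers can contribute bits before the function returns. In outline:

	*logflagsp = 0;			/* on entry */
	...
	*logflagsp |= tmp_logflags;	/* from xfs_bmap_extents_to_btree() */
	...
	done:
	*logflagsp |= rval;		/* case-specific XFS_ILOG_* bits */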
@@ -1617,16 +1449,13 @@ done: | |||
1617 | } | 1449 | } |
1618 | 1450 | ||
1619 | /* | 1451 | /* |
1620 | * Called by xfs_bmap_add_extent to handle cases converting a hole | 1452 | * Convert a hole to a delayed allocation. |
1621 | * to a delayed allocation. | ||
1622 | */ | 1453 | */ |
1623 | /*ARGSUSED*/ | 1454 | STATIC void |
1624 | STATIC int /* error */ | ||
1625 | xfs_bmap_add_extent_hole_delay( | 1455 | xfs_bmap_add_extent_hole_delay( |
1626 | xfs_inode_t *ip, /* incore inode pointer */ | 1456 | xfs_inode_t *ip, /* incore inode pointer */ |
1627 | xfs_extnum_t *idx, /* extent number to update/insert */ | 1457 | xfs_extnum_t *idx, /* extent number to update/insert */ |
1628 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | 1458 | xfs_bmbt_irec_t *new) /* new data to add to file extents */ |
1629 | int *logflagsp) /* inode logging flags */ | ||
1630 | { | 1459 | { |
1631 | xfs_ifork_t *ifp; /* inode fork pointer */ | 1460 | xfs_ifork_t *ifp; /* inode fork pointer */ |
1632 | xfs_bmbt_irec_t left; /* left neighbor extent entry */ | 1461 | xfs_bmbt_irec_t left; /* left neighbor extent entry */ |
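Dropping the return value and logflagsp is safe because converting a hole to a delayed allocation only edits the in-core extent list and in-memory counters: no transaction, no btree, nothing to log, and nothing that can fail. A hypothetical caller before and after:

	/* before: an error and logging flags were collected, both always zero */
	error = xfs_bmap_add_extent_hole_delay(ip, &idx, new, &logflags);

	/* after */
	xfs_bmap_add_extent_hole_delay(ip, &idx, new);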
@@ -1761,23 +1590,17 @@ xfs_bmap_add_extent_hole_delay( | |||
1761 | * Nothing to do for disk quota accounting here. | 1590 | * Nothing to do for disk quota accounting here. |
1762 | */ | 1591 | */ |
1763 | } | 1592 | } |
1764 | *logflagsp = 0; | ||
1765 | return 0; | ||
1766 | } | 1593 | } |
1767 | 1594 | ||
1768 | /* | 1595 | /* |
1769 | * Called by xfs_bmap_add_extent to handle cases converting a hole | 1596 | * Convert a hole to a real allocation. |
1770 | * to a real allocation. | ||
1771 | */ | 1597 | */ |
1772 | STATIC int /* error */ | 1598 | STATIC int /* error */ |
1773 | xfs_bmap_add_extent_hole_real( | 1599 | xfs_bmap_add_extent_hole_real( |
1774 | xfs_inode_t *ip, /* incore inode pointer */ | 1600 | struct xfs_bmalloca *bma, |
1775 | xfs_extnum_t *idx, /* extent number to update/insert */ | 1601 | int whichfork) |
1776 | xfs_btree_cur_t *cur, /* if null, not a btree */ | ||
1777 | xfs_bmbt_irec_t *new, /* new data to add to file extents */ | ||
1778 | int *logflagsp, /* inode logging flags */ | ||
1779 | int whichfork) /* data or attr fork */ | ||
1780 | { | 1602 | { |
1603 | struct xfs_bmbt_irec *new = &bma->got; | ||
1781 | int error; /* error return value */ | 1604 | int error; /* error return value */ |
1782 | int i; /* temp state */ | 1605 | int i; /* temp state */ |
1783 | xfs_ifork_t *ifp; /* inode fork pointer */ | 1606 | xfs_ifork_t *ifp; /* inode fork pointer */ |
@@ -1786,19 +1609,26 @@ xfs_bmap_add_extent_hole_real( | |||
1786 | int rval=0; /* return value (logging flags) */ | 1609 | int rval=0; /* return value (logging flags) */ |
1787 | int state; /* state bits, accessed thru macros */ | 1610 | int state; /* state bits, accessed thru macros */ |
1788 | 1611 | ||
1789 | ifp = XFS_IFORK_PTR(ip, whichfork); | 1612 | ifp = XFS_IFORK_PTR(bma->ip, whichfork); |
1790 | ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); | 1613 | |
1791 | state = 0; | 1614 | ASSERT(bma->idx >= 0); |
1615 | ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); | ||
1616 | ASSERT(!isnullstartblock(new->br_startblock)); | ||
1617 | ASSERT(!bma->cur || | ||
1618 | !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); | ||
1619 | |||
1620 | XFS_STATS_INC(xs_add_exlist); | ||
1792 | 1621 | ||
1622 | state = 0; | ||
1793 | if (whichfork == XFS_ATTR_FORK) | 1623 | if (whichfork == XFS_ATTR_FORK) |
1794 | state |= BMAP_ATTRFORK; | 1624 | state |= BMAP_ATTRFORK; |
1795 | 1625 | ||
1796 | /* | 1626 | /* |
1797 | * Check and set flags if this segment has a left neighbor. | 1627 | * Check and set flags if this segment has a left neighbor. |
1798 | */ | 1628 | */ |
1799 | if (*idx > 0) { | 1629 | if (bma->idx > 0) { |
1800 | state |= BMAP_LEFT_VALID; | 1630 | state |= BMAP_LEFT_VALID; |
1801 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); | 1631 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); |
1802 | if (isnullstartblock(left.br_startblock)) | 1632 | if (isnullstartblock(left.br_startblock)) |
1803 | state |= BMAP_LEFT_DELAY; | 1633 | state |= BMAP_LEFT_DELAY; |
1804 | } | 1634 | } |
@@ -1807,9 +1637,9 @@ xfs_bmap_add_extent_hole_real( | |||
1807 | * Check and set flags if this segment has a current value. | 1637 | * Check and set flags if this segment has a current value. |
1808 | * Not true if we're inserting into the "hole" at eof. | 1638 | * Not true if we're inserting into the "hole" at eof. |
1809 | */ | 1639 | */ |
1810 | if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { | 1640 | if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { |
1811 | state |= BMAP_RIGHT_VALID; | 1641 | state |= BMAP_RIGHT_VALID; |
1812 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); | 1642 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); |
1813 | if (isnullstartblock(right.br_startblock)) | 1643 | if (isnullstartblock(right.br_startblock)) |
1814 | state |= BMAP_RIGHT_DELAY; | 1644 | state |= BMAP_RIGHT_DELAY; |
1815 | } | 1645 | } |
@@ -1846,39 +1676,42 @@ xfs_bmap_add_extent_hole_real( | |||
1846 | * left and on the right. | 1676 | * left and on the right. |
1847 | * Merge all three into a single extent record. | 1677 | * Merge all three into a single extent record. |
1848 | */ | 1678 | */ |
1849 | --*idx; | 1679 | --bma->idx; |
1850 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 1680 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
1851 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | 1681 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), |
1852 | left.br_blockcount + new->br_blockcount + | 1682 | left.br_blockcount + new->br_blockcount + |
1853 | right.br_blockcount); | 1683 | right.br_blockcount); |
1854 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 1684 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
1855 | 1685 | ||
1856 | xfs_iext_remove(ip, *idx + 1, 1, state); | 1686 | xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); |
1857 | 1687 | ||
1858 | XFS_IFORK_NEXT_SET(ip, whichfork, | 1688 | XFS_IFORK_NEXT_SET(bma->ip, whichfork, |
1859 | XFS_IFORK_NEXTENTS(ip, whichfork) - 1); | 1689 | XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); |
1860 | if (cur == NULL) { | 1690 | if (bma->cur == NULL) { |
1861 | rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); | 1691 | rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); |
1862 | } else { | 1692 | } else { |
1863 | rval = XFS_ILOG_CORE; | 1693 | rval = XFS_ILOG_CORE; |
1864 | if ((error = xfs_bmbt_lookup_eq(cur, | 1694 | error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, |
1865 | right.br_startoff, | 1695 | right.br_startblock, right.br_blockcount, |
1866 | right.br_startblock, | 1696 | &i); |
1867 | right.br_blockcount, &i))) | 1697 | if (error) |
1868 | goto done; | 1698 | goto done; |
1869 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1699 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1870 | if ((error = xfs_btree_delete(cur, &i))) | 1700 | error = xfs_btree_delete(bma->cur, &i); |
1701 | if (error) | ||
1871 | goto done; | 1702 | goto done; |
1872 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1703 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1873 | if ((error = xfs_btree_decrement(cur, 0, &i))) | 1704 | error = xfs_btree_decrement(bma->cur, 0, &i); |
1705 | if (error) | ||
1874 | goto done; | 1706 | goto done; |
1875 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1707 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1876 | if ((error = xfs_bmbt_update(cur, left.br_startoff, | 1708 | error = xfs_bmbt_update(bma->cur, left.br_startoff, |
1877 | left.br_startblock, | 1709 | left.br_startblock, |
1878 | left.br_blockcount + | 1710 | left.br_blockcount + |
1879 | new->br_blockcount + | 1711 | new->br_blockcount + |
1880 | right.br_blockcount, | 1712 | right.br_blockcount, |
1881 | left.br_state))) | 1713 | left.br_state); |
1714 | if (error) | ||
1882 | goto done; | 1715 | goto done; |
1883 | } | 1716 | } |
1884 | break; | 1717 | break; |
@@ -1889,27 +1722,28 @@ xfs_bmap_add_extent_hole_real( | |||
1889 | * on the left. | 1722 | * on the left. |
1890 | * Merge the new allocation with the left neighbor. | 1723 | * Merge the new allocation with the left neighbor. |
1891 | */ | 1724 | */ |
1892 | --*idx; | 1725 | --bma->idx; |
1893 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 1726 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
1894 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), | 1727 | xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), |
1895 | left.br_blockcount + new->br_blockcount); | 1728 | left.br_blockcount + new->br_blockcount); |
1896 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 1729 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
1897 | 1730 | ||
1898 | if (cur == NULL) { | 1731 | if (bma->cur == NULL) { |
1899 | rval = xfs_ilog_fext(whichfork); | 1732 | rval = xfs_ilog_fext(whichfork); |
1900 | } else { | 1733 | } else { |
1901 | rval = 0; | 1734 | rval = 0; |
1902 | if ((error = xfs_bmbt_lookup_eq(cur, | 1735 | error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, |
1903 | left.br_startoff, | 1736 | left.br_startblock, left.br_blockcount, |
1904 | left.br_startblock, | 1737 | &i); |
1905 | left.br_blockcount, &i))) | 1738 | if (error) |
1906 | goto done; | 1739 | goto done; |
1907 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1740 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1908 | if ((error = xfs_bmbt_update(cur, left.br_startoff, | 1741 | error = xfs_bmbt_update(bma->cur, left.br_startoff, |
1909 | left.br_startblock, | 1742 | left.br_startblock, |
1910 | left.br_blockcount + | 1743 | left.br_blockcount + |
1911 | new->br_blockcount, | 1744 | new->br_blockcount, |
1912 | left.br_state))) | 1745 | left.br_state); |
1746 | if (error) | ||
1913 | goto done; | 1747 | goto done; |
1914 | } | 1748 | } |
1915 | break; | 1749 | break; |
@@ -1920,28 +1754,30 @@ xfs_bmap_add_extent_hole_real( | |||
1920 | * on the right. | 1754 | * on the right. |
1921 | * Merge the new allocation with the right neighbor. | 1755 | * Merge the new allocation with the right neighbor. |
1922 | */ | 1756 | */ |
1923 | trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); | 1757 | trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); |
1924 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), | 1758 | xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), |
1925 | new->br_startoff, new->br_startblock, | 1759 | new->br_startoff, new->br_startblock, |
1926 | new->br_blockcount + right.br_blockcount, | 1760 | new->br_blockcount + right.br_blockcount, |
1927 | right.br_state); | 1761 | right.br_state); |
1928 | trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); | 1762 | trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); |
1929 | 1763 | ||
1930 | if (cur == NULL) { | 1764 | if (bma->cur == NULL) { |
1931 | rval = xfs_ilog_fext(whichfork); | 1765 | rval = xfs_ilog_fext(whichfork); |
1932 | } else { | 1766 | } else { |
1933 | rval = 0; | 1767 | rval = 0; |
1934 | if ((error = xfs_bmbt_lookup_eq(cur, | 1768 | error = xfs_bmbt_lookup_eq(bma->cur, |
1935 | right.br_startoff, | 1769 | right.br_startoff, |
1936 | right.br_startblock, | 1770 | right.br_startblock, |
1937 | right.br_blockcount, &i))) | 1771 | right.br_blockcount, &i); |
1772 | if (error) | ||
1938 | goto done; | 1773 | goto done; |
1939 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1774 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1940 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 1775 | error = xfs_bmbt_update(bma->cur, new->br_startoff, |
1941 | new->br_startblock, | 1776 | new->br_startblock, |
1942 | new->br_blockcount + | 1777 | new->br_blockcount + |
1943 | right.br_blockcount, | 1778 | right.br_blockcount, |
1944 | right.br_state))) | 1779 | right.br_state); |
1780 | if (error) | ||
1945 | goto done; | 1781 | goto done; |
1946 | } | 1782 | } |
1947 | break; | 1783 | break; |
@@ -1952,28 +1788,50 @@ xfs_bmap_add_extent_hole_real( | |||
1952 | * real allocation. | 1788 | * real allocation. |
1953 | * Insert a new entry. | 1789 | * Insert a new entry. |
1954 | */ | 1790 | */ |
1955 | xfs_iext_insert(ip, *idx, 1, new, state); | 1791 | xfs_iext_insert(bma->ip, bma->idx, 1, new, state); |
1956 | XFS_IFORK_NEXT_SET(ip, whichfork, | 1792 | XFS_IFORK_NEXT_SET(bma->ip, whichfork, |
1957 | XFS_IFORK_NEXTENTS(ip, whichfork) + 1); | 1793 | XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); |
1958 | if (cur == NULL) { | 1794 | if (bma->cur == NULL) { |
1959 | rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); | 1795 | rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); |
1960 | } else { | 1796 | } else { |
1961 | rval = XFS_ILOG_CORE; | 1797 | rval = XFS_ILOG_CORE; |
1962 | if ((error = xfs_bmbt_lookup_eq(cur, | 1798 | error = xfs_bmbt_lookup_eq(bma->cur, |
1963 | new->br_startoff, | 1799 | new->br_startoff, |
1964 | new->br_startblock, | 1800 | new->br_startblock, |
1965 | new->br_blockcount, &i))) | 1801 | new->br_blockcount, &i); |
1802 | if (error) | ||
1966 | goto done; | 1803 | goto done; |
1967 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); | 1804 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
1968 | cur->bc_rec.b.br_state = new->br_state; | 1805 | bma->cur->bc_rec.b.br_state = new->br_state; |
1969 | if ((error = xfs_btree_insert(cur, &i))) | 1806 | error = xfs_btree_insert(bma->cur, &i); |
1807 | if (error) | ||
1970 | goto done; | 1808 | goto done; |
1971 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); | 1809 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
1972 | } | 1810 | } |
1973 | break; | 1811 | break; |
1974 | } | 1812 | } |
1813 | |||
1814 | /* convert to a btree if necessary */ | ||
1815 | if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && | ||
1816 | XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) { | ||
1817 | int tmp_logflags; /* partial log flag return val */ | ||
1818 | |||
1819 | ASSERT(bma->cur == NULL); | ||
1820 | error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, | ||
1821 | bma->firstblock, bma->flist, &bma->cur, | ||
1822 | 0, &tmp_logflags, whichfork); | ||
1823 | bma->logflags |= tmp_logflags; | ||
1824 | if (error) | ||
1825 | goto done; | ||
1826 | } | ||
1827 | |||
1828 | /* clear out the allocated field, done with it now in any case. */ | ||
1829 | if (bma->cur) | ||
1830 | bma->cur->bc_private.b.allocated = 0; | ||
1831 | |||
1832 | xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); | ||
1975 | done: | 1833 | done: |
1976 | *logflagsp = rval; | 1834 | bma->logflags |= rval; |
1977 | return error; | 1835 | return error; |
1978 | } | 1836 | } |
1979 | 1837 | ||
@@ -2160,26 +2018,26 @@ xfs_bmap_adjacent( | |||
2160 | XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) | 2018 | XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) |
2161 | 2019 | ||
2162 | mp = ap->ip->i_mount; | 2020 | mp = ap->ip->i_mount; |
2163 | nullfb = ap->firstblock == NULLFSBLOCK; | 2021 | nullfb = *ap->firstblock == NULLFSBLOCK; |
2164 | rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; | 2022 | rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; |
2165 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); | 2023 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); |
2166 | /* | 2024 | /* |
2167 | * If allocating at eof, and there's a previous real block, | 2025 | * If allocating at eof, and there's a previous real block, |
2168 | * try to use its last block as our starting point. | 2026 | * try to use its last block as our starting point. |
2169 | */ | 2027 | */ |
2170 | if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && | 2028 | if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && |
2171 | !isnullstartblock(ap->prevp->br_startblock) && | 2029 | !isnullstartblock(ap->prev.br_startblock) && |
2172 | ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, | 2030 | ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, |
2173 | ap->prevp->br_startblock)) { | 2031 | ap->prev.br_startblock)) { |
2174 | ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; | 2032 | ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; |
2175 | /* | 2033 | /* |
2176 | * Adjust for the gap between prevp and us. | 2034 | * Adjust for the gap between prevp and us. |
2177 | */ | 2035 | */ |
2178 | adjust = ap->off - | 2036 | adjust = ap->offset - |
2179 | (ap->prevp->br_startoff + ap->prevp->br_blockcount); | 2037 | (ap->prev.br_startoff + ap->prev.br_blockcount); |
2180 | if (adjust && | 2038 | if (adjust && |
2181 | ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) | 2039 | ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) |
2182 | ap->rval += adjust; | 2040 | ap->blkno += adjust; |
2183 | } | 2041 | } |
2184 | /* | 2042 | /* |
2185 | * If not at eof, then compare the two neighbor blocks. | 2043 | * If not at eof, then compare the two neighbor blocks. |
@@ -2196,17 +2054,17 @@ xfs_bmap_adjacent( | |||
2196 | * If there's a previous (left) block, select a requested | 2054 | * If there's a previous (left) block, select a requested |
2197 | * start block based on it. | 2055 | * start block based on it. |
2198 | */ | 2056 | */ |
2199 | if (ap->prevp->br_startoff != NULLFILEOFF && | 2057 | if (ap->prev.br_startoff != NULLFILEOFF && |
2200 | !isnullstartblock(ap->prevp->br_startblock) && | 2058 | !isnullstartblock(ap->prev.br_startblock) && |
2201 | (prevbno = ap->prevp->br_startblock + | 2059 | (prevbno = ap->prev.br_startblock + |
2202 | ap->prevp->br_blockcount) && | 2060 | ap->prev.br_blockcount) && |
2203 | ISVALID(prevbno, ap->prevp->br_startblock)) { | 2061 | ISVALID(prevbno, ap->prev.br_startblock)) { |
2204 | /* | 2062 | /* |
2205 | * Calculate gap to end of previous block. | 2063 | * Calculate gap to end of previous block. |
2206 | */ | 2064 | */ |
2207 | adjust = prevdiff = ap->off - | 2065 | adjust = prevdiff = ap->offset - |
2208 | (ap->prevp->br_startoff + | 2066 | (ap->prev.br_startoff + |
2209 | ap->prevp->br_blockcount); | 2067 | ap->prev.br_blockcount); |
2210 | /* | 2068 | /* |
2211 | * Figure the startblock based on the previous block's | 2069 | * Figure the startblock based on the previous block's |
2212 | * end and the gap size. | 2070 | * end and the gap size. |
@@ -2215,9 +2073,9 @@ xfs_bmap_adjacent( | |||
2215 | * allocating, or using it gives us an invalid block | 2073 | * allocating, or using it gives us an invalid block |
2216 | * number, then just use the end of the previous block. | 2074 | * number, then just use the end of the previous block. |
2217 | */ | 2075 | */ |
2218 | if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && | 2076 | if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && |
2219 | ISVALID(prevbno + prevdiff, | 2077 | ISVALID(prevbno + prevdiff, |
2220 | ap->prevp->br_startblock)) | 2078 | ap->prev.br_startblock)) |
2221 | prevbno += adjust; | 2079 | prevbno += adjust; |
2222 | else | 2080 | else |
2223 | prevdiff += adjust; | 2081 | prevdiff += adjust; |
@@ -2238,16 +2096,16 @@ xfs_bmap_adjacent( | |||
2238 | * If there's a following (right) block, select a requested | 2096 | * If there's a following (right) block, select a requested |
2239 | * start block based on it. | 2097 | * start block based on it. |
2240 | */ | 2098 | */ |
2241 | if (!isnullstartblock(ap->gotp->br_startblock)) { | 2099 | if (!isnullstartblock(ap->got.br_startblock)) { |
2242 | /* | 2100 | /* |
2243 | * Calculate gap to start of next block. | 2101 | * Calculate gap to start of next block. |
2244 | */ | 2102 | */ |
2245 | adjust = gotdiff = ap->gotp->br_startoff - ap->off; | 2103 | adjust = gotdiff = ap->got.br_startoff - ap->offset; |
2246 | /* | 2104 | /* |
2247 | * Figure the startblock based on the next block's | 2105 | * Figure the startblock based on the next block's |
2248 | * start and the gap size. | 2106 | * start and the gap size. |
2249 | */ | 2107 | */ |
2250 | gotbno = ap->gotp->br_startblock; | 2108 | gotbno = ap->got.br_startblock; |
2251 | /* | 2109 | /* |
2252 | * Heuristic! | 2110 | * Heuristic! |
2253 | * If the gap is large relative to the piece we're | 2111 | * If the gap is large relative to the piece we're |
@@ -2255,12 +2113,12 @@ xfs_bmap_adjacent( | |||
2255 | * number, then just use the start of the next block | 2113 | * number, then just use the start of the next block |
2256 | * offset by our length. | 2114 | * offset by our length. |
2257 | */ | 2115 | */ |
2258 | if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && | 2116 | if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && |
2259 | ISVALID(gotbno - gotdiff, gotbno)) | 2117 | ISVALID(gotbno - gotdiff, gotbno)) |
2260 | gotbno -= adjust; | 2118 | gotbno -= adjust; |
2261 | else if (ISVALID(gotbno - ap->alen, gotbno)) { | 2119 | else if (ISVALID(gotbno - ap->length, gotbno)) { |
2262 | gotbno -= ap->alen; | 2120 | gotbno -= ap->length; |
2263 | gotdiff += adjust - ap->alen; | 2121 | gotdiff += adjust - ap->length; |
2264 | } else | 2122 | } else |
2265 | gotdiff += adjust; | 2123 | gotdiff += adjust; |
2266 | /* | 2124 | /* |
@@ -2278,14 +2136,14 @@ xfs_bmap_adjacent( | |||
2278 | gotbno = NULLFSBLOCK; | 2136 | gotbno = NULLFSBLOCK; |
2279 | /* | 2137 | /* |
2280 | * If both valid, pick the better one, else the only good | 2138 | * If both valid, pick the better one, else the only good |
2281 | * one, else ap->rval is already set (to 0 or the inode block). | 2139 | * one, else ap->blkno is already set (to 0 or the inode block). |
2282 | */ | 2140 | */ |
2283 | if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) | 2141 | if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) |
2284 | ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; | 2142 | ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; |
2285 | else if (prevbno != NULLFSBLOCK) | 2143 | else if (prevbno != NULLFSBLOCK) |
2286 | ap->rval = prevbno; | 2144 | ap->blkno = prevbno; |
2287 | else if (gotbno != NULLFSBLOCK) | 2145 | else if (gotbno != NULLFSBLOCK) |
2288 | ap->rval = gotbno; | 2146 | ap->blkno = gotbno; |
2289 | } | 2147 | } |
2290 | #undef ISVALID | 2148 | #undef ISVALID |
2291 | } | 2149 | } |
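
The tail of xfs_bmap_adjacent() above reduces to a small decision: prefer whichever neighbour-derived candidate start block has the smaller gap. A minimal user-space sketch of that comparison; choose_blkno() is a hypothetical stand-in, not kernel API:

	#include <stdint.h>

	#define NULLFSBLOCK ((uint64_t)-1)

	/*
	 * Mirror the final prevbno/gotbno selection in xfs_bmap_adjacent():
	 * if both candidates are valid, take the one with the smaller gap;
	 * otherwise take the only valid one; otherwise keep the fallback
	 * (ap->blkno, already set to 0 or the inode block).
	 */
	static uint64_t choose_blkno(uint64_t prevbno, uint64_t prevdiff,
				     uint64_t gotbno, uint64_t gotdiff,
				     uint64_t fallback)
	{
		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
			return prevdiff <= gotdiff ? prevbno : gotbno;
		if (prevbno != NULLFSBLOCK)
			return prevbno;
		if (gotbno != NULLFSBLOCK)
			return gotbno;
		return fallback;
	}
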
@@ -2305,24 +2163,24 @@ xfs_bmap_rtalloc( | |||
2305 | mp = ap->ip->i_mount; | 2163 | mp = ap->ip->i_mount; |
2306 | align = xfs_get_extsz_hint(ap->ip); | 2164 | align = xfs_get_extsz_hint(ap->ip); |
2307 | prod = align / mp->m_sb.sb_rextsize; | 2165 | prod = align / mp->m_sb.sb_rextsize; |
2308 | error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, | 2166 | error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, |
2309 | align, 1, ap->eof, 0, | 2167 | align, 1, ap->eof, 0, |
2310 | ap->conv, &ap->off, &ap->alen); | 2168 | ap->conv, &ap->offset, &ap->length); |
2311 | if (error) | 2169 | if (error) |
2312 | return error; | 2170 | return error; |
2313 | ASSERT(ap->alen); | 2171 | ASSERT(ap->length); |
2314 | ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); | 2172 | ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); |
2315 | 2173 | ||
2316 | /* | 2174 | /* |
2317 | * If the offset & length are not perfectly aligned | 2175 | * If the offset & length are not perfectly aligned |
2318 | * then kill prod, it will just get us in trouble. | 2176 | * then kill prod, it will just get us in trouble. |
2319 | */ | 2177 | */ |
2320 | if (do_mod(ap->off, align) || ap->alen % align) | 2178 | if (do_mod(ap->offset, align) || ap->length % align) |
2321 | prod = 1; | 2179 | prod = 1; |
2322 | /* | 2180 | /* |
2323 | * Set ralen to be the actual requested length in rtextents. | 2181 | * Set ralen to be the actual requested length in rtextents. |
2324 | */ | 2182 | */ |
2325 | ralen = ap->alen / mp->m_sb.sb_rextsize; | 2183 | ralen = ap->length / mp->m_sb.sb_rextsize; |
2326 | /* | 2184 | /* |
2327 | * If the old value was close enough to MAXEXTLEN that | 2185 | * If the old value was close enough to MAXEXTLEN that |
2328 | * we rounded up to it, cut it back so it's valid again. | 2186 | * we rounded up to it, cut it back so it's valid again. |
@@ -2337,21 +2195,21 @@ xfs_bmap_rtalloc( | |||
2337 | * Lock out other modifications to the RT bitmap inode. | 2195 | * Lock out other modifications to the RT bitmap inode. |
2338 | */ | 2196 | */ |
2339 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); | 2197 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); |
2340 | xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); | 2198 | xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); |
2341 | 2199 | ||
2342 | /* | 2200 | /* |
2343 | * If it's an allocation to an empty file at offset 0, | 2201 | * If it's an allocation to an empty file at offset 0, |
2344 | * pick an extent that will space things out in the rt area. | 2202 | * pick an extent that will space things out in the rt area. |
2345 | */ | 2203 | */ |
2346 | if (ap->eof && ap->off == 0) { | 2204 | if (ap->eof && ap->offset == 0) { |
2347 | xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ | 2205 | xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ |
2348 | 2206 | ||
2349 | error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); | 2207 | error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); |
2350 | if (error) | 2208 | if (error) |
2351 | return error; | 2209 | return error; |
2352 | ap->rval = rtx * mp->m_sb.sb_rextsize; | 2210 | ap->blkno = rtx * mp->m_sb.sb_rextsize; |
2353 | } else { | 2211 | } else { |
2354 | ap->rval = 0; | 2212 | ap->blkno = 0; |
2355 | } | 2213 | } |
2356 | 2214 | ||
2357 | xfs_bmap_adjacent(ap); | 2215 | xfs_bmap_adjacent(ap); |
@@ -2359,23 +2217,23 @@ xfs_bmap_rtalloc( | |||
2359 | /* | 2217 | /* |
2360 | * Realtime allocation, done through xfs_rtallocate_extent. | 2218 | * Realtime allocation, done through xfs_rtallocate_extent. |
2361 | */ | 2219 | */ |
2362 | atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; | 2220 | atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; |
2363 | do_div(ap->rval, mp->m_sb.sb_rextsize); | 2221 | do_div(ap->blkno, mp->m_sb.sb_rextsize); |
2364 | rtb = ap->rval; | 2222 | rtb = ap->blkno; |
2365 | ap->alen = ralen; | 2223 | ap->length = ralen; |
2366 | if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, | 2224 | if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, |
2367 | &ralen, atype, ap->wasdel, prod, &rtb))) | 2225 | &ralen, atype, ap->wasdel, prod, &rtb))) |
2368 | return error; | 2226 | return error; |
2369 | if (rtb == NULLFSBLOCK && prod > 1 && | 2227 | if (rtb == NULLFSBLOCK && prod > 1 && |
2370 | (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, | 2228 | (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, |
2371 | ap->alen, &ralen, atype, | 2229 | ap->length, &ralen, atype, |
2372 | ap->wasdel, 1, &rtb))) | 2230 | ap->wasdel, 1, &rtb))) |
2373 | return error; | 2231 | return error; |
2374 | ap->rval = rtb; | 2232 | ap->blkno = rtb; |
2375 | if (ap->rval != NULLFSBLOCK) { | 2233 | if (ap->blkno != NULLFSBLOCK) { |
2376 | ap->rval *= mp->m_sb.sb_rextsize; | 2234 | ap->blkno *= mp->m_sb.sb_rextsize; |
2377 | ralen *= mp->m_sb.sb_rextsize; | 2235 | ralen *= mp->m_sb.sb_rextsize; |
2378 | ap->alen = ralen; | 2236 | ap->length = ralen; |
2379 | ap->ip->i_d.di_nblocks += ralen; | 2237 | ap->ip->i_d.di_nblocks += ralen; |
2380 | xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); | 2238 | xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); |
2381 | if (ap->wasdel) | 2239 | if (ap->wasdel) |
@@ -2388,7 +2246,7 @@ xfs_bmap_rtalloc( | |||
2388 | ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : | 2246 | ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : |
2389 | XFS_TRANS_DQ_RTBCOUNT, (long) ralen); | 2247 | XFS_TRANS_DQ_RTBCOUNT, (long) ralen); |
2390 | } else { | 2248 | } else { |
2391 | ap->alen = 0; | 2249 | ap->length = 0; |
2392 | } | 2250 | } |
2393 | return 0; | 2251 | return 0; |
2394 | } | 2252 | } |
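
xfs_bmap_rtalloc() above keeps switching between two units: filesystem blocks (ap->blkno, ap->length) and realtime extents of sb_rextsize blocks each (rtx, ralen). A standalone sketch of that round trip, with an assumed extent size in place of a real superblock:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t rextsize = 4;	/* blocks per rt extent (assumed) */
		uint64_t blkno = 100;	/* target block, already aligned */
		uint64_t length = 20;	/* request length in blocks */

		/* The rt bitmap allocator works in extent units ... */
		uint64_t rtx = blkno / rextsize;
		uint64_t ralen = length / rextsize;

		/* ... and the result is converted back to blocks. */
		printf("rtx %llu, ralen %llu -> blkno %llu, length %llu\n",
		       (unsigned long long)rtx, (unsigned long long)ralen,
		       (unsigned long long)(rtx * rextsize),
		       (unsigned long long)(ralen * rextsize));
		return 0;
	}
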
@@ -2503,7 +2361,7 @@ xfs_bmap_btalloc_nullfb( | |||
2503 | * AG as the stream may have moved. | 2361 | * AG as the stream may have moved. |
2504 | */ | 2362 | */ |
2505 | if (xfs_inode_is_filestream(ap->ip)) | 2363 | if (xfs_inode_is_filestream(ap->ip)) |
2506 | ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); | 2364 | ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); |
2507 | 2365 | ||
2508 | return 0; | 2366 | return 0; |
2509 | } | 2367 | } |
@@ -2528,52 +2386,52 @@ xfs_bmap_btalloc( | |||
2528 | mp = ap->ip->i_mount; | 2386 | mp = ap->ip->i_mount; |
2529 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; | 2387 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; |
2530 | if (unlikely(align)) { | 2388 | if (unlikely(align)) { |
2531 | error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, | 2389 | error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, |
2532 | align, 0, ap->eof, 0, ap->conv, | 2390 | align, 0, ap->eof, 0, ap->conv, |
2533 | &ap->off, &ap->alen); | 2391 | &ap->offset, &ap->length); |
2534 | ASSERT(!error); | 2392 | ASSERT(!error); |
2535 | ASSERT(ap->alen); | 2393 | ASSERT(ap->length); |
2536 | } | 2394 | } |
2537 | nullfb = ap->firstblock == NULLFSBLOCK; | 2395 | nullfb = *ap->firstblock == NULLFSBLOCK; |
2538 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); | 2396 | fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); |
2539 | if (nullfb) { | 2397 | if (nullfb) { |
2540 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { | 2398 | if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { |
2541 | ag = xfs_filestream_lookup_ag(ap->ip); | 2399 | ag = xfs_filestream_lookup_ag(ap->ip); |
2542 | ag = (ag != NULLAGNUMBER) ? ag : 0; | 2400 | ag = (ag != NULLAGNUMBER) ? ag : 0; |
2543 | ap->rval = XFS_AGB_TO_FSB(mp, ag, 0); | 2401 | ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); |
2544 | } else { | 2402 | } else { |
2545 | ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); | 2403 | ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); |
2546 | } | 2404 | } |
2547 | } else | 2405 | } else |
2548 | ap->rval = ap->firstblock; | 2406 | ap->blkno = *ap->firstblock; |
2549 | 2407 | ||
2550 | xfs_bmap_adjacent(ap); | 2408 | xfs_bmap_adjacent(ap); |
2551 | 2409 | ||
2552 | /* | 2410 | /* |
2553 | * If allowed, use ap->rval; otherwise must use firstblock since | 2411 | * If allowed, use ap->blkno; otherwise must use firstblock since |
2554 | * it's in the right allocation group. | 2412 | * it's in the right allocation group. |
2555 | */ | 2413 | */ |
2556 | if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) | 2414 | if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) |
2557 | ; | 2415 | ; |
2558 | else | 2416 | else |
2559 | ap->rval = ap->firstblock; | 2417 | ap->blkno = *ap->firstblock; |
2560 | /* | 2418 | /* |
2561 | * Normal allocation, done through xfs_alloc_vextent. | 2419 | * Normal allocation, done through xfs_alloc_vextent. |
2562 | */ | 2420 | */ |
2563 | tryagain = isaligned = 0; | 2421 | tryagain = isaligned = 0; |
2564 | args.tp = ap->tp; | 2422 | args.tp = ap->tp; |
2565 | args.mp = mp; | 2423 | args.mp = mp; |
2566 | args.fsbno = ap->rval; | 2424 | args.fsbno = ap->blkno; |
2567 | 2425 | ||
2568 | /* Trim the allocation back to the maximum an AG can fit. */ | 2426 | /* Trim the allocation back to the maximum an AG can fit. */ |
2569 | args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp)); | 2427 | args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); |
2570 | args.firstblock = ap->firstblock; | 2428 | args.firstblock = *ap->firstblock; |
2571 | blen = 0; | 2429 | blen = 0; |
2572 | if (nullfb) { | 2430 | if (nullfb) { |
2573 | error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); | 2431 | error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); |
2574 | if (error) | 2432 | if (error) |
2575 | return error; | 2433 | return error; |
2576 | } else if (ap->low) { | 2434 | } else if (ap->flist->xbf_low) { |
2577 | if (xfs_inode_is_filestream(ap->ip)) | 2435 | if (xfs_inode_is_filestream(ap->ip)) |
2578 | args.type = XFS_ALLOCTYPE_FIRST_AG; | 2436 | args.type = XFS_ALLOCTYPE_FIRST_AG; |
2579 | else | 2437 | else |
@@ -2587,14 +2445,14 @@ xfs_bmap_btalloc( | |||
2587 | /* apply extent size hints if obtained earlier */ | 2445 | /* apply extent size hints if obtained earlier */ |
2588 | if (unlikely(align)) { | 2446 | if (unlikely(align)) { |
2589 | args.prod = align; | 2447 | args.prod = align; |
2590 | if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) | 2448 | if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) |
2591 | args.mod = (xfs_extlen_t)(args.prod - args.mod); | 2449 | args.mod = (xfs_extlen_t)(args.prod - args.mod); |
2592 | } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { | 2450 | } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { |
2593 | args.prod = 1; | 2451 | args.prod = 1; |
2594 | args.mod = 0; | 2452 | args.mod = 0; |
2595 | } else { | 2453 | } else { |
2596 | args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; | 2454 | args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; |
2597 | if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) | 2455 | if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) |
2598 | args.mod = (xfs_extlen_t)(args.prod - args.mod); | 2456 | args.mod = (xfs_extlen_t)(args.prod - args.mod); |
2599 | } | 2457 | } |
2600 | /* | 2458 | /* |
@@ -2606,8 +2464,8 @@ xfs_bmap_btalloc( | |||
2606 | * is >= the stripe unit and the allocation offset is | 2464 | * is >= the stripe unit and the allocation offset is |
2607 | * at the end of file. | 2465 | * at the end of file. |
2608 | */ | 2466 | */ |
2609 | if (!ap->low && ap->aeof) { | 2467 | if (!ap->flist->xbf_low && ap->aeof) { |
2610 | if (!ap->off) { | 2468 | if (!ap->offset) { |
2611 | args.alignment = mp->m_dalign; | 2469 | args.alignment = mp->m_dalign; |
2612 | atype = args.type; | 2470 | atype = args.type; |
2613 | isaligned = 1; | 2471 | isaligned = 1; |
@@ -2660,7 +2518,7 @@ xfs_bmap_btalloc( | |||
2660 | * turned on. | 2518 | * turned on. |
2661 | */ | 2519 | */ |
2662 | args.type = atype; | 2520 | args.type = atype; |
2663 | args.fsbno = ap->rval; | 2521 | args.fsbno = ap->blkno; |
2664 | args.alignment = mp->m_dalign; | 2522 | args.alignment = mp->m_dalign; |
2665 | args.minlen = nextminlen; | 2523 | args.minlen = nextminlen; |
2666 | args.minalignslop = 0; | 2524 | args.minalignslop = 0; |
@@ -2674,7 +2532,7 @@ xfs_bmap_btalloc( | |||
2674 | * try again. | 2532 | * try again. |
2675 | */ | 2533 | */ |
2676 | args.type = atype; | 2534 | args.type = atype; |
2677 | args.fsbno = ap->rval; | 2535 | args.fsbno = ap->blkno; |
2678 | args.alignment = 0; | 2536 | args.alignment = 0; |
2679 | if ((error = xfs_alloc_vextent(&args))) | 2537 | if ((error = xfs_alloc_vextent(&args))) |
2680 | return error; | 2538 | return error; |
@@ -2683,7 +2541,7 @@ xfs_bmap_btalloc( | |||
2683 | args.minlen > ap->minlen) { | 2541 | args.minlen > ap->minlen) { |
2684 | args.minlen = ap->minlen; | 2542 | args.minlen = ap->minlen; |
2685 | args.type = XFS_ALLOCTYPE_START_BNO; | 2543 | args.type = XFS_ALLOCTYPE_START_BNO; |
2686 | args.fsbno = ap->rval; | 2544 | args.fsbno = ap->blkno; |
2687 | if ((error = xfs_alloc_vextent(&args))) | 2545 | if ((error = xfs_alloc_vextent(&args))) |
2688 | return error; | 2546 | return error; |
2689 | } | 2547 | } |
@@ -2694,13 +2552,26 @@ xfs_bmap_btalloc( | |||
2694 | args.minleft = 0; | 2552 | args.minleft = 0; |
2695 | if ((error = xfs_alloc_vextent(&args))) | 2553 | if ((error = xfs_alloc_vextent(&args))) |
2696 | return error; | 2554 | return error; |
2697 | ap->low = 1; | 2555 | ap->flist->xbf_low = 1; |
2698 | } | 2556 | } |
2699 | if (args.fsbno != NULLFSBLOCK) { | 2557 | if (args.fsbno != NULLFSBLOCK) { |
2700 | ap->firstblock = ap->rval = args.fsbno; | 2558 | /* |
2559 | * check the allocation happened at the same or higher AG than | ||
2560 | * the first block that was allocated. | ||
2561 | */ | ||
2562 | ASSERT(*ap->firstblock == NULLFSBLOCK || | ||
2563 | XFS_FSB_TO_AGNO(mp, *ap->firstblock) == | ||
2564 | XFS_FSB_TO_AGNO(mp, args.fsbno) || | ||
2565 | (ap->flist->xbf_low && | ||
2566 | XFS_FSB_TO_AGNO(mp, *ap->firstblock) < | ||
2567 | XFS_FSB_TO_AGNO(mp, args.fsbno))); | ||
2568 | |||
2569 | ap->blkno = args.fsbno; | ||
2570 | if (*ap->firstblock == NULLFSBLOCK) | ||
2571 | *ap->firstblock = args.fsbno; | ||
2701 | ASSERT(nullfb || fb_agno == args.agno || | 2572 | ASSERT(nullfb || fb_agno == args.agno || |
2702 | (ap->low && fb_agno < args.agno)); | 2573 | (ap->flist->xbf_low && fb_agno < args.agno)); |
2703 | ap->alen = args.len; | 2574 | ap->length = args.len; |
2704 | ap->ip->i_d.di_nblocks += args.len; | 2575 | ap->ip->i_d.di_nblocks += args.len; |
2705 | xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); | 2576 | xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); |
2706 | if (ap->wasdel) | 2577 | if (ap->wasdel) |
@@ -2714,8 +2585,8 @@ xfs_bmap_btalloc( | |||
2714 | XFS_TRANS_DQ_BCOUNT, | 2585 | XFS_TRANS_DQ_BCOUNT, |
2715 | (long) args.len); | 2586 | (long) args.len); |
2716 | } else { | 2587 | } else { |
2717 | ap->rval = NULLFSBLOCK; | 2588 | ap->blkno = NULLFSBLOCK; |
2718 | ap->alen = 0; | 2589 | ap->length = 0; |
2719 | } | 2590 | } |
2720 | return 0; | 2591 | return 0; |
2721 | } | 2592 | } |
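
The new assertion block in xfs_bmap_btalloc() above encodes the firstblock ordering rule: within one transaction, later allocations must land in the same allocation group as *ap->firstblock, or a higher one once low-space mode has been entered. A compact sketch of the same check; fsb_to_agno() and AGBLOCKS are assumed stand-ins for XFS_FSB_TO_AGNO() and the real AG geometry:

	#include <assert.h>
	#include <stdint.h>

	#define NULLFSBLOCK	((uint64_t)-1)
	#define AGBLOCKS	(1u << 20)	/* blocks per AG (assumed) */

	/* Stand-in for XFS_FSB_TO_AGNO(): AG number of a filesystem block. */
	static uint32_t fsb_to_agno(uint64_t fsbno)
	{
		return (uint32_t)(fsbno / AGBLOCKS);
	}

	/*
	 * The ordering rule asserted after a successful allocation: same AG
	 * as the transaction's first block, or a higher AG if the allocator
	 * has fallen back to low-space mode.
	 */
	static void check_ag_order(uint64_t firstblock, uint64_t fsbno, int low)
	{
		assert(firstblock == NULLFSBLOCK ||
		       fsb_to_agno(firstblock) == fsb_to_agno(fsbno) ||
		       (low && fsb_to_agno(firstblock) < fsb_to_agno(fsbno)));
	}
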
@@ -3383,8 +3254,7 @@ xfs_bmap_local_to_extents( | |||
3383 | ASSERT(args.len == 1); | 3254 | ASSERT(args.len == 1); |
3384 | *firstblock = args.fsbno; | 3255 | *firstblock = args.fsbno; |
3385 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); | 3256 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); |
3386 | memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, | 3257 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); |
3387 | ifp->if_bytes); | ||
3388 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); | 3258 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); |
3389 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); | 3259 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); |
3390 | xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); | 3260 | xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); |
@@ -3590,7 +3460,7 @@ xfs_bmap_add_attrfork( | |||
3590 | } | 3460 | } |
3591 | ASSERT(ip->i_d.di_anextents == 0); | 3461 | ASSERT(ip->i_d.di_anextents == 0); |
3592 | 3462 | ||
3593 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); | 3463 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
3594 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 3464 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
3595 | 3465 | ||
3596 | switch (ip->i_d.di_format) { | 3466 | switch (ip->i_d.di_format) { |
@@ -3783,19 +3653,11 @@ xfs_bmap_compute_maxlevels( | |||
3783 | * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi | 3653 | * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi |
3784 | * caller. Frees all the extents that need freeing, which must be done | 3654 | * caller. Frees all the extents that need freeing, which must be done |
3785 | * last due to locking considerations. We never free any extents in | 3655 | * last due to locking considerations. We never free any extents in |
3786 | * the first transaction. This is to allow the caller to make the first | 3656 | * the first transaction. |
3787 | * transaction a synchronous one so that the pointers to the data being | ||
3788 | * broken in this transaction will be permanent before the data is actually | ||
3789 | * freed. This is necessary to prevent blocks from being reallocated | ||
3790 | * and written to before the free and reallocation are actually permanent. | ||
3791 | * We do not just make the first transaction synchronous here, because | ||
3792 | * there are more efficient ways to gain the same protection in some cases | ||
3793 | * (see the file truncation code). | ||
3794 | * | 3657 | * |
3795 | * Return 1 in the committed parameter if the given transaction was | 3658 | * Return 1 in the committed parameter if the given transaction was |
3796 | * committed and a new one started, and 0 otherwise. | 3659 | * committed and a new one started, and 0 otherwise. |
3797 | */ | 3660 | */ |
3798 | /*ARGSUSED*/ | ||
3799 | int /* error */ | 3661 | int /* error */ |
3800 | xfs_bmap_finish( | 3662 | xfs_bmap_finish( |
3801 | xfs_trans_t **tp, /* transaction pointer addr */ | 3663 | xfs_trans_t **tp, /* transaction pointer addr */ |
@@ -3995,42 +3857,122 @@ xfs_bmap_last_before( | |||
3995 | return 0; | 3857 | return 0; |
3996 | } | 3858 | } |
3997 | 3859 | ||
3860 | STATIC int | ||
3861 | xfs_bmap_last_extent( | ||
3862 | struct xfs_trans *tp, | ||
3863 | struct xfs_inode *ip, | ||
3864 | int whichfork, | ||
3865 | struct xfs_bmbt_irec *rec, | ||
3866 | int *is_empty) | ||
3867 | { | ||
3868 | struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); | ||
3869 | int error; | ||
3870 | int nextents; | ||
3871 | |||
3872 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { | ||
3873 | error = xfs_iread_extents(tp, ip, whichfork); | ||
3874 | if (error) | ||
3875 | return error; | ||
3876 | } | ||
3877 | |||
3878 | nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); | ||
3879 | if (nextents == 0) { | ||
3880 | *is_empty = 1; | ||
3881 | return 0; | ||
3882 | } | ||
3883 | |||
3884 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); | ||
3885 | *is_empty = 0; | ||
3886 | return 0; | ||
3887 | } | ||
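
xfs_bmap_last_extent() above is a small helper: after making sure the extent list is in core, it either reports an empty fork or hands back the final record. The same shape over a plain array; struct irec is an assumed miniature of xfs_bmbt_irec:

	#include <stddef.h>

	struct irec {				/* miniature xfs_bmbt_irec */
		unsigned long long br_startoff;
		unsigned long long br_startblock;
		unsigned long long br_blockcount;
	};

	/* Report the last record of a loaded extent list, or flag it empty. */
	static int last_extent(const struct irec *recs, size_t nextents,
			       struct irec *rec, int *is_empty)
	{
		if (nextents == 0) {
			*is_empty = 1;
			return 0;
		}
		*rec = recs[nextents - 1];
		*is_empty = 0;
		return 0;
	}
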
3888 | |||
3889 | /* | ||
3890 | * Check the last inode extent to determine whether this allocation will result | ||
3891 | * in blocks being allocated at the end of the file. When we allocate new data | ||
3892 | * blocks at the end of the file which do not start at the previous data block, | ||
3893 | * we will try to align the new blocks at stripe unit boundaries. | ||
3894 | * | ||
3895 | * Returns 0 in bma->aeof if the file (fork) is empty, as any new write will | ||
3896 | * be at or past the EOF. | ||
3897 | */ | ||
3898 | STATIC int | ||
3899 | xfs_bmap_isaeof( | ||
3900 | struct xfs_bmalloca *bma, | ||
3901 | int whichfork) | ||
3902 | { | ||
3903 | struct xfs_bmbt_irec rec; | ||
3904 | int is_empty; | ||
3905 | int error; | ||
3906 | |||
3907 | bma->aeof = 0; | ||
3908 | error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, | ||
3909 | &is_empty); | ||
3910 | if (error || is_empty) | ||
3911 | return error; | ||
3912 | |||
3913 | /* | ||
3914 | * Check if we are allocating at or past the last extent, or at least into | ||
3915 | * the last delayed allocated extent. | ||
3916 | */ | ||
3917 | bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || | ||
3918 | (bma->offset >= rec.br_startoff && | ||
3919 | isnullstartblock(rec.br_startblock)); | ||
3920 | return 0; | ||
3921 | } | ||
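
The bma->aeof expression in xfs_bmap_isaeof() above collapses to a two-part predicate: the new offset is past the last extent entirely, or it lands inside that extent while the extent is still a delayed allocation. A sketch, with is_delalloc standing in for isnullstartblock(rec.br_startblock):

	/*
	 * Allocating-at-EOF test: past the last extent, or at least into
	 * it while it is still a delayed allocation.
	 */
	static int is_aeof(unsigned long long offset,
			   unsigned long long last_startoff,
			   unsigned long long last_blockcount,
			   int is_delalloc)
	{
		return offset >= last_startoff + last_blockcount ||
		       (offset >= last_startoff && is_delalloc);
	}
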
3922 | |||
3923 | /* | ||
3924 | * Check if the endoff is outside the last extent. If so, the caller will grow | ||
3925 | * the allocation to a stripe unit boundary. All offsets are considered outside | ||
3926 | * the end of file for an empty fork, so 1 is returned in *eof in that case. | ||
3927 | */ | ||
3928 | int | ||
3929 | xfs_bmap_eof( | ||
3930 | struct xfs_inode *ip, | ||
3931 | xfs_fileoff_t endoff, | ||
3932 | int whichfork, | ||
3933 | int *eof) | ||
3934 | { | ||
3935 | struct xfs_bmbt_irec rec; | ||
3936 | int error; | ||
3937 | |||
3938 | error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); | ||
3939 | if (error || *eof) | ||
3940 | return error; | ||
3941 | |||
3942 | *eof = endoff >= rec.br_startoff + rec.br_blockcount; | ||
3943 | return 0; | ||
3944 | } | ||
3945 | |||
3998 | /* | 3946 | /* |
3999 | * Returns the file-relative block number of the first block past eof in | 3947 | * Returns the file-relative block number of the first block past eof in |
4000 | * the file. This is not based on i_size, it is based on the extent records. | 3948 | * the file. This is not based on i_size, it is based on the extent records. |
4001 | * Returns 0 for local files, as they do not have extent records. | 3949 | * Returns 0 for local files, as they do not have extent records. |
4002 | */ | 3950 | */ |
4003 | int /* error */ | 3951 | int |
4004 | xfs_bmap_last_offset( | 3952 | xfs_bmap_last_offset( |
4005 | xfs_trans_t *tp, /* transaction pointer */ | 3953 | struct xfs_trans *tp, |
4006 | xfs_inode_t *ip, /* incore inode */ | 3954 | struct xfs_inode *ip, |
4007 | xfs_fileoff_t *last_block, /* last block */ | 3955 | xfs_fileoff_t *last_block, |
4008 | int whichfork) /* data or attr fork */ | 3956 | int whichfork) |
4009 | { | 3957 | { |
4010 | xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ | 3958 | struct xfs_bmbt_irec rec; |
4011 | int error; /* error return value */ | 3959 | int is_empty; |
4012 | xfs_ifork_t *ifp; /* inode fork pointer */ | 3960 | int error; |
4013 | xfs_extnum_t nextents; /* number of extent entries */ | 3961 | |
3962 | *last_block = 0; | ||
3963 | |||
3964 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) | ||
3965 | return 0; | ||
4014 | 3966 | ||
4015 | if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && | 3967 | if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && |
4016 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && | 3968 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) |
4017 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) | ||
4018 | return XFS_ERROR(EIO); | 3969 | return XFS_ERROR(EIO); |
4019 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { | 3970 | |
4020 | *last_block = 0; | 3971 | error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); |
4021 | return 0; | 3972 | if (error || is_empty) |
4022 | } | ||
4023 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
4024 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | ||
4025 | (error = xfs_iread_extents(tp, ip, whichfork))) | ||
4026 | return error; | 3973 | return error; |
4027 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 3974 | |
4028 | if (!nextents) { | 3975 | *last_block = rec.br_startoff + rec.br_blockcount; |
4029 | *last_block = 0; | ||
4030 | return 0; | ||
4031 | } | ||
4032 | ep = xfs_iext_get_ext(ifp, nextents - 1); | ||
4033 | *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); | ||
4034 | return 0; | 3976 | return 0; |
4035 | } | 3977 | } |
4036 | 3978 | ||
@@ -4160,7 +4102,6 @@ xfs_bmap_read_extents( | |||
4160 | xfs_extnum_t num_recs; | 4102 | xfs_extnum_t num_recs; |
4161 | xfs_extnum_t start; | 4103 | xfs_extnum_t start; |
4162 | 4104 | ||
4163 | |||
4164 | num_recs = xfs_btree_get_numrecs(block); | 4105 | num_recs = xfs_btree_get_numrecs(block); |
4165 | if (unlikely(i + num_recs > room)) { | 4106 | if (unlikely(i + num_recs > room)) { |
4166 | ASSERT(i + num_recs <= room); | 4107 | ASSERT(i + num_recs <= room); |
@@ -4283,9 +4224,8 @@ xfs_bmap_validate_ret( | |||
4283 | ASSERT(i == 0 || | 4224 | ASSERT(i == 0 || |
4284 | mval[i - 1].br_startoff + mval[i - 1].br_blockcount == | 4225 | mval[i - 1].br_startoff + mval[i - 1].br_blockcount == |
4285 | mval[i].br_startoff); | 4226 | mval[i].br_startoff); |
4286 | if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) | 4227 | ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && |
4287 | ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && | 4228 | mval[i].br_startblock != HOLESTARTBLOCK); |
4288 | mval[i].br_startblock != HOLESTARTBLOCK); | ||
4289 | ASSERT(mval[i].br_state == XFS_EXT_NORM || | 4229 | ASSERT(mval[i].br_state == XFS_EXT_NORM || |
4290 | mval[i].br_state == XFS_EXT_UNWRITTEN); | 4230 | mval[i].br_state == XFS_EXT_UNWRITTEN); |
4291 | } | 4231 | } |
@@ -4294,66 +4234,609 @@ xfs_bmap_validate_ret( | |||
4294 | 4234 | ||
4295 | 4235 | ||
4296 | /* | 4236 | /* |
4297 | * Map file blocks to filesystem blocks. | 4237 | * Trim the returned map to the required bounds |
4298 | * File range is given by the bno/len pair. | 4238 | */ |
4299 | * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) | 4239 | STATIC void |
4300 | * into a hole or past eof. | 4240 | xfs_bmapi_trim_map( |
4301 | * Only allocates blocks from a single allocation group, | 4241 | struct xfs_bmbt_irec *mval, |
4302 | * to avoid locking problems. | 4242 | struct xfs_bmbt_irec *got, |
4243 | xfs_fileoff_t *bno, | ||
4244 | xfs_filblks_t len, | ||
4245 | xfs_fileoff_t obno, | ||
4246 | xfs_fileoff_t end, | ||
4247 | int n, | ||
4248 | int flags) | ||
4249 | { | ||
4250 | if ((flags & XFS_BMAPI_ENTIRE) || | ||
4251 | got->br_startoff + got->br_blockcount <= obno) { | ||
4252 | *mval = *got; | ||
4253 | if (isnullstartblock(got->br_startblock)) | ||
4254 | mval->br_startblock = DELAYSTARTBLOCK; | ||
4255 | return; | ||
4256 | } | ||
4257 | |||
4258 | if (obno > *bno) | ||
4259 | *bno = obno; | ||
4260 | ASSERT((*bno >= obno) || (n == 0)); | ||
4261 | ASSERT(*bno < end); | ||
4262 | mval->br_startoff = *bno; | ||
4263 | if (isnullstartblock(got->br_startblock)) | ||
4264 | mval->br_startblock = DELAYSTARTBLOCK; | ||
4265 | else | ||
4266 | mval->br_startblock = got->br_startblock + | ||
4267 | (*bno - got->br_startoff); | ||
4268 | /* | ||
4269 | * Return the minimum of what we got and what we asked for as | ||
4270 | * the length. We can use the len variable here because it is | ||
4271 | * modified below and we could have been there before coming | ||
4272 | * here if the first part of the allocation didn't overlap what | ||
4273 | * was asked for. | ||
4274 | */ | ||
4275 | mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, | ||
4276 | got->br_blockcount - (*bno - got->br_startoff)); | ||
4277 | mval->br_state = got->br_state; | ||
4278 | ASSERT(mval->br_blockcount <= len); | ||
4279 | return; | ||
4280 | } | ||
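
xfs_bmapi_trim_map() above clamps a found extent to the window the caller asked for: move the start up to bno, shift the start block by the same amount, and cap the length at both the extent's remainder and the window's end. A self-contained sketch of that arithmetic (the delalloc and ENTIRE cases omitted):

	#include <stdint.h>

	struct irec {				/* miniature xfs_bmbt_irec */
		uint64_t br_startoff;
		uint64_t br_startblock;
		uint64_t br_blockcount;
	};

	static uint64_t min_u64(uint64_t a, uint64_t b)
	{
		return a < b ? a : b;
	}

	/* Clamp the extent *got to the caller's [bno, end) window. */
	static void trim_map(struct irec *mval, const struct irec *got,
			     uint64_t bno, uint64_t end)
	{
		uint64_t skip = bno - got->br_startoff;

		mval->br_startoff = bno;
		mval->br_startblock = got->br_startblock + skip;
		mval->br_blockcount = min_u64(end - bno,
					      got->br_blockcount - skip);
	}
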
4281 | |||
4282 | /* | ||
4283 | * Update and validate the extent map to return | ||
4284 | */ | ||
4285 | STATIC void | ||
4286 | xfs_bmapi_update_map( | ||
4287 | struct xfs_bmbt_irec **map, | ||
4288 | xfs_fileoff_t *bno, | ||
4289 | xfs_filblks_t *len, | ||
4290 | xfs_fileoff_t obno, | ||
4291 | xfs_fileoff_t end, | ||
4292 | int *n, | ||
4293 | int flags) | ||
4294 | { | ||
4295 | xfs_bmbt_irec_t *mval = *map; | ||
4296 | |||
4297 | ASSERT((flags & XFS_BMAPI_ENTIRE) || | ||
4298 | ((mval->br_startoff + mval->br_blockcount) <= end)); | ||
4299 | ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || | ||
4300 | (mval->br_startoff < obno)); | ||
4301 | |||
4302 | *bno = mval->br_startoff + mval->br_blockcount; | ||
4303 | *len = end - *bno; | ||
4304 | if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { | ||
4305 | /* update previous map with new information */ | ||
4306 | ASSERT(mval->br_startblock == mval[-1].br_startblock); | ||
4307 | ASSERT(mval->br_blockcount > mval[-1].br_blockcount); | ||
4308 | ASSERT(mval->br_state == mval[-1].br_state); | ||
4309 | mval[-1].br_blockcount = mval->br_blockcount; | ||
4310 | mval[-1].br_state = mval->br_state; | ||
4311 | } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && | ||
4312 | mval[-1].br_startblock != DELAYSTARTBLOCK && | ||
4313 | mval[-1].br_startblock != HOLESTARTBLOCK && | ||
4314 | mval->br_startblock == mval[-1].br_startblock + | ||
4315 | mval[-1].br_blockcount && | ||
4316 | ((flags & XFS_BMAPI_IGSTATE) || | ||
4317 | mval[-1].br_state == mval->br_state)) { | ||
4318 | ASSERT(mval->br_startoff == | ||
4319 | mval[-1].br_startoff + mval[-1].br_blockcount); | ||
4320 | mval[-1].br_blockcount += mval->br_blockcount; | ||
4321 | } else if (*n > 0 && | ||
4322 | mval->br_startblock == DELAYSTARTBLOCK && | ||
4323 | mval[-1].br_startblock == DELAYSTARTBLOCK && | ||
4324 | mval->br_startoff == | ||
4325 | mval[-1].br_startoff + mval[-1].br_blockcount) { | ||
4326 | mval[-1].br_blockcount += mval->br_blockcount; | ||
4327 | mval[-1].br_state = mval->br_state; | ||
4328 | } else if (!((*n == 0) && | ||
4329 | ((mval->br_startoff + mval->br_blockcount) <= | ||
4330 | obno))) { | ||
4331 | mval++; | ||
4332 | (*n)++; | ||
4333 | } | ||
4334 | *map = mval; | ||
4335 | } | ||
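
The middle branch of xfs_bmapi_update_map() above is a run-length merge: when the new mapping continues the previous one both logically and physically, extend the previous entry instead of emitting a new one. A reduced sketch covering just that case (the delalloc and same-start variants are analogous); struct irec repeats the miniature record from the earlier sketch so the block stands alone:

	#include <stdint.h>

	struct irec {				/* miniature xfs_bmbt_irec */
		uint64_t br_startoff;
		uint64_t br_startblock;
		uint64_t br_blockcount;
	};

	/* Append *mval to map[], merging it into map[*n - 1] when it is
	 * an exact logical and physical continuation of that entry. */
	static void update_map(struct irec *map, int *n, const struct irec *mval)
	{
		if (*n > 0) {
			struct irec *prev = &map[*n - 1];

			if (mval->br_startoff ==
				prev->br_startoff + prev->br_blockcount &&
			    mval->br_startblock ==
				prev->br_startblock + prev->br_blockcount) {
				prev->br_blockcount += mval->br_blockcount;
				return;
			}
		}
		map[(*n)++] = *mval;
	}
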
4336 | |||
4337 | /* | ||
4338 | * Map file blocks to filesystem blocks without allocation. | ||
4339 | */ | ||
4340 | int | ||
4341 | xfs_bmapi_read( | ||
4342 | struct xfs_inode *ip, | ||
4343 | xfs_fileoff_t bno, | ||
4344 | xfs_filblks_t len, | ||
4345 | struct xfs_bmbt_irec *mval, | ||
4346 | int *nmap, | ||
4347 | int flags) | ||
4348 | { | ||
4349 | struct xfs_mount *mp = ip->i_mount; | ||
4350 | struct xfs_ifork *ifp; | ||
4351 | struct xfs_bmbt_irec got; | ||
4352 | struct xfs_bmbt_irec prev; | ||
4353 | xfs_fileoff_t obno; | ||
4354 | xfs_fileoff_t end; | ||
4355 | xfs_extnum_t lastx; | ||
4356 | int error; | ||
4357 | int eof; | ||
4358 | int n = 0; | ||
4359 | int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | ||
4360 | XFS_ATTR_FORK : XFS_DATA_FORK; | ||
4361 | |||
4362 | ASSERT(*nmap >= 1); | ||
4363 | ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| | ||
4364 | XFS_BMAPI_IGSTATE))); | ||
4365 | |||
4366 | if (unlikely(XFS_TEST_ERROR( | ||
4367 | (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && | ||
4368 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), | ||
4369 | mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { | ||
4370 | XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); | ||
4371 | return XFS_ERROR(EFSCORRUPTED); | ||
4372 | } | ||
4373 | |||
4374 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
4375 | return XFS_ERROR(EIO); | ||
4376 | |||
4377 | XFS_STATS_INC(xs_blk_mapr); | ||
4378 | |||
4379 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
4380 | ASSERT(ifp->if_ext_max == | ||
4381 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | ||
4382 | |||
4383 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { | ||
4384 | error = xfs_iread_extents(NULL, ip, whichfork); | ||
4385 | if (error) | ||
4386 | return error; | ||
4387 | } | ||
4388 | |||
4389 | xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); | ||
4390 | end = bno + len; | ||
4391 | obno = bno; | ||
4392 | |||
4393 | while (bno < end && n < *nmap) { | ||
4394 | /* Reading past eof, act as though there's a hole up to end. */ | ||
4395 | if (eof) | ||
4396 | got.br_startoff = end; | ||
4397 | if (got.br_startoff > bno) { | ||
4398 | /* Reading in a hole. */ | ||
4399 | mval->br_startoff = bno; | ||
4400 | mval->br_startblock = HOLESTARTBLOCK; | ||
4401 | mval->br_blockcount = | ||
4402 | XFS_FILBLKS_MIN(len, got.br_startoff - bno); | ||
4403 | mval->br_state = XFS_EXT_NORM; | ||
4404 | bno += mval->br_blockcount; | ||
4405 | len -= mval->br_blockcount; | ||
4406 | mval++; | ||
4407 | n++; | ||
4408 | continue; | ||
4409 | } | ||
4410 | |||
4411 | /* set up the extent map to return. */ | ||
4412 | xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); | ||
4413 | xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); | ||
4414 | |||
4415 | /* If we're done, stop now. */ | ||
4416 | if (bno >= end || n >= *nmap) | ||
4417 | break; | ||
4418 | |||
4419 | /* Else go on to the next record. */ | ||
4420 | if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) | ||
4421 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); | ||
4422 | else | ||
4423 | eof = 1; | ||
4424 | } | ||
4425 | *nmap = n; | ||
4426 | return 0; | ||
4427 | } | ||
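
The loop in xfs_bmapi_read() above has two cases per iteration: synthesize a hole mapping up to the next extent, or trim the current extent into the output. A self-contained user-space model of that walk over a sorted extent array; the HOLESTARTBLOCK value here is illustrative only:

	#include <stdint.h>
	#include <stddef.h>

	#define HOLESTARTBLOCK ((uint64_t)-2)	/* illustrative sentinel */

	struct irec {				/* miniature xfs_bmbt_irec */
		uint64_t br_startoff;
		uint64_t br_startblock;
		uint64_t br_blockcount;
	};

	/* Map [bno, bno + len) against a sorted, non-overlapping extent
	 * list, emitting hole records for the gaps; returns the number
	 * of mappings written, at most nmap. */
	static int map_read(const struct irec *recs, size_t nrecs,
			    uint64_t bno, uint64_t len,
			    struct irec *mval, int nmap)
	{
		uint64_t end = bno + len;
		size_t i = 0;
		int n = 0;

		while (bno < end && n < nmap) {
			/* skip extents that end at or before bno */
			while (i < nrecs &&
			       recs[i].br_startoff + recs[i].br_blockcount <= bno)
				i++;

			if (i == nrecs || recs[i].br_startoff > bno) {
				/* in a hole: emit one up to the next extent */
				uint64_t hend = (i == nrecs ||
						 recs[i].br_startoff > end) ?
						end : recs[i].br_startoff;

				mval[n].br_startoff = bno;
				mval[n].br_startblock = HOLESTARTBLOCK;
				mval[n].br_blockcount = hend - bno;
				bno = hend;
				n++;
				continue;
			}

			/* inside an extent: trim it to [bno, end) */
			mval[n].br_startoff = bno;
			mval[n].br_startblock = recs[i].br_startblock +
						(bno - recs[i].br_startoff);
			mval[n].br_blockcount = recs[i].br_startoff +
						recs[i].br_blockcount - bno;
			if (bno + mval[n].br_blockcount > end)
				mval[n].br_blockcount = end - bno;
			bno += mval[n].br_blockcount;
			n++;
			i++;
		}
		return n;
	}
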
4428 | |||
4429 | STATIC int | ||
4430 | xfs_bmapi_reserve_delalloc( | ||
4431 | struct xfs_inode *ip, | ||
4432 | xfs_fileoff_t aoff, | ||
4433 | xfs_filblks_t len, | ||
4434 | struct xfs_bmbt_irec *got, | ||
4435 | struct xfs_bmbt_irec *prev, | ||
4436 | xfs_extnum_t *lastx, | ||
4437 | int eof) | ||
4438 | { | ||
4439 | struct xfs_mount *mp = ip->i_mount; | ||
4440 | struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | ||
4441 | xfs_extlen_t alen; | ||
4442 | xfs_extlen_t indlen; | ||
4443 | char rt = XFS_IS_REALTIME_INODE(ip); | ||
4444 | xfs_extlen_t extsz; | ||
4445 | int error; | ||
4446 | |||
4447 | alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); | ||
4448 | if (!eof) | ||
4449 | alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); | ||
4450 | |||
4451 | /* Figure out the extent size, adjust alen */ | ||
4452 | extsz = xfs_get_extsz_hint(ip); | ||
4453 | if (extsz) { | ||
4454 | /* | ||
4455 | * Make sure we don't exceed a single extent length when we | ||
4456 | * align the extent by reducing the length we are going to | ||
4457 | * allocate by the maximum amount extent size alignment may | ||
4458 | * require. | ||
4459 | */ | ||
4460 | alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); | ||
4461 | error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, | ||
4462 | 1, 0, &aoff, &alen); | ||
4463 | ASSERT(!error); | ||
4464 | } | ||
4465 | |||
4466 | if (rt) | ||
4467 | extsz = alen / mp->m_sb.sb_rextsize; | ||
4468 | |||
4469 | /* | ||
4470 | * Make a transaction-less quota reservation for delayed allocation | ||
4471 | * blocks. This number gets adjusted later. We return early if the | ||
4472 | * reservation fails, since no blocks have been allocated yet. | ||
4473 | */ | ||
4474 | error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, | ||
4475 | rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); | ||
4476 | if (error) | ||
4477 | return error; | ||
4478 | |||
4479 | /* | ||
4480 | * Update the superblock counters for alen and indlen separately, | ||
4481 | * since they may come from different counters. | ||
4482 | */ | ||
4483 | indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); | ||
4484 | ASSERT(indlen > 0); | ||
4485 | |||
4486 | if (rt) { | ||
4487 | error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, | ||
4488 | -((int64_t)extsz), 0); | ||
4489 | } else { | ||
4490 | error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, | ||
4491 | -((int64_t)alen), 0); | ||
4492 | } | ||
4493 | |||
4494 | if (error) | ||
4495 | goto out_unreserve_quota; | ||
4496 | |||
4497 | error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, | ||
4498 | -((int64_t)indlen), 0); | ||
4499 | if (error) | ||
4500 | goto out_unreserve_blocks; | ||
4501 | |||
4502 | |||
4503 | ip->i_delayed_blks += alen; | ||
4504 | |||
4505 | got->br_startoff = aoff; | ||
4506 | got->br_startblock = nullstartblock(indlen); | ||
4507 | got->br_blockcount = alen; | ||
4508 | got->br_state = XFS_EXT_NORM; | ||
4509 | xfs_bmap_add_extent_hole_delay(ip, lastx, got); | ||
4510 | |||
4511 | /* | ||
4512 | * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay | ||
4513 | * might have merged it into one of the neighbouring ones. | ||
4514 | */ | ||
4515 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); | ||
4516 | |||
4517 | ASSERT(got->br_startoff <= aoff); | ||
4518 | ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); | ||
4519 | ASSERT(isnullstartblock(got->br_startblock)); | ||
4520 | ASSERT(got->br_state == XFS_EXT_NORM); | ||
4521 | return 0; | ||
4522 | |||
4523 | out_unreserve_blocks: | ||
4524 | if (rt) | ||
4525 | xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); | ||
4526 | else | ||
4527 | xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); | ||
4528 | out_unreserve_quota: | ||
4529 | if (XFS_IS_QUOTA_ON(mp)) | ||
4530 | xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? | ||
4531 | XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); | ||
4532 | return error; | ||
4533 | } | ||
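
xfs_bmapi_reserve_delalloc() above takes three reservations in sequence (quota, data blocks, worst-case indirect blocks) and unwinds them in reverse order on failure. A stripped-down sketch of that goto-unwind shape; the reserve_*()/unreserve_*() helpers are hypothetical stubs, not kernel functions:

	/* Hypothetical stubs; each real counterpart returns 0 or an error. */
	static int reserve_quota(long nblks)      { (void)nblks; return 0; }
	static int reserve_blocks(long nblks)     { (void)nblks; return 0; }
	static void unreserve_quota(long nblks)   { (void)nblks; }
	static void unreserve_blocks(long nblks)  { (void)nblks; }

	/* Take the three reservations in order, unwinding in reverse if
	 * a later one fails. */
	static int reserve_delalloc(long alen, long indlen)
	{
		int error;

		error = reserve_quota(alen);
		if (error)
			return error;

		error = reserve_blocks(alen);		/* the data blocks */
		if (error)
			goto out_unreserve_quota;

		error = reserve_blocks(indlen);		/* worst-case bmbt blocks */
		if (error)
			goto out_unreserve_blocks;

		return 0;

	out_unreserve_blocks:
		unreserve_blocks(alen);
	out_unreserve_quota:
		unreserve_quota(alen);
		return error;
	}
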
4534 | |||
4535 | /* | ||
4536 | * Map file blocks to filesystem blocks, adding delayed allocations as needed. | ||
4537 | */ | ||
4538 | int | ||
4539 | xfs_bmapi_delay( | ||
4540 | struct xfs_inode *ip, /* incore inode */ | ||
4541 | xfs_fileoff_t bno, /* starting file offs. mapped */ | ||
4542 | xfs_filblks_t len, /* length to map in file */ | ||
4543 | struct xfs_bmbt_irec *mval, /* output: map values */ | ||
4544 | int *nmap, /* i/o: mval size/count */ | ||
4545 | int flags) /* XFS_BMAPI_... */ | ||
4546 | { | ||
4547 | struct xfs_mount *mp = ip->i_mount; | ||
4548 | struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); | ||
4549 | struct xfs_bmbt_irec got; /* current file extent record */ | ||
4550 | struct xfs_bmbt_irec prev; /* previous file extent record */ | ||
4551 | xfs_fileoff_t obno; /* old block number (offset) */ | ||
4552 | xfs_fileoff_t end; /* end of mapped file region */ | ||
4553 | xfs_extnum_t lastx; /* last useful extent number */ | ||
4554 | int eof; /* we've hit the end of extents */ | ||
4555 | int n = 0; /* current extent index */ | ||
4556 | int error = 0; | ||
4557 | |||
4558 | ASSERT(*nmap >= 1); | ||
4559 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); | ||
4560 | ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); | ||
4561 | |||
4562 | if (unlikely(XFS_TEST_ERROR( | ||
4563 | (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && | ||
4564 | XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), | ||
4565 | mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { | ||
4566 | XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); | ||
4567 | return XFS_ERROR(EFSCORRUPTED); | ||
4568 | } | ||
4569 | |||
4570 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
4571 | return XFS_ERROR(EIO); | ||
4572 | |||
4573 | XFS_STATS_INC(xs_blk_mapw); | ||
4574 | |||
4575 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { | ||
4576 | error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); | ||
4577 | if (error) | ||
4578 | return error; | ||
4579 | } | ||
4580 | |||
4581 | xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); | ||
4582 | end = bno + len; | ||
4583 | obno = bno; | ||
4584 | |||
4585 | while (bno < end && n < *nmap) { | ||
4586 | if (eof || got.br_startoff > bno) { | ||
4587 | error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, | ||
4588 | &prev, &lastx, eof); | ||
4589 | if (error) { | ||
4590 | if (n == 0) { | ||
4591 | *nmap = 0; | ||
4592 | return error; | ||
4593 | } | ||
4594 | break; | ||
4595 | } | ||
4596 | } | ||
4597 | |||
4598 | /* set up the extent map to return. */ | ||
4599 | xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); | ||
4600 | xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); | ||
4601 | |||
4602 | /* If we're done, stop now. */ | ||
4603 | if (bno >= end || n >= *nmap) | ||
4604 | break; | ||
4605 | |||
4606 | /* Else go on to the next record. */ | ||
4607 | prev = got; | ||
4608 | if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) | ||
4609 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); | ||
4610 | else | ||
4611 | eof = 1; | ||
4612 | } | ||
4613 | |||
4614 | *nmap = n; | ||
4615 | return 0; | ||
4616 | } | ||
4617 | |||
4618 | |||
4619 | STATIC int | ||
4620 | xfs_bmapi_allocate( | ||
4621 | struct xfs_bmalloca *bma, | ||
4622 | int flags) | ||
4623 | { | ||
4624 | struct xfs_mount *mp = bma->ip->i_mount; | ||
4625 | int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | ||
4626 | XFS_ATTR_FORK : XFS_DATA_FORK; | ||
4627 | struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); | ||
4628 | int tmp_logflags = 0; | ||
4629 | int error; | ||
4630 | int rt; | ||
4631 | |||
4632 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); | ||
4633 | |||
4634 | /* | ||
4635 | * For the wasdelay case, we could also just allocate the stuff asked | ||
4636 | * for in this bmap call but that wouldn't be as good. | ||
4637 | */ | ||
4638 | if (bma->wasdel) { | ||
4639 | bma->length = (xfs_extlen_t)bma->got.br_blockcount; | ||
4640 | bma->offset = bma->got.br_startoff; | ||
4641 | if (bma->idx != NULLEXTNUM && bma->idx) { | ||
4642 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), | ||
4643 | &bma->prev); | ||
4644 | } | ||
4645 | } else { | ||
4646 | bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); | ||
4647 | if (!bma->eof) | ||
4648 | bma->length = XFS_FILBLKS_MIN(bma->length, | ||
4649 | bma->got.br_startoff - bma->offset); | ||
4650 | } | ||
4651 | |||
4652 | /* | ||
4653 | * Indicate if this is the first user data in the file, or just any | ||
4654 | * user data. | ||
4655 | */ | ||
4656 | if (!(flags & XFS_BMAPI_METADATA)) { | ||
4657 | bma->userdata = (bma->offset == 0) ? | ||
4658 | XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; | ||
4659 | } | ||
4660 | |||
4661 | bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; | ||
4662 | |||
4663 | /* | ||
4664 | * Only want to do the alignment at the eof if it is userdata and | ||
4665 | * allocation length is larger than a stripe unit. | ||
4666 | */ | ||
4667 | if (mp->m_dalign && bma->length >= mp->m_dalign && | ||
4668 | !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { | ||
4669 | error = xfs_bmap_isaeof(bma, whichfork); | ||
4670 | if (error) | ||
4671 | return error; | ||
4672 | } | ||
4673 | |||
4674 | error = xfs_bmap_alloc(bma); | ||
4675 | if (error) | ||
4676 | return error; | ||
4677 | |||
4678 | if (bma->flist->xbf_low) | ||
4679 | bma->minleft = 0; | ||
4680 | if (bma->cur) | ||
4681 | bma->cur->bc_private.b.firstblock = *bma->firstblock; | ||
4682 | if (bma->blkno == NULLFSBLOCK) | ||
4683 | return 0; | ||
4684 | if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { | ||
4685 | bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); | ||
4686 | bma->cur->bc_private.b.firstblock = *bma->firstblock; | ||
4687 | bma->cur->bc_private.b.flist = bma->flist; | ||
4688 | } | ||
4689 | /* | ||
4690 | * Bump the number of extents we've allocated | ||
4691 | * in this call. | ||
4692 | */ | ||
4693 | bma->nallocs++; | ||
4694 | |||
4695 | if (bma->cur) | ||
4696 | bma->cur->bc_private.b.flags = | ||
4697 | bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; | ||
4698 | |||
4699 | bma->got.br_startoff = bma->offset; | ||
4700 | bma->got.br_startblock = bma->blkno; | ||
4701 | bma->got.br_blockcount = bma->length; | ||
4702 | bma->got.br_state = XFS_EXT_NORM; | ||
4703 | |||
4704 | /* | ||
4705 | * A wasdelay extent has been initialized, so shouldn't be flagged | ||
4706 | * as unwritten. | ||
4707 | */ | ||
4708 | if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && | ||
4709 | xfs_sb_version_hasextflgbit(&mp->m_sb)) | ||
4710 | bma->got.br_state = XFS_EXT_UNWRITTEN; | ||
4711 | |||
4712 | if (bma->wasdel) | ||
4713 | error = xfs_bmap_add_extent_delay_real(bma); | ||
4714 | else | ||
4715 | error = xfs_bmap_add_extent_hole_real(bma, whichfork); | ||
4716 | |||
4717 | bma->logflags |= tmp_logflags; | ||
4718 | if (error) | ||
4719 | return error; | ||
4720 | |||
4721 | /* | ||
4722 | * Update our extent pointer, given that xfs_bmap_add_extent_delay_real | ||
4723 | * or xfs_bmap_add_extent_hole_real might have merged it into one of | ||
4724 | * the neighbouring ones. | ||
4725 | */ | ||
4726 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); | ||
4727 | |||
4728 | ASSERT(bma->got.br_startoff <= bma->offset); | ||
4729 | ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= | ||
4730 | bma->offset + bma->length); | ||
4731 | ASSERT(bma->got.br_state == XFS_EXT_NORM || | ||
4732 | bma->got.br_state == XFS_EXT_UNWRITTEN); | ||
4733 | return 0; | ||
4734 | } | ||
4735 | |||
4736 | STATIC int | ||
4737 | xfs_bmapi_convert_unwritten( | ||
4738 | struct xfs_bmalloca *bma, | ||
4739 | struct xfs_bmbt_irec *mval, | ||
4740 | xfs_filblks_t len, | ||
4741 | int flags) | ||
4742 | { | ||
4743 | int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | ||
4744 | XFS_ATTR_FORK : XFS_DATA_FORK; | ||
4745 | struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); | ||
4746 | int tmp_logflags = 0; | ||
4747 | int error; | ||
4748 | |||
4749 | /* check if we need to do unwritten->real conversion */ | ||
4750 | if (mval->br_state == XFS_EXT_UNWRITTEN && | ||
4751 | (flags & XFS_BMAPI_PREALLOC)) | ||
4752 | return 0; | ||
4753 | |||
4754 | /* check if we need to do real->unwritten conversion */ | ||
4755 | if (mval->br_state == XFS_EXT_NORM && | ||
4756 | (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != | ||
4757 | (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) | ||
4758 | return 0; | ||
4759 | |||
4760 | /* | ||
4761 | * Modify (by adding) the state flag, if writing. | ||
4762 | */ | ||
4763 | ASSERT(mval->br_blockcount <= len); | ||
4764 | if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { | ||
4765 | bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, | ||
4766 | bma->ip, whichfork); | ||
4767 | bma->cur->bc_private.b.firstblock = *bma->firstblock; | ||
4768 | bma->cur->bc_private.b.flist = bma->flist; | ||
4769 | } | ||
4770 | mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) | ||
4771 | ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; | ||
4772 | |||
4773 | error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, | ||
4774 | &bma->cur, mval, bma->firstblock, bma->flist, | ||
4775 | &tmp_logflags); | ||
4776 | bma->logflags |= tmp_logflags; | ||
4777 | if (error) | ||
4778 | return error; | ||
4779 | |||
4780 | /* | ||
4781 | * Update our extent pointer, given that | ||
4782 | * xfs_bmap_add_extent_unwritten_real might have merged it into one | ||
4783 | * of the neighbouring ones. | ||
4784 | */ | ||
4785 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); | ||
4786 | |||
4787 | /* | ||
4788 | * We may have combined previously unwritten space with written space, | ||
4789 | * so generate another request. | ||
4790 | */ | ||
4791 | if (mval->br_blockcount < len) | ||
4792 | return EAGAIN; | ||
4793 | return 0; | ||
4794 | } | ||
4795 | |||
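Editor's note: xfs_bmapi_convert_unwritten() above bails out early when the extent state already matches what the flags request, and returns the positive errno EAGAIN when only part of the request was converted, so the caller's loop runs again. A minimal userspace sketch of the bail-out predicate follows, using assumed flag values that mirror the renumbered header later in this diff; it is an illustration, not the kernel code.

	#include <stdio.h>

	#define BMAPI_PREALLOC	0x008	/* stand-in for XFS_BMAPI_PREALLOC */
	#define BMAPI_CONVERT	0x040	/* stand-in for XFS_BMAPI_CONVERT */

	enum ext_state { EXT_NORM, EXT_UNWRITTEN };

	/* Mirrors the two early returns: conversion happens only when the
	 * extent state disagrees with what the flags ask for. */
	static int needs_conversion(enum ext_state state, int flags)
	{
		if (state == EXT_UNWRITTEN && (flags & BMAPI_PREALLOC))
			return 0;	/* preallocating: unwritten is the goal */
		if (state == EXT_NORM &&
		    (flags & (BMAPI_PREALLOC | BMAPI_CONVERT)) !=
		    (BMAPI_PREALLOC | BMAPI_CONVERT))
			return 0;	/* written->unwritten needs both flags */
		return 1;
	}

	int main(void)
	{
		printf("%d\n", needs_conversion(EXT_UNWRITTEN, 0));	/* 1 */
		printf("%d\n", needs_conversion(EXT_UNWRITTEN,
						BMAPI_PREALLOC));	/* 0 */
		printf("%d\n", needs_conversion(EXT_NORM,
					BMAPI_PREALLOC | BMAPI_CONVERT));/* 1 */
		return 0;
	}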
4796 | /* | ||
4797 | * Map file blocks to filesystem blocks, and allocate blocks or convert the | ||
4798 | extent state if necessary. Detailed behaviour is controlled by the flags | ||
4799 | * parameter. Only allocates blocks from a single allocation group, to avoid | ||
4800 | * locking problems. | ||
4801 | * | ||
4303 | * The returned value in "firstblock" from the first call in a transaction | 4802 | * The returned value in "firstblock" from the first call in a transaction |
4304 | * must be remembered and presented to subsequent calls in "firstblock". | 4803 | * must be remembered and presented to subsequent calls in "firstblock". |
4305 | * An upper bound for the number of blocks to be allocated is supplied to | 4804 | * An upper bound for the number of blocks to be allocated is supplied to |
4306 | * the first call in "total"; if no allocation group has that many free | 4805 | * the first call in "total"; if no allocation group has that many free |
4307 | * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). | 4806 | * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). |
4308 | */ | 4807 | */ |
4309 | int /* error */ | 4808 | int |
4310 | xfs_bmapi( | 4809 | xfs_bmapi_write( |
4311 | xfs_trans_t *tp, /* transaction pointer */ | 4810 | struct xfs_trans *tp, /* transaction pointer */ |
4312 | xfs_inode_t *ip, /* incore inode */ | 4811 | struct xfs_inode *ip, /* incore inode */ |
4313 | xfs_fileoff_t bno, /* starting file offs. mapped */ | 4812 | xfs_fileoff_t bno, /* starting file offs. mapped */ |
4314 | xfs_filblks_t len, /* length to map in file */ | 4813 | xfs_filblks_t len, /* length to map in file */ |
4315 | int flags, /* XFS_BMAPI_... */ | 4814 | int flags, /* XFS_BMAPI_... */ |
4316 | xfs_fsblock_t *firstblock, /* first allocated block | 4815 | xfs_fsblock_t *firstblock, /* first allocated block |
4317 | controls a.g. for allocs */ | 4816 | controls a.g. for allocs */ |
4318 | xfs_extlen_t total, /* total blocks needed */ | 4817 | xfs_extlen_t total, /* total blocks needed */ |
4319 | xfs_bmbt_irec_t *mval, /* output: map values */ | 4818 | struct xfs_bmbt_irec *mval, /* output: map values */ |
4320 | int *nmap, /* i/o: mval size/count */ | 4819 | int *nmap, /* i/o: mval size/count */ |
4321 | xfs_bmap_free_t *flist) /* i/o: list extents to free */ | 4820 | struct xfs_bmap_free *flist) /* i/o: list extents to free */ |
4322 | { | 4821 | { |
4323 | xfs_fsblock_t abno; /* allocated block number */ | 4822 | struct xfs_mount *mp = ip->i_mount; |
4324 | xfs_extlen_t alen; /* allocated extent length */ | 4823 | struct xfs_ifork *ifp; |
4325 | xfs_fileoff_t aoff; /* allocated file offset */ | 4824 | struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ |
4326 | xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */ | 4825 | xfs_fileoff_t end; /* end of mapped file region */ |
4327 | xfs_btree_cur_t *cur; /* bmap btree cursor */ | 4826 | int eof; /* after the end of extents */ |
4328 | xfs_fileoff_t end; /* end of mapped file region */ | 4827 | int error; /* error return */ |
4329 | int eof; /* we've hit the end of extents */ | 4828 | int n; /* current extent index */ |
4330 | xfs_bmbt_rec_host_t *ep; /* extent record pointer */ | 4829 | xfs_fileoff_t obno; /* old block number (offset) */ |
4331 | int error; /* error return */ | 4830 | int whichfork; /* data or attr fork */ |
4332 | xfs_bmbt_irec_t got; /* current file extent record */ | 4831 | char inhole; /* current location is hole in file */ |
4333 | xfs_ifork_t *ifp; /* inode fork pointer */ | 4832 | char wasdelay; /* old extent was delayed */ |
4334 | xfs_extlen_t indlen; /* indirect blocks length */ | 4833 | |
4335 | xfs_extnum_t lastx; /* last useful extent number */ | ||
4336 | int logflags; /* flags for transaction logging */ | ||
4337 | xfs_extlen_t minleft; /* min blocks left after allocation */ | ||
4338 | xfs_extlen_t minlen; /* min allocation size */ | ||
4339 | xfs_mount_t *mp; /* xfs mount structure */ | ||
4340 | int n; /* current extent index */ | ||
4341 | int nallocs; /* number of extents alloc'd */ | ||
4342 | xfs_extnum_t nextents; /* number of extents in file */ | ||
4343 | xfs_fileoff_t obno; /* old block number (offset) */ | ||
4344 | xfs_bmbt_irec_t prev; /* previous file extent record */ | ||
4345 | int tmp_logflags; /* temp flags holder */ | ||
4346 | int whichfork; /* data or attr fork */ | ||
4347 | char inhole; /* current location is hole in file */ | ||
4348 | char wasdelay; /* old extent was delayed */ | ||
4349 | char wr; /* this is a write request */ | ||
4350 | char rt; /* this is a realtime file */ | ||
4351 | #ifdef DEBUG | 4834 | #ifdef DEBUG |
4352 | xfs_fileoff_t orig_bno; /* original block number value */ | 4835 | xfs_fileoff_t orig_bno; /* original block number value */ |
4353 | int orig_flags; /* original flags arg value */ | 4836 | int orig_flags; /* original flags arg value */ |
4354 | xfs_filblks_t orig_len; /* original value of len arg */ | 4837 | xfs_filblks_t orig_len; /* original value of len arg */ |
4355 | xfs_bmbt_irec_t *orig_mval; /* original value of mval */ | 4838 | struct xfs_bmbt_irec *orig_mval; /* original value of mval */ |
4356 | int orig_nmap; /* original value of *nmap */ | 4839 | int orig_nmap; /* original value of *nmap */ |
4357 | 4840 | ||
4358 | orig_bno = bno; | 4841 | orig_bno = bno; |
4359 | orig_len = len; | 4842 | orig_len = len; |
@@ -4361,488 +4844,133 @@ xfs_bmapi( | |||
4361 | orig_mval = mval; | 4844 | orig_mval = mval; |
4362 | orig_nmap = *nmap; | 4845 | orig_nmap = *nmap; |
4363 | #endif | 4846 | #endif |
4847 | |||
4364 | ASSERT(*nmap >= 1); | 4848 | ASSERT(*nmap >= 1); |
4365 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); | 4849 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); |
4850 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); | ||
4851 | ASSERT(tp != NULL); | ||
4852 | |||
4366 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | 4853 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? |
4367 | XFS_ATTR_FORK : XFS_DATA_FORK; | 4854 | XFS_ATTR_FORK : XFS_DATA_FORK; |
4368 | mp = ip->i_mount; | 4855 | |
4369 | if (unlikely(XFS_TEST_ERROR( | 4856 | if (unlikely(XFS_TEST_ERROR( |
4370 | (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && | 4857 | (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && |
4371 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && | 4858 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && |
4372 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), | 4859 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), |
4373 | mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { | 4860 | mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { |
4374 | XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); | 4861 | XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); |
4375 | return XFS_ERROR(EFSCORRUPTED); | 4862 | return XFS_ERROR(EFSCORRUPTED); |
4376 | } | 4863 | } |
4864 | |||
4377 | if (XFS_FORCED_SHUTDOWN(mp)) | 4865 | if (XFS_FORCED_SHUTDOWN(mp)) |
4378 | return XFS_ERROR(EIO); | 4866 | return XFS_ERROR(EIO); |
4379 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); | 4867 | |
4380 | ifp = XFS_IFORK_PTR(ip, whichfork); | 4868 | ifp = XFS_IFORK_PTR(ip, whichfork); |
4381 | ASSERT(ifp->if_ext_max == | 4869 | ASSERT(ifp->if_ext_max == |
4382 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); | 4870 | XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); |
4383 | if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) | 4871 | |
4384 | XFS_STATS_INC(xs_blk_mapw); | 4872 | XFS_STATS_INC(xs_blk_mapw); |
4385 | else | 4873 | |
4386 | XFS_STATS_INC(xs_blk_mapr); | ||
4387 | /* | ||
4388 | * IGSTATE flag is used to combine extents which | ||
4389 | * differ only due to the state of the extents. | ||
4390 | * This technique is used from xfs_getbmap() | ||
4391 | * when the caller does not wish to see the | ||
4392 | * separation (which is the default). | ||
4393 | * | ||
4394 | * This technique is also used when writing a | ||
4395 | * buffer which has been partially written | ||
4396 | * (usually by being flushed during a chunkread), | ||
4397 | * to ensure one write takes place. This also | ||
4398 | * prevents a change in the xfs inode extents at | ||
4399 | * this time, intentionally. This change occurs | ||
4400 | * on completion of the write operation, in | ||
4401 | * xfs_strat_comp(), where the xfs_bmapi() call | ||
4402 | * is transactioned, and the extents combined. | ||
4403 | */ | ||
4404 | if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ | ||
4405 | wr = 0; /* no allocations are allowed */ | ||
4406 | ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); | ||
4407 | logflags = 0; | ||
4408 | nallocs = 0; | ||
4409 | cur = NULL; | ||
4410 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { | 4874 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { |
4411 | ASSERT(wr && tp); | 4875 | error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, |
4412 | if ((error = xfs_bmap_local_to_extents(tp, ip, | 4876 | &bma.logflags, whichfork); |
4413 | firstblock, total, &logflags, whichfork))) | 4877 | if (error) |
4414 | goto error0; | 4878 | goto error0; |
4415 | } | 4879 | } |
4416 | if (wr && *firstblock == NULLFSBLOCK) { | 4880 | |
4881 | if (*firstblock == NULLFSBLOCK) { | ||
4417 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) | 4882 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) |
4418 | minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; | 4883 | bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; |
4419 | else | 4884 | else |
4420 | minleft = 1; | 4885 | bma.minleft = 1; |
4421 | } else | 4886 | } else { |
4422 | minleft = 0; | 4887 | bma.minleft = 0; |
4423 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | 4888 | } |
4424 | (error = xfs_iread_extents(tp, ip, whichfork))) | 4889 | |
4425 | goto error0; | 4890 | if (!(ifp->if_flags & XFS_IFEXTENTS)) { |
4426 | ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, | 4891 | error = xfs_iread_extents(tp, ip, whichfork); |
4427 | &prev); | 4892 | if (error) |
4428 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 4893 | goto error0; |
4894 | } | ||
4895 | |||
4896 | xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, | ||
4897 | &bma.prev); | ||
4429 | n = 0; | 4898 | n = 0; |
4430 | end = bno + len; | 4899 | end = bno + len; |
4431 | obno = bno; | 4900 | obno = bno; |
4432 | bma.ip = NULL; | 4901 | |
4902 | bma.tp = tp; | ||
4903 | bma.ip = ip; | ||
4904 | bma.total = total; | ||
4905 | bma.userdata = 0; | ||
4906 | bma.flist = flist; | ||
4907 | bma.firstblock = firstblock; | ||
4433 | 4908 | ||
4434 | while (bno < end && n < *nmap) { | 4909 | while (bno < end && n < *nmap) { |
4435 | /* | 4910 | inhole = eof || bma.got.br_startoff > bno; |
4436 | * Reading past eof, act as though there's a hole | 4911 | wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); |
4437 | * up to end. | 4912 | |
4438 | */ | ||
4439 | if (eof && !wr) | ||
4440 | got.br_startoff = end; | ||
4441 | inhole = eof || got.br_startoff > bno; | ||
4442 | wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && | ||
4443 | isnullstartblock(got.br_startblock); | ||
4444 | /* | 4913 | /* |
4445 | * First, deal with the hole before the allocated space | 4914 | * First, deal with the hole before the allocated space |
4446 | * that we found, if any. | 4915 | * that we found, if any. |
4447 | */ | 4916 | */ |
4448 | if (wr && (inhole || wasdelay)) { | 4917 | if (inhole || wasdelay) { |
4449 | /* | 4918 | bma.eof = eof; |
4450 | * For the wasdelay case, we could also just | 4919 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); |
4451 | * allocate the stuff asked for in this bmap call | 4920 | bma.wasdel = wasdelay; |
4452 | * but that wouldn't be as good. | 4921 | bma.length = len; |
4453 | */ | 4922 | bma.offset = bno; |
4454 | if (wasdelay) { | 4923 | |
4455 | alen = (xfs_extlen_t)got.br_blockcount; | 4924 | error = xfs_bmapi_allocate(&bma, flags); |
4456 | aoff = got.br_startoff; | ||
4457 | if (lastx != NULLEXTNUM && lastx) { | ||
4458 | ep = xfs_iext_get_ext(ifp, lastx - 1); | ||
4459 | xfs_bmbt_get_all(ep, &prev); | ||
4460 | } | ||
4461 | } else { | ||
4462 | alen = (xfs_extlen_t) | ||
4463 | XFS_FILBLKS_MIN(len, MAXEXTLEN); | ||
4464 | if (!eof) | ||
4465 | alen = (xfs_extlen_t) | ||
4466 | XFS_FILBLKS_MIN(alen, | ||
4467 | got.br_startoff - bno); | ||
4468 | aoff = bno; | ||
4469 | } | ||
4470 | minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; | ||
4471 | if (flags & XFS_BMAPI_DELAY) { | ||
4472 | xfs_extlen_t extsz; | ||
4473 | |||
4474 | /* Figure out the extent size, adjust alen */ | ||
4475 | extsz = xfs_get_extsz_hint(ip); | ||
4476 | if (extsz) { | ||
4477 | /* | ||
4478 | * make sure we don't exceed a single | ||
4479 | * extent length when we align the | ||
4480 | * extent by reducing length we are | ||
4481 | * going to allocate by the maximum | ||
4482 | * amount extent size alignment may | ||
4483 | * require. | ||
4484 | */ | ||
4485 | alen = XFS_FILBLKS_MIN(len, | ||
4486 | MAXEXTLEN - (2 * extsz - 1)); | ||
4487 | error = xfs_bmap_extsize_align(mp, | ||
4488 | &got, &prev, extsz, | ||
4489 | rt, eof, | ||
4490 | flags&XFS_BMAPI_DELAY, | ||
4491 | flags&XFS_BMAPI_CONVERT, | ||
4492 | &aoff, &alen); | ||
4493 | ASSERT(!error); | ||
4494 | } | ||
4495 | |||
4496 | if (rt) | ||
4497 | extsz = alen / mp->m_sb.sb_rextsize; | ||
4498 | |||
4499 | /* | ||
4500 | * Make a transaction-less quota reservation for | ||
4501 | * delayed allocation blocks. This number gets | ||
4502 | * adjusted later. We return if we haven't | ||
4503 | * allocated blocks already inside this loop. | ||
4504 | */ | ||
4505 | error = xfs_trans_reserve_quota_nblks( | ||
4506 | NULL, ip, (long)alen, 0, | ||
4507 | rt ? XFS_QMOPT_RES_RTBLKS : | ||
4508 | XFS_QMOPT_RES_REGBLKS); | ||
4509 | if (error) { | ||
4510 | if (n == 0) { | ||
4511 | *nmap = 0; | ||
4512 | ASSERT(cur == NULL); | ||
4513 | return error; | ||
4514 | } | ||
4515 | break; | ||
4516 | } | ||
4517 | |||
4518 | /* | ||
4519 | * Split changing sb for alen and indlen since | ||
4520 | * they could be coming from different places. | ||
4521 | */ | ||
4522 | indlen = (xfs_extlen_t) | ||
4523 | xfs_bmap_worst_indlen(ip, alen); | ||
4524 | ASSERT(indlen > 0); | ||
4525 | |||
4526 | if (rt) { | ||
4527 | error = xfs_mod_incore_sb(mp, | ||
4528 | XFS_SBS_FREXTENTS, | ||
4529 | -((int64_t)extsz), 0); | ||
4530 | } else { | ||
4531 | error = xfs_icsb_modify_counters(mp, | ||
4532 | XFS_SBS_FDBLOCKS, | ||
4533 | -((int64_t)alen), 0); | ||
4534 | } | ||
4535 | if (!error) { | ||
4536 | error = xfs_icsb_modify_counters(mp, | ||
4537 | XFS_SBS_FDBLOCKS, | ||
4538 | -((int64_t)indlen), 0); | ||
4539 | if (error && rt) | ||
4540 | xfs_mod_incore_sb(mp, | ||
4541 | XFS_SBS_FREXTENTS, | ||
4542 | (int64_t)extsz, 0); | ||
4543 | else if (error) | ||
4544 | xfs_icsb_modify_counters(mp, | ||
4545 | XFS_SBS_FDBLOCKS, | ||
4546 | (int64_t)alen, 0); | ||
4547 | } | ||
4548 | |||
4549 | if (error) { | ||
4550 | if (XFS_IS_QUOTA_ON(mp)) | ||
4551 | /* unreserve the blocks now */ | ||
4552 | (void) | ||
4553 | xfs_trans_unreserve_quota_nblks( | ||
4554 | NULL, ip, | ||
4555 | (long)alen, 0, rt ? | ||
4556 | XFS_QMOPT_RES_RTBLKS : | ||
4557 | XFS_QMOPT_RES_REGBLKS); | ||
4558 | break; | ||
4559 | } | ||
4560 | |||
4561 | ip->i_delayed_blks += alen; | ||
4562 | abno = nullstartblock(indlen); | ||
4563 | } else { | ||
4564 | /* | ||
4565 | * If first time, allocate and fill in | ||
4566 | * once-only bma fields. | ||
4567 | */ | ||
4568 | if (bma.ip == NULL) { | ||
4569 | bma.tp = tp; | ||
4570 | bma.ip = ip; | ||
4571 | bma.prevp = &prev; | ||
4572 | bma.gotp = &got; | ||
4573 | bma.total = total; | ||
4574 | bma.userdata = 0; | ||
4575 | } | ||
4576 | /* Indicate if this is the first user data | ||
4577 | * in the file, or just any user data. | ||
4578 | */ | ||
4579 | if (!(flags & XFS_BMAPI_METADATA)) { | ||
4580 | bma.userdata = (aoff == 0) ? | ||
4581 | XFS_ALLOC_INITIAL_USER_DATA : | ||
4582 | XFS_ALLOC_USERDATA; | ||
4583 | } | ||
4584 | /* | ||
4585 | * Fill in changeable bma fields. | ||
4586 | */ | ||
4587 | bma.eof = eof; | ||
4588 | bma.firstblock = *firstblock; | ||
4589 | bma.alen = alen; | ||
4590 | bma.off = aoff; | ||
4591 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); | ||
4592 | bma.wasdel = wasdelay; | ||
4593 | bma.minlen = minlen; | ||
4594 | bma.low = flist->xbf_low; | ||
4595 | bma.minleft = minleft; | ||
4596 | /* | ||
4597 | * Only want to do the alignment at the | ||
4598 | * eof if it is userdata and allocation length | ||
4599 | * is larger than a stripe unit. | ||
4600 | */ | ||
4601 | if (mp->m_dalign && alen >= mp->m_dalign && | ||
4602 | (!(flags & XFS_BMAPI_METADATA)) && | ||
4603 | (whichfork == XFS_DATA_FORK)) { | ||
4604 | if ((error = xfs_bmap_isaeof(ip, aoff, | ||
4605 | whichfork, &bma.aeof))) | ||
4606 | goto error0; | ||
4607 | } else | ||
4608 | bma.aeof = 0; | ||
4609 | /* | ||
4610 | * Call allocator. | ||
4611 | */ | ||
4612 | if ((error = xfs_bmap_alloc(&bma))) | ||
4613 | goto error0; | ||
4614 | /* | ||
4615 | * Copy out result fields. | ||
4616 | */ | ||
4617 | abno = bma.rval; | ||
4618 | if ((flist->xbf_low = bma.low)) | ||
4619 | minleft = 0; | ||
4620 | alen = bma.alen; | ||
4621 | aoff = bma.off; | ||
4622 | ASSERT(*firstblock == NULLFSBLOCK || | ||
4623 | XFS_FSB_TO_AGNO(mp, *firstblock) == | ||
4624 | XFS_FSB_TO_AGNO(mp, bma.firstblock) || | ||
4625 | (flist->xbf_low && | ||
4626 | XFS_FSB_TO_AGNO(mp, *firstblock) < | ||
4627 | XFS_FSB_TO_AGNO(mp, bma.firstblock))); | ||
4628 | *firstblock = bma.firstblock; | ||
4629 | if (cur) | ||
4630 | cur->bc_private.b.firstblock = | ||
4631 | *firstblock; | ||
4632 | if (abno == NULLFSBLOCK) | ||
4633 | break; | ||
4634 | if ((ifp->if_flags & XFS_IFBROOT) && !cur) { | ||
4635 | cur = xfs_bmbt_init_cursor(mp, tp, | ||
4636 | ip, whichfork); | ||
4637 | cur->bc_private.b.firstblock = | ||
4638 | *firstblock; | ||
4639 | cur->bc_private.b.flist = flist; | ||
4640 | } | ||
4641 | /* | ||
4642 | * Bump the number of extents we've allocated | ||
4643 | * in this call. | ||
4644 | */ | ||
4645 | nallocs++; | ||
4646 | } | ||
4647 | if (cur) | ||
4648 | cur->bc_private.b.flags = | ||
4649 | wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; | ||
4650 | got.br_startoff = aoff; | ||
4651 | got.br_startblock = abno; | ||
4652 | got.br_blockcount = alen; | ||
4653 | got.br_state = XFS_EXT_NORM; /* assume normal */ | ||
4654 | /* | ||
4655 | * Determine state of extent, and the filesystem. | ||
4656 | * A wasdelay extent has been initialized, so | ||
4657 | * shouldn't be flagged as unwritten. | ||
4658 | */ | ||
4659 | if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) { | ||
4660 | if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) | ||
4661 | got.br_state = XFS_EXT_UNWRITTEN; | ||
4662 | } | ||
4663 | error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &got, | ||
4664 | firstblock, flist, &tmp_logflags, | ||
4665 | whichfork); | ||
4666 | logflags |= tmp_logflags; | ||
4667 | if (error) | 4925 | if (error) |
4668 | goto error0; | 4926 | goto error0; |
4669 | ep = xfs_iext_get_ext(ifp, lastx); | 4927 | if (bma.blkno == NULLFSBLOCK) |
4670 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 4928 | break; |
4671 | xfs_bmbt_get_all(ep, &got); | ||
4672 | ASSERT(got.br_startoff <= aoff); | ||
4673 | ASSERT(got.br_startoff + got.br_blockcount >= | ||
4674 | aoff + alen); | ||
4675 | #ifdef DEBUG | ||
4676 | if (flags & XFS_BMAPI_DELAY) { | ||
4677 | ASSERT(isnullstartblock(got.br_startblock)); | ||
4678 | ASSERT(startblockval(got.br_startblock) > 0); | ||
4679 | } | ||
4680 | ASSERT(got.br_state == XFS_EXT_NORM || | ||
4681 | got.br_state == XFS_EXT_UNWRITTEN); | ||
4682 | #endif | ||
4683 | /* | ||
4684 | * Fall down into the found allocated space case. | ||
4685 | */ | ||
4686 | } else if (inhole) { | ||
4687 | /* | ||
4688 | * Reading in a hole. | ||
4689 | */ | ||
4690 | mval->br_startoff = bno; | ||
4691 | mval->br_startblock = HOLESTARTBLOCK; | ||
4692 | mval->br_blockcount = | ||
4693 | XFS_FILBLKS_MIN(len, got.br_startoff - bno); | ||
4694 | mval->br_state = XFS_EXT_NORM; | ||
4695 | bno += mval->br_blockcount; | ||
4696 | len -= mval->br_blockcount; | ||
4697 | mval++; | ||
4698 | n++; | ||
4699 | continue; | ||
4700 | } | ||
4701 | /* | ||
4702 | * Then deal with the allocated space we found. | ||
4703 | */ | ||
4704 | ASSERT(ep != NULL); | ||
4705 | if (!(flags & XFS_BMAPI_ENTIRE) && | ||
4706 | (got.br_startoff + got.br_blockcount > obno)) { | ||
4707 | if (obno > bno) | ||
4708 | bno = obno; | ||
4709 | ASSERT((bno >= obno) || (n == 0)); | ||
4710 | ASSERT(bno < end); | ||
4711 | mval->br_startoff = bno; | ||
4712 | if (isnullstartblock(got.br_startblock)) { | ||
4713 | ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); | ||
4714 | mval->br_startblock = DELAYSTARTBLOCK; | ||
4715 | } else | ||
4716 | mval->br_startblock = | ||
4717 | got.br_startblock + | ||
4718 | (bno - got.br_startoff); | ||
4719 | /* | ||
4720 | * Return the minimum of what we got and what we | ||
4721 | * asked for, for the length. We can use the len | ||
4722 | * variable here because it is modified below | ||
4723 | * and we could have been there before coming | ||
4724 | * here if the first part of the allocation | ||
4725 | * didn't overlap what was asked for. | ||
4726 | */ | ||
4727 | mval->br_blockcount = | ||
4728 | XFS_FILBLKS_MIN(end - bno, got.br_blockcount - | ||
4729 | (bno - got.br_startoff)); | ||
4730 | mval->br_state = got.br_state; | ||
4731 | ASSERT(mval->br_blockcount <= len); | ||
4732 | } else { | ||
4733 | *mval = got; | ||
4734 | if (isnullstartblock(mval->br_startblock)) { | ||
4735 | ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); | ||
4736 | mval->br_startblock = DELAYSTARTBLOCK; | ||
4737 | } | ||
4738 | } | 4929 | } |
4739 | 4930 | ||
4740 | /* | 4931 | /* Deal with the allocated space we found. */ |
4741 | * Check if writing previously allocated but | 4932 | xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, |
4742 | * unwritten extents. | 4933 | end, n, flags); |
4743 | */ | 4934 | |
4744 | if (wr && | 4935 | /* Execute unwritten extent conversion if necessary */ |
4745 | ((mval->br_state == XFS_EXT_UNWRITTEN && | 4936 | error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); |
4746 | ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || | 4937 | if (error == EAGAIN) |
4747 | (mval->br_state == XFS_EXT_NORM && | 4938 | continue; |
4748 | ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == | 4939 | if (error) |
4749 | (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { | 4940 | goto error0; |
4750 | /* | 4941 | |
4751 | * Modify (by adding) the state flag, if writing. | 4942 | /* update the extent map to return */ |
4752 | */ | 4943 | xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); |
4753 | ASSERT(mval->br_blockcount <= len); | ||
4754 | if ((ifp->if_flags & XFS_IFBROOT) && !cur) { | ||
4755 | cur = xfs_bmbt_init_cursor(mp, | ||
4756 | tp, ip, whichfork); | ||
4757 | cur->bc_private.b.firstblock = | ||
4758 | *firstblock; | ||
4759 | cur->bc_private.b.flist = flist; | ||
4760 | } | ||
4761 | mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) | ||
4762 | ? XFS_EXT_NORM | ||
4763 | : XFS_EXT_UNWRITTEN; | ||
4764 | error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, mval, | ||
4765 | firstblock, flist, &tmp_logflags, | ||
4766 | whichfork); | ||
4767 | logflags |= tmp_logflags; | ||
4768 | if (error) | ||
4769 | goto error0; | ||
4770 | ep = xfs_iext_get_ext(ifp, lastx); | ||
4771 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | ||
4772 | xfs_bmbt_get_all(ep, &got); | ||
4773 | /* | ||
4774 | * We may have combined previously unwritten | ||
4775 | * space with written space, so generate | ||
4776 | * another request. | ||
4777 | */ | ||
4778 | if (mval->br_blockcount < len) | ||
4779 | continue; | ||
4780 | } | ||
4781 | 4944 | ||
4782 | ASSERT((flags & XFS_BMAPI_ENTIRE) || | ||
4783 | ((mval->br_startoff + mval->br_blockcount) <= end)); | ||
4784 | ASSERT((flags & XFS_BMAPI_ENTIRE) || | ||
4785 | (mval->br_blockcount <= len) || | ||
4786 | (mval->br_startoff < obno)); | ||
4787 | bno = mval->br_startoff + mval->br_blockcount; | ||
4788 | len = end - bno; | ||
4789 | if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { | ||
4790 | ASSERT(mval->br_startblock == mval[-1].br_startblock); | ||
4791 | ASSERT(mval->br_blockcount > mval[-1].br_blockcount); | ||
4792 | ASSERT(mval->br_state == mval[-1].br_state); | ||
4793 | mval[-1].br_blockcount = mval->br_blockcount; | ||
4794 | mval[-1].br_state = mval->br_state; | ||
4795 | } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && | ||
4796 | mval[-1].br_startblock != DELAYSTARTBLOCK && | ||
4797 | mval[-1].br_startblock != HOLESTARTBLOCK && | ||
4798 | mval->br_startblock == | ||
4799 | mval[-1].br_startblock + mval[-1].br_blockcount && | ||
4800 | ((flags & XFS_BMAPI_IGSTATE) || | ||
4801 | mval[-1].br_state == mval->br_state)) { | ||
4802 | ASSERT(mval->br_startoff == | ||
4803 | mval[-1].br_startoff + mval[-1].br_blockcount); | ||
4804 | mval[-1].br_blockcount += mval->br_blockcount; | ||
4805 | } else if (n > 0 && | ||
4806 | mval->br_startblock == DELAYSTARTBLOCK && | ||
4807 | mval[-1].br_startblock == DELAYSTARTBLOCK && | ||
4808 | mval->br_startoff == | ||
4809 | mval[-1].br_startoff + mval[-1].br_blockcount) { | ||
4810 | mval[-1].br_blockcount += mval->br_blockcount; | ||
4811 | mval[-1].br_state = mval->br_state; | ||
4812 | } else if (!((n == 0) && | ||
4813 | ((mval->br_startoff + mval->br_blockcount) <= | ||
4814 | obno))) { | ||
4815 | mval++; | ||
4816 | n++; | ||
4817 | } | ||
4818 | /* | 4945 | /* |
4819 | * If we're done, stop now. Stop when we've allocated | 4946 | * If we're done, stop now. Stop when we've allocated |
4820 | * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise | 4947 | * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise |
4821 | * the transaction may get too big. | 4948 | * the transaction may get too big. |
4822 | */ | 4949 | */ |
4823 | if (bno >= end || n >= *nmap || nallocs >= *nmap) | 4950 | if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) |
4824 | break; | 4951 | break; |
4825 | /* | 4952 | |
4826 | * Else go on to the next record. | 4953 | /* Else go on to the next record. */ |
4827 | */ | 4954 | bma.prev = bma.got; |
4828 | prev = got; | 4955 | if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { |
4829 | if (++lastx < nextents) { | 4956 | xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), |
4830 | ep = xfs_iext_get_ext(ifp, lastx); | 4957 | &bma.got); |
4831 | xfs_bmbt_get_all(ep, &got); | 4958 | } else |
4832 | } else { | ||
4833 | eof = 1; | 4959 | eof = 1; |
4834 | } | ||
4835 | } | 4960 | } |
4836 | *nmap = n; | 4961 | *nmap = n; |
4962 | |||
4837 | /* | 4963 | /* |
4838 | * Transform from btree to extents, give it cur. | 4964 | * Transform from btree to extents, give it cur. |
4839 | */ | 4965 | */ |
4840 | if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && | 4966 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && |
4841 | XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { | 4967 | XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { |
4842 | ASSERT(wr && cur); | 4968 | int tmp_logflags = 0; |
4843 | error = xfs_bmap_btree_to_extents(tp, ip, cur, | 4969 | |
4970 | ASSERT(bma.cur); | ||
4971 | error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, | ||
4844 | &tmp_logflags, whichfork); | 4972 | &tmp_logflags, whichfork); |
4845 | logflags |= tmp_logflags; | 4973 | bma.logflags |= tmp_logflags; |
4846 | if (error) | 4974 | if (error) |
4847 | goto error0; | 4975 | goto error0; |
4848 | } | 4976 | } |
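Editor's note: the loop above caps itself at *nmap mappings and bma.nallocs allocations so a single transaction cannot grow unboundedly. To make the firstblock/total contract from the function comment concrete, here is a hypothetical caller fragment. It is kernel-context pseudocode built only from the prototypes this patch introduces; tp, ip, offset_fsb, count_fsb and total are assumed caller state, and the fragment is not compilable on its own.

	/* Hypothetical caller fragment; surrounding declarations assumed. */
	xfs_fsblock_t		firstblock = NULLFSBLOCK; /* first call in tp */
	struct xfs_bmap_free	flist;
	struct xfs_bmbt_irec	map[XFS_BMAP_MAX_NMAP];
	int			nmap = XFS_BMAP_MAX_NMAP;
	int			error;

	xfs_bmap_init(&flist, &firstblock);
	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
				XFS_BMAPI_PREALLOC, &firstblock, total,
				map, &nmap, &flist);
	/* A later xfs_bmapi_write() in the same transaction must be handed
	 * the firstblock value this call wrote back. */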
@@ -4856,34 +4984,33 @@ error0: | |||
4856 | * Log everything. Do this after conversion, there's no point in | 4984 | * Log everything. Do this after conversion, there's no point in |
4857 | * logging the extent records if we've converted to btree format. | 4985 | * logging the extent records if we've converted to btree format. |
4858 | */ | 4986 | */ |
4859 | if ((logflags & xfs_ilog_fext(whichfork)) && | 4987 | if ((bma.logflags & xfs_ilog_fext(whichfork)) && |
4860 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) | 4988 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) |
4861 | logflags &= ~xfs_ilog_fext(whichfork); | 4989 | bma.logflags &= ~xfs_ilog_fext(whichfork); |
4862 | else if ((logflags & xfs_ilog_fbroot(whichfork)) && | 4990 | else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && |
4863 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) | 4991 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) |
4864 | logflags &= ~xfs_ilog_fbroot(whichfork); | 4992 | bma.logflags &= ~xfs_ilog_fbroot(whichfork); |
4865 | /* | 4993 | /* |
4866 | * Log whatever the flags say, even if error. Otherwise we might miss | 4994 | * Log whatever the flags say, even if error. Otherwise we might miss |
4867 | * detecting a case where the data is changed, there's an error, | 4995 | * detecting a case where the data is changed, there's an error, |
4868 | * and it's not logged so we don't shutdown when we should. | 4996 | * and it's not logged so we don't shutdown when we should. |
4869 | */ | 4997 | */ |
4870 | if (logflags) { | 4998 | if (bma.logflags) |
4871 | ASSERT(tp && wr); | 4999 | xfs_trans_log_inode(tp, ip, bma.logflags); |
4872 | xfs_trans_log_inode(tp, ip, logflags); | 5000 | |
4873 | } | 5001 | if (bma.cur) { |
4874 | if (cur) { | ||
4875 | if (!error) { | 5002 | if (!error) { |
4876 | ASSERT(*firstblock == NULLFSBLOCK || | 5003 | ASSERT(*firstblock == NULLFSBLOCK || |
4877 | XFS_FSB_TO_AGNO(mp, *firstblock) == | 5004 | XFS_FSB_TO_AGNO(mp, *firstblock) == |
4878 | XFS_FSB_TO_AGNO(mp, | 5005 | XFS_FSB_TO_AGNO(mp, |
4879 | cur->bc_private.b.firstblock) || | 5006 | bma.cur->bc_private.b.firstblock) || |
4880 | (flist->xbf_low && | 5007 | (flist->xbf_low && |
4881 | XFS_FSB_TO_AGNO(mp, *firstblock) < | 5008 | XFS_FSB_TO_AGNO(mp, *firstblock) < |
4882 | XFS_FSB_TO_AGNO(mp, | 5009 | XFS_FSB_TO_AGNO(mp, |
4883 | cur->bc_private.b.firstblock))); | 5010 | bma.cur->bc_private.b.firstblock))); |
4884 | *firstblock = cur->bc_private.b.firstblock; | 5011 | *firstblock = bma.cur->bc_private.b.firstblock; |
4885 | } | 5012 | } |
4886 | xfs_btree_del_cursor(cur, | 5013 | xfs_btree_del_cursor(bma.cur, |
4887 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | 5014 | error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); |
4888 | } | 5015 | } |
4889 | if (!error) | 5016 | if (!error) |
@@ -4893,58 +5020,6 @@ error0: | |||
4893 | } | 5020 | } |
4894 | 5021 | ||
4895 | /* | 5022 | /* |
4896 | * Map file blocks to filesystem blocks, simple version. | ||
4897 | * One block (extent) only, read-only. | ||
4898 | * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. | ||
4899 | * For the other flag values, the effect is as if XFS_BMAPI_METADATA | ||
4900 | * was set and all the others were clear. | ||
4901 | */ | ||
4902 | int /* error */ | ||
4903 | xfs_bmapi_single( | ||
4904 | xfs_trans_t *tp, /* transaction pointer */ | ||
4905 | xfs_inode_t *ip, /* incore inode */ | ||
4906 | int whichfork, /* data or attr fork */ | ||
4907 | xfs_fsblock_t *fsb, /* output: mapped block */ | ||
4908 | xfs_fileoff_t bno) /* starting file offs. mapped */ | ||
4909 | { | ||
4910 | int eof; /* we've hit the end of extents */ | ||
4911 | int error; /* error return */ | ||
4912 | xfs_bmbt_irec_t got; /* current file extent record */ | ||
4913 | xfs_ifork_t *ifp; /* inode fork pointer */ | ||
4914 | xfs_extnum_t lastx; /* last useful extent number */ | ||
4915 | xfs_bmbt_irec_t prev; /* previous file extent record */ | ||
4916 | |||
4917 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
4918 | if (unlikely( | ||
4919 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && | ||
4920 | XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { | ||
4921 | XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, | ||
4922 | ip->i_mount); | ||
4923 | return XFS_ERROR(EFSCORRUPTED); | ||
4924 | } | ||
4925 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | ||
4926 | return XFS_ERROR(EIO); | ||
4927 | XFS_STATS_INC(xs_blk_mapr); | ||
4928 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | ||
4929 | (error = xfs_iread_extents(tp, ip, whichfork))) | ||
4930 | return error; | ||
4931 | (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, | ||
4932 | &prev); | ||
4933 | /* | ||
4934 | * Reading past eof, act as though there's a hole | ||
4935 | * up to end. | ||
4936 | */ | ||
4937 | if (eof || got.br_startoff > bno) { | ||
4938 | *fsb = NULLFSBLOCK; | ||
4939 | return 0; | ||
4940 | } | ||
4941 | ASSERT(!isnullstartblock(got.br_startblock)); | ||
4942 | ASSERT(bno < got.br_startoff + got.br_blockcount); | ||
4943 | *fsb = got.br_startblock + (bno - got.br_startoff); | ||
4944 | return 0; | ||
4945 | } | ||
4946 | |||
4947 | /* | ||
4948 | * Unmap (remove) blocks from a file. | 5023 | * Unmap (remove) blocks from a file. |
4949 | * If nexts is nonzero then the number of extents to remove is limited to | 5024 | * If nexts is nonzero then the number of extents to remove is limited to |
4950 | * that value. If not all extents in the block range can be removed then | 5025 | * that value. If not all extents in the block range can be removed then |
@@ -5115,9 +5190,9 @@ xfs_bunmapi( | |||
5115 | del.br_blockcount = mod; | 5190 | del.br_blockcount = mod; |
5116 | } | 5191 | } |
5117 | del.br_state = XFS_EXT_UNWRITTEN; | 5192 | del.br_state = XFS_EXT_UNWRITTEN; |
5118 | error = xfs_bmap_add_extent(tp, ip, &lastx, &cur, &del, | 5193 | error = xfs_bmap_add_extent_unwritten_real(tp, ip, |
5119 | firstblock, flist, &logflags, | 5194 | &lastx, &cur, &del, firstblock, flist, |
5120 | XFS_DATA_FORK); | 5195 | &logflags); |
5121 | if (error) | 5196 | if (error) |
5122 | goto error0; | 5197 | goto error0; |
5123 | goto nodelete; | 5198 | goto nodelete; |
@@ -5173,18 +5248,18 @@ xfs_bunmapi( | |||
5173 | } | 5248 | } |
5174 | prev.br_state = XFS_EXT_UNWRITTEN; | 5249 | prev.br_state = XFS_EXT_UNWRITTEN; |
5175 | lastx--; | 5250 | lastx--; |
5176 | error = xfs_bmap_add_extent(tp, ip, &lastx, | 5251 | error = xfs_bmap_add_extent_unwritten_real(tp, |
5177 | &cur, &prev, firstblock, flist, | 5252 | ip, &lastx, &cur, &prev, |
5178 | &logflags, XFS_DATA_FORK); | 5253 | firstblock, flist, &logflags); |
5179 | if (error) | 5254 | if (error) |
5180 | goto error0; | 5255 | goto error0; |
5181 | goto nodelete; | 5256 | goto nodelete; |
5182 | } else { | 5257 | } else { |
5183 | ASSERT(del.br_state == XFS_EXT_NORM); | 5258 | ASSERT(del.br_state == XFS_EXT_NORM); |
5184 | del.br_state = XFS_EXT_UNWRITTEN; | 5259 | del.br_state = XFS_EXT_UNWRITTEN; |
5185 | error = xfs_bmap_add_extent(tp, ip, &lastx, | 5260 | error = xfs_bmap_add_extent_unwritten_real(tp, |
5186 | &cur, &del, firstblock, flist, | 5261 | ip, &lastx, &cur, &del, |
5187 | &logflags, XFS_DATA_FORK); | 5262 | firstblock, flist, &logflags); |
5188 | if (error) | 5263 | if (error) |
5189 | goto error0; | 5264 | goto error0; |
5190 | goto nodelete; | 5265 | goto nodelete; |
@@ -5506,10 +5581,9 @@ xfs_getbmap( | |||
5506 | 5581 | ||
5507 | do { | 5582 | do { |
5508 | nmap = (nexleft > subnex) ? subnex : nexleft; | 5583 | nmap = (nexleft > subnex) ? subnex : nexleft; |
5509 | error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), | 5584 | error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), |
5510 | XFS_BB_TO_FSB(mp, bmv->bmv_length), | 5585 | XFS_BB_TO_FSB(mp, bmv->bmv_length), |
5511 | bmapi_flags, NULL, 0, map, &nmap, | 5586 | map, &nmap, bmapi_flags); |
5512 | NULL); | ||
5513 | if (error) | 5587 | if (error) |
5514 | goto out_free_map; | 5588 | goto out_free_map; |
5515 | ASSERT(nmap <= subnex); | 5589 | ASSERT(nmap <= subnex); |
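Editor's note: the hunk above is a representative caller-side conversion. The read path drops the transaction, firstblock, total and flist arguments entirely, since no allocation can happen. A hedged minimal fragment of the new call shape, with caller names assumed:

	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			error;

	/* Read-only lookup of one file block; holes come back as a
	 * mapping with a HOLESTARTBLOCK start block. */
	error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimaps, 0);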
@@ -5583,89 +5657,6 @@ xfs_getbmap( | |||
5583 | return error; | 5657 | return error; |
5584 | } | 5658 | } |
5585 | 5659 | ||
5586 | /* | ||
5587 | * Check the last inode extent to determine whether this allocation will result | ||
5588 | * in blocks being allocated at the end of the file. When we allocate new data | ||
5589 | * blocks at the end of the file which do not start at the previous data block, | ||
5590 | * we will try to align the new blocks at stripe unit boundaries. | ||
5591 | */ | ||
5592 | STATIC int /* error */ | ||
5593 | xfs_bmap_isaeof( | ||
5594 | xfs_inode_t *ip, /* incore inode pointer */ | ||
5595 | xfs_fileoff_t off, /* file offset in fsblocks */ | ||
5596 | int whichfork, /* data or attribute fork */ | ||
5597 | char *aeof) /* return value */ | ||
5598 | { | ||
5599 | int error; /* error return value */ | ||
5600 | xfs_ifork_t *ifp; /* inode fork pointer */ | ||
5601 | xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ | ||
5602 | xfs_extnum_t nextents; /* number of file extents */ | ||
5603 | xfs_bmbt_irec_t s; /* expanded extent record */ | ||
5604 | |||
5605 | ASSERT(whichfork == XFS_DATA_FORK); | ||
5606 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
5607 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | ||
5608 | (error = xfs_iread_extents(NULL, ip, whichfork))) | ||
5609 | return error; | ||
5610 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | ||
5611 | if (nextents == 0) { | ||
5612 | *aeof = 1; | ||
5613 | return 0; | ||
5614 | } | ||
5615 | /* | ||
5616 | * Go to the last extent | ||
5617 | */ | ||
5618 | lastrec = xfs_iext_get_ext(ifp, nextents - 1); | ||
5619 | xfs_bmbt_get_all(lastrec, &s); | ||
5620 | /* | ||
5621 | * Check we are allocating in the last extent (for delayed allocations) | ||
5622 | * or past the last extent for non-delayed allocations. | ||
5623 | */ | ||
5624 | *aeof = (off >= s.br_startoff && | ||
5625 | off < s.br_startoff + s.br_blockcount && | ||
5626 | isnullstartblock(s.br_startblock)) || | ||
5627 | off >= s.br_startoff + s.br_blockcount; | ||
5628 | return 0; | ||
5629 | } | ||
5630 | |||
5631 | /* | ||
5632 | * Check if the endoff is outside the last extent. If so the caller will grow | ||
5633 | * the allocation to a stripe unit boundary. | ||
5634 | */ | ||
5635 | int /* error */ | ||
5636 | xfs_bmap_eof( | ||
5637 | xfs_inode_t *ip, /* incore inode pointer */ | ||
5638 | xfs_fileoff_t endoff, /* file offset in fsblocks */ | ||
5639 | int whichfork, /* data or attribute fork */ | ||
5640 | int *eof) /* result value */ | ||
5641 | { | ||
5642 | xfs_fsblock_t blockcount; /* extent block count */ | ||
5643 | int error; /* error return value */ | ||
5644 | xfs_ifork_t *ifp; /* inode fork pointer */ | ||
5645 | xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */ | ||
5646 | xfs_extnum_t nextents; /* number of file extents */ | ||
5647 | xfs_fileoff_t startoff; /* extent starting file offset */ | ||
5648 | |||
5649 | ASSERT(whichfork == XFS_DATA_FORK); | ||
5650 | ifp = XFS_IFORK_PTR(ip, whichfork); | ||
5651 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | ||
5652 | (error = xfs_iread_extents(NULL, ip, whichfork))) | ||
5653 | return error; | ||
5654 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | ||
5655 | if (nextents == 0) { | ||
5656 | *eof = 1; | ||
5657 | return 0; | ||
5658 | } | ||
5659 | /* | ||
5660 | * Go to the last extent | ||
5661 | */ | ||
5662 | lastrec = xfs_iext_get_ext(ifp, nextents - 1); | ||
5663 | startoff = xfs_bmbt_get_startoff(lastrec); | ||
5664 | blockcount = xfs_bmbt_get_blockcount(lastrec); | ||
5665 | *eof = endoff >= startoff + blockcount; | ||
5666 | return 0; | ||
5667 | } | ||
5668 | |||
5669 | #ifdef DEBUG | 5660 | #ifdef DEBUG |
5670 | STATIC struct xfs_buf * | 5661 | STATIC struct xfs_buf * |
5671 | xfs_bmap_get_bp( | 5662 | xfs_bmap_get_bp( |
@@ -6100,9 +6091,8 @@ xfs_bmap_punch_delalloc_range( | |||
6100 | * trying to remove a real extent (which requires a | 6091 | * trying to remove a real extent (which requires a |
6101 | * transaction) or a hole, which is probably a bad idea... | 6092 | * transaction) or a hole, which is probably a bad idea... |
6102 | */ | 6093 | */ |
6103 | error = xfs_bmapi(NULL, ip, start_fsb, 1, | 6094 | error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, |
6104 | XFS_BMAPI_ENTIRE, NULL, 0, &imap, | 6095 | XFS_BMAPI_ENTIRE); |
6105 | &nimaps, NULL); | ||
6106 | 6096 | ||
6107 | if (error) { | 6097 | if (error) { |
6108 | /* something screwed, just bail */ | 6098 | /* something screwed, just bail */ |
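Editor's note: the two helpers deleted above boil down to one-line predicates on the last extent record, and the tests are easy to get wrong by an off-by-one. A standalone model under assumed names, not kernel code:

	#include <stdbool.h>

	typedef unsigned long long fileoff_t;	/* stand-in for xfs_fileoff_t */

	/* Model of xfs_bmap_eof(): does endoff land past the last extent? */
	static bool past_last_extent(fileoff_t endoff, fileoff_t startoff,
				     fileoff_t blockcount)
	{
		return endoff >= startoff + blockcount;
	}

	/* Model of xfs_bmap_isaeof(): allocating inside the last extent only
	 * counts as "at EOF" when that extent is a delayed allocation. */
	static bool alloc_at_eof(fileoff_t off, fileoff_t startoff,
				 fileoff_t blockcount, bool last_is_delayed)
	{
		if (off >= startoff && off < startoff + blockcount)
			return last_is_delayed;
		return off >= startoff + blockcount;
	}

	int main(void)
	{
		/* offset 100 past a last extent covering [40, 90): both true */
		return (past_last_extent(100, 40, 50) &&
			alloc_at_eof(100, 40, 50, false)) ? 0 : 1;
	}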
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index c62234bde053..89ee672d378a 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
@@ -62,27 +62,23 @@ typedef struct xfs_bmap_free | |||
62 | #define XFS_BMAP_MAX_NMAP 4 | 62 | #define XFS_BMAP_MAX_NMAP 4 |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * Flags for xfs_bmapi | 65 | * Flags for xfs_bmapi_* |
66 | */ | 66 | */ |
67 | #define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ | 67 | #define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ |
68 | #define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ | 68 | #define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ |
69 | #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ | 69 | #define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ |
70 | #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ | 70 | #define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ |
71 | #define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ | 71 | #define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ |
72 | #define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ | ||
73 | #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ | ||
74 | /* combine contig. space */ | 72 | /* combine contig. space */ |
75 | #define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ | 73 | #define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ |
76 | /* | 74 | /* |
77 | * unwritten extent conversion - this needs write cache flushing and no additional | 75 | * unwritten extent conversion - this needs write cache flushing and no additional |
78 | * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts | 76 | * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts |
79 | * from written to unwritten, otherwise convert from unwritten to written. | 77 | * from written to unwritten, otherwise convert from unwritten to written. |
80 | */ | 78 | */ |
81 | #define XFS_BMAPI_CONVERT 0x200 | 79 | #define XFS_BMAPI_CONVERT 0x040 |
82 | 80 | ||
83 | #define XFS_BMAPI_FLAGS \ | 81 | #define XFS_BMAPI_FLAGS \ |
84 | { XFS_BMAPI_WRITE, "WRITE" }, \ | ||
85 | { XFS_BMAPI_DELAY, "DELAY" }, \ | ||
86 | { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ | 82 | { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ |
87 | { XFS_BMAPI_METADATA, "METADATA" }, \ | 83 | { XFS_BMAPI_METADATA, "METADATA" }, \ |
88 | { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ | 84 | { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ |
@@ -113,21 +109,28 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) | |||
113 | * Argument structure for xfs_bmap_alloc. | 109 | * Argument structure for xfs_bmap_alloc. |
114 | */ | 110 | */ |
115 | typedef struct xfs_bmalloca { | 111 | typedef struct xfs_bmalloca { |
116 | xfs_fsblock_t firstblock; /* i/o first block allocated */ | 112 | xfs_fsblock_t *firstblock; /* i/o first block allocated */ |
117 | xfs_fsblock_t rval; /* starting block of new extent */ | 113 | struct xfs_bmap_free *flist; /* bmap freelist */ |
118 | xfs_fileoff_t off; /* offset in file filling in */ | ||
119 | struct xfs_trans *tp; /* transaction pointer */ | 114 | struct xfs_trans *tp; /* transaction pointer */ |
120 | struct xfs_inode *ip; /* incore inode pointer */ | 115 | struct xfs_inode *ip; /* incore inode pointer */ |
121 | struct xfs_bmbt_irec *prevp; /* extent before the new one */ | 116 | struct xfs_bmbt_irec prev; /* extent before the new one */ |
122 | struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ | 117 | struct xfs_bmbt_irec got; /* extent after, or delayed */ |
123 | xfs_extlen_t alen; /* i/o length asked/allocated */ | 118 | |
119 | xfs_fileoff_t offset; /* offset in file filling in */ | ||
120 | xfs_extlen_t length; /* i/o length asked/allocated */ | ||
121 | xfs_fsblock_t blkno; /* starting block of new extent */ | ||
122 | |||
123 | struct xfs_btree_cur *cur; /* btree cursor */ | ||
124 | xfs_extnum_t idx; /* current extent index */ | ||
125 | int nallocs;/* number of extents alloc'd */ | ||
126 | int logflags;/* flags for transaction logging */ | ||
127 | |||
124 | xfs_extlen_t total; /* total blocks needed for xaction */ | 128 | xfs_extlen_t total; /* total blocks needed for xaction */ |
125 | xfs_extlen_t minlen; /* minimum allocation size (blocks) */ | 129 | xfs_extlen_t minlen; /* minimum allocation size (blocks) */ |
126 | xfs_extlen_t minleft; /* amount must be left after alloc */ | 130 | xfs_extlen_t minleft; /* amount must be left after alloc */ |
127 | char eof; /* set if allocating past last extent */ | 131 | char eof; /* set if allocating past last extent */ |
128 | char wasdel; /* replacing a delayed allocation */ | 132 | char wasdel; /* replacing a delayed allocation */ |
129 | char userdata;/* set if is user data */ | 133 | char userdata;/* set if is user data */ |
130 | char low; /* low on space, using seq'l ags */ | ||
131 | char aeof; /* allocated space at eof */ | 134 | char aeof; /* allocated space at eof */ |
132 | char conv; /* overwriting unwritten extents */ | 135 | char conv; /* overwriting unwritten extents */ |
133 | } xfs_bmalloca_t; | 136 | } xfs_bmalloca_t; |
@@ -152,251 +155,62 @@ typedef struct xfs_bmalloca { | |||
152 | { BMAP_RIGHT_FILLING, "RF" }, \ | 155 | { BMAP_RIGHT_FILLING, "RF" }, \ |
153 | { BMAP_ATTRFORK, "ATTR" } | 156 | { BMAP_ATTRFORK, "ATTR" } |
154 | 157 | ||
155 | /* | ||
156 | * Add bmap trace insert entries for all the contents of the extent list. | ||
157 | * | ||
158 | * Quite excessive tracing. Only do this for debug builds. | ||
159 | */ | ||
160 | #if defined(__KERNEL) && defined(DEBUG) | 158 | #if defined(__KERNEL) && defined(DEBUG) |
161 | void | 159 | void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, |
162 | xfs_bmap_trace_exlist( | 160 | int whichfork, unsigned long caller_ip); |
163 | struct xfs_inode *ip, /* incore inode pointer */ | ||
164 | xfs_extnum_t cnt, /* count of entries in list */ | ||
165 | int whichfork, | ||
166 | unsigned long caller_ip); /* data or attr fork */ | ||
167 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ | 161 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ |
168 | xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) | 162 | xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) |
169 | #else | 163 | #else |
170 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) | 164 | #define XFS_BMAP_TRACE_EXLIST(ip,c,w) |
171 | #endif | 165 | #endif |
172 | 166 | ||
173 | /* | 167 | int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); |
174 | * Convert inode from non-attributed to attributed. | 168 | void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, |
175 | * Must not be in a transaction, ip must not be locked. | 169 | struct xfs_bmap_free *flist, struct xfs_mount *mp); |
176 | */ | 170 | void xfs_bmap_cancel(struct xfs_bmap_free *flist); |
177 | int /* error code */ | 171 | void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); |
178 | xfs_bmap_add_attrfork( | 172 | int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, |
179 | struct xfs_inode *ip, /* incore inode pointer */ | 173 | xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); |
180 | int size, /* space needed for new attribute */ | 174 | int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, |
181 | int rsvd); /* flag for reserved block allocation */ | 175 | xfs_fileoff_t *last_block, int whichfork); |
182 | 176 | int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, | |
183 | /* | 177 | xfs_fileoff_t *unused, int whichfork); |
184 | * Add the extent to the list of extents to be free at transaction end. | 178 | int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); |
185 | * The list is maintained sorted (by block number). | 179 | int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, |
186 | */ | 180 | int whichfork); |
187 | void | 181 | int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, |
188 | xfs_bmap_add_free( | 182 | xfs_filblks_t len, struct xfs_bmbt_irec *mval, |
189 | xfs_fsblock_t bno, /* fs block number of extent */ | 183 | int *nmap, int flags); |
190 | xfs_filblks_t len, /* length of extent */ | 184 | int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, |
191 | xfs_bmap_free_t *flist, /* list of extents */ | 185 | xfs_filblks_t len, struct xfs_bmbt_irec *mval, |
192 | struct xfs_mount *mp); /* mount point structure */ | 186 | int *nmap, int flags); |
193 | 187 | int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, | |
194 | /* | 188 | xfs_fileoff_t bno, xfs_filblks_t len, int flags, |
195 | * Routine to clean up the free list data structure when | 189 | xfs_fsblock_t *firstblock, xfs_extlen_t total, |
196 | * an error occurs during a transaction. | 190 | struct xfs_bmbt_irec *mval, int *nmap, |
197 | */ | 191 | struct xfs_bmap_free *flist); |
198 | void | 192 | int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, |
199 | xfs_bmap_cancel( | 193 | xfs_fileoff_t bno, xfs_filblks_t len, int flags, |
200 | xfs_bmap_free_t *flist); /* free list to clean up */ | 194 | xfs_extnum_t nexts, xfs_fsblock_t *firstblock, |
201 | 195 | struct xfs_bmap_free *flist, int *done); | |
202 | /* | 196 | int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, |
203 | * Compute and fill in the value of the maximum depth of a bmap btree | 197 | xfs_extnum_t num); |
204 | * in this filesystem. Done once, during mount. | 198 | uint xfs_default_attroffset(struct xfs_inode *ip); |
205 | */ | ||
206 | void | ||
207 | xfs_bmap_compute_maxlevels( | ||
208 | struct xfs_mount *mp, /* file system mount structure */ | ||
209 | int whichfork); /* data or attr fork */ | ||
210 | |||
211 | /* | ||
212 | * Returns the file-relative block number of the first unused block in the file. | ||
213 | * This is the lowest-address hole if the file has holes, else the first block | ||
214 | * past the end of file. | ||
215 | */ | ||
216 | int /* error */ | ||
217 | xfs_bmap_first_unused( | ||
218 | struct xfs_trans *tp, /* transaction pointer */ | ||
219 | struct xfs_inode *ip, /* incore inode */ | ||
220 | xfs_extlen_t len, /* size of hole to find */ | ||
221 | xfs_fileoff_t *unused, /* unused block num */ | ||
222 | int whichfork); /* data or attr fork */ | ||
223 | |||
224 | /* | ||
225 | * Returns the file-relative block number of the last block + 1 before | ||
226 | * last_block (input value) in the file. | ||
227 | * This is not based on i_size, it is based on the extent list. | ||
228 | * Returns 0 for local files, as they do not have an extent list. | ||
229 | */ | ||
230 | int /* error */ | ||
231 | xfs_bmap_last_before( | ||
232 | struct xfs_trans *tp, /* transaction pointer */ | ||
233 | struct xfs_inode *ip, /* incore inode */ | ||
234 | xfs_fileoff_t *last_block, /* last block */ | ||
235 | int whichfork); /* data or attr fork */ | ||
236 | |||
237 | /* | ||
238 | * Returns the file-relative block number of the first block past eof in | ||
239 | * the file. This is not based on i_size, it is based on the extent list. | ||
240 | * Returns 0 for local files, as they do not have an extent list. | ||
241 | */ | ||
242 | int /* error */ | ||
243 | xfs_bmap_last_offset( | ||
244 | struct xfs_trans *tp, /* transaction pointer */ | ||
245 | struct xfs_inode *ip, /* incore inode */ | ||
246 | xfs_fileoff_t *unused, /* last block num */ | ||
247 | int whichfork); /* data or attr fork */ | ||
248 | |||
249 | /* | ||
250 | * Returns whether the selected fork of the inode has exactly one | ||
251 | * block or not. For the data fork we check this matches di_size, | ||
252 | * implying the file's range is 0..bsize-1. | ||
253 | */ | ||
254 | int | ||
255 | xfs_bmap_one_block( | ||
256 | struct xfs_inode *ip, /* incore inode */ | ||
257 | int whichfork); /* data or attr fork */ | ||
258 | |||
259 | /* | ||
260 | * Read in the extents to iu_extents. | ||
261 | * All inode fields are set up by caller, we just traverse the btree | ||
262 | * and copy the records in. | ||
263 | */ | ||
264 | int /* error */ | ||
265 | xfs_bmap_read_extents( | ||
266 | struct xfs_trans *tp, /* transaction pointer */ | ||
267 | struct xfs_inode *ip, /* incore inode */ | ||
268 | int whichfork); /* data or attr fork */ | ||
269 | |||
270 | /* | ||
271 | * Map file blocks to filesystem blocks. | ||
272 | * File range is given by the bno/len pair. | ||
273 | * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) | ||
274 | * into a hole or past eof. | ||
275 | * Only allocates blocks from a single allocation group, | ||
276 | * to avoid locking problems. | ||
277 | * The returned value in "firstblock" from the first call in a transaction | ||
278 | * must be remembered and presented to subsequent calls in "firstblock". | ||
279 | * An upper bound for the number of blocks to be allocated is supplied to | ||
280 | * the first call in "total"; if no allocation group has that many free | ||
281 | * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). | ||
282 | */ | ||
283 | int /* error */ | ||
284 | xfs_bmapi( | ||
285 | struct xfs_trans *tp, /* transaction pointer */ | ||
286 | struct xfs_inode *ip, /* incore inode */ | ||
287 | xfs_fileoff_t bno, /* starting file offs. mapped */ | ||
288 | xfs_filblks_t len, /* length to map in file */ | ||
289 | int flags, /* XFS_BMAPI_... */ | ||
290 | xfs_fsblock_t *firstblock, /* first allocated block | ||
291 | controls a.g. for allocs */ | ||
292 | xfs_extlen_t total, /* total blocks needed */ | ||
293 | struct xfs_bmbt_irec *mval, /* output: map values */ | ||
294 | int *nmap, /* i/o: mval size/count */ | ||
295 | xfs_bmap_free_t *flist); /* i/o: list extents to free */ | ||
296 | |||
297 | /* | ||
298 | * Map file blocks to filesystem blocks, simple version. | ||
299 | * One block only, read-only. | ||
300 | * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. | ||
301 | * For the other flag values, the effect is as if XFS_BMAPI_METADATA | ||
302 | * was set and all the others were clear. | ||
303 | */ | ||
304 | int /* error */ | ||
305 | xfs_bmapi_single( | ||
306 | struct xfs_trans *tp, /* transaction pointer */ | ||
307 | struct xfs_inode *ip, /* incore inode */ | ||
308 | int whichfork, /* data or attr fork */ | ||
309 | xfs_fsblock_t *fsb, /* output: mapped block */ | ||
310 | xfs_fileoff_t bno); /* starting file offs. mapped */ | ||
311 | |||
312 | /* | ||
313 | * Unmap (remove) blocks from a file. | ||
314 | * If nexts is nonzero then the number of extents to remove is limited to | ||
315 | * that value. If not all extents in the block range can be removed then | ||
316 | * *done is set. | ||
317 | */ | ||
318 | int /* error */ | ||
319 | xfs_bunmapi( | ||
320 | struct xfs_trans *tp, /* transaction pointer */ | ||
321 | struct xfs_inode *ip, /* incore inode */ | ||
322 | xfs_fileoff_t bno, /* starting offset to unmap */ | ||
323 | xfs_filblks_t len, /* length to unmap in file */ | ||
324 | int flags, /* XFS_BMAPI_... */ | ||
325 | xfs_extnum_t nexts, /* number of extents max */ | ||
326 | xfs_fsblock_t *firstblock, /* first allocated block | ||
327 | controls a.g. for allocs */ | ||
328 | xfs_bmap_free_t *flist, /* i/o: list extents to free */ | ||
329 | int *done); /* set if not done yet */ | ||
330 | |||
331 | /* | ||
332 | * Check an extent list, which has just been read, for | ||
333 | * any bit in the extent flag field. | ||
334 | */ | ||
335 | int | ||
336 | xfs_check_nostate_extents( | ||
337 | struct xfs_ifork *ifp, | ||
338 | xfs_extnum_t idx, | ||
339 | xfs_extnum_t num); | ||
340 | |||
341 | uint | ||
342 | xfs_default_attroffset( | ||
343 | struct xfs_inode *ip); | ||
344 | 199 | ||
345 | #ifdef __KERNEL__ | 200 | #ifdef __KERNEL__ |
346 | |||
347 | /* | ||
348 | * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi | ||
349 | * caller. Frees all the extents that need freeing, which must be done | ||
350 | * last due to locking considerations. | ||
351 | * | ||
352 | * Sets *committed to 1 if the given transaction was committed and a new | ||
353 | * one allocated, and to 0 otherwise; the return value itself is an error code. | ||
354 | */ | ||
355 | int /* error */ | ||
356 | xfs_bmap_finish( | ||
357 | struct xfs_trans **tp, /* transaction pointer addr */ | ||
358 | xfs_bmap_free_t *flist, /* i/o: list extents to free */ | ||
359 | int *committed); /* xact committed or not */ | ||
360 | |||
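The *committed flag matters because the caller's transaction pointer may have been swapped underneath it. A sketch of the usual follow-up (out_abort is a hypothetical label):

	int	committed;

	error = xfs_bmap_finish(&tp, &flist, &committed);
	if (error) {
		xfs_bmap_cancel(&flist);	/* drop any unfreed extents */
		goto out_abort;
	}
	if (committed) {
		/* a new transaction was allocated: rejoin the inode */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	}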
361 | /* bmap to userspace formatter - copy to user & advance pointer */ | 201 | /* bmap to userspace formatter - copy to user & advance pointer */ |
362 | typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); | 202 | typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); |
363 | 203 | ||
364 | /* | 204 | int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, |
365 | * Get inode's extents as described in bmv, and format for output. | 205 | int *committed); |
366 | */ | 206 | int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, |
367 | int /* error code */ | 207 | xfs_bmap_format_t formatter, void *arg); |
368 | xfs_getbmap( | 208 | int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, |
369 | xfs_inode_t *ip, | 209 | int whichfork, int *eof); |
370 | struct getbmapx *bmv, /* user bmap structure */ | 210 | int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, |
371 | xfs_bmap_format_t formatter, /* format to user */ | 211 | int whichfork, int *count); |
372 | void *arg); /* formatter arg */ | 212 | int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, |
373 | 213 | xfs_fileoff_t start_fsb, xfs_fileoff_t length); | |
374 | /* | ||
375 | * Check if the endoff is outside the last extent. If so the caller will grow | ||
376 | * the allocation to a stripe unit boundary | ||
377 | */ | ||
378 | int | ||
379 | xfs_bmap_eof( | ||
380 | struct xfs_inode *ip, | ||
381 | xfs_fileoff_t endoff, | ||
382 | int whichfork, | ||
383 | int *eof); | ||
384 | |||
385 | /* | ||
386 | * Count fsblocks of the given fork. | ||
387 | */ | ||
388 | int | ||
389 | xfs_bmap_count_blocks( | ||
390 | xfs_trans_t *tp, | ||
391 | struct xfs_inode *ip, | ||
392 | int whichfork, | ||
393 | int *count); | ||
394 | |||
395 | int | ||
396 | xfs_bmap_punch_delalloc_range( | ||
397 | struct xfs_inode *ip, | ||
398 | xfs_fileoff_t start_fsb, | ||
399 | xfs_fileoff_t length); | ||
400 | #endif /* __KERNEL__ */ | 214 | #endif /* __KERNEL__ */ |
401 | 215 | ||
402 | #endif /* __XFS_BMAP_H__ */ | 216 | #endif /* __XFS_BMAP_H__ */ |
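The xfs_bmap_format_t callback above copies one mapping to userspace and advances the destination pointer. A sketch along the lines of the in-kernel formatter (names and details approximate, not part of this patch):

STATIC int
xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
{
	struct getbmap __user	*base = *ap;

	/* copy out only the getbmap portion of the getbmapx record */
	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
		return XFS_ERROR(EFAULT);

	(*ap) += sizeof(struct getbmap);
	return 0;
}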
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cabf4b5604aa..1f19f03af9d3 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -275,8 +275,7 @@ xfs_btree_dup_cursor( | |||
275 | return error; | 275 | return error; |
276 | } | 276 | } |
277 | new->bc_bufs[i] = bp; | 277 | new->bc_bufs[i] = bp; |
278 | ASSERT(bp); | 278 | ASSERT(!xfs_buf_geterror(bp)); |
279 | ASSERT(!XFS_BUF_GETERROR(bp)); | ||
280 | } else | 279 | } else |
281 | new->bc_bufs[i] = NULL; | 280 | new->bc_bufs[i] = NULL; |
282 | } | 281 | } |
@@ -467,8 +466,7 @@ xfs_btree_get_bufl( | |||
467 | ASSERT(fsbno != NULLFSBLOCK); | 466 | ASSERT(fsbno != NULLFSBLOCK); |
468 | d = XFS_FSB_TO_DADDR(mp, fsbno); | 467 | d = XFS_FSB_TO_DADDR(mp, fsbno); |
469 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); | 468 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); |
470 | ASSERT(bp); | 469 | ASSERT(!xfs_buf_geterror(bp)); |
471 | ASSERT(!XFS_BUF_GETERROR(bp)); | ||
472 | return bp; | 470 | return bp; |
473 | } | 471 | } |
474 | 472 | ||
@@ -491,8 +489,7 @@ xfs_btree_get_bufs( | |||
491 | ASSERT(agbno != NULLAGBLOCK); | 489 | ASSERT(agbno != NULLAGBLOCK); |
492 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); | 490 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); |
493 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); | 491 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); |
494 | ASSERT(bp); | 492 | ASSERT(!xfs_buf_geterror(bp)); |
495 | ASSERT(!XFS_BUF_GETERROR(bp)); | ||
496 | return bp; | 493 | return bp; |
497 | } | 494 | } |
498 | 495 | ||
@@ -632,9 +629,9 @@ xfs_btree_read_bufl( | |||
632 | mp->m_bsize, lock, &bp))) { | 629 | mp->m_bsize, lock, &bp))) { |
633 | return error; | 630 | return error; |
634 | } | 631 | } |
635 | ASSERT(!bp || !XFS_BUF_GETERROR(bp)); | 632 | ASSERT(!xfs_buf_geterror(bp)); |
636 | if (bp) | 633 | if (bp) |
637 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); | 634 | xfs_buf_set_ref(bp, refval); |
638 | *bpp = bp; | 635 | *bpp = bp; |
639 | return 0; | 636 | return 0; |
640 | } | 637 | } |
@@ -942,13 +939,13 @@ xfs_btree_set_refs( | |||
942 | switch (cur->bc_btnum) { | 939 | switch (cur->bc_btnum) { |
943 | case XFS_BTNUM_BNO: | 940 | case XFS_BTNUM_BNO: |
944 | case XFS_BTNUM_CNT: | 941 | case XFS_BTNUM_CNT: |
945 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF); | 942 | xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); |
946 | break; | 943 | break; |
947 | case XFS_BTNUM_INO: | 944 | case XFS_BTNUM_INO: |
948 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF); | 945 | xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); |
949 | break; | 946 | break; |
950 | case XFS_BTNUM_BMAP: | 947 | case XFS_BTNUM_BMAP: |
951 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF); | 948 | xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); |
952 | break; | 949 | break; |
953 | default: | 950 | default: |
954 | ASSERT(0); | 951 | ASSERT(0); |
@@ -973,8 +970,8 @@ xfs_btree_get_buf_block( | |||
973 | *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, | 970 | *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, |
974 | mp->m_bsize, flags); | 971 | mp->m_bsize, flags); |
975 | 972 | ||
976 | ASSERT(*bpp); | 973 | if (!*bpp) |
977 | ASSERT(!XFS_BUF_GETERROR(*bpp)); | 974 | return ENOMEM; |
978 | 975 | ||
979 | *block = XFS_BUF_TO_BLOCK(*bpp); | 976 | *block = XFS_BUF_TO_BLOCK(*bpp); |
980 | return 0; | 977 | return 0; |
@@ -1006,8 +1003,7 @@ xfs_btree_read_buf_block( | |||
1006 | if (error) | 1003 | if (error) |
1007 | return error; | 1004 | return error; |
1008 | 1005 | ||
1009 | ASSERT(*bpp != NULL); | 1006 | ASSERT(!xfs_buf_geterror(*bpp)); |
1010 | ASSERT(!XFS_BUF_GETERROR(*bpp)); | ||
1011 | 1007 | ||
1012 | xfs_btree_set_refs(cur, *bpp); | 1008 | xfs_btree_set_refs(cur, *bpp); |
1013 | *block = XFS_BUF_TO_BLOCK(*bpp); | 1009 | *block = XFS_BUF_TO_BLOCK(*bpp); |
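These conversions work because xfs_buf_geterror() is NULL-safe, so a single ASSERT replaces the old ASSERT(bp)/ASSERT(!XFS_BUF_GETERROR(bp)) pair. The helper in xfs_buf.h reads roughly:

static inline int xfs_buf_geterror(xfs_buf_t *bp)
{
	return bp ? bp->b_error : ENOMEM;
}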
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 8d05a6a46ce3..5b240de104c0 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -262,7 +262,7 @@ typedef struct xfs_btree_cur | |||
262 | /* | 262 | /* |
263 | * Convert from buffer to btree block header. | 263 | * Convert from buffer to btree block header. |
264 | */ | 264 | */ |
265 | #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) | 265 | #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) |
266 | 266 | ||
267 | 267 | ||
268 | /* | 268 | /* |
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c
index d1fe74506c4c..cf0ac056815f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,6 @@ | |||
43 | 43 | ||
44 | static kmem_zone_t *xfs_buf_zone; | 44 | static kmem_zone_t *xfs_buf_zone; |
45 | STATIC int xfsbufd(void *); | 45 | STATIC int xfsbufd(void *); |
46 | STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); | ||
47 | 46 | ||
48 | static struct workqueue_struct *xfslogd_workqueue; | 47 | static struct workqueue_struct *xfslogd_workqueue; |
49 | struct workqueue_struct *xfsdatad_workqueue; | 48 | struct workqueue_struct *xfsdatad_workqueue; |
@@ -66,10 +65,6 @@ struct workqueue_struct *xfsconvertd_workqueue; | |||
66 | #define xb_to_km(flags) \ | 65 | #define xb_to_km(flags) \ |
67 | (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) | 66 | (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) |
68 | 67 | ||
69 | #define xfs_buf_allocate(flags) \ | ||
70 | kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) | ||
71 | #define xfs_buf_deallocate(bp) \ | ||
72 | kmem_zone_free(xfs_buf_zone, (bp)); | ||
73 | 68 | ||
74 | static inline int | 69 | static inline int |
75 | xfs_buf_is_vmapped( | 70 | xfs_buf_is_vmapped( |
@@ -152,6 +147,7 @@ xfs_buf_stale( | |||
152 | struct xfs_buf *bp) | 147 | struct xfs_buf *bp) |
153 | { | 148 | { |
154 | bp->b_flags |= XBF_STALE; | 149 | bp->b_flags |= XBF_STALE; |
150 | xfs_buf_delwri_dequeue(bp); | ||
155 | atomic_set(&(bp)->b_lru_ref, 0); | 151 | atomic_set(&(bp)->b_lru_ref, 0); |
156 | if (!list_empty(&bp->b_lru)) { | 152 | if (!list_empty(&bp->b_lru)) { |
157 | struct xfs_buftarg *btp = bp->b_target; | 153 | struct xfs_buftarg *btp = bp->b_target; |
@@ -167,14 +163,19 @@ xfs_buf_stale( | |||
167 | ASSERT(atomic_read(&bp->b_hold) >= 1); | 163 | ASSERT(atomic_read(&bp->b_hold) >= 1); |
168 | } | 164 | } |
169 | 165 | ||
170 | STATIC void | 166 | struct xfs_buf * |
171 | _xfs_buf_initialize( | 167 | xfs_buf_alloc( |
172 | xfs_buf_t *bp, | 168 | struct xfs_buftarg *target, |
173 | xfs_buftarg_t *target, | ||
174 | xfs_off_t range_base, | 169 | xfs_off_t range_base, |
175 | size_t range_length, | 170 | size_t range_length, |
176 | xfs_buf_flags_t flags) | 171 | xfs_buf_flags_t flags) |
177 | { | 172 | { |
173 | struct xfs_buf *bp; | ||
174 | |||
175 | bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); | ||
176 | if (unlikely(!bp)) | ||
177 | return NULL; | ||
178 | |||
178 | /* | 179 | /* |
179 | * We don't want certain flags to appear in b_flags. | 180 | * We don't want certain flags to appear in b_flags. |
180 | */ | 181 | */ |
@@ -203,8 +204,9 @@ _xfs_buf_initialize( | |||
203 | init_waitqueue_head(&bp->b_waiters); | 204 | init_waitqueue_head(&bp->b_waiters); |
204 | 205 | ||
205 | XFS_STATS_INC(xb_create); | 206 | XFS_STATS_INC(xb_create); |
206 | |||
207 | trace_xfs_buf_init(bp, _RET_IP_); | 207 | trace_xfs_buf_init(bp, _RET_IP_); |
208 | |||
209 | return bp; | ||
208 | } | 210 | } |
209 | 211 | ||
210 | /* | 212 | /* |
@@ -277,7 +279,7 @@ xfs_buf_free( | |||
277 | } else if (bp->b_flags & _XBF_KMEM) | 279 | } else if (bp->b_flags & _XBF_KMEM) |
278 | kmem_free(bp->b_addr); | 280 | kmem_free(bp->b_addr); |
279 | _xfs_buf_free_pages(bp); | 281 | _xfs_buf_free_pages(bp); |
280 | xfs_buf_deallocate(bp); | 282 | kmem_zone_free(xfs_buf_zone, bp); |
281 | } | 283 | } |
282 | 284 | ||
283 | /* | 285 | /* |
@@ -416,10 +418,7 @@ _xfs_buf_map_pages( | |||
416 | /* | 418 | /* |
417 | * Looks up, and creates if absent, a lockable buffer for | 419 | * Looks up, and creates if absent, a lockable buffer for
418 | * a given range of an inode. The buffer is returned | 420 | * a given range of an inode. The buffer is returned |
419 | * locked. If other overlapping buffers exist, they are | 421 | * locked. No I/O is implied by this call. |
420 | * released before the new buffer is created and locked, | ||
421 | * which may imply that this call will block until those buffers | ||
422 | * are unlocked. No I/O is implied by this call. | ||
423 | */ | 422 | */ |
424 | xfs_buf_t * | 423 | xfs_buf_t * |
425 | _xfs_buf_find( | 424 | _xfs_buf_find( |
@@ -481,8 +480,6 @@ _xfs_buf_find( | |||
481 | 480 | ||
482 | /* No match found */ | 481 | /* No match found */ |
483 | if (new_bp) { | 482 | if (new_bp) { |
484 | _xfs_buf_initialize(new_bp, btp, range_base, | ||
485 | range_length, flags); | ||
486 | rb_link_node(&new_bp->b_rbnode, parent, rbp); | 483 | rb_link_node(&new_bp->b_rbnode, parent, rbp); |
487 | rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); | 484 | rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); |
488 | /* the buffer keeps the perag reference until it is freed */ | 485 | /* the buffer keeps the perag reference until it is freed */ |
@@ -525,35 +522,51 @@ found: | |||
525 | } | 522 | } |
526 | 523 | ||
527 | /* | 524 | /* |
528 | * Assembles a buffer covering the specified range. | 525 | * Assembles a buffer covering the specified range. The code is optimised for |
529 | * Storage in memory for all portions of the buffer will be allocated, | 526 | * cache hits, as metadata intensive workloads will see 3 orders of magnitude |
530 | * although backing storage may not be. | 527 | * more hits than misses. |
531 | */ | 528 | */ |
532 | xfs_buf_t * | 529 | struct xfs_buf * |
533 | xfs_buf_get( | 530 | xfs_buf_get( |
534 | xfs_buftarg_t *target,/* target for buffer */ | 531 | xfs_buftarg_t *target,/* target for buffer */ |
535 | xfs_off_t ioff, /* starting offset of range */ | 532 | xfs_off_t ioff, /* starting offset of range */ |
536 | size_t isize, /* length of range */ | 533 | size_t isize, /* length of range */ |
537 | xfs_buf_flags_t flags) | 534 | xfs_buf_flags_t flags) |
538 | { | 535 | { |
539 | xfs_buf_t *bp, *new_bp; | 536 | struct xfs_buf *bp; |
537 | struct xfs_buf *new_bp; | ||
540 | int error = 0; | 538 | int error = 0; |
541 | 539 | ||
542 | new_bp = xfs_buf_allocate(flags); | 540 | bp = _xfs_buf_find(target, ioff, isize, flags, NULL); |
541 | if (likely(bp)) | ||
542 | goto found; | ||
543 | |||
544 | new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, | ||
545 | flags); | ||
543 | if (unlikely(!new_bp)) | 546 | if (unlikely(!new_bp)) |
544 | return NULL; | 547 | return NULL; |
545 | 548 | ||
546 | bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); | 549 | bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); |
550 | if (!bp) { | ||
551 | kmem_zone_free(xfs_buf_zone, new_bp); | ||
552 | return NULL; | ||
553 | } | ||
554 | |||
547 | if (bp == new_bp) { | 555 | if (bp == new_bp) { |
548 | error = xfs_buf_allocate_memory(bp, flags); | 556 | error = xfs_buf_allocate_memory(bp, flags); |
549 | if (error) | 557 | if (error) |
550 | goto no_buffer; | 558 | goto no_buffer; |
551 | } else { | 559 | } else |
552 | xfs_buf_deallocate(new_bp); | 560 | kmem_zone_free(xfs_buf_zone, new_bp); |
553 | if (unlikely(bp == NULL)) | ||
554 | return NULL; | ||
555 | } | ||
556 | 561 | ||
562 | /* | ||
563 | * Now we have a workable buffer, fill in the block number so | ||
564 | * that we can do IO on it. | ||
565 | */ | ||
566 | bp->b_bn = ioff; | ||
567 | bp->b_count_desired = bp->b_buffer_length; | ||
568 | |||
569 | found: | ||
557 | if (!(bp->b_flags & XBF_MAPPED)) { | 570 | if (!(bp->b_flags & XBF_MAPPED)) { |
558 | error = _xfs_buf_map_pages(bp, flags); | 571 | error = _xfs_buf_map_pages(bp, flags); |
559 | if (unlikely(error)) { | 572 | if (unlikely(error)) { |
@@ -564,18 +577,10 @@ xfs_buf_get( | |||
564 | } | 577 | } |
565 | 578 | ||
566 | XFS_STATS_INC(xb_get); | 579 | XFS_STATS_INC(xb_get); |
567 | |||
568 | /* | ||
569 | * Always fill in the block number now, the mapped cases can do | ||
570 | * their own overlay of this later. | ||
571 | */ | ||
572 | bp->b_bn = ioff; | ||
573 | bp->b_count_desired = bp->b_buffer_length; | ||
574 | |||
575 | trace_xfs_buf_get(bp, flags, _RET_IP_); | 580 | trace_xfs_buf_get(bp, flags, _RET_IP_); |
576 | return bp; | 581 | return bp; |
577 | 582 | ||
578 | no_buffer: | 583 | no_buffer: |
579 | if (flags & (XBF_LOCK | XBF_TRYLOCK)) | 584 | if (flags & (XBF_LOCK | XBF_TRYLOCK)) |
580 | xfs_buf_unlock(bp); | 585 | xfs_buf_unlock(bp); |
581 | xfs_buf_rele(bp); | 586 | xfs_buf_rele(bp); |
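The reworked xfs_buf_get() looks up first and allocates only on a miss, and it resolves allocation races in _xfs_buf_find(): if two threads miss on the same range, both allocate, one wins the rbtree insert, and the loser frees its copy. Condensed from the hunk above (sketch only):

	new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, flags);
	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (bp != new_bp)		/* lost the race, or lookup failed */
		kmem_zone_free(xfs_buf_zone, new_bp);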
@@ -596,7 +601,7 @@ _xfs_buf_read( | |||
596 | bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); | 601 | bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); |
597 | 602 | ||
598 | status = xfs_buf_iorequest(bp); | 603 | status = xfs_buf_iorequest(bp); |
599 | if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) | 604 | if (status || bp->b_error || (flags & XBF_ASYNC)) |
600 | return status; | 605 | return status; |
601 | return xfs_buf_iowait(bp); | 606 | return xfs_buf_iowait(bp); |
602 | } | 607 | } |
@@ -679,7 +684,6 @@ xfs_buf_read_uncached( | |||
679 | /* set up the buffer for a read IO */ | 684 | /* set up the buffer for a read IO */ |
680 | XFS_BUF_SET_ADDR(bp, daddr); | 685 | XFS_BUF_SET_ADDR(bp, daddr); |
681 | XFS_BUF_READ(bp); | 686 | XFS_BUF_READ(bp); |
682 | XFS_BUF_BUSY(bp); | ||
683 | 687 | ||
684 | xfsbdstrat(mp, bp); | 688 | xfsbdstrat(mp, bp); |
685 | error = xfs_buf_iowait(bp); | 689 | error = xfs_buf_iowait(bp); |
@@ -690,19 +694,6 @@ xfs_buf_read_uncached( | |||
690 | return bp; | 694 | return bp; |
691 | } | 695 | } |
692 | 696 | ||
693 | xfs_buf_t * | ||
694 | xfs_buf_get_empty( | ||
695 | size_t len, | ||
696 | xfs_buftarg_t *target) | ||
697 | { | ||
698 | xfs_buf_t *bp; | ||
699 | |||
700 | bp = xfs_buf_allocate(0); | ||
701 | if (bp) | ||
702 | _xfs_buf_initialize(bp, target, 0, len, 0); | ||
703 | return bp; | ||
704 | } | ||
705 | |||
706 | /* | 697 | /* |
707 | * Return a buffer allocated as an empty buffer and associated with external | 698 | * Return a buffer allocated as an empty buffer and associated with external
708 | * memory via xfs_buf_associate_memory() back to its empty state. | 699 | * memory via xfs_buf_associate_memory() back to its empty state.
@@ -788,10 +779,9 @@ xfs_buf_get_uncached( | |||
788 | int error, i; | 779 | int error, i; |
789 | xfs_buf_t *bp; | 780 | xfs_buf_t *bp; |
790 | 781 | ||
791 | bp = xfs_buf_allocate(0); | 782 | bp = xfs_buf_alloc(target, 0, len, 0); |
792 | if (unlikely(bp == NULL)) | 783 | if (unlikely(bp == NULL)) |
793 | goto fail; | 784 | goto fail; |
794 | _xfs_buf_initialize(bp, target, 0, len, 0); | ||
795 | 785 | ||
796 | error = _xfs_buf_get_pages(bp, page_count, 0); | 786 | error = _xfs_buf_get_pages(bp, page_count, 0); |
797 | if (error) | 787 | if (error) |
@@ -819,7 +809,7 @@ xfs_buf_get_uncached( | |||
819 | __free_page(bp->b_pages[i]); | 809 | __free_page(bp->b_pages[i]); |
820 | _xfs_buf_free_pages(bp); | 810 | _xfs_buf_free_pages(bp); |
821 | fail_free_buf: | 811 | fail_free_buf: |
822 | xfs_buf_deallocate(bp); | 812 | kmem_zone_free(xfs_buf_zone, bp); |
823 | fail: | 813 | fail: |
824 | return NULL; | 814 | return NULL; |
825 | } | 815 | } |
@@ -938,12 +928,6 @@ void | |||
938 | xfs_buf_unlock( | 928 | xfs_buf_unlock( |
939 | struct xfs_buf *bp) | 929 | struct xfs_buf *bp) |
940 | { | 930 | { |
941 | if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { | ||
942 | atomic_inc(&bp->b_hold); | ||
943 | bp->b_flags |= XBF_ASYNC; | ||
944 | xfs_buf_delwri_queue(bp, 0); | ||
945 | } | ||
946 | |||
947 | XB_CLEAR_OWNER(bp); | 931 | XB_CLEAR_OWNER(bp); |
948 | up(&bp->b_sema); | 932 | up(&bp->b_sema); |
949 | 933 | ||
@@ -1020,9 +1004,19 @@ xfs_buf_ioerror( | |||
1020 | trace_xfs_buf_ioerror(bp, error, _RET_IP_); | 1004 | trace_xfs_buf_ioerror(bp, error, _RET_IP_); |
1021 | } | 1005 | } |
1022 | 1006 | ||
1007 | void | ||
1008 | xfs_buf_ioerror_alert( | ||
1009 | struct xfs_buf *bp, | ||
1010 | const char *func) | ||
1011 | { | ||
1012 | xfs_alert(bp->b_target->bt_mount, | ||
1013 | "metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", | ||
1014 | (__uint64_t)XFS_BUF_ADDR(bp), func, | ||
1015 | bp->b_error, XFS_BUF_COUNT(bp)); | ||
1016 | } | ||
1017 | |||
1023 | int | 1018 | int |
1024 | xfs_bwrite( | 1019 | xfs_bwrite( |
1025 | struct xfs_mount *mp, | ||
1026 | struct xfs_buf *bp) | 1020 | struct xfs_buf *bp) |
1027 | { | 1021 | { |
1028 | int error; | 1022 | int error; |
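xfs_buf_ioerror_alert() gives every caller the same one-line error report. A typical call site, mirroring the conversions in the xfs_buf_item.c hunks further down (the surrounding code is hypothetical):

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_buf_ioerror_alert(bp, __func__);
		xfs_buf_relse(bp);
		return error;
	}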
@@ -1034,25 +1028,13 @@ xfs_bwrite( | |||
1034 | xfs_bdstrat_cb(bp); | 1028 | xfs_bdstrat_cb(bp); |
1035 | 1029 | ||
1036 | error = xfs_buf_iowait(bp); | 1030 | error = xfs_buf_iowait(bp); |
1037 | if (error) | 1031 | if (error) { |
1038 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | 1032 | xfs_force_shutdown(bp->b_target->bt_mount, |
1039 | xfs_buf_relse(bp); | 1033 | SHUTDOWN_META_IO_ERROR); |
1034 | } | ||
1040 | return error; | 1035 | return error; |
1041 | } | 1036 | } |
1042 | 1037 | ||
1043 | void | ||
1044 | xfs_bdwrite( | ||
1045 | void *mp, | ||
1046 | struct xfs_buf *bp) | ||
1047 | { | ||
1048 | trace_xfs_buf_bdwrite(bp, _RET_IP_); | ||
1049 | |||
1050 | bp->b_flags &= ~XBF_READ; | ||
1051 | bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); | ||
1052 | |||
1053 | xfs_buf_delwri_queue(bp, 1); | ||
1054 | } | ||
1055 | |||
1056 | /* | 1038 | /* |
1057 | * Called when we want to stop a buffer from getting written or read. | 1039 | * Called when we want to stop a buffer from getting written or read. |
1058 | * We attach the EIO error, muck with its flags, and call xfs_buf_ioend | 1040 | * We attach the EIO error, muck with its flags, and call xfs_buf_ioend |
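With the mount now derived from bp->b_target->bt_mount, a call site migrates as below; note the reworked xfs_bwrite() no longer releases the buffer, so that becomes the caller's job (sketch, not patch content):

	/* before */
	error = xfs_bwrite(mp, bp);

	/* after */
	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);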
@@ -1069,15 +1051,14 @@ xfs_bioerror( | |||
1069 | /* | 1051 | /* |
1070 | * No need to wait until the buffer is unpinned, we aren't flushing it. | 1052 | * No need to wait until the buffer is unpinned, we aren't flushing it. |
1071 | */ | 1053 | */ |
1072 | XFS_BUF_ERROR(bp, EIO); | 1054 | xfs_buf_ioerror(bp, EIO); |
1073 | 1055 | ||
1074 | /* | 1056 | /* |
1075 | * We're calling xfs_buf_ioend, so delete XBF_DONE flag. | 1057 | * We're calling xfs_buf_ioend, so delete XBF_DONE flag. |
1076 | */ | 1058 | */ |
1077 | XFS_BUF_UNREAD(bp); | 1059 | XFS_BUF_UNREAD(bp); |
1078 | XFS_BUF_UNDELAYWRITE(bp); | ||
1079 | XFS_BUF_UNDONE(bp); | 1060 | XFS_BUF_UNDONE(bp); |
1080 | XFS_BUF_STALE(bp); | 1061 | xfs_buf_stale(bp); |
1081 | 1062 | ||
1082 | xfs_buf_ioend(bp, 0); | 1063 | xfs_buf_ioend(bp, 0); |
1083 | 1064 | ||
@@ -1094,7 +1075,7 @@ STATIC int | |||
1094 | xfs_bioerror_relse( | 1075 | xfs_bioerror_relse( |
1095 | struct xfs_buf *bp) | 1076 | struct xfs_buf *bp) |
1096 | { | 1077 | { |
1097 | int64_t fl = XFS_BUF_BFLAGS(bp); | 1078 | int64_t fl = bp->b_flags; |
1098 | /* | 1079 | /* |
1099 | * No need to wait until the buffer is unpinned. | 1080 | * No need to wait until the buffer is unpinned. |
1100 | * We aren't flushing it. | 1081 | * We aren't flushing it. |
@@ -1104,9 +1085,8 @@ xfs_bioerror_relse( | |||
1104 | * change that interface. | 1085 | * change that interface. |
1105 | */ | 1086 | */ |
1106 | XFS_BUF_UNREAD(bp); | 1087 | XFS_BUF_UNREAD(bp); |
1107 | XFS_BUF_UNDELAYWRITE(bp); | ||
1108 | XFS_BUF_DONE(bp); | 1088 | XFS_BUF_DONE(bp); |
1109 | XFS_BUF_STALE(bp); | 1089 | xfs_buf_stale(bp); |
1110 | bp->b_iodone = NULL; | 1090 | bp->b_iodone = NULL; |
1111 | if (!(fl & XBF_ASYNC)) { | 1091 | if (!(fl & XBF_ASYNC)) { |
1112 | /* | 1092 | /* |
@@ -1115,8 +1095,8 @@ xfs_bioerror_relse( | |||
1115 | * There's no reason to mark error for | 1095 | * There's no reason to mark error for |
1116 | * ASYNC buffers. | 1096 | * ASYNC buffers. |
1117 | */ | 1097 | */ |
1118 | XFS_BUF_ERROR(bp, EIO); | 1098 | xfs_buf_ioerror(bp, EIO); |
1119 | XFS_BUF_FINISH_IOWAIT(bp); | 1099 | complete(&bp->b_iowait); |
1120 | } else { | 1100 | } else { |
1121 | xfs_buf_relse(bp); | 1101 | xfs_buf_relse(bp); |
1122 | } | 1102 | } |
@@ -1276,15 +1256,10 @@ xfs_buf_iorequest( | |||
1276 | { | 1256 | { |
1277 | trace_xfs_buf_iorequest(bp, _RET_IP_); | 1257 | trace_xfs_buf_iorequest(bp, _RET_IP_); |
1278 | 1258 | ||
1279 | if (bp->b_flags & XBF_DELWRI) { | 1259 | ASSERT(!(bp->b_flags & XBF_DELWRI)); |
1280 | xfs_buf_delwri_queue(bp, 1); | ||
1281 | return 0; | ||
1282 | } | ||
1283 | 1260 | ||
1284 | if (bp->b_flags & XBF_WRITE) { | 1261 | if (bp->b_flags & XBF_WRITE) |
1285 | xfs_buf_wait_unpin(bp); | 1262 | xfs_buf_wait_unpin(bp); |
1286 | } | ||
1287 | |||
1288 | xfs_buf_hold(bp); | 1263 | xfs_buf_hold(bp); |
1289 | 1264 | ||
1290 | /* Set the count to 1 initially, this will stop an I/O | 1265 | /* Set the count to 1 initially, this will stop an I/O |
@@ -1324,7 +1299,7 @@ xfs_buf_offset( | |||
1324 | struct page *page; | 1299 | struct page *page; |
1325 | 1300 | ||
1326 | if (bp->b_flags & XBF_MAPPED) | 1301 | if (bp->b_flags & XBF_MAPPED) |
1327 | return XFS_BUF_PTR(bp) + offset; | 1302 | return bp->b_addr + offset; |
1328 | 1303 | ||
1329 | offset += bp->b_offset; | 1304 | offset += bp->b_offset; |
1330 | page = bp->b_pages[offset >> PAGE_SHIFT]; | 1305 | page = bp->b_pages[offset >> PAGE_SHIFT]; |
@@ -1482,9 +1457,13 @@ xfs_setsize_buftarg_flags( | |||
1482 | btp->bt_smask = sectorsize - 1; | 1457 | btp->bt_smask = sectorsize - 1; |
1483 | 1458 | ||
1484 | if (set_blocksize(btp->bt_bdev, sectorsize)) { | 1459 | if (set_blocksize(btp->bt_bdev, sectorsize)) { |
1460 | char name[BDEVNAME_SIZE]; | ||
1461 | |||
1462 | bdevname(btp->bt_bdev, name); | ||
1463 | |||
1485 | xfs_warn(btp->bt_mount, | 1464 | xfs_warn(btp->bt_mount, |
1486 | "Cannot set_blocksize to %u on device %s\n", | 1465 | "Cannot set_blocksize to %u on device %s\n", |
1487 | sectorsize, XFS_BUFTARG_NAME(btp)); | 1466 | sectorsize, name); |
1488 | return EINVAL; | 1467 | return EINVAL; |
1489 | } | 1468 | } |
1490 | 1469 | ||
@@ -1515,12 +1494,12 @@ xfs_setsize_buftarg( | |||
1515 | } | 1494 | } |
1516 | 1495 | ||
1517 | STATIC int | 1496 | STATIC int |
1518 | xfs_alloc_delwrite_queue( | 1497 | xfs_alloc_delwri_queue( |
1519 | xfs_buftarg_t *btp, | 1498 | xfs_buftarg_t *btp, |
1520 | const char *fsname) | 1499 | const char *fsname) |
1521 | { | 1500 | { |
1522 | INIT_LIST_HEAD(&btp->bt_delwrite_queue); | 1501 | INIT_LIST_HEAD(&btp->bt_delwri_queue); |
1523 | spin_lock_init(&btp->bt_delwrite_lock); | 1502 | spin_lock_init(&btp->bt_delwri_lock); |
1524 | btp->bt_flags = 0; | 1503 | btp->bt_flags = 0; |
1525 | btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); | 1504 | btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); |
1526 | if (IS_ERR(btp->bt_task)) | 1505 | if (IS_ERR(btp->bt_task)) |
@@ -1550,7 +1529,7 @@ xfs_alloc_buftarg( | |||
1550 | spin_lock_init(&btp->bt_lru_lock); | 1529 | spin_lock_init(&btp->bt_lru_lock); |
1551 | if (xfs_setsize_buftarg_early(btp, bdev)) | 1530 | if (xfs_setsize_buftarg_early(btp, bdev)) |
1552 | goto error; | 1531 | goto error; |
1553 | if (xfs_alloc_delwrite_queue(btp, fsname)) | 1532 | if (xfs_alloc_delwri_queue(btp, fsname)) |
1554 | goto error; | 1533 | goto error; |
1555 | btp->bt_shrinker.shrink = xfs_buftarg_shrink; | 1534 | btp->bt_shrinker.shrink = xfs_buftarg_shrink; |
1556 | btp->bt_shrinker.seeks = DEFAULT_SEEKS; | 1535 | btp->bt_shrinker.seeks = DEFAULT_SEEKS; |
@@ -1566,56 +1545,48 @@ error: | |||
1566 | /* | 1545 | /* |
1567 | * Delayed write buffer handling | 1546 | * Delayed write buffer handling |
1568 | */ | 1547 | */ |
1569 | STATIC void | 1548 | void |
1570 | xfs_buf_delwri_queue( | 1549 | xfs_buf_delwri_queue( |
1571 | xfs_buf_t *bp, | 1550 | xfs_buf_t *bp) |
1572 | int unlock) | ||
1573 | { | 1551 | { |
1574 | struct list_head *dwq = &bp->b_target->bt_delwrite_queue; | 1552 | struct xfs_buftarg *btp = bp->b_target; |
1575 | spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; | ||
1576 | 1553 | ||
1577 | trace_xfs_buf_delwri_queue(bp, _RET_IP_); | 1554 | trace_xfs_buf_delwri_queue(bp, _RET_IP_); |
1578 | 1555 | ||
1579 | ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); | 1556 | ASSERT(!(bp->b_flags & XBF_READ)); |
1580 | 1557 | ||
1581 | spin_lock(dwlk); | 1558 | spin_lock(&btp->bt_delwri_lock); |
1582 | /* If already in the queue, dequeue and place at tail */ | ||
1583 | if (!list_empty(&bp->b_list)) { | 1559 | if (!list_empty(&bp->b_list)) { |
1560 | /* if already in the queue, move it to the tail */ | ||
1584 | ASSERT(bp->b_flags & _XBF_DELWRI_Q); | 1561 | ASSERT(bp->b_flags & _XBF_DELWRI_Q); |
1585 | if (unlock) | 1562 | list_move_tail(&bp->b_list, &btp->bt_delwri_queue); |
1586 | atomic_dec(&bp->b_hold); | 1563 | } else { |
1587 | list_del(&bp->b_list); | ||
1588 | } | ||
1589 | |||
1590 | if (list_empty(dwq)) { | ||
1591 | /* start xfsbufd as it is about to have something to do */ | 1564 | /* start xfsbufd as it is about to have something to do */ |
1592 | wake_up_process(bp->b_target->bt_task); | 1565 | if (list_empty(&btp->bt_delwri_queue)) |
1593 | } | 1566 | wake_up_process(bp->b_target->bt_task); |
1594 | 1567 | ||
1595 | bp->b_flags |= _XBF_DELWRI_Q; | 1568 | atomic_inc(&bp->b_hold); |
1596 | list_add_tail(&bp->b_list, dwq); | 1569 | bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; |
1570 | list_add_tail(&bp->b_list, &btp->bt_delwri_queue); | ||
1571 | } | ||
1597 | bp->b_queuetime = jiffies; | 1572 | bp->b_queuetime = jiffies; |
1598 | spin_unlock(dwlk); | 1573 | spin_unlock(&btp->bt_delwri_lock); |
1599 | |||
1600 | if (unlock) | ||
1601 | xfs_buf_unlock(bp); | ||
1602 | } | 1574 | } |
1603 | 1575 | ||
1604 | void | 1576 | void |
1605 | xfs_buf_delwri_dequeue( | 1577 | xfs_buf_delwri_dequeue( |
1606 | xfs_buf_t *bp) | 1578 | xfs_buf_t *bp) |
1607 | { | 1579 | { |
1608 | spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; | ||
1609 | int dequeued = 0; | 1580 | int dequeued = 0; |
1610 | 1581 | ||
1611 | spin_lock(dwlk); | 1582 | spin_lock(&bp->b_target->bt_delwri_lock); |
1612 | if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { | 1583 | if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { |
1613 | ASSERT(bp->b_flags & _XBF_DELWRI_Q); | 1584 | ASSERT(bp->b_flags & _XBF_DELWRI_Q); |
1614 | list_del_init(&bp->b_list); | 1585 | list_del_init(&bp->b_list); |
1615 | dequeued = 1; | 1586 | dequeued = 1; |
1616 | } | 1587 | } |
1617 | bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); | 1588 | bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); |
1618 | spin_unlock(dwlk); | 1589 | spin_unlock(&bp->b_target->bt_delwri_lock); |
1619 | 1590 | ||
1620 | if (dequeued) | 1591 | if (dequeued) |
1621 | xfs_buf_rele(bp); | 1592 | xfs_buf_rele(bp); |
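Queueing is now self-contained: xfs_buf_delwri_queue() sets XBF_DELWRI|XBF_ASYNC itself and takes its own hold, and xfs_buf_unlock() no longer queues buffers behind the caller's back. A migration sketch for a caller:

	/* before: flags set by hand, queueing entangled with unlock */
	bp->b_flags |= XBF_DELWRI | XBF_ASYNC;
	xfs_buf_delwri_queue(bp, 1);

	/* after */
	xfs_buf_delwri_queue(bp);
	xfs_buf_unlock(bp);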
@@ -1647,16 +1618,9 @@ xfs_buf_delwri_promote( | |||
1647 | if (bp->b_queuetime < jiffies - age) | 1618 | if (bp->b_queuetime < jiffies - age) |
1648 | return; | 1619 | return; |
1649 | bp->b_queuetime = jiffies - age; | 1620 | bp->b_queuetime = jiffies - age; |
1650 | spin_lock(&btp->bt_delwrite_lock); | 1621 | spin_lock(&btp->bt_delwri_lock); |
1651 | list_move(&bp->b_list, &btp->bt_delwrite_queue); | 1622 | list_move(&bp->b_list, &btp->bt_delwri_queue); |
1652 | spin_unlock(&btp->bt_delwrite_lock); | 1623 | spin_unlock(&btp->bt_delwri_lock); |
1653 | } | ||
1654 | |||
1655 | STATIC void | ||
1656 | xfs_buf_runall_queues( | ||
1657 | struct workqueue_struct *queue) | ||
1658 | { | ||
1659 | flush_workqueue(queue); | ||
1660 | } | 1624 | } |
1661 | 1625 | ||
1662 | /* | 1626 | /* |
@@ -1670,18 +1634,16 @@ xfs_buf_delwri_split( | |||
1670 | unsigned long age) | 1634 | unsigned long age) |
1671 | { | 1635 | { |
1672 | xfs_buf_t *bp, *n; | 1636 | xfs_buf_t *bp, *n; |
1673 | struct list_head *dwq = &target->bt_delwrite_queue; | ||
1674 | spinlock_t *dwlk = &target->bt_delwrite_lock; | ||
1675 | int skipped = 0; | 1637 | int skipped = 0; |
1676 | int force; | 1638 | int force; |
1677 | 1639 | ||
1678 | force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); | 1640 | force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); |
1679 | INIT_LIST_HEAD(list); | 1641 | INIT_LIST_HEAD(list); |
1680 | spin_lock(dwlk); | 1642 | spin_lock(&target->bt_delwri_lock); |
1681 | list_for_each_entry_safe(bp, n, dwq, b_list) { | 1643 | list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { |
1682 | ASSERT(bp->b_flags & XBF_DELWRI); | 1644 | ASSERT(bp->b_flags & XBF_DELWRI); |
1683 | 1645 | ||
1684 | if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { | 1646 | if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { |
1685 | if (!force && | 1647 | if (!force && |
1686 | time_before(jiffies, bp->b_queuetime + age)) { | 1648 | time_before(jiffies, bp->b_queuetime + age)) { |
1687 | xfs_buf_unlock(bp); | 1649 | xfs_buf_unlock(bp); |
@@ -1695,10 +1657,9 @@ xfs_buf_delwri_split( | |||
1695 | } else | 1657 | } else |
1696 | skipped++; | 1658 | skipped++; |
1697 | } | 1659 | } |
1698 | spin_unlock(dwlk); | ||
1699 | 1660 | ||
1661 | spin_unlock(&target->bt_delwri_lock); | ||
1700 | return skipped; | 1662 | return skipped; |
1701 | |||
1702 | } | 1663 | } |
1703 | 1664 | ||
1704 | /* | 1665 | /* |
@@ -1748,7 +1709,7 @@ xfsbufd( | |||
1748 | } | 1709 | } |
1749 | 1710 | ||
1750 | /* sleep for a long time if there is nothing to do. */ | 1711 | /* sleep for a long time if there is nothing to do. */ |
1751 | if (list_empty(&target->bt_delwrite_queue)) | 1712 | if (list_empty(&target->bt_delwri_queue)) |
1752 | tout = MAX_SCHEDULE_TIMEOUT; | 1713 | tout = MAX_SCHEDULE_TIMEOUT; |
1753 | schedule_timeout_interruptible(tout); | 1714 | schedule_timeout_interruptible(tout); |
1754 | 1715 | ||
@@ -1784,9 +1745,7 @@ xfs_flush_buftarg( | |||
1784 | LIST_HEAD(wait_list); | 1745 | LIST_HEAD(wait_list); |
1785 | struct blk_plug plug; | 1746 | struct blk_plug plug; |
1786 | 1747 | ||
1787 | xfs_buf_runall_queues(xfsconvertd_workqueue); | 1748 | flush_workqueue(xfslogd_workqueue); |
1788 | xfs_buf_runall_queues(xfsdatad_workqueue); | ||
1789 | xfs_buf_runall_queues(xfslogd_workqueue); | ||
1790 | 1749 | ||
1791 | set_bit(XBT_FORCE_FLUSH, &target->bt_flags); | 1750 | set_bit(XBT_FORCE_FLUSH, &target->bt_flags); |
1792 | pincount = xfs_buf_delwri_split(target, &tmp_list, 0); | 1751 | pincount = xfs_buf_delwri_split(target, &tmp_list, 0); |
@@ -1867,11 +1826,3 @@ xfs_buf_terminate(void) | |||
1867 | destroy_workqueue(xfslogd_workqueue); | 1826 | destroy_workqueue(xfslogd_workqueue); |
1868 | kmem_zone_destroy(xfs_buf_zone); | 1827 | kmem_zone_destroy(xfs_buf_zone); |
1869 | } | 1828 | } |
1870 | |||
1871 | #ifdef CONFIG_KDB_MODULES | ||
1872 | struct list_head * | ||
1873 | xfs_get_buftarg_list(void) | ||
1874 | { | ||
1875 | return &xfs_buftarg_list; | ||
1876 | } | ||
1877 | #endif | ||
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6a83b46b4bcf..5bab046e859f 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -105,8 +105,8 @@ typedef struct xfs_buftarg { | |||
105 | 105 | ||
106 | /* per device delwri queue */ | 106 | /* per device delwri queue */ |
107 | struct task_struct *bt_task; | 107 | struct task_struct *bt_task; |
108 | struct list_head bt_delwrite_queue; | 108 | struct list_head bt_delwri_queue; |
109 | spinlock_t bt_delwrite_lock; | 109 | spinlock_t bt_delwri_lock; |
110 | unsigned long bt_flags; | 110 | unsigned long bt_flags; |
111 | 111 | ||
112 | /* LRU control structures */ | 112 | /* LRU control structures */ |
@@ -175,7 +175,8 @@ extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, | |||
175 | extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, | 175 | extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, |
176 | xfs_buf_flags_t); | 176 | xfs_buf_flags_t); |
177 | 177 | ||
178 | extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); | 178 | struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, |
179 | xfs_buf_flags_t); | ||
179 | extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); | 180 | extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); |
180 | extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); | 181 | extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); |
181 | extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); | 182 | extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); |
@@ -197,14 +198,14 @@ extern void xfs_buf_unlock(xfs_buf_t *); | |||
197 | ((bp)->b_sema.count <= 0) | 198 | ((bp)->b_sema.count <= 0) |
198 | 199 | ||
199 | /* Buffer Read and Write Routines */ | 200 | /* Buffer Read and Write Routines */ |
200 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); | 201 | extern int xfs_bwrite(struct xfs_buf *bp); |
201 | extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); | ||
202 | 202 | ||
203 | extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); | 203 | extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); |
204 | extern int xfs_bdstrat_cb(struct xfs_buf *); | 204 | extern int xfs_bdstrat_cb(struct xfs_buf *); |
205 | 205 | ||
206 | extern void xfs_buf_ioend(xfs_buf_t *, int); | 206 | extern void xfs_buf_ioend(xfs_buf_t *, int); |
207 | extern void xfs_buf_ioerror(xfs_buf_t *, int); | 207 | extern void xfs_buf_ioerror(xfs_buf_t *, int); |
208 | extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); | ||
208 | extern int xfs_buf_iorequest(xfs_buf_t *); | 209 | extern int xfs_buf_iorequest(xfs_buf_t *); |
209 | extern int xfs_buf_iowait(xfs_buf_t *); | 210 | extern int xfs_buf_iowait(xfs_buf_t *); |
210 | extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, | 211 | extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, |
@@ -221,53 +222,32 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) | |||
221 | extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); | 222 | extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); |
222 | 223 | ||
223 | /* Delayed Write Buffer Routines */ | 224 | /* Delayed Write Buffer Routines */ |
224 | extern void xfs_buf_delwri_dequeue(xfs_buf_t *); | 225 | extern void xfs_buf_delwri_queue(struct xfs_buf *); |
225 | extern void xfs_buf_delwri_promote(xfs_buf_t *); | 226 | extern void xfs_buf_delwri_dequeue(struct xfs_buf *); |
227 | extern void xfs_buf_delwri_promote(struct xfs_buf *); | ||
226 | 228 | ||
227 | /* Buffer Daemon Setup Routines */ | 229 | /* Buffer Daemon Setup Routines */ |
228 | extern int xfs_buf_init(void); | 230 | extern int xfs_buf_init(void); |
229 | extern void xfs_buf_terminate(void); | 231 | extern void xfs_buf_terminate(void); |
230 | 232 | ||
231 | #define xfs_buf_target_name(target) \ | ||
232 | ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) | ||
233 | |||
234 | |||
235 | #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) | ||
236 | #define XFS_BUF_ZEROFLAGS(bp) \ | 233 | #define XFS_BUF_ZEROFLAGS(bp) \ |
237 | ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ | 234 | ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ |
238 | XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) | 235 | XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) |
239 | 236 | ||
240 | void xfs_buf_stale(struct xfs_buf *bp); | 237 | void xfs_buf_stale(struct xfs_buf *bp); |
241 | #define XFS_BUF_STALE(bp) xfs_buf_stale(bp); | ||
242 | #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) | 238 | #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) |
243 | #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) | 239 | #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) |
244 | #define XFS_BUF_SUPER_STALE(bp) do { \ | ||
245 | XFS_BUF_STALE(bp); \ | ||
246 | xfs_buf_delwri_dequeue(bp); \ | ||
247 | XFS_BUF_DONE(bp); \ | ||
248 | } while (0) | ||
249 | |||
250 | #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) | ||
251 | #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) | ||
252 | #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) | ||
253 | 240 | ||
254 | #define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) | 241 | #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) |
255 | #define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) | ||
256 | #define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) | ||
257 | 242 | ||
258 | #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) | 243 | #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) |
259 | #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) | 244 | #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) |
260 | #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) | 245 | #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) |
261 | 246 | ||
262 | #define XFS_BUF_BUSY(bp) do { } while (0) | ||
263 | #define XFS_BUF_UNBUSY(bp) do { } while (0) | ||
264 | #define XFS_BUF_ISBUSY(bp) (1) | ||
265 | |||
266 | #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) | 247 | #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) |
267 | #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) | 248 | #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) |
268 | #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) | 249 | #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) |
269 | 250 | ||
270 | #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) | ||
271 | #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) | 251 | #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) |
272 | #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) | 252 | #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) |
273 | #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) | 253 | #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) |
@@ -276,10 +256,6 @@ void xfs_buf_stale(struct xfs_buf *bp); | |||
276 | #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) | 256 | #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) |
277 | #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) | 257 | #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) |
278 | 258 | ||
279 | #define XFS_BUF_SET_START(bp) do { } while (0) | ||
280 | |||
281 | #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) | ||
282 | #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) | ||
283 | #define XFS_BUF_ADDR(bp) ((bp)->b_bn) | 259 | #define XFS_BUF_ADDR(bp) ((bp)->b_bn) |
284 | #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) | 260 | #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) |
285 | #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) | 261 | #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) |
@@ -289,23 +265,15 @@ void xfs_buf_stale(struct xfs_buf *bp); | |||
289 | #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) | 265 | #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) |
290 | #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) | 266 | #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) |
291 | 267 | ||
292 | static inline void | 268 | static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) |
293 | xfs_buf_set_ref( | ||
294 | struct xfs_buf *bp, | ||
295 | int lru_ref) | ||
296 | { | 269 | { |
297 | atomic_set(&bp->b_lru_ref, lru_ref); | 270 | atomic_set(&bp->b_lru_ref, lru_ref); |
298 | } | 271 | } |
299 | #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) | ||
300 | #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) | ||
301 | |||
302 | #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) | ||
303 | 272 | ||
304 | #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); | 273 | static inline int xfs_buf_ispinned(struct xfs_buf *bp) |
305 | 274 | { | |
306 | #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) | 275 | return atomic_read(&bp->b_pin_count); |
307 | #define XFS_BUF_TARGET(bp) ((bp)->b_target) | 276 | } |
308 | #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) | ||
309 | 277 | ||
310 | static inline void xfs_buf_relse(xfs_buf_t *bp) | 278 | static inline void xfs_buf_relse(xfs_buf_t *bp) |
311 | { | 279 | { |
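Replacing the flag macros with typed inlines means the compiler checks the buffer argument; call sites convert one-for-one, as the xfs_buf_item.c hunk below shows:

	if (xfs_buf_ispinned(bp))	/* was: XFS_BUF_ISPINNED(bp) */
		return XFS_ITEM_PINNED;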
@@ -323,14 +291,7 @@ extern void xfs_wait_buftarg(xfs_buftarg_t *); | |||
323 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); | 291 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); |
324 | extern int xfs_flush_buftarg(xfs_buftarg_t *, int); | 292 | extern int xfs_flush_buftarg(xfs_buftarg_t *, int); |
325 | 293 | ||
326 | #ifdef CONFIG_KDB_MODULES | ||
327 | extern struct list_head *xfs_get_buftarg_list(void); | ||
328 | #endif | ||
329 | |||
330 | #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) | 294 | #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) |
331 | #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) | 295 | #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) |
332 | 296 | ||
333 | #define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) | ||
334 | #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) | ||
335 | |||
336 | #endif /* __XFS_BUF_H__ */ | 297 | #endif /* __XFS_BUF_H__ */ |
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 88492916c3dc..1a3513881bce 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -124,9 +124,9 @@ xfs_buf_item_log_check( | |||
124 | 124 | ||
125 | bp = bip->bli_buf; | 125 | bp = bip->bli_buf; |
126 | ASSERT(XFS_BUF_COUNT(bp) > 0); | 126 | ASSERT(XFS_BUF_COUNT(bp) > 0); |
127 | ASSERT(XFS_BUF_PTR(bp) != NULL); | 127 | ASSERT(bp->b_addr != NULL); |
128 | orig = bip->bli_orig; | 128 | orig = bip->bli_orig; |
129 | buffer = XFS_BUF_PTR(bp); | 129 | buffer = bp->b_addr; |
130 | for (x = 0; x < XFS_BUF_COUNT(bp); x++) { | 130 | for (x = 0; x < XFS_BUF_COUNT(bp); x++) { |
131 | if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { | 131 | if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { |
132 | xfs_emerg(bp->b_mount, | 132 | xfs_emerg(bp->b_mount, |
@@ -371,7 +371,6 @@ xfs_buf_item_pin( | |||
371 | { | 371 | { |
372 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); | 372 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); |
373 | 373 | ||
374 | ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); | ||
375 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 374 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
376 | ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || | 375 | ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || |
377 | (bip->bli_flags & XFS_BLI_STALE)); | 376 | (bip->bli_flags & XFS_BLI_STALE)); |
@@ -479,13 +478,13 @@ xfs_buf_item_trylock( | |||
479 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); | 478 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); |
480 | struct xfs_buf *bp = bip->bli_buf; | 479 | struct xfs_buf *bp = bip->bli_buf; |
481 | 480 | ||
482 | if (XFS_BUF_ISPINNED(bp)) | 481 | if (xfs_buf_ispinned(bp)) |
483 | return XFS_ITEM_PINNED; | 482 | return XFS_ITEM_PINNED; |
484 | if (!xfs_buf_trylock(bp)) | 483 | if (!xfs_buf_trylock(bp)) |
485 | return XFS_ITEM_LOCKED; | 484 | return XFS_ITEM_LOCKED; |
486 | 485 | ||
487 | /* take a reference to the buffer. */ | 486 | /* take a reference to the buffer. */ |
488 | XFS_BUF_HOLD(bp); | 487 | xfs_buf_hold(bp); |
489 | 488 | ||
490 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 489 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
491 | trace_xfs_buf_item_trylock(bip); | 490 | trace_xfs_buf_item_trylock(bip); |
@@ -630,7 +629,7 @@ xfs_buf_item_push( | |||
630 | * the xfsbufd to get this buffer written. We have to unlock the buffer | 629 | * the xfsbufd to get this buffer written. We have to unlock the buffer |
631 | * to allow the xfsbufd to write it, too. | 630 | * to allow the xfsbufd to write it, too. |
632 | */ | 631 | */ |
633 | STATIC void | 632 | STATIC bool |
634 | xfs_buf_item_pushbuf( | 633 | xfs_buf_item_pushbuf( |
635 | struct xfs_log_item *lip) | 634 | struct xfs_log_item *lip) |
636 | { | 635 | { |
@@ -644,6 +643,7 @@ xfs_buf_item_pushbuf( | |||
644 | 643 | ||
645 | xfs_buf_delwri_promote(bp); | 644 | xfs_buf_delwri_promote(bp); |
646 | xfs_buf_relse(bp); | 645 | xfs_buf_relse(bp); |
646 | return true; | ||
647 | } | 647 | } |
648 | 648 | ||
649 | STATIC void | 649 | STATIC void |
@@ -726,7 +726,7 @@ xfs_buf_item_init( | |||
726 | * to have logged. | 726 | * to have logged. |
727 | */ | 727 | */ |
728 | bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); | 728 | bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); |
729 | memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); | 729 | memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); |
730 | bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); | 730 | bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); |
731 | #endif | 731 | #endif |
732 | 732 | ||
@@ -895,7 +895,6 @@ xfs_buf_attach_iodone( | |||
895 | { | 895 | { |
896 | xfs_log_item_t *head_lip; | 896 | xfs_log_item_t *head_lip; |
897 | 897 | ||
898 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
899 | ASSERT(xfs_buf_islocked(bp)); | 898 | ASSERT(xfs_buf_islocked(bp)); |
900 | 899 | ||
901 | lip->li_cb = cb; | 900 | lip->li_cb = cb; |
@@ -960,7 +959,7 @@ xfs_buf_iodone_callbacks( | |||
960 | static ulong lasttime; | 959 | static ulong lasttime; |
961 | static xfs_buftarg_t *lasttarg; | 960 | static xfs_buftarg_t *lasttarg; |
962 | 961 | ||
963 | if (likely(!XFS_BUF_GETERROR(bp))) | 962 | if (likely(!xfs_buf_geterror(bp))) |
964 | goto do_callbacks; | 963 | goto do_callbacks; |
965 | 964 | ||
966 | /* | 965 | /* |
@@ -968,19 +967,18 @@ xfs_buf_iodone_callbacks( | |||
968 | * I/O errors, there's no point in giving this a retry. | 967 | * I/O errors, there's no point in giving this a retry. |
969 | */ | 968 | */ |
970 | if (XFS_FORCED_SHUTDOWN(mp)) { | 969 | if (XFS_FORCED_SHUTDOWN(mp)) { |
971 | XFS_BUF_SUPER_STALE(bp); | 970 | xfs_buf_stale(bp); |
971 | XFS_BUF_DONE(bp); | ||
972 | trace_xfs_buf_item_iodone(bp, _RET_IP_); | 972 | trace_xfs_buf_item_iodone(bp, _RET_IP_); |
973 | goto do_callbacks; | 973 | goto do_callbacks; |
974 | } | 974 | } |
975 | 975 | ||
976 | if (XFS_BUF_TARGET(bp) != lasttarg || | 976 | if (bp->b_target != lasttarg || |
977 | time_after(jiffies, (lasttime + 5*HZ))) { | 977 | time_after(jiffies, (lasttime + 5*HZ))) { |
978 | lasttime = jiffies; | 978 | lasttime = jiffies; |
979 | xfs_alert(mp, "Device %s: metadata write error block 0x%llx", | 979 | xfs_buf_ioerror_alert(bp, __func__); |
980 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), | ||
981 | (__uint64_t)XFS_BUF_ADDR(bp)); | ||
982 | } | 980 | } |
983 | lasttarg = XFS_BUF_TARGET(bp); | 981 | lasttarg = bp->b_target; |
984 | 982 | ||
985 | /* | 983 | /* |
986 | * If the write was asynchronous then no one will be looking for the | 984 | * If the write was asynchronous then no one will be looking for the |
@@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks( | |||
991 | * around. | 989 | * around. |
992 | */ | 990 | */ |
993 | if (XFS_BUF_ISASYNC(bp)) { | 991 | if (XFS_BUF_ISASYNC(bp)) { |
994 | XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ | 992 | xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ |
995 | 993 | ||
996 | if (!XFS_BUF_ISSTALE(bp)) { | 994 | if (!XFS_BUF_ISSTALE(bp)) { |
997 | XFS_BUF_DELAYWRITE(bp); | 995 | xfs_buf_delwri_queue(bp); |
998 | XFS_BUF_DONE(bp); | 996 | XFS_BUF_DONE(bp); |
999 | XFS_BUF_SET_START(bp); | ||
1000 | } | 997 | } |
1001 | ASSERT(bp->b_iodone != NULL); | 998 | ASSERT(bp->b_iodone != NULL); |
1002 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); | 999 | trace_xfs_buf_item_iodone_async(bp, _RET_IP_); |
@@ -1008,12 +1005,10 @@ xfs_buf_iodone_callbacks( | |||
1008 | * If the write of the buffer was synchronous, we want to make | 1005 | * If the write of the buffer was synchronous, we want to make |
1009 | * sure to return the error to the caller of xfs_bwrite(). | 1006 | * sure to return the error to the caller of xfs_bwrite(). |
1010 | */ | 1007 | */ |
1011 | XFS_BUF_STALE(bp); | 1008 | xfs_buf_stale(bp); |
1012 | XFS_BUF_DONE(bp); | 1009 | XFS_BUF_DONE(bp); |
1013 | XFS_BUF_UNDELAYWRITE(bp); | ||
1014 | 1010 | ||
1015 | trace_xfs_buf_error_relse(bp, _RET_IP_); | 1011 | trace_xfs_buf_error_relse(bp, _RET_IP_); |
1016 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); | ||
1017 | 1012 | ||
1018 | do_callbacks: | 1013 | do_callbacks: |
1019 | xfs_buf_do_callbacks(bp); | 1014 | xfs_buf_do_callbacks(bp); |
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 5bfcb8779f9f..77c74257c2a3 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1578,9 +1578,8 @@ xfs_da_grow_inode_int( | |||
1578 | */ | 1578 | */ |
1579 | nmap = 1; | 1579 | nmap = 1; |
1580 | ASSERT(args->firstblock != NULL); | 1580 | ASSERT(args->firstblock != NULL); |
1581 | error = xfs_bmapi(tp, dp, *bno, count, | 1581 | error = xfs_bmapi_write(tp, dp, *bno, count, |
1582 | xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| | 1582 | xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, |
1583 | XFS_BMAPI_CONTIG, | ||
1584 | args->firstblock, args->total, &map, &nmap, | 1583 | args->firstblock, args->total, &map, &nmap, |
1585 | args->flist); | 1584 | args->flist); |
1586 | if (error) | 1585 | if (error) |
@@ -1602,9 +1601,8 @@ xfs_da_grow_inode_int( | |||
1602 | for (b = *bno, mapi = 0; b < *bno + count; ) { | 1601 | for (b = *bno, mapi = 0; b < *bno + count; ) { |
1603 | nmap = MIN(XFS_BMAP_MAX_NMAP, count); | 1602 | nmap = MIN(XFS_BMAP_MAX_NMAP, count); |
1604 | c = (int)(*bno + count - b); | 1603 | c = (int)(*bno + count - b); |
1605 | error = xfs_bmapi(tp, dp, b, c, | 1604 | error = xfs_bmapi_write(tp, dp, b, c, |
1606 | xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| | 1605 | xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, |
1607 | XFS_BMAPI_METADATA, | ||
1608 | args->firstblock, args->total, | 1606 | args->firstblock, args->total, |
1609 | &mapp[mapi], &nmap, args->flist); | 1607 | &mapp[mapi], &nmap, args->flist); |
1610 | if (error) | 1608 | if (error) |
@@ -1975,33 +1973,16 @@ xfs_da_do_buf( | |||
1975 | /* | 1973 | /* |
1976 | * Optimize the one-block case. | 1974 | * Optimize the one-block case. |
1977 | */ | 1975 | */ |
1978 | if (nfsb == 1) { | 1976 | if (nfsb == 1) |
1979 | xfs_fsblock_t fsb; | ||
1980 | |||
1981 | if ((error = | ||
1982 | xfs_bmapi_single(trans, dp, whichfork, &fsb, | ||
1983 | (xfs_fileoff_t)bno))) { | ||
1984 | return error; | ||
1985 | } | ||
1986 | mapp = ↦ | 1977 | mapp = ↦ |
1987 | if (fsb == NULLFSBLOCK) { | 1978 | else |
1988 | nmap = 0; | ||
1989 | } else { | ||
1990 | map.br_startblock = fsb; | ||
1991 | map.br_startoff = (xfs_fileoff_t)bno; | ||
1992 | map.br_blockcount = 1; | ||
1993 | nmap = 1; | ||
1994 | } | ||
1995 | } else { | ||
1996 | mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); | 1979 | mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); |
1997 | nmap = nfsb; | 1980 | |
1998 | if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, | 1981 | nmap = nfsb; |
1999 | nfsb, | 1982 | error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, mapp, |
2000 | XFS_BMAPI_METADATA | | 1983 | &nmap, xfs_bmapi_aflag(whichfork)); |
2001 | xfs_bmapi_aflag(whichfork), | 1984 | if (error) |
2002 | NULL, 0, mapp, &nmap, NULL))) | 1985 | goto exit0; |
2003 | goto exit0; | ||
2004 | } | ||
2005 | } else { | 1986 | } else { |
2006 | map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); | 1987 | map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); |
2007 | map.br_startoff = (xfs_fileoff_t)bno; | 1988 | map.br_startoff = (xfs_fileoff_t)bno; |
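Dropping the one-block special case works because the general read path covers it; a hedged equivalent of the removed xfs_bmapi_single() call is a single-extent xfs_bmapi_read():

	struct xfs_bmbt_irec	map;
	int			nmap = 1;

	error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, 1, &map, &nmap,
			       xfs_bmapi_aflag(whichfork));
	/* a hole comes back as nmap == 0 or a HOLESTARTBLOCK mapping */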
@@ -2050,7 +2031,7 @@ xfs_da_do_buf( | |||
2050 | case 0: | 2031 | case 0: |
2051 | bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, | 2032 | bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, |
2052 | mappedbno, nmapped, 0); | 2033 | mappedbno, nmapped, 0); |
2053 | error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); | 2034 | error = bp ? bp->b_error : XFS_ERROR(EIO); |
2054 | break; | 2035 | break; |
2055 | case 1: | 2036 | case 1: |
2056 | case 2: | 2037 | case 2: |
@@ -2072,13 +2053,10 @@ xfs_da_do_buf( | |||
2072 | if (!bp) | 2053 | if (!bp) |
2073 | continue; | 2054 | continue; |
2074 | if (caller == 1) { | 2055 | if (caller == 1) { |
2075 | if (whichfork == XFS_ATTR_FORK) { | 2056 | if (whichfork == XFS_ATTR_FORK) |
2076 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, | 2057 | xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); |
2077 | XFS_ATTR_BTREE_REF); | 2058 | else |
2078 | } else { | 2059 | xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); |
2079 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, | ||
2080 | XFS_DIR_BTREE_REF); | ||
2081 | } | ||
2082 | } | 2060 | } |
2083 | if (bplist) { | 2061 | if (bplist) { |
2084 | bplist[nbplist++] = bp; | 2062 | bplist[nbplist++] = bp; |
@@ -2268,7 +2246,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) | |||
2268 | dabuf->nbuf = 1; | 2246 | dabuf->nbuf = 1; |
2269 | bp = bps[0]; | 2247 | bp = bps[0]; |
2270 | dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); | 2248 | dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); |
2271 | dabuf->data = XFS_BUF_PTR(bp); | 2249 | dabuf->data = bp->b_addr; |
2272 | dabuf->bps[0] = bp; | 2250 | dabuf->bps[0] = bp; |
2273 | } else { | 2251 | } else { |
2274 | dabuf->nbuf = nbuf; | 2252 | dabuf->nbuf = nbuf; |
@@ -2279,7 +2257,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) | |||
2279 | dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); | 2257 | dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); |
2280 | for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { | 2258 | for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { |
2281 | bp = bps[i]; | 2259 | bp = bps[i]; |
2282 | memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), | 2260 | memcpy((char *)dabuf->data + off, bp->b_addr, |
2283 | XFS_BUF_COUNT(bp)); | 2261 | XFS_BUF_COUNT(bp)); |
2284 | } | 2262 | } |
2285 | } | 2263 | } |
@@ -2302,8 +2280,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) | |||
2302 | for (i = off = 0; i < dabuf->nbuf; | 2280 | for (i = off = 0; i < dabuf->nbuf; |
2303 | i++, off += XFS_BUF_COUNT(bp)) { | 2281 | i++, off += XFS_BUF_COUNT(bp)) { |
2304 | bp = dabuf->bps[i]; | 2282 | bp = dabuf->bps[i]; |
2305 | memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, | 2283 | memcpy(bp->b_addr, dabuf->data + off, |
2306 | XFS_BUF_COUNT(bp)); | 2284 | XFS_BUF_COUNT(bp)); |
2307 | } | 2285 | } |
2308 | } | 2286 | } |
2309 | } | 2287 | } |
@@ -2340,7 +2318,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) | |||
2340 | 2318 | ||
2341 | ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); | 2319 | ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); |
2342 | if (dabuf->nbuf == 1) { | 2320 | if (dabuf->nbuf == 1) { |
2343 | ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); | 2321 | ASSERT(dabuf->data == dabuf->bps[0]->b_addr); |
2344 | xfs_trans_log_buf(tp, dabuf->bps[0], first, last); | 2322 | xfs_trans_log_buf(tp, dabuf->bps[0], first, last); |
2345 | return; | 2323 | return; |
2346 | } | 2324 | } |
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 9a84a85c03b1..654dc6f05bac 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
@@ -425,8 +425,8 @@ xfs_swap_extents( | |||
425 | } | 425 | } |
426 | 426 | ||
427 | 427 | ||
428 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | 428 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); |
429 | xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | 429 | xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); |
430 | 430 | ||
431 | xfs_trans_log_inode(tp, ip, ilf_fields); | 431 | xfs_trans_log_inode(tp, ip, ilf_fields); |
432 | xfs_trans_log_inode(tp, tip, tilf_fields); | 432 | xfs_trans_log_inode(tp, tip, tilf_fields); |
@@ -438,7 +438,7 @@ xfs_swap_extents( | |||
438 | if (mp->m_flags & XFS_MOUNT_WSYNC) | 438 | if (mp->m_flags & XFS_MOUNT_WSYNC) |
439 | xfs_trans_set_sync(tp); | 439 | xfs_trans_set_sync(tp); |
440 | 440 | ||
441 | error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); | 441 | error = xfs_trans_commit(tp, 0); |
442 | 442 | ||
443 | trace_xfs_swap_extent_after(ip, 0); | 443 | trace_xfs_swap_extent_after(ip, 0); |
444 | trace_xfs_swap_extent_after(tip, 1); | 444 | trace_xfs_swap_extent_after(tip, 1); |
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index dffba9ba0db6..a3721633abc8 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h | |||
@@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt { | |||
148 | be32_to_cpu((dip)->di_nextents) : \ | 148 | be32_to_cpu((dip)->di_nextents) : \ |
149 | be16_to_cpu((dip)->di_anextents)) | 149 | be16_to_cpu((dip)->di_anextents)) |
150 | 150 | ||
151 | #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) | 151 | #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * For block and character special files the 32bit dev_t is stored at the | 154 | * For block and character special files the 32bit dev_t is stored at the |
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ca2386d82cdf..66e108f561a3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
@@ -888,12 +888,10 @@ xfs_dir2_leaf_getdents( | |||
888 | * we already have in the table. | 888 | * we already have in the table. |
889 | */ | 889 | */ |
890 | nmap = map_size - map_valid; | 890 | nmap = map_size - map_valid; |
891 | error = xfs_bmapi(NULL, dp, | 891 | error = xfs_bmapi_read(dp, map_off, |
892 | map_off, | ||
893 | xfs_dir2_byte_to_da(mp, | 892 | xfs_dir2_byte_to_da(mp, |
894 | XFS_DIR2_LEAF_OFFSET) - map_off, | 893 | XFS_DIR2_LEAF_OFFSET) - map_off, |
895 | XFS_BMAPI_METADATA, NULL, 0, | 894 | &map[map_valid], &nmap, 0); |
896 | &map[map_valid], &nmap, NULL); | ||
897 | /* | 895 | /* |
898 | * Don't know if we should ignore this or | 896 | * Don't know if we should ignore this or |
899 | * try to return an error. | 897 | * try to return an error. |
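The hunk above is one instance of a conversion repeated throughout this series: read-only xfs_bmapi() calls become xfs_bmapi_read(), which drops the transaction, firstblock, freelist and delalloc-total plumbing that only allocating callers need. A condensed sketch of the new idiom, where offset_fsb and count_fsb are hypothetical placeholder values:

    struct xfs_bmbt_irec    map;
    int                     nmap = 1;
    int                     error;

    /* read-only lookup: no transaction, no allocation arguments */
    error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &map, &nmap, 0);
    if (error)
        return error;
    if (!nmap || map.br_startblock == HOLESTARTBLOCK) {
        /* nothing is mapped over the range: treat it as a hole */
    }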
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797dae32..8a24f0c6c860 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/xfs_discard.c | |||
@@ -38,7 +38,7 @@ xfs_trim_extents( | |||
38 | struct xfs_mount *mp, | 38 | struct xfs_mount *mp, |
39 | xfs_agnumber_t agno, | 39 | xfs_agnumber_t agno, |
40 | xfs_fsblock_t start, | 40 | xfs_fsblock_t start, |
41 | xfs_fsblock_t len, | 41 | xfs_fsblock_t end, |
42 | xfs_fsblock_t minlen, | 42 | xfs_fsblock_t minlen, |
43 | __uint64_t *blocks_trimmed) | 43 | __uint64_t *blocks_trimmed) |
44 | { | 44 | { |
@@ -100,7 +100,7 @@ xfs_trim_extents( | |||
100 | * down partially overlapping ranges for now. | 100 | * down partially overlapping ranges for now. |
101 | */ | 101 | */ |
102 | if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || | 102 | if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start || |
103 | XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) { | 103 | XFS_AGB_TO_FSB(mp, agno, fbno) > end) { |
104 | trace_xfs_discard_exclude(mp, agno, fbno, flen); | 104 | trace_xfs_discard_exclude(mp, agno, fbno, flen); |
105 | goto next_extent; | 105 | goto next_extent; |
106 | } | 106 | } |
@@ -145,7 +145,7 @@ xfs_ioc_trim( | |||
145 | struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; | 145 | struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; |
146 | unsigned int granularity = q->limits.discard_granularity; | 146 | unsigned int granularity = q->limits.discard_granularity; |
147 | struct fstrim_range range; | 147 | struct fstrim_range range; |
148 | xfs_fsblock_t start, len, minlen; | 148 | xfs_fsblock_t start, end, minlen; |
149 | xfs_agnumber_t start_agno, end_agno, agno; | 149 | xfs_agnumber_t start_agno, end_agno, agno; |
150 | __uint64_t blocks_trimmed = 0; | 150 | __uint64_t blocks_trimmed = 0; |
151 | int error, last_error = 0; | 151 | int error, last_error = 0; |
@@ -165,19 +165,19 @@ xfs_ioc_trim( | |||
165 | * matter as trimming blocks is an advisory interface. | 165 | * matter as trimming blocks is an advisory interface. |
166 | */ | 166 | */ |
167 | start = XFS_B_TO_FSBT(mp, range.start); | 167 | start = XFS_B_TO_FSBT(mp, range.start); |
168 | len = XFS_B_TO_FSBT(mp, range.len); | 168 | end = start + XFS_B_TO_FSBT(mp, range.len) - 1; |
169 | minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); | 169 | minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen)); |
170 | 170 | ||
171 | start_agno = XFS_FSB_TO_AGNO(mp, start); | 171 | if (start >= mp->m_sb.sb_dblocks) |
172 | if (start_agno >= mp->m_sb.sb_agcount) | ||
173 | return -XFS_ERROR(EINVAL); | 172 | return -XFS_ERROR(EINVAL); |
173 | if (end > mp->m_sb.sb_dblocks - 1) | ||
174 | end = mp->m_sb.sb_dblocks - 1; | ||
174 | 175 | ||
175 | end_agno = XFS_FSB_TO_AGNO(mp, start + len); | 176 | start_agno = XFS_FSB_TO_AGNO(mp, start); |
176 | if (end_agno >= mp->m_sb.sb_agcount) | 177 | end_agno = XFS_FSB_TO_AGNO(mp, end); |
177 | end_agno = mp->m_sb.sb_agcount - 1; | ||
178 | 178 | ||
179 | for (agno = start_agno; agno <= end_agno; agno++) { | 179 | for (agno = start_agno; agno <= end_agno; agno++) { |
180 | error = -xfs_trim_extents(mp, agno, start, len, minlen, | 180 | error = -xfs_trim_extents(mp, agno, start, end, minlen, |
181 | &blocks_trimmed); | 181 | &blocks_trimmed); |
182 | if (error) | 182 | if (error) |
183 | last_error = error; | 183 | last_error = error; |
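The conversion above turns the byte range from userspace into an inclusive block range [start, end] clamped to the device size, so the per-AG filter reduces to a single `> end` comparison and the last allocation group is never silently skipped. A condensed sketch of the arithmetic, assuming `range` has already been copied in from userspace:

    xfs_fsblock_t   start, end, minlen;

    /* convert bytes to filesystem blocks, rounding down */
    start = XFS_B_TO_FSBT(mp, range.start);
    end = start + XFS_B_TO_FSBT(mp, range.len) - 1;
    minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));

    if (start >= mp->m_sb.sb_dblocks)
        return -XFS_ERROR(EINVAL);      /* range begins past the device */
    if (end > mp->m_sb.sb_dblocks - 1)
        end = mp->m_sb.sb_dblocks - 1;  /* clamp to the last block */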
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h index 344879aea646..344879aea646 100644 --- a/fs/xfs/linux-2.6/xfs_discard.h +++ b/fs/xfs/xfs_discard.h | |||
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 837f31158d43..25d7280e9f6b 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c | |||
@@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk( | |||
318 | int curid, i; | 318 | int curid, i; |
319 | 319 | ||
320 | ASSERT(tp); | 320 | ASSERT(tp); |
321 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
322 | ASSERT(xfs_buf_islocked(bp)); | 321 | ASSERT(xfs_buf_islocked(bp)); |
323 | 322 | ||
324 | d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); | 323 | d = bp->b_addr; |
325 | 324 | ||
326 | /* | 325 | /* |
327 | * ID of the first dquot in the block - id's are zero based. | 326 | * ID of the first dquot in the block - id's are zero based. |
@@ -378,16 +377,14 @@ xfs_qm_dqalloc( | |||
378 | return (ESRCH); | 377 | return (ESRCH); |
379 | } | 378 | } |
380 | 379 | ||
381 | xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); | 380 | xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); |
382 | nmaps = 1; | 381 | nmaps = 1; |
383 | if ((error = xfs_bmapi(tp, quotip, | 382 | error = xfs_bmapi_write(tp, quotip, offset_fsb, |
384 | offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, | 383 | XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, |
385 | XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, | 384 | &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), |
386 | &firstblock, | 385 | &map, &nmaps, &flist); |
387 | XFS_QM_DQALLOC_SPACE_RES(mp), | 386 | if (error) |
388 | &map, &nmaps, &flist))) { | ||
389 | goto error0; | 387 | goto error0; |
390 | } | ||
391 | ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); | 388 | ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); |
392 | ASSERT(nmaps == 1); | 389 | ASSERT(nmaps == 1); |
393 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && | 390 | ASSERT((map.br_startblock != DELAYSTARTBLOCK) && |
@@ -403,8 +400,11 @@ xfs_qm_dqalloc( | |||
403 | dqp->q_blkno, | 400 | dqp->q_blkno, |
404 | mp->m_quotainfo->qi_dqchunklen, | 401 | mp->m_quotainfo->qi_dqchunklen, |
405 | 0); | 402 | 0); |
406 | if (!bp || (error = XFS_BUF_GETERROR(bp))) | 403 | |
404 | error = xfs_buf_geterror(bp); | ||
405 | if (error) | ||
407 | goto error1; | 406 | goto error1; |
407 | |||
408 | /* | 408 | /* |
409 | * Make a chunk of dquots out of this buffer and log | 409 | * Make a chunk of dquots out of this buffer and log |
410 | * the entire thing. | 410 | * the entire thing. |
@@ -486,9 +486,8 @@ xfs_qm_dqtobp( | |||
486 | /* | 486 | /* |
487 | * Find the block map; no allocations yet | 487 | * Find the block map; no allocations yet |
488 | */ | 488 | */ |
489 | error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, | 489 | error = xfs_bmapi_read(quotip, dqp->q_fileoffset, |
490 | XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, | 490 | XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); |
491 | NULL, 0, &map, &nmaps, NULL); | ||
492 | 491 | ||
493 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); | 492 | xfs_iunlock(quotip, XFS_ILOCK_SHARED); |
494 | if (error) | 493 | if (error) |
@@ -534,13 +533,12 @@ xfs_qm_dqtobp( | |||
534 | return XFS_ERROR(error); | 533 | return XFS_ERROR(error); |
535 | } | 534 | } |
536 | 535 | ||
537 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
538 | ASSERT(xfs_buf_islocked(bp)); | 536 | ASSERT(xfs_buf_islocked(bp)); |
539 | 537 | ||
540 | /* | 538 | /* |
541 | * calculate the location of the dquot inside the buffer. | 539 | * calculate the location of the dquot inside the buffer. |
542 | */ | 540 | */ |
543 | ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); | 541 | ddq = bp->b_addr + dqp->q_bufoffset; |
544 | 542 | ||
545 | /* | 543 | /* |
546 | * A simple sanity check in case we got a corrupted dquot... | 544 | * A simple sanity check in case we got a corrupted dquot... |
@@ -553,7 +551,6 @@ xfs_qm_dqtobp( | |||
553 | xfs_trans_brelse(tp, bp); | 551 | xfs_trans_brelse(tp, bp); |
554 | return XFS_ERROR(EIO); | 552 | return XFS_ERROR(EIO); |
555 | } | 553 | } |
556 | XFS_BUF_BUSY(bp); /* We dirtied this */ | ||
557 | } | 554 | } |
558 | 555 | ||
559 | *O_bpp = bp; | 556 | *O_bpp = bp; |
@@ -608,7 +605,7 @@ xfs_qm_dqread( | |||
608 | dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); | 605 | dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); |
609 | 606 | ||
610 | /* Mark the buf so that this will stay incore a little longer */ | 607 | /* Mark the buf so that this will stay incore a little longer */ |
611 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); | 608 | xfs_buf_set_ref(bp, XFS_DQUOT_REF); |
612 | 609 | ||
613 | /* | 610 | /* |
614 | * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) | 611 | * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) |
@@ -622,7 +619,6 @@ xfs_qm_dqread( | |||
622 | * this particular dquot was repaired. We still aren't afraid to | 619 | * this particular dquot was repaired. We still aren't afraid to |
623 | * brelse it because we have the changes incore. | 620 | * brelse it because we have the changes incore. |
624 | */ | 621 | */ |
625 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
626 | ASSERT(xfs_buf_islocked(bp)); | 622 | ASSERT(xfs_buf_islocked(bp)); |
627 | xfs_trans_brelse(tp, bp); | 623 | xfs_trans_brelse(tp, bp); |
628 | 624 | ||
@@ -1204,7 +1200,7 @@ xfs_qm_dqflush( | |||
1204 | /* | 1200 | /* |
1205 | * Calculate the location of the dquot inside the buffer. | 1201 | * Calculate the location of the dquot inside the buffer. |
1206 | */ | 1202 | */ |
1207 | ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); | 1203 | ddqp = bp->b_addr + dqp->q_bufoffset; |
1208 | 1204 | ||
1209 | /* | 1205 | /* |
1210 | * A simple sanity check in case we got a corrupted dquot.. | 1206 | * A simple sanity check in case we got a corrupted dquot.. |
@@ -1240,15 +1236,17 @@ xfs_qm_dqflush( | |||
1240 | * If the buffer is pinned then push on the log so we won't | 1236 | * If the buffer is pinned then push on the log so we won't |
1241 | * get stuck waiting in the write for too long. | 1237 | * get stuck waiting in the write for too long. |
1242 | */ | 1238 | */ |
1243 | if (XFS_BUF_ISPINNED(bp)) { | 1239 | if (xfs_buf_ispinned(bp)) { |
1244 | trace_xfs_dqflush_force(dqp); | 1240 | trace_xfs_dqflush_force(dqp); |
1245 | xfs_log_force(mp, 0); | 1241 | xfs_log_force(mp, 0); |
1246 | } | 1242 | } |
1247 | 1243 | ||
1248 | if (flags & SYNC_WAIT) | 1244 | if (flags & SYNC_WAIT) |
1249 | error = xfs_bwrite(mp, bp); | 1245 | error = xfs_bwrite(bp); |
1250 | else | 1246 | else |
1251 | xfs_bdwrite(mp, bp); | 1247 | xfs_buf_delwri_queue(bp); |
1248 | |||
1249 | xfs_buf_relse(bp); | ||
1252 | 1250 | ||
1253 | trace_xfs_dqflush_done(dqp); | 1251 | trace_xfs_dqflush_done(dqp); |
1254 | 1252 | ||
@@ -1447,7 +1445,7 @@ xfs_qm_dqflock_pushbuf_wait( | |||
1447 | goto out_lock; | 1445 | goto out_lock; |
1448 | 1446 | ||
1449 | if (XFS_BUF_ISDELAYWRITE(bp)) { | 1447 | if (XFS_BUF_ISDELAYWRITE(bp)) { |
1450 | if (XFS_BUF_ISPINNED(bp)) | 1448 | if (xfs_buf_ispinned(bp)) |
1451 | xfs_log_force(mp, 0); | 1449 | xfs_log_force(mp, 0); |
1452 | xfs_buf_delwri_promote(bp); | 1450 | xfs_buf_delwri_promote(bp); |
1453 | wake_up_process(bp->b_target->bt_task); | 1451 | wake_up_process(bp->b_target->bt_task); |
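A pattern worth noting in the xfs_qm_dqflush() hunk above, because it recurs across the whole series: xfs_bwrite() loses its mount argument and, like xfs_buf_delwri_queue(), no longer consumes the buffer reference, so the caller now releases the buffer explicitly. A minimal sketch of the new convention:

    /* write the buffer out, synchronously or via the delwri queue */
    if (flags & SYNC_WAIT)
        error = xfs_bwrite(bp);         /* blocks until the IO completes */
    else
        xfs_buf_delwri_queue(bp);       /* queued, written back later */

    xfs_buf_relse(bp);                  /* caller always drops its reference */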
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e945dbfa..34b7e945dbfa 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h | |||
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 9e0e2fa3f2c8..bb3f71d236d2 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c | |||
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait( | |||
183 | * search the buffer cache can be a time consuming thing, and AIL lock is a | 183 | * search the buffer cache can be a time consuming thing, and AIL lock is a |
184 | * spinlock. | 184 | * spinlock. |
185 | */ | 185 | */ |
186 | STATIC void | 186 | STATIC bool |
187 | xfs_qm_dquot_logitem_pushbuf( | 187 | xfs_qm_dquot_logitem_pushbuf( |
188 | struct xfs_log_item *lip) | 188 | struct xfs_log_item *lip) |
189 | { | 189 | { |
190 | struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); | 190 | struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); |
191 | struct xfs_dquot *dqp = qlip->qli_dquot; | 191 | struct xfs_dquot *dqp = qlip->qli_dquot; |
192 | struct xfs_buf *bp; | 192 | struct xfs_buf *bp; |
193 | bool ret = true; | ||
193 | 194 | ||
194 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); | 195 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); |
195 | 196 | ||
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf( | |||
201 | if (completion_done(&dqp->q_flush) || | 202 | if (completion_done(&dqp->q_flush) || |
202 | !(lip->li_flags & XFS_LI_IN_AIL)) { | 203 | !(lip->li_flags & XFS_LI_IN_AIL)) { |
203 | xfs_dqunlock(dqp); | 204 | xfs_dqunlock(dqp); |
204 | return; | 205 | return true; |
205 | } | 206 | } |
206 | 207 | ||
207 | bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, | 208 | bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, |
208 | dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); | 209 | dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); |
209 | xfs_dqunlock(dqp); | 210 | xfs_dqunlock(dqp); |
210 | if (!bp) | 211 | if (!bp) |
211 | return; | 212 | return true; |
212 | if (XFS_BUF_ISDELAYWRITE(bp)) | 213 | if (XFS_BUF_ISDELAYWRITE(bp)) |
213 | xfs_buf_delwri_promote(bp); | 214 | xfs_buf_delwri_promote(bp); |
215 | if (xfs_buf_ispinned(bp)) | ||
216 | ret = false; | ||
214 | xfs_buf_relse(bp); | 217 | xfs_buf_relse(bp); |
218 | return ret; | ||
215 | } | 219 | } |
216 | 220 | ||
217 | /* | 221 | /* |
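The ->pushbuf handler now reports back to its caller: judging from the hunk above, returning false appears to signal that the backing buffer is pinned, so the AIL push code knows a log force is needed before the dquot can be written back, while true means nothing is blocking progress. A condensed sketch of the contract, where blkno and len stand in for the log item's format fields:

    bool    ret = true;

    bp = xfs_incore(mp->m_ddev_targp, blkno, len, XBF_TRYLOCK);
    if (!bp)
        return true;            /* no buffer in core: nothing to push */
    if (XFS_BUF_ISDELAYWRITE(bp))
        xfs_buf_delwri_promote(bp);     /* move it up the delwri queue */
    if (xfs_buf_ispinned(bp))
        ret = false;            /* pinned: caller must force the log */
    xfs_buf_relse(bp);
    return ret;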
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70b..5acae2ada70b 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d322e48f..da108977b21f 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
@@ -229,16 +229,16 @@ xfs_fs_nfs_commit_metadata( | |||
229 | { | 229 | { |
230 | struct xfs_inode *ip = XFS_I(inode); | 230 | struct xfs_inode *ip = XFS_I(inode); |
231 | struct xfs_mount *mp = ip->i_mount; | 231 | struct xfs_mount *mp = ip->i_mount; |
232 | int error = 0; | 232 | xfs_lsn_t lsn = 0; |
233 | 233 | ||
234 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 234 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
235 | if (xfs_ipincount(ip)) { | 235 | if (xfs_ipincount(ip)) |
236 | error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, | 236 | lsn = ip->i_itemp->ili_last_lsn; |
237 | XFS_LOG_SYNC, NULL); | ||
238 | } | ||
239 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 237 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
240 | 238 | ||
241 | return error; | 239 | if (!lsn) |
240 | return 0; | ||
241 | return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); | ||
242 | } | 242 | } |
243 | 243 | ||
244 | const struct export_operations xfs_export_operations = { | 244 | const struct export_operations xfs_export_operations = { |
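The shape of this change recurs in the fsync paths below: instead of forcing the log while holding the inode lock, the code samples the last LSN under the shared ilock and issues the (potentially blocking) log force only after dropping it. Condensed:

    xfs_lsn_t   lsn = 0;

    xfs_ilock(ip, XFS_ILOCK_SHARED);
    if (xfs_ipincount(ip))              /* changes not yet on disk? */
        lsn = ip->i_itemp->ili_last_lsn;
    xfs_iunlock(ip, XFS_ILOCK_SHARED);

    if (!lsn)
        return 0;                       /* nothing pinned: already stable */
    return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);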
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h index 3272b6ae7a35..3272b6ae7a35 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/xfs_export.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c index 7f7b42469ea7..753ed9b5c70b 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -124,6 +124,35 @@ xfs_iozero( | |||
124 | return (-status); | 124 | return (-status); |
125 | } | 125 | } |
126 | 126 | ||
127 | /* | ||
128 | * Fsync operations on directories are much simpler than on regular files, | ||
129 | * as there is no file data to flush, and thus also no need for explicit | ||
130 | * cache flush operations, and there are no non-transaction metadata updates | ||
131 | * on directories either. | ||
132 | */ | ||
133 | STATIC int | ||
134 | xfs_dir_fsync( | ||
135 | struct file *file, | ||
136 | loff_t start, | ||
137 | loff_t end, | ||
138 | int datasync) | ||
139 | { | ||
140 | struct xfs_inode *ip = XFS_I(file->f_mapping->host); | ||
141 | struct xfs_mount *mp = ip->i_mount; | ||
142 | xfs_lsn_t lsn = 0; | ||
143 | |||
144 | trace_xfs_dir_fsync(ip); | ||
145 | |||
146 | xfs_ilock(ip, XFS_ILOCK_SHARED); | ||
147 | if (xfs_ipincount(ip)) | ||
148 | lsn = ip->i_itemp->ili_last_lsn; | ||
149 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
150 | |||
151 | if (!lsn) | ||
152 | return 0; | ||
153 | return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); | ||
154 | } | ||
155 | |||
127 | STATIC int | 156 | STATIC int |
128 | xfs_file_fsync( | 157 | xfs_file_fsync( |
129 | struct file *file, | 158 | struct file *file, |
@@ -137,6 +166,7 @@ xfs_file_fsync( | |||
137 | struct xfs_trans *tp; | 166 | struct xfs_trans *tp; |
138 | int error = 0; | 167 | int error = 0; |
139 | int log_flushed = 0; | 168 | int log_flushed = 0; |
169 | xfs_lsn_t lsn = 0; | ||
140 | 170 | ||
141 | trace_xfs_file_fsync(ip); | 171 | trace_xfs_file_fsync(ip); |
142 | 172 | ||
@@ -149,10 +179,6 @@ xfs_file_fsync( | |||
149 | 179 | ||
150 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | 180 | xfs_iflags_clear(ip, XFS_ITRUNCATED); |
151 | 181 | ||
152 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
153 | xfs_ioend_wait(ip); | ||
154 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
155 | |||
156 | if (mp->m_flags & XFS_MOUNT_BARRIER) { | 182 | if (mp->m_flags & XFS_MOUNT_BARRIER) { |
157 | /* | 183 | /* |
158 | * If we have an RT and/or log subvolume we need to make sure | 184 | * If we have an RT and/or log subvolume we need to make sure |
@@ -216,11 +242,11 @@ xfs_file_fsync( | |||
216 | * transaction. So we play it safe and fire off the | 242 | * transaction. So we play it safe and fire off the |
217 | * transaction anyway. | 243 | * transaction anyway. |
218 | */ | 244 | */ |
219 | xfs_trans_ijoin(tp, ip); | 245 | xfs_trans_ijoin(tp, ip, 0); |
220 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 246 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
221 | xfs_trans_set_sync(tp); | 247 | error = xfs_trans_commit(tp, 0); |
222 | error = _xfs_trans_commit(tp, 0, &log_flushed); | ||
223 | 248 | ||
249 | lsn = ip->i_itemp->ili_last_lsn; | ||
224 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 250 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
225 | } else { | 251 | } else { |
226 | /* | 252 | /* |
@@ -231,14 +257,14 @@ xfs_file_fsync( | |||
231 | * disk yet, the inode will still be pinned. If it is, | 257 | * disk yet, the inode will still be pinned. If it is, |
232 | * force the log. | 258 | * force the log. |
233 | */ | 259 | */ |
234 | if (xfs_ipincount(ip)) { | 260 | if (xfs_ipincount(ip)) |
235 | error = _xfs_log_force_lsn(mp, | 261 | lsn = ip->i_itemp->ili_last_lsn; |
236 | ip->i_itemp->ili_last_lsn, | ||
237 | XFS_LOG_SYNC, &log_flushed); | ||
238 | } | ||
239 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 262 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
240 | } | 263 | } |
241 | 264 | ||
265 | if (!error && lsn) | ||
266 | error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); | ||
267 | |||
242 | /* | 268 | /* |
243 | * If we only have a single device, and the log force above was | 269 | * a no-op we might have to flush the data device cache here. |
244 | * a no-op we might have to flush the data device cache here. | 270 | * a no-op we might have to flush the data device cache here. |
@@ -317,7 +343,19 @@ xfs_file_aio_read( | |||
317 | if (XFS_FORCED_SHUTDOWN(mp)) | 343 | if (XFS_FORCED_SHUTDOWN(mp)) |
318 | return -EIO; | 344 | return -EIO; |
319 | 345 | ||
320 | if (unlikely(ioflags & IO_ISDIRECT)) { | 346 | /* |
347 | * Locking is a bit tricky here. If we take an exclusive lock | ||
348 | * for direct IO, we effectively serialise all new concurrent | ||
349 | * read IO to this file and block it behind IO that is currently in | ||
350 | * progress because IO in progress holds the IO lock shared. We only | ||
351 | * need to hold the lock exclusive to blow away the page cache, so | ||
352 | * only take the lock exclusively if the page cache needs invalidation. | ||
353 | * This allows the normal direct IO case of no page cache pages to | ||
354 | * proceed concurrently without serialisation. | ||
355 | */ | ||
356 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); | ||
357 | if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { | ||
358 | xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); | ||
321 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); | 359 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); |
322 | 360 | ||
323 | if (inode->i_mapping->nrpages) { | 361 | if (inode->i_mapping->nrpages) { |
@@ -330,8 +368,7 @@ xfs_file_aio_read( | |||
330 | } | 368 | } |
331 | } | 369 | } |
332 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | 370 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
333 | } else | 371 | } |
334 | xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); | ||
335 | 372 | ||
336 | trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); | 373 | trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); |
337 | 374 | ||
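Stripped of context lines, the new read-side locking above follows a take-shared, upgrade-only-if-needed, demote-back shape; rechecking nrpages after retaking the lock matters because pages can be added while no lock is held. A condensed sketch:

    xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
    if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
        /* invalidating the page cache requires exclusivity */
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
        if (inode->i_mapping->nrpages) {
            /* flush and invalidate the cached pages here */
        }
        /* demote so concurrent direct IO readers can proceed */
        xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
    }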
@@ -407,11 +444,13 @@ xfs_aio_write_isize_update( | |||
407 | */ | 444 | */ |
408 | STATIC void | 445 | STATIC void |
409 | xfs_aio_write_newsize_update( | 446 | xfs_aio_write_newsize_update( |
410 | struct xfs_inode *ip) | 447 | struct xfs_inode *ip, |
448 | xfs_fsize_t new_size) | ||
411 | { | 449 | { |
412 | if (ip->i_new_size) { | 450 | if (new_size == ip->i_new_size) { |
413 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | 451 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); |
414 | ip->i_new_size = 0; | 452 | if (new_size == ip->i_new_size) |
453 | ip->i_new_size = 0; | ||
415 | if (ip->i_d.di_size > ip->i_size) | 454 | if (ip->i_d.di_size > ip->i_size) |
416 | ip->i_d.di_size = ip->i_size; | 455 | ip->i_d.di_size = ip->i_size; |
417 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | 456 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); |
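The two-level check above is the heart of the fix: each writer remembers the i_new_size value it published and only clears the field if it is still the current one, re-testing under the ilock so a concurrent write that extended i_new_size further is not clobbered. Condensed:

    /* only the IO that defined the current i_new_size may clear it */
    if (new_size == ip->i_new_size) {
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
        if (new_size == ip->i_new_size) /* recheck under the lock */
            ip->i_new_size = 0;
        if (ip->i_d.di_size > ip->i_size)
            ip->i_d.di_size = ip->i_size;
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
    }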
@@ -462,7 +501,7 @@ xfs_file_splice_write( | |||
462 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); | 501 | ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); |
463 | 502 | ||
464 | xfs_aio_write_isize_update(inode, ppos, ret); | 503 | xfs_aio_write_isize_update(inode, ppos, ret); |
465 | xfs_aio_write_newsize_update(ip); | 504 | xfs_aio_write_newsize_update(ip, new_size); |
466 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 505 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
467 | return ret; | 506 | return ret; |
468 | } | 507 | } |
@@ -500,11 +539,9 @@ xfs_zero_last_block( | |||
500 | 539 | ||
501 | last_fsb = XFS_B_TO_FSBT(mp, isize); | 540 | last_fsb = XFS_B_TO_FSBT(mp, isize); |
502 | nimaps = 1; | 541 | nimaps = 1; |
503 | error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, | 542 | error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); |
504 | &nimaps, NULL); | 543 | if (error) |
505 | if (error) { | ||
506 | return error; | 544 | return error; |
507 | } | ||
508 | ASSERT(nimaps > 0); | 545 | ASSERT(nimaps > 0); |
509 | /* | 546 | /* |
510 | * If the block underlying isize is just a hole, then there | 547 | * If the block underlying isize is just a hole, then there |
@@ -595,8 +632,8 @@ xfs_zero_eof( | |||
595 | while (start_zero_fsb <= end_zero_fsb) { | 632 | while (start_zero_fsb <= end_zero_fsb) { |
596 | nimaps = 1; | 633 | nimaps = 1; |
597 | zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; | 634 | zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; |
598 | error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, | 635 | error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, |
599 | 0, NULL, 0, &imap, &nimaps, NULL); | 636 | &imap, &nimaps, 0); |
600 | if (error) { | 637 | if (error) { |
601 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); | 638 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); |
602 | return error; | 639 | return error; |
@@ -659,6 +696,7 @@ xfs_file_aio_write_checks( | |||
659 | struct file *file, | 696 | struct file *file, |
660 | loff_t *pos, | 697 | loff_t *pos, |
661 | size_t *count, | 698 | size_t *count, |
699 | xfs_fsize_t *new_sizep, | ||
662 | int *iolock) | 700 | int *iolock) |
663 | { | 701 | { |
664 | struct inode *inode = file->f_mapping->host; | 702 | struct inode *inode = file->f_mapping->host; |
@@ -666,6 +704,9 @@ xfs_file_aio_write_checks( | |||
666 | xfs_fsize_t new_size; | 704 | xfs_fsize_t new_size; |
667 | int error = 0; | 705 | int error = 0; |
668 | 706 | ||
707 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL); | ||
708 | *new_sizep = 0; | ||
709 | restart: | ||
669 | error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); | 710 | error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); |
670 | if (error) { | 711 | if (error) { |
671 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); | 712 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); |
@@ -673,20 +714,41 @@ xfs_file_aio_write_checks( | |||
673 | return error; | 714 | return error; |
674 | } | 715 | } |
675 | 716 | ||
676 | new_size = *pos + *count; | ||
677 | if (new_size > ip->i_size) | ||
678 | ip->i_new_size = new_size; | ||
679 | |||
680 | if (likely(!(file->f_mode & FMODE_NOCMTIME))) | 717 | if (likely(!(file->f_mode & FMODE_NOCMTIME))) |
681 | file_update_time(file); | 718 | file_update_time(file); |
682 | 719 | ||
683 | /* | 720 | /* |
684 | * If the offset is beyond the size of the file, we need to zero any | 721 | * If the offset is beyond the size of the file, we need to zero any |
685 | * blocks that fall between the existing EOF and the start of this | 722 | * blocks that fall between the existing EOF and the start of this |
686 | * write. | 723 | * write. There is no need to issue zeroing if another in-flight IO ends |
724 | * at or before this one. If zeroing is needed and we are currently | ||
725 | * holding the iolock shared, we need to update it to exclusive which | ||
726 | * involves dropping all locks and relocking to maintain correct locking | ||
727 | * order. If we do this, restart the function to ensure all checks and | ||
728 | * values are still valid. | ||
687 | */ | 729 | */ |
688 | if (*pos > ip->i_size) | 730 | if ((ip->i_new_size && *pos > ip->i_new_size) || |
731 | (!ip->i_new_size && *pos > ip->i_size)) { | ||
732 | if (*iolock == XFS_IOLOCK_SHARED) { | ||
733 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); | ||
734 | *iolock = XFS_IOLOCK_EXCL; | ||
735 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); | ||
736 | goto restart; | ||
737 | } | ||
689 | error = -xfs_zero_eof(ip, *pos, ip->i_size); | 738 | error = -xfs_zero_eof(ip, *pos, ip->i_size); |
739 | } | ||
740 | |||
741 | /* | ||
742 | * If this IO extends beyond EOF, we may need to update ip->i_new_size. | ||
743 | * We have already zeroed space beyond EOF (if necessary). Only update | ||
744 | * ip->i_new_size if this IO ends beyond any other in-flight writes. | ||
745 | */ | ||
746 | new_size = *pos + *count; | ||
747 | if (new_size > ip->i_size) { | ||
748 | if (new_size > ip->i_new_size) | ||
749 | ip->i_new_size = new_size; | ||
750 | *new_sizep = new_size; | ||
751 | } | ||
690 | 752 | ||
691 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); | 753 | xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); |
692 | if (error) | 754 | if (error) |
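The goto-restart above implements a lock upgrade under the iolock-before-ilock ordering rule: both locks are dropped, retaken with the exclusive iolock, and every check is re-run because anything could have changed in the unlocked window. A condensed sketch, where `zeroing_needed` stands in for the i_new_size/i_size comparison in the hunk:

restart:
    /* checks that must hold at IO submission run under ILOCK_EXCL */
    if (zeroing_needed && *iolock == XFS_IOLOCK_SHARED) {
        /* drop both locks, retake in order with the stronger iolock */
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
        *iolock = XFS_IOLOCK_EXCL;
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
        goto restart;           /* revalidate everything from scratch */
    }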
@@ -721,7 +783,7 @@ xfs_file_aio_write_checks( | |||
721 | * the dio layer. To avoid the problem with aio, we also need to wait for | 783 | * the dio layer. To avoid the problem with aio, we also need to wait for |
722 | * outstanding IOs to complete so that unwritten extent conversion is completed | 784 | * outstanding IOs to complete so that unwritten extent conversion is completed |
723 | * before we try to map the overlapping block. This is currently implemented by | 785 | * before we try to map the overlapping block. This is currently implemented by |
724 | * hitting it with a big hammer (i.e. xfs_ioend_wait()). | 786 | * hitting it with a big hammer (i.e. inode_dio_wait()). |
725 | * | 787 | * |
726 | * Returns with locks held indicated by @iolock and errors indicated by | 788 | * Returns with locks held indicated by @iolock and errors indicated by |
727 | * negative return values. | 789 | * negative return values. |
@@ -733,6 +795,7 @@ xfs_file_dio_aio_write( | |||
733 | unsigned long nr_segs, | 795 | unsigned long nr_segs, |
734 | loff_t pos, | 796 | loff_t pos, |
735 | size_t ocount, | 797 | size_t ocount, |
798 | xfs_fsize_t *new_size, | ||
736 | int *iolock) | 799 | int *iolock) |
737 | { | 800 | { |
738 | struct file *file = iocb->ki_filp; | 801 | struct file *file = iocb->ki_filp; |
@@ -753,18 +816,35 @@ xfs_file_dio_aio_write( | |||
753 | if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) | 816 | if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) |
754 | unaligned_io = 1; | 817 | unaligned_io = 1; |
755 | 818 | ||
756 | if (unaligned_io || mapping->nrpages || pos > ip->i_size) | 819 | /* |
820 | * We don't need to take an exclusive lock unless the page cache needs | ||
821 | * to be invalidated or unaligned IO is being executed. We don't need to | ||
822 | * consider the EOF extension case here because | ||
823 | * xfs_file_aio_write_checks() will relock the inode as necessary for | ||
824 | * EOF zeroing cases and fill out the new inode size as appropriate. | ||
825 | */ | ||
826 | if (unaligned_io || mapping->nrpages) | ||
757 | *iolock = XFS_IOLOCK_EXCL; | 827 | *iolock = XFS_IOLOCK_EXCL; |
758 | else | 828 | else |
759 | *iolock = XFS_IOLOCK_SHARED; | 829 | *iolock = XFS_IOLOCK_SHARED; |
760 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); | 830 | xfs_rw_ilock(ip, *iolock); |
761 | 831 | ||
762 | ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); | 832 | /* |
833 | * Recheck if there are cached pages that need invalidating after we got | ||
834 | * the iolock to protect against other threads adding new pages while | ||
835 | * we were waiting for the iolock. | ||
836 | */ | ||
837 | if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { | ||
838 | xfs_rw_iunlock(ip, *iolock); | ||
839 | *iolock = XFS_IOLOCK_EXCL; | ||
840 | xfs_rw_ilock(ip, *iolock); | ||
841 | } | ||
842 | |||
843 | ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); | ||
763 | if (ret) | 844 | if (ret) |
764 | return ret; | 845 | return ret; |
765 | 846 | ||
766 | if (mapping->nrpages) { | 847 | if (mapping->nrpages) { |
767 | WARN_ON(*iolock != XFS_IOLOCK_EXCL); | ||
768 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, | 848 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, |
769 | FI_REMAPF_LOCKED); | 849 | FI_REMAPF_LOCKED); |
770 | if (ret) | 850 | if (ret) |
@@ -776,7 +856,7 @@ xfs_file_dio_aio_write( | |||
776 | * otherwise demote the lock if we had to flush cached pages | 856 | * otherwise demote the lock if we had to flush cached pages |
777 | */ | 857 | */ |
778 | if (unaligned_io) | 858 | if (unaligned_io) |
779 | xfs_ioend_wait(ip); | 859 | inode_dio_wait(inode); |
780 | else if (*iolock == XFS_IOLOCK_EXCL) { | 860 | else if (*iolock == XFS_IOLOCK_EXCL) { |
781 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | 861 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
782 | *iolock = XFS_IOLOCK_SHARED; | 862 | *iolock = XFS_IOLOCK_SHARED; |
@@ -798,6 +878,7 @@ xfs_file_buffered_aio_write( | |||
798 | unsigned long nr_segs, | 878 | unsigned long nr_segs, |
799 | loff_t pos, | 879 | loff_t pos, |
800 | size_t ocount, | 880 | size_t ocount, |
881 | xfs_fsize_t *new_size, | ||
801 | int *iolock) | 882 | int *iolock) |
802 | { | 883 | { |
803 | struct file *file = iocb->ki_filp; | 884 | struct file *file = iocb->ki_filp; |
@@ -809,9 +890,9 @@ xfs_file_buffered_aio_write( | |||
809 | size_t count = ocount; | 890 | size_t count = ocount; |
810 | 891 | ||
811 | *iolock = XFS_IOLOCK_EXCL; | 892 | *iolock = XFS_IOLOCK_EXCL; |
812 | xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); | 893 | xfs_rw_ilock(ip, *iolock); |
813 | 894 | ||
814 | ret = xfs_file_aio_write_checks(file, &pos, &count, iolock); | 895 | ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); |
815 | if (ret) | 896 | if (ret) |
816 | return ret; | 897 | return ret; |
817 | 898 | ||
@@ -851,6 +932,7 @@ xfs_file_aio_write( | |||
851 | ssize_t ret; | 932 | ssize_t ret; |
852 | int iolock; | 933 | int iolock; |
853 | size_t ocount = 0; | 934 | size_t ocount = 0; |
935 | xfs_fsize_t new_size = 0; | ||
854 | 936 | ||
855 | XFS_STATS_INC(xs_write_calls); | 937 | XFS_STATS_INC(xs_write_calls); |
856 | 938 | ||
@@ -870,10 +952,10 @@ xfs_file_aio_write( | |||
870 | 952 | ||
871 | if (unlikely(file->f_flags & O_DIRECT)) | 953 | if (unlikely(file->f_flags & O_DIRECT)) |
872 | ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, | 954 | ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, |
873 | ocount, &iolock); | 955 | ocount, &new_size, &iolock); |
874 | else | 956 | else |
875 | ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, | 957 | ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, |
876 | ocount, &iolock); | 958 | ocount, &new_size, &iolock); |
877 | 959 | ||
878 | xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); | 960 | xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret); |
879 | 961 | ||
@@ -894,7 +976,7 @@ xfs_file_aio_write( | |||
894 | } | 976 | } |
895 | 977 | ||
896 | out_unlock: | 978 | out_unlock: |
897 | xfs_aio_write_newsize_update(ip); | 979 | xfs_aio_write_newsize_update(ip, new_size); |
898 | xfs_rw_iunlock(ip, iolock); | 980 | xfs_rw_iunlock(ip, iolock); |
899 | return ret; | 981 | return ret; |
900 | } | 982 | } |
@@ -1087,7 +1169,7 @@ const struct file_operations xfs_dir_file_operations = { | |||
1087 | #ifdef CONFIG_COMPAT | 1169 | #ifdef CONFIG_COMPAT |
1088 | .compat_ioctl = xfs_file_compat_ioctl, | 1170 | .compat_ioctl = xfs_file_compat_ioctl, |
1089 | #endif | 1171 | #endif |
1090 | .fsync = xfs_file_fsync, | 1172 | .fsync = xfs_dir_fsync, |
1091 | }; | 1173 | }; |
1092 | 1174 | ||
1093 | static const struct vm_operations_struct xfs_file_vm_ops = { | 1175 | static const struct vm_operations_struct xfs_file_vm_ops = { |
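Taken together, the xfs_file.c changes leave the direct IO write path with a three-step locking policy: pick the weakest safe iolock mode up front, upgrade if the page cache filled in while waiting for the lock, and serialise only sub-block-aligned IO behind inode_dio_wait(). A condensed sketch:

    /* sub-block alignment forces exclusive locking and an IO drain */
    if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
        unaligned_io = 1;

    *iolock = (unaligned_io || mapping->nrpages) ?
            XFS_IOLOCK_EXCL : XFS_IOLOCK_SHARED;
    xfs_rw_ilock(ip, *iolock);

    if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
        /* pages appeared while we slept on the lock: upgrade */
        xfs_rw_iunlock(ip, *iolock);
        *iolock = XFS_IOLOCK_EXCL;
        xfs_rw_ilock(ip, *iolock);
    }

    if (unaligned_io)
        inode_dio_wait(inode);  /* wait out overlapping direct IO */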
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3ff3d9e23ded..5170306a1009 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c | |||
@@ -682,7 +682,7 @@ xfs_filestream_new_ag( | |||
682 | ip = ap->ip; | 682 | ip = ap->ip; |
683 | mp = ip->i_mount; | 683 | mp = ip->i_mount; |
684 | cache = mp->m_filestream; | 684 | cache = mp->m_filestream; |
685 | minlen = ap->alen; | 685 | minlen = ap->length; |
686 | *agp = NULLAGNUMBER; | 686 | *agp = NULLAGNUMBER; |
687 | 687 | ||
688 | /* | 688 | /* |
@@ -761,7 +761,7 @@ xfs_filestream_new_ag( | |||
761 | */ | 761 | */ |
762 | ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; | 762 | ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; |
763 | flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | | 763 | flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | |
764 | (ap->low ? XFS_PICK_LOWSPACE : 0); | 764 | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); |
765 | 765 | ||
766 | err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); | 766 | err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); |
767 | if (err || *agp == NULLAGNUMBER) | 767 | if (err || *agp == NULLAGNUMBER) |
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed16811c..ed88ed16811c 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c | |||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 9153d2c77caf..1c6fdeb702ff 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -194,6 +194,10 @@ xfs_growfs_data_private( | |||
194 | bp = xfs_buf_get(mp->m_ddev_targp, | 194 | bp = xfs_buf_get(mp->m_ddev_targp, |
195 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), | 195 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), |
196 | XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); | 196 | XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); |
197 | if (!bp) { | ||
198 | error = ENOMEM; | ||
199 | goto error0; | ||
200 | } | ||
197 | agf = XFS_BUF_TO_AGF(bp); | 201 | agf = XFS_BUF_TO_AGF(bp); |
198 | memset(agf, 0, mp->m_sb.sb_sectsize); | 202 | memset(agf, 0, mp->m_sb.sb_sectsize); |
199 | agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); | 203 | agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); |
@@ -216,16 +220,21 @@ xfs_growfs_data_private( | |||
216 | tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); | 220 | tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); |
217 | agf->agf_freeblks = cpu_to_be32(tmpsize); | 221 | agf->agf_freeblks = cpu_to_be32(tmpsize); |
218 | agf->agf_longest = cpu_to_be32(tmpsize); | 222 | agf->agf_longest = cpu_to_be32(tmpsize); |
219 | error = xfs_bwrite(mp, bp); | 223 | error = xfs_bwrite(bp); |
220 | if (error) { | 224 | xfs_buf_relse(bp); |
225 | if (error) | ||
221 | goto error0; | 226 | goto error0; |
222 | } | 227 | |
223 | /* | 228 | /* |
224 | * AG inode header block | 229 | * AG inode header block |
225 | */ | 230 | */ |
226 | bp = xfs_buf_get(mp->m_ddev_targp, | 231 | bp = xfs_buf_get(mp->m_ddev_targp, |
227 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), | 232 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), |
228 | XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); | 233 | XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); |
234 | if (!bp) { | ||
235 | error = ENOMEM; | ||
236 | goto error0; | ||
237 | } | ||
229 | agi = XFS_BUF_TO_AGI(bp); | 238 | agi = XFS_BUF_TO_AGI(bp); |
230 | memset(agi, 0, mp->m_sb.sb_sectsize); | 239 | memset(agi, 0, mp->m_sb.sb_sectsize); |
231 | agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); | 240 | agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); |
@@ -240,10 +249,11 @@ xfs_growfs_data_private( | |||
240 | agi->agi_dirino = cpu_to_be32(NULLAGINO); | 249 | agi->agi_dirino = cpu_to_be32(NULLAGINO); |
241 | for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) | 250 | for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) |
242 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); | 251 | agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); |
243 | error = xfs_bwrite(mp, bp); | 252 | error = xfs_bwrite(bp); |
244 | if (error) { | 253 | xfs_buf_relse(bp); |
254 | if (error) | ||
245 | goto error0; | 255 | goto error0; |
246 | } | 256 | |
247 | /* | 257 | /* |
248 | * BNO btree root block | 258 | * BNO btree root block |
249 | */ | 259 | */ |
@@ -251,6 +261,10 @@ xfs_growfs_data_private( | |||
251 | XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), | 261 | XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), |
252 | BTOBB(mp->m_sb.sb_blocksize), | 262 | BTOBB(mp->m_sb.sb_blocksize), |
253 | XBF_LOCK | XBF_MAPPED); | 263 | XBF_LOCK | XBF_MAPPED); |
264 | if (!bp) { | ||
265 | error = ENOMEM; | ||
266 | goto error0; | ||
267 | } | ||
254 | block = XFS_BUF_TO_BLOCK(bp); | 268 | block = XFS_BUF_TO_BLOCK(bp); |
255 | memset(block, 0, mp->m_sb.sb_blocksize); | 269 | memset(block, 0, mp->m_sb.sb_blocksize); |
256 | block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); | 270 | block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); |
@@ -262,10 +276,11 @@ xfs_growfs_data_private( | |||
262 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); | 276 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); |
263 | arec->ar_blockcount = cpu_to_be32( | 277 | arec->ar_blockcount = cpu_to_be32( |
264 | agsize - be32_to_cpu(arec->ar_startblock)); | 278 | agsize - be32_to_cpu(arec->ar_startblock)); |
265 | error = xfs_bwrite(mp, bp); | 279 | error = xfs_bwrite(bp); |
266 | if (error) { | 280 | xfs_buf_relse(bp); |
281 | if (error) | ||
267 | goto error0; | 282 | goto error0; |
268 | } | 283 | |
269 | /* | 284 | /* |
270 | * CNT btree root block | 285 | * CNT btree root block |
271 | */ | 286 | */ |
@@ -273,6 +288,10 @@ xfs_growfs_data_private( | |||
273 | XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), | 288 | XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), |
274 | BTOBB(mp->m_sb.sb_blocksize), | 289 | BTOBB(mp->m_sb.sb_blocksize), |
275 | XBF_LOCK | XBF_MAPPED); | 290 | XBF_LOCK | XBF_MAPPED); |
291 | if (!bp) { | ||
292 | error = ENOMEM; | ||
293 | goto error0; | ||
294 | } | ||
276 | block = XFS_BUF_TO_BLOCK(bp); | 295 | block = XFS_BUF_TO_BLOCK(bp); |
277 | memset(block, 0, mp->m_sb.sb_blocksize); | 296 | memset(block, 0, mp->m_sb.sb_blocksize); |
278 | block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); | 297 | block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); |
@@ -285,10 +304,11 @@ xfs_growfs_data_private( | |||
285 | arec->ar_blockcount = cpu_to_be32( | 304 | arec->ar_blockcount = cpu_to_be32( |
286 | agsize - be32_to_cpu(arec->ar_startblock)); | 305 | agsize - be32_to_cpu(arec->ar_startblock)); |
287 | nfree += be32_to_cpu(arec->ar_blockcount); | 306 | nfree += be32_to_cpu(arec->ar_blockcount); |
288 | error = xfs_bwrite(mp, bp); | 307 | error = xfs_bwrite(bp); |
289 | if (error) { | 308 | xfs_buf_relse(bp); |
309 | if (error) | ||
290 | goto error0; | 310 | goto error0; |
291 | } | 311 | |
292 | /* | 312 | /* |
293 | * INO btree root block | 313 | * INO btree root block |
294 | */ | 314 | */ |
@@ -296,6 +316,10 @@ xfs_growfs_data_private( | |||
296 | XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), | 316 | XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), |
297 | BTOBB(mp->m_sb.sb_blocksize), | 317 | BTOBB(mp->m_sb.sb_blocksize), |
298 | XBF_LOCK | XBF_MAPPED); | 318 | XBF_LOCK | XBF_MAPPED); |
319 | if (!bp) { | ||
320 | error = ENOMEM; | ||
321 | goto error0; | ||
322 | } | ||
299 | block = XFS_BUF_TO_BLOCK(bp); | 323 | block = XFS_BUF_TO_BLOCK(bp); |
300 | memset(block, 0, mp->m_sb.sb_blocksize); | 324 | memset(block, 0, mp->m_sb.sb_blocksize); |
301 | block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); | 325 | block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); |
@@ -303,10 +327,10 @@ xfs_growfs_data_private( | |||
303 | block->bb_numrecs = 0; | 327 | block->bb_numrecs = 0; |
304 | block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); | 328 | block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); |
305 | block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); | 329 | block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); |
306 | error = xfs_bwrite(mp, bp); | 330 | error = xfs_bwrite(bp); |
307 | if (error) { | 331 | xfs_buf_relse(bp); |
332 | if (error) | ||
308 | goto error0; | 333 | goto error0; |
309 | } | ||
310 | } | 334 | } |
311 | xfs_trans_agblocks_delta(tp, nfree); | 335 | xfs_trans_agblocks_delta(tp, nfree); |
312 | /* | 336 | /* |
@@ -396,9 +420,9 @@ xfs_growfs_data_private( | |||
396 | * just issue a warning and continue. The real work is | 420 | * just issue a warning and continue. The real work is |
397 | * already done and committed. | 421 | * already done and committed. |
398 | */ | 422 | */ |
399 | if (!(error = xfs_bwrite(mp, bp))) { | 423 | error = xfs_bwrite(bp); |
400 | continue; | 424 | xfs_buf_relse(bp); |
401 | } else { | 425 | if (error) { |
402 | xfs_warn(mp, | 426 | xfs_warn(mp, |
403 | "write error %d updating secondary superblock for ag %d", | 427 | "write error %d updating secondary superblock for ag %d", |
404 | error, agno); | 428 | error, agno); |
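Each header block in the growfs loop above now follows the same defensive pattern: xfs_buf_get() can fail and must be checked, and since xfs_bwrite() no longer releases the buffer, the release is explicit and unconditional. A sketch, where daddr stands in for the per-block disk address:

    bp = xfs_buf_get(mp->m_ddev_targp, daddr, XFS_FSS_TO_BB(mp, 1),
             XBF_LOCK | XBF_MAPPED);
    if (!bp) {
        error = ENOMEM;         /* uncached buffer allocation failed */
        goto error0;
    }
    /* ... initialise the on-disk header in bp->b_addr ... */
    error = xfs_bwrite(bp);
    xfs_buf_relse(bp);          /* release whether or not the write failed */
    if (error)
        goto error0;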
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..76e81cff70b9 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/xfs_globals.c | |||
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dd5628bd8d0b..169380e66057 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
@@ -150,7 +150,7 @@ xfs_check_agi_freecount( | |||
150 | /* | 150 | /* |
151 | * Initialise a new set of inodes. | 151 | * Initialise a new set of inodes. |
152 | */ | 152 | */ |
153 | STATIC void | 153 | STATIC int |
154 | xfs_ialloc_inode_init( | 154 | xfs_ialloc_inode_init( |
155 | struct xfs_mount *mp, | 155 | struct xfs_mount *mp, |
156 | struct xfs_trans *tp, | 156 | struct xfs_trans *tp, |
@@ -202,9 +202,8 @@ xfs_ialloc_inode_init( | |||
202 | fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, | 202 | fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, |
203 | mp->m_bsize * blks_per_cluster, | 203 | mp->m_bsize * blks_per_cluster, |
204 | XBF_LOCK); | 204 | XBF_LOCK); |
205 | ASSERT(fbuf); | 205 | if (!fbuf) |
206 | ASSERT(!XFS_BUF_GETERROR(fbuf)); | 206 | return ENOMEM; |
207 | |||
208 | /* | 207 | /* |
209 | * Initialize all inodes in this buffer and then log them. | 208 | * Initialize all inodes in this buffer and then log them. |
210 | * | 209 | * |
@@ -226,6 +225,7 @@ xfs_ialloc_inode_init( | |||
226 | } | 225 | } |
227 | xfs_trans_inode_alloc_buf(tp, fbuf); | 226 | xfs_trans_inode_alloc_buf(tp, fbuf); |
228 | } | 227 | } |
228 | return 0; | ||
229 | } | 229 | } |
230 | 230 | ||
231 | /* | 231 | /* |
@@ -370,9 +370,11 @@ xfs_ialloc_ag_alloc( | |||
370 | * rather than a linear progression to prevent the next generation | 370 | * rather than a linear progression to prevent the next generation |
371 | * number from being easily guessable. | 371 | * number from being easily guessable. |
372 | */ | 372 | */ |
373 | xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, | 373 | error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, |
374 | random32()); | 374 | args.len, random32()); |
375 | 375 | ||
376 | if (error) | ||
377 | return error; | ||
376 | /* | 378 | /* |
377 | * Convert the results. | 379 | * Convert the results. |
378 | */ | 380 | */ |
@@ -1486,7 +1488,7 @@ xfs_read_agi( | |||
1486 | if (error) | 1488 | if (error) |
1487 | return error; | 1489 | return error; |
1488 | 1490 | ||
1489 | ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); | 1491 | ASSERT(!xfs_buf_geterror(*bpp)); |
1490 | agi = XFS_BUF_TO_AGI(*bpp); | 1492 | agi = XFS_BUF_TO_AGI(*bpp); |
1491 | 1493 | ||
1492 | /* | 1494 | /* |
@@ -1503,7 +1505,7 @@ xfs_read_agi( | |||
1503 | return XFS_ERROR(EFSCORRUPTED); | 1505 | return XFS_ERROR(EFSCORRUPTED); |
1504 | } | 1506 | } |
1505 | 1507 | ||
1506 | XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); | 1508 | xfs_buf_set_ref(*bpp, XFS_AGI_REF); |
1507 | 1509 | ||
1508 | xfs_check_agi_unlinked(agi); | 1510 | xfs_check_agi_unlinked(agi); |
1509 | return 0; | 1511 | return 0; |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 7759812c1bbe..0fa98b1c70ea 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
@@ -75,7 +75,6 @@ xfs_inode_alloc( | |||
75 | return NULL; | 75 | return NULL; |
76 | } | 76 | } |
77 | 77 | ||
78 | ASSERT(atomic_read(&ip->i_iocount) == 0); | ||
79 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 78 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
80 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 79 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
81 | ASSERT(completion_done(&ip->i_flush)); | 80 | ASSERT(completion_done(&ip->i_flush)); |
@@ -150,7 +149,6 @@ xfs_inode_free( | |||
150 | } | 149 | } |
151 | 150 | ||
152 | /* asserts to verify all state is correct here */ | 151 | /* asserts to verify all state is correct here */ |
153 | ASSERT(atomic_read(&ip->i_iocount) == 0); | ||
154 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 152 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
155 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | 153 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
156 | ASSERT(completion_done(&ip->i_flush)); | 154 | ASSERT(completion_done(&ip->i_flush)); |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2fcca4b03ed3..c0237c602f11 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -190,12 +190,6 @@ xfs_imap_to_bp( | |||
190 | } | 190 | } |
191 | 191 | ||
192 | xfs_inobp_check(mp, bp); | 192 | xfs_inobp_check(mp, bp); |
193 | |||
194 | /* | ||
195 | * Mark the buffer as an inode buffer now that it looks good | ||
196 | */ | ||
197 | XFS_BUF_SET_VTYPE(bp, B_FS_INO); | ||
198 | |||
199 | *bpp = bp; | 193 | *bpp = bp; |
200 | return 0; | 194 | return 0; |
201 | } | 195 | } |
@@ -1152,7 +1146,7 @@ xfs_ialloc( | |||
1152 | /* | 1146 | /* |
1153 | * Log the new values stuffed into the inode. | 1147 | * Log the new values stuffed into the inode. |
1154 | */ | 1148 | */ |
1155 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); | 1149 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
1156 | xfs_trans_log_inode(tp, ip, flags); | 1150 | xfs_trans_log_inode(tp, ip, flags); |
1157 | 1151 | ||
1158 | /* now that we have an i_mode we can setup inode ops and unlock */ | 1152 | /* now that we have an i_mode we can setup inode ops and unlock */ |
@@ -1187,6 +1181,7 @@ xfs_isize_check( | |||
1187 | xfs_fileoff_t map_first; | 1181 | xfs_fileoff_t map_first; |
1188 | int nimaps; | 1182 | int nimaps; |
1189 | xfs_bmbt_irec_t imaps[2]; | 1183 | xfs_bmbt_irec_t imaps[2]; |
1184 | int error; | ||
1190 | 1185 | ||
1191 | if (!S_ISREG(ip->i_d.di_mode)) | 1186 | if (!S_ISREG(ip->i_d.di_mode)) |
1192 | return; | 1187 | return; |
@@ -1203,13 +1198,12 @@ xfs_isize_check( | |||
1203 | * The filesystem could be shutting down, so bmapi may return | 1198 | * The filesystem could be shutting down, so bmapi may return |
1204 | * an error. | 1199 | * an error. |
1205 | */ | 1200 | */ |
1206 | if (xfs_bmapi(NULL, ip, map_first, | 1201 | error = xfs_bmapi_read(ip, map_first, |
1207 | (XFS_B_TO_FSB(mp, | 1202 | (XFS_B_TO_FSB(mp, |
1208 | (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - | 1203 | (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), |
1209 | map_first), | 1204 | imaps, &nimaps, XFS_BMAPI_ENTIRE); |
1210 | XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, | 1205 | if (error) |
1211 | NULL)) | 1206 | return; |
1212 | return; | ||
1213 | ASSERT(nimaps == 1); | 1207 | ASSERT(nimaps == 1); |
1214 | ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); | 1208 | ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); |
1215 | } | 1209 | } |
@@ -1297,7 +1291,7 @@ xfs_itruncate_extents( | |||
1297 | */ | 1291 | */ |
1298 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 1292 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
1299 | if (committed) | 1293 | if (committed) |
1300 | xfs_trans_ijoin(tp, ip); | 1294 | xfs_trans_ijoin(tp, ip, 0); |
1301 | if (error) | 1295 | if (error) |
1302 | goto out_bmap_cancel; | 1296 | goto out_bmap_cancel; |
1303 | 1297 | ||
@@ -1313,7 +1307,7 @@ xfs_itruncate_extents( | |||
1313 | error = xfs_trans_commit(tp, 0); | 1307 | error = xfs_trans_commit(tp, 0); |
1314 | tp = ntp; | 1308 | tp = ntp; |
1315 | 1309 | ||
1316 | xfs_trans_ijoin(tp, ip); | 1310 | xfs_trans_ijoin(tp, ip, 0); |
1317 | 1311 | ||
1318 | if (error) | 1312 | if (error) |
1319 | goto out; | 1313 | goto out; |
@@ -1644,7 +1638,7 @@ xfs_iunlink_remove( | |||
1644 | * inodes that are in memory - they all must be marked stale and attached to | 1638 | * inodes that are in memory - they all must be marked stale and attached to |
1645 | * the cluster buffer. | 1639 | * the cluster buffer. |
1646 | */ | 1640 | */ |
1647 | STATIC void | 1641 | STATIC int |
1648 | xfs_ifree_cluster( | 1642 | xfs_ifree_cluster( |
1649 | xfs_inode_t *free_ip, | 1643 | xfs_inode_t *free_ip, |
1650 | xfs_trans_t *tp, | 1644 | xfs_trans_t *tp, |
@@ -1690,6 +1684,8 @@ xfs_ifree_cluster( | |||
1690 | mp->m_bsize * blks_per_cluster, | 1684 | mp->m_bsize * blks_per_cluster, |
1691 | XBF_LOCK); | 1685 | XBF_LOCK); |
1692 | 1686 | ||
1687 | if (!bp) | ||
1688 | return ENOMEM; | ||
1693 | /* | 1689 | /* |
1694 | * Walk the inodes already attached to the buffer and mark them | 1690 | * Walk the inodes already attached to the buffer and mark them |
1695 | * stale. These will all have the flush locks held, so an | 1691 | * stale. These will all have the flush locks held, so an |
@@ -1799,6 +1795,7 @@ retry: | |||
1799 | } | 1795 | } |
1800 | 1796 | ||
1801 | xfs_perag_put(pag); | 1797 | xfs_perag_put(pag); |
1798 | return 0; | ||
1802 | } | 1799 | } |
1803 | 1800 | ||
1804 | /* | 1801 | /* |
@@ -1878,10 +1875,10 @@ xfs_ifree( | |||
1878 | dip->di_mode = 0; | 1875 | dip->di_mode = 0; |
1879 | 1876 | ||
1880 | if (delete) { | 1877 | if (delete) { |
1881 | xfs_ifree_cluster(ip, tp, first_ino); | 1878 | error = xfs_ifree_cluster(ip, tp, first_ino); |
1882 | } | 1879 | } |
1883 | 1880 | ||
1884 | return 0; | 1881 | return error; |
1885 | } | 1882 | } |
1886 | 1883 | ||
1887 | /* | 1884 | /* |
@@ -2472,11 +2469,11 @@ cluster_corrupt_out: | |||
2472 | */ | 2469 | */ |
2473 | if (bp->b_iodone) { | 2470 | if (bp->b_iodone) { |
2474 | XFS_BUF_UNDONE(bp); | 2471 | XFS_BUF_UNDONE(bp); |
2475 | XFS_BUF_STALE(bp); | 2472 | xfs_buf_stale(bp); |
2476 | XFS_BUF_ERROR(bp,EIO); | 2473 | xfs_buf_ioerror(bp, EIO); |
2477 | xfs_buf_ioend(bp, 0); | 2474 | xfs_buf_ioend(bp, 0); |
2478 | } else { | 2475 | } else { |
2479 | XFS_BUF_STALE(bp); | 2476 | xfs_buf_stale(bp); |
2480 | xfs_buf_relse(bp); | 2477 | xfs_buf_relse(bp); |
2481 | } | 2478 | } |
2482 | } | 2479 | } |
@@ -2585,7 +2582,7 @@ xfs_iflush( | |||
2585 | * If the buffer is pinned then push on the log now so we won't | 2582 | * If the buffer is pinned then push on the log now so we won't |
2586 | * get stuck waiting in the write for too long. | 2583 | * get stuck waiting in the write for too long. |
2587 | */ | 2584 | */ |
2588 | if (XFS_BUF_ISPINNED(bp)) | 2585 | if (xfs_buf_ispinned(bp)) |
2589 | xfs_log_force(mp, 0); | 2586 | xfs_log_force(mp, 0); |
2590 | 2587 | ||
2591 | /* | 2588 | /* |
@@ -2597,9 +2594,11 @@ xfs_iflush( | |||
2597 | goto cluster_corrupt_out; | 2594 | goto cluster_corrupt_out; |
2598 | 2595 | ||
2599 | if (flags & SYNC_WAIT) | 2596 | if (flags & SYNC_WAIT) |
2600 | error = xfs_bwrite(mp, bp); | 2597 | error = xfs_bwrite(bp); |
2601 | else | 2598 | else |
2602 | xfs_bdwrite(mp, bp); | 2599 | xfs_buf_delwri_queue(bp); |
2600 | |||
2601 | xfs_buf_relse(bp); | ||
2603 | return error; | 2602 | return error; |
2604 | 2603 | ||
2605 | corrupt_out: | 2604 | corrupt_out: |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2380a4bcbece..760140d1dd66 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -257,7 +257,6 @@ typedef struct xfs_inode { | |||
257 | 257 | ||
258 | xfs_fsize_t i_size; /* in-memory size */ | 258 | xfs_fsize_t i_size; /* in-memory size */ |
259 | xfs_fsize_t i_new_size; /* size when write completes */ | 259 | xfs_fsize_t i_new_size; /* size when write completes */ |
260 | atomic_t i_iocount; /* outstanding I/O count */ | ||
261 | 260 | ||
262 | /* VFS inode */ | 261 | /* VFS inode */ |
263 | struct inode i_vnode; /* embedded VFS inode */ | 262 | struct inode i_vnode; /* embedded VFS inode */ |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 588406dc6a35..b7cf21ba240f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -658,10 +658,8 @@ xfs_inode_item_unlock( | |||
658 | 658 | ||
659 | lock_flags = iip->ili_lock_flags; | 659 | lock_flags = iip->ili_lock_flags; |
660 | iip->ili_lock_flags = 0; | 660 | iip->ili_lock_flags = 0; |
661 | if (lock_flags) { | 661 | if (lock_flags) |
662 | xfs_iunlock(ip, lock_flags); | 662 | xfs_iunlock(ip, lock_flags); |
663 | IRELE(ip); | ||
664 | } | ||
665 | } | 663 | } |
666 | 664 | ||
667 | /* | 665 | /* |
@@ -708,13 +706,14 @@ xfs_inode_item_committed( | |||
708 | * marked delayed write. If that's the case, we'll promote it and that will | 706 | * marked delayed write. If that's the case, we'll promote it and that will |
709 | * allow the caller to write the buffer by triggering the xfsbufd to run. | 707 | * allow the caller to write the buffer by triggering the xfsbufd to run. |
710 | */ | 708 | */ |
711 | STATIC void | 709 | STATIC bool |
712 | xfs_inode_item_pushbuf( | 710 | xfs_inode_item_pushbuf( |
713 | struct xfs_log_item *lip) | 711 | struct xfs_log_item *lip) |
714 | { | 712 | { |
715 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); | 713 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); |
716 | struct xfs_inode *ip = iip->ili_inode; | 714 | struct xfs_inode *ip = iip->ili_inode; |
717 | struct xfs_buf *bp; | 715 | struct xfs_buf *bp; |
716 | bool ret = true; | ||
718 | 717 | ||
719 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); | 718 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); |
720 | 719 | ||
@@ -725,7 +724,7 @@ xfs_inode_item_pushbuf( | |||
725 | if (completion_done(&ip->i_flush) || | 724 | if (completion_done(&ip->i_flush) || |
726 | !(lip->li_flags & XFS_LI_IN_AIL)) { | 725 | !(lip->li_flags & XFS_LI_IN_AIL)) { |
727 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 726 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
728 | return; | 727 | return true; |
729 | } | 728 | } |
730 | 729 | ||
731 | bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, | 730 | bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, |
@@ -733,10 +732,13 @@ xfs_inode_item_pushbuf( | |||
733 | 732 | ||
734 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 733 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
735 | if (!bp) | 734 | if (!bp) |
736 | return; | 735 | return true; |
737 | if (XFS_BUF_ISDELAYWRITE(bp)) | 736 | if (XFS_BUF_ISDELAYWRITE(bp)) |
738 | xfs_buf_delwri_promote(bp); | 737 | xfs_buf_delwri_promote(bp); |
738 | if (xfs_buf_ispinned(bp)) | ||
739 | ret = false; | ||
739 | xfs_buf_relse(bp); | 740 | xfs_buf_relse(bp); |
741 | return ret; | ||
740 | } | 742 | } |
741 | 743 | ||
742 | /* | 744 | /* |
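xfs_inode_item_pushbuf() now tells its caller whether pushing can make progress: it returns true on every path except when the backing buffer turns out to be pinned, in which case false signals that a log force is needed before the buffer can be written. A sketch of how the AIL side would consume this; the caller below is an illustration inferred from the return values, not part of this diff:

    /* hypothetical consumer sketch, AIL push loop */
    if (!IOP_PUSHBUF(lip))          /* false: buffer pinned in the log */
        xfs_log_force(mp, 0);       /* unpin so the next push can flush */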
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f7ce7debe14c..d99a90518909 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c | |||
@@ -1069,7 +1069,7 @@ xfs_ioctl_setattr( | |||
1069 | } | 1069 | } |
1070 | } | 1070 | } |
1071 | 1071 | ||
1072 | xfs_trans_ijoin(tp, ip); | 1072 | xfs_trans_ijoin(tp, ip, 0); |
1073 | 1073 | ||
1074 | /* | 1074 | /* |
1075 | * Change file ownership. Must be the owner or privileged. | 1075 | * Change file ownership. Must be the owner or privileged. |
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d56173b34a2a..d56173b34a2a 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623bfbb85..54e623bfbb85 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c | |||
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 80f4060e8970..80f4060e8970 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h | |||
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 091d82b94c4d..9afa282aa937 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -208,22 +208,20 @@ xfs_iomap_write_direct( | |||
208 | if (error) | 208 | if (error) |
209 | goto error1; | 209 | goto error1; |
210 | 210 | ||
211 | xfs_trans_ijoin(tp, ip); | 211 | xfs_trans_ijoin(tp, ip, 0); |
212 | 212 | ||
213 | bmapi_flag = XFS_BMAPI_WRITE; | 213 | bmapi_flag = 0; |
214 | if (offset < ip->i_size || extsz) | 214 | if (offset < ip->i_size || extsz) |
215 | bmapi_flag |= XFS_BMAPI_PREALLOC; | 215 | bmapi_flag |= XFS_BMAPI_PREALLOC; |
216 | 216 | ||
217 | /* | 217 | /* |
218 | * Issue the xfs_bmapi() call to allocate the blocks. | ||
219 | * | ||
220 | * From this point onwards we overwrite the imap pointer that the | 218 | * From this point onwards we overwrite the imap pointer that the |
221 | * caller gave to us. | 219 | * caller gave to us. |
222 | */ | 220 | */ |
223 | xfs_bmap_init(&free_list, &firstfsb); | 221 | xfs_bmap_init(&free_list, &firstfsb); |
224 | nimaps = 1; | 222 | nimaps = 1; |
225 | error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, | 223 | error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, |
226 | &firstfsb, 0, imap, &nimaps, &free_list); | 224 | &firstfsb, 0, imap, &nimaps, &free_list); |
227 | if (error) | 225 | if (error) |
228 | goto error0; | 226 | goto error0; |
229 | 227 | ||
@@ -300,8 +298,8 @@ xfs_iomap_eof_want_preallocate( | |||
300 | while (count_fsb > 0) { | 298 | while (count_fsb > 0) { |
301 | imaps = nimaps; | 299 | imaps = nimaps; |
302 | firstblock = NULLFSBLOCK; | 300 | firstblock = NULLFSBLOCK; |
303 | error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, | 301 | error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, |
304 | &firstblock, 0, imap, &imaps, NULL); | 302 | 0); |
305 | if (error) | 303 | if (error) |
306 | return error; | 304 | return error; |
307 | for (n = 0; n < imaps; n++) { | 305 | for (n = 0; n < imaps; n++) { |
@@ -381,7 +379,6 @@ xfs_iomap_write_delay( | |||
381 | xfs_fileoff_t last_fsb; | 379 | xfs_fileoff_t last_fsb; |
382 | xfs_off_t aligned_offset; | 380 | xfs_off_t aligned_offset; |
383 | xfs_fileoff_t ioalign; | 381 | xfs_fileoff_t ioalign; |
384 | xfs_fsblock_t firstblock; | ||
385 | xfs_extlen_t extsz; | 382 | xfs_extlen_t extsz; |
386 | int nimaps; | 383 | int nimaps; |
387 | xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; | 384 | xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; |
@@ -425,12 +422,8 @@ retry: | |||
425 | } | 422 | } |
426 | 423 | ||
427 | nimaps = XFS_WRITE_IMAPS; | 424 | nimaps = XFS_WRITE_IMAPS; |
428 | firstblock = NULLFSBLOCK; | 425 | error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, |
429 | error = xfs_bmapi(NULL, ip, offset_fsb, | 426 | imap, &nimaps, XFS_BMAPI_ENTIRE); |
430 | (xfs_filblks_t)(last_fsb - offset_fsb), | ||
431 | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | | ||
432 | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, | ||
433 | &nimaps, NULL); | ||
434 | switch (error) { | 427 | switch (error) { |
435 | case 0: | 428 | case 0: |
436 | case ENOSPC: | 429 | case ENOSPC: |
@@ -535,7 +528,7 @@ xfs_iomap_write_allocate( | |||
535 | return XFS_ERROR(error); | 528 | return XFS_ERROR(error); |
536 | } | 529 | } |
537 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 530 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
538 | xfs_trans_ijoin(tp, ip); | 531 | xfs_trans_ijoin(tp, ip, 0); |
539 | 532 | ||
540 | xfs_bmap_init(&free_list, &first_block); | 533 | xfs_bmap_init(&free_list, &first_block); |
541 | 534 | ||
@@ -587,14 +580,12 @@ xfs_iomap_write_allocate( | |||
587 | } | 580 | } |
588 | 581 | ||
589 | /* | 582 | /* |
590 | * Go get the actual blocks. | ||
591 | * | ||
592 | * From this point onwards we overwrite the imap | 583 | * From this point onwards we overwrite the imap |
593 | * pointer that the caller gave to us. | 584 | * pointer that the caller gave to us. |
594 | */ | 585 | */ |
595 | error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, | 586 | error = xfs_bmapi_write(tp, ip, map_start_fsb, |
596 | XFS_BMAPI_WRITE, &first_block, 1, | 587 | count_fsb, 0, &first_block, 1, |
597 | imap, &nimaps, &free_list); | 588 | imap, &nimaps, &free_list); |
598 | if (error) | 589 | if (error) |
599 | goto trans_cancel; | 590 | goto trans_cancel; |
600 | 591 | ||
@@ -701,15 +692,15 @@ xfs_iomap_write_unwritten( | |||
701 | } | 692 | } |
702 | 693 | ||
703 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 694 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
704 | xfs_trans_ijoin(tp, ip); | 695 | xfs_trans_ijoin(tp, ip, 0); |
705 | 696 | ||
706 | /* | 697 | /* |
707 | * Modify the unwritten extent state of the buffer. | 698 | * Modify the unwritten extent state of the buffer. |
708 | */ | 699 | */ |
709 | xfs_bmap_init(&free_list, &firstfsb); | 700 | xfs_bmap_init(&free_list, &firstfsb); |
710 | nimaps = 1; | 701 | nimaps = 1; |
711 | error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, | 702 | error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, |
712 | XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, | 703 | XFS_BMAPI_CONVERT, &firstfsb, |
713 | 1, &imap, &nimaps, &free_list); | 704 | 1, &imap, &nimaps, &free_list); |
714 | if (error) | 705 | if (error) |
715 | goto error_on_bmapi_transaction; | 706 | goto error_on_bmapi_transaction; |
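These hunks replace the flag-multiplexed xfs_bmapi() with purpose-built variants: xfs_bmapi_read() for transaction-free lookups, xfs_bmapi_write() for allocating writes (the XFS_BMAPI_WRITE flag disappears, being implied), and xfs_bmapi_delay() for delayed allocation. The three call shapes as they appear above:

    /* lookup only: no transaction, no firstblock, no free list */
    error = xfs_bmapi_read(ip, offset_fsb, count_fsb, imap, &nimaps, 0);

    /* allocating write inside a transaction */
    xfs_bmap_init(&free_list, &firstfsb);
    error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
                            &firstfsb, 0, imap, &nimaps, &free_list);

    /* delayed allocation; XFS_BMAPI_DELAY|XFS_BMAPI_WRITE is now implied */
    error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
                            imap, &nimaps, XFS_BMAPI_ENTIRE);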
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c index b9c172b3fbbe..9ba2a07b7343 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/xfs_iops.c | |||
@@ -70,9 +70,8 @@ xfs_synchronize_times( | |||
70 | } | 70 | } |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * If the linux inode is valid, mark it dirty. | 73 | * If the linux inode is valid, mark it dirty, else mark the dirty state |
74 | * Used when committing a dirty inode into a transaction so that | 74 | * in the XFS inode to make sure we pick it up when reclaiming the inode. |
75 | * the inode will get written back by the linux code | ||
76 | */ | 75 | */ |
77 | void | 76 | void |
78 | xfs_mark_inode_dirty_sync( | 77 | xfs_mark_inode_dirty_sync( |
@@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync( | |||
82 | 81 | ||
83 | if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) | 82 | if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) |
84 | mark_inode_dirty_sync(inode); | 83 | mark_inode_dirty_sync(inode); |
84 | else { | ||
85 | barrier(); | ||
86 | ip->i_update_core = 1; | ||
87 | } | ||
85 | } | 88 | } |
86 | 89 | ||
87 | void | 90 | void |
@@ -92,6 +95,28 @@ xfs_mark_inode_dirty( | |||
92 | 95 | ||
93 | if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) | 96 | if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) |
94 | mark_inode_dirty(inode); | 97 | mark_inode_dirty(inode); |
98 | else { | ||
99 | barrier(); | ||
100 | ip->i_update_core = 1; | ||
101 | } | ||
102 | |||
103 | } | ||
104 | |||
105 | |||
106 | int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, | ||
107 | void *fs_info) | ||
108 | { | ||
109 | const struct xattr *xattr; | ||
110 | struct xfs_inode *ip = XFS_I(inode); | ||
111 | int error = 0; | ||
112 | |||
113 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
114 | error = xfs_attr_set(ip, xattr->name, xattr->value, | ||
115 | xattr->value_len, ATTR_SECURE); | ||
116 | if (error < 0) | ||
117 | break; | ||
118 | } | ||
119 | return error; | ||
95 | } | 120 | } |
96 | 121 | ||
97 | /* | 122 | /* |
@@ -100,31 +125,15 @@ xfs_mark_inode_dirty( | |||
100 | * these attrs can be journalled at inode creation time (along with the | 125 | * these attrs can be journalled at inode creation time (along with the |
101 | * inode, of course, such that log replay can't cause these to be lost). | 126 | * inode, of course, such that log replay can't cause these to be lost). |
102 | */ | 127 | */ |
128 | |||
103 | STATIC int | 129 | STATIC int |
104 | xfs_init_security( | 130 | xfs_init_security( |
105 | struct inode *inode, | 131 | struct inode *inode, |
106 | struct inode *dir, | 132 | struct inode *dir, |
107 | const struct qstr *qstr) | 133 | const struct qstr *qstr) |
108 | { | 134 | { |
109 | struct xfs_inode *ip = XFS_I(inode); | 135 | return security_inode_init_security(inode, dir, qstr, |
110 | size_t length; | 136 | &xfs_initxattrs, NULL); |
111 | void *value; | ||
112 | unsigned char *name; | ||
113 | int error; | ||
114 | |||
115 | error = security_inode_init_security(inode, dir, qstr, (char **)&name, | ||
116 | &value, &length); | ||
117 | if (error) { | ||
118 | if (error == -EOPNOTSUPP) | ||
119 | return 0; | ||
120 | return -error; | ||
121 | } | ||
122 | |||
123 | error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); | ||
124 | |||
125 | kfree(name); | ||
126 | kfree(value); | ||
127 | return error; | ||
128 | } | 137 | } |
129 | 138 | ||
130 | static void | 139 | static void |
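xfs_init_security() switches from the old security_inode_init_security() variant, which returned a single name/value pair the caller had to set and free, to the callback form: the LSM builds a name-terminated xattr array and invokes xfs_initxattrs() to store each entry with ATTR_SECURE, so the attrs can be journalled with inode creation as the comment above notes. The essential shape of the new contract, condensed from the hunks:

    int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
                       void *fs_info)
    {
        const struct xattr *xattr;
        int error = 0;

        /* array is terminated by an entry with a NULL name */
        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
            error = xfs_attr_set(XFS_I(inode), xattr->name, xattr->value,
                                 xattr->value_len, ATTR_SECURE);
            if (error < 0)
                break;
        }
        return error;
    }

    /* fs_info is unused by XFS, hence the NULL */
    return security_inode_init_security(inode, dir, qstr,
                                        &xfs_initxattrs, NULL);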
@@ -457,7 +466,7 @@ xfs_vn_getattr( | |||
457 | trace_xfs_getattr(ip); | 466 | trace_xfs_getattr(ip); |
458 | 467 | ||
459 | if (XFS_FORCED_SHUTDOWN(mp)) | 468 | if (XFS_FORCED_SHUTDOWN(mp)) |
460 | return XFS_ERROR(EIO); | 469 | return -XFS_ERROR(EIO); |
461 | 470 | ||
462 | stat->size = XFS_ISIZE(ip); | 471 | stat->size = XFS_ISIZE(ip); |
463 | stat->dev = inode->i_sb->s_dev; | 472 | stat->dev = inode->i_sb->s_dev; |
@@ -603,7 +612,7 @@ xfs_setattr_nonsize( | |||
603 | } | 612 | } |
604 | } | 613 | } |
605 | 614 | ||
606 | xfs_trans_ijoin(tp, ip); | 615 | xfs_trans_ijoin(tp, ip, 0); |
607 | 616 | ||
608 | /* | 617 | /* |
609 | * Change file ownership. Must be the owner or privileged. | 618 | * Change file ownership. Must be the owner or privileged. |
@@ -825,16 +834,16 @@ xfs_setattr_size( | |||
825 | * care about here. | 834 | * care about here. |
826 | */ | 835 | */ |
827 | if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { | 836 | if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { |
828 | error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, | 837 | error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, |
829 | XBF_ASYNC, FI_NONE); | 838 | FI_NONE); |
830 | if (error) | 839 | if (error) |
831 | goto out_unlock; | 840 | goto out_unlock; |
832 | } | 841 | } |
833 | 842 | ||
834 | /* | 843 | /* |
835 | * Wait for all I/O to complete. | 844 | * Wait for all direct I/O to complete. |
836 | */ | 845 | */ |
837 | xfs_ioend_wait(ip); | 846 | inode_dio_wait(inode); |
838 | 847 | ||
839 | error = -block_truncate_page(inode->i_mapping, iattr->ia_size, | 848 | error = -block_truncate_page(inode->i_mapping, iattr->ia_size, |
840 | xfs_get_blocks); | 849 | xfs_get_blocks); |
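With i_iocount removed from the XFS inode (see the xfs_inode.h hunk earlier), the truncate path waits on the generic VFS direct-I/O counter instead of the old xfs_ioend_wait(). The resulting ordering in xfs_setattr_size(), condensed from this hunk with surrounding steps elided:

    /* write back dirty pages between the on-disk and in-memory sizes */
    error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, FI_NONE);
    if (error)
        goto out_unlock;

    /* drain in-flight direct I/O via the generic inode->i_dio_count */
    inode_dio_wait(inode);

    /* only then zero the partial block at the new EOF */
    error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
                                 xfs_get_blocks);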
@@ -855,7 +864,7 @@ xfs_setattr_size( | |||
855 | 864 | ||
856 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 865 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
857 | 866 | ||
858 | xfs_trans_ijoin(tp, ip); | 867 | xfs_trans_ijoin(tp, ip, 0); |
859 | 868 | ||
860 | /* | 869 | /* |
861 | * Only change the c/mtime if we are changing the size or we are | 870 | * Only change the c/mtime if we are changing the size or we are |
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h index ef41c92ce66e..ef41c92ce66e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/xfs_iops.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h index d42f814e4d35..828662f70d64 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/xfs_linux.h | |||
@@ -32,13 +32,12 @@ | |||
32 | # define XFS_BIG_INUMS 0 | 32 | # define XFS_BIG_INUMS 0 |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | #include <xfs_types.h> | 35 | #include "xfs_types.h" |
36 | 36 | ||
37 | #include <kmem.h> | 37 | #include "kmem.h" |
38 | #include <mrlock.h> | 38 | #include "mrlock.h" |
39 | #include <time.h> | 39 | #include "time.h" |
40 | 40 | #include "uuid.h" | |
41 | #include <support/uuid.h> | ||
42 | 41 | ||
43 | #include <linux/semaphore.h> | 42 | #include <linux/semaphore.h> |
44 | #include <linux/mm.h> | 43 | #include <linux/mm.h> |
@@ -69,6 +68,8 @@ | |||
69 | #include <linux/ctype.h> | 68 | #include <linux/ctype.h> |
70 | #include <linux/writeback.h> | 69 | #include <linux/writeback.h> |
71 | #include <linux/capability.h> | 70 | #include <linux/capability.h> |
71 | #include <linux/kthread.h> | ||
72 | #include <linux/freezer.h> | ||
72 | #include <linux/list_sort.h> | 73 | #include <linux/list_sort.h> |
73 | 74 | ||
74 | #include <asm/page.h> | 75 | #include <asm/page.h> |
@@ -78,14 +79,14 @@ | |||
78 | #include <asm/byteorder.h> | 79 | #include <asm/byteorder.h> |
79 | #include <asm/unaligned.h> | 80 | #include <asm/unaligned.h> |
80 | 81 | ||
81 | #include <xfs_vnode.h> | 82 | #include "xfs_vnode.h" |
82 | #include <xfs_stats.h> | 83 | #include "xfs_stats.h" |
83 | #include <xfs_sysctl.h> | 84 | #include "xfs_sysctl.h" |
84 | #include <xfs_iops.h> | 85 | #include "xfs_iops.h" |
85 | #include <xfs_aops.h> | 86 | #include "xfs_aops.h" |
86 | #include <xfs_super.h> | 87 | #include "xfs_super.h" |
87 | #include <xfs_buf.h> | 88 | #include "xfs_buf.h" |
88 | #include <xfs_message.h> | 89 | #include "xfs_message.h" |
89 | 90 | ||
90 | #ifdef __BIG_ENDIAN | 91 | #ifdef __BIG_ENDIAN |
91 | #define XFS_NATIVE_HOST 1 | 92 | #define XFS_NATIVE_HOST 1 |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06ff8437ed8e..2758a6277c52 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -878,10 +878,10 @@ xlog_iodone(xfs_buf_t *bp) | |||
878 | /* | 878 | /* |
879 | * Race to shutdown the filesystem if we see an error. | 879 | * Race to shutdown the filesystem if we see an error. |
880 | */ | 880 | */ |
881 | if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, | 881 | if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, |
882 | XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { | 882 | XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { |
883 | xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); | 883 | xfs_buf_ioerror_alert(bp, __func__); |
884 | XFS_BUF_STALE(bp); | 884 | xfs_buf_stale(bp); |
885 | xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); | 885 | xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); |
886 | /* | 886 | /* |
887 | * This flag will be propagated to the trans-committed | 887 | * This flag will be propagated to the trans-committed |
@@ -1047,11 +1047,10 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1047 | xlog_get_iclog_buffer_size(mp, log); | 1047 | xlog_get_iclog_buffer_size(mp, log); |
1048 | 1048 | ||
1049 | error = ENOMEM; | 1049 | error = ENOMEM; |
1050 | bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); | 1050 | bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); |
1051 | if (!bp) | 1051 | if (!bp) |
1052 | goto out_free_log; | 1052 | goto out_free_log; |
1053 | bp->b_iodone = xlog_iodone; | 1053 | bp->b_iodone = xlog_iodone; |
1054 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
1055 | ASSERT(xfs_buf_islocked(bp)); | 1054 | ASSERT(xfs_buf_islocked(bp)); |
1056 | log->l_xbuf = bp; | 1055 | log->l_xbuf = bp; |
1057 | 1056 | ||
@@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1108 | iclog->ic_callback_tail = &(iclog->ic_callback); | 1107 | iclog->ic_callback_tail = &(iclog->ic_callback); |
1109 | iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; | 1108 | iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; |
1110 | 1109 | ||
1111 | ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); | ||
1112 | ASSERT(xfs_buf_islocked(iclog->ic_bp)); | 1110 | ASSERT(xfs_buf_islocked(iclog->ic_bp)); |
1113 | init_waitqueue_head(&iclog->ic_force_wait); | 1111 | init_waitqueue_head(&iclog->ic_force_wait); |
1114 | init_waitqueue_head(&iclog->ic_write_wait); | 1112 | init_waitqueue_head(&iclog->ic_write_wait); |
@@ -1248,8 +1246,8 @@ xlog_bdstrat( | |||
1248 | struct xlog_in_core *iclog = bp->b_fspriv; | 1246 | struct xlog_in_core *iclog = bp->b_fspriv; |
1249 | 1247 | ||
1250 | if (iclog->ic_state & XLOG_STATE_IOERROR) { | 1248 | if (iclog->ic_state & XLOG_STATE_IOERROR) { |
1251 | XFS_BUF_ERROR(bp, EIO); | 1249 | xfs_buf_ioerror(bp, EIO); |
1252 | XFS_BUF_STALE(bp); | 1250 | xfs_buf_stale(bp); |
1253 | xfs_buf_ioend(bp, 0); | 1251 | xfs_buf_ioend(bp, 0); |
1254 | /* | 1252 | /* |
1255 | * It would seem logical to return EIO here, but we rely on | 1253 | * It would seem logical to return EIO here, but we rely on |
@@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log, | |||
1355 | XFS_BUF_SET_COUNT(bp, count); | 1353 | XFS_BUF_SET_COUNT(bp, count); |
1356 | bp->b_fspriv = iclog; | 1354 | bp->b_fspriv = iclog; |
1357 | XFS_BUF_ZEROFLAGS(bp); | 1355 | XFS_BUF_ZEROFLAGS(bp); |
1358 | XFS_BUF_BUSY(bp); | ||
1359 | XFS_BUF_ASYNC(bp); | 1356 | XFS_BUF_ASYNC(bp); |
1360 | bp->b_flags |= XBF_SYNCIO; | 1357 | bp->b_flags |= XBF_SYNCIO; |
1361 | 1358 | ||
@@ -1390,24 +1387,23 @@ xlog_sync(xlog_t *log, | |||
1390 | */ | 1387 | */ |
1391 | XFS_BUF_WRITE(bp); | 1388 | XFS_BUF_WRITE(bp); |
1392 | 1389 | ||
1393 | if ((error = xlog_bdstrat(bp))) { | 1390 | error = xlog_bdstrat(bp); |
1394 | xfs_ioerror_alert("xlog_sync", log->l_mp, bp, | 1391 | if (error) { |
1395 | XFS_BUF_ADDR(bp)); | 1392 | xfs_buf_ioerror_alert(bp, "xlog_sync"); |
1396 | return error; | 1393 | return error; |
1397 | } | 1394 | } |
1398 | if (split) { | 1395 | if (split) { |
1399 | bp = iclog->ic_log->l_xbuf; | 1396 | bp = iclog->ic_log->l_xbuf; |
1400 | XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ | 1397 | XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ |
1401 | XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ | 1398 | xfs_buf_associate_memory(bp, |
1402 | (__psint_t)count), split); | 1399 | (char *)&iclog->ic_header + count, split); |
1403 | bp->b_fspriv = iclog; | 1400 | bp->b_fspriv = iclog; |
1404 | XFS_BUF_ZEROFLAGS(bp); | 1401 | XFS_BUF_ZEROFLAGS(bp); |
1405 | XFS_BUF_BUSY(bp); | ||
1406 | XFS_BUF_ASYNC(bp); | 1402 | XFS_BUF_ASYNC(bp); |
1407 | bp->b_flags |= XBF_SYNCIO; | 1403 | bp->b_flags |= XBF_SYNCIO; |
1408 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) | 1404 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) |
1409 | bp->b_flags |= XBF_FUA; | 1405 | bp->b_flags |= XBF_FUA; |
1410 | dptr = XFS_BUF_PTR(bp); | 1406 | dptr = bp->b_addr; |
1411 | /* | 1407 | /* |
1412 | * Bump the cycle numbers at the start of each block | 1408 | * Bump the cycle numbers at the start of each block |
1413 | * since this part of the buffer is at the start of | 1409 | * since this part of the buffer is at the start of |
@@ -1427,9 +1423,9 @@ xlog_sync(xlog_t *log, | |||
1427 | /* account for internal log which doesn't start at block #0 */ | 1423 | /* account for internal log which doesn't start at block #0 */ |
1428 | XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); | 1424 | XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); |
1429 | XFS_BUF_WRITE(bp); | 1425 | XFS_BUF_WRITE(bp); |
1430 | if ((error = xlog_bdstrat(bp))) { | 1426 | error = xlog_bdstrat(bp); |
1431 | xfs_ioerror_alert("xlog_sync (split)", log->l_mp, | 1427 | if (error) { |
1432 | bp, XFS_BUF_ADDR(bp)); | 1428 | xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); |
1433 | return error; | 1429 | return error; |
1434 | } | 1430 | } |
1435 | } | 1431 | } |
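In the xlog_alloc_log() hunk, xfs_buf_get_empty() gives way to xfs_buf_alloc(), which takes the buffer target, block address, size and flags in one call; the log allocates its iclog buffers up front and fills in the disk address per write. A sketch of the allocation from that hunk; the meaning of each argument is inferred from this one call site, so treat the annotations as assumptions:

    error = ENOMEM;
    bp = xfs_buf_alloc(mp->m_logdev_targp,  /* target: the log device      */
                       0,                   /* daddr, set later per iclog  */
                       log->l_iclog_size,   /* buffer size                 */
                       0);                  /* flags                       */
    if (!bp)
        goto out_free_log;
    bp->b_iodone = xlog_iodone;
    ASSERT(xfs_buf_islocked(bp));   /* the XFS_BUF_ISBUSY assert is gone */
    log->l_xbuf = bp;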
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 052a2c0ec5fb..541a508adea1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -147,7 +147,7 @@ xlog_align( | |||
147 | xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); | 147 | xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); |
148 | 148 | ||
149 | ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); | 149 | ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); |
150 | return XFS_BUF_PTR(bp) + BBTOB(offset); | 150 | return bp->b_addr + BBTOB(offset); |
151 | } | 151 | } |
152 | 152 | ||
153 | 153 | ||
@@ -178,15 +178,12 @@ xlog_bread_noalign( | |||
178 | 178 | ||
179 | XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); | 179 | XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); |
180 | XFS_BUF_READ(bp); | 180 | XFS_BUF_READ(bp); |
181 | XFS_BUF_BUSY(bp); | ||
182 | XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); | 181 | XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); |
183 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); | ||
184 | 182 | ||
185 | xfsbdstrat(log->l_mp, bp); | 183 | xfsbdstrat(log->l_mp, bp); |
186 | error = xfs_buf_iowait(bp); | 184 | error = xfs_buf_iowait(bp); |
187 | if (error) | 185 | if (error) |
188 | xfs_ioerror_alert("xlog_bread", log->l_mp, | 186 | xfs_buf_ioerror_alert(bp, __func__); |
189 | bp, XFS_BUF_ADDR(bp)); | ||
190 | return error; | 187 | return error; |
191 | } | 188 | } |
192 | 189 | ||
@@ -220,18 +217,18 @@ xlog_bread_offset( | |||
220 | xfs_buf_t *bp, | 217 | xfs_buf_t *bp, |
221 | xfs_caddr_t offset) | 218 | xfs_caddr_t offset) |
222 | { | 219 | { |
223 | xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); | 220 | xfs_caddr_t orig_offset = bp->b_addr; |
224 | int orig_len = bp->b_buffer_length; | 221 | int orig_len = bp->b_buffer_length; |
225 | int error, error2; | 222 | int error, error2; |
226 | 223 | ||
227 | error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); | 224 | error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); |
228 | if (error) | 225 | if (error) |
229 | return error; | 226 | return error; |
230 | 227 | ||
231 | error = xlog_bread_noalign(log, blk_no, nbblks, bp); | 228 | error = xlog_bread_noalign(log, blk_no, nbblks, bp); |
232 | 229 | ||
233 | /* must reset buffer pointer even on error */ | 230 | /* must reset buffer pointer even on error */ |
234 | error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); | 231 | error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); |
235 | if (error) | 232 | if (error) |
236 | return error; | 233 | return error; |
237 | return error2; | 234 | return error2; |
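XFS_BUF_SET_PTR() becomes the plain function xfs_buf_associate_memory(bp, addr, len), which repoints a buffer at caller-supplied memory. xlog_bread_offset() uses it to read log blocks into the interior of an existing buffer, and must restore the original mapping even when the read fails. The save/redirect/restore pattern, reassembled from the hunk above:

    xfs_caddr_t orig_offset = bp->b_addr;           /* save current mapping */
    int         orig_len    = bp->b_buffer_length;
    int         error, error2;

    error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
    if (error)
        return error;

    error = xlog_bread_noalign(log, blk_no, nbblks, bp);

    /* must reset buffer pointer even on error */
    error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
    if (error)
        return error;                               /* read error wins */
    return error2;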
@@ -266,15 +263,14 @@ xlog_bwrite( | |||
266 | 263 | ||
267 | XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); | 264 | XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); |
268 | XFS_BUF_ZEROFLAGS(bp); | 265 | XFS_BUF_ZEROFLAGS(bp); |
269 | XFS_BUF_BUSY(bp); | 266 | xfs_buf_hold(bp); |
270 | XFS_BUF_HOLD(bp); | ||
271 | xfs_buf_lock(bp); | 267 | xfs_buf_lock(bp); |
272 | XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); | 268 | XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); |
273 | XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); | ||
274 | 269 | ||
275 | if ((error = xfs_bwrite(log->l_mp, bp))) | 270 | error = xfs_bwrite(bp); |
276 | xfs_ioerror_alert("xlog_bwrite", log->l_mp, | 271 | if (error) |
277 | bp, XFS_BUF_ADDR(bp)); | 272 | xfs_buf_ioerror_alert(bp, __func__); |
273 | xfs_buf_relse(bp); | ||
278 | return error; | 274 | return error; |
279 | } | 275 | } |
280 | 276 | ||
@@ -360,14 +356,12 @@ STATIC void | |||
360 | xlog_recover_iodone( | 356 | xlog_recover_iodone( |
361 | struct xfs_buf *bp) | 357 | struct xfs_buf *bp) |
362 | { | 358 | { |
363 | if (XFS_BUF_GETERROR(bp)) { | 359 | if (bp->b_error) { |
364 | /* | 360 | /* |
365 | * We're not going to bother about retrying | 361 | * We're not going to bother about retrying |
366 | * this during recovery. One strike! | 362 | * this during recovery. One strike! |
367 | */ | 363 | */ |
368 | xfs_ioerror_alert("xlog_recover_iodone", | 364 | xfs_buf_ioerror_alert(bp, __func__); |
369 | bp->b_target->bt_mount, bp, | ||
370 | XFS_BUF_ADDR(bp)); | ||
371 | xfs_force_shutdown(bp->b_target->bt_mount, | 365 | xfs_force_shutdown(bp->b_target->bt_mount, |
372 | SHUTDOWN_META_IO_ERROR); | 366 | SHUTDOWN_META_IO_ERROR); |
373 | } | 367 | } |
@@ -1262,7 +1256,7 @@ xlog_write_log_records( | |||
1262 | */ | 1256 | */ |
1263 | ealign = round_down(end_block, sectbb); | 1257 | ealign = round_down(end_block, sectbb); |
1264 | if (j == 0 && (start_block + endcount > ealign)) { | 1258 | if (j == 0 && (start_block + endcount > ealign)) { |
1265 | offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); | 1259 | offset = bp->b_addr + BBTOB(ealign - start_block); |
1266 | error = xlog_bread_offset(log, ealign, sectbb, | 1260 | error = xlog_bread_offset(log, ealign, sectbb, |
1267 | bp, offset); | 1261 | bp, offset); |
1268 | if (error) | 1262 | if (error) |
@@ -2135,15 +2129,15 @@ xlog_recover_buffer_pass2( | |||
2135 | 2129 | ||
2136 | bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, | 2130 | bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, |
2137 | buf_flags); | 2131 | buf_flags); |
2138 | if (XFS_BUF_ISERROR(bp)) { | 2132 | if (!bp) |
2139 | xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, | 2133 | return XFS_ERROR(ENOMEM); |
2140 | bp, buf_f->blf_blkno); | 2134 | error = bp->b_error; |
2141 | error = XFS_BUF_GETERROR(bp); | 2135 | if (error) { |
2136 | xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); | ||
2142 | xfs_buf_relse(bp); | 2137 | xfs_buf_relse(bp); |
2143 | return error; | 2138 | return error; |
2144 | } | 2139 | } |
2145 | 2140 | ||
2146 | error = 0; | ||
2147 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { | 2141 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { |
2148 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); | 2142 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); |
2149 | } else if (buf_f->blf_flags & | 2143 | } else if (buf_f->blf_flags & |
@@ -2174,15 +2168,16 @@ xlog_recover_buffer_pass2( | |||
2174 | be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && | 2168 | be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && |
2175 | (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, | 2169 | (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, |
2176 | (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { | 2170 | (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { |
2177 | XFS_BUF_STALE(bp); | 2171 | xfs_buf_stale(bp); |
2178 | error = xfs_bwrite(mp, bp); | 2172 | error = xfs_bwrite(bp); |
2179 | } else { | 2173 | } else { |
2180 | ASSERT(bp->b_target->bt_mount == mp); | 2174 | ASSERT(bp->b_target->bt_mount == mp); |
2181 | bp->b_iodone = xlog_recover_iodone; | 2175 | bp->b_iodone = xlog_recover_iodone; |
2182 | xfs_bdwrite(mp, bp); | 2176 | xfs_buf_delwri_queue(bp); |
2183 | } | 2177 | } |
2184 | 2178 | ||
2185 | return (error); | 2179 | xfs_buf_relse(bp); |
2180 | return error; | ||
2186 | } | 2181 | } |
2187 | 2182 | ||
2188 | STATIC int | 2183 | STATIC int |
@@ -2227,14 +2222,16 @@ xlog_recover_inode_pass2( | |||
2227 | 2222 | ||
2228 | bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, | 2223 | bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, |
2229 | XBF_LOCK); | 2224 | XBF_LOCK); |
2230 | if (XFS_BUF_ISERROR(bp)) { | 2225 | if (!bp) { |
2231 | xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, | 2226 | error = ENOMEM; |
2232 | bp, in_f->ilf_blkno); | 2227 | goto error; |
2233 | error = XFS_BUF_GETERROR(bp); | 2228 | } |
2229 | error = bp->b_error; | ||
2230 | if (error) { | ||
2231 | xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); | ||
2234 | xfs_buf_relse(bp); | 2232 | xfs_buf_relse(bp); |
2235 | goto error; | 2233 | goto error; |
2236 | } | 2234 | } |
2237 | error = 0; | ||
2238 | ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); | 2235 | ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); |
2239 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); | 2236 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); |
2240 | 2237 | ||
@@ -2439,7 +2436,8 @@ xlog_recover_inode_pass2( | |||
2439 | write_inode_buffer: | 2436 | write_inode_buffer: |
2440 | ASSERT(bp->b_target->bt_mount == mp); | 2437 | ASSERT(bp->b_target->bt_mount == mp); |
2441 | bp->b_iodone = xlog_recover_iodone; | 2438 | bp->b_iodone = xlog_recover_iodone; |
2442 | xfs_bdwrite(mp, bp); | 2439 | xfs_buf_delwri_queue(bp); |
2440 | xfs_buf_relse(bp); | ||
2443 | error: | 2441 | error: |
2444 | if (need_free) | 2442 | if (need_free) |
2445 | kmem_free(in_f); | 2443 | kmem_free(in_f); |
@@ -2537,8 +2535,7 @@ xlog_recover_dquot_pass2( | |||
2537 | XFS_FSB_TO_BB(mp, dq_f->qlf_len), | 2535 | XFS_FSB_TO_BB(mp, dq_f->qlf_len), |
2538 | 0, &bp); | 2536 | 0, &bp); |
2539 | if (error) { | 2537 | if (error) { |
2540 | xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, | 2538 | xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); |
2541 | bp, dq_f->qlf_blkno); | ||
2542 | return error; | 2539 | return error; |
2543 | } | 2540 | } |
2544 | ASSERT(bp); | 2541 | ASSERT(bp); |
@@ -2561,7 +2558,8 @@ xlog_recover_dquot_pass2( | |||
2561 | ASSERT(dq_f->qlf_size == 2); | 2558 | ASSERT(dq_f->qlf_size == 2); |
2562 | ASSERT(bp->b_target->bt_mount == mp); | 2559 | ASSERT(bp->b_target->bt_mount == mp); |
2563 | bp->b_iodone = xlog_recover_iodone; | 2560 | bp->b_iodone = xlog_recover_iodone; |
2564 | xfs_bdwrite(mp, bp); | 2561 | xfs_buf_delwri_queue(bp); |
2562 | xfs_buf_relse(bp); | ||
2565 | 2563 | ||
2566 | return (0); | 2564 | return (0); |
2567 | } | 2565 | } |
@@ -3437,7 +3435,7 @@ xlog_do_recovery_pass( | |||
3437 | /* | 3435 | /* |
3438 | * Check for header wrapping around physical end-of-log | 3436 | * Check for header wrapping around physical end-of-log |
3439 | */ | 3437 | */ |
3440 | offset = XFS_BUF_PTR(hbp); | 3438 | offset = hbp->b_addr; |
3441 | split_hblks = 0; | 3439 | split_hblks = 0; |
3442 | wrapped_hblks = 0; | 3440 | wrapped_hblks = 0; |
3443 | if (blk_no + hblks <= log->l_logBBsize) { | 3441 | if (blk_no + hblks <= log->l_logBBsize) { |
@@ -3497,7 +3495,7 @@ xlog_do_recovery_pass( | |||
3497 | } else { | 3495 | } else { |
3498 | /* This log record is split across the | 3496 | /* This log record is split across the |
3499 | * physical end of log */ | 3497 | * physical end of log */ |
3500 | offset = XFS_BUF_PTR(dbp); | 3498 | offset = dbp->b_addr; |
3501 | split_bblks = 0; | 3499 | split_bblks = 0; |
3502 | if (blk_no != log->l_logBBsize) { | 3500 | if (blk_no != log->l_logBBsize) { |
3503 | /* some data is before the physical | 3501 | /* some data is before the physical |
@@ -3656,7 +3654,7 @@ xlog_do_recover( | |||
3656 | return error; | 3654 | return error; |
3657 | } | 3655 | } |
3658 | 3656 | ||
3659 | XFS_bflush(log->l_mp->m_ddev_targp); | 3657 | xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); |
3660 | 3658 | ||
3661 | /* | 3659 | /* |
3662 | * If IO errors happened during recovery, bail out. | 3660 | * If IO errors happened during recovery, bail out. |
@@ -3689,8 +3687,7 @@ xlog_do_recover( | |||
3689 | xfsbdstrat(log->l_mp, bp); | 3687 | xfsbdstrat(log->l_mp, bp); |
3690 | error = xfs_buf_iowait(bp); | 3688 | error = xfs_buf_iowait(bp); |
3691 | if (error) { | 3689 | if (error) { |
3692 | xfs_ioerror_alert("xlog_do_recover", | 3690 | xfs_buf_ioerror_alert(bp, __func__); |
3693 | log->l_mp, bp, XFS_BUF_ADDR(bp)); | ||
3694 | ASSERT(0); | 3691 | ASSERT(0); |
3695 | xfs_buf_relse(bp); | 3692 | xfs_buf_relse(bp); |
3696 | return error; | 3693 | return error; |
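Both recovery read sites above stop using XFS_BUF_ISERROR()/XFS_BUF_GETERROR(): xfs_buf_read() may now return NULL outright (mapped to ENOMEM), and a returned buffer reports I/O failure through its b_error field. The two-step check that the buffer and inode pass-2 hunks converge on:

    bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, flags);
    if (!bp)
        return XFS_ERROR(ENOMEM);       /* no buffer at all */
    error = bp->b_error;
    if (error) {                        /* buffer exists, read failed */
        xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
        xfs_buf_relse(bp);
        return error;
    }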
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c index bd672def95ac..bd672def95ac 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/xfs_message.c | |||
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/xfs_message.h index 7fb7ea007672..7fb7ea007672 100644 --- a/fs/xfs/linux-2.6/xfs_message.h +++ b/fs/xfs/xfs_message.h | |||
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 092e16ae4d9d..d06afbc3540d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
@@ -44,9 +44,6 @@ | |||
44 | #include "xfs_trace.h" | 44 | #include "xfs_trace.h" |
45 | 45 | ||
46 | 46 | ||
47 | STATIC void xfs_unmountfs_wait(xfs_mount_t *); | ||
48 | |||
49 | |||
50 | #ifdef HAVE_PERCPU_SB | 47 | #ifdef HAVE_PERCPU_SB |
51 | STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, | 48 | STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, |
52 | int); | 49 | int); |
@@ -1484,7 +1481,7 @@ xfs_unmountfs( | |||
1484 | * state as much as possible. | 1481 | * state as much as possible. |
1485 | */ | 1482 | */ |
1486 | xfs_reclaim_inodes(mp, 0); | 1483 | xfs_reclaim_inodes(mp, 0); |
1487 | XFS_bflush(mp->m_ddev_targp); | 1484 | xfs_flush_buftarg(mp->m_ddev_targp, 1); |
1488 | xfs_reclaim_inodes(mp, SYNC_WAIT); | 1485 | xfs_reclaim_inodes(mp, SYNC_WAIT); |
1489 | 1486 | ||
1490 | xfs_qm_unmount(mp); | 1487 | xfs_qm_unmount(mp); |
@@ -1496,11 +1493,6 @@ xfs_unmountfs( | |||
1496 | */ | 1493 | */ |
1497 | xfs_log_force(mp, XFS_LOG_SYNC); | 1494 | xfs_log_force(mp, XFS_LOG_SYNC); |
1498 | 1495 | ||
1499 | xfs_binval(mp->m_ddev_targp); | ||
1500 | if (mp->m_rtdev_targp) { | ||
1501 | xfs_binval(mp->m_rtdev_targp); | ||
1502 | } | ||
1503 | |||
1504 | /* | 1496 | /* |
1505 | * Unreserve any blocks we have so that when we unmount we don't account | 1497 | * Unreserve any blocks we have so that when we unmount we don't account |
1506 | * the reserved free space as used. This is really only necessary for | 1498 | * the reserved free space as used. This is really only necessary for |
@@ -1526,7 +1518,16 @@ xfs_unmountfs( | |||
1526 | xfs_warn(mp, "Unable to update superblock counters. " | 1518 | xfs_warn(mp, "Unable to update superblock counters. " |
1527 | "Freespace may not be correct on next mount."); | 1519 | "Freespace may not be correct on next mount."); |
1528 | xfs_unmountfs_writesb(mp); | 1520 | xfs_unmountfs_writesb(mp); |
1529 | xfs_unmountfs_wait(mp); /* wait for async bufs */ | 1521 | |
1522 | /* | ||
1523 | * Make sure all buffers have been flushed and completed before | ||
1524 | * unmounting the log. | ||
1525 | */ | ||
1526 | error = xfs_flush_buftarg(mp->m_ddev_targp, 1); | ||
1527 | if (error) | ||
1528 | xfs_warn(mp, "%d busy buffers during unmount.", error); | ||
1529 | xfs_wait_buftarg(mp->m_ddev_targp); | ||
1530 | |||
1530 | xfs_log_unmount_write(mp); | 1531 | xfs_log_unmount_write(mp); |
1531 | xfs_log_unmount(mp); | 1532 | xfs_log_unmount(mp); |
1532 | xfs_uuid_unmount(mp); | 1533 | xfs_uuid_unmount(mp); |
@@ -1537,16 +1538,6 @@ xfs_unmountfs( | |||
1537 | xfs_free_perag(mp); | 1538 | xfs_free_perag(mp); |
1538 | } | 1539 | } |
1539 | 1540 | ||
1540 | STATIC void | ||
1541 | xfs_unmountfs_wait(xfs_mount_t *mp) | ||
1542 | { | ||
1543 | if (mp->m_logdev_targp != mp->m_ddev_targp) | ||
1544 | xfs_wait_buftarg(mp->m_logdev_targp); | ||
1545 | if (mp->m_rtdev_targp) | ||
1546 | xfs_wait_buftarg(mp->m_rtdev_targp); | ||
1547 | xfs_wait_buftarg(mp->m_ddev_targp); | ||
1548 | } | ||
1549 | |||
1550 | int | 1541 | int |
1551 | xfs_fs_writable(xfs_mount_t *mp) | 1542 | xfs_fs_writable(xfs_mount_t *mp) |
1552 | { | 1543 | { |
@@ -1612,15 +1603,14 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) | |||
1612 | 1603 | ||
1613 | XFS_BUF_UNDONE(sbp); | 1604 | XFS_BUF_UNDONE(sbp); |
1614 | XFS_BUF_UNREAD(sbp); | 1605 | XFS_BUF_UNREAD(sbp); |
1615 | XFS_BUF_UNDELAYWRITE(sbp); | 1606 | xfs_buf_delwri_dequeue(sbp); |
1616 | XFS_BUF_WRITE(sbp); | 1607 | XFS_BUF_WRITE(sbp); |
1617 | XFS_BUF_UNASYNC(sbp); | 1608 | XFS_BUF_UNASYNC(sbp); |
1618 | ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); | 1609 | ASSERT(sbp->b_target == mp->m_ddev_targp); |
1619 | xfsbdstrat(mp, sbp); | 1610 | xfsbdstrat(mp, sbp); |
1620 | error = xfs_buf_iowait(sbp); | 1611 | error = xfs_buf_iowait(sbp); |
1621 | if (error) | 1612 | if (error) |
1622 | xfs_ioerror_alert("xfs_unmountfs_writesb", | 1613 | xfs_buf_ioerror_alert(sbp, __func__); |
1623 | mp, sbp, XFS_BUF_ADDR(sbp)); | ||
1624 | xfs_buf_relse(sbp); | 1614 | xfs_buf_relse(sbp); |
1625 | } | 1615 | } |
1626 | return error; | 1616 | return error; |
@@ -1938,7 +1928,7 @@ xfs_getsb( | |||
1938 | xfs_buf_lock(bp); | 1928 | xfs_buf_lock(bp); |
1939 | } | 1929 | } |
1940 | 1930 | ||
1941 | XFS_BUF_HOLD(bp); | 1931 | xfs_buf_hold(bp); |
1942 | ASSERT(XFS_BUF_ISDONE(bp)); | 1932 | ASSERT(XFS_BUF_ISDONE(bp)); |
1943 | return bp; | 1933 | return bp; |
1944 | } | 1934 | } |
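xfs_unmountfs_wait() and the xfs_binval() calls are folded into an explicit flush-then-wait sequence on the data device, with any still-busy buffers reported before the unmount record is written. The ordering these hunks establish, condensed with unrelated unmount steps elided:

    xfs_reclaim_inodes(mp, 0);
    xfs_flush_buftarg(mp->m_ddev_targp, 1);   /* push delwri metadata */
    xfs_reclaim_inodes(mp, SYNC_WAIT);
    /* ... quota teardown, log force, superblock write ... */
    error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
    if (error)
        xfs_warn(mp, "%d busy buffers during unmount.", error);
    xfs_wait_buftarg(mp->m_ddev_targp);       /* drain outstanding I/O */
    xfs_log_unmount_write(mp);                /* now the log can go quiet */
    xfs_log_unmount(mp);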
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c index 46e54ad9a2dc..5cff443f6cdb 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
@@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts( | |||
1240 | do_div(j, sizeof(xfs_dqblk_t)); | 1240 | do_div(j, sizeof(xfs_dqblk_t)); |
1241 | ASSERT(mp->m_quotainfo->qi_dqperchunk == j); | 1241 | ASSERT(mp->m_quotainfo->qi_dqperchunk == j); |
1242 | #endif | 1242 | #endif |
1243 | ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); | 1243 | ddq = bp->b_addr; |
1244 | for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { | 1244 | for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { |
1245 | /* | 1245 | /* |
1246 | * Do a sanity check, and if needed, repair the dqblk. Don't | 1246 | * Do a sanity check, and if needed, repair the dqblk. Don't |
@@ -1296,7 +1296,8 @@ xfs_qm_dqiter_bufs( | |||
1296 | break; | 1296 | break; |
1297 | 1297 | ||
1298 | xfs_qm_reset_dqcounts(mp, bp, firstid, type); | 1298 | xfs_qm_reset_dqcounts(mp, bp, firstid, type); |
1299 | xfs_bdwrite(mp, bp); | 1299 | xfs_buf_delwri_queue(bp); |
1300 | xfs_buf_relse(bp); | ||
1300 | /* | 1301 | /* |
1301 | * goto the next block. | 1302 | * goto the next block. |
1302 | */ | 1303 | */ |
@@ -1346,11 +1347,8 @@ xfs_qm_dqiterate( | |||
1346 | * the inode is never added to the transaction. | 1347 | * the inode is never added to the transaction. |
1347 | */ | 1348 | */ |
1348 | xfs_ilock(qip, XFS_ILOCK_SHARED); | 1349 | xfs_ilock(qip, XFS_ILOCK_SHARED); |
1349 | error = xfs_bmapi(NULL, qip, lblkno, | 1350 | error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, |
1350 | maxlblkcnt - lblkno, | 1351 | map, &nmaps, 0); |
1351 | XFS_BMAPI_METADATA, | ||
1352 | NULL, | ||
1353 | 0, map, &nmaps, NULL); | ||
1354 | xfs_iunlock(qip, XFS_ILOCK_SHARED); | 1352 | xfs_iunlock(qip, XFS_ILOCK_SHARED); |
1355 | if (error) | 1353 | if (error) |
1356 | break; | 1354 | break; |
@@ -1683,7 +1681,7 @@ xfs_qm_quotacheck( | |||
1683 | * quotacheck'd stamp on the superblock. So, here we do a synchronous | 1681 | * quotacheck'd stamp on the superblock. So, here we do a synchronous |
1684 | * flush. | 1682 | * flush. |
1685 | */ | 1683 | */ |
1686 | XFS_bflush(mp->m_ddev_targp); | 1684 | xfs_flush_buftarg(mp->m_ddev_targp, 1); |
1687 | 1685 | ||
1688 | /* | 1686 | /* |
1689 | * If one type of quotas is off, then it will lose its | 1687 | * If one type of quotas is off, then it will lose its |
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe1052c..43b9abe1052c 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/xfs_qm.h | |||
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a0a829addca9..a0a829addca9 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c | |||
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b32644..8671a0b32644 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c | |||
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h index 5b964fc0dc09..5b964fc0dc09 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/xfs_qm_stats.h | |||
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f42e6c..5cc3dde1bc90 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c | |||
@@ -261,7 +261,7 @@ xfs_qm_scall_trunc_qfile( | |||
261 | } | 261 | } |
262 | 262 | ||
263 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 263 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
264 | xfs_trans_ijoin(tp, ip); | 264 | xfs_trans_ijoin(tp, ip, 0); |
265 | 265 | ||
266 | error = xfs_itruncate_data(&tp, ip, 0); | 266 | error = xfs_itruncate_data(&tp, ip, 0); |
267 | if (error) { | 267 | if (error) { |
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h index 94a3d927d716..94a3d927d716 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/xfs_quota_priv.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 29b9d642e93d..7e76f537abb7 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include "xfs_trans.h" | 25 | #include "xfs_trans.h" |
26 | #include "xfs_bmap_btree.h" | 26 | #include "xfs_bmap_btree.h" |
27 | #include "xfs_inode.h" | 27 | #include "xfs_inode.h" |
28 | #include "quota/xfs_qm.h" | 28 | #include "xfs_qm.h" |
29 | #include <linux/quota.h> | 29 | #include <linux/quota.h> |
30 | 30 | ||
31 | 31 | ||
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index df78c297d1a1..866de277079a 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c | |||
@@ -170,12 +170,12 @@ xfs_rename( | |||
170 | * we can rely on either trans_commit or trans_cancel to unlock | 170 | * we can rely on either trans_commit or trans_cancel to unlock |
171 | * them. | 171 | * them. |
172 | */ | 172 | */ |
173 | xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); | 173 | xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); |
174 | if (new_parent) | 174 | if (new_parent) |
175 | xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); | 175 | xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); |
176 | xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); | 176 | xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); |
177 | if (target_ip) | 177 | if (target_ip) |
178 | xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); | 178 | xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); |
179 | 179 | ||
180 | /* | 180 | /* |
181 | * If we are using project inheritance, we only allow renames | 181 | * If we are using project inheritance, we only allow renames |
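xfs_trans_ijoin_ref() is merged into xfs_trans_ijoin(), which grows a lock-flags argument: 0 joins the inode while the caller keeps responsibility for its ilock, whereas passing the held lock flags transfers unlock duty to the transaction, the old _ref semantics relied on by the comment above. The two shapes as used across this series:

    /* caller keeps the lock and drops it itself later */
    xfs_trans_ijoin(tp, ip, 0);

    /* transaction will unlock at commit or cancel (formerly ijoin_ref) */
    xfs_ilock(ip, XFS_ILOCK_EXCL);
    xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);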
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8f76fdff4f46..87323f1ded64 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c | |||
@@ -112,7 +112,7 @@ xfs_growfs_rt_alloc( | |||
112 | * Lock the inode. | 112 | * Lock the inode. |
113 | */ | 113 | */ |
114 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 114 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
115 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); | 115 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
116 | 116 | ||
117 | xfs_bmap_init(&flist, &firstblock); | 117 | xfs_bmap_init(&flist, &firstblock); |
118 | /* | 118 | /* |
@@ -120,9 +120,9 @@ xfs_growfs_rt_alloc( | |||
120 | */ | 120 | */ |
121 | nmap = 1; | 121 | nmap = 1; |
122 | cancelflags |= XFS_TRANS_ABORT; | 122 | cancelflags |= XFS_TRANS_ABORT; |
123 | error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, | 123 | error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, |
124 | XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, | 124 | XFS_BMAPI_METADATA, &firstblock, |
125 | resblks, &map, &nmap, &flist); | 125 | resblks, &map, &nmap, &flist); |
126 | if (!error && nmap < 1) | 126 | if (!error && nmap < 1) |
127 | error = XFS_ERROR(ENOSPC); | 127 | error = XFS_ERROR(ENOSPC); |
128 | if (error) | 128 | if (error) |
@@ -155,7 +155,7 @@ xfs_growfs_rt_alloc( | |||
155 | * Lock the bitmap inode. | 155 | * Lock the bitmap inode. |
156 | */ | 156 | */ |
157 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 157 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
158 | xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); | 158 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
159 | /* | 159 | /* |
160 | * Get a buffer for the block. | 160 | * Get a buffer for the block. |
161 | */ | 161 | */ |
@@ -168,7 +168,7 @@ error_cancel: | |||
168 | xfs_trans_cancel(tp, cancelflags); | 168 | xfs_trans_cancel(tp, cancelflags); |
169 | goto error; | 169 | goto error; |
170 | } | 170 | } |
171 | memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); | 171 | memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); |
172 | xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); | 172 | xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); |
173 | /* | 173 | /* |
174 | * Commit the transaction. | 174 | * Commit the transaction. |
@@ -856,34 +856,24 @@ xfs_rtbuf_get( | |||
856 | xfs_buf_t **bpp) /* output: buffer for the block */ | 856 | xfs_buf_t **bpp) /* output: buffer for the block */ |
857 | { | 857 | { |
858 | xfs_buf_t *bp; /* block buffer, result */ | 858 | xfs_buf_t *bp; /* block buffer, result */ |
859 | xfs_daddr_t d; /* disk addr of block */ | ||
860 | int error; /* error value */ | ||
861 | xfs_fsblock_t fsb; /* fs block number for block */ | ||
862 | xfs_inode_t *ip; /* bitmap or summary inode */ | 859 | xfs_inode_t *ip; /* bitmap or summary inode */ |
860 | xfs_bmbt_irec_t map; | ||
861 | int nmap; | ||
862 | int error; /* error value */ | ||
863 | 863 | ||
864 | ip = issum ? mp->m_rsumip : mp->m_rbmip; | 864 | ip = issum ? mp->m_rsumip : mp->m_rbmip; |
865 | /* | 865 | |
866 | * Map from the file offset (block) and inode number to the | 866 | error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); |
867 | * file system block. | 867 | if (error) |
868 | */ | ||
869 | error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); | ||
870 | if (error) { | ||
871 | return error; | 868 | return error; |
872 | } | 869 | |
873 | ASSERT(fsb != NULLFSBLOCK); | 870 | ASSERT(map.br_startblock != NULLFSBLOCK); |
874 | /* | 871 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
875 | * Convert to disk address for buffer cache. | 872 | XFS_FSB_TO_DADDR(mp, map.br_startblock), |
876 | */ | ||
877 | d = XFS_FSB_TO_DADDR(mp, fsb); | ||
878 | /* | ||
879 | * Read the buffer. | ||
880 | */ | ||
881 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, | ||
882 | mp->m_bsize, 0, &bp); | 873 | mp->m_bsize, 0, &bp); |
883 | if (error) { | 874 | if (error) |
884 | return error; | 875 | return error; |
885 | } | 876 | ASSERT(!xfs_buf_geterror(bp)); |
886 | ASSERT(bp && !XFS_BUF_GETERROR(bp)); | ||
887 | *bpp = bp; | 877 | *bpp = bp; |
888 | return 0; | 878 | return 0; |
889 | } | 879 | } |
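With xfs_bmapi_single() gone, xfs_rtbuf_get() does a one-extent xfs_bmapi_read() lookup and hands the mapped start block to xfs_trans_read_buf(). A condensed sketch of the new lookup; nmap is initialised to 1 here for completeness since the call needs a mapping count going in, and XFS_DATA_FORK in the flags slot evaluates to 0, i.e. no special bmapi flags:

    xfs_bmbt_irec_t map;
    int             nmap = 1;   /* ask for a single mapping */

    error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
    if (error)
        return error;

    ASSERT(map.br_startblock != NULLFSBLOCK);
    error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                               XFS_FSB_TO_DADDR(mp, map.br_startblock),
                               mp->m_bsize, 0, &bp);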
@@ -943,7 +933,7 @@ xfs_rtcheck_range( | |||
943 | if (error) { | 933 | if (error) { |
944 | return error; | 934 | return error; |
945 | } | 935 | } |
946 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 936 | bufp = bp->b_addr; |
947 | /* | 937 | /* |
948 | * Compute the starting word's address, and starting bit. | 938 | * Compute the starting word's address, and starting bit. |
949 | */ | 939 | */ |
@@ -994,7 +984,7 @@ xfs_rtcheck_range( | |||
994 | if (error) { | 984 | if (error) { |
995 | return error; | 985 | return error; |
996 | } | 986 | } |
997 | b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 987 | b = bufp = bp->b_addr; |
998 | word = 0; | 988 | word = 0; |
999 | } else { | 989 | } else { |
1000 | /* | 990 | /* |
@@ -1040,7 +1030,7 @@ xfs_rtcheck_range( | |||
1040 | if (error) { | 1030 | if (error) { |
1041 | return error; | 1031 | return error; |
1042 | } | 1032 | } |
1043 | b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1033 | b = bufp = bp->b_addr; |
1044 | word = 0; | 1034 | word = 0; |
1045 | } else { | 1035 | } else { |
1046 | /* | 1036 | /* |
@@ -1158,7 +1148,7 @@ xfs_rtfind_back( | |||
1158 | if (error) { | 1148 | if (error) { |
1159 | return error; | 1149 | return error; |
1160 | } | 1150 | } |
1161 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1151 | bufp = bp->b_addr; |
1162 | /* | 1152 | /* |
1163 | * Get the first word's index & point to it. | 1153 | * Get the first word's index & point to it. |
1164 | */ | 1154 | */ |
@@ -1210,7 +1200,7 @@ xfs_rtfind_back( | |||
1210 | if (error) { | 1200 | if (error) { |
1211 | return error; | 1201 | return error; |
1212 | } | 1202 | } |
1213 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1203 | bufp = bp->b_addr; |
1214 | word = XFS_BLOCKWMASK(mp); | 1204 | word = XFS_BLOCKWMASK(mp); |
1215 | b = &bufp[word]; | 1205 | b = &bufp[word]; |
1216 | } else { | 1206 | } else { |
@@ -1256,7 +1246,7 @@ xfs_rtfind_back( | |||
1256 | if (error) { | 1246 | if (error) { |
1257 | return error; | 1247 | return error; |
1258 | } | 1248 | } |
1259 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1249 | bufp = bp->b_addr; |
1260 | word = XFS_BLOCKWMASK(mp); | 1250 | word = XFS_BLOCKWMASK(mp); |
1261 | b = &bufp[word]; | 1251 | b = &bufp[word]; |
1262 | } else { | 1252 | } else { |
@@ -1333,7 +1323,7 @@ xfs_rtfind_forw( | |||
1333 | if (error) { | 1323 | if (error) { |
1334 | return error; | 1324 | return error; |
1335 | } | 1325 | } |
1336 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1326 | bufp = bp->b_addr; |
1337 | /* | 1327 | /* |
1338 | * Get the first word's index & point to it. | 1328 | * Get the first word's index & point to it. |
1339 | */ | 1329 | */ |
@@ -1384,7 +1374,7 @@ xfs_rtfind_forw( | |||
1384 | if (error) { | 1374 | if (error) { |
1385 | return error; | 1375 | return error; |
1386 | } | 1376 | } |
1387 | b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1377 | b = bufp = bp->b_addr; |
1388 | word = 0; | 1378 | word = 0; |
1389 | } else { | 1379 | } else { |
1390 | /* | 1380 | /* |
@@ -1429,7 +1419,7 @@ xfs_rtfind_forw( | |||
1429 | if (error) { | 1419 | if (error) { |
1430 | return error; | 1420 | return error; |
1431 | } | 1421 | } |
1432 | b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1422 | b = bufp = bp->b_addr; |
1433 | word = 0; | 1423 | word = 0; |
1434 | } else { | 1424 | } else { |
1435 | /* | 1425 | /* |
@@ -1649,7 +1639,7 @@ xfs_rtmodify_range( | |||
1649 | if (error) { | 1639 | if (error) { |
1650 | return error; | 1640 | return error; |
1651 | } | 1641 | } |
1652 | bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1642 | bufp = bp->b_addr; |
1653 | /* | 1643 | /* |
1654 | * Compute the starting word's address, and starting bit. | 1644 | * Compute the starting word's address, and starting bit. |
1655 | */ | 1645 | */ |
@@ -1694,7 +1684,7 @@ xfs_rtmodify_range( | |||
1694 | if (error) { | 1684 | if (error) { |
1695 | return error; | 1685 | return error; |
1696 | } | 1686 | } |
1697 | first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1687 | first = b = bufp = bp->b_addr; |
1698 | word = 0; | 1688 | word = 0; |
1699 | } else { | 1689 | } else { |
1700 | /* | 1690 | /* |
@@ -1734,7 +1724,7 @@ xfs_rtmodify_range( | |||
1734 | if (error) { | 1724 | if (error) { |
1735 | return error; | 1725 | return error; |
1736 | } | 1726 | } |
1737 | first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); | 1727 | first = b = bufp = bp->b_addr; |
1738 | word = 0; | 1728 | word = 0; |
1739 | } else { | 1729 | } else { |
1740 | /* | 1730 | /* |
@@ -1832,8 +1822,8 @@ xfs_rtmodify_summary( | |||
1832 | */ | 1822 | */ |
1833 | sp = XFS_SUMPTR(mp, bp, so); | 1823 | sp = XFS_SUMPTR(mp, bp, so); |
1834 | *sp += delta; | 1824 | *sp += delta; |
1835 | xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), | 1825 | xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), |
1836 | (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); | 1826 | (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); |
1837 | return 0; | 1827 | return 0; |
1838 | } | 1828 | } |
1839 | 1829 | ||
@@ -1970,7 +1960,7 @@ xfs_growfs_rt( | |||
1970 | * Lock out other callers by grabbing the bitmap inode lock. | 1960 | * Lock out other callers by grabbing the bitmap inode lock. |
1971 | */ | 1961 | */ |
1972 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); | 1962 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); |
1973 | xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); | 1963 | xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); |
1974 | /* | 1964 | /* |
1975 | * Update the bitmap inode's size. | 1965 | * Update the bitmap inode's size. |
1976 | */ | 1966 | */ |
@@ -1982,7 +1972,7 @@ xfs_growfs_rt( | |||
1982 | * Get the summary inode into the transaction. | 1972 | * Get the summary inode into the transaction. |
1983 | */ | 1973 | */ |
1984 | xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); | 1974 | xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); |
1985 | xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL); | 1975 | xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); |
1986 | /* | 1976 | /* |
1987 | * Update the summary inode's size. | 1977 | * Update the summary inode's size. |
1988 | */ | 1978 | */ |
@@ -2153,7 +2143,7 @@ xfs_rtfree_extent( | |||
2153 | * Synchronize by locking the bitmap inode. | 2143 | * Synchronize by locking the bitmap inode. |
2154 | */ | 2144 | */ |
2155 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); | 2145 | xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); |
2156 | xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL); | 2146 | xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); |
2157 | 2147 | ||
2158 | #if defined(__KERNEL__) && defined(DEBUG) | 2148 | #if defined(__KERNEL__) && defined(DEBUG) |
2159 | /* | 2149 | /* |
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 09e1f4f35e97..f7f3a359c1c5 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h | |||
@@ -47,7 +47,7 @@ struct xfs_trans; | |||
47 | #define XFS_SUMOFFSTOBLOCK(mp,s) \ | 47 | #define XFS_SUMOFFSTOBLOCK(mp,s) \ |
48 | (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) | 48 | (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) |
49 | #define XFS_SUMPTR(mp,bp,so) \ | 49 | #define XFS_SUMPTR(mp,bp,so) \ |
50 | ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ | 50 | ((xfs_suminfo_t *)((bp)->b_addr + \ |
51 | (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) | 51 | (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) |
52 | 52 | ||
53 | #define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) | 53 | #define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) |
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index d6d6fdfe9422..597d044a09a1 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
@@ -92,24 +92,6 @@ xfs_do_force_shutdown( | |||
92 | } | 92 | } |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * Prints out an ALERT message about I/O error. | ||
96 | */ | ||
97 | void | ||
98 | xfs_ioerror_alert( | ||
99 | char *func, | ||
100 | struct xfs_mount *mp, | ||
101 | xfs_buf_t *bp, | ||
102 | xfs_daddr_t blkno) | ||
103 | { | ||
104 | xfs_alert(mp, | ||
105 | "I/O error occurred: meta-data dev %s block 0x%llx" | ||
106 | " (\"%s\") error %d buf count %zd", | ||
107 | XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), | ||
108 | (__uint64_t)blkno, func, | ||
109 | XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * This isn't an absolute requirement, but it is | 95 | * This isn't an absolute requirement, but it is |
114 | * just a good idea to call xfs_read_buf instead of | 96 | * just a good idea to call xfs_read_buf instead of |
115 | * directly doing a read_buf call. For one, we shouldn't | 97 | * directly doing a read_buf call. For one, we shouldn't |
@@ -137,20 +119,19 @@ xfs_read_buf( | |||
137 | bp = xfs_buf_read(target, blkno, len, flags); | 119 | bp = xfs_buf_read(target, blkno, len, flags); |
138 | if (!bp) | 120 | if (!bp) |
139 | return XFS_ERROR(EIO); | 121 | return XFS_ERROR(EIO); |
140 | error = XFS_BUF_GETERROR(bp); | 122 | error = bp->b_error; |
141 | if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { | 123 | if (!error && !XFS_FORCED_SHUTDOWN(mp)) { |
142 | *bpp = bp; | 124 | *bpp = bp; |
143 | } else { | 125 | } else { |
144 | *bpp = NULL; | 126 | *bpp = NULL; |
145 | if (error) { | 127 | if (error) { |
146 | xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); | 128 | xfs_buf_ioerror_alert(bp, __func__); |
147 | } else { | 129 | } else { |
148 | error = XFS_ERROR(EIO); | 130 | error = XFS_ERROR(EIO); |
149 | } | 131 | } |
150 | if (bp) { | 132 | if (bp) { |
151 | XFS_BUF_UNDONE(bp); | 133 | XFS_BUF_UNDONE(bp); |
152 | XFS_BUF_UNDELAYWRITE(bp); | 134 | xfs_buf_stale(bp); |
153 | XFS_BUF_STALE(bp); | ||
154 | /* | 135 | /* |
155 | * brelse clears B_ERROR and b_error | 136 | * brelse clears B_ERROR and b_error |
156 | */ | 137 | */ |
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index 11c41ec6ed75..bbdb9ad6a4ba 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h | |||
@@ -42,8 +42,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) | |||
42 | extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, | 42 | extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, |
43 | xfs_daddr_t blkno, int len, uint flags, | 43 | xfs_daddr_t blkno, int len, uint flags, |
44 | struct xfs_buf **bpp); | 44 | struct xfs_buf **bpp); |
45 | extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, | ||
46 | xfs_buf_t *bp, xfs_daddr_t blkno); | ||
47 | extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); | 45 | extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); |
48 | 46 | ||
49 | #endif /* __XFS_RW_H__ */ | 47 | #endif /* __XFS_RW_H__ */ |
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1eb2ba586814..cb6ae715814a 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h | |||
@@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) | |||
509 | 509 | ||
510 | #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ | 510 | #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ |
511 | #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) | 511 | #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) |
512 | #define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) | 512 | #define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) |
513 | 513 | ||
514 | #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) | 514 | #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) |
515 | #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ | 515 | #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ |
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c index 76fdc5861932..76fdc5861932 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/xfs_stats.c | |||
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h index 736854b1ca1a..736854b1ca1a 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/xfs_stats.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c index 9a72dda58bd0..3eca58f51ae9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
@@ -356,6 +356,8 @@ xfs_parseargs( | |||
356 | mp->m_flags |= XFS_MOUNT_DELAYLOG; | 356 | mp->m_flags |= XFS_MOUNT_DELAYLOG; |
357 | } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { | 357 | } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { |
358 | mp->m_flags &= ~XFS_MOUNT_DELAYLOG; | 358 | mp->m_flags &= ~XFS_MOUNT_DELAYLOG; |
359 | xfs_warn(mp, | ||
360 | "nodelaylog is deprecated and will be removed in Linux 3.3"); | ||
359 | } else if (!strcmp(this_char, MNTOPT_DISCARD)) { | 361 | } else if (!strcmp(this_char, MNTOPT_DISCARD)) { |
360 | mp->m_flags |= XFS_MOUNT_DISCARD; | 362 | mp->m_flags |= XFS_MOUNT_DISCARD; |
361 | } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { | 363 | } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { |
@@ -794,8 +796,6 @@ xfs_fs_destroy_inode( | |||
794 | if (is_bad_inode(inode)) | 796 | if (is_bad_inode(inode)) |
795 | goto out_reclaim; | 797 | goto out_reclaim; |
796 | 798 | ||
797 | xfs_ioend_wait(ip); | ||
798 | |||
799 | ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); | 799 | ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); |
800 | 800 | ||
801 | /* | 801 | /* |
@@ -835,7 +835,6 @@ xfs_fs_inode_init_once( | |||
835 | inode_init_once(VFS_I(ip)); | 835 | inode_init_once(VFS_I(ip)); |
836 | 836 | ||
837 | /* xfs inode */ | 837 | /* xfs inode */ |
838 | atomic_set(&ip->i_iocount, 0); | ||
839 | atomic_set(&ip->i_pincount, 0); | 838 | atomic_set(&ip->i_pincount, 0); |
840 | spin_lock_init(&ip->i_flags_lock); | 839 | spin_lock_init(&ip->i_flags_lock); |
841 | init_waitqueue_head(&ip->i_ipin_wait); | 840 | init_waitqueue_head(&ip->i_ipin_wait); |
@@ -877,33 +876,17 @@ xfs_log_inode( | |||
877 | struct xfs_trans *tp; | 876 | struct xfs_trans *tp; |
878 | int error; | 877 | int error; |
879 | 878 | ||
880 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
881 | tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); | 879 | tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); |
882 | error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); | 880 | error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); |
883 | |||
884 | if (error) { | 881 | if (error) { |
885 | xfs_trans_cancel(tp, 0); | 882 | xfs_trans_cancel(tp, 0); |
886 | /* we need to return with the lock held shared */ | ||
887 | xfs_ilock(ip, XFS_ILOCK_SHARED); | ||
888 | return error; | 883 | return error; |
889 | } | 884 | } |
890 | 885 | ||
891 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 886 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
892 | 887 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | |
893 | /* | ||
894 | * Note - it's possible that we might have pushed ourselves out of the | ||
895 | * way during trans_reserve which would flush the inode. But there's | ||
896 | * no guarantee that the inode buffer has actually gone out yet (it's | ||
897 | * delwri). Plus the buffer could be pinned anyway if it's part of | ||
898 | * an inode in another recent transaction. So we play it safe and | ||
899 | * fire off the transaction anyway. | ||
900 | */ | ||
901 | xfs_trans_ijoin(tp, ip); | ||
902 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 888 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
903 | error = xfs_trans_commit(tp, 0); | 889 | return xfs_trans_commit(tp, 0); |
904 | xfs_ilock_demote(ip, XFS_ILOCK_EXCL); | ||
905 | |||
906 | return error; | ||
907 | } | 890 | } |
908 | 891 | ||
909 | STATIC int | 892 | STATIC int |
@@ -918,7 +901,9 @@ xfs_fs_write_inode( | |||
918 | trace_xfs_write_inode(ip); | 901 | trace_xfs_write_inode(ip); |
919 | 902 | ||
920 | if (XFS_FORCED_SHUTDOWN(mp)) | 903 | if (XFS_FORCED_SHUTDOWN(mp)) |
921 | return XFS_ERROR(EIO); | 904 | return -XFS_ERROR(EIO); |
905 | if (!ip->i_update_core) | ||
906 | return 0; | ||
922 | 907 | ||
923 | if (wbc->sync_mode == WB_SYNC_ALL) { | 908 | if (wbc->sync_mode == WB_SYNC_ALL) { |
924 | /* | 909 | /* |
@@ -926,15 +911,12 @@ xfs_fs_write_inode( | |||
926 | * of forcing it all the way to stable storage using a | 911 | * of forcing it all the way to stable storage using a |
927 | * synchronous transaction we let the log force inside the | 912 | * synchronous transaction we let the log force inside the |
928 | * ->sync_fs call do that for us, which reduces the number | 913 | * ->sync_fs call do that for us, which reduces the number |
929 | * of synchronous log foces dramatically. | 914 | * of synchronous log forces dramatically. |
930 | */ | 915 | */ |
931 | xfs_ioend_wait(ip); | 916 | error = xfs_log_inode(ip); |
932 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 917 | if (error) |
933 | if (ip->i_update_core) { | 918 | goto out; |
934 | error = xfs_log_inode(ip); | 919 | return 0; |
935 | if (error) | ||
936 | goto out_unlock; | ||
937 | } | ||
938 | } else { | 920 | } else { |
939 | /* | 921 | /* |
940 | * We make this non-blocking if the inode is contended, return | 922 | * We make this non-blocking if the inode is contended, return |
@@ -1033,7 +1015,7 @@ xfs_fs_put_super( | |||
1033 | */ | 1015 | */ |
1034 | xfs_filestream_unmount(mp); | 1016 | xfs_filestream_unmount(mp); |
1035 | 1017 | ||
1036 | XFS_bflush(mp->m_ddev_targp); | 1018 | xfs_flush_buftarg(mp->m_ddev_targp, 1); |
1037 | 1019 | ||
1038 | xfs_unmountfs(mp); | 1020 | xfs_unmountfs(mp); |
1039 | xfs_freesb(mp); | 1021 | xfs_freesb(mp); |
@@ -1457,7 +1439,7 @@ xfs_fs_fill_super( | |||
1457 | */ | 1439 | */ |
1458 | xfs_filestream_unmount(mp); | 1440 | xfs_filestream_unmount(mp); |
1459 | 1441 | ||
1460 | XFS_bflush(mp->m_ddev_targp); | 1442 | xfs_flush_buftarg(mp->m_ddev_targp, 1); |
1461 | 1443 | ||
1462 | xfs_unmountfs(mp); | 1444 | xfs_unmountfs(mp); |
1463 | goto out_free_sb; | 1445 | goto out_free_sb; |
@@ -1666,24 +1648,13 @@ xfs_init_workqueues(void) | |||
1666 | */ | 1648 | */ |
1667 | xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); | 1649 | xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); |
1668 | if (!xfs_syncd_wq) | 1650 | if (!xfs_syncd_wq) |
1669 | goto out; | 1651 | return -ENOMEM; |
1670 | |||
1671 | xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); | ||
1672 | if (!xfs_ail_wq) | ||
1673 | goto out_destroy_syncd; | ||
1674 | |||
1675 | return 0; | 1652 | return 0; |
1676 | |||
1677 | out_destroy_syncd: | ||
1678 | destroy_workqueue(xfs_syncd_wq); | ||
1679 | out: | ||
1680 | return -ENOMEM; | ||
1681 | } | 1653 | } |
1682 | 1654 | ||
1683 | STATIC void | 1655 | STATIC void |
1684 | xfs_destroy_workqueues(void) | 1656 | xfs_destroy_workqueues(void) |
1685 | { | 1657 | { |
1686 | destroy_workqueue(xfs_ail_wq); | ||
1687 | destroy_workqueue(xfs_syncd_wq); | 1658 | destroy_workqueue(xfs_syncd_wq); |
1688 | } | 1659 | } |
1689 | 1660 | ||
@@ -1695,7 +1666,6 @@ init_xfs_fs(void) | |||
1695 | printk(KERN_INFO XFS_VERSION_STRING " with " | 1666 | printk(KERN_INFO XFS_VERSION_STRING " with " |
1696 | XFS_BUILD_OPTIONS " enabled\n"); | 1667 | XFS_BUILD_OPTIONS " enabled\n"); |
1697 | 1668 | ||
1698 | xfs_ioend_init(); | ||
1699 | xfs_dir_startup(); | 1669 | xfs_dir_startup(); |
1700 | 1670 | ||
1701 | error = xfs_init_zones(); | 1671 | error = xfs_init_zones(); |
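Two behavioural notes on the ->write_inode change earlier in this file: the shutdown case now returns a negative errno as the VFS expects, and the i_update_core test is hoisted so clean inodes bail out before taking any locks. The WB_SYNC_ALL path then collapses to a single transactional log of the inode core; a sketch of the resulting control flow (error handling trimmed):

	if (XFS_FORCED_SHUTDOWN(mp))
		return -XFS_ERROR(EIO);
	if (!ip->i_update_core)
		return 0;			/* inode core is clean */

	if (wbc->sync_mode == WB_SYNC_ALL)
		return xfs_log_inode(ip);	/* log force deferred to ->sync_fs */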
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h index 50a3266c999e..50a3266c999e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/xfs_super.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c index e4c938afb910..aa3dc1a4d53d 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/xfs_sync.c | |||
@@ -227,21 +227,17 @@ xfs_sync_inode_data( | |||
227 | int error = 0; | 227 | int error = 0; |
228 | 228 | ||
229 | if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | 229 | if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
230 | goto out_wait; | 230 | return 0; |
231 | 231 | ||
232 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { | 232 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { |
233 | if (flags & SYNC_TRYLOCK) | 233 | if (flags & SYNC_TRYLOCK) |
234 | goto out_wait; | 234 | return 0; |
235 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 235 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
236 | } | 236 | } |
237 | 237 | ||
238 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? | 238 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? |
239 | 0 : XBF_ASYNC, FI_NONE); | 239 | 0 : XBF_ASYNC, FI_NONE); |
240 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 240 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
241 | |||
242 | out_wait: | ||
243 | if (flags & SYNC_WAIT) | ||
244 | xfs_ioend_wait(ip); | ||
245 | return error; | 241 | return error; |
246 | } | 242 | } |
247 | 243 | ||
@@ -322,6 +318,7 @@ xfs_sync_fsdata( | |||
322 | struct xfs_mount *mp) | 318 | struct xfs_mount *mp) |
323 | { | 319 | { |
324 | struct xfs_buf *bp; | 320 | struct xfs_buf *bp; |
321 | int error; | ||
325 | 322 | ||
326 | /* | 323 | /* |
327 | * If the buffer is pinned then push on the log so we won't get stuck | 324 | * If the buffer is pinned then push on the log so we won't get stuck |
@@ -332,10 +329,11 @@ xfs_sync_fsdata( | |||
332 | * between there and here. | 329 | * between there and here. |
333 | */ | 330 | */ |
334 | bp = xfs_getsb(mp, 0); | 331 | bp = xfs_getsb(mp, 0); |
335 | if (XFS_BUF_ISPINNED(bp)) | 332 | if (xfs_buf_ispinned(bp)) |
336 | xfs_log_force(mp, 0); | 333 | xfs_log_force(mp, 0); |
337 | 334 | error = xfs_bwrite(bp); | |
338 | return xfs_bwrite(mp, bp); | 335 | xfs_buf_relse(bp); |
336 | return error; | ||
339 | } | 337 | } |
340 | 338 | ||
341 | /* | 339 | /* |
@@ -379,7 +377,7 @@ xfs_quiesce_data( | |||
379 | 377 | ||
380 | /* flush data-only devices */ | 378 | /* flush data-only devices */ |
381 | if (mp->m_rtdev_targp) | 379 | if (mp->m_rtdev_targp) |
382 | XFS_bflush(mp->m_rtdev_targp); | 380 | xfs_flush_buftarg(mp->m_rtdev_targp, 1); |
383 | 381 | ||
384 | return error ? error : error2; | 382 | return error ? error : error2; |
385 | } | 383 | } |
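The xfs_sync_fsdata() rewrite reflects the new xfs_bwrite() calling convention: it takes only the buffer, writes synchronously, and leaves the reference with the caller, who must release it explicitly. As a usage sketch of the convention this hunk adopts:

	bp = xfs_getsb(mp, 0);
	if (xfs_buf_ispinned(bp))
		xfs_log_force(mp, 0);	/* unpin the superblock first */

	error = xfs_bwrite(bp);		/* no mount argument any more */
	xfs_buf_relse(bp);		/* caller drops the reference */
	return error;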
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..941202e7ac6e 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/xfs_sync.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..ee2d2adaa438 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c | |||
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..b9937d450f8e 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h | |||
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c index 88d25d4aa56e..9010ce885e6a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/xfs_trace.c | |||
@@ -43,8 +43,8 @@ | |||
43 | #include "xfs_quota.h" | 43 | #include "xfs_quota.h" |
44 | #include "xfs_iomap.h" | 44 | #include "xfs_iomap.h" |
45 | #include "xfs_aops.h" | 45 | #include "xfs_aops.h" |
46 | #include "quota/xfs_dquot_item.h" | 46 | #include "xfs_dquot_item.h" |
47 | #include "quota/xfs_dquot.h" | 47 | #include "xfs_dquot.h" |
48 | #include "xfs_log_recover.h" | 48 | #include "xfs_log_recover.h" |
49 | #include "xfs_inode_item.h" | 49 | #include "xfs_inode_item.h" |
50 | 50 | ||
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a7bd72..f1d2802b2f07 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -30,6 +30,7 @@ struct xfs_buf_log_item; | |||
30 | struct xfs_da_args; | 30 | struct xfs_da_args; |
31 | struct xfs_da_node_entry; | 31 | struct xfs_da_node_entry; |
32 | struct xfs_dquot; | 32 | struct xfs_dquot; |
33 | struct xfs_log_item; | ||
33 | struct xlog_ticket; | 34 | struct xlog_ticket; |
34 | struct log; | 35 | struct log; |
35 | struct xlog_recover; | 36 | struct xlog_recover; |
@@ -320,7 +321,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); | |||
320 | DEFINE_BUF_EVENT(xfs_buf_iodone); | 321 | DEFINE_BUF_EVENT(xfs_buf_iodone); |
321 | DEFINE_BUF_EVENT(xfs_buf_iorequest); | 322 | DEFINE_BUF_EVENT(xfs_buf_iorequest); |
322 | DEFINE_BUF_EVENT(xfs_buf_bawrite); | 323 | DEFINE_BUF_EVENT(xfs_buf_bawrite); |
323 | DEFINE_BUF_EVENT(xfs_buf_bdwrite); | ||
324 | DEFINE_BUF_EVENT(xfs_buf_lock); | 324 | DEFINE_BUF_EVENT(xfs_buf_lock); |
325 | DEFINE_BUF_EVENT(xfs_buf_lock_done); | 325 | DEFINE_BUF_EVENT(xfs_buf_lock_done); |
326 | DEFINE_BUF_EVENT(xfs_buf_trylock); | 326 | DEFINE_BUF_EVENT(xfs_buf_trylock); |
@@ -577,6 +577,7 @@ DEFINE_INODE_EVENT(xfs_vm_bmap); | |||
577 | DEFINE_INODE_EVENT(xfs_file_ioctl); | 577 | DEFINE_INODE_EVENT(xfs_file_ioctl); |
578 | DEFINE_INODE_EVENT(xfs_file_compat_ioctl); | 578 | DEFINE_INODE_EVENT(xfs_file_compat_ioctl); |
579 | DEFINE_INODE_EVENT(xfs_ioctl_setattr); | 579 | DEFINE_INODE_EVENT(xfs_ioctl_setattr); |
580 | DEFINE_INODE_EVENT(xfs_dir_fsync); | ||
580 | DEFINE_INODE_EVENT(xfs_file_fsync); | 581 | DEFINE_INODE_EVENT(xfs_file_fsync); |
581 | DEFINE_INODE_EVENT(xfs_destroy_inode); | 582 | DEFINE_INODE_EVENT(xfs_destroy_inode); |
582 | DEFINE_INODE_EVENT(xfs_write_inode); | 583 | DEFINE_INODE_EVENT(xfs_write_inode); |
@@ -853,6 +854,42 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); | |||
853 | DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); | 854 | DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); |
854 | DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); | 855 | DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); |
855 | 856 | ||
857 | DECLARE_EVENT_CLASS(xfs_log_item_class, | ||
858 | TP_PROTO(struct xfs_log_item *lip), | ||
859 | TP_ARGS(lip), | ||
860 | TP_STRUCT__entry( | ||
861 | __field(dev_t, dev) | ||
862 | __field(void *, lip) | ||
863 | __field(uint, type) | ||
864 | __field(uint, flags) | ||
865 | __field(xfs_lsn_t, lsn) | ||
866 | ), | ||
867 | TP_fast_assign( | ||
868 | __entry->dev = lip->li_mountp->m_super->s_dev; | ||
869 | __entry->lip = lip; | ||
870 | __entry->type = lip->li_type; | ||
871 | __entry->flags = lip->li_flags; | ||
872 | __entry->lsn = lip->li_lsn; | ||
873 | ), | ||
874 | TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", | ||
875 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
876 | __entry->lip, | ||
877 | CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), | ||
878 | __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), | ||
879 | __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) | ||
880 | ) | ||
881 | |||
882 | #define DEFINE_LOG_ITEM_EVENT(name) \ | ||
883 | DEFINE_EVENT(xfs_log_item_class, name, \ | ||
884 | TP_PROTO(struct xfs_log_item *lip), \ | ||
885 | TP_ARGS(lip)) | ||
886 | DEFINE_LOG_ITEM_EVENT(xfs_ail_push); | ||
887 | DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); | ||
888 | DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); | ||
889 | DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); | ||
890 | DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); | ||
891 | |||
892 | |||
856 | DECLARE_EVENT_CLASS(xfs_file_class, | 893 | DECLARE_EVENT_CLASS(xfs_file_class, |
857 | TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), | 894 | TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), |
858 | TP_ARGS(ip, count, offset, flags), | 895 | TP_ARGS(ip, count, offset, flags), |
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index efc147f0e9b6..1f35b2feca97 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
@@ -1790,9 +1790,7 @@ xfs_trans_commit_cil( | |||
1790 | } | 1790 | } |
1791 | 1791 | ||
1792 | /* | 1792 | /* |
1793 | * xfs_trans_commit | 1793 | * Commit the given transaction to the log. |
1794 | * | ||
1795 | * Commit the given transaction to the log a/synchronously. | ||
1796 | * | 1794 | * |
1797 | * XFS disk error handling mechanism is not based on a typical | 1795 | * XFS disk error handling mechanism is not based on a typical |
1798 | * transaction abort mechanism. Logically after the filesystem | 1796 | * transaction abort mechanism. Logically after the filesystem |
@@ -1804,10 +1802,9 @@ xfs_trans_commit_cil( | |||
1804 | * Do not reference the transaction structure after this call. | 1802 | * Do not reference the transaction structure after this call. |
1805 | */ | 1803 | */ |
1806 | int | 1804 | int |
1807 | _xfs_trans_commit( | 1805 | xfs_trans_commit( |
1808 | struct xfs_trans *tp, | 1806 | struct xfs_trans *tp, |
1809 | uint flags, | 1807 | uint flags) |
1810 | int *log_flushed) | ||
1811 | { | 1808 | { |
1812 | struct xfs_mount *mp = tp->t_mountp; | 1809 | struct xfs_mount *mp = tp->t_mountp; |
1813 | xfs_lsn_t commit_lsn = -1; | 1810 | xfs_lsn_t commit_lsn = -1; |
@@ -1866,7 +1863,7 @@ _xfs_trans_commit( | |||
1866 | if (sync) { | 1863 | if (sync) { |
1867 | if (!error) { | 1864 | if (!error) { |
1868 | error = _xfs_log_force_lsn(mp, commit_lsn, | 1865 | error = _xfs_log_force_lsn(mp, commit_lsn, |
1869 | XFS_LOG_SYNC, log_flushed); | 1866 | XFS_LOG_SYNC, NULL); |
1870 | } | 1867 | } |
1871 | XFS_STATS_INC(xs_trans_sync); | 1868 | XFS_STATS_INC(xs_trans_sync); |
1872 | } else { | 1869 | } else { |
@@ -2021,6 +2018,6 @@ xfs_trans_roll( | |||
2021 | if (error) | 2018 | if (error) |
2022 | return error; | 2019 | return error; |
2023 | 2020 | ||
2024 | xfs_trans_ijoin(trans, dp); | 2021 | xfs_trans_ijoin(trans, dp, 0); |
2025 | return 0; | 2022 | return 0; |
2026 | } | 2023 | } |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759b6352..603f3eb52041 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops { | |||
350 | void (*iop_unlock)(xfs_log_item_t *); | 350 | void (*iop_unlock)(xfs_log_item_t *); |
351 | xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); | 351 | xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); |
352 | void (*iop_push)(xfs_log_item_t *); | 352 | void (*iop_push)(xfs_log_item_t *); |
353 | void (*iop_pushbuf)(xfs_log_item_t *); | 353 | bool (*iop_pushbuf)(xfs_log_item_t *); |
354 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); | 354 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); |
355 | } xfs_item_ops_t; | 355 | } xfs_item_ops_t; |
356 | 356 | ||
@@ -470,8 +470,7 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); | |||
470 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); | 470 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); |
471 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); | 471 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); |
472 | void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); | 472 | void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); |
473 | void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); | 473 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); |
474 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); | ||
475 | void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); | 474 | void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); |
476 | void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); | 475 | void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); |
477 | struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); | 476 | struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); |
@@ -487,10 +486,7 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, | |||
487 | struct xfs_efd_log_item *, | 486 | struct xfs_efd_log_item *, |
488 | xfs_fsblock_t, | 487 | xfs_fsblock_t, |
489 | xfs_extlen_t); | 488 | xfs_extlen_t); |
490 | int _xfs_trans_commit(xfs_trans_t *, | 489 | int xfs_trans_commit(xfs_trans_t *, uint flags); |
491 | uint flags, | ||
492 | int *); | ||
493 | #define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) | ||
494 | void xfs_trans_cancel(xfs_trans_t *, int); | 490 | void xfs_trans_cancel(xfs_trans_t *, int); |
495 | int xfs_trans_ail_init(struct xfs_mount *); | 491 | int xfs_trans_ail_init(struct xfs_mount *); |
496 | void xfs_trans_ail_destroy(struct xfs_mount *); | 492 | void xfs_trans_ail_destroy(struct xfs_mount *); |
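With the _xfs_trans_commit()/log_flushed variant gone, xfs_trans_commit() is an ordinary function again, and a caller that needs the commit record on stable storage marks the transaction synchronous instead of threading an out-parameter through. A sketch of the caller-side idiom (xfs_trans_set_sync() is the pre-existing flag helper, not something added by this patch, and need_stable is a hypothetical condition):

	if (need_stable)
		xfs_trans_set_sync(tp);		/* commit forces the log */
	error = xfs_trans_commit(tp, 0);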
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 43233e92f0f6..ed9252bcdac9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c | |||
@@ -26,10 +26,9 @@ | |||
26 | #include "xfs_ag.h" | 26 | #include "xfs_ag.h" |
27 | #include "xfs_mount.h" | 27 | #include "xfs_mount.h" |
28 | #include "xfs_trans_priv.h" | 28 | #include "xfs_trans_priv.h" |
29 | #include "xfs_trace.h" | ||
29 | #include "xfs_error.h" | 30 | #include "xfs_error.h" |
30 | 31 | ||
31 | struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ | ||
32 | |||
33 | #ifdef DEBUG | 32 | #ifdef DEBUG |
34 | /* | 33 | /* |
35 | * Check that the list is sorted as it should be. | 34 | * Check that the list is sorted as it should be. |
@@ -299,7 +298,7 @@ xfs_trans_ail_cursor_last( | |||
299 | * Splice the log item list into the AIL at the given LSN. We splice to the | 298 | * Splice the log item list into the AIL at the given LSN. We splice to the |
300 | * tail of the given LSN to maintain insert order for push traversals. The | 299 | * tail of the given LSN to maintain insert order for push traversals. The |
301 | * cursor is optional, allowing repeated updates to the same LSN to avoid | 300 | * cursor is optional, allowing repeated updates to the same LSN to avoid |
302 | * repeated traversals. | 301 | * repeated traversals. This should not be called with an empty list. |
303 | */ | 302 | */ |
304 | static void | 303 | static void |
305 | xfs_ail_splice( | 304 | xfs_ail_splice( |
@@ -308,50 +307,39 @@ xfs_ail_splice( | |||
308 | struct list_head *list, | 307 | struct list_head *list, |
309 | xfs_lsn_t lsn) | 308 | xfs_lsn_t lsn) |
310 | { | 309 | { |
311 | struct xfs_log_item *lip = cur ? cur->item : NULL; | 310 | struct xfs_log_item *lip; |
312 | struct xfs_log_item *next_lip; | 311 | |
312 | ASSERT(!list_empty(list)); | ||
313 | 313 | ||
314 | /* | 314 | /* |
315 | * Get a new cursor if we don't have a placeholder or the existing one | 315 | * Use the cursor to determine the insertion point if one is |
316 | * has been invalidated. | 316 | * provided. If not, or if the one we got is not valid, |
317 | * find the place in the AIL where the items belong. | ||
317 | */ | 318 | */ |
318 | if (!lip || (__psint_t)lip & 1) { | 319 | lip = cur ? cur->item : NULL; |
320 | if (!lip || (__psint_t) lip & 1) | ||
319 | lip = __xfs_trans_ail_cursor_last(ailp, lsn); | 321 | lip = __xfs_trans_ail_cursor_last(ailp, lsn); |
320 | 322 | ||
321 | if (!lip) { | 323 | /* |
322 | /* The list is empty, so just splice and return. */ | 324 | * If a cursor is provided, we know we're processing the AIL |
323 | if (cur) | 325 | * in lsn order, and future items to be spliced in will |
324 | cur->item = NULL; | 326 | * follow the last one being inserted now. Update the |
325 | list_splice(list, &ailp->xa_ail); | 327 | * cursor to point to that last item, now while we have a |
326 | return; | 328 | * reliable pointer to it. |
327 | } | 329 | */ |
328 | } | 330 | if (cur) |
331 | cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); | ||
329 | 332 | ||
330 | /* | 333 | /* |
331 | * Our cursor points to the item we want to insert _after_, so we have | 334 | * Finally perform the splice. Unless the AIL was empty, |
332 | * to update the cursor to point to the end of the list we are splicing | 335 | * lip points to the item in the AIL _after_ which the new |
333 | * in so that it points to the correct location for the next splice. | 336 | * items should go. If lip is null the AIL was empty, so |
334 | * i.e. before the splice | 337 | * the new items go at the head of the AIL. |
335 | * | ||
336 | * lsn -> lsn -> lsn + x -> lsn + x ... | ||
337 | * ^ | ||
338 | * | cursor points here | ||
339 | * | ||
340 | * After the splice we have: | ||
341 | * | ||
342 | * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... | ||
343 | * ^ ^ | ||
344 | * | cursor points here | needs to move here | ||
345 | * | ||
346 | * So we set the cursor to the last item in the list to be spliced | ||
347 | * before we execute the splice, resulting in the cursor pointing to | ||
348 | * the correct item after the splice occurs. | ||
349 | */ | 338 | */ |
350 | if (cur) { | 339 | if (lip) |
351 | next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); | 340 | list_splice(list, &lip->li_ail); |
352 | cur->item = next_lip; | 341 | else |
353 | } | 342 | list_splice(list, &ailp->xa_ail); |
354 | list_splice(list, &lip->li_ail); | ||
355 | } | 343 | } |
356 | 344 | ||
357 | /* | 345 | /* |
@@ -367,28 +355,34 @@ xfs_ail_delete( | |||
367 | xfs_trans_ail_cursor_clear(ailp, lip); | 355 | xfs_trans_ail_cursor_clear(ailp, lip); |
368 | } | 356 | } |
369 | 357 | ||
370 | /* | 358 | static long |
371 | * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself | 359 | xfsaild_push( |
372 | * to run at a later time if there is more work to do to complete the push. | 360 | struct xfs_ail *ailp) |
373 | */ | ||
374 | STATIC void | ||
375 | xfs_ail_worker( | ||
376 | struct work_struct *work) | ||
377 | { | 361 | { |
378 | struct xfs_ail *ailp = container_of(to_delayed_work(work), | ||
379 | struct xfs_ail, xa_work); | ||
380 | xfs_mount_t *mp = ailp->xa_mount; | 362 | xfs_mount_t *mp = ailp->xa_mount; |
381 | struct xfs_ail_cursor cur; | 363 | struct xfs_ail_cursor cur; |
382 | xfs_log_item_t *lip; | 364 | xfs_log_item_t *lip; |
383 | xfs_lsn_t lsn; | 365 | xfs_lsn_t lsn; |
384 | xfs_lsn_t target; | 366 | xfs_lsn_t target; |
385 | long tout = 10; | 367 | long tout = 10; |
386 | int flush_log = 0; | ||
387 | int stuck = 0; | 368 | int stuck = 0; |
388 | int count = 0; | 369 | int count = 0; |
389 | int push_xfsbufd = 0; | 370 | int push_xfsbufd = 0; |
390 | 371 | ||
372 | /* | ||
373 | * If last time we ran we encountered pinned items, force the log first | ||
374 | * and wait for it before pushing again. | ||
375 | */ | ||
391 | spin_lock(&ailp->xa_lock); | 376 | spin_lock(&ailp->xa_lock); |
377 | if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && | ||
378 | !list_empty(&ailp->xa_ail)) { | ||
379 | ailp->xa_log_flush = 0; | ||
380 | spin_unlock(&ailp->xa_lock); | ||
381 | XFS_STATS_INC(xs_push_ail_flush); | ||
382 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
383 | spin_lock(&ailp->xa_lock); | ||
384 | } | ||
385 | |||
392 | target = ailp->xa_target; | 386 | target = ailp->xa_target; |
393 | lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); | 387 | lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); |
394 | if (!lip || XFS_FORCED_SHUTDOWN(mp)) { | 388 | if (!lip || XFS_FORCED_SHUTDOWN(mp)) { |
@@ -432,26 +426,37 @@ xfs_ail_worker( | |||
432 | switch (lock_result) { | 426 | switch (lock_result) { |
433 | case XFS_ITEM_SUCCESS: | 427 | case XFS_ITEM_SUCCESS: |
434 | XFS_STATS_INC(xs_push_ail_success); | 428 | XFS_STATS_INC(xs_push_ail_success); |
429 | trace_xfs_ail_push(lip); | ||
430 | |||
435 | IOP_PUSH(lip); | 431 | IOP_PUSH(lip); |
436 | ailp->xa_last_pushed_lsn = lsn; | 432 | ailp->xa_last_pushed_lsn = lsn; |
437 | break; | 433 | break; |
438 | 434 | ||
439 | case XFS_ITEM_PUSHBUF: | 435 | case XFS_ITEM_PUSHBUF: |
440 | XFS_STATS_INC(xs_push_ail_pushbuf); | 436 | XFS_STATS_INC(xs_push_ail_pushbuf); |
441 | IOP_PUSHBUF(lip); | 437 | trace_xfs_ail_pushbuf(lip); |
442 | ailp->xa_last_pushed_lsn = lsn; | 438 | |
439 | if (!IOP_PUSHBUF(lip)) { | ||
440 | trace_xfs_ail_pushbuf_pinned(lip); | ||
441 | stuck++; | ||
442 | ailp->xa_log_flush++; | ||
443 | } else { | ||
444 | ailp->xa_last_pushed_lsn = lsn; | ||
445 | } | ||
443 | push_xfsbufd = 1; | 446 | push_xfsbufd = 1; |
444 | break; | 447 | break; |
445 | 448 | ||
446 | case XFS_ITEM_PINNED: | 449 | case XFS_ITEM_PINNED: |
447 | XFS_STATS_INC(xs_push_ail_pinned); | 450 | XFS_STATS_INC(xs_push_ail_pinned); |
451 | trace_xfs_ail_pinned(lip); | ||
452 | |||
448 | stuck++; | 453 | stuck++; |
449 | flush_log = 1; | 454 | ailp->xa_log_flush++; |
450 | break; | 455 | break; |
451 | 456 | ||
452 | case XFS_ITEM_LOCKED: | 457 | case XFS_ITEM_LOCKED: |
453 | XFS_STATS_INC(xs_push_ail_locked); | 458 | XFS_STATS_INC(xs_push_ail_locked); |
454 | ailp->xa_last_pushed_lsn = lsn; | 459 | trace_xfs_ail_locked(lip); |
455 | stuck++; | 460 | stuck++; |
456 | break; | 461 | break; |
457 | 462 | ||
@@ -491,16 +496,6 @@ xfs_ail_worker( | |||
491 | xfs_trans_ail_cursor_done(ailp, &cur); | 496 | xfs_trans_ail_cursor_done(ailp, &cur); |
492 | spin_unlock(&ailp->xa_lock); | 497 | spin_unlock(&ailp->xa_lock); |
493 | 498 | ||
494 | if (flush_log) { | ||
495 | /* | ||
496 | * If something we need to push out was pinned, then | ||
497 | * push out the log so it will become unpinned and | ||
498 | * move forward in the AIL. | ||
499 | */ | ||
500 | XFS_STATS_INC(xs_push_ail_flush); | ||
501 | xfs_log_force(mp, 0); | ||
502 | } | ||
503 | |||
504 | if (push_xfsbufd) { | 499 | if (push_xfsbufd) { |
505 | /* we've got delayed write buffers to flush */ | 500 | /* we've got delayed write buffers to flush */ |
506 | wake_up_process(mp->m_ddev_targp->bt_task); | 501 | wake_up_process(mp->m_ddev_targp->bt_task); |
@@ -511,20 +506,7 @@ out_done: | |||
511 | if (!count) { | 506 | if (!count) { |
512 | /* We're past our target or empty, so idle */ | 507 | /* We're past our target or empty, so idle */ |
513 | ailp->xa_last_pushed_lsn = 0; | 508 | ailp->xa_last_pushed_lsn = 0; |
514 | 509 | ailp->xa_log_flush = 0; | |
515 | /* | ||
516 | * We clear the XFS_AIL_PUSHING_BIT first before checking | ||
517 | * whether the target has changed. If the target has changed, | ||
518 | * this pushes the requeue race directly onto the result of the | ||
519 | * atomic test/set bit, so we are guaranteed that either the | ||
520 | * the pusher that changed the target or ourselves will requeue | ||
521 | * the work (but not both). | ||
522 | */ | ||
523 | clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); | ||
524 | smp_rmb(); | ||
525 | if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || | ||
526 | test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) | ||
527 | return; | ||
528 | 510 | ||
529 | tout = 50; | 511 | tout = 50; |
530 | } else if (XFS_LSN_CMP(lsn, target) >= 0) { | 512 | } else if (XFS_LSN_CMP(lsn, target) >= 0) { |
@@ -543,14 +525,39 @@ out_done: | |||
543 | * were stuck. | 525 | * were stuck. |
544 | * | 526 | * |
545 | * Backoff a bit more to allow some I/O to complete before | 527 | * Backoff a bit more to allow some I/O to complete before |
546 | * continuing from where we were. | 528 | * restarting from the start of the AIL. This prevents us |
529 | * from spinning on the same items, and if they are pinned will | ||
530 | * allow the restart to issue a log force to unpin the stuck | ||
531 | * items. | ||
547 | */ | 532 | */ |
548 | tout = 20; | 533 | tout = 20; |
534 | ailp->xa_last_pushed_lsn = 0; | ||
535 | } | ||
536 | |||
537 | return tout; | ||
538 | } | ||
539 | |||
540 | static int | ||
541 | xfsaild( | ||
542 | void *data) | ||
543 | { | ||
544 | struct xfs_ail *ailp = data; | ||
545 | long tout = 0; /* milliseconds */ | ||
546 | |||
547 | while (!kthread_should_stop()) { | ||
548 | if (tout && tout <= 20) | ||
549 | __set_current_state(TASK_KILLABLE); | ||
550 | else | ||
551 | __set_current_state(TASK_INTERRUPTIBLE); | ||
552 | schedule_timeout(tout ? | ||
553 | msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); | ||
554 | |||
555 | try_to_freeze(); | ||
556 | |||
557 | tout = xfsaild_push(ailp); | ||
549 | } | 558 | } |
550 | 559 | ||
551 | /* There is more to do, requeue us. */ | 560 | return 0; |
552 | queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, | ||
553 | msecs_to_jiffies(tout)); | ||
554 | } | 561 | } |
555 | 562 | ||
556 | /* | 563 | /* |
@@ -585,8 +592,9 @@ xfs_ail_push( | |||
585 | */ | 592 | */ |
586 | smp_wmb(); | 593 | smp_wmb(); |
587 | xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); | 594 | xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); |
588 | if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) | 595 | smp_wmb(); |
589 | queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); | 596 | |
597 | wake_up_process(ailp->xa_task); | ||
590 | } | 598 | } |
591 | 599 | ||
592 | /* | 600 | /* |
@@ -682,6 +690,7 @@ xfs_trans_ail_update_bulk( | |||
682 | int i; | 690 | int i; |
683 | LIST_HEAD(tmp); | 691 | LIST_HEAD(tmp); |
684 | 692 | ||
693 | ASSERT(nr_items > 0); /* Not required, but true. */ | ||
685 | mlip = xfs_ail_min(ailp); | 694 | mlip = xfs_ail_min(ailp); |
686 | 695 | ||
687 | for (i = 0; i < nr_items; i++) { | 696 | for (i = 0; i < nr_items; i++) { |
@@ -701,7 +710,8 @@ xfs_trans_ail_update_bulk( | |||
701 | list_add(&lip->li_ail, &tmp); | 710 | list_add(&lip->li_ail, &tmp); |
702 | } | 711 | } |
703 | 712 | ||
704 | xfs_ail_splice(ailp, cur, &tmp, lsn); | 713 | if (!list_empty(&tmp)) |
714 | xfs_ail_splice(ailp, cur, &tmp, lsn); | ||
705 | 715 | ||
706 | if (!mlip_changed) { | 716 | if (!mlip_changed) { |
707 | spin_unlock(&ailp->xa_lock); | 717 | spin_unlock(&ailp->xa_lock); |
@@ -822,9 +832,18 @@ xfs_trans_ail_init( | |||
822 | INIT_LIST_HEAD(&ailp->xa_ail); | 832 | INIT_LIST_HEAD(&ailp->xa_ail); |
823 | INIT_LIST_HEAD(&ailp->xa_cursors); | 833 | INIT_LIST_HEAD(&ailp->xa_cursors); |
824 | spin_lock_init(&ailp->xa_lock); | 834 | spin_lock_init(&ailp->xa_lock); |
825 | INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); | 835 | |
836 | ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", | ||
837 | ailp->xa_mount->m_fsname); | ||
838 | if (IS_ERR(ailp->xa_task)) | ||
839 | goto out_free_ailp; | ||
840 | |||
826 | mp->m_ail = ailp; | 841 | mp->m_ail = ailp; |
827 | return 0; | 842 | return 0; |
843 | |||
844 | out_free_ailp: | ||
845 | kmem_free(ailp); | ||
846 | return ENOMEM; | ||
828 | } | 847 | } |
829 | 848 | ||
830 | void | 849 | void |
@@ -833,6 +852,6 @@ xfs_trans_ail_destroy( | |||
833 | { | 852 | { |
834 | struct xfs_ail *ailp = mp->m_ail; | 853 | struct xfs_ail *ailp = mp->m_ail; |
835 | 854 | ||
836 | cancel_delayed_work_sync(&ailp->xa_work); | 855 | kthread_stop(ailp->xa_task); |
837 | kmem_free(ailp); | 856 | kmem_free(ailp); |
838 | } | 857 | } |
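Taken together, this file reverts AIL pushing from a self-requeueing delayed work item to a dedicated xfsaild kthread: xfs_ail_push() now just wakes the task, and the push function returns the next back-off in milliseconds. Stripped of the XFS specifics, the thread body is the stock sleep/wake kthread idiom (a sketch of the loop shown above):

	while (!kthread_should_stop()) {
		/* short back-offs sleep uninterruptibly so a push isn't lost */
		if (tout && tout <= 20)
			__set_current_state(TASK_KILLABLE);
		else
			__set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(tout ?
				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
		try_to_freeze();
		tout = xfsaild_push(ailp);	/* next back-off in ms */
	}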
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15584fc3ed7d..475a4ded4f41 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -54,7 +54,7 @@ xfs_trans_buf_item_match( | |||
54 | list_for_each_entry(lidp, &tp->t_items, lid_trans) { | 54 | list_for_each_entry(lidp, &tp->t_items, lid_trans) { |
55 | blip = (struct xfs_buf_log_item *)lidp->lid_item; | 55 | blip = (struct xfs_buf_log_item *)lidp->lid_item; |
56 | if (blip->bli_item.li_type == XFS_LI_BUF && | 56 | if (blip->bli_item.li_type == XFS_LI_BUF && |
57 | XFS_BUF_TARGET(blip->bli_buf) == target && | 57 | blip->bli_buf->b_target == target && |
58 | XFS_BUF_ADDR(blip->bli_buf) == blkno && | 58 | XFS_BUF_ADDR(blip->bli_buf) == blkno && |
59 | XFS_BUF_COUNT(blip->bli_buf) == len) | 59 | XFS_BUF_COUNT(blip->bli_buf) == len) |
60 | return blip->bli_buf; | 60 | return blip->bli_buf; |
@@ -80,7 +80,6 @@ _xfs_trans_bjoin( | |||
80 | { | 80 | { |
81 | struct xfs_buf_log_item *bip; | 81 | struct xfs_buf_log_item *bip; |
82 | 82 | ||
83 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
84 | ASSERT(bp->b_transp == NULL); | 83 | ASSERT(bp->b_transp == NULL); |
85 | 84 | ||
86 | /* | 85 | /* |
@@ -161,8 +160,10 @@ xfs_trans_get_buf(xfs_trans_t *tp, | |||
161 | bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); | 160 | bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); |
162 | if (bp != NULL) { | 161 | if (bp != NULL) { |
163 | ASSERT(xfs_buf_islocked(bp)); | 162 | ASSERT(xfs_buf_islocked(bp)); |
164 | if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) | 163 | if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { |
165 | XFS_BUF_SUPER_STALE(bp); | 164 | xfs_buf_stale(bp); |
165 | XFS_BUF_DONE(bp); | ||
166 | } | ||
166 | 167 | ||
167 | /* | 168 | /* |
168 | * If the buffer is stale then it was binval'ed | 169 | * If the buffer is stale then it was binval'ed |
@@ -194,7 +195,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, | |||
194 | return NULL; | 195 | return NULL; |
195 | } | 196 | } |
196 | 197 | ||
197 | ASSERT(!XFS_BUF_GETERROR(bp)); | 198 | ASSERT(!bp->b_error); |
198 | 199 | ||
199 | _xfs_trans_bjoin(tp, bp, 1); | 200 | _xfs_trans_bjoin(tp, bp, 1); |
200 | trace_xfs_trans_get_buf(bp->b_fspriv); | 201 | trace_xfs_trans_get_buf(bp->b_fspriv); |
@@ -293,10 +294,9 @@ xfs_trans_read_buf( | |||
293 | return (flags & XBF_TRYLOCK) ? | 294 | return (flags & XBF_TRYLOCK) ? |
294 | EAGAIN : XFS_ERROR(ENOMEM); | 295 | EAGAIN : XFS_ERROR(ENOMEM); |
295 | 296 | ||
296 | if (XFS_BUF_GETERROR(bp) != 0) { | 297 | if (bp->b_error) { |
297 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 298 | error = bp->b_error; |
298 | bp, blkno); | 299 | xfs_buf_ioerror_alert(bp, __func__); |
299 | error = XFS_BUF_GETERROR(bp); | ||
300 | xfs_buf_relse(bp); | 300 | xfs_buf_relse(bp); |
301 | return error; | 301 | return error; |
302 | } | 302 | } |
@@ -330,7 +330,7 @@ xfs_trans_read_buf( | |||
330 | ASSERT(xfs_buf_islocked(bp)); | 330 | ASSERT(xfs_buf_islocked(bp)); |
331 | ASSERT(bp->b_transp == tp); | 331 | ASSERT(bp->b_transp == tp); |
332 | ASSERT(bp->b_fspriv != NULL); | 332 | ASSERT(bp->b_fspriv != NULL); |
333 | ASSERT((XFS_BUF_ISERROR(bp)) == 0); | 333 | ASSERT(!bp->b_error); |
334 | if (!(XFS_BUF_ISDONE(bp))) { | 334 | if (!(XFS_BUF_ISDONE(bp))) { |
335 | trace_xfs_trans_read_buf_io(bp, _RET_IP_); | 335 | trace_xfs_trans_read_buf_io(bp, _RET_IP_); |
336 | ASSERT(!XFS_BUF_ISASYNC(bp)); | 336 | ASSERT(!XFS_BUF_ISASYNC(bp)); |
@@ -338,8 +338,7 @@ xfs_trans_read_buf( | |||
338 | xfsbdstrat(tp->t_mountp, bp); | 338 | xfsbdstrat(tp->t_mountp, bp); |
339 | error = xfs_buf_iowait(bp); | 339 | error = xfs_buf_iowait(bp); |
340 | if (error) { | 340 | if (error) { |
341 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 341 | xfs_buf_ioerror_alert(bp, __func__); |
342 | bp, blkno); | ||
343 | xfs_buf_relse(bp); | 342 | xfs_buf_relse(bp); |
344 | /* | 343 | /* |
345 | * We can gracefully recover from most read | 344 | * We can gracefully recover from most read |
@@ -386,12 +385,11 @@ xfs_trans_read_buf( | |||
386 | return (flags & XBF_TRYLOCK) ? | 385 | return (flags & XBF_TRYLOCK) ? |
387 | 0 : XFS_ERROR(ENOMEM); | 386 | 0 : XFS_ERROR(ENOMEM); |
388 | } | 387 | } |
389 | if (XFS_BUF_GETERROR(bp) != 0) { | 388 | if (bp->b_error) { |
390 | XFS_BUF_SUPER_STALE(bp); | 389 | error = bp->b_error; |
391 | error = XFS_BUF_GETERROR(bp); | 390 | xfs_buf_stale(bp); |
392 | 391 | XFS_BUF_DONE(bp); | |
393 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 392 | xfs_buf_ioerror_alert(bp, __func__); |
394 | bp, blkno); | ||
395 | if (tp->t_flags & XFS_TRANS_DIRTY) | 393 | if (tp->t_flags & XFS_TRANS_DIRTY) |
396 | xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); | 394 | xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); |
397 | xfs_buf_relse(bp); | 395 | xfs_buf_relse(bp); |
@@ -430,7 +428,7 @@ shutdown_abort: | |||
430 | if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) | 428 | if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) |
431 | xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); | 429 | xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); |
432 | #endif | 430 | #endif |
433 | ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != | 431 | ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != |
434 | (XBF_STALE|XBF_DELWRI)); | 432 | (XBF_STALE|XBF_DELWRI)); |
435 | 433 | ||
436 | trace_xfs_trans_read_buf_shut(bp, _RET_IP_); | 434 | trace_xfs_trans_read_buf_shut(bp, _RET_IP_); |
@@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp, | |||
581 | { | 579 | { |
582 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 580 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
583 | 581 | ||
584 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
585 | ASSERT(bp->b_transp == tp); | 582 | ASSERT(bp->b_transp == tp); |
586 | ASSERT(bip != NULL); | 583 | ASSERT(bip != NULL); |
587 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 584 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
@@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp, | |||
602 | { | 599 | { |
603 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 600 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
604 | 601 | ||
605 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
606 | ASSERT(bp->b_transp == tp); | 602 | ASSERT(bp->b_transp == tp); |
607 | ASSERT(bip != NULL); | 603 | ASSERT(bip != NULL); |
608 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 604 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
@@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, | |||
631 | { | 627 | { |
632 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 628 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
633 | 629 | ||
634 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
635 | ASSERT(bp->b_transp == tp); | 630 | ASSERT(bp->b_transp == tp); |
636 | ASSERT(bip != NULL); | 631 | ASSERT(bip != NULL); |
637 | ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); | 632 | ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); |
@@ -648,13 +643,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, | |||
648 | * inside the b_bdstrat callback so that this won't get written to | 643 | * inside the b_bdstrat callback so that this won't get written to |
649 | * disk. | 644 | * disk. |
650 | */ | 645 | */ |
651 | XFS_BUF_DELAYWRITE(bp); | ||
652 | XFS_BUF_DONE(bp); | 646 | XFS_BUF_DONE(bp); |
653 | 647 | ||
654 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 648 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
655 | bp->b_iodone = xfs_buf_iodone_callbacks; | 649 | bp->b_iodone = xfs_buf_iodone_callbacks; |
656 | bip->bli_item.li_cb = xfs_buf_iodone; | 650 | bip->bli_item.li_cb = xfs_buf_iodone; |
657 | 651 | ||
652 | xfs_buf_delwri_queue(bp); | ||
653 | |||
658 | trace_xfs_trans_log_buf(bip); | 654 | trace_xfs_trans_log_buf(bip); |
659 | 655 | ||
660 | /* | 656 | /* |
@@ -702,7 +698,6 @@ xfs_trans_binval( | |||
702 | { | 698 | { |
703 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 699 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
704 | 700 | ||
705 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
706 | ASSERT(bp->b_transp == tp); | 701 | ASSERT(bp->b_transp == tp); |
707 | ASSERT(bip != NULL); | 702 | ASSERT(bip != NULL); |
708 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 703 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
@@ -744,8 +739,7 @@ xfs_trans_binval( | |||
744 | * We set the stale bit in the buffer as well since we're getting | 739 | * We set the stale bit in the buffer as well since we're getting |
745 | * rid of it. | 740 | * rid of it. |
746 | */ | 741 | */ |
747 | XFS_BUF_UNDELAYWRITE(bp); | 742 | xfs_buf_stale(bp); |
748 | XFS_BUF_STALE(bp); | ||
749 | bip->bli_flags |= XFS_BLI_STALE; | 743 | bip->bli_flags |= XFS_BLI_STALE; |
750 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); | 744 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); |
751 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; | 745 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; |
@@ -774,7 +768,6 @@ xfs_trans_inode_buf( | |||
774 | { | 768 | { |
775 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 769 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
776 | 770 | ||
777 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
778 | ASSERT(bp->b_transp == tp); | 771 | ASSERT(bp->b_transp == tp); |
779 | ASSERT(bip != NULL); | 772 | ASSERT(bip != NULL); |
780 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 773 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
@@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf( | |||
798 | { | 791 | { |
799 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 792 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
800 | 793 | ||
801 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
802 | ASSERT(bp->b_transp == tp); | 794 | ASSERT(bp->b_transp == tp); |
803 | ASSERT(bip != NULL); | 795 | ASSERT(bip != NULL); |
804 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 796 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
@@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf( | |||
823 | { | 815 | { |
824 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 816 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
825 | 817 | ||
826 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
827 | ASSERT(bp->b_transp == tp); | 818 | ASSERT(bp->b_transp == tp); |
828 | ASSERT(bip != NULL); | 819 | ASSERT(bip != NULL); |
829 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 820 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
@@ -851,7 +842,6 @@ xfs_trans_dquot_buf( | |||
851 | { | 842 | { |
852 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 843 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
853 | 844 | ||
854 | ASSERT(XFS_BUF_ISBUSY(bp)); | ||
855 | ASSERT(bp->b_transp == tp); | 845 | ASSERT(bp->b_transp == tp); |
856 | ASSERT(bip != NULL); | 846 | ASSERT(bip != NULL); |
857 | ASSERT(type == XFS_BLF_UDQUOT_BUF || | 847 | ASSERT(type == XFS_BLF_UDQUOT_BUF || |
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee67792d..4d00ee67792d 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c | |||
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index c8dea2fd7e68..32f0288ae10f 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c | |||
@@ -47,11 +47,13 @@ xfs_trans_inode_broot_debug( | |||
47 | * Add a locked inode to the transaction. | 47 | * Add a locked inode to the transaction. |
48 | * | 48 | * |
49 | * The inode must be locked, and it cannot be associated with any transaction. | 49 | * The inode must be locked, and it cannot be associated with any transaction. |
50 | * If lock_flags is non-zero the inode will be unlocked on transaction commit. | ||
50 | */ | 51 | */ |
51 | void | 52 | void |
52 | xfs_trans_ijoin( | 53 | xfs_trans_ijoin( |
53 | struct xfs_trans *tp, | 54 | struct xfs_trans *tp, |
54 | struct xfs_inode *ip) | 55 | struct xfs_inode *ip, |
56 | uint lock_flags) | ||
55 | { | 57 | { |
56 | xfs_inode_log_item_t *iip; | 58 | xfs_inode_log_item_t *iip; |
57 | 59 | ||
@@ -59,7 +61,9 @@ xfs_trans_ijoin( | |||
59 | if (ip->i_itemp == NULL) | 61 | if (ip->i_itemp == NULL) |
60 | xfs_inode_item_init(ip, ip->i_mount); | 62 | xfs_inode_item_init(ip, ip->i_mount); |
61 | iip = ip->i_itemp; | 63 | iip = ip->i_itemp; |
64 | |||
62 | ASSERT(iip->ili_lock_flags == 0); | 65 | ASSERT(iip->ili_lock_flags == 0); |
66 | iip->ili_lock_flags = lock_flags; | ||
63 | 67 | ||
64 | /* | 68 | /* |
65 | * Get a log_item_desc to point at the new item. | 69 | * Get a log_item_desc to point at the new item. |
@@ -70,25 +74,6 @@ xfs_trans_ijoin( | |||
70 | } | 74 | } |
71 | 75 | ||
72 | /* | 76 | /* |
73 | * Add a locked inode to the transaction. | ||
74 | * | ||
75 | * | ||
76 | * Grabs a reference to the inode which will be dropped when the transaction | ||
77 | * is committed. The inode will also be unlocked at that point. The inode | ||
78 | * must be locked, and it cannot be associated with any transaction. | ||
79 | */ | ||
80 | void | ||
81 | xfs_trans_ijoin_ref( | ||
82 | struct xfs_trans *tp, | ||
83 | struct xfs_inode *ip, | ||
84 | uint lock_flags) | ||
85 | { | ||
86 | xfs_trans_ijoin(tp, ip); | ||
87 | IHOLD(ip); | ||
88 | ip->i_itemp->ili_lock_flags = lock_flags; | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Transactional inode timestamp update. Requires the inode to be locked and | 77 | * Transactional inode timestamp update. Requires the inode to be locked and |
93 | * joined to the transaction supplied. Relies on the transaction subsystem to | 78 | * joined to the transaction supplied. Relies on the transaction subsystem to |
94 | * track dirty state and update/writeback the inode accordingly. | 79 | * track dirty state and update/writeback the inode accordingly. |
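With xfs_trans_ijoin_ref() folded away there is a single join primitive, and the lock_flags argument decides who unlocks. Callers now choose between two forms (a usage sketch; note the old _ref variant also took an inode reference via IHOLD, which the unified call no longer does):

	/* caller keeps the ILOCK across commit/cancel */
	xfs_trans_ijoin(tp, ip, 0);

	/* the transaction owns the ILOCK; commit/cancel unlocks it */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);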
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 212946b97239..44820b9fcb43 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h | |||
@@ -64,23 +64,18 @@ struct xfs_ail_cursor { | |||
64 | */ | 64 | */ |
65 | struct xfs_ail { | 65 | struct xfs_ail { |
66 | struct xfs_mount *xa_mount; | 66 | struct xfs_mount *xa_mount; |
67 | struct task_struct *xa_task; | ||
67 | struct list_head xa_ail; | 68 | struct list_head xa_ail; |
68 | xfs_lsn_t xa_target; | 69 | xfs_lsn_t xa_target; |
69 | struct list_head xa_cursors; | 70 | struct list_head xa_cursors; |
70 | spinlock_t xa_lock; | 71 | spinlock_t xa_lock; |
71 | struct delayed_work xa_work; | ||
72 | xfs_lsn_t xa_last_pushed_lsn; | 72 | xfs_lsn_t xa_last_pushed_lsn; |
73 | unsigned long xa_flags; | 73 | int xa_log_flush; |
74 | }; | 74 | }; |
75 | 75 | ||
76 | #define XFS_AIL_PUSHING_BIT 0 | ||
77 | |||
78 | /* | 76 | /* |
79 | * From xfs_trans_ail.c | 77 | * From xfs_trans_ail.c |
80 | */ | 78 | */ |
81 | |||
82 | extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ | ||
83 | |||
84 | void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, | 79 | void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, |
85 | struct xfs_ail_cursor *cur, | 80 | struct xfs_ail_cursor *cur, |
86 | struct xfs_log_item **log_items, int nr_items, | 81 | struct xfs_log_item **log_items, int nr_items, |
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 7c220b4227bc..7c220b4227bc 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h | |||
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 9322e13f0c63..4ecf2a549060 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -72,8 +72,8 @@ xfs_readlink_bmap( | |||
72 | xfs_buf_t *bp; | 72 | xfs_buf_t *bp; |
73 | int error = 0; | 73 | int error = 0; |
74 | 74 | ||
75 | error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, | 75 | error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps, |
76 | mval, &nmaps, NULL); | 76 | 0); |
77 | if (error) | 77 | if (error) |
78 | goto out; | 78 | goto out; |
79 | 79 | ||
@@ -83,10 +83,11 @@ xfs_readlink_bmap( | |||
83 | 83 | ||
84 | bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), | 84 | bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), |
85 | XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); | 85 | XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); |
86 | error = XFS_BUF_GETERROR(bp); | 86 | if (!bp) |
87 | return XFS_ERROR(ENOMEM); | ||
88 | error = bp->b_error; | ||
87 | if (error) { | 89 | if (error) { |
88 | xfs_ioerror_alert("xfs_readlink", | 90 | xfs_buf_ioerror_alert(bp, __func__); |
89 | ip->i_mount, bp, XFS_BUF_ADDR(bp)); | ||
90 | xfs_buf_relse(bp); | 91 | xfs_buf_relse(bp); |
91 | goto out; | 92 | goto out; |
92 | } | 93 | } |
@@ -94,7 +95,7 @@ xfs_readlink_bmap( | |||
94 | byte_cnt = pathlen; | 95 | byte_cnt = pathlen; |
95 | pathlen -= byte_cnt; | 96 | pathlen -= byte_cnt; |
96 | 97 | ||
97 | memcpy(link, XFS_BUF_PTR(bp), byte_cnt); | 98 | memcpy(link, bp->b_addr, byte_cnt); |
98 | xfs_buf_relse(bp); | 99 | xfs_buf_relse(bp); |
99 | } | 100 | } |
100 | 101 | ||
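xfs_bmapi_read() is the new read-only mapping interface used in these conversions: no transaction pointer, no firstblock, no free list — just the inode, the range, and a mapping array. The call shape, as a sketch (offset_fsb and count_fsb are hypothetical placeholders for the range being mapped):

	struct xfs_bmbt_irec	map;
	int			nmaps = 1;

	error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &map, &nmaps, 0);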
@@ -176,8 +177,7 @@ xfs_free_eofblocks( | |||
176 | 177 | ||
177 | nimaps = 1; | 178 | nimaps = 1; |
178 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 179 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
179 | error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, | 180 | error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); |
180 | NULL, 0, &imap, &nimaps, NULL); | ||
181 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 181 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
182 | 182 | ||
183 | if (!error && (nimaps != 0) && | 183 | if (!error && (nimaps != 0) && |
@@ -218,7 +218,7 @@ xfs_free_eofblocks( | |||
218 | } | 218 | } |
219 | 219 | ||
220 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 220 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
221 | xfs_trans_ijoin(tp, ip); | 221 | xfs_trans_ijoin(tp, ip, 0); |
222 | 222 | ||
223 | error = xfs_itruncate_data(&tp, ip, ip->i_size); | 223 | error = xfs_itruncate_data(&tp, ip, ip->i_size); |
224 | if (error) { | 224 | if (error) { |
@@ -287,7 +287,7 @@ xfs_inactive_symlink_rmt( | |||
287 | xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); | 287 | xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); |
288 | size = (int)ip->i_d.di_size; | 288 | size = (int)ip->i_d.di_size; |
289 | ip->i_d.di_size = 0; | 289 | ip->i_d.di_size = 0; |
290 | xfs_trans_ijoin(tp, ip); | 290 | xfs_trans_ijoin(tp, ip, 0); |
291 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 291 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
292 | /* | 292 | /* |
293 | * Find the block(s) so we can inval and unmap them. | 293 | * Find the block(s) so we can inval and unmap them. |
@@ -295,9 +295,9 @@ xfs_inactive_symlink_rmt( | |||
295 | done = 0; | 295 | done = 0; |
296 | xfs_bmap_init(&free_list, &first_block); | 296 | xfs_bmap_init(&free_list, &first_block); |
297 | nmaps = ARRAY_SIZE(mval); | 297 | nmaps = ARRAY_SIZE(mval); |
298 | if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), | 298 | error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size), |
299 | XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, | 299 | mval, &nmaps, 0); |
300 | &free_list))) | 300 | if (error) |
301 | goto error0; | 301 | goto error0; |
302 | /* | 302 | /* |
303 | * Invalidate the block(s). | 303 | * Invalidate the block(s). |
@@ -306,6 +306,10 @@ xfs_inactive_symlink_rmt( | |||
306 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, | 306 | bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, |
307 | XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), | 307 | XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), |
308 | XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); | 308 | XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); |
309 | if (!bp) { | ||
310 | error = ENOMEM; | ||
311 | goto error1; | ||
312 | } | ||
309 | xfs_trans_binval(tp, bp); | 313 | xfs_trans_binval(tp, bp); |
310 | } | 314 | } |
311 | /* | 315 | /* |
@@ -331,7 +335,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -464,7 +468,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, 0);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -645,8 +649,6 @@ xfs_inactive(
 	if (truncate) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-		xfs_ioend_wait(ip);
-
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_ITRUNCATE_LOG_RES(mp),
 					  0, XFS_TRANS_PERM_LOG_RES,
@@ -660,7 +662,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		error = xfs_itruncate_data(&tp, ip, 0);
 		if (error) {
@@ -684,7 +686,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -697,7 +699,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 	}
 
 	/*
@@ -937,7 +939,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1258,8 +1260,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
@@ -1404,8 +1406,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -1599,7 +1601,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -1630,10 +1632,9 @@ xfs_symlink(
 		first_fsb = 0;
 		nmaps = SYMLINK_MAPS;
 
-		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
-				  &first_block, resblks, mval, &nmaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+					XFS_BMAPI_METADATA, &first_block, resblks,
+					mval, &nmaps, &free_list);
 		if (error)
 			goto error2;
 
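xfs_bmapi_write() folds the old XFS_BMAPI_WRITE flag into the function name, so callers pass only modifier flags such as XFS_BMAPI_METADATA; the transaction, firstblock pointer, reservation, mapping array and free list remain, in the order shown. A sketch of the allocating call, reusing the locals from the hunk above with explanatory comments:

	/* Sketch: allocate fs_blocks worth of blocks for the symlink under tp. */
	xfs_bmap_init(&free_list, &first_block);
	nmaps = SYMLINK_MAPS;
	error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
				XFS_BMAPI_METADATA,	/* WRITE is now implied */
				&first_block, resblks,	/* allocation hints */
				mval, &nmaps,		/* returned mappings */
				&free_list);		/* deferred frees */
	if (error)
		goto error2;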
@@ -1648,13 +1649,16 @@ xfs_symlink(
 			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					       BTOBB(byte_cnt), 0);
-			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			if (!bp) {
+				error = ENOMEM;
+				goto error2;
+			}
 			if (pathlen < byte_cnt) {
 				byte_cnt = pathlen;
 			}
 			pathlen -= byte_cnt;
 
-			memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
+			memcpy(bp->b_addr, cur_chunk, byte_cnt);
 			cur_chunk += byte_cnt;
 
 			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
@@ -1730,7 +1734,7 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate = state;
@@ -1776,7 +1780,6 @@ xfs_alloc_file_space(
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_fsblock_t		firstfsb;
 	int			nimaps;
-	int			bmapi_flag;
 	int			quota_flag;
 	int			rt;
 	xfs_trans_t		*tp;
@@ -1804,7 +1807,6 @@ xfs_alloc_file_space(
 	count = len;
 	imapp = &imaps[0];
 	nimaps = 1;
-	bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
 	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
@@ -1875,16 +1877,12 @@ xfs_alloc_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
-		/*
-		 * Issue the xfs_bmapi() call to allocate the blocks
-		 */
 		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bmapi(tp, ip, startoffset_fsb,
-				  allocatesize_fsb, bmapi_flag,
-				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list);
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+					allocatesize_fsb, alloc_type, &firstfsb,
+					0, imapp, &nimaps, &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -1974,8 +1972,7 @@ xfs_zero_remaining_bytes(
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -1995,11 +1992,11 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(read)");
 			break;
 		}
-		memset(XFS_BUF_PTR(bp) +
+		memset(bp->b_addr +
 			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
 		      0, lastoffset - offset + 1);
 		XFS_BUF_UNDONE(bp);
@@ -2008,8 +2005,8 @@ xfs_zero_remaining_bytes(
 		xfsbdstrat(mp, bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
-			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
-					  mp, bp, XFS_BUF_ADDR(bp));
+			xfs_buf_ioerror_alert(bp,
+					"xfs_zero_remaining_bytes(write)");
 			break;
 		}
 	}
@@ -2074,7 +2071,7 @@ xfs_free_file_space(
 	if (need_iolock) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 		/* wait for the completion of any pending DIOs */
-		xfs_ioend_wait(ip);
+		inode_dio_wait(VFS_I(ip));
 	}
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
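With the XFS-private ioend counting gone (see the removal of xfs_ioend_wait() in xfs_inactive() above), waiting for in-flight direct I/O is delegated to the generic VFS helper: inode_dio_wait() sleeps until the inode's outstanding-DIO count drops to zero, and VFS_I() converts the xfs_inode to the VFS inode it expects. The idiom, as in the hunk above:

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* wait for the completion of any pending DIOs */
	inode_dio_wait(VFS_I(ip));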
@@ -2094,8 +2091,8 @@ xfs_free_file_space(
 	 */
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-			1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2109,8 +2106,8 @@ xfs_free_file_space(
 			startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 		}
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-			1, 0, NULL, 0, &imap, &nimap, NULL);
+		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+					&imap, &nimap, 0);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2178,7 +2175,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip);
+		xfs_trans_ijoin(tp, ip, 0);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2351,8 +2348,7 @@ xfs_change_file_space(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
@@ -2377,10 +2373,5 @@ xfs_change_file_space(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	if (attr_flags & XFS_ATTR_SYNC)
 		xfs_trans_set_sync(tp);
-
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	return error;
+	return xfs_trans_commit(tp, 0);
 }
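One pattern runs through every hunk in this file: xfs_trans_ijoin_ref(tp, ip, flags) and the old two-argument xfs_trans_ijoin(tp, ip) are unified into a single xfs_trans_ijoin(tp, ip, lock_flags), where lock_flags names the inode locks the transaction should drop at commit or cancel. Passing 0 keeps unlocking in the caller's hands; passing XFS_ILOCK_EXCL hands it to the transaction, which is why the xfs_change_file_space() hunks above can delete the explicit xfs_iunlock() after xfs_trans_commit(). Both shapes, sketched:

	/* Transaction owns the unlock: commit/cancel drops XFS_ILOCK_EXCL. */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	/* ... modify ip ... */
	error = xfs_trans_commit(tp, 0);	/* no xfs_iunlock() needed */

	/* Caller owns the unlock: join with 0 and unlock explicitly. */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);
	/* ... modify ip, commit ... */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);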
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c