320 files changed, 9330 insertions, 7912 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af61..91fba025fcbe 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o
        vfs_dir.o \
        vfs_dentry.o \
        v9fs.o \
-        fid.o
+        fid.o  \
+        xattr.o \
+        xattr_user.o
 9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b2815..358563689064 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
        return ret;
 }
+/*
+ * build_path_from_dentry - collect the path components leading to a dentry
+ * @v9ses: session the dentry belongs to (not referenced by the body)
+ * @dentry: dentry whose path below the root dentry is wanted
+ * @names: on success, set to a kmalloc'ed array of component names
+ *
+ * Counts the dentries from @dentry up to, but not including, the first
+ * IS_ROOT() ancestor and stores their names so that index 0 is the
+ * component closest to the root. The array elements point straight at
+ * the dentries' d_name.name; nothing is copied. The caller therefore has
+ * to hold v9ses->rename_sem for as long as it uses the returned array,
+ * and is responsible for freeing the array itself.
+ *
+ * Returns the number of components, or -ENOMEM if the array could not
+ * be allocated.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+                                  struct dentry *dentry, char ***names)
+{
+        struct dentry *cur;
+        char **path;
+        int depth = 0;
+        int idx;
+        /* first pass: how many components lie between dentry and the root */
+        cur = dentry;
+        while (!IS_ROOT(cur)) {
+                depth++;
+                cur = cur->d_parent;
+        }
+        path = kmalloc(depth * sizeof(*path), GFP_KERNEL);
+        if (!path)
+                return -ENOMEM;
+        /* second pass: fill from the leaf backwards so path[0] is topmost */
+        cur = dentry;
+        for (idx = depth; idx-- > 0; cur = cur->d_parent)
+                path[idx] = (char *)cur->d_name.name;
+        *names = path;
+        return depth;
+}
 /**
 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
 * @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
        int i, n, l, clone, any, access;
        u32 uid;
        struct p9_fid *fid, *old_fid = NULL;
-        struct dentry *d, *ds;
+        struct dentry *ds;
        struct v9fs_session_info *v9ses;
        char **wnames, *uname;
@@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
        fid = v9fs_fid_find(dentry, uid, any);
        if (fid)
                return fid;
+        /*
+         * we don't have a matching fid. To do a TWALK we need
+         * parent fid. We need to prevent rename when we want to
+         * look at the parent.
+         */
+        down_read(&v9ses->rename_sem);
        ds = dentry->d_parent;
        fid = v9fs_fid_find(ds, uid, any);
-        if (!fid) { /* walk from the root */
+        if (fid) {
-                n = 0;
+                /* Found the parent fid do a lookup with that */
-                for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+                fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
-                        n++;
+                goto fid_out;
+        }
+        up_read(&v9ses->rename_sem);
-                fid = v9fs_fid_find(ds, uid, any);
+        /* start from the root and try to do a lookup */
-                if (!fid) { /* the user is not attached to the fs yet */
+        fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
-                        if (access == V9FS_ACCESS_SINGLE)
+        if (!fid) {
-                                return ERR_PTR(-EPERM);
+                /* the user is not attached to the fs yet */
+                if (access == V9FS_ACCESS_SINGLE)
+                        return ERR_PTR(-EPERM);
-                        if (v9fs_proto_dotu(v9ses))
+                if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
                                uname = NULL;
-                        else
+                else
-                                uname = v9ses->uname;
+                        uname = v9ses->uname;
-                        fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+                fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
-                                v9ses->aname);
+                                       v9ses->aname);
+                if (IS_ERR(fid))
-                        if (IS_ERR(fid))
+                        return fid;
-                                return fid;
-                        v9fs_fid_add(ds, fid);
-                }
-        } else /* walk from the parent */
-                n = 1;
-        if (ds == dentry)
+                v9fs_fid_add(dentry->d_sb->s_root, fid);
+        }
+        /* If we are root ourself just return that */
+        if (dentry->d_sb->s_root == dentry)
                return fid;
+        /*
-        wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+         * Do a multipath walk with attached root.
-        if (!wnames)
+         * When walking parent we need to make sure we
-                return ERR_PTR(-ENOMEM);
+         * don't have a parallel rename happening
+         */
-        for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
+        down_read(&v9ses->rename_sem);
-                wnames[i] = (char *) d->d_name.name;
+        n  = build_path_from_dentry(v9ses, dentry, &wnames);
+        if (n < 0) {
+                fid = ERR_PTR(n);
+                goto err_out;
+        }
        clone = 1;
        i = 0;
        while (i < n) {
                l = min(n - i, P9_MAXWELEM);
+                /*
+                 * We need to hold rename lock when doing a multipath
+                 * walk to ensure none of the path components change
+                 */
                fid = p9_client_walk(fid, l, &wnames[i], clone);
                if (IS_ERR(fid)) {
                        if (old_fid) {
@@ -193,15 +234,17 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
                                p9_client_clunk(old_fid);
                        }
                        kfree(wnames);
-                        return fid;
+                        goto err_out;
                }
                old_fid = fid;
                i += l;
                clone = 0;
        }
        kfree(wnames);
+fid_out:
        v9fs_fid_add(dentry, fid);
+err_out:
+        up_read(&v9ses->rename_sem);
        return fid;
 }
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd66..38dc0e067599 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
                __putname(v9ses->uname);
                return ERR_PTR(-ENOMEM);
        }
+        init_rwsem(&v9ses->rename_sem);
        rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
        if (rc) {
@@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
        /* for legacy mode, fall back to V9FS_ACCESS_ANY */
-        if (!v9fs_proto_dotu(v9ses) &&
+        if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
                ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
                v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb458..4c963c9fc41f 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -104,6 +104,7 @@ struct v9fs_session_info {
        struct p9_client *clnt; /* 9p client */
        struct list_head slist; /* list of sessions registered with v9fs */
        struct backing_dev_info bdi;
+        struct rw_semaphore rename_sem;
 };
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 32ef4009d030..f47c6bbb01b3 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -55,6 +55,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 void v9fs_clear_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d61e3b28ce37..16c8a2a98c1b 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
 }
 /**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
 * @filp: opened file structure
- * @dirent: directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
- * @filldir: function to populate directory structure ???
 *
 */
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-        int over;
-        struct p9_wstat st;
-        int err = 0;
-        struct p9_fid *fid;
-        int buflen;
-        int reclen = 0;
        struct p9_rdir *rdir;
+        struct p9_fid *fid;
+        int err = 0;
-        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
        fid = filp->private_data;
-        buflen = fid->clnt->msize - P9_IOHDRSZ;
-        /* allocate rdir on demand */
        if (!fid->rdir) {
                rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
                spin_unlock(&filp->f_dentry->d_lock);
                kfree(rdir);
        }
+exit:
+        return err;
+}
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+        int over;
+        struct p9_wstat st;
+        int err = 0;
+        struct p9_fid *fid;
+        int buflen;
+        int reclen = 0;
+        struct p9_rdir *rdir;
+        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+        fid = filp->private_data;
+        buflen = fid->clnt->msize - P9_IOHDRSZ;
+        err = v9fs_alloc_rdir_buf(filp, buflen);
+        if (err)
+                goto exit;
        rdir = (struct p9_rdir *) fid->rdir;
        err = mutex_lock_interruptible(&rdir->mutex);
@@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
                while (rdir->head < rdir->tail) {
                        p9stat_init(&st);
                        err = p9stat_read(rdir->buf + rdir->head,
-                                                buflen - rdir->head, &st,
+                                                rdir->tail - rdir->head, &st,
                                                fid->clnt->proto_version);
                        if (err) {
                                P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -176,6 +196,88 @@ exit:
        return err;
 }
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ * Raw directory data is fetched with p9_client_readdir() into the
+ * per-fid p9_rdir buffer. rdir->head and rdir->tail delimit the bytes
+ * that have been received but not yet handed to @filldir.
+ *
+ * Returns a negative errno on failure (-EIO when an entry cannot be
+ * parsed) and a non-negative value otherwise: 0 when p9_client_readdir()
+ * returns no data or @filldir asks to stop.
+ * NOTE(review): after a batch has been consumed completely, err still
+ * holds the size of the last record, so the outer loop ends and that
+ * positive value is returned - confirm callers only test for < 0.
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+                                                filldir_t filldir)
+{
+        int over;
+        int err = 0;
+        struct p9_fid *fid;
+        int buflen;
+        struct p9_rdir *rdir;
+        struct p9_dirent curdirent;
+        u64 oldoffset = 0;
+        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+        fid = filp->private_data;
+        buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+        err = v9fs_alloc_rdir_buf(filp, buflen);
+        if (err)
+                goto exit;
+        rdir = (struct p9_rdir *) fid->rdir;
+        err = mutex_lock_interruptible(&rdir->mutex);
+        if (err)
+                return err;
+        while (err == 0) {
+                if (rdir->tail == rdir->head) {
+                        /* buffer drained: fetch the next batch from f_pos */
+                        err = p9_client_readdir(fid, rdir->buf, buflen,
+                                                                filp->f_pos);
+                        if (err <= 0)
+                                goto unlock_and_exit;
+                        rdir->head = 0;
+                        rdir->tail = err;
+                }
+                while (rdir->head < rdir->tail) {
+                        /*
+                         * Only the bytes up to rdir->tail were filled in by
+                         * the last p9_client_readdir(). Bound the parser by
+                         * tail, not by the buffer size, so it can never
+                         * interpret stale data behind the valid region.
+                         */
+                        err = p9dirent_read(rdir->buf + rdir->head,
+                                                rdir->tail - rdir->head,
+                                                &curdirent,
+                                                fid->clnt->proto_version);
+                        if (err < 0) {
+                                P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+                                err = -EIO;
+                                goto unlock_and_exit;
+                        }
+                        /* d_off in dirent structure tracks the offset into
+                         * the next dirent in the dir. However, filldir()
+                         * expects offset into the current dirent. Hence
+                         * while calling filldir send the offset from the
+                         * previous dirent structure.
+                         */
+                        over = filldir(dirent, curdirent.d_name,
+                                        strlen(curdirent.d_name),
+                                        oldoffset, v9fs_qid2ino(&curdirent.qid),
+                                        curdirent.d_type);
+                        oldoffset = curdirent.d_off;
+                        if (over) {
+                                /* caller's buffer is full; entry stays queued */
+                                err = 0;
+                                goto unlock_and_exit;
+                        }
+                        /* entry consumed: advance file position and buffer */
+                        filp->f_pos = curdirent.d_off;
+                        rdir->head += err;
+                }
+        }
+unlock_and_exit:
+        mutex_unlock(&rdir->mutex);
+exit:
+        return err;
+}
 /**
 * v9fs_dir_release - close a directory
@@ -207,7 +309,7 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
        .read = generic_read_dir,
        .llseek = generic_file_llseek,
-        .readdir = v9fs_dir_readdir,
+        .readdir = v9fs_dir_readdir_dotl,
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2bedc6c94fc2..e97c92bd6f16 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
        struct p9_fid *fid;
        int omode;
-        P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+        P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
        v9ses = v9fs_inode2v9ses(inode);
-        omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+        if (v9fs_proto_dotl(v9ses))
+                omode = file->f_flags;
+        else
+                omode = v9fs_uflags2omode(file->f_flags,
+                                        v9fs_proto_dotu(v9ses));
        fid = file->private_data;
        if (!fid) {
                fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
                        p9_client_clunk(fid);
                        return err;
                }
-                if (omode & P9_OTRUNC) {
+                if (file->f_flags & O_TRUNC) {
                        i_size_write(inode, 0);
                        inode->i_blocks = 0;
                }
-                if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+                if ((file->f_flags & O_APPEND) &&
+                        (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
                        generic_file_llseek(file, 0, SEEK_END);
        }
@@ -139,7 +144,7 @@ ssize_t
 v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
               u64 offset)
 {
-        int n, total;
+        int n, total, size;
        struct p9_fid *fid = filp->private_data;
        P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
        n = 0;
        total = 0;
+        size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
        do {
                n = p9_client_read(fid, data, udata, offset, count);
                if (n <= 0)
@@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
                offset += n;
                count -= n;
                total += n;
-        } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+        } while (count > 0 && n == size);
        if (n < 0)
                total = n;
@@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 {
        int ret;
        struct p9_fid *fid;
+        size_t size;
        P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
        fid = filp->private_data;
-        if (count > (fid->clnt->msize - P9_IOHDRSZ))
+        size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+        if (count > size)
                ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
        else
                ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
        fid = filp->private_data;
        clnt = fid->clnt;
-        rsize = fid->iounit;
+        rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
-        if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
-                rsize = clnt->msize - P9_IOHDRSZ;
        do {
                if (count < rsize)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4331b3b5ee1c..6e94f3247cec 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -42,6 +43,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "cache.h"
+#include "xattr.h"
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_dotu;
@@ -236,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 /**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+        BUG_ON(!dir_inode);
+        /*
+         * A directory with S_ISGID set passes its own group on to new
+         * entries; otherwise the creating task's fsgid is used.
+         */
+        return (dir_inode->i_mode & S_ISGID) ? dir_inode->i_gid
+                                             : current_fsgid();
+}
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+        struct dentry *alias;
+        spin_lock(&dcache_lock);
+        /* A directory must be reachable through exactly one dentry. */
+        if (S_ISDIR(inode->i_mode))
+                BUG_ON(!list_is_singular(&inode->i_dentry));
+        /* first (for a directory: the only) alias linked to the inode */
+        alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
+        spin_unlock(&dcache_lock);
+        /* NOTE(review): no reference is taken on the returned dentry */
+        return alias;
+}
+/**
 * v9fs_get_inode - helper function to setup an inode
 * @sb: superblock
 * @mode: mode to setup inode with
@@ -267,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
        case S_IFBLK:
        case S_IFCHR:
        case S_IFSOCK:
-                if (!v9fs_proto_dotu(v9ses)) {
+                if (v9fs_proto_dotl(v9ses)) {
+                        inode->i_op = &v9fs_file_inode_operations_dotl;
+                        inode->i_fop = &v9fs_file_operations_dotl;
+                } else if (v9fs_proto_dotu(v9ses)) {
+                        inode->i_op = &v9fs_file_inode_operations;
+                        inode->i_fop = &v9fs_file_operations;
+                } else {
                        P9_DPRINTK(P9_DEBUG_ERROR,
                                   "special files without extended mode\n");
                        err = -EINVAL;
@@ -396,23 +439,14 @@ void v9fs_clear_inode(struct inode *inode)
 #endif
 }
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
 static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
        struct super_block *sb)
 {
        int err, umode;
-        struct inode *ret;
+        struct inode *ret = NULL;
        struct p9_wstat *st;
-        ret = NULL;
        st = p9_client_stat(fid);
        if (IS_ERR(st))
                return ERR_CAST(st);
@@ -433,15 +467,62 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 #endif
        p9stat_free(st);
        kfree(st);
        return ret;
 error:
        p9stat_free(st);
        kfree(st);
        return ERR_PTR(err);
 }
+/*
+ * v9fs_inode_dotl - create an inode from a p9_client_getattr_dotl() reply
+ * @v9ses: session information (not referenced by the body)
+ * @fid: fid to issue the attribute request for
+ * @sb: superblock on which to create the inode
+ *
+ * Requests the P9_STATS_BASIC attributes for @fid, allocates an inode of
+ * the reported mode and fills it in from the reply.
+ *
+ * Returns the new inode, or an ERR_PTR() from the attribute request or
+ * from v9fs_get_inode().
+ */
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+        struct super_block *sb)
+{
+        struct p9_stat_dotl *attrs;
+        struct inode *inode;
+        attrs = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+        if (IS_ERR(attrs))
+                return ERR_CAST(attrs);
+        inode = v9fs_get_inode(sb, attrs->st_mode);
+        if (IS_ERR(inode))
+                goto out_free;  /* hand the ERR_PTR back unchanged */
+        v9fs_stat2inode_dotl(attrs, inode);
+        inode->i_ino = v9fs_qid2ino(&attrs->qid);
+#ifdef CONFIG_9P_FSCACHE
+        v9fs_vcookie_set_qid(inode, &attrs->qid);
+        v9fs_cache_inode_get_cookie(inode);
+#endif
+out_free:
+        /* the reply buffer is no longer needed on either path */
+        kfree(attrs);
+        return inode;
+}
+/**
+ * v9fs_inode_from_fid - build an inode from the attributes of a fid
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ * Dispatches on the protocol of the session: when v9fs_proto_dotl()
+ * holds the inode is built by v9fs_inode_dotl(), otherwise by
+ * v9fs_inode(). The result of the chosen helper is returned unchanged.
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+                        struct super_block *sb)
+{
+        return v9fs_proto_dotl(v9ses) ? v9fs_inode_dotl(v9ses, fid, sb)
+                                      : v9fs_inode(v9ses, fid, sb);
+}
 /**
 * v9fs_remove - helper function to remove files and directories
 * @dir: directory inode that is being deleted
@@ -563,6 +644,118 @@ error:
 }
 /**
+ * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
+ * @dir: inode of the directory in which the file is created
+ * @dentry:  dentry of the file that is being created
+ * @mode: create permissions
+ * @nd: path information
+ *
+ */
+static int
+v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
+                struct nameidata *nd)
+{
+        int err = 0;
+        char *name = NULL;
+        gid_t gid;
+        int flags;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL; /* unopened fid that ends up on the dentry */
+        struct p9_fid *dfid, *ofid; /* parent dir fid; fid the create runs on */
+        struct file *filp;
+        struct p9_qid qid;
+        struct inode *inode;
+        v9ses = v9fs_inode2v9ses(dir);
+        if (nd && nd->flags & LOOKUP_OPEN)
+                flags = nd->intent.open.flags - 1; /* NOTE(review): presumably undoes a +1 bias on the open-intent flags - confirm */
+        else
+                flags = O_RDWR; /* plain create without an open intent */
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
+                        "mode:0x%x\n", name, flags, mode);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        /* clone a fid to use for creation: a walk over zero names with the
+         * clone argument set, so dfid itself is left untouched
+         */
+        ofid = p9_client_walk(dfid, 0, NULL, 1);
+        if (IS_ERR(ofid)) {
+                err = PTR_ERR(ofid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                                "p9_client_open_dotl failed in creat %d\n",
+                                err);
+                goto error;
+        }
+        /* No need to populate the inode if we are not opening the file AND
+         * not in cached mode.
+         */
+        if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
+                /* Not in cached mode. No need to populate inode with stat.
+                 * The dentry is instantiated without an inode and the
+                 * creation fid is released right away.
+                 */
+                dentry->d_op = &v9fs_dentry_operations;
+                p9_client_clunk(ofid);
+                d_instantiate(dentry, NULL);
+                return 0;
+        }
+        /* Now walk from the parent so we can get an unopened fid. */
+        fid = p9_client_walk(dfid, 1, &name, 1);
+        if (IS_ERR(fid)) {
+                err = PTR_ERR(fid);
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+                fid = NULL; /* keep the error label from clunking an ERR_PTR */
+                goto error;
+        }
+        /* instantiate inode and assign the unopened fid to dentry */
+        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
+                goto error;
+        }
+        dentry->d_op = &v9fs_cached_dentry_operations;
+        d_instantiate(dentry, inode);
+        err = v9fs_fid_add(dentry, fid);
+        if (err < 0)
+                goto error;
+        /* if we are opening a file, assign the open fid to the file;
+         * otherwise the creation fid has served its purpose and is clunked
+         */
+        if (nd && nd->flags & LOOKUP_OPEN) {
+                filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
+                if (IS_ERR(filp)) {
+                        p9_client_clunk(ofid); /* fid was already added to the dentry and is not clunked here */
+                        return PTR_ERR(filp);
+                }
+                filp->private_data = ofid;
+        } else
+                p9_client_clunk(ofid);
+        return 0;
+error: /* release whichever fids were obtained before the failure */
+        if (ofid)
+                p9_client_clunk(ofid);
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
 * v9fs_vfs_create - VFS hook to create files
 * @dir: directory inode that is being created
 * @dentry:  dentry that is being deleted
@@ -652,6 +845,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return err;
 }
+/**
+ * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the directory that is being created
+ * @mode: mode for new directory
+ *
+ * Sends the mkdir request on the fid of the parent directory. Only when
+ * the session cache mode is CACHE_LOOSE or CACHE_FSCACHE is the new
+ * directory walked to afterwards, so that an inode can be instantiated
+ * and an unopened fid attached to @dentry; in the other modes @dentry is
+ * left as it was.
+ *
+ * Returns 0 on success or a negative errno from the fid lookup, the
+ * mkdir request, the walk, the inode creation or v9fs_fid_add().
+ */
+static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
+                                        int mode)
+{
+        int err;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        gid_t gid;
+        char *name;
+        struct inode *inode;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+        err = 0;
+        v9ses = v9fs_inode2v9ses(dir);
+        mode |= S_IFDIR;
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                goto error;
+        }
+        /*
+         * v9fs_get_fsgid_for_create() has no failure return: it yields
+         * either the directory's gid or the caller's fsgid, so there is
+         * nothing to check here.
+         */
+        gid = v9fs_get_fsgid_for_create(dir);
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        fid = NULL; /* nothing to clunk at the error label */
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_cached_dentry_operations;
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                /* the dentry holds the fid now; do not clunk it below */
+                fid = NULL;
+        }
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
 /**
 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
 * @dir:  inode that is being walked from
@@ -678,6 +948,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        sb = dir->i_sb;
        v9ses = v9fs_inode2v9ses(dir);
+        /* We can walk d_parent because we hold the dir->i_mutex */
        dfid = v9fs_fid_lookup(dentry->d_parent);
        if (IS_ERR(dfid))
                return ERR_CAST(dfid);
@@ -785,27 +1056,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                goto clunk_olddir;
        }
+        down_write(&v9ses->rename_sem);
        if (v9fs_proto_dotl(v9ses)) {
                retval = p9_client_rename(oldfid, newdirfid,
                                        (char *) new_dentry->d_name.name);
                if (retval != -ENOSYS)
                        goto clunk_newdir;
        }
+        if (old_dentry->d_parent != new_dentry->d_parent) {
+                /*
+                 * 9P .u can only handle file rename in the same directory
+                 */
-        /* 9P can only handle file rename in the same directory */
-        if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
                P9_DPRINTK(P9_DEBUG_ERROR,
                                "old dir and new dir are different\n");
                retval = -EXDEV;
                goto clunk_newdir;
        }
        v9fs_blank_wstat(&wstat);
        wstat.muid = v9ses->uname;
        wstat.name = (char *) new_dentry->d_name.name;
        retval = p9_client_wstat(oldfid, &wstat);
 clunk_newdir:
+        if (!retval)
+                /* successful rename */
+                d_move(old_dentry, new_dentry);
+        up_write(&v9ses->rename_sem);
        p9_client_clunk(newdirfid);
 clunk_olddir:
@@ -853,6 +1130,42 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
+/**
+ * v9fs_vfs_getattr_dotl - read file metadata using p9_client_getattr_dotl()
+ * @mnt: mount information
+ * @dentry: file to get attributes on
+ * @stat: metadata structure to populate
+ *
+ * When the session cache mode is CACHE_LOOSE or CACHE_FSCACHE the request
+ * is passed to simple_getattr() and its result is returned. Otherwise all
+ * attributes are requested for the dentry's fid, copied into the inode
+ * and from there into @stat.
+ *
+ * Returns 0 on success, or the negative errno reported by the fid lookup
+ * or by the attribute request.
+ */
+static int
+v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
+                 struct kstat *stat)
+{
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_stat_dotl *st;
+        P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+                return simple_getattr(mnt, dentry, stat);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        /* Ask for all the fields in stat structure. Server will return
+         * whatever it supports
+         */
+        st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+        if (IS_ERR(st))
+                return PTR_ERR(st);
+        /* refresh the inode first, then derive the kstat from it */
+        v9fs_stat2inode_dotl(st, dentry->d_inode);
+        generic_fillattr(dentry->d_inode, stat);
+        /* Change block size to what the server returned */
+        stat->blksize = st->st_blksize;
+        kfree(st);
+        return 0;
+}
 /**
 * v9fs_vfs_setattr - set file metadata
 * @dentry: file whose metadata to set
@@ -903,6 +1216,49 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 }
 /**
+ * v9fs_vfs_setattr_dotl - set file metadata
+ * @dentry: file whose metadata to set
+ * @iattr: metadata assignment structure
+ *
+ * Checks the change with inode_change_ok(), sends it to the server through
+ * p9_client_setattr() and, when the server accepts it, applies it to the
+ * local inode with inode_setattr().  Returns 0 or a negative error code.
+ *
+ */
+static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
+{
+        struct p9_iattr_dotl wire_attr;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *target;
+        int rc;
+        P9_DPRINTK(P9_DEBUG_VFS, "\n");
+        rc = inode_change_ok(dentry->d_inode, iattr);
+        if (rc)
+                return rc;
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        target = v9fs_fid_lookup(dentry);
+        if (IS_ERR(target))
+                return PTR_ERR(target);
+        /* mirror the VFS request field by field into the 9P structure */
+        wire_attr.valid = iattr->ia_valid;
+        wire_attr.mode = iattr->ia_mode;
+        wire_attr.uid = iattr->ia_uid;
+        wire_attr.gid = iattr->ia_gid;
+        wire_attr.size = iattr->ia_size;
+        wire_attr.atime_sec = iattr->ia_atime.tv_sec;
+        wire_attr.atime_nsec = iattr->ia_atime.tv_nsec;
+        wire_attr.mtime_sec = iattr->ia_mtime.tv_sec;
+        wire_attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
+        rc = p9_client_setattr(target, &wire_attr);
+        if (rc < 0)
+                return rc;
+        /* server accepted the change; update the in-core inode as well */
+        return inode_setattr(dentry->d_inode, iattr);
+}
+/**
 * v9fs_stat2inode - populate an inode structure with mistat info
 * @stat: Plan 9 metadata (mistat) structure
 * @inode: inode to populate
@@ -980,6 +1336,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 /**
+ * v9fs_stat2inode_dotl - populate an inode structure with stat info
+ * @stat: stat structure
+ * @inode: inode to populate
+ *
+ * Only fields flagged in @stat->st_result_mask are copied.  When the whole
+ * P9_STATS_BASIC set is present everything is copied in one go; otherwise
+ * each field is copied individually according to its own mask bit.
+ *
+ */
+void
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+{
+        /* fast path: every basic field is present in the reply */
+        if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
+                inode->i_atime.tv_sec = stat->st_atime_sec;
+                inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                inode->i_uid = stat->st_uid;
+                inode->i_gid = stat->st_gid;
+                inode->i_nlink = stat->st_nlink;
+                inode->i_mode = stat->st_mode;
+                inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
+                        init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                i_size_write(inode, stat->st_size);
+                inode->i_blocks = stat->st_blocks;
+        } else {
+                /* partial reply: copy only what the mask says is valid */
+                if (stat->st_result_mask & P9_STATS_ATIME) {
+                        inode->i_atime.tv_sec = stat->st_atime_sec;
+                        inode->i_atime.tv_nsec = stat->st_atime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_MTIME) {
+                        inode->i_mtime.tv_sec = stat->st_mtime_sec;
+                        inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_CTIME) {
+                        inode->i_ctime.tv_sec = stat->st_ctime_sec;
+                        inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+                }
+                if (stat->st_result_mask & P9_STATS_UID)
+                        inode->i_uid = stat->st_uid;
+                if (stat->st_result_mask & P9_STATS_GID)
+                        inode->i_gid = stat->st_gid;
+                if (stat->st_result_mask & P9_STATS_NLINK)
+                        inode->i_nlink = stat->st_nlink;
+                /* NOTE(review): init_special_inode() below is given the
+                 * i_rdev value from before this reply's P9_STATS_RDEV field
+                 * is applied (that happens a few lines further down) --
+                 * confirm this ordering is intended.
+                 */
+                if (stat->st_result_mask & P9_STATS_MODE) {
+                        inode->i_mode = stat->st_mode;
+                        if ((S_ISBLK(inode->i_mode)) ||
+                                                (S_ISCHR(inode->i_mode)))
+                                init_special_inode(inode, inode->i_mode,
+                                                                inode->i_rdev);
+                }
+                if (stat->st_result_mask & P9_STATS_RDEV)
+                        inode->i_rdev = new_decode_dev(stat->st_rdev);
+                if (stat->st_result_mask & P9_STATS_SIZE)
+                        i_size_write(inode, stat->st_size);
+                if (stat->st_result_mask & P9_STATS_BLOCKS)
+                        inode->i_blocks = stat->st_blocks;
+        }
+        /* the generation number is not part of the basic set; handle it
+         * for both paths
+         */
+        if (stat->st_result_mask & P9_STATS_GEN)
+                        inode->i_generation = stat->st_gen;
+        /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
+         * because the inode structure does not have fields for them.
+         */
+}
+/**
 * v9fs_qid2ino - convert qid into inode number
 * @qid: qid to hash
 *
@@ -1022,7 +1449,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
        if (IS_ERR(fid))
                return PTR_ERR(fid);
-        if (!v9fs_proto_dotu(v9ses))
+        if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
                return -EBADF;
        st = p9_client_stat(fid);
@@ -1128,6 +1555,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 }
 /**
+ * v9fs_vfs_symlink_dotl - helper function to create symlinks
+ * @dir: directory inode containing symlink
+ * @dentry: dentry for symlink
+ * @symname: symlink data
+ *
+ * Asks the server to create the link through p9_client_symlink() and then
+ * instantiates @dentry: in cached mode with an inode built from a freshly
+ * walked fid, otherwise with a bare S_IFLNK inode.  Returns a negative
+ * error code on failure.
+ *
+ * See Also: 9P2000.L RFC for more information
+ *
+ */
+static int
+v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
+                const char *symname)
+{
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *dfid;
+        struct p9_fid *fid = NULL;
+        struct inode *inode;
+        struct p9_qid qid;
+        char *name;
+        int err;
+        gid_t gid;
+        name = (char *) dentry->d_name.name;
+        P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
+                        dir->i_ino, name, symname);
+        v9ses = v9fs_inode2v9ses(dir);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                return err;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        if (gid < 0) {
+                /* Give the error path a defined result; err was previously
+                 * returned uninitialised from here.  (gid_t is presumably
+                 * unsigned, in which case this branch cannot be taken.)
+                 */
+                err = -EPERM;
+                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid);
+                goto error;
+        }
+        /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
+        err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
+                goto error;
+        }
+        if (v9ses->cache) {
+                /* Now walk from the parent so we can get an unopened fid. */
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                        err);
+                        /* nothing to clunk on the way out */
+                        fid = NULL;
+                        goto error;
+                }
+                /* instantiate inode and assign the unopened fid to dentry */
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                        err);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_cached_dentry_operations;
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                /* the dentry owns the fid now; do not clunk it below */
+                fid = NULL;
+        } else {
+                /* Not in cached mode. No need to populate inode with stat */
+                inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_dentry_operations;
+                d_instantiate(dentry, inode);
+        }
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
+/**
 * v9fs_vfs_symlink - helper function to create symlinks
 * @dir: directory inode containing symlink
 * @dentry: dentry for symlink
@@ -1186,6 +1706,76 @@ clunk_fid:
 }
 /**
+ * v9fs_vfs_link_dotl - create a hardlink for dotl
+ * @old_dentry: dentry for file to link to
+ * @dir: inode destination for new link
+ * @dentry: dentry for link
+ *
+ * Issues the link request through p9_client_link() and, on success, attaches
+ * the existing inode to @dentry.  With loose or fscache caching the inode is
+ * first refreshed from the server.  Returns a negative error code on failure.
+ *
+ */
+static int
+v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
+                struct dentry *dentry)
+{
+        int err;
+        struct p9_fid *dfid, *oldfid;
+        struct v9fs_session_info *v9ses;
+        struct dentry *dir_dentry;
+        P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
+                        dir->i_ino, old_dentry->d_name.name,
+                        dentry->d_name.name);
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid))
+                return PTR_ERR(dfid);
+        oldfid = v9fs_fid_lookup(old_dentry);
+        if (IS_ERR(oldfid))
+                return PTR_ERR(oldfid);
+        err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
+        if (err < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
+                return err;
+        }
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                /* Get the latest stat info from server. */
+                struct p9_fid *fid;
+                struct p9_stat_dotl *st;
+                fid = v9fs_fid_lookup(old_dentry);
+                if (IS_ERR(fid))
+                        return PTR_ERR(fid);
+                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+                if (IS_ERR(st))
+                        return PTR_ERR(st);
+                v9fs_stat2inode_dotl(st, old_dentry->d_inode);
+                kfree(st);
+        }
+        /* The new dentry needs an inode reference of its own for
+         * d_instantiate().  Take it in cached mode as well; previously only
+         * the uncached path incremented i_count.
+         */
+        atomic_inc(&old_dentry->d_inode->i_count);
+        dentry->d_op = old_dentry->d_op;
+        d_instantiate(dentry, old_dentry->d_inode);
+        return err;
+}
+/**
 * v9fs_vfs_mknod - create a special file
 * @dir: inode destination for new link
 * @dentry: dentry for file
@@ -1230,6 +1820,100 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
+/**
+ * v9fs_vfs_mknod_dotl - create a special file
+ * @dir: inode destination for new link
+ * @dentry: dentry for file
+ * @mode: mode for creation
+ * @rdev: device associated with special file
+ *
+ * Sends the request through p9_client_mknod_dotl() and instantiates @dentry:
+ * in cached mode with an inode built from a freshly walked fid, otherwise
+ * with a bare inode of the requested mode.  Returns -EINVAL for a device
+ * number rejected by new_valid_dev(), or another negative error code on
+ * failure.
+ *
+ */
+static int
+v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
+                dev_t rdev)
+{
+        int err;
+        char *name;
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid = NULL, *dfid = NULL;
+        struct inode *inode;
+        gid_t gid;
+        struct p9_qid qid;
+        struct dentry *dir_dentry;
+        P9_DPRINTK(P9_DEBUG_VFS,
+                " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
+                dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        v9ses = v9fs_inode2v9ses(dir);
+        dir_dentry = v9fs_dentry_from_dir_inode(dir);
+        dfid = v9fs_fid_lookup(dir_dentry);
+        if (IS_ERR(dfid)) {
+                err = PTR_ERR(dfid);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+                dfid = NULL;
+                goto error;
+        }
+        gid = v9fs_get_fsgid_for_create(dir);
+        if (gid < 0) {
+                /* Give the error path a defined result; err was previously
+                 * returned uninitialised from here.  (gid_t is presumably
+                 * unsigned, in which case this branch cannot be taken.)
+                 */
+                err = -EPERM;
+                P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
+                goto error;
+        }
+        name = (char *) dentry->d_name.name;
+        err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
+        if (err < 0)
+                goto error;
+        /* instantiate inode and assign the unopened fid to the dentry */
+        if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+                fid = p9_client_walk(dfid, 1, &name, 1);
+                if (IS_ERR(fid)) {
+                        err = PTR_ERR(fid);
+                        P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+                                err);
+                        /* nothing to clunk on the way out */
+                        fid = NULL;
+                        goto error;
+                }
+                inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
+                                err);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_cached_dentry_operations;
+                d_instantiate(dentry, inode);
+                err = v9fs_fid_add(dentry, fid);
+                if (err < 0)
+                        goto error;
+                /* the dentry owns the fid now; do not clunk it below */
+                fid = NULL;
+        } else {
+                /*
+                 * Not in cached mode. No need to populate inode with stat.
+                 * socket syscall returns a fd, so we need instantiate
+                 */
+                inode = v9fs_get_inode(dir->i_sb, mode);
+                if (IS_ERR(inode)) {
+                        err = PTR_ERR(inode);
+                        goto error;
+                }
+                dentry->d_op = &v9fs_dentry_operations;
+                d_instantiate(dentry, inode);
+        }
+error:
+        if (fid)
+                p9_client_clunk(fid);
+        return err;
+}
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
@@ -1238,24 +1922,29 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = {
        .unlink = v9fs_vfs_unlink,
        .mkdir = v9fs_vfs_mkdir,
        .rmdir = v9fs_vfs_rmdir,
-        .mknod = v9fs_vfs_mknod,
+        .mknod = v9fs_vfs_mknod_dotl,
        .rename = v9fs_vfs_rename,
        .getattr = v9fs_vfs_getattr,
        .setattr = v9fs_vfs_setattr,
 };
+/*
+ * Directory inode operations for the dotl protocol variant: create, link,
+ * symlink, mkdir, mknod, getattr and setattr use the *_dotl handlers, and
+ * extended attributes go through the generic xattr entry points plus
+ * v9fs_listxattr.
+ */
 static const struct inode_operations v9fs_dir_inode_operations_dotl = {
-        .create = v9fs_vfs_create,
+        .create = v9fs_vfs_create_dotl,
        .lookup = v9fs_vfs_lookup,
-        .symlink = v9fs_vfs_symlink,
+        .link = v9fs_vfs_link_dotl,
-        .link = v9fs_vfs_link,
+        .symlink = v9fs_vfs_symlink_dotl,
        .unlink = v9fs_vfs_unlink,
-        .mkdir = v9fs_vfs_mkdir,
+        .mkdir = v9fs_vfs_mkdir_dotl,
        .rmdir = v9fs_vfs_rmdir,
-        .mknod = v9fs_vfs_mknod,
+        .mknod = v9fs_vfs_mknod_dotl,
        .rename = v9fs_vfs_rename,
-        .getattr = v9fs_vfs_getattr,
+        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
 };
 static const struct inode_operations v9fs_dir_inode_operations = {
@@ -1276,8 +1965,12 @@ static const struct inode_operations v9fs_file_inode_operations = {
 };
+/*
+ * Regular-file inode operations for the dotl protocol variant: attribute
+ * access uses the *_dotl handlers and extended attributes go through the
+ * generic xattr entry points plus v9fs_listxattr.
+ */
 static const struct inode_operations v9fs_file_inode_operations_dotl = {
-        .getattr = v9fs_vfs_getattr,
+        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
 };
 static const struct inode_operations v9fs_symlink_inode_operations = {
@@ -1292,6 +1985,10 @@ static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
        .readlink = generic_readlink,
        .follow_link = v9fs_vfs_follow_link,
        .put_link = v9fs_vfs_put_link,
-        .getattr = v9fs_vfs_getattr,
+        .getattr = v9fs_vfs_getattr_dotl,
-        .setattr = v9fs_vfs_setattr,
+        .setattr = v9fs_vfs_setattr_dotl,
+        .setxattr = generic_setxattr,
+        .getxattr = generic_getxattr,
+        .removexattr = generic_removexattr,
+        .listxattr = v9fs_listxattr,
 };
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index be74d020436e..4b9ede0b41b7 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -45,6 +45,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "xattr.h"
 static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
@@ -77,9 +78,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
        sb->s_blocksize = 1 << sb->s_blocksize_bits;
        sb->s_magic = V9FS_MAGIC;
-        if (v9fs_proto_dotl(v9ses))
+        if (v9fs_proto_dotl(v9ses)) {
                sb->s_op = &v9fs_super_ops_dotl;
-        else
+                sb->s_xattr = v9fs_xattr_handlers;
+        } else
                sb->s_op = &v9fs_super_ops;
        sb->s_bdi = &v9ses->bdi;
@@ -107,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        struct inode *inode = NULL;
        struct dentry *root = NULL;
        struct v9fs_session_info *v9ses = NULL;
-        struct p9_wstat *st = NULL;
        int mode = S_IRWXUGO | S_ISVTX;
        struct p9_fid *fid;
        int retval = 0;
@@ -124,16 +125,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
                goto close_session;
        }
-        st = p9_client_stat(fid);
-        if (IS_ERR(st)) {
-                retval = PTR_ERR(st);
-                goto clunk_fid;
-        }
        sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
        if (IS_ERR(sb)) {
                retval = PTR_ERR(sb);
-                goto free_stat;
+                goto clunk_fid;
        }
        v9fs_fill_super(sb, v9ses, flags, data);
@@ -151,22 +146,38 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
        }
        sb->s_root = root;
-        root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
-        v9fs_stat2inode(st, root->d_inode, sb);
+        if (v9fs_proto_dotl(v9ses)) {
+                struct p9_stat_dotl *st = NULL;
+                st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+                if (IS_ERR(st)) {
+                        retval = PTR_ERR(st);
+                        goto clunk_fid;
+                }
+                v9fs_stat2inode_dotl(st, root->d_inode);
+                kfree(st);
+        } else {
+                struct p9_wstat *st = NULL;
+                st = p9_client_stat(fid);
+                if (IS_ERR(st)) {
+                        retval = PTR_ERR(st);
+                        goto clunk_fid;
+                }
+                root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+                v9fs_stat2inode(st, root->d_inode, sb);
+                p9stat_free(st);
+                kfree(st);
+        }
        v9fs_fid_add(root, fid);
-        p9stat_free(st);
-        kfree(st);
 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
        simple_set_mnt(mnt, sb);
        return 0;
-free_stat:
-        p9stat_free(st);
-        kfree(st);
 clunk_fid:
        p9_client_clunk(fid);
@@ -176,8 +187,6 @@ close_session:
        return retval;
 release_sb:
-        p9stat_free(st);
-        kfree(st);
        deactivate_locked_super(sb);
        return retval;
 }
@@ -278,4 +287,5 @@ struct file_system_type v9fs_fs_type = {
        .get_sb = v9fs_get_sb,
        .kill_sb = v9fs_kill_super,
        .owner = THIS_MODULE,
+        .fs_flags = FS_RENAME_DOES_D_MOVE,
 };
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 000000000000..f88e5c2dc873
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "fid.h"
+#include "xattr.h"
+/*
+ * v9fs_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Pass a buffer_size of 0 to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure (-ERANGE when the attribute
+ * does not fit into buffer_size), or the number of bytes used / required
+ * on success.  If the server stops returning data before the announced
+ * attribute size is reached, the number of bytes actually read is returned.
+ */
+ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
+                       void *buffer, size_t buffer_size)
+{
+        ssize_t retval;
+        int msize, read_count;
+        u64 offset = 0, attr_size;
+        struct p9_fid *fid, *attr_fid;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
+                __func__, name, buffer_size);
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid))
+                return PTR_ERR(fid);
+        attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
+        if (IS_ERR(attr_fid)) {
+                retval = PTR_ERR(attr_fid);
+                P9_DPRINTK(P9_DEBUG_VFS,
+                        "p9_client_attrwalk failed %zd\n", retval);
+                attr_fid = NULL;
+                goto error;
+        }
+        if (!buffer_size) {
+                /* request to get the attr_size */
+                retval = attr_size;
+                goto error;
+        }
+        if (attr_size > buffer_size) {
+                retval = -ERANGE;
+                goto error;
+        }
+        msize = attr_fid->clnt->msize;
+        /* read the value in chunks that fit into one 9P message */
+        while (attr_size) {
+                if (attr_size > (msize - P9_IOHDRSZ))
+                        read_count = msize - P9_IOHDRSZ;
+                else
+                        read_count = attr_size;
+                read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
+                                        NULL, offset, read_count);
+                if (read_count < 0) {
+                        /* error in xattr read */
+                        retval = read_count;
+                        goto error;
+                }
+                if (!read_count)
+                        /* no more data although attr_size bytes were
+                         * announced; stop rather than loop forever
+                         */
+                        break;
+                offset += read_count;
+                attr_size -= read_count;
+        }
+        /* Total read xattr bytes */
+        retval = offset;
+error:
+        /* shared exit: reached on success as well as on failure */
+        if (attr_fid)
+                p9_client_clunk(attr_fid);
+        return retval;
+}
+/*
+ * v9fs_xattr_set()
+ *
+ * Create, replace or remove an extended attribute for this inode. Buffer
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.  An error from the
+ * create or write step is reported as is; the result of clunking the fid
+ * is reported only when nothing failed before it.
+ */
+int v9fs_xattr_set(struct dentry *dentry, const char *name,
+                   const void *value, size_t value_len, int flags)
+{
+        u64 offset = 0;
+        int retval, msize, write_count, clunk_err;
+        struct p9_fid *fid = NULL;
+        P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
+                __func__, name, value_len, flags);
+        fid = v9fs_fid_clone(dentry);
+        if (IS_ERR(fid)) {
+                retval = PTR_ERR(fid);
+                fid = NULL;
+                goto error;
+        }
+        /*
+         * On success fid points to xattr
+         */
+        retval = p9_client_xattrcreate(fid, name, value_len, flags);
+        if (retval < 0) {
+                P9_DPRINTK(P9_DEBUG_VFS,
+                        "p9_client_xattrcreate failed %d\n", retval);
+                goto error;
+        }
+        msize = fid->clnt->msize;
+        /* write the value in chunks that fit into one 9P message */
+        while (value_len) {
+                if (value_len > (msize - P9_IOHDRSZ))
+                        write_count = msize - P9_IOHDRSZ;
+                else
+                        write_count = value_len;
+                write_count = p9_client_write(fid, ((char *)value)+offset,
+                                        NULL, offset, write_count);
+                if (write_count < 0) {
+                        /* error in xattr write */
+                        retval = write_count;
+                        goto error;
+                }
+                if (!write_count) {
+                        /* no progress; fail instead of looping forever */
+                        retval = -EIO;
+                        goto error;
+                }
+                offset += write_count;
+                value_len -= write_count;
+        }
+        /* all xattr bytes written */
+        retval = 0;
+error:
+        if (fid) {
+                /* always release the fid, but do not let a successful
+                 * clunk hide an earlier failure
+                 */
+                clunk_err = p9_client_clunk(fid);
+                if (retval >= 0)
+                        retval = clunk_err;
+        }
+        return retval;
+}
+/*
+ * v9fs_listxattr()
+ *
+ * Implemented as v9fs_xattr_get() with a NULL attribute name; the result of
+ * that call is returned unchanged.
+ */
+ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+        return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+}
+/* NULL-terminated list of xattr handlers; currently only the user handler */
+const struct xattr_handler *v9fs_xattr_handlers[] = {
+        &v9fs_xattr_user_handler,
+        NULL
+};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 000000000000..9ddf672ae5c4
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#ifndef FS_9P_XATTR_H
+#define FS_9P_XATTR_H
+#include <linux/xattr.h>
+/* NULL-terminated handler list, defined in xattr.c */
+extern const struct xattr_handler *v9fs_xattr_handlers[];
+/* handler for the XATTR_USER_PREFIX namespace, defined in xattr_user.c */
+extern struct xattr_handler v9fs_xattr_user_handler;
+/* read an attribute value, or its size when the buffer size is 0 */
+extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
+                              void *, size_t);
+/* create or replace an attribute value on the server */
+extern int v9fs_xattr_set(struct dentry *, const char *,
+                          const void *, size_t, int);
+/* implemented as v9fs_xattr_get() with a NULL name */
+extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
+#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 000000000000..d0b701b72080
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+/*
+ * v9fs_xattr_user_get()
+ *
+ * Prepend XATTR_USER_PREFIX to @name and fetch the attribute through
+ * v9fs_xattr_get().  Returns -EINVAL for a NULL or empty name, -ENOMEM when
+ * the full name cannot be allocated, otherwise v9fs_xattr_get()'s result.
+ */
+static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
+                        void *buffer, size_t size, int type)
+{
+        const size_t pfx_len = XATTR_USER_PREFIX_LEN;
+        size_t sfx_len;
+        char *qualified;
+        int rc;
+        if (!name || name[0] == '\0')
+                return -EINVAL;
+        sfx_len = strlen(name);
+        qualified = kmalloc(pfx_len + sfx_len + 1 , GFP_KERNEL);
+        if (!qualified)
+                return -ENOMEM;
+        memcpy(qualified, XATTR_USER_PREFIX, pfx_len);
+        /* copying sfx_len + 1 bytes brings the terminating NUL along */
+        memcpy(qualified + pfx_len, name, sfx_len + 1);
+        rc = v9fs_xattr_get(dentry, qualified, buffer, size);
+        kfree(qualified);
+        return rc;
+}
+/*
+ * v9fs_xattr_user_set()
+ *
+ * Prepend XATTR_USER_PREFIX to @name and store the attribute through
+ * v9fs_xattr_set().  Returns -EINVAL for a NULL or empty name, -ENOMEM when
+ * the full name cannot be allocated, otherwise v9fs_xattr_set()'s result.
+ */
+static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
+                        const void *value, size_t size, int flags, int type)
+{
+        const size_t pfx_len = XATTR_USER_PREFIX_LEN;
+        size_t sfx_len;
+        char *qualified;
+        int rc;
+        if (!name || name[0] == '\0')
+                return -EINVAL;
+        sfx_len = strlen(name);
+        qualified = kmalloc(pfx_len + sfx_len + 1 , GFP_KERNEL);
+        if (!qualified)
+                return -ENOMEM;
+        memcpy(qualified, XATTR_USER_PREFIX, pfx_len);
+        /* copying sfx_len + 1 bytes brings the terminating NUL along */
+        memcpy(qualified + pfx_len, name, sfx_len + 1);
+        rc = v9fs_xattr_set(dentry, qualified, value, size, flags);
+        kfree(qualified);
+        return rc;
+}
+/* xattr handler for names starting with XATTR_USER_PREFIX */
+struct xattr_handler v9fs_xattr_user_handler = {
+        .prefix = XATTR_USER_PREFIX,
+        .get    = v9fs_xattr_user_get,
+        .set    = v9fs_xattr_user_set,
+};
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f85b5947613..3d185308ec88 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -64,7 +64,7 @@ source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 config CUSE
-        tristate "Character device in Userpace support"
+        tristate "Character device in Userspace support"
        depends on FUSE_FS
        help
          This FUSE extension allows character devices to be
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c772..8f975f25b486 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
        tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
        depends on INET && EXPERIMENTAL
        select AF_RXRPC
+        select DNS_RESOLVER
        help
          If you say Y here, you will get an experimental Andrew File System
          driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059ed..ffea35c63879 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
+#include <linux/dns_resolver.h>
 #include <linux/sched.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
@@ -36,6 +37,8 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
        struct key *key;
        size_t namelen;
        char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
+        char  *dvllist = NULL, *_vllist = NULL;
+        char  delimiter = ':';
        int ret;
        _enter("%s,%s", name, vllist);
@@ -43,8 +46,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
        BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
        namelen = strlen(name);
-        if (namelen > AFS_MAXCELLNAME)
+        if (namelen > AFS_MAXCELLNAME) {
+                _leave(" = -ENAMETOOLONG");
                return ERR_PTR(-ENAMETOOLONG);
+        }
        /* allocate and initialise a cell record */
        cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +69,31 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
        INIT_LIST_HEAD(&cell->vl_list);
        spin_lock_init(&cell->vl_lock);
+        /* if the ip address is invalid, try dns query */
+        if (!vllist || strlen(vllist) < 7) {
+                ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
+                if (ret < 0) {
+                        _leave(" = %d", ret);
+                        return ERR_PTR(ret);
+                }
+                _vllist = dvllist;
+                /* change the delimiter for user-space reply */
+                delimiter = ',';
+        } else {
+                _vllist = vllist;
+        }
        /* fill in the VL server list from the rest of the string */
        do {
                unsigned a, b, c, d;
-                next = strchr(vllist, ':');
+                next = strchr(_vllist, delimiter);
                if (next)
                        *next++ = 0;
-                if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+                if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
                        goto bad_address;
                if (a > 255 || b > 255 || c > 255 || d > 255)
@@ -81,7 +102,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
                cell->vl_addrs[cell->vl_naddrs++].s_addr =
                        htonl((a << 24) | (b << 16) | (c << 8) | d);
-        } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
+        } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
        /* create a key to represent an anonymous user */
        memcpy(keyname, "afs@", 4);
@@ -110,6 +131,7 @@ bad_address:
        ret = -EINVAL;
 error:
        key_put(cell->anonymous_key);
+        kfree(dvllist);
        kfree(cell);
        _leave(" = %d", ret);
        return ERR_PTR(ret);
@@ -201,14 +223,12 @@ int afs_cell_init(char *rootcell)
        }
        cp = strchr(rootcell, ':');
-        if (!cp) {
+        if (!cp)
-                printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
+                _debug("kAFS: no VL server IP addresses specified");
-                _leave(" = -EINVAL");
+        else
-                return -EINVAL;
+                *cp++ = 0;
-        }
        /* allocate a cell record for the root cell */
-        *cp++ = 0;
        new_root = afs_cell_create(rootcell, cp);
        if (IS_ERR(new_root)) {
                _leave(" = %ld", PTR_ERR(new_root));
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c55..cfd1cbe25b22 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
        /* initialise the callback update process */
        ret = afs_callback_update_init();
+        if (ret < 0)
+                goto error_callback_update_init;
        /* create the RxRPC transport */
        ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
 error_fs:
        afs_close_socket();
 error_open_socket:
+        afs_callback_update_kill();
+error_callback_update_init:
+        afs_vlocation_purge();
 error_vl_update_init:
+        afs_cell_purge();
 error_cell_init:
 #ifdef CONFIG_AFS_FSCACHE
        fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-        afs_callback_update_kill();
-        afs_vlocation_purge();
-        afs_cell_purge();
        afs_proc_cleanup();
        rcu_barrier();
        printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3dab9e9948d0..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 {
        struct address_space *mapping = vnode->vfs_inode.i_mapping;
        struct writeback_control wbc = {
-                .bdi            = mapping->backing_dev_info,
                .sync_mode      = WB_SYNC_ALL,
                .nr_to_write    = LONG_MAX,
                .range_cyclic   = 1,
diff --git a/fs/aio.c b/fs/aio.c
index 1ccf25cef1f0..3006b5bc33d6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1277,7 +1277,7 @@ out:
 /* sys_io_destroy:
 *      Destroy the aio_context specified.  May cancel any outstanding 
 *      AIOs and block on completion.  Will fail with -ENOSYS if not
- *      implemented.  May fail with -EFAULT if the context pointed to
+ *      implemented.  May fail with -EINVAL if the context pointed to
 *      is invalid.
 */
 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1795,15 +1795,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 /* io_getevents:
 *      Attempts to read at least min_nr events and up to nr events from
- *      the completion queue for the aio_context specified by ctx_id.  May
+ *      the completion queue for the aio_context specified by ctx_id. If
- *      fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
+ *      it succeeds, the number of read events is returned. May fail with
- *      if nr is out of range, if when is out of range.  May fail with
+ *      -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
- *      -EFAULT if any of the memory specified to is invalid.  May return
+ *      out of range, if timeout is out of range.  May fail with -EFAULT
- *      0 or < min_nr if no events are available and the timeout specified
+ *      if any of the memory specified is invalid.  May return 0 or
- *      by when has elapsed, where when == NULL specifies an infinite
+ *      < min_nr if the timeout specified by timeout has elapsed
- *      timeout.  Note that the timeout pointed to by when is relative and
+ *      before sufficient events are available, where timeout == NULL
- *      will be updated if not NULL and the operation blocks.  Will fail
+ *      specifies an infinite timeout. Note that the timeout pointed to by
- *      with -ENOSYS if not implemented.
+ *      timeout is relative and will be updated if not NULL and the
+ *      operation blocks. Will fail with -ENOSYS if not implemented.
 */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
                long, min_nr,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e63..dc39d2824885 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
                                              init_once);
        if (befs_inode_cachep == NULL) {
                printk(KERN_ERR "befs_init_inodecache: "
-                       "Couldn't initalize inode slabcache\n");
+                       "Couldn't initialize inode slabcache\n");
                return -ENOMEM;
        }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99d6af811747..b3171fb0dc9a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -681,8 +681,8 @@ retry:
        if (!bd_may_claim(bdev, whole, holder))
                return -EBUSY;
-        /* if someone else is claiming, wait for it to finish */
+        /* if claiming is already in progress, wait for it to finish */
-        if (whole->bd_claiming && whole->bd_claiming != holder) {
+        if (whole->bd_claiming) {
                wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
                DEFINE_WAIT(wait);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe4..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
        return ret;
 }
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
-                                      int free_space, u32 left_nritems)
+                                      int free_space, u32 left_nritems,
+                                      u32 min_slot)
 {
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (empty)
                nr = 0;
        else
-                nr = 1;
+                nr = max_t(u32, 1, min_slot);
        if (path->slots[0] >= left_nritems)
                push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
 */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *root, struct btrfs_path *path, int data_size,
+                           *root, struct btrfs_path *path,
-                           int empty)
+                           int min_data_size, int data_size,
+                           int empty, u32 min_slot)
 {
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (left_nritems == 0)
                goto out_unlock;
-        return __push_leaf_right(trans, root, path, data_size, empty,
+        return __push_leaf_right(trans, root, path, min_data_size, empty,
-                                right, free_space, left_nritems);
+                                right, free_space, left_nritems, min_slot);
 out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
 /*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
 */
 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
-                                     int free_space, int right_nritems)
+                                     int free_space, u32 right_nritems,
+                                     u32 max_slot)
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        slot = path->slots[1];
        if (empty)
-                nr = right_nritems;
+                nr = min(right_nritems, max_slot);
        else
-                nr = right_nritems - 1;
+                nr = min(right_nritems - 1, max_slot);
        for (i = 0; i < nr; i++) {
                item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
 /*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
 */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-                          *root, struct btrfs_path *path, int data_size,
+                          *root, struct btrfs_path *path, int min_data_size,
-                          int empty)
+                          int data_size, int empty, u32 max_slot)
 {
        struct extent_buffer *right = path->nodes[0];
        struct extent_buffer *left;
@@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
                goto out;
        }
-        return __push_leaf_left(trans, root, path, data_size,
+        return __push_leaf_left(trans, root, path, min_data_size,
-                               empty, left, free_space, right_nritems);
+                               empty, left, free_space, right_nritems,
+                               max_slot);
 out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
@@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 }
 /*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ *
+ * The target slot is path->slots[0] and data_size is the room we want for
+ * the item to be inserted there.
+ *
+ * Returns a negative value if either push helper fails.  Returns 0 if,
+ * after the push to the right, the slot sits at the start or end of the
+ * leaf or the leaf has at least data_size bytes free, or if at least one
+ * of the two pushes returned 0.  Returns 1 if neither push made progress.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          int data_size)
+{
+        int ret;
+        int progress = 0;
+        int slot;
+        u32 nritems;
+        slot = path->slots[0];
+        /*
+         * try to push all the items after our slot into the
+         * right leaf.  Our slot is passed as min_slot, so nothing
+         * below it is pushed; min_data_size is 1 and empty is 0.
+         */
+        ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                progress++;
+        nritems = btrfs_header_nritems(path->nodes[0]);
+        /*
+         * our goal is to get our slot at the start or end of a leaf.  If
+         * we've done so we're done.  We are also done if the leaf now
+         * has room for data_size bytes, in which case the push to the
+         * left below is never attempted.
+         */
+        if (path->slots[0] == 0 || path->slots[0] == nritems)
+                return 0;
+        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+                return 0;
+        /*
+         * try to push all the items before our slot into the left leaf.
+         * The slot is re-read from the path first (presumably because the
+         * push to the right may have changed it; confirm against
+         * __push_leaf_right) and passed as max_slot, so the item at our
+         * slot is not touched.
+         */
+        slot = path->slots[0];
+        ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                progress++;
+        if (progress)
+                return 0;
+        return 1;
+}
+/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
@@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int wret;
        int split;
        int num_doubles = 0;
+        int tried_avoid_double = 0;
        l = path->nodes[0];
        slot = path->slots[0];
@@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
                return -EOVERFLOW;
        /* first try to make some room by pushing left and right */
-        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+        if (data_size) {
-                wret = push_leaf_right(trans, root, path, data_size, 0);
+                wret = push_leaf_right(trans, root, path, data_size,
+                                       data_size, 0, 0);
                if (wret < 0)
                        return wret;
                if (wret) {
-                        wret = push_leaf_left(trans, root, path, data_size, 0);
+                        wret = push_leaf_left(trans, root, path, data_size,
+                                              data_size, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
@@ -2923,6 +3003,8 @@ again:
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                        if (data_size && !tried_avoid_double)
+                                                goto push_for_double;
                                        split = 2;
                                }
                        }
@@ -2939,6 +3021,8 @@ again:
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                        if (data_size && !tried_avoid_double)
+                                                goto push_for_double;
                                        split = 2 ;
                                }
                        }
@@ -3019,6 +3103,13 @@ again:
        }
        return ret;
+push_for_double:
+        push_for_double_split(trans, root, path, data_size);
+        tried_avoid_double = 1;
+        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+                return 0;
+        goto again;
 }
 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        extent_buffer_get(leaf);
                        btrfs_set_path_blocking(path);
-                        wret = push_leaf_left(trans, root, path, 1, 1);
+                        wret = push_leaf_left(trans, root, path, 1, 1,
+                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;
                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
-                                wret = push_leaf_right(trans, root, path, 1, 1);
+                                wret = push_leaf_right(trans, root, path, 1,
+                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a4080c21ec55..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2594,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
-                .bdi            = wbc->bdi,
                .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
@@ -2628,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
                .sync_io = mode == WB_SYNC_ALL,
        };
        struct writeback_control wbc_writepages = {
-                .bdi            = inode->i_mapping->backing_dev_info,
                .sync_mode      = mode,
                .older_than_this = NULL,
                .nr_to_write    = nr_pages * 2,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4dbaf89b1337..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
         */
        /* the destination must be opened for writing */
-        if (!(file->f_mode & FMODE_WRITE))
+        if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
                return -EINVAL;
        ret = mnt_want_write(file->f_path.mnt);
@@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        /* determine range to clone */
        ret = -EINVAL;
-        if (off >= src->i_size || off + len > src->i_size)
+        if (off + len > src->i_size || off + len < off)
                goto out_unlock;
        if (len == 0)
                olen = len = src->i_size - off;
@@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        u64 disko = 0, diskl = 0;
                        u64 datao = 0, datal = 0;
                        u8 comp;
+                        u64 endoff;
                        size = btrfs_item_size_nr(leaf, slot);
                        read_extent_buffer(leaf, buf,
@@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        btrfs_release_path(root, path);
                        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-                        if (new_key.offset + datal > inode->i_size)
-                                btrfs_i_size_write(inode,
+                        /*
-                                                   new_key.offset + datal);
+                         * we round up to the block size at eof when
+                         * determining which extents to clone above,
+                         * but shouldn't round up the file size
+                         */
+                        endoff = new_key.offset + datal;
+                        if (endoff > off+olen)
+                                endoff = off+olen;
+                        if (endoff > inode->i_size)
+                                btrfs_i_size_write(inode, endoff);
                        BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42c..42c7fafc8bfe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
        printk(KERN_ERR "%sobject: OBJ%x\n",
               prefix, object->fscache.debug_id);
-        printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+        printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
               prefix, fscache_object_states[object->fscache.state],
-               object->fscache.flags, object->fscache.work.flags,
+               object->fscache.flags, work_busy(&object->fscache.work),
               object->fscache.events,
               object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
        printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
                /* if the object we're waiting for is queued for processing,
                 * then just put ourselves on the queue behind it */
-                if (slow_work_is_queued(&xobject->fscache.work)) {
+                if (work_pending(&xobject->fscache.work)) {
                        _debug("queue OBJ%x behind OBJ%x immediately",
                               object->fscache.debug_id,
                               xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
                }
                /* otherwise we sleep until either the object we're waiting for
-                 * is done, or the slow-work facility wants the thread back to
+                 * is done, or the fscache_object is congested */
-                 * do other work */
                wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
                init_wait(&wait);
                requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
                        prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                        if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
                                break;
-                        requeue = slow_work_sleep_till_thread_needed(
-                                &object->fscache.work, &timeout);
+                        requeue = fscache_object_sleep_till_congested(&timeout);
                } while (timeout > 0 && !requeue);
                finish_wait(wq, &wait);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03f..0e3c0924cc3a 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
        shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
        op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-        op->op.flags |= FSCACHE_OP_FAST;
+        op->op.flags |= FSCACHE_OP_ASYNC;
        op->op.processor = cachefiles_read_copier;
        pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
        pagevec_init(&pagevec, 0);
        op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-        op->op.flags |= FSCACHE_OP_FAST;
+        op->op.flags |= FSCACHE_OP_ASYNC;
        op->op.processor = cachefiles_read_copier;
        INIT_LIST_HEAD(&backpages);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a9..bc87b9c1d27e 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,7 @@ config CEPH_FS
        tristate "Ceph distributed file system (EXPERIMENTAL)"
        depends on INET && EXPERIMENTAL
        select LIBCRC32C
-        select CONFIG_CRYPTO_AES
+        select CRYPTO_AES
        help
          Choose Y or M here to include support for mounting the
          experimental Ceph distributed file system.  Ceph is an extremely
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 83d4d2785ffe..6d44053ecff1 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -493,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
                return -EAGAIN;
        }
-        op = le32_to_cpu(head->op);
+        op = le16_to_cpu(head->op);
        result = le32_to_cpu(head->result);
        dout("handle_reply op %d result %d\n", op, result);
        switch (op) {
@@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
                remove_ticket_handler(ac, th);
        }
+        if (xi->auth_authorizer.buf)
+                ceph_buffer_put(xi->auth_authorizer.buf);
        kfree(ac->private);
        ac->private = NULL;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 619b61655ee5..b81be9a56487 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
        struct ceph_cap *cap = NULL;
        /* temporary, until we do something about cap import/export */
-        if (!ctx)
+        if (!ctx) {
-                return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+                cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+                if (cap) {
+                        caps_use_count++;
+                        caps_total_count++;
+                }
+                return cap;
+        }
        spin_lock(&caps_list_lock);
        dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
@@ -621,7 +627,7 @@ retry:
        if (fmode >= 0)
                __ceph_get_fmode(ci, fmode);
        spin_unlock(&inode->i_lock);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        return 0;
 }
@@ -1175,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        }
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        return delayed;
 }
@@ -2147,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        else if (flushsnaps)
                ceph_flush_snaps(ci);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        if (put)
                iput(inode);
 }
@@ -2223,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                iput(inode);
        } else if (complete_capsnap) {
                ceph_flush_snaps(ci);
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        }
        if (drop_capsnap)
                iput(inode);
@@ -2399,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        if (queue_invalidate)
                ceph_queue_invalidate(inode);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
        if (check_caps == 1)
                ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2454,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                                         struct ceph_inode_info,
                                         i_flushing_item)->vfs_inode);
                mdsc->num_cap_flushing--;
-                wake_up(&mdsc->cap_flushing_wq);
+                wake_up_all(&mdsc->cap_flushing_wq);
                dout(" inode %p now !flushing\n", inode);
                if (ci->i_dirty_caps == 0) {
@@ -2466,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                }
        }
        spin_unlock(&mdsc->cap_dirty_lock);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
 out:
        spin_unlock(&inode->i_lock);
@@ -2886,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap *cap;
        struct ceph_mds_request_release *rel = *p;
+        int used, dirty;
        int ret = 0;
-        int used = 0;
        spin_lock(&inode->i_lock);
        used = __ceph_caps_used(ci);
+        dirty = __ceph_caps_dirty(ci);
-        dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+        dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
-             mds, ceph_cap_string(used), ceph_cap_string(drop),
+             inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
             ceph_cap_string(unless));
-        /* only drop unused caps */
+        /* only drop unused, clean caps */
-        drop &= ~used;
+        drop &= ~(used | dirty);
        cap = __get_cap_for_mds(ci, mds);
        if (cap && __cap_is_valid(cap)) {
@@ -2977,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
                memcpy(*p, dentry->d_name.name, dentry->d_name.len);
                *p += dentry->d_name.len;
                rel->dname_seq = cpu_to_le32(di->lease_seq);
+                __ceph_mdsc_drop_dentry_lease(dentry);
        }
        spin_unlock(&dentry->d_lock);
        return ret;
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
-        dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+        dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
        switch (in->alg) {
        case CRUSH_BUCKET_UNIFORM:
                return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 */
 static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
 {
-        if (weight[item] >= 0x1000)
+        if (weight[item] >= 0x10000)
                return 0;
        if (weight[item] == 0)
                return 1;
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
        int itemtype;
        int collide, reject;
        const int orig_tries = 5; /* attempts before we fall back to search */
-        dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+        dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+                bucket->id, x, outpos, numrep);
        for (rep = outpos; rep < numrep; rep++) {
                /* keep trying until we get a non-out, non-colliding item */
@@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map,
                                        BUG_ON(item >= 0 ||
                                               (-1-item) >= map->max_buckets);
                                        in = map->buckets[-1-item];
+                                        retry_bucket = 1;
                                        continue;
                                }
@@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map,
                                        }
                                }
-                                if (recurse_to_leaf &&
+                                reject = 0;
-                                    item < 0 &&
+                                if (recurse_to_leaf) {
-                                    crush_choose(map, map->buckets[-1-item],
+                                        if (item < 0) {
-                                                 weight,
+                                                if (crush_choose(map,
-                                                 x, outpos+1, 0,
+                                                         map->buckets[-1-item],
-                                                 out2, outpos,
+                                                         weight,
-                                                 firstn, 0, NULL) <= outpos) {
+                                                         x, outpos+1, 0,
-                                        reject = 1;
+                                                         out2, outpos,
-                                } else {
+                                                         firstn, 0,
+                                                         NULL) <= outpos)
+                                                        /* didn't get leaf */
+                                                        reject = 1;
+                                        } else {
+                                                /* we already have a leaf! */
+                                                out2[outpos] = item;
+                                        }
+                                }
+                                if (!reject) {
                                        /* out? */
                                        if (itemtype == 0)
                                                reject = is_out(map, weight,
@@ -424,12 +437,12 @@ reject:
                        continue;
                }
-                dprintk("choose got %d\n", item);
+                dprintk("CHOOSE got %d\n", item);
                out[outpos] = item;
                outpos++;
        }
-        dprintk("choose returns %d\n", outpos);
+        dprintk("CHOOSE returns %d\n", outpos);
        return outpos;
 }
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3be33fb066cc..f2f5332ddbba 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 static int caps_show(struct seq_file *s, void *p)
 {
-        struct ceph_client *client = p;
+        struct ceph_client *client = s->private;
        int total, avail, used, reserved, min;
        ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f85719310db2..f94ed3c7f6a5 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        spin_lock(&inode->i_lock);
        if ((filp->f_pos == 2 || fi->dentry) &&
            !ceph_test_opt(client, NOASYNCREADDIR) &&
+            ceph_snap(inode) != CEPH_SNAPDIR &&
            (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
            __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
                err = __dcache_readdir(filp, dirent, filldir);
@@ -1013,18 +1014,22 @@ out_touch:
 /*
 * When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen.
+ * of the current dir gen or if this is in the snapshot namespace.
 */
 static void ceph_dentry_release(struct dentry *dentry)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
        struct inode *parent_inode = dentry->d_parent->d_inode;
+        u64 snapid = ceph_snap(parent_inode);
-        if (parent_inode) {
+        dout("dentry_release %p parent %p\n", dentry, parent_inode);
+        if (parent_inode && snapid != CEPH_SNAPDIR) {
                struct ceph_inode_info *ci = ceph_inode(parent_inode);
                spin_lock(&parent_inode->i_lock);
-                if (ci->i_shared_gen == di->lease_shared_gen) {
+                if (ci->i_shared_gen == di->lease_shared_gen ||
+                    snapid <= CEPH_MAXSNAP) {
                        dout(" clearing %p complete (d_release)\n",
                             parent_inode);
                        ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = {
 struct dentry_operations ceph_snapdir_dentry_ops = {
        .d_revalidate = ceph_snapdir_d_revalidate,
+        .d_release = ceph_dentry_release,
 };
 struct dentry_operations ceph_snap_dentry_ops = {
+        .d_release = ceph_dentry_release,
 };
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6251a1574b94..7c08698fad3e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
        kmem_cache_free(ceph_file_cachep, cf);
        /* wake up anyone waiting for caps on this inode */
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        return 0;
 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ab47f46ca282..389f9dbd9949 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
                d_drop(dn);
        realdn = d_materialise_unique(dn, in);
        if (IS_ERR(realdn)) {
-                pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
+                pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
-                       dn, in, ceph_vinop(in));
+                       PTR_ERR(realdn), dn, in, ceph_vinop(in));
                if (prehash)
                        *prehash = false; /* don't rehash on error */
                dn = realdn; /* note realdn contains the error */
@@ -1199,8 +1199,10 @@ retry_lookup:
                                goto out;
                        }
                        err = ceph_init_dentry(dn);
-                        if (err < 0)
+                        if (err < 0) {
+                                dput(dn);
                                goto out;
+                        }
                } else if (dn->d_inode &&
                           (ceph_ino(dn->d_inode) != vino.ino ||
                            ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1234,18 +1236,23 @@ retry_lookup:
                                goto out;
                        }
                        dn = splice_dentry(dn, in, NULL);
+                        if (IS_ERR(dn))
+                                dn = NULL;
                }
                if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
                               req->r_request_started, -1,
                               &req->r_caps_reservation) < 0) {
                        pr_err("fill_inode badness on %p\n", in);
-                        dput(dn);
+                        goto next_item;
-                        continue;
                }
-                update_dentry_lease(dn, rinfo->dir_dlease[i],
+                if (dn)
-                                    req->r_session, req->r_request_started);
+                        update_dentry_lease(dn, rinfo->dir_dlease[i],
-                dput(dn);
+                                            req->r_session,
+                                            req->r_request_started);
+next_item:
+                if (dn)
+                        dput(dn);
        }
        req->r_did_prepopulate = true;
@@ -1494,7 +1501,7 @@ retry:
        if (wrbuffer_refs == 0)
                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
        if (wake)
-                wake_up(&ci->i_cap_wq);
+                wake_up_all(&ci->i_cap_wq);
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1766947fc07a..dd440bd438a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        wake_up(&ci->i_cap_wq);
+        wake_up_all(&ci->i_cap_wq);
        if (arg) {
                spin_lock(&inode->i_lock);
                ci->i_wanted_max_size = 0;
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        ceph_encode_filepath(&p, end, ino1, path1);
        ceph_encode_filepath(&p, end, ino2, path2);
+        /* make note of release offset, in case we need to replay */
+        req->r_request_release_offset = p - msg->front.iov_base;
        /* cap releases */
        releases = 0;
        if (req->r_inode_drop)
@@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
        if (req->r_callback)
                req->r_callback(mdsc, req);
        else
-                complete(&req->r_completion);
+                complete_all(&req->r_completion);
 }
 /*
@@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+        if (req->r_got_unsafe) {
+                /*
+                 * Replay.  Do not regenerate message (and rebuild
+                 * paths, etc.); just use the original message.
+                 * Rebuilding paths will break for renames because
+                 * d_move mangles the src name.
+                 */
+                msg = req->r_request;
+                rhead = msg->front.iov_base;
+                flags = le32_to_cpu(rhead->flags);
+                flags |= CEPH_MDS_FLAG_REPLAY;
+                rhead->flags = cpu_to_le32(flags);
+                if (req->r_target_inode)
+                        rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+                rhead->num_retry = req->r_attempts - 1;
+                /* remove cap/dentry releases from message */
+                rhead->num_releases = 0;
+                msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+                msg->front.iov_len = req->r_request_release_offset;
+                return 0;
+        }
        if (req->r_request) {
                ceph_msg_put(req->r_request);
                req->r_request = NULL;
@@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
+        rhead->ino = 0;
        dout(" r_locked_dir = %p\n", req->r_locked_dir);
-        if (req->r_target_inode && req->r_got_unsafe)
-                rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-        else
-                rhead->ino = 0;
        return 0;
 }
@@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (head->safe) {
                req->r_got_safe = true;
                __unregister_request(mdsc, req);
-                complete(&req->r_safe_completion);
+                complete_all(&req->r_safe_completion);
                if (req->r_got_unsafe) {
                        /*
@@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        /* last unsafe request during umount? */
                        if (mdsc->stopping && !__get_oldest_req(mdsc))
-                                complete(&mdsc->safe_umount_waiters);
+                                complete_all(&mdsc->safe_umount_waiters);
                        mutex_unlock(&mdsc->mutex);
                        goto out;
                }
@@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session,
                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
-                complete(&mdsc->session_close_waiters);
+                complete_all(&mdsc->session_close_waiters);
                kick_requests(mdsc, mds);
                break;
@@ -2783,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
        drop_leases(mdsc);
        ceph_flush_dirty_caps(mdsc);
        wait_requests(mdsc);
+        /*
+         * wait for reply handlers to drop their request refs and
+         * their inode/dcache refs
+         */
+        ceph_msgr_flush();
 }
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b292fa42a66d..952410c60d09 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
        int r_old_inode_drop, r_old_inode_unless;
        struct ceph_msg  *r_request;  /* original request */
+        int r_request_release_offset;
        struct ceph_msg  *r_reply;
        struct ceph_mds_reply_info_parsed r_reply_info;
        int r_err;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 64b8b1f7863d..15167b2daa55 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
 * nicely render a sockaddr as a string.
 */
 #define MAX_ADDR_STR 20
-static char addr_str[MAX_ADDR_STR][40];
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static DEFINE_SPINLOCK(addr_str_lock);
 static int last_addr_str;
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
        int i;
        char *s;
        struct sockaddr_in *in4 = (void *)ss;
-        unsigned char *quad = (void *)&in4->sin_addr.s_addr;
        struct sockaddr_in6 *in6 = (void *)ss;
        spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
        switch (ss->ss_family) {
        case AF_INET:
-                sprintf(s, "%u.%u.%u.%u:%u",
+                snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-                        (unsigned int)quad[0],
+                         (unsigned int)ntohs(in4->sin_port));
-                        (unsigned int)quad[1],
-                        (unsigned int)quad[2],
-                        (unsigned int)quad[3],
-                        (unsigned int)ntohs(in4->sin_port));
                break;
        case AF_INET6:
-                sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+                snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-                        in6->sin6_addr.s6_addr16[0],
+                         (unsigned int)ntohs(in6->sin6_port));
-                        in6->sin6_addr.s6_addr16[1],
-                        in6->sin6_addr.s6_addr16[2],
-                        in6->sin6_addr.s6_addr16[3],
-                        in6->sin6_addr.s6_addr16[4],
-                        in6->sin6_addr.s6_addr16[5],
-                        in6->sin6_addr.s6_addr16[6],
-                        in6->sin6_addr.s6_addr16[7],
-                        (unsigned int)ntohs(in6->sin6_port));
                break;
        default:
@@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
 */
 static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 {
-        struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+        struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
        struct socket *sock;
        int ret;
        BUG_ON(con->sock);
-        ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+        ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+                               IPPROTO_TCP, &sock);
        if (ret)
                return ERR_PTR(ret);
        con->sock = sock;
@@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
        dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-        ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+        ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+                                 O_NONBLOCK);
        if (ret == -EINPROGRESS) {
                dout("connect %s EINPROGRESS sk_state = %u\n",
                     pr_addr(&con->peer_addr.in_addr),
@@ -657,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
        dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
             con->connect_seq, global_seq, proto);
-        con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
+        con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
        con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
        con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
        con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
                struct sockaddr_in *in4 = (void *)ss;
                struct sockaddr_in6 *in6 = (void *)ss;
                int port;
+                char delim = ',';
+                if (*p == '[') {
+                        delim = ']';
+                        p++;
+                }
                memset(ss, 0, sizeof(*ss));
                if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-                             ',', &ipend)) {
+                             delim, &ipend))
                        ss->ss_family = AF_INET;
-                } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+                else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-                                    ',', &ipend)) {
+                                  delim, &ipend))
                        ss->ss_family = AF_INET6;
-                } else {
+                else
                        goto bad;
-                }
                p = ipend;
+                if (delim == ']') {
+                        if (*p != ']') {
+                                dout("missing matching ']'\n");
+                                goto bad;
+                        }
+                        p++;
+                }
                /* port? */
                if (p < end && *p == ':') {
                        port = 0;
@@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
        return 0;
 bad:
-        pr_err("parse_ips bad ip '%s'\n", c);
+        pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
        return -EINVAL;
 }
@@ -1396,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con)
        if (!con->in_msg) {
                dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
                     con->in_hdr.front_len, con->in_hdr.data_len);
+                skip = 0;
                con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
                if (skip) {
                        /* skip this message */
                        dout("alloc_msg said skip message\n");
+                        BUG_ON(con->in_msg);
                        con->in_base_pos = -front_len - middle_len - data_len -
                                sizeof(m->footer);
                        con->in_tag = CEPH_MSGR_TAG_READY;
@@ -2013,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
        mutex_lock(&con->mutex);
        if (!list_empty(&msg->list_head)) {
-                dout("con_revoke %p msg %p\n", con, msg);
+                dout("con_revoke %p msg %p - was on queue\n", con, msg);
                list_del_init(&msg->list_head);
                ceph_msg_put(msg);
                msg->hdr.seq = 0;
-                if (con->out_msg == msg) {
+        }
-                        ceph_msg_put(con->out_msg);
+        if (con->out_msg == msg) {
-                        con->out_msg = NULL;
+                dout("con_revoke %p msg %p - was sending\n", con, msg);
-                }
+                con->out_msg = NULL;
                if (con->out_kvec_is_msg) {
                        con->out_skip = con->out_kvec_bytes;
                        con->out_kvec_is_msg = false;
                }
-        } else {
+                ceph_msg_put(msg);
-                dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+                msg->hdr.seq = 0;
        }
        mutex_unlock(&con->mutex);
 }
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 07a539906e67..54fe01c50706 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 out:
        mutex_unlock(&monc->mutex);
-        wake_up(&client->auth_wq);
+        wake_up_all(&client->auth_wq);
 }
 /*
@@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
        }
        mutex_unlock(&monc->mutex);
        if (req) {
-                complete(&req->completion);
+                complete_all(&req->completion);
                put_generic_request(req);
        }
        return;
@@ -718,14 +718,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                                     monc->m_auth->front_max);
        if (ret < 0) {
                monc->client->auth_err = ret;
-                wake_up(&monc->client->auth_wq);
+                wake_up_all(&monc->client->auth_wq);
        } else if (ret > 0) {
                __send_prepared_auth_request(monc, ret);
        } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
                dout("authenticated, starting session\n");
                monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-                monc->client->msgr->inst.name.num = monc->auth->global_id;
+                monc->client->msgr->inst.name.num =
+                                        cpu_to_le64(monc->auth->global_id);
                __send_subscribe(monc);
                __resend_generic_request(monc);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d25b4add85b4..e38522347898 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
        if (req->r_callback)
                req->r_callback(req, msg);
        else
-                complete(&req->r_completion);
+                complete_all(&req->r_completion);
        if (flags & CEPH_OSD_FLAG_ONDISK) {
                if (req->r_safe_callback)
                        req->r_safe_callback(req, msg);
-                complete(&req->r_safe_completion);  /* fsync waiter */
+                complete_all(&req->r_safe_completion);  /* fsync waiter */
        }
 done:
@@ -1083,7 +1083,7 @@ done:
        if (newmap)
                kick_requests(osdc, NULL);
        up_read(&osdc->map_sem);
-        wake_up(&osdc->client->auth_wq);
+        wake_up_all(&osdc->client->auth_wq);
        return;
 bad:
@@ -1344,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
        int type = le16_to_cpu(msg->hdr.type);
        if (!osd)
-                return;
+                goto out;
        osdc = osd->o_osdc;
        switch (type) {
@@ -1359,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
                pr_err("received unknown message type %d %s\n", type,
                       ceph_msg_type_name(type));
        }
+out:
        ceph_msg_put(msg);
 }
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index ddc656fb5c05..416d46adbf87 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
                if (ev > CEPH_PG_POOL_VERSION) {
                        pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
                                   ev, CEPH_PG_POOL_VERSION);
+                        kfree(pi);
                        goto bad;
                }
                __decode_pool(p, pi);
@@ -707,6 +708,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                newcrush = crush_decode(*p, min(*p+len, end));
                if (IS_ERR(newcrush))
                        return ERR_CAST(newcrush);
+                *p += len;
        }
        /* new flags? */
@@ -829,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                /* remove any? */
                while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
                                                node)->pgid, pgid) <= 0) {
-                        struct rb_node *cur = rbp;
+                        struct ceph_pg_mapping *cur =
+                                rb_entry(rbp, struct ceph_pg_mapping, node);
+                        
                        rbp = rb_next(rbp);
-                        dout(" removed pg_temp %llx\n",
+                        dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                             *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+                        rb_erase(&cur->node, &map->pg_temp);
-                                               node)->pgid);
+                        kfree(cur);
-                        rb_erase(cur, &map->pg_temp);
                }
                if (pglen) {
@@ -850,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                        for (j = 0; j < pglen; j++)
                                pg->osds[j] = ceph_decode_32(p);
                        err = __insert_pg_mapping(pg, &map->pg_temp);
-                        if (err)
+                        if (err) {
+                                kfree(pg);
                                goto bad;
+                        }
                        dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
                             pglen);
                }
        }
        while (rbp) {
-                struct rb_node *cur = rbp;
+                struct ceph_pg_mapping *cur =
+                        rb_entry(rbp, struct ceph_pg_mapping, node);
                rbp = rb_next(rbp);
-                dout(" removed pg_temp %llx\n",
+                dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                     *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+                rb_erase(&cur->node, &map->pg_temp);
-                                       node)->pgid);
+                kfree(cur);
-                rb_erase(cur, &map->pg_temp);
        }
        /* ignore the rest */
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..f80a4f25123c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/tty.h>
 #include "internal.h"
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f352596807..917b7d449bb2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,6 @@ config CIFS
        tristate "CIFS support (advanced network filesystem, SMBFS successor)"
        depends on INET
        select NLS
-        select SLOW_WORK
        help
          This is the client VFS module for the Common Internet File System
          (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH
          If unsure, say N.
 config CIFS_UPCALL
-          bool "Kerberos/SPNEGO advanced session setup"
+        bool "Kerberos/SPNEGO advanced session setup"
-          depends on CIFS && KEYS
+        depends on CIFS && KEYS
-          help
+        select DNS_RESOLVER
-            Enables an upcall mechanism for CIFS which accesses
+        help
-            userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+          Enables an upcall mechanism for CIFS which accesses userspace helper
-            Kerberos tickets which are needed to mount to certain secure servers
+          utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
-            (for which more secure Kerberos authentication is required). If
+          which are needed to mount to certain secure servers (for which more
-            unsure, say N.
+          secure Kerberos authentication is required). If unsure, say N.
 config CIFS_XATTR
        bool "CIFS extended attributes"
@@ -122,6 +121,7 @@ config CIFS_DEBUG2
 config CIFS_DFS_UPCALL
          bool "DFS feature support"
          depends on CIFS && KEYS
+          select DNS_RESOLVER
          help
            Distributed File System (DFS) support is used to access shares
            transparently in an enterprise name space, even if the share
@@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL
            IP addresses) which is needed for implicit mounts of DFS junction
            points. If unsure, say N.
+config CIFS_FSCACHE
+          bool "Provide CIFS client caching support (EXPERIMENTAL)"
+          depends on EXPERIMENTAL
+          depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
+          help
+            Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
+            to be cached locally on disk through the general filesystem cache
+            manager. If unsure, say N.
 config CIFS_EXPERIMENTAL
          bool "CIFS Experimental Features (EXPERIMENTAL)"
          depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e86..adefa60a9bdc 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
 cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
+cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075f..a7081eeeb85d 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -568,8 +568,9 @@ module can be displayed via modinfo.
 Misc /proc/fs/cifs Flags and Debug Info
 =======================================
 Informational pseudo-files:
-DebugData               Displays information about active CIFS sessions
+DebugData               Displays information about active CIFS sessions and
-                        and shares, as well as the cifs.ko version.
+                        shares, features enabled, as well as the cifs.ko
+                        version.
 Stats                   Lists summary resource usage information as well as per
                        share statistics, if CONFIG_CIFS_STATS in enabled
                        in the kernel configuration.
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 000000000000..224d7bbd1fcc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
+/*
+ *   fs/cifs/cache.c - CIFS filesystem cache index structure definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifs_debug.h"
+/*
+ * CIFS filesystem definition for FS-Cache
+ *
+ * This is the object handed to fscache_register_netfs() and
+ * fscache_unregister_netfs() by cifs_fscache_register() and
+ * cifs_fscache_unregister() below.
+ */
+struct fscache_netfs cifs_fscache_netfs = {
+        .name = "cifs",
+        .version = 0,
+};
+/*
+ * Register CIFS for caching with FS-Cache
+ *
+ * Returns the result of fscache_register_netfs() unchanged.
+ */
+int cifs_fscache_register(void)
+{
+        return fscache_register_netfs(&cifs_fscache_netfs);
+}
+/*
+ * Unregister CIFS for caching
+ *
+ * Counterpart of cifs_fscache_register(); passes the same netfs definition
+ * to fscache_unregister_netfs().
+ */
+void cifs_fscache_unregister(void)
+{
+        fscache_unregister_netfs(&cifs_fscache_netfs);
+}
+/*
+ * Key layout of CIFS server cache index object
+ *
+ * The fixed header (family, port) is followed directly by the address.
+ * addr[] is declared with zero length so that sizeof(struct cifs_server_key)
+ * covers only the header; cifs_server_get_key() adds the size of the IPv4
+ * or IPv6 address it actually stores to the key length.
+ */
+struct cifs_server_key {
+        uint16_t        family;         /* address family */
+        uint16_t        port;           /* IP port */
+        union {
+                struct in_addr  ipv4_addr;
+                struct in6_addr ipv6_addr;
+        } addr[0];
+};
+/*
+ * Server object keyed by {IPaddress,port,family} tuple
+ *
+ * Writes a struct cifs_server_key header followed by the server's IPv4 or
+ * IPv6 address into @buffer and returns the number of bytes written.
+ * Returns 0 (no key) when the address family is not AF_INET/AF_INET6 or
+ * when the key would not fit into @maxbuf bytes; in both cases @buffer is
+ * left untouched.
+ */
+static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
+                                   void *buffer, uint16_t maxbuf)
+{
+        const struct TCP_Server_Info *server = cookie_netfs_data;
+        const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
+        struct cifs_server_key *key = buffer;
+        uint16_t key_len = sizeof(struct cifs_server_key);
+        /*
+         * Work out the full key size first so that nothing is ever
+         * written beyond the maxbuf bytes the caller gave us.
+         *
+         * Should not be a problem as sin_family/sin6_family overlays
+         * sa_family field
+         */
+        switch (sa->sa_family) {
+        case AF_INET:
+                key_len += sizeof(key->addr[0].ipv4_addr);
+                break;
+        case AF_INET6:
+                key_len += sizeof(key->addr[0].ipv6_addr);
+                break;
+        default:
+                cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
+                return 0;
+        }
+        if (key_len > maxbuf)
+                return 0;
+        memset(key, 0, key_len);
+        if (sa->sa_family == AF_INET) {
+                key->family = server->addr.sockAddr.sin_family;
+                key->port = server->addr.sockAddr.sin_port;
+                key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
+        } else {
+                key->family = server->addr.sockAddr6.sin6_family;
+                key->port = server->addr.sockAddr6.sin6_port;
+                key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
+        }
+        return key_len;
+}
+/*
+ * Server object for FS-Cache
+ *
+ * An index cookie (FSCACHE_COOKIE_TYPE_INDEX) whose key is produced by
+ * cifs_server_get_key(); no auxiliary-data callbacks are set.
+ */
+const struct fscache_cookie_def cifs_fscache_server_index_def = {
+        .name = "CIFS.server",
+        .type = FSCACHE_COOKIE_TYPE_INDEX,
+        .get_key = cifs_server_get_key,
+};
+/*
+ * Auxiliary data attached to CIFS superblock within the cache
+ *
+ * Filled from tcon->resource_id by cifs_fscache_super_get_aux() and
+ * compared byte-for-byte by cifs_fscache_super_check_aux().
+ */
+struct cifs_fscache_super_auxdata {
+        u64     resource_id;            /* unique server resource id */
+};
+/*
+ * Return a newly allocated copy of the share component of @treename.
+ *
+ * The first two characters of @treename are skipped, then everything after
+ * the next '\\' is duplicated.  Returns ERR_PTR(-EINVAL) if no such
+ * separator exists and ERR_PTR(-ENOMEM) if the allocation fails.  The
+ * caller owns the returned string and must kfree() it.
+ */
+static char *extract_sharename(const char *treename)
+{
+        char *sep, *share;
+        /* step over the two leading characters, then find the '\\' that
+         * precedes the share name */
+        sep = strchr(treename + 2, '\\');
+        if (!sep)
+                return ERR_PTR(-EINVAL);
+        /* duplicate everything after the separator; caller frees */
+        share = kstrdup(sep + 1, GFP_KERNEL);
+        return share ? share : ERR_PTR(-ENOMEM);
+}
+/*
+ * Superblock object currently keyed by share name
+ *
+ * Copies the share name extracted from tcon->treeName into @buffer (without
+ * a terminating NUL) and returns its length.  Returns 0 (no key) when the
+ * share name cannot be extracted or does not fit into @maxbuf bytes.  The
+ * temporary copy made by extract_sharename() is freed on every path.
+ */
+static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
+                                   uint16_t maxbuf)
+{
+        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        char *sharename;
+        size_t len;
+        sharename = extract_sharename(tcon->treeName);
+        if (IS_ERR(sharename)) {
+                cFYI(1, "CIFS: couldn't extract sharename\n");
+                return 0;
+        }
+        /* compare the untruncated length so an oversized name cannot wrap */
+        len = strlen(sharename);
+        if (len > maxbuf)
+                len = 0;        /* key does not fit: report "no key" */
+        else
+                memcpy(buffer, sharename, len);
+        /* we own the copy returned by extract_sharename() */
+        kfree(sharename);
+        return len;
+}
+/*
+ * Copy the superblock auxiliary data (the tcon's resource id) into @buffer.
+ * At most sizeof(struct cifs_fscache_super_auxdata) bytes, and never more
+ * than @maxbuf, are copied; the number of bytes copied is returned.
+ */
+static uint16_t
+cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
+                           uint16_t maxbuf)
+{
+        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        struct cifs_fscache_super_auxdata aux;
+        uint16_t copied = maxbuf;
+        /* zero first so the copied bytes are fully defined */
+        memset(&aux, 0, sizeof(aux));
+        aux.resource_id = tcon->resource_id;
+        if (copied > sizeof(aux))
+                copied = sizeof(aux);
+        memcpy(buffer, &aux, copied);
+        return copied;
+}
+/*
+ * Compare auxiliary data read back from the cache with what this tcon would
+ * store now.  Returns FSCACHE_CHECKAUX_OKAY only if @datalen matches the
+ * auxdata size and the bytes are identical; FSCACHE_CHECKAUX_OBSOLETE
+ * otherwise.
+ */
+static enum
+fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
+                                              const void *data,
+                                              uint16_t datalen)
+{
+        const struct cifsTconInfo *tcon = cookie_netfs_data;
+        struct cifs_fscache_super_auxdata expected;
+        if (datalen != sizeof(expected))
+                return FSCACHE_CHECKAUX_OBSOLETE;
+        /* rebuild the reference copy exactly as get_aux does */
+        memset(&expected, 0, sizeof(expected));
+        expected.resource_id = tcon->resource_id;
+        return memcmp(data, &expected, datalen) ?
+                FSCACHE_CHECKAUX_OBSOLETE : FSCACHE_CHECKAUX_OKAY;
+}
+/*
+ * Superblock object for FS-Cache
+ *
+ * An index cookie keyed by share name (cifs_super_get_key) whose auxiliary
+ * data is the tcon's resource id (get_aux/check_aux callbacks).
+ */
+const struct fscache_cookie_def cifs_fscache_super_index_def = {
+        .name = "CIFS.super",
+        .type = FSCACHE_COOKIE_TYPE_INDEX,
+        .get_key = cifs_super_get_key,
+        .get_aux = cifs_fscache_super_get_aux,
+        .check_aux = cifs_fscache_super_check_aux,
+};
+/*
+ * Auxiliary data attached to CIFS inode within the cache
+ *
+ * Filled by cifs_fscache_inode_get_aux() and compared byte-for-byte by
+ * cifs_fscache_inode_check_aux().
+ */
+struct cifs_fscache_inode_auxdata {
+        struct timespec last_write_time;        /* copied from vfs_inode.i_mtime */
+        struct timespec last_change_time;       /* copied from vfs_inode.i_ctime */
+        u64             eof;                    /* copied from cifsi->server_eof */
+};
+/*
+ * Inode object keyed by the uniqueid stored in the cifsInodeInfo.
+ * Copies the uniqueid into @buffer and returns its size, or returns 0
+ * without touching @buffer when it does not fit into @maxbuf bytes.
+ */
+static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t maxbuf)
+{
+        const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+        const uint16_t id_size = sizeof(cifsi->uniqueid);
+        if (id_size > maxbuf)
+                return 0;
+        /* use the UniqueId as the key */
+        memcpy(buffer, &cifsi->uniqueid, id_size);
+        return id_size;
+}
+/*
+ * Report the inode's current i_size through @size.
+ */
+static void
+cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
+{
+        const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+        const struct inode *inode = &cifsi->vfs_inode;
+        *size = inode->i_size;
+}
+/*
+ * Copy the inode auxiliary data (server EOF, mtime, ctime) into @buffer.
+ * At most sizeof(struct cifs_fscache_inode_auxdata) bytes, and never more
+ * than @maxbuf, are copied; the number of bytes copied is returned.
+ */
+static uint16_t
+cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
+                           uint16_t maxbuf)
+{
+        const struct cifsInodeInfo *cifsi = cookie_netfs_data;
+        struct cifs_fscache_inode_auxdata aux;
+        uint16_t copied = maxbuf;
+        /* zero first so padding bytes are well defined */
+        memset(&aux, 0, sizeof(aux));
+        aux.last_write_time = cifsi->vfs_inode.i_mtime;
+        aux.last_change_time = cifsi->vfs_inode.i_ctime;
+        aux.eof = cifsi->server_eof;
+        if (copied > sizeof(aux))
+                copied = sizeof(aux);
+        memcpy(buffer, &aux, copied);
+        return copied;
+}
+/*
+ * Compare auxiliary data read back from the cache with the inode's current
+ * server EOF, mtime and ctime.  Returns FSCACHE_CHECKAUX_OKAY only if
+ * @datalen matches the auxdata size and the bytes are identical;
+ * FSCACHE_CHECKAUX_OBSOLETE otherwise.
+ */
+static enum
+fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
+                                              const void *data,
+                                              uint16_t datalen)
+{
+        struct cifsInodeInfo *cifsi = cookie_netfs_data;
+        struct cifs_fscache_inode_auxdata expected;
+        if (datalen != sizeof(expected))
+                return FSCACHE_CHECKAUX_OBSOLETE;
+        /* rebuild the reference copy exactly as get_aux does */
+        memset(&expected, 0, sizeof(expected));
+        expected.last_write_time = cifsi->vfs_inode.i_mtime;
+        expected.last_change_time = cifsi->vfs_inode.i_ctime;
+        expected.eof = cifsi->server_eof;
+        return memcmp(data, &expected, datalen) ?
+                FSCACHE_CHECKAUX_OBSOLETE : FSCACHE_CHECKAUX_OKAY;
+}
+/*
+ * Walk every page currently in the inode's page cache, in batches of up to
+ * PAGEVEC_SIZE, and clear the FsCache page flag on each of them.
+ */
+static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+        struct cifsInodeInfo *cifsi = cookie_netfs_data;
+        struct pagevec pvec;
+        pgoff_t next = 0;
+        int i, found;
+        pagevec_init(&pvec, 0);
+        cFYI(1, "cifs inode 0x%p now uncached", cifsi);
+        while ((found = pagevec_lookup(&pvec,
+                                       cifsi->vfs_inode.i_mapping, next,
+                                       PAGEVEC_SIZE - pagevec_count(&pvec))) != 0) {
+                for (i = 0; i < found; i++)
+                        ClearPageFsCache(pvec.pages[i]);
+                /* resume the next lookup just past the last page seen */
+                next = pvec.pages[found - 1]->index + 1;
+                pvec.nr = found;
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+}
+const struct fscache_cookie_def cifs_fscache_inode_object_def = { /* per-inode data file cookie */
+        .name           = "CIFS.uniqueid",
+        .type           = FSCACHE_COOKIE_TYPE_DATAFILE,
+        .get_key        = cifs_fscache_inode_get_key,   /* key is cifsi->uniqueid */
+        .get_attr       = cifs_fscache_inode_get_attr,  /* reports vfs_inode.i_size */
+        .get_aux        = cifs_fscache_inode_get_aux,   /* server_eof, mtime, ctime */
+        .check_aux      = cifs_fscache_inode_check_aux, /* byte-compares the same fields */
+        .now_uncached   = cifs_fscache_inode_now_uncached, /* clears FsCache flag on cached pages */
+};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4fce6e61b34e..eb1ba493489f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
                    "Display Internal CIFS Data Structures for Debugging\n"
                    "---------------------------------------------------\n");
        seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+        seq_printf(m, "Features: ");
+#ifdef CONFIG_CIFS_DFS_UPCALL
+        seq_printf(m, "dfs");
+        seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_FSCACHE
+        seq_printf(m, "fscache");
+        seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+        seq_printf(m, "lanman");
+        seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_POSIX
+        seq_printf(m, "posix");
+        seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+        seq_printf(m, "spnego");
+        seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_XATTR
+        seq_printf(m, "xattr");
+#endif
+        seq_putc(m, '\n');
        seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
        seq_printf(m, "Servers:");
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ac19a6f3dae0..d6ced7aa23cf 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -141,7 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
        }
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
-        if (rc != 0) {
+        if (rc < 0) {
                cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
                          __func__, *devname, rc);
                goto compose_mount_options_err;
@@ -150,8 +150,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
         * assuming that we have 'unc=' and 'ip=' in
         * the original sb_mountdata
         */
-        md_len = strlen(sb_mountdata) + strlen(srvIP) +
+        md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
-                strlen(ref->node_name) + 12;
        mountdata = kzalloc(md_len+1, GFP_KERNEL);
        if (mountdata == NULL) {
                rc = -ENOMEM;
@@ -230,28 +229,22 @@ compose_mount_options_err:
        goto compose_mount_options_out;
 }
+/**
-static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
+ * cifs_dfs_do_refmount - mounts specified path using provided referral
-                struct dentry *dentry, const struct dfs_info3_param *ref)
+ * @cifs_sb:            parent/root superblock
+ * @fullpath:           full path in UNC format
+ * @ref:                server's referral
+ */
+static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+                const char *fullpath, const struct dfs_info3_param *ref)
 {
-        struct cifs_sb_info *cifs_sb;
        struct vfsmount *mnt;
        char *mountdata;
        char *devname = NULL;
-        char *fullpath;
-        cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-        /*
-         * this function gives us a path with a double backslash prefix. We
-         * require a single backslash for DFS.
-         */
-        fullpath = build_path_from_dentry(dentry);
-        if (!fullpath)
-                return ERR_PTR(-ENOMEM);
+        /* strip first '\' from fullpath */
        mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
                        fullpath + 1, ref, &devname);
-        kfree(fullpath);
        if (IS_ERR(mountdata))
                return (struct vfsmount *)mountdata;
@@ -357,8 +350,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
                        rc = -EINVAL;
                        goto out_err;
                }
-                mnt = cifs_dfs_do_refmount(nd->path.mnt,
+                mnt = cifs_dfs_do_refmount(cifs_sb,
-                                nd->path.dentry, referrals + i);
+                                full_path, referrals + i);
                cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
                                        referrals[i].node_name, mnt);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb913..9e771450c3b8 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -35,6 +35,7 @@
 #define CIFS_MOUNT_DYNPERM      0x1000 /* allow in-memory only mode setting   */
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
 #define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
+#define CIFS_MOUNT_FSCACHE      0x8000 /* local caching enabled */
 struct cifs_sb_info {
        struct cifsTconInfo *tcon;      /* primary mount */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 379bd7d9c05f..87044906cd1f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
 /* strlen of ";uid=0x" */
 #define UID_KEY_LEN             7
+/* strlen of ";creduid=0x" */
+#define CREDUID_KEY_LEN         11
 /* strlen of ";user=" */
 #define USER_KEY_LEN            6
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
                   IP_KEY_LEN + INET6_ADDRSTRLEN +
                   MAX_MECH_STR_LEN +
                   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+                   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
                   USER_KEY_LEN + strlen(sesInfo->userName) +
                   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
@@ -144,6 +148,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
        sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
        dp = description + strlen(description);
+        sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+        dp = description + strlen(description);
        sprintf(dp, ";user=%s", sesInfo->userName);
        dp = description + strlen(description);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 484e52bb40bb..a5ed10c9afef 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -45,8 +45,8 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
-#include "dns_resolve.h"
 #include "cifs_spnego.h"
+#include "fscache.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42    /* the first four bytes of SMB PDUs */
 int cifsFYI = 0;
@@ -329,6 +329,12 @@ cifs_destroy_inode(struct inode *inode)
 }
 static void
+cifs_clear_inode(struct inode *inode)   /* installed as .clear_inode in cifs_super_ops */
+{
+        /* NOTE(review): presumably drops the inode's FS-Cache cookie; the helper lives in fscache.h/fscache.c and is not visible here */
+        cifs_fscache_release_inode_cookie(inode);
+}
+static void
 cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 {
        seq_printf(s, ",addr=");
@@ -489,6 +495,7 @@ static const struct super_operations cifs_super_ops = {
        .alloc_inode = cifs_alloc_inode,
        .destroy_inode = cifs_destroy_inode,
        .drop_inode     = cifs_drop_inode,
+        .clear_inode    = cifs_clear_inode,
 /*      .delete_inode   = cifs_delete_inode,  */  /* Do not need above
        function unless later we add lazy close of inodes or unless the
        kernel forgets to call us with the same number of releases (closes)
@@ -902,6 +909,10 @@ init_cifs(void)
                cFYI(1, "cifs_max_pending set to max of 256");
        }
+        rc = cifs_fscache_register();
+        if (rc)
+                goto out;
        rc = cifs_init_inodecache();
        if (rc)
                goto out_clean_proc;
@@ -922,27 +933,13 @@ init_cifs(void)
        if (rc)
                goto out_unregister_filesystem;
 #endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
-        rc = register_key_type(&key_type_dns_resolver);
-        if (rc)
-                goto out_unregister_key_type;
-#endif
-        rc = slow_work_register_user(THIS_MODULE);
-        if (rc)
-                goto out_unregister_resolver_key;
        return 0;
- out_unregister_resolver_key:
-#ifdef CONFIG_CIFS_DFS_UPCALL
-        unregister_key_type(&key_type_dns_resolver);
- out_unregister_key_type:
-#endif
 #ifdef CONFIG_CIFS_UPCALL
-        unregister_key_type(&cifs_spnego_key_type);
 out_unregister_filesystem:
-#endif
        unregister_filesystem(&cifs_fs_type);
+#endif
 out_destroy_request_bufs:
        cifs_destroy_request_bufs();
 out_destroy_mids:
@@ -951,6 +948,8 @@ init_cifs(void)
        cifs_destroy_inodecache();
 out_clean_proc:
        cifs_proc_clean();
+        cifs_fscache_unregister();
+ out:
        return rc;
 }
@@ -959,9 +958,9 @@ exit_cifs(void)
 {
        cFYI(DBG2, "exit_cifs");
        cifs_proc_clean();
+        cifs_fscache_unregister();
 #ifdef CONFIG_CIFS_DFS_UPCALL
        cifs_dfs_release_automount_timer();
-        unregister_key_type(&key_type_dns_resolver);
 #endif
 #ifdef CONFIG_CIFS_UPCALL
        unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a7eb65c84b1c..d82f5fb4761e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.64"
+#define CIFS_VERSION   "1.65"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a88479ceaad5..0cdfb8c32ac6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,10 +16,13 @@
 *   the GNU Lesser General Public License for more details.
 *
 */
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
-#include <linux/slow-work.h>
+#include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
 /*
@@ -34,7 +37,7 @@
 #define MAX_SHARE_SIZE  64      /* used to be 20, this should still be enough */
 #define MAX_USERNAME_SIZE 32    /* 32 is to allow for 15 char names + null
                                   termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 16
+#define MAX_PASSWORD_SIZE 512  /* max for windows seems to be 256 wide chars */
 #define CIFS_MIN_RCV_POOL 4
@@ -80,8 +83,7 @@ enum statusEnum {
 };
 enum securityEnum {
-        PLAINTXT = 0,           /* Legacy with Plaintext passwords */
+        LANMAN = 0,                     /* Legacy LANMAN auth */
-        LANMAN,                 /* Legacy LANMAN auth */
        NTLM,                   /* Legacy NTLM012 auth with NTLM hash */
        NTLMv2,                 /* Legacy NTLM auth with NTLMv2 hash */
        RawNTLMSSP,             /* NTLMSSP without SPNEGO, NTLMv2 hash */
@@ -142,7 +144,6 @@ struct TCP_Server_Info {
        struct list_head pending_mid_q;
        void *Server_NlsInfo;   /* BB - placeholder for future NLS info  */
        unsigned short server_codepage; /* codepage for the server    */
-        unsigned long ip_address;       /* IP addr for the server if known */
        enum protocolEnum protocolType;
        char versionMajor;
        char versionMinor;
@@ -190,19 +191,9 @@ struct TCP_Server_Info {
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
        bool    sec_ntlmssp;            /* supports NTLMSSP */
-};
+#ifdef CONFIG_CIFS_FSCACHE
+        struct fscache_cookie   *fscache; /* client index cache cookie */
-/*
+#endif
- * The following is our shortcut to user information.  We surface the uid,
- * and name. We always get the password on the fly in case it
- * has changed. We also hang a list of sessions owned by this user off here.
- */
-struct cifsUidInfo {
-        struct list_head userList;
-        struct list_head sessionList; /* SMB sessions for this user */
-        uid_t linux_uid;
-        char user[MAX_USERNAME_SIZE + 1];       /* ascii name of user */
-        /* BB may need ptr or callback for PAM or WinBind info */
 };
 /*
@@ -212,9 +203,6 @@ struct cifsSesInfo {
        struct list_head smb_ses_list;
        struct list_head tcon_list;
        struct mutex session_mutex;
-#if 0
-        struct cifsUidInfo *uidInfo;    /* pointer to user info */
-#endif
        struct TCP_Server_Info *server; /* pointer to server info */
        int ses_count;          /* reference counter */
        enum statusEnum status;
@@ -226,7 +214,8 @@ struct cifsSesInfo {
        char *serverNOS;        /* name of network operating system of server */
        char *serverDomain;     /* security realm of server */
        int Suid;               /* remote smb uid  */
-        uid_t linux_uid;        /* local Linux uid */
+        uid_t linux_uid;        /* overriding owner of files on the mount */
+        uid_t cred_uid;         /* owner of credentials */
        int capabilities;
        char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
                                TCP names - will ipv6 and sctp addresses fit? */
@@ -311,6 +300,10 @@ struct cifsTconInfo {
        bool local_lease:1; /* check leases (only) on local system not remote */
        bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
        bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_FSCACHE
+        u64 resource_id;                /* server resource id */
+        struct fscache_cookie *fscache; /* cookie for share */
+#endif
        /* BB add field for back pointer to sb struct(s)? */
 };
@@ -363,7 +356,7 @@ struct cifsFileInfo {
        atomic_t count;         /* reference count */
        struct mutex fh_mutex; /* prevents reopen race after dead ses*/
        struct cifs_search_info srch_inf;
-        struct slow_work oplock_break; /* slow_work job for oplock breaks */
+        struct work_struct oplock_break; /* work for oplock breaks */
 };
 /* Take a reference on the file private data */
@@ -398,6 +391,9 @@ struct cifsInodeInfo {
        bool invalid_mapping:1;         /* pagecache is invalid */
        u64  server_eof;                /* current file size on server */
        u64  uniqueid;                  /* server inode number */
+#ifdef CONFIG_CIFS_FSCACHE
+        struct fscache_cookie *fscache;
+#endif
        struct inode vfs_inode;
 };
@@ -732,4 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+void cifs_oplock_break(struct work_struct *work);
+void cifs_oplock_break_get(struct cifsFileInfo *cfile);
+void cifs_oplock_break_put(struct cifsFileInfo *cfile);
 extern const struct slow_work_ops cifs_oplock_break_ops;
+#endif  /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fb6318b81509..1f5450814087 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,7 +86,9 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
                        struct TCP_Server_Info *server);
-extern int cifs_convert_address(char *src, void *dst);
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+                                unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
                            const struct cifsTconInfo *, int /* length of
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2208f06e4c45..95c2ea67edfb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -48,6 +48,7 @@
 #include "nterr.h"
 #include "rfc1002pdu.h"
 #include "cn_cifs.h"
+#include "fscache.h"
 #define CIFS_PORT 445
 #define RFC1001_PORT 139
@@ -66,6 +67,7 @@ struct smb_vol {
        char *iocharset;  /* local code page for mapping to and from Unicode */
        char source_rfc1001_name[16]; /* netbios name of client */
        char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
+        uid_t cred_uid;
        uid_t linux_uid;
        gid_t linux_gid;
        mode_t file_mode;
@@ -97,6 +99,7 @@ struct smb_vol {
        bool noblocksnd:1;
        bool noautotune:1;
        bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
+        bool fsc:1;     /* enable fscache */
        unsigned int rsize;
        unsigned int wsize;
        bool sockopt_tcp_nodelay:1;
@@ -830,7 +833,8 @@ cifs_parse_mount_options(char *options, const char *devname,
        /* null target name indicates to use *SMBSERVR default called name
           if we end up sending RFC1001 session initialize */
        vol->target_rfc1001_name[0] = 0;
-        vol->linux_uid = current_uid();  /* use current_euid() instead? */
+        vol->cred_uid = current_uid();
+        vol->linux_uid = current_uid();
        vol->linux_gid = current_gid();
        /* default to only allowing write access to owner of the mount */
@@ -1257,6 +1261,12 @@ cifs_parse_mount_options(char *options, const char *devname,
                } else if ((strnicmp(data, "nocase", 6) == 0) ||
                           (strnicmp(data, "ignorecase", 10)  == 0)) {
                        vol->nocase = 1;
+                } else if (strnicmp(data, "mand", 4) == 0) {
+                        /* ignore */
+                } else if (strnicmp(data, "nomand", 6) == 0) {
+                        /* ignore */
+                } else if (strnicmp(data, "_netdev", 7) == 0) {
+                        /* ignore */
                } else if (strnicmp(data, "brl", 3) == 0) {
                        vol->nobrl =  0;
                } else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1331,6 +1341,8 @@ cifs_parse_mount_options(char *options, const char *devname,
                        printk(KERN_WARNING "CIFS: Mount option noac not "
                                "supported. Instead set "
                                "/proc/fs/cifs/LookupCacheEnabled to 0\n");
+                } else if (strnicmp(data, "fsc", 3) == 0) {
+                        vol->fsc = true;
                } else
                        printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
                                                data);
@@ -1380,18 +1392,92 @@ cifs_parse_mount_options(char *options, const char *devname,
        return 0;
 }
+/*
+ * Does @addr name the same endpoint as the address stored in @server?
+ *
+ * For AF_INET the IPv4 address must be equal; for AF_INET6 both the address
+ * and the scope id must be equal.  A zero port in @addr matches any server
+ * port, a non-zero port must be equal.  Any other address family is not
+ * compared at all and is reported as a match.
+ */
+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+        struct sockaddr_in *v4 = (struct sockaddr_in *)addr;
+        struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)addr;
+        switch (addr->sa_family) {
+        case AF_INET:
+                return v4->sin_addr.s_addr ==
+                                server->addr.sockAddr.sin_addr.s_addr &&
+                       (!v4->sin_port ||
+                        v4->sin_port == server->addr.sockAddr.sin_port);
+        case AF_INET6:
+                return ipv6_addr_equal(&v6->sin6_addr,
+                                       &server->addr.sockAddr6.sin6_addr) &&
+                       v6->sin6_scope_id ==
+                                server->addr.sockAddr6.sin6_scope_id &&
+                       (!v6->sin6_port ||
+                        v6->sin6_port == server->addr.sockAddr6.sin6_port);
+        }
+        return true;
+}
+/*
+ * Can an existing connection to @server be used with the security options
+ * requested in @vol?
+ *
+ * The effective flags are vol->secFlg alone when it requests anything other
+ * than signing/sealing, otherwise vol->secFlg combined with global_secflags.
+ * The server's negotiated secType must be permitted by those flags, and the
+ * signing requirements of both sides must be compatible.
+ */
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+        unsigned int secFlags = vol->secFlg;
+        unsigned int allowed;
+        if (!(vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))))
+                secFlags |= global_secflags;
+        /* pick the flag(s) that permit the server's authentication type */
+        switch (server->secType) {
+        case LANMAN:
+                allowed = CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT;
+                break;
+        case NTLMv2:
+                allowed = CIFSSEC_MAY_NTLMV2;
+                break;
+        case NTLM:
+                allowed = CIFSSEC_MAY_NTLM;
+                break;
+        case Kerberos:
+                allowed = CIFSSEC_MAY_KRB5;
+                break;
+        case RawNTLMSSP:
+                allowed = CIFSSEC_MAY_NTLMSSP;
+                break;
+        default:
+                /* shouldn't happen */
+                return false;
+        }
+        if (!(secFlags & allowed))
+                return false;
+        /* now check if signing mode is acceptable */
+        if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
+            (server->secMode & SECMODE_SIGN_REQUIRED))
+                return false;
+        if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
+            (server->secMode &
+             (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+                return false;
+        return true;
+}
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
+cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
-        struct list_head *tmp;
        struct TCP_Server_Info *server;
-        struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
-        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
        write_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &cifs_tcp_ses_list) {
+        list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-                server = list_entry(tmp, struct TCP_Server_Info,
-                                    tcp_ses_list);
                /*
                 * the demux thread can exit on its own while still in CifsNew
                 * so don't accept any sockets in that state. Since the
@@ -1401,37 +1487,11 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
                if (server->tcpStatus == CifsNew)
                        continue;
-                switch (addr->ss_family) {
+                if (!match_address(server, addr))
-                case AF_INET:
+                        continue;
-                        if (addr4->sin_addr.s_addr ==
-                            server->addr.sockAddr.sin_addr.s_addr) {
-                                addr4->sin_port = htons(port);
-                                /* user overrode default port? */
-                                if (addr4->sin_port) {
-                                        if (addr4->sin_port !=
-                                            server->addr.sockAddr.sin_port)
-                                                continue;
-                                }
-                                break;
-                        } else
-                                continue;
-                case AF_INET6:
+                if (!match_security(server, vol))
-                        if (ipv6_addr_equal(&addr6->sin6_addr,
+                        continue;
-                            &server->addr.sockAddr6.sin6_addr) &&
-                            (addr6->sin6_scope_id ==
-                            server->addr.sockAddr6.sin6_scope_id)) {
-                                addr6->sin6_port = htons(port);
-                                /* user overrode default port? */
-                                if (addr6->sin6_port) {
-                                        if (addr6->sin6_port !=
-                                           server->addr.sockAddr6.sin6_port)
-                                                continue;
-                                }
-                                break;
-                        } else
-                                continue;
-                }
                ++server->srv_count;
                write_unlock(&cifs_tcp_ses_lock);
@@ -1460,6 +1520,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
        server->tcpStatus = CifsExiting;
        spin_unlock(&GlobalMid_Lock);
+        cifs_fscache_release_client_cookie(server);
        task = xchg(&server->tsk, NULL);
        if (task)
                force_sig(SIGKILL, task);
@@ -1479,7 +1541,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
        if (volume_info->UNCip && volume_info->UNC) {
-                rc = cifs_convert_address(volume_info->UNCip, &addr);
+                rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
+                                        volume_info->UNCip,
+                                        strlen(volume_info->UNCip),
+                                        volume_info->port);
                if (!rc) {
                        /* we failed translating address */
                        rc = -EINVAL;
@@ -1499,7 +1564,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        }
        /* see if we already have a matching tcp_ses */
-        tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
+        tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
        if (tcp_ses)
                return tcp_ses;
@@ -1543,12 +1608,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                cFYI(1, "attempting ipv6 connect");
                /* BB should we allow ipv6 on port 139? */
                /* other OS never observed in Wild doing 139 with v6 */
-                sin_server6->sin6_port = htons(volume_info->port);
                memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
                        sizeof(struct sockaddr_in6));
                rc = ipv6_connect(tcp_ses);
        } else {
-                sin_server->sin_port = htons(volume_info->port);
                memcpy(&tcp_ses->addr.sockAddr, sin_server,
                        sizeof(struct sockaddr_in));
                rc = ipv4_connect(tcp_ses);
@@ -1577,6 +1640,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
        write_unlock(&cifs_tcp_ses_lock);
+        cifs_fscache_get_client_cookie(tcp_ses);
        return tcp_ses;
 out_err:
@@ -1591,17 +1656,27 @@ out_err:
 }
 static struct cifsSesInfo *
-cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-        struct list_head *tmp;
        struct cifsSesInfo *ses;
        write_lock(&cifs_tcp_ses_lock);
-        list_for_each(tmp, &server->smb_ses_list) {
+        list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
-                ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+                switch (server->secType) {
-                if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
+                case Kerberos:
-                        continue;
+                        if (vol->cred_uid != ses->cred_uid)
+                                continue;
+                        break;
+                default:
+                        /* anything else takes username/password */
+                        if (strncmp(ses->userName, vol->username,
+                                    MAX_USERNAME_SIZE))
+                                continue;
+                        if (strlen(vol->username) != 0 &&
+                            strncmp(ses->password, vol->password,
+                                    MAX_PASSWORD_SIZE))
+                                continue;
+                }
                ++ses->ses_count;
                write_unlock(&cifs_tcp_ses_lock);
                return ses;
@@ -1643,7 +1718,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
        xid = GetXid();
-        ses = cifs_find_smb_ses(server, volume_info->username);
+        ses = cifs_find_smb_ses(server, volume_info);
        if (ses) {
                cFYI(1, "Existing smb sess found (status=%d)", ses->status);
@@ -1706,6 +1781,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
                if (ses->domainName)
                        strcpy(ses->domainName, volume_info->domainname);
        }
+        ses->cred_uid = volume_info->cred_uid;
        ses->linux_uid = volume_info->linux_uid;
        ses->overrideSecFlg = volume_info->secFlg;
@@ -1773,6 +1849,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
        CIFSSMBTDis(xid, tcon);
        _FreeXid(xid);
+        cifs_fscache_release_super_cookie(tcon);
        tconInfoFree(tcon);
        cifs_put_smb_ses(ses);
 }
@@ -1843,6 +1920,8 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
        list_add(&tcon->tcon_list, &ses->tcon_list);
        write_unlock(&cifs_tcp_ses_lock);
+        cifs_fscache_get_super_cookie(tcon);
        return tcon;
 out_fail:
@@ -2397,6 +2476,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
        if (pvolume_info->dynperm)
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+        if (pvolume_info->fsc)
+                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
        if (pvolume_info->direct_io) {
                cFYI(1, "mounting share using direct i/o");
                cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e7ae78b66fa1..578d88c5b46e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,12 +130,6 @@ cifs_bp_rename_retry:
        return full_path;
 }
-/*
- * When called with struct file pointer set to NULL, there is no way we could
- * update file->private_data, but getting it stuck on openFileList provides a
- * way to access it from cifs_fill_filedata and thereby set file->private_data
- * from cifs_open.
- */
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
                  struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -163,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
        mutex_init(&pCifsFile->lock_mutex);
        INIT_LIST_HEAD(&pCifsFile->llist);
        atomic_set(&pCifsFile->count, 1);
-        slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
+        INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
        write_lock(&GlobalSMBSeslock);
        list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 4db2c5e7283f..0eb87026cad3 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
 *   Copyright (c) 2007 Igor Mammedov
 *   Author(s): Igor Mammedov (niallain@gmail.com)
 *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *              David Howells (dhowells@redhat.com)
 *
 *   Contains the CIFS DFS upcall routines used for hostname to
 *   IP address translation.
@@ -24,145 +26,73 @@
 */
 #include <linux/slab.h>
-#include <keys/user-type.h>
+#include <linux/dns_resolver.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
-/* Checks if supplied name is IP address
+/**
- * returns:
+ * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- *              1 - name is IP
+ * @unc: UNC path specifying the server
- *              0 - name is not IP
+ * @ip_addr: Where to return the IP address.
- */
+ *
-static int
+ * The IP address will be returned in string form, and the caller is
-is_ip(char *name)
+ * responsible for freeing it.
-{
+ *
-        struct sockaddr_storage ss;
+ * Returns length of result on success (0 if the server name was already an
+ * IP address and no upcall was needed), -ve on error.
-        return cifs_convert_address(name, &ss);
-}
-static int
-dns_resolver_instantiate(struct key *key, const void *data,
-                size_t datalen)
-{
-        int rc = 0;
-        char *ip;
-        ip = kmalloc(datalen + 1, GFP_KERNEL);
-        if (!ip)
-                return -ENOMEM;
-        memcpy(ip, data, datalen);
-        ip[datalen] = '\0';
-        /* make sure this looks like an address */
-        if (!is_ip(ip)) {
-                kfree(ip);
-                return -EINVAL;
-        }
-        key->type_data.x[0] = datalen;
-        key->payload.data = ip;
-        return rc;
-}
-static void
-dns_resolver_destroy(struct key *key)
-{
-        kfree(key->payload.data);
-}
-struct key_type key_type_dns_resolver = {
-        .name        = "dns_resolver",
-        .def_datalen = sizeof(struct in_addr),
-        .describe    = user_describe,
-        .instantiate = dns_resolver_instantiate,
-        .destroy     = dns_resolver_destroy,
-        .match       = user_match,
-};
-/* Resolves server name to ip address.
- * input:
- *      unc - server UNC
- * output:
- *      *ip_addr - pointer to server ip, caller responcible for freeing it.
- * return 0 on success
 */
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
-        int rc = -EAGAIN;
+        struct sockaddr_storage ss;
-        struct key *rkey = ERR_PTR(-EAGAIN);
+        const char *hostname, *sep;
         char *name;
-        char *data = NULL;
+        int len, rc;
-        int len;
         if (!ip_addr || !unc)
                 return -EINVAL;
-        /* search for server name delimiter */
         len = strlen(unc);
         if (len < 3) {
                 cFYI(1, "%s: unc is too short: %s", __func__, unc);
                 return -EINVAL;
         }
+        /* Discount leading slashes for cifs */
         len -= 2;
-        name = memchr(unc+2, '\\', len);
+        hostname = unc + 2;
-        if (!name) {
+        /*
+         * Search for server name delimiter.  The hostname length must be
+         * measured from hostname rather than from unc: the two leading
+         * slashes were already discounted above and must not be counted
+         * again, otherwise the separator and the first character of the
+         * share name would be treated as part of the hostname.
+         */
+        sep = memchr(hostname, '\\', len);
+        if (sep)
+                len = sep - hostname;
+        else
                 cFYI(1, "%s: probably server name is whole unc: %s",
-                                        __func__, unc);
+                     __func__, unc);
-        } else {
-                len = (name - unc) - 2/* leading // */;
+        /* Try to interpret hostname as an IPv4 or IPv6 address */
-        }
+        rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
+        if (rc > 0)
+                goto name_is_IP_address;
+        /* Perform the upcall */
+        rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+        if (rc < 0)
+                cERROR(1, "%s: unable to resolve: %*.*s",
+                       __func__, len, len, hostname);
+        else
+                cFYI(1, "%s: resolved: %*.*s to %s",
+                     __func__, len, len, hostname, *ip_addr);
+        return rc;
-        name = kmalloc(len+1, GFP_KERNEL);
+name_is_IP_address:
+        /* No upcall needed: hand back a NUL-terminated copy; returns 0 */
-        if (!name) {
+        name = kmalloc(len + 1, GFP_KERNEL);
-                rc = -ENOMEM;
+        if (!name)
-                return rc;
+                return -ENOMEM;
-        }
+        memcpy(name, hostname, len);
-        memcpy(name, unc+2, len);
         name[len] = 0;
+        cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
-        if (is_ip(name)) {
+        *ip_addr = name;
-                cFYI(1, "%s: it is IP, skipping dns upcall: %s",
+        return 0;
-                                        __func__, name);
-                data = name;
-                goto skip_upcall;
-        }
-        rkey = request_key(&key_type_dns_resolver, name, "");
-        if (!IS_ERR(rkey)) {
-                len = rkey->type_data.x[0];
-                data = rkey->payload.data;
-        } else {
-                cERROR(1, "%s: unable to resolve: %s", __func__, name);
-                goto out;
-        }
-skip_upcall:
-        if (data) {
-                *ip_addr = kmalloc(len + 1, GFP_KERNEL);
-                if (*ip_addr) {
-                        memcpy(*ip_addr, data, len + 1);
-                        if (!IS_ERR(rkey))
-                                cFYI(1, "%s: resolved: %s to %s", __func__,
-                                                        name,
-                                                        *ip_addr
-                                        );
-                        rc = 0;
-                } else {
-                        rc = -ENOMEM;
-                }
-                if (!IS_ERR(rkey))
-                        key_put(rkey);
-        }
-out:
-        kfree(name);
-        return rc;
 }
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..d3f5d27f4d06 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
 #define _DNS_RESOLVE_H
 #ifdef __KERNEL__
-#include <linux/key-type.h>
-extern struct key_type key_type_dns_resolver;
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 409e4f523e61..db11fdef0e92 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -40,6 +40,7 @@
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "fscache.h"
 static inline int cifs_convert_flags(unsigned int flags)
 {
@@ -282,6 +283,9 @@ int cifs_open(struct inode *inode, struct file *file)
                                CIFSSMBClose(xid, tcon, netfid);
                                rc = -ENOMEM;
                        }
+                        cifs_fscache_set_inode_cookie(inode, file);
                        goto out;
                } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
                        if (tcon->ses->serverNOS)
@@ -373,6 +377,8 @@ int cifs_open(struct inode *inode, struct file *file)
                goto out;
        }
+        cifs_fscache_set_inode_cookie(inode, file);
        if (oplock & CIFS_CREATE_ACTION) {
                /* time to set mode which we can not set earlier due to
                   problems creating new read-only files */
@@ -427,7 +433,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
        __u16 netfid;
        if (file->private_data)
-                pCifsFile = (struct cifsFileInfo *)file->private_data;
+                pCifsFile = file->private_data;
        else
                return -EBADF;
@@ -565,8 +571,7 @@ int cifs_close(struct inode *inode, struct file *file)
        int xid, timeout;
        struct cifs_sb_info *cifs_sb;
        struct cifsTconInfo *pTcon;
-        struct cifsFileInfo *pSMBFile =
+        struct cifsFileInfo *pSMBFile = file->private_data;
-                (struct cifsFileInfo *)file->private_data;
        xid = GetXid();
@@ -641,8 +646,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 {
        int rc = 0;
        int xid;
-        struct cifsFileInfo *pCFileStruct =
+        struct cifsFileInfo *pCFileStruct = file->private_data;
-            (struct cifsFileInfo *)file->private_data;
        char *ptmp;
        cFYI(1, "Closedir inode = 0x%p", inode);
@@ -863,8 +867,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
                                      length, pfLock,
                                      posix_lock_type, wait_flag);
        } else {
-                struct cifsFileInfo *fid =
+                struct cifsFileInfo *fid = file->private_data;
-                        (struct cifsFileInfo *)file->private_data;
                if (numLock) {
                        rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -965,7 +968,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
        if (file->private_data == NULL)
                return -EBADF;
-        open_file = (struct cifsFileInfo *) file->private_data;
+        open_file = file->private_data;
        rc = generic_write_checks(file, poffset, &write_size, 0);
        if (rc)
@@ -1067,7 +1070,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
        if (file->private_data == NULL)
                return -EBADF;
-        open_file = (struct cifsFileInfo *)file->private_data;
+        open_file = file->private_data;
        xid = GetXid();
@@ -1651,8 +1654,7 @@ int cifs_fsync(struct file *file, int datasync)
        int xid;
        int rc = 0;
        struct cifsTconInfo *tcon;
-        struct cifsFileInfo *smbfile =
+        struct cifsFileInfo *smbfile = file->private_data;
-                (struct cifsFileInfo *)file->private_data;
        struct inode *inode = file->f_path.dentry->d_inode;
        xid = GetXid();
@@ -1756,7 +1758,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
                FreeXid(xid);
                return rc;
        }
-        open_file = (struct cifsFileInfo *)file->private_data;
+        open_file = file->private_data;
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1837,7 +1839,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
                FreeXid(xid);
                return rc;
        }
-        open_file = (struct cifsFileInfo *)file->private_data;
+        open_file = file->private_data;
        if ((file->f_flags & O_ACCMODE) == O_WRONLY)
                cFYI(1, "attempting read on write only file instance");
@@ -1942,6 +1944,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
                SetPageUptodate(page);
                unlock_page(page);
                data += PAGE_CACHE_SIZE;
+                /* add page to FS-Cache */
+                cifs_readpage_to_fscache(mapping->host, page);
        }
        return;
 }
@@ -1968,10 +1973,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                FreeXid(xid);
                return rc;
        }
-        open_file = (struct cifsFileInfo *)file->private_data;
+        open_file = file->private_data;
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
        pTcon = cifs_sb->tcon;
+        /*
+         * Reads as many pages as possible from fscache. Returns -ENOBUFS
+         * immediately if the cookie is negative
+         */
+        rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
+                                         &num_pages);
+        if (rc == 0)
+                goto read_complete;
        cFYI(DBG2, "rpages: num pages %d", num_pages);
        for (i = 0; i < num_pages; ) {
                unsigned contig_pages;
@@ -2082,6 +2096,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
                smb_read_data = NULL;
        }
+read_complete:
        FreeXid(xid);
        return rc;
 }
@@ -2092,6 +2107,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
        char *read_data;
        int rc;
+        /* Is the page cached? */
+        rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+        if (rc == 0)
+                goto read_complete;
        page_cache_get(page);
        read_data = kmap(page);
        /* for reads over a certain size could initiate async read ahead */
@@ -2111,11 +2131,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
        flush_dcache_page(page);
        SetPageUptodate(page);
+        /* send this page to the cache */
+        cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
        rc = 0;
 io_error:
        kunmap(page);
        page_cache_release(page);
+read_complete:
        return rc;
 }
@@ -2265,8 +2291,23 @@ out:
        return rc;
 }
-static void
+/*
+ * ->releasepage: refuse (return 0) while the page has private data attached,
+ * otherwise defer the decision to cifs_fscache_release_page().
+ */
+static int cifs_release_page(struct page *page, gfp_t gfp)
-cifs_oplock_break(struct slow_work *work)
+{
+        return PagePrivate(page) ? 0 : cifs_fscache_release_page(page, gfp);
+}
+/*
+ * ->invalidatepage: drop the page from FS-Cache, but only when invalidation
+ * starts at offset 0 of the page.
+ */
+static void cifs_invalidate_page(struct page *page, unsigned long offset)
+{
+        struct cifsInodeInfo *ci = CIFS_I(page->mapping->host);
+        if (offset != 0)
+                return;
+        cifs_fscache_invalidate_page(page, &ci->vfs_inode);
+}
+void cifs_oplock_break(struct work_struct *work)
 {
        struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
                                                  oplock_break);
@@ -2303,33 +2344,30 @@ cifs_oplock_break(struct slow_work *work)
                                 LOCKING_ANDX_OPLOCK_RELEASE, false);
                cFYI(1, "Oplock release rc = %d", rc);
        }
+        /*
+         * We might have kicked in before is_valid_oplock_break()
+         * finished grabbing reference for us.  Make sure it's done by
+         * waiting for GlobalSMBSeslock.
+         */
+        write_lock(&GlobalSMBSeslock);
+        write_unlock(&GlobalSMBSeslock);
+        cifs_oplock_break_put(cfile);
 }
-static int
+/*
+ * Pin @cfile: take a reference on its vfsmount (cfile->mnt) and on the
+ * cifsFileInfo itself.  cifs_oplock_break_put() drops both again.
+ */
+void cifs_oplock_break_get(struct cifsFileInfo *cfile)
-cifs_oplock_break_get(struct slow_work *work)
 {
-        struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-                                                  oplock_break);
         mntget(cfile->mnt);
         cifsFileInfo_get(cfile);
-        return 0;
 }
-static void
+/*
+ * Release the references taken by cifs_oplock_break_get(): the vfsmount
+ * reference on cfile->mnt and the reference on the cifsFileInfo.
+ */
+void cifs_oplock_break_put(struct cifsFileInfo *cfile)
-cifs_oplock_break_put(struct slow_work *work)
 {
-        struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-                                                  oplock_break);
         mntput(cfile->mnt);
         cifsFileInfo_put(cfile);
 }
-const struct slow_work_ops cifs_oplock_break_ops = {
-        .get_ref        = cifs_oplock_break_get,
-        .put_ref        = cifs_oplock_break_put,
-        .execute        = cifs_oplock_break,
-};
 const struct address_space_operations cifs_addr_ops = {
        .readpage = cifs_readpage,
        .readpages = cifs_readpages,
@@ -2338,6 +2376,8 @@ const struct address_space_operations cifs_addr_ops = {
        .write_begin = cifs_write_begin,
        .write_end = cifs_write_end,
        .set_page_dirty = __set_page_dirty_nobuffers,
+        .releasepage = cifs_release_page,
+        .invalidatepage = cifs_invalidate_page,
        /* .sync_page = cifs_sync_page, */
        /* .direct_IO = */
 };
@@ -2354,6 +2394,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
        .write_begin = cifs_write_begin,
        .write_end = cifs_write_end,
        .set_page_dirty = __set_page_dirty_nobuffers,
+        .releasepage = cifs_release_page,
+        .invalidatepage = cifs_invalidate_page,
        /* .sync_page = cifs_sync_page, */
        /* .direct_IO = */
 };
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 000000000000..9f3f5c4be161
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,236 @@
+/*
+ *   fs/cifs/fscache.c - CIFS filesystem cache interface
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman (sjayaraman@suse.de)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include "fscache.h"
+#include "cifsglob.h"
+#include "cifs_debug.h"
+#include "cifs_fs_sb.h"
+/*
+ * Acquire an FS-Cache cookie for @server beneath the CIFS netfs primary
+ * index and remember it in server->fscache.
+ */
+void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
+{
+        struct fscache_cookie *cookie;
+        cookie = fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
+                                        &cifs_fscache_server_index_def,
+                                        server);
+        server->fscache = cookie;
+        cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, cookie);
+}
+/*
+ * Relinquish (retire == 0) the FS-Cache cookie held for @server and clear
+ * server->fscache.
+ */
+void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
+{
+        struct fscache_cookie *cookie = server->fscache;
+        cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, cookie);
+        fscache_relinquish_cookie(cookie, 0);
+        server->fscache = NULL;
+}
+/*
+ * Acquire an FS-Cache cookie for @tcon, parented on the cookie of the server
+ * the tcon's session belongs to, and store it in tcon->fscache.
+ */
+void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
+{
+        struct fscache_cookie *parent = tcon->ses->server->fscache;
+        tcon->fscache = fscache_acquire_cookie(parent,
+                                               &cifs_fscache_super_index_def,
+                                               tcon);
+        cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
+             parent, tcon->fscache);
+}
+/*
+ * Relinquish (retire == 0) the FS-Cache cookie held for @tcon and clear
+ * tcon->fscache.
+ */
+void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
+{
+        struct fscache_cookie *cookie = tcon->fscache;
+        cFYI(1, "CIFS: releasing superblock cookie (0x%p)", cookie);
+        fscache_relinquish_cookie(cookie, 0);
+        tcon->fscache = NULL;
+}
+/*
+ * Give @inode an FS-Cache cookie (child of its tcon's cookie) unless it
+ * already has one.
+ */
+static void cifs_fscache_enable_inode_cookie(struct inode *inode)
+{
+        struct cifsInodeInfo *ci = CIFS_I(inode);
+        struct cifs_sb_info *sbi = CIFS_SB(inode->i_sb);
+        if (!ci->fscache) {
+                ci->fscache =
+                        fscache_acquire_cookie(sbi->tcon->fscache,
+                                               &cifs_fscache_inode_object_def,
+                                               ci);
+                cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
+                     sbi->tcon->fscache, ci->fscache);
+        }
+}
+/*
+ * Relinquish (retire == 0) @inode's FS-Cache cookie, if it has one, and
+ * clear the pointer.
+ */
+void cifs_fscache_release_inode_cookie(struct inode *inode)
+{
+        struct cifsInodeInfo *ci = CIFS_I(inode);
+        if (!ci->fscache)
+                return;
+        cFYI(1, "CIFS releasing inode cookie (0x%p)", ci->fscache);
+        fscache_relinquish_cookie(ci->fscache, 0);
+        ci->fscache = NULL;
+}
+/*
+ * Relinquish @inode's FS-Cache cookie with retire == 1, if it has one, and
+ * clear the pointer.
+ */
+static void cifs_fscache_disable_inode_cookie(struct inode *inode)
+{
+        struct cifsInodeInfo *ci = CIFS_I(inode);
+        if (!ci->fscache)
+                return;
+        cFYI(1, "CIFS disabling inode cookie (0x%p)", ci->fscache);
+        fscache_relinquish_cookie(ci->fscache, 1);
+        ci->fscache = NULL;
+}
+/*
+ * Enable the inode cookie when @filp was opened O_RDONLY; for any other
+ * access mode disable it instead.
+ */
+void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+        if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+                cifs_fscache_enable_inode_cookie(inode);
+                cFYI(1, "CIFS: fscache inode cookie set");
+                return;
+        }
+        cifs_fscache_disable_inode_cookie(inode);
+}
+/*
+ * If @inode currently has an FS-Cache cookie, retire it and acquire a fresh
+ * one under the tcon's cookie.  Does nothing when no cookie is set.
+ */
+void cifs_fscache_reset_inode_cookie(struct inode *inode)
+{
+        struct cifsInodeInfo *ci = CIFS_I(inode);
+        struct cifs_sb_info *sbi = CIFS_SB(inode->i_sb);
+        struct fscache_cookie *stale = ci->fscache;
+        if (!stale)
+                return;
+        /* retire the current fscache cache and get a new one */
+        fscache_relinquish_cookie(stale, 1);
+        ci->fscache = fscache_acquire_cookie(sbi->tcon->fscache,
+                                             &cifs_fscache_inode_object_def,
+                                             ci);
+        cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
+             ci->fscache, stale);
+}
+/*
+ * Returns 1 if @page may be released, 0 if FS-Cache declined to let go of
+ * it.  Pages not marked PageFsCache are always releasable.
+ */
+int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+        struct cifsInodeInfo *ci;
+        if (!PageFsCache(page))
+                return 1;
+        ci = CIFS_I(page->mapping->host);
+        cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
+             page, ci->fscache);
+        return fscache_maybe_release_page(ci->fscache, page, gfp) ? 1 : 0;
+}
+/*
+ * Completion callback handed to the fscache read calls: mark @page uptodate
+ * when @error is zero, and unlock it in every case.  @ctx is unused.
+ */
+static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
+                                                int error)
+{
+        cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", page, error);
+        if (error == 0)
+                SetPageUptodate(page);
+        unlock_page(page);
+}
+/*
+ * Retrieve a page from FS-Cache.
+ *
+ * Returns 0 when the read was submitted to the cache, 1 when the page is not
+ * (or cannot be) cached, and any other fscache error code unchanged.
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+        struct fscache_cookie *cookie = CIFS_I(inode)->fscache;
+        int rc;
+        cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p",
+             cookie, page, inode);
+        rc = fscache_read_or_alloc_page(cookie, page,
+                                        cifs_readpage_from_fscache_complete,
+                                        NULL, GFP_KERNEL);
+        if (rc == 0) {
+                /* page found in fscache, read submitted */
+                cFYI(1, "CIFS: readpage_from_fscache: submitted");
+                return rc;
+        }
+        if (rc == -ENOBUFS || rc == -ENODATA) {
+                /* page won't be cached, or is not in the cache */
+                cFYI(1, "CIFS: readpage_from_fscache %d", rc);
+                return 1;
+        }
+        cERROR(1, "unknown error ret = %d", rc);
+        return rc;
+}
+/*
+ * Retrieve a set of pages from FS-Cache.
+ *
+ * Returns 0 when reads were submitted to the cache for all pages, 1 when
+ * some pages are not (or cannot be) cached, and any other fscache error code
+ * unchanged.
+ */
+int __cifs_readpages_from_fscache(struct inode *inode,
+                                struct address_space *mapping,
+                                struct list_head *pages,
+                                unsigned *nr_pages)
+{
+        struct fscache_cookie *cookie = CIFS_I(inode)->fscache;
+        int rc;
+        cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
+             cookie, *nr_pages, inode);
+        rc = fscache_read_or_alloc_pages(cookie, mapping, pages, nr_pages,
+                                         cifs_readpage_from_fscache_complete,
+                                         NULL, mapping_gfp_mask(mapping));
+        if (rc == 0) {
+                /* read submitted to the cache for all pages */
+                cFYI(1, "CIFS: readpages_from_fscache: submitted");
+                return rc;
+        }
+        if (rc == -ENOBUFS || rc == -ENODATA) {
+                /* some pages are not cached, or cannot be */
+                cFYI(1, "CIFS: readpages_from_fscache: no page");
+                return 1;
+        }
+        cFYI(1, "unknown error ret = %d", rc);
+        return rc;
+}
+/*
+ * Write @page to FS-Cache under @inode's cookie; if the write is refused,
+ * uncache the page again.
+ */
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+        struct fscache_cookie *cookie = CIFS_I(inode)->fscache;
+        cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p",
+             cookie, page, inode);
+        if (fscache_write_page(cookie, page, GFP_KERNEL) != 0)
+                fscache_uncache_page(cookie, page);
+}
+/*
+ * Wait for any pending FS-Cache write of @page to finish, then uncache the
+ * page from @inode's cookie.
+ */
+void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+        struct fscache_cookie *fsc = CIFS_I(inode)->fscache;
+        cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, fsc);
+        fscache_wait_on_page_write(fsc, page);
+        fscache_uncache_page(fsc, page);
+}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 000000000000..31b88ec2341e
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
+/*
+ *   fs/cifs/fscache.h - CIFS filesystem cache interface definitions
+ *
+ *   Copyright (c) 2010 Novell, Inc.
+ *   Author(s): Suresh Jayaraman <sjayaraman@suse.de>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _CIFS_FSCACHE_H
+#define _CIFS_FSCACHE_H
+#include <linux/fscache.h>
+#include "cifsglob.h"
+#ifdef CONFIG_CIFS_FSCACHE
+extern struct fscache_netfs cifs_fscache_netfs;
+extern const struct fscache_cookie_def cifs_fscache_server_index_def;
+extern const struct fscache_cookie_def cifs_fscache_super_index_def;
+extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
+extern int cifs_fscache_register(void);
+extern void cifs_fscache_unregister(void);
+/*
+ * fscache.c
+ */
+extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
+extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
+extern void cifs_fscache_release_inode_cookie(struct inode *);
+extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void cifs_fscache_reset_inode_cookie(struct inode *);
+extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
+extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
+extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
+extern int __cifs_readpages_from_fscache(struct inode *,
+                                         struct address_space *,
+                                         struct list_head *,
+                                         unsigned *);
+extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
+static inline void cifs_fscache_invalidate_page(struct page *page,
+                                               struct inode *inode)
+{
+        if (PageFsCache(page))
+                __cifs_fscache_invalidate_page(page, inode);
+}
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+                                             struct page *page)
+{
+        if (CIFS_I(inode)->fscache)
+                return __cifs_readpage_from_fscache(inode, page);
+        return -ENOBUFS;
+}
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+                                              struct address_space *mapping,
+                                              struct list_head *pages,
+                                              unsigned *nr_pages)
+{
+        if (CIFS_I(inode)->fscache)
+                return __cifs_readpages_from_fscache(inode, mapping, pages,
+                                                     nr_pages);
+        return -ENOBUFS;
+}
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+                                            struct page *page)
+{
+        if (PageFsCache(page))
+                __cifs_readpage_to_fscache(inode, page);
+}
+#else /* CONFIG_CIFS_FSCACHE */
+static inline int cifs_fscache_register(void) { return 0; }
+static inline void cifs_fscache_unregister(void) {}
+static inline void
+cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
+static inline void
+cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
+static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void
+cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
+static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
+                                                 struct file *filp) {}
+static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+        return 1; /* May release page */
+}
+static inline void cifs_fscache_invalidate_page(struct page *page,
+                        struct inode *inode) {}
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+        return -ENOBUFS;
+}
+static inline int cifs_readpages_from_fscache(struct inode *inode,
+                                              struct address_space *mapping,
+                                              struct list_head *pages,
+                                              unsigned *nr_pages)
+{
+        return -ENOBUFS;
+}
+static inline void cifs_readpage_to_fscache(struct inode *inode,
+                        struct page *page) {}
+#endif /* CONFIG_CIFS_FSCACHE */
+#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6f0683c68952..dc4c47ab9588 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -29,6 +29,7 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "fscache.h"
 static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -288,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp)
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsTconInfo *tcon = cifs_sb->tcon;
-        struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+        struct cifsFileInfo *cfile = filp->private_data;
        xid = GetXid();
        rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -515,7 +516,7 @@ int cifs_get_file_info(struct file *filp)
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
        struct cifsTconInfo *tcon = cifs_sb->tcon;
-        struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+        struct cifsFileInfo *cfile = filp->private_data;
        xid = GetXid();
        rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -723,18 +724,17 @@ cifs_find_inode(struct inode *inode, void *opaque)
 {
        struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+        /* don't match inode with different uniqueid */
        if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
                return 0;
-        /*
+        /* don't match inode of different type */
-         * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+        if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
-         * verboten. Disable serverino and return it as if it were found, the
+                return 0;
-         * caller can discard it, generate a uniqueid and retry the find
-         */
+        /* if it's a directory that already has dentries, flag it */
-        if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+        if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
                fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
-                cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
-        }
        return 1;
 }
@@ -748,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
        return 0;
 }
+/*
+ * walk dentry list for an inode and report whether it has aliases that
+ * are hashed. We use this to determine if a directory inode can actually
+ * be used.
+ */
+static bool
+inode_has_hashed_dentries(struct inode *inode)
+{
+        struct dentry *dentry;
+        spin_lock(&dcache_lock);
+        list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+                if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+                        spin_unlock(&dcache_lock);
+                        return true;
+                }
+        }
+        spin_unlock(&dcache_lock);
+        return false;
+}
 /* Given fattrs, get a corresponding inode */
 struct inode *
 cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -763,12 +784,16 @@ retry_iget5_locked:
        inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
        if (inode) {
-                /* was there a problematic inode number collision? */
+                /* was there a potentially problematic inode collision? */
                if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
-                        iput(inode);
-                        fattr->cf_uniqueid = iunique(sb, ROOT_I);
                        fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
-                        goto retry_iget5_locked;
+                        if (inode_has_hashed_dentries(inode)) {
+                                cifs_autodisable_serverino(CIFS_SB(sb));
+                                iput(inode);
+                                fattr->cf_uniqueid = iunique(sb, ROOT_I);
+                                goto retry_iget5_locked;
+                        }
                }
                cifs_fattr_to_inode(inode, fattr);
@@ -776,6 +801,10 @@ retry_iget5_locked:
                        inode->i_flags |= S_NOATIME | S_NOCMTIME;
                if (inode->i_state & I_NEW) {
                        inode->i_ino = hash;
+#ifdef CONFIG_CIFS_FSCACHE
+                        /* initialize per-inode cache cookie pointer */
+                        CIFS_I(inode)->fscache = NULL;
+#endif
                        unlock_new_inode(inode);
                }
        }
@@ -807,6 +836,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
        if (!inode)
                return ERR_PTR(-ENOMEM);
+#ifdef CONFIG_CIFS_FSCACHE
+        /* populate tcon->resource_id */
+        cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
+#endif
        if (rc && cifs_sb->tcon->ipc) {
                cFYI(1, "ipc connection - fake read inode");
                inode->i_mode |= S_IFDIR;
@@ -1568,6 +1602,7 @@ cifs_invalidate_mapping(struct inode *inode)
                        cifs_i->write_behind_rc = rc;
        }
        invalidate_remote_inode(inode);
+        cifs_fscache_reset_inode_cookie(inode);
 }
 int cifs_revalidate_file(struct file *filp)
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 505926f1ee6b..9d38a71c8e14 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -41,8 +41,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
        __u64   ExtAttrMask = 0;
        __u64   caps;
        struct cifsTconInfo *tcon;
-        struct cifsFileInfo *pSMBFile =
+        struct cifsFileInfo *pSMBFile = filep->private_data;
-                (struct cifsFileInfo *)filep->private_data;
 #endif /* CONFIG_CIFS_POSIX */
        xid = GetXid();
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1394aa37f26c..3ccadc1326d6 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
        struct cifsTconInfo *tcon;
        struct cifsInodeInfo *pCifsInode;
        struct cifsFileInfo *netfile;
-        int rc;
        cFYI(1, "Checking for oplock break or dnotify response");
        if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -583,13 +582,18 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
                                pCifsInode->clientCanCacheAll = false;
                                if (pSMB->OplockLevel == 0)
                                        pCifsInode->clientCanCacheRead = false;
-                                rc = slow_work_enqueue(&netfile->oplock_break);
-                                if (rc) {
+                                /*
-                                        cERROR(1, "failed to enqueue oplock "
+                                 * cifs_oplock_break_put() can't be called
-                                                   "break: %d\n", rc);
+                                 * from here.  Get reference after queueing
-                                } else {
+                                 * succeeded.  cifs_oplock_break() will
-                                        netfile->oplock_break_cancelled = false;
+                                 * synchronize using GlobalSMBSeslock.
-                                }
+                                 */
+                                if (queue_work(system_nrt_wq,
+                                               &netfile->oplock_break))
+                                        cifs_oplock_break_get(netfile);
+                                netfile->oplock_break_cancelled = false;
                                read_unlock(&GlobalSMBSeslock);
                                read_unlock(&cifs_tcp_ses_lock);
                                return true;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d35d52889cb5..f97851119e6c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
        {ERRremcd, -EACCES},
        {ERRdiffdevice, -EXDEV},
        {ERRnofiles, -ENOENT},
+        {ERRwriteprot, -EROFS},
        {ERRbadshare, -ETXTBSY},
        {ERRlock, -EACCES},
        {ERRunsup, -EINVAL},
@@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
 * Returns 0 on failure.
 */
 static int
-cifs_inet_pton(const int address_family, const char *cp, void *dst)
+cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
 {
        int ret = 0;
        /* calculate length by finding first slash or NULL */
        if (address_family == AF_INET)
-                ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
+                ret = in4_pton(cp, len, dst, '\\', NULL);
        else if (address_family == AF_INET6)
-                ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+                ret = in6_pton(cp, len, dst , '\\', NULL);
-        cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
+        cFYI(DBG2, "address conversion returned %d for %*.*s",
+             ret, len, len, cp);
        if (ret > 0)
                ret = 1;
        return ret;
@@ -164,43 +166,66 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
 * Returns 0 on failure.
 */
 int
-cifs_convert_address(char *src, void *dst)
+cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
-        int rc;
+        int rc, alen, slen;
-        char *pct, *endp;
+        const char *pct;
+        char *endp, scope_id[13];
         struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
         struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
         /* IPv4 address */
-        if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
+        if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
                 s4->sin_family = AF_INET;
                 return 1;
         }
-        /* temporarily terminate string */
+        /* attempt to exclude the scope ID from the address part */
-        pct = strchr(src, '%');
+        pct = memchr(src, '%', len);
-        if (pct)
+        alen = pct ? pct - src : len;
-                *pct = '\0';
-        rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
-        /* repair temp termination (if any) and make pct point to scopeid */
-        if (pct)
-                *pct++ = '%';
+        rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
         if (!rc)
                 return rc;
         s6->sin6_family = AF_INET6;
         if (pct) {
+                /* copy the scope ID into a NUL-terminated buffer for parsing */
+                slen = len - (alen + 1);
+                if (slen <= 0 || slen > 12)
+                        return 0;
+                memcpy(scope_id, pct + 1, slen);
+                scope_id[slen] = '\0';
-                s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
+                s6->sin6_scope_id = (u32) simple_strtoul(scope_id, &endp, 0);
-                if (!*pct || *endp)
+                if (endp != scope_id + slen)
                         return 0;
         }
         return rc;
 }
+int
+cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+                   const unsigned short int port)
+{
+        if (!cifs_convert_address(dst, src, len))
+                return 0;
+        switch (dst->sa_family) {
+        case AF_INET:
+                ((struct sockaddr_in *)dst)->sin_port = htons(port);
+                break;
+        case AF_INET6:
+                ((struct sockaddr_in6 *)dst)->sin6_port = htons(port);
+                break;
+        default:
+                return 0;
+        }
+        return 1;
+}
 /*****************************************************************************
 convert a NT status code to a dos class/code
 *****************************************************************************/
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index daf1753af674..d5e591fab475 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -847,6 +847,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
                end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
                tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+                if (tmp_buf == NULL) {
+                        rc = -ENOMEM;
+                        break;
+                }
                for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
                        if (current_entry == NULL) {
                                /* evaluate whether this case is an error */
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7c..7f16cb825fe5 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
 #define ERRnofiles              18      /* A File Search command can find no
                                           more files matching the specified
                                           criteria. */
+#define ERRwriteprot            19      /* media is write protected */
 #define ERRgeneral              31
 #define ERRbadshare             32      /* The sharing mode specified for an
                                           Open conflicts with existing FIDs on
diff --git a/fs/compat.c b/fs/compat.c
index 6490d2134ff3..c6fda9aeb864 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,7 +8,7 @@
 *  Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
 *  Copyright (C) 1998       Eddie C. Dost  (ecd@skynet.be)
 *  Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- *  Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ *  Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 618f38136304..5d9b936c458b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
 * Copyright (C) 1997-2000  Jakub Jelinek  (jakub@redhat.com)
 * Copyright (C) 1998  Eddie C. Dost  (ecd@skynet.be)
 * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs 
- * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
+ * Copyright (C) 2003       Pavel Machek (pavel@ucw.cz)
 *
 * These routines maintain argument size conversion between 32bit and 64bit
 * ioctls.
@@ -578,8 +578,11 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
 }
 /* Bluetooth ioctls */
-#define HCIUARTSETPROTO _IOW('U', 200, int)
+#define HCIUARTSETPROTO         _IOW('U', 200, int)
-#define HCIUARTGETPROTO _IOR('U', 201, int)
+#define HCIUARTGETPROTO         _IOR('U', 201, int)
+#define HCIUARTGETDEVICE        _IOR('U', 202, int)
+#define HCIUARTSETFLAGS         _IOW('U', 203, int)
+#define HCIUARTGETFLAGS         _IOR('U', 204, int)
 #define BNEPCONNADD     _IOW('B', 200, int)
 #define BNEPCONNDEL     _IOW('B', 201, int)
@@ -1298,6 +1301,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
 COMPATIBLE_IOCTL(HCISETLINKMODE)
 COMPATIBLE_IOCTL(HCISETACLMTU)
 COMPATIBLE_IOCTL(HCISETSCOMTU)
+COMPATIBLE_IOCTL(HCIBLOCKADDR)
+COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
diff --git a/fs/dcache.c b/fs/dcache.c
index c8c78ba07827..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
 *
 * In this case we return -1 to tell the caller that we baled.
 */
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                if (!(gfp_mask & __GFP_FS))
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7600aacf531d..a10cb91cadea 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
-static int dio_complete(struct dio *dio, loff_t offset, int ret)
+static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
 {
        ssize_t transferred = 0;
@@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
                        transferred = dio->i_size - offset;
        }
-        if (dio->end_io && dio->result)
-                dio->end_io(dio->iocb, offset, transferred,
-                            dio->map_bh.b_private);
-        if (dio->flags & DIO_LOCKING)
-                /* lockdep: non-owner release */
-                up_read_non_owner(&dio->inode->i_alloc_sem);
        if (ret == 0)
                ret = dio->page_errors;
        if (ret == 0)
@@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
        if (ret == 0)
                ret = transferred;
+        if (dio->end_io && dio->result) {
+                dio->end_io(dio->iocb, offset, transferred,
+                            dio->map_bh.b_private, ret, is_async);
+        } else if (is_async) {
+                aio_complete(dio->iocb, ret, 0);
+        }
+        if (dio->flags & DIO_LOCKING)
+                /* lockdep: non-owner release */
+                up_read_non_owner(&dio->inode->i_alloc_sem);
        return ret;
 }
@@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
        spin_unlock_irqrestore(&dio->bio_lock, flags);
        if (remaining == 0) {
-                int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+                dio_complete(dio, dio->iocb->ki_pos, 0, true);
-                aio_complete(dio->iocb, ret, 0);
                kfree(dio);
        }
 }
@@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        spin_unlock_irqrestore(&dio->bio_lock, flags);
        if (ret2 == 0) {
-                ret = dio_complete(dio, offset, ret);
+                ret = dio_complete(dio, offset, ret, false);
                kfree(dio);
        } else
                BUG_ON(ret != -EIOCBQUEUED);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c620526..37a34c2c622a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
        for (i = 0 ; i < CONN_HASH_SIZE; i++) {
                hlist_for_each_entry(con, h, &connection_hash[i], list) {
-                        if (con && con->sctp_assoc == assoc_id) {
+                        if (con->sctp_assoc == assoc_id) {
                                mutex_unlock(&connections_lock);
                                return con;
                        }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100d..ef17e0169da1 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
 int __init dlm_netlink_init(void)
 {
-        int rv;
+        return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
-        rv = genl_register_family(&family);
-        if (rv)
-                return rv;
-        rv = genl_register_ops(&family, &dlm_nl_ops);
-        if (rv < 0)
-                goto err;
-        return 0;
- err:
-        genl_unregister_family(&family);
-        return rv;
 }
 void dlm_netlink_exit(void)
 {
-        genl_unregister_ops(&family, &dlm_nl_ops);
        genl_unregister_family(&family);
 }
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5e..a2e3b562e65d 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
 /**
 * ecryptfs_init_crypt_ctx
- * @crypt_stat: Uninitilized crypt stats structure
+ * @crypt_stat: Uninitialized crypt stats structure
 *
 * Initialize the crypto context.
 *
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d485..46c4dd8dfcc3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
 static struct hlist_head *ecryptfs_daemon_hash;
 struct mutex ecryptfs_daemon_hash_mux;
-static int ecryptfs_hash_buckets;
+static int ecryptfs_hash_bits;
 #define ecryptfs_uid_hash(uid) \
-        hash_long((unsigned long)uid, ecryptfs_hash_buckets)
+        hash_long((unsigned long)uid, ecryptfs_hash_bits)
 static u32 ecryptfs_msg_counter;
 static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
        }
        mutex_init(&ecryptfs_daemon_hash_mux);
        mutex_lock(&ecryptfs_daemon_hash_mux);
-        ecryptfs_hash_buckets = 1;
+        ecryptfs_hash_bits = 1;
-        while (ecryptfs_number_of_users >> ecryptfs_hash_buckets)
+        while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
-                ecryptfs_hash_buckets++;
+                ecryptfs_hash_bits++;
        ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
-                                        * ecryptfs_hash_buckets), GFP_KERNEL);
+                                        * (1 << ecryptfs_hash_bits)),
+                                       GFP_KERNEL);
        if (!ecryptfs_daemon_hash) {
                rc = -ENOMEM;
                printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
                mutex_unlock(&ecryptfs_daemon_hash_mux);
                goto out;
        }
-        for (i = 0; i < ecryptfs_hash_buckets; i++)
+        for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
                INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
        mutex_unlock(&ecryptfs_daemon_hash_mux);
        ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
                int i;
                mutex_lock(&ecryptfs_daemon_hash_mux);
-                for (i = 0; i < ecryptfs_hash_buckets; i++) {
+                for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
                        int rc;
                        hlist_for_each_entry(daemon, elem,
diff --git a/fs/exec.c b/fs/exec.c
index e19de6a80339..dab85ecad686 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -653,6 +652,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
        else
                stack_base = vma->vm_start - stack_expand;
 #endif
+        current->mm->start_stack = bprm->p;
        ret = expand_stack(vma, stack_base);
        if (ret)
                ret = -EFAULT;
@@ -1891,13 +1891,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         */
        clear_thread_flag(TIF_SIGPENDING);
-        /*
-         * lock_kernel() because format_corename() is controlled by sysctl, which
-         * uses lock_kernel()
-         */
-        lock_kernel();
        ispipe = format_corename(corename, signr);
-        unlock_kernel();
        if (ispipe) {
                int dump_count;
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f45..e8c6ba0e4a3e 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
 config EXT3_DEFAULTS_TO_ORDERED
        bool "Default to 'data=ordered' in ext3"
        depends on EXT3_FS
+        default y
        help
          The journal mode options for ext3 have different tradeoffs
          between when data is guaranteed to be on disk and
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2a..001eb0e2d48e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,9 +1149,25 @@ static int walk_page_buffers(	handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
                                        struct buffer_head *bh)
 {
+        int dirty = buffer_dirty(bh);
+        int ret;
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
-        return ext3_journal_get_write_access(handle, bh);
+        /*
+         * __block_prepare_write() could have dirtied some buffers. Clean
+         * the dirty bit as ext3_journal_get_write_access() could complain
+         * otherwise about fs integrity issues. Setting of the dirty bit
+         * by __block_prepare_write() isn't a real problem here as we clear
+         * the bit before releasing a page lock and thus writeback cannot
+         * ever write the buffer.
+         */
+        if (dirty)
+                clear_buffer_dirty(bh);
+        ret = ext3_journal_get_write_access(handle, bh);
+        if (!ret && dirty)
+                ret = ext3_journal_dirty_metadata(handle, bh);
+        return ret;
 }
 /*
@@ -1625,10 +1641,7 @@ static int ext3_writeback_writepage(struct page *page,
                goto out_fail;
        }
-        if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
+        ret = block_write_full_page(page, ext3_get_block, wbc);
-                ret = nobh_writepage(page, ext3_get_block, wbc);
-        else
-                ret = block_write_full_page(page, ext3_get_block, wbc);
        err = ext3_journal_stop(handle);
        if (!ret)
@@ -1922,17 +1935,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-        /*
-         * For "nobh" option,  we can only work if we don't need to
-         * read-in the page - otherwise we create buffers to do the IO.
-         */
-        if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-             ext3_should_writeback_data(inode) && PageUptodate(page)) {
-                zero_user(page, offset, length);
-                set_page_dirty(page);
-                goto unlock;
-        }
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
@@ -2284,27 +2286,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
                                           depth);
                        /*
-                         * We've probably journalled the indirect block several
-                         * times during the truncate.  But it's no longer
-                         * needed and we now drop it from the transaction via
-                         * journal_revoke().
-                         *
-                         * That's easy if it's exclusively part of this
-                         * transaction.  But if it's part of the committing
-                         * transaction then journal_forget() will simply
-                         * brelse() it.  That means that if the underlying
-                         * block is reallocated in ext3_get_block(),
-                         * unmap_underlying_metadata() will find this block
-                         * and will try to get rid of it.  damn, damn.
-                         *
-                         * If this block has already been committed to the
-                         * journal, a revoke record will be written.  And
-                         * revoke records must be emitted *before* clearing
-                         * this block's bit in the bitmaps.
-                         */
-                        ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-                        /*
                         * Everything below this this pointer has been
                         * released.  Now let this top-of-subtree go.
                         *
@@ -2327,6 +2308,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
                                truncate_restart_transaction(handle, inode);
                        }
+                        /*
+                         * We've probably journalled the indirect block several
+                         * times during the truncate.  But it's no longer
+                         * needed and we now drop it from the transaction via
+                         * journal_revoke().
+                         *
+                         * That's easy if it's exclusively part of this
+                         * transaction.  But if it's part of the committing
+                         * transaction then journal_forget() will simply
+                         * brelse() it.  That means that if the underlying
+                         * block is reallocated in ext3_get_block(),
+                         * unmap_underlying_metadata() will find this block
+                         * and will try to get rid of it.  damn, damn. Thus
+                         * we don't allow a block to be reallocated until
+                         * a transaction freeing it has fully committed.
+                         *
+                         * We also have to make sure journal replay after a
+                         * crash does not overwrite non-journaled data blocks
+                         * with old metadata when the block got reallocated for
+                         * data.  Thus we have to store a revoke record for a
+                         * block in the same transaction in which we free the
+                         * block.
+                         */
+                        ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
                        ext3_free_blocks(handle, inode, nr, 1);
                        if (parent_bh) {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca42..2b35ddb70d65 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
        struct inode *inode)
 {
        struct inode *dir = dentry->d_parent->d_inode;
-        unsigned long offset;
        struct buffer_head * bh;
        struct ext3_dir_entry_2 *de;
        struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
                ext3_mark_inode_dirty(handle, dir);
        }
        blocks = dir->i_size >> sb->s_blocksize_bits;
-        for (block = 0, offset = 0; block < blocks; block++) {
+        for (block = 0; block < blocks; block++) {
                bh = ext3_bread(handle, dir, block, 0, &retval);
                if(!bh)
                        return retval;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef9..0ccd7b12b73c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                      ext3_fsblk_t n_blocks_count)
 {
        ext3_fsblk_t o_blocks_count;
-        unsigned long o_groups_count;
        ext3_grpblk_t last;
        ext3_grpblk_t add;
        struct buffer_head * bh;
@@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
         * yet: we're going to revalidate es->s_blocks_count after
         * taking the s_resize_lock below. */
        o_blocks_count = le32_to_cpu(es->s_blocks_count);
-        o_groups_count = EXT3_SB(sb)->s_groups_count;
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e7..9650a956fd0e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
         */
        seq_puts(seq, ",barrier=");
        seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
-        if (test_opt(sb, NOBH))
-                seq_puts(seq, ",nobh");
        seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
        if (test_opt(sb, DATA_ERR_ABORT))
                seq_puts(seq, ",data_err=abort");
@@ -1255,10 +1252,12 @@ set_qf_format:
                        *n_blocks_count = option;
                        break;
                case Opt_nobh:
-                        set_opt(sbi->s_mount_opt, NOBH);
+                        ext3_msg(sb, KERN_WARNING,
+                                "warning: ignoring deprecated nobh option");
                        break;
                case Opt_bh:
-                        clear_opt(sbi->s_mount_opt, NOBH);
+                        ext3_msg(sb, KERN_WARNING,
+                                "warning: ignoring deprecated bh option");
                        break;
                default:
                        ext3_msg(sb, KERN_ERR,
@@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                break;
        }
-        if (test_opt(sb, NOBH)) {
-                if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
-                        ext3_msg(sb, KERN_WARNING,
-                                "warning: ignoring nobh option - "
-                                "it is supported only with writeback mode");
-                        clear_opt(sbi->s_mount_opt, NOBH);
-                }
-        }
        /*
         * The journal_load will have done any necessary log recovery,
         * so we can safely mount the rest of the filesystem now.
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index feaf498feaa6..5e2ed4504ead 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
                                return error;
                        else {
                                inode->i_mode = mode;
+                                inode->i_ctime = ext4_current_time(inode);
                                ext4_mark_inode_dirty(handle, inode);
                                if (error == 0)
                                        acl = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 95b7594c76f9..bd30799a43ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
        ext4_grpblk_t bit;
        unsigned int i;
        struct ext4_group_desc *desc;
-        struct ext4_super_block *es;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct ext4_sb_info *sbi;
        int err = 0, ret, blk_free_count;
        ext4_grpblk_t blocks_freed;
        struct ext4_group_info *grp;
-        sbi = EXT4_SB(sb);
-        es = sbi->s_es;
        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
        if (!err)
                err = ret;
-        sb->s_dirt = 1;
 error_return:
        brelse(bitmap_bh);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5b6973fbf1bd..3db5084db9bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
        if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (start_blk + count < start_blk) ||
-            (start_blk + count > ext4_blocks_count(sbi->s_es)))
+            (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+                sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
                return 0;
+        }
        while (n) {
                entry = rb_entry(n, struct ext4_system_zone, node);
                if (start_blk + count - 1 < entry->start_blk)
                        n = n->rb_left;
                else if (start_blk >= (entry->start_blk + entry->count))
                        n = n->rb_right;
-                else
+                else {
+                        sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
                        return 0;
+                }
        }
        return 1;
 }
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ea5e6cb7e2a5..374510f72baa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 }
-int ext4_check_dir_entry(const char *function, struct inode *dir,
+int __ext4_check_dir_entry(const char *function, unsigned int line,
-                         struct ext4_dir_entry_2 *de,
+                           struct inode *dir,
-                         struct buffer_head *bh,
+                           struct ext4_dir_entry_2 *de,
-                         unsigned int offset)
+                           struct buffer_head *bh,
+                           unsigned int offset)
 {
        const char *error_msg = NULL;
        const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                error_msg = "inode out of bounds";
        if (error_msg != NULL)
-                ext4_error_inode(function, dir,
+                ext4_error_inode(dir, function, line, bh->b_blocknr,
-                        "bad entry in directory: %s - block=%llu"
+                        "bad entry in directory: %s - "
                        "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-                        error_msg, (unsigned long long) bh->b_blocknr,
+                        error_msg, (unsigned) (offset%bh->b_size), offset,
-                        (unsigned) (offset%bh->b_size), offset,
                        le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
@@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp,
                 * We don't set the inode dirty flag since it's not
                 * critical that it get flushed back to the disk.
                 */
-                ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
+                ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+                                      EXT4_INODE_INDEX);
        }
        stored = 0;
        offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -193,7 +194,7 @@ revalidate:
                while (!error && filp->f_pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-                        if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+                        if (!ext4_check_dir_entry(inode, de,
                                                  bh, offset)) {
                                /*
                                 * On error, skip the f_pos to the next block
@@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
        struct dir_private_info *info;
        int len;
-        info = (struct dir_private_info *) dir_file->private_data;
+        info = dir_file->private_data;
        p = &info->root.rb_node;
        /* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de57128a..e03841d9f30b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,10 +57,13 @@
 #endif
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
-        ext4_error_inode(__func__, (inode), (fmt), ## a)
+        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...)                 \
+        ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
 #define EXT4_ERROR_FILE(file, fmt, a...)        \
-        ext4_error_file(__func__, (file), (fmt), ## a)
+        ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -167,13 +170,15 @@ struct mpage_da_data {
 };
 #define EXT4_IO_UNWRITTEN       0x1
 typedef struct ext4_io_end {
-        struct list_head        list;           /* per-file finished AIO list */
+        struct list_head        list;           /* per-file finished IO list */
        struct inode            *inode;         /* file being written to */
        unsigned int            flag;           /* unwritten or not */
        struct page             *page;          /* page struct for buffer write */
        loff_t                  offset;         /* offset in the file */
        ssize_t                 size;           /* size of the extent */
        struct work_struct      work;           /* data work queue */
+        struct kiocb            *iocb;          /* iocb struct for AIO */
+        int                     result;         /* error value for AIO */
 } ext4_io_end_t;
 /*
@@ -460,7 +465,7 @@ struct ext4_new_group_data {
 };
 /*
- * Flags used by ext4_get_blocks()
+ * Flags used by ext4_map_blocks()
 */
        /* Allocate any needed blocks and/or convert an unitialized
           extent to be an initialized ext4 */
@@ -873,7 +878,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_POSIX_ACL            0x08000 /* POSIX Access Control Lists */
 #define EXT4_MOUNT_NO_AUTO_DA_ALLOC     0x10000 /* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER              0x20000 /* Use block barriers */
-#define EXT4_MOUNT_NOBH                 0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA                0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA             0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA             0x200000 /* "old" group quota */
@@ -982,7 +986,7 @@ struct ext4_super_block {
        __le32  s_last_orphan;          /* start of list of inodes to delete */
        __le32  s_hash_seed[4];         /* HTREE hash seed */
        __u8    s_def_hash_version;     /* Default hash version to use */
-        __u8    s_reserved_char_pad;
+        __u8    s_jnl_backup_type;
        __le16  s_desc_size;            /* size of group descriptor */
 /*100*/ __le32  s_default_mount_opts;
        __le32  s_first_meta_bg;        /* First metablock block group */
@@ -1000,12 +1004,34 @@ struct ext4_super_block {
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
-        __u8    s_reserved_char_pad2;
+        __u8    s_reserved_char_pad;
        __le16  s_reserved_pad;
        __le64  s_kbytes_written;       /* nr of lifetime kilobytes written */
-        __u32   s_reserved[160];        /* Padding to the end of the block */
+        __le32  s_snapshot_inum;        /* Inode number of active snapshot */
+        __le32  s_snapshot_id;          /* sequential ID of active snapshot */
+        __le64  s_snapshot_r_blocks_count; /* reserved blocks for active
+                                              snapshot's future use */
+        __le32  s_snapshot_list;        /* inode number of the head of the
+                                           on-disk snapshot list */
+#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
+        __le32  s_error_count;          /* number of fs errors */
+        __le32  s_first_error_time;     /* first time an error happened */
+        __le32  s_first_error_ino;      /* inode involved in first error */
+        __le64  s_first_error_block;    /* block involved in first error */
+        __u8    s_first_error_func[32]; /* function where the error happened */
+        __le32  s_first_error_line;     /* line number where error happened */
+        __le32  s_last_error_time;      /* most recent time of an error */
+        __le32  s_last_error_ino;       /* inode involved in last error */
+        __le32  s_last_error_line;      /* line number where error happened */
+        __le64  s_last_error_block;     /* block involved in last error */
+        __u8    s_last_error_func[32];  /* function where the error happened */
+#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
+        __u8    s_mount_opts[64];
+        __le32  s_reserved[112];        /* Padding to the end of the block */
 };
+#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
 #ifdef __KERNEL__
 /*
@@ -1143,6 +1169,9 @@ struct ext4_sb_info {
        /* workqueue for dio unwritten */
        struct workqueue_struct *dio_unwritten_wq;
+        /* timer for periodic error stats printing */
+        struct timer_list s_err_report;
 };
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1313,6 +1342,10 @@ EXT4_INODE_BIT_FNS(state, state_flags)
 #define EXT4_DEFM_JMODE_DATA    0x0020
 #define EXT4_DEFM_JMODE_ORDERED 0x0040
 #define EXT4_DEFM_JMODE_WBACK   0x0060
+#define EXT4_DEFM_NOBARRIER     0x0100
+#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
+#define EXT4_DEFM_DISCARD       0x0400
+#define EXT4_DEFM_NODELALLOC    0x0800
 /*
 * Default journal batch times
@@ -1379,6 +1412,43 @@ struct ext4_dir_entry_2 {
 #define EXT4_MAX_REC_LEN                ((1<<16)-1)
 /*
+ * If we ever get support for fs block sizes > page_size, we'll need
+ * to remove the #if statements in the next two functions...
+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+        unsigned len = le16_to_cpu(dlen);
+#if (PAGE_CACHE_SIZE >= 65536)
+        if (len == EXT4_MAX_REC_LEN || len == 0)
+                return blocksize;
+        return (len & 65532) | ((len & 3) << 16);
+#else
+        return len;
+#endif
+}
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+        if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+                BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+        if (len < 65536)
+                return cpu_to_le16(len);
+        if (len == blocksize) {
+                if (blocksize == 65536)
+                        return cpu_to_le16(EXT4_MAX_REC_LEN);
+                else
+                        return cpu_to_le16(0);
+        }
+        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+#else
+        return cpu_to_le16(len);
+#endif
+}
+/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */
@@ -1510,9 +1580,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
                ext4_init_block_bitmap(sb, NULL, group, desc)
 /* dir.c */
-extern int ext4_check_dir_entry(const char *, struct inode *,
+extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
-                                struct ext4_dir_entry_2 *,
+                                  struct ext4_dir_entry_2 *,
-                                struct buffer_head *, unsigned int);
+                                  struct buffer_head *, unsigned int);
+#define ext4_check_dir_entry(dir, de, bh, offset) \
+        __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
@@ -1601,8 +1673,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern int ext4_ext_migrate(struct inode *);
 /* namei.c */
-extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
-extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1616,25 +1686,38 @@ extern int ext4_group_extend(struct super_block *sb,
                                ext4_fsblk_t n_blocks_count);
 /* super.c */
-extern void __ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, unsigned int,
-        __attribute__ ((format (printf, 3, 4)));
+                         const char *, ...)
-#define ext4_error(sb, message...)      __ext4_error(sb, __func__, ## message)
+        __attribute__ ((format (printf, 4, 5)));
-extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+#define ext4_error(sb, message...)      __ext4_error(sb, __func__,      \
-        __attribute__ ((format (printf, 3, 4)));
+                                                     __LINE__, ## message)
-extern void ext4_error_file(const char *, struct file *, const char *, ...)
+extern void ext4_error_inode(struct inode *, const char *, unsigned int,
-        __attribute__ ((format (printf, 3, 4)));
+                             ext4_fsblk_t, const char *, ...)
-extern void __ext4_std_error(struct super_block *, const char *, int);
+        __attribute__ ((format (printf, 5, 6)));
-extern void ext4_abort(struct super_block *, const char *, const char *, ...)
+extern void ext4_error_file(struct file *, const char *, unsigned int,
-        __attribute__ ((format (printf, 3, 4)));
+                            const char *, ...)
-extern void __ext4_warning(struct super_block *, const char *,
+        __attribute__ ((format (printf, 4, 5)));
+extern void __ext4_std_error(struct super_block *, const char *,
+                             unsigned int, int);
+extern void __ext4_abort(struct super_block *, const char *, unsigned int,
+                       const char *, ...)
+        __attribute__ ((format (printf, 4, 5)));
+#define ext4_abort(sb, message...)      __ext4_abort(sb, __func__, \
+                                                       __LINE__, ## message)
+extern void __ext4_warning(struct super_block *, const char *, unsigned int,
                          const char *, ...)
-        __attribute__ ((format (printf, 3, 4)));
+        __attribute__ ((format (printf, 4, 5)));
-#define ext4_warning(sb, message...)    __ext4_warning(sb, __func__, ## message)
+#define ext4_warning(sb, message...)    __ext4_warning(sb, __func__, \
+                                                       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
-extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+extern void __ext4_grp_locked_error(const char *, unsigned int, \
-                                const char *, const char *, ...)
+                                    struct super_block *, ext4_group_t, \
-        __attribute__ ((format (printf, 4, 5)));
+                                    unsigned long, ext4_fsblk_t, \
+                                    const char *, ...)
+        __attribute__ ((format (printf, 7, 8)));
+#define ext4_grp_locked_error(sb, grp, message...) \
+        __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
                                        __u32 compat);
@@ -1768,7 +1851,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
 #define ext4_std_error(sb, errno)                               \
 do {                                                            \
        if ((errno))                                            \
-                __ext4_std_error((sb), __func__, (errno));      \
+                __ext4_std_error((sb), __func__, __LINE__, (errno));    \
 } while (0)
 #ifdef CONFIG_SMP
@@ -1860,6 +1943,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
        spin_unlock(ext4_group_lock_ptr(sb, group));
 }
+static inline void ext4_mark_super_dirty(struct super_block *sb)
+{
+        if (EXT4_SB(sb)->s_journal == NULL)
+                sb->s_dirt =1;
+}
 /*
 * Inodes and files operations
 */
@@ -1905,9 +1994,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                          ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
-extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
-                           sector_t block, unsigned int max_blocks,
-                           struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
 /* move_extent.c */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71ca..6e272ef6ba96 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
 #include <trace/events/ext4.h>
-int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
+int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-                                struct buffer_head *bh)
+                                   handle_t *handle, struct buffer_head *bh)
 {
        int err = 0;
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_get_undo_access(handle, bh);
                if (err)
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
        }
        return err;
 }
-int __ext4_journal_get_write_access(const char *where, handle_t *handle,
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
-                                struct buffer_head *bh)
+                                    handle_t *handle, struct buffer_head *bh)
 {
        int err = 0;
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_get_write_access(handle, bh);
                if (err)
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
        }
        return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 * If the handle isn't valid we're not journaling, but we still need to
 * call into ext4_journal_revoke() to put the buffer head.
 */
-int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
-                  struct inode *inode, struct buffer_head *bh,
+                  int is_metadata, struct inode *inode,
-                  ext4_fsblk_t blocknr)
+                  struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
        int err;
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
                        err = jbd2_journal_forget(handle, bh);
                        if (err)
-                                ext4_journal_abort_handle(where, __func__, bh,
+                                ext4_journal_abort_handle(where, line, __func__,
-                                                          handle, err);
+                                                          bh, handle, err);
                        return err;
                }
                return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
        err = jbd2_journal_revoke(handle, blocknr, bh);
        if (err) {
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+                ext4_journal_abort_handle(where, line, __func__,
-                ext4_abort(inode->i_sb, __func__,
+                                          bh, handle, err);
+                __ext4_abort(inode->i_sb, where, line,
                           "error %d when attempting revoke", err);
        }
        BUFFER_TRACE(bh, "exit");
        return err;
 }
-int __ext4_journal_get_create_access(const char *where,
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct buffer_head *bh)
 {
        int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_get_create_access(handle, bh);
                if (err)
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        ext4_journal_abort_handle(where, line, __func__,
-                                                  handle, err);
+                                                  bh, handle, err);
        }
        return err;
 }
-int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
-                                 struct inode *inode, struct buffer_head *bh)
+                                 handle_t *handle, struct inode *inode,
+                                 struct buffer_head *bh)
 {
        int err = 0;
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
                if (err)
-                        ext4_journal_abort_handle(where, __func__, bh,
+                        ext4_journal_abort_handle(where, line, __func__,
-                                                  handle, err);
+                                                  bh, handle, err);
        } else {
                if (inode)
                        mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
                if (inode && inode_needs_sync(inode)) {
                        sync_dirty_buffer(bh);
                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                                ext4_error(inode->i_sb,
+                                struct ext4_super_block *es;
-                                           "IO error syncing inode, "
-                                           "inode=%lu, block=%llu",
+                                es = EXT4_SB(inode->i_sb)->s_es;
-                                           inode->i_ino,
+                                es->s_last_error_block =
-                                           (unsigned long long) bh->b_blocknr);
+                                        cpu_to_le64(bh->b_blocknr);
+                                ext4_error_inode(inode, where, line,
+                                                 bh->b_blocknr,
+                                        "IO error syncing itable block");
                                err = -EIO;
                        }
                }
        }
        return err;
 }
+/*
+ * Mark the superblock buffer (EXT4_SB(sb)->s_sbh) dirty.
+ *
+ * When ext4_handle_valid(handle) is true the buffer is handed to
+ * jbd2_journal_dirty_metadata(); if that fails, the error is passed to
+ * ext4_journal_abort_handle() together with the caller's @where and
+ * @line, and is then returned.  Otherwise sb->s_dirt is set and 0 is
+ * returned.
+ *
+ * Callers normally reach this through the ext4_handle_dirty_super()
+ * macro, which supplies __func__ and __LINE__ for @where and @line.
+ */
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+                              handle_t *handle, struct super_block *sb)
+{
+        struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
+        int err = 0;
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_dirty_metadata(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, line, __func__,
+                                                  bh, handle, err);
+        } else
+                sb->s_dirt = 1;
+        return err;
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index dade0c024797..b0bd792c58c5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 /*
 * Wrapper functions with which ext4 calls into JBD.
 */
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+                               const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
-int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
+int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-                                struct buffer_head *bh);
+                                   handle_t *handle, struct buffer_head *bh);
-int __ext4_journal_get_write_access(const char *where, handle_t *handle,
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
-                                struct buffer_head *bh);
+                                    handle_t *handle, struct buffer_head *bh);
-int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
-                  struct inode *inode, struct buffer_head *bh,
+                  int is_metadata, struct inode *inode,
-                  ext4_fsblk_t blocknr);
+                  struct buffer_head *bh, ext4_fsblk_t blocknr);
-int __ext4_journal_get_create_access(const char *where,
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct buffer_head *bh);
-int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
-                                 struct inode *inode, struct buffer_head *bh);
+                                 handle_t *handle, struct inode *inode,
+                                 struct buffer_head *bh);
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+                              handle_t *handle, struct super_block *sb);
 #define ext4_journal_get_undo_access(handle, bh) \
-        __ext4_journal_get_undo_access(__func__, (handle), (bh))
+        __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-        __ext4_journal_get_write_access(__func__, (handle), (bh))
+        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
-        __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
-                      (block_nr))
+                      (bh), (block_nr))
 #define ext4_journal_get_create_access(handle, bh) \
-        __ext4_journal_get_create_access(__func__, (handle), (bh))
+        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
-        __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
+        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
+                                     (bh))
+#define ext4_handle_dirty_super(handle, sb) \
+        __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
-int __ext4_journal_stop(const char *where, handle_t *handle);
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 #define ext4_journal_stop(handle) \
-        __ext4_journal_stop(__func__, (handle))
+        __ext4_journal_stop(__func__, __LINE__, (handle))
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 * This function controls whether or not we should try to go down the
 * dioread_nolock code paths, which makes it safe to avoid taking
 * i_mutex for direct I/O reads.  This only works for extent-based
- * files, and it doesn't work for nobh or if data journaling is
+ * files, and it doesn't work if data journaling is enabled, since the
- * enabled, since the dioread_nolock code uses b_private to pass
+ * dioread_nolock code uses b_private to pass information back to the
- * information back to the I/O completion handler, and this conflicts
+ * I/O completion handler, and this conflicts with the jbd's use of
- * with the jbd's use of b_private.
+ * b_private.
 */
 static inline int ext4_should_dioread_nolock(struct inode *inode)
 {
        if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
                return 0;
-        if (test_opt(inode->i_sb, NOBH))
-                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 377309c1af65..06328d3e5717 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -401,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
        return 1;
 }
-static int __ext4_ext_check(const char *function, struct inode *inode,
+static int __ext4_ext_check(const char *function, unsigned int line,
-                                        struct ext4_extent_header *eh,
+                            struct inode *inode, struct ext4_extent_header *eh,
-                                        int depth)
+                            int depth)
 {
        const char *error_msg;
        int max = 0;
@@ -436,7 +436,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
        return 0;
 corrupted:
-        ext4_error_inode(function, inode,
+        ext4_error_inode(inode, function, line, 0,
                        "bad header/extent: %s - magic %x, "
                        "entries %u, max %u(%u), depth %u(%u)",
                        error_msg, le16_to_cpu(eh->eh_magic),
@@ -447,7 +447,7 @@ corrupted:
 }
 #define ext4_ext_check(inode, eh, depth)        \
-        __ext4_ext_check(__func__, inode, eh, depth)
+        __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
 int ext4_ext_check_inode(struct inode *inode)
 {
@@ -1083,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 {
        struct ext4_ext_path *curp = path;
        struct ext4_extent_header *neh;
-        struct ext4_extent_idx *fidx;
        struct buffer_head *bh;
        ext4_fsblk_t newblock;
        int err = 0;
@@ -1144,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext4_idx_store_pblock(curp->p_idx, newblock);
        neh = ext_inode_hdr(inode);
-        fidx = EXT_FIRST_INDEX(neh);
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
-                  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
+                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
+                  idx_pblock(EXT_FIRST_INDEX(neh)));
        neh->eh_depth = cpu_to_le16(path->p_depth + 1);
        err = ext4_ext_dirty(handle, inode, curp);
@@ -2937,7 +2936,7 @@ fix_extent_len:
 * One of more index blocks maybe needed if the extent tree grow after
 * the unintialized extent split. To prevent ENOSPC occur at the IO
 * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitilized extent called at this time will be split
+ * the IO. The uninitialized extent called at this time will be split
 * into three uninitialized extent(at most). After IO complete, the part
 * being filled will be convert to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
@@ -2954,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        struct ext4_extent *ex1 = NULL;
        struct ext4_extent *ex2 = NULL;
        struct ext4_extent *ex3 = NULL;
-        struct ext4_extent_header *eh;
        ext4_lblk_t ee_block, eof_block;
        unsigned int allocated, ee_len, depth;
        ext4_fsblk_t newblock;
@@ -2971,7 +2969,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                eof_block = map->m_lblk + map->m_len;
        depth = ext_depth(inode);
-        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
@@ -3058,7 +3055,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                        err = PTR_ERR(path);
                        goto out;
                }
-                eh = path[depth].p_hdr;
                ex = path[depth].p_ext;
                if (ex2 != &newex)
                        ex2 = ex;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5313ae4cda2d..ee92b66d4558 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                size_t length = iov_length(iov, nr_segs);
-                if (pos > sbi->s_bitmap_maxbytes)
+                if ((pos > sbi->s_bitmap_maxbytes ||
+                    (pos == sbi->s_bitmap_maxbytes && length > 0)))
                        return -EFBIG;
                if (pos + length > sbi->s_bitmap_maxbytes) {
@@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
                if (!IS_ERR(cp)) {
                        memcpy(sbi->s_es->s_last_mounted, cp,
                               sizeof(sbi->s_es->s_last_mounted));
-                        sb->s_dirt = 1;
+                        ext4_mark_super_dirty(sb);
                }
        }
        return dquot_file_open(inode, filp);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25c4b3173fd9..ac377505ed57 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -279,7 +279,7 @@ out:
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!fatal)
                        fatal = err;
-                sb->s_dirt = 1;
+                ext4_mark_super_dirty(sb);
        } else
                ext4_error(sb, "bit already cleared for inode %lu", ino);
@@ -965,7 +965,7 @@ got:
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
                percpu_counter_inc(&sbi->s_dirs_counter);
-        sb->s_dirt = 1;
+        ext4_mark_super_dirty(sb);
        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 42272d67955a..a0ab3754d0d6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -221,6 +221,7 @@ void ext4_delete_inode(struct inode *inode)
                                     "couldn't extend journal (err %d)", err);
                stop_handle:
                        ext4_journal_stop(handle);
+                        ext4_orphan_del(NULL, inode);
                        goto no_delete;
                }
        }
@@ -337,9 +338,11 @@ static int ext4_block_to_path(struct inode *inode,
        return n;
 }
-static int __ext4_check_blockref(const char *function, struct inode *inode,
+static int __ext4_check_blockref(const char *function, unsigned int line,
+                                 struct inode *inode,
                                 __le32 *p, unsigned int max)
 {
+        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        __le32 *bref = p;
        unsigned int blk;
@@ -348,8 +351,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
-                        ext4_error_inode(function, inode,
+                        es->s_last_error_block = cpu_to_le64(blk);
-                                         "invalid block reference %u", blk);
+                        ext4_error_inode(inode, function, line, blk,
+                                         "invalid block");
                        return -EIO;
                }
        }
@@ -358,11 +362,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 #define ext4_check_indirect_blockref(inode, bh)                         \
-        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+        __ext4_check_blockref(__func__, __LINE__, inode,                \
+                              (__le32 *)(bh)->b_data,                   \
                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 #define ext4_check_inode_blockref(inode)                                \
-        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+        __ext4_check_blockref(__func__, __LINE__, inode,                \
+                              EXT4_I(inode)->i_data,                    \
                              EXT4_NDIR_BLOCKS)
 /**
@@ -1128,20 +1134,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
                ext4_discard_preallocations(inode);
 }
-static int check_block_validity(struct inode *inode, const char *func,
+static int __check_block_validity(struct inode *inode, const char *func,
+                                unsigned int line,
                                struct ext4_map_blocks *map)
 {
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
                                   map->m_len)) {
-                ext4_error_inode(func, inode,
+                ext4_error_inode(inode, func, line, map->m_pblk,
-                           "lblock %lu mapped to illegal pblock %llu "
+                                 "lblock %lu mapped to illegal pblock "
-                           "(length %d)", (unsigned long) map->m_lblk,
+                                 "(length %d)", (unsigned long) map->m_lblk,
-                                 map->m_pblk, map->m_len);
+                                 map->m_len);
                return -EIO;
        }
        return 0;
 }
+#define check_block_validity(inode, map)        \
+        __check_block_validity((inode), __func__, __LINE__, (map))
 /*
 * Return the number of contiguous dirty pages in a given inode
 * starting at page frame idx.
@@ -1244,7 +1254,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
        up_read((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode, __func__, map);
+                int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -1324,9 +1334,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode,
+                int ret = check_block_validity(inode, map);
-                                               "ext4_map_blocks_after_alloc",
-                                               map);
                if (ret != 0)
                        return ret;
        }
@@ -1519,9 +1527,25 @@ static int walk_page_buffers(handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
                                       struct buffer_head *bh)
 {
+        int dirty = buffer_dirty(bh);
+        int ret;
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
-        return ext4_journal_get_write_access(handle, bh);
+        /*
+         * __block_prepare_write() could have dirtied some buffers. Clean
+         * the dirty bit as jbd2_journal_get_write_access() could complain
+         * otherwise about fs integrity issues. Setting of the dirty bit
+         * by __block_prepare_write() isn't a real problem here as we clear
+         * the bit before releasing a page lock and thus writeback cannot
+         * ever write the buffer.
+         */
+        if (dirty)
+                clear_buffer_dirty(bh);
+        ret = ext4_journal_get_write_access(handle, bh);
+        if (!ret && dirty)
+                ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+        return ret;
 }
 /*
@@ -2194,7 +2218,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
        BUG_ON(!handle);
        /*
-         * Call ext4_get_blocks() to allocate any delayed allocation
+         * Call ext4_map_blocks() to allocate any delayed allocation
         * blocks, or to convert an uninitialized extent to be
         * initialized (in the case where we have written into
         * one or more preallocated blocks).
@@ -2203,7 +2227,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         * indicate that we are on the delayed allocation path.  This
         * affects functions in many different parts of the allocation
         * call path.  This flag exists primarily because we don't
-         * want to change *many* call functions, so ext4_get_blocks()
+         * want to change *many* call functions, so ext4_map_blocks()
         * will set the magic i_delalloc_reserved_flag once the
         * inode's allocation semaphore is taken.
         *
@@ -2221,6 +2245,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
        if (blks < 0) {
+                struct super_block *sb = mpd->inode->i_sb;
                err = blks;
                /*
                 * If get block returns with error we simply
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                        return 0;
                if (err == -ENOSPC &&
-                    ext4_count_free_blocks(mpd->inode->i_sb)) {
+                    ext4_count_free_blocks(sb)) {
                        mpd->retval = err;
                        return 0;
                }
@@ -2243,16 +2269,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                 * writepage and writepages will again try to write
                 * the same.
                 */
-                ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+                if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-                         "delayed block allocation failed for inode %lu at "
+                        ext4_msg(sb, KERN_CRIT,
-                         "logical offset %llu with max blocks %zd with "
+                                 "delayed block allocation failed for inode %lu "
-                         "error %d", mpd->inode->i_ino,
+                                 "at logical offset %llu with max blocks %zd "
-                         (unsigned long long) next,
+                                 "with error %d", mpd->inode->i_ino,
-                         mpd->b_size >> mpd->inode->i_blkbits, err);
+                                 (unsigned long long) next,
-                printk(KERN_CRIT "This should not happen!!  "
+                                 mpd->b_size >> mpd->inode->i_blkbits, err);
-                       "Data will be lost\n");
+                        ext4_msg(sb, KERN_CRIT,
-                if (err == -ENOSPC) {
+                                "This should not happen!! Data will be lost\n");
-                        ext4_print_free_blocks(mpd->inode);
+                        if (err == -ENOSPC)
+                                ext4_print_free_blocks(mpd->inode);
                }
                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
@@ -2320,7 +2347,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
         * XXX Don't go larger than mballoc is willing to allocate
         * This is a stopgap solution.  We eventually need to fold
         * mpage_da_submit_io() into this function and then call
-         * ext4_get_blocks() multiple times in a loop
+         * ext4_map_blocks() multiple times in a loop
         */
        if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
                goto flush_it;
@@ -2553,18 +2580,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 /*
 * This function is used as a standard get_block_t calback function
 * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_prepare_write(), nobh_writepage(), and
+ * callback function for block_prepare_write() and block_write_full_page().
- * block_write_full_page().  These functions should only try to map a
+ * These functions should only try to map a single block at a time.
- * single block at a time.
 *
 * Since this function doesn't do block allocations even if the caller
 * requests it by passing in create=1, it is critically important that
 * any caller checks to make sure that any buffer heads are returned
 * by this function are either all already mapped or marked for
- * delayed allocation before calling nobh_writepage() or
+ * delayed allocation before calling block_write_full_page().  Otherwise,
- * block_write_full_page().  Otherwise, b_blocknr could be left
+ * b_blocknr could be left uninitialized, and the page write functions will
- * unitialized, and the page write functions will be taken by
+ * be taken by surprise.
- * surprise.
 */
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
@@ -2749,9 +2774,7 @@ static int ext4_writepage(struct page *page,
                return __ext4_journalled_writepage(page, len);
        }
-        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+        if (page_bufs && buffer_uninit(page_bufs)) {
-                ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-        else if (page_bufs && buffer_uninit(page_bufs)) {
                ext4_set_bh_endio(page_bufs, inode);
                ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                            wbc, ext4_end_io_buffer_write);
@@ -3146,13 +3169,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        int ret, retries = 0;
        struct page *page;
        pgoff_t index;
-        unsigned from, to;
        struct inode *inode = mapping->host;
        handle_t *handle;
        index = pos >> PAGE_CACHE_SHIFT;
-        from = pos & (PAGE_CACHE_SIZE - 1);
-        to = from + len;
        if (ext4_nonda_switch(inode->i_sb)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3668,6 +3688,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
                return ret;
        }
+        if (io->iocb)
+                aio_complete(io->iocb, io->result, 0);
        /* clear the DIO AIO unwritten flag */
        io->flag = 0;
        return ret;
@@ -3767,6 +3789,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
                io->offset = 0;
                io->size = 0;
                io->page = NULL;
+                io->iocb = NULL;
+                io->result = 0;
                INIT_WORK(&io->work, ext4_end_io_work);
                INIT_LIST_HEAD(&io->list);
        }
@@ -3775,7 +3799,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 }
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                            ssize_t size, void *private)
+                            ssize_t size, void *private, int ret,
+                            bool is_async)
 {
        ext4_io_end_t *io_end = iocb->private;
        struct workqueue_struct *wq;
@@ -3784,7 +3809,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        /* if not async direct IO or dio with 0 bytes write, just return */
        if (!io_end || !size)
-                return;
+                goto out;
        ext_debug("ext4_end_io_dio(): io_end 0x%p"
                  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3795,12 +3820,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
        if (io_end->flag != EXT4_IO_UNWRITTEN){
                ext4_free_io_end(io_end);
                iocb->private = NULL;
+out:
+                if (is_async)
+                        aio_complete(iocb, ret, 0);
                return;
        }
        io_end->offset = offset;
        io_end->size = size;
-        io_end->flag = EXT4_IO_UNWRITTEN;
+        if (is_async) {
+                io_end->iocb = iocb;
+                io_end->result = ret;
+        }
        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
        /* queue the work to convert unwritten extents to written */
@@ -3937,7 +3968,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                return -ENOMEM;
                        /*
                         * we save the io structure for current async
-                         * direct IO, so that later ext4_get_blocks()
+                         * direct IO, so that later ext4_map_blocks()
                         * could flag the io structure whether there
                         * is a unwritten extents needs to be converted
                         * when IO is completed.
@@ -4128,17 +4159,6 @@ int ext4_block_truncate_page(handle_t *handle,
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-        /*
-         * For "nobh" option,  we can only work if we don't need to
-         * read-in the page - otherwise we create buffers to do the IO.
-         */
-        if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
-             ext4_should_writeback_data(inode) && PageUptodate(page)) {
-                zero_user(page, offset, length);
-                set_page_dirty(page);
-                goto unlock;
-        }
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
@@ -4488,9 +4508,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * (should be rare).
                         */
                        if (!bh) {
-                                EXT4_ERROR_INODE(inode,
+                                EXT4_ERROR_INODE_BLOCK(inode, nr,
-                                                 "Read failure block=%llu",
+                                                       "Read failure");
-                                                 (unsigned long long) nr);
                                continue;
                        }
@@ -4502,27 +4521,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                        depth);
                        /*
-                         * We've probably journalled the indirect block several
-                         * times during the truncate.  But it's no longer
-                         * needed and we now drop it from the transaction via
-                         * jbd2_journal_revoke().
-                         *
-                         * That's easy if it's exclusively part of this
-                         * transaction.  But if it's part of the committing
-                         * transaction then jbd2_journal_forget() will simply
-                         * brelse() it.  That means that if the underlying
-                         * block is reallocated in ext4_get_block(),
-                         * unmap_underlying_metadata() will find this block
-                         * and will try to get rid of it.  damn, damn.
-                         *
-                         * If this block has already been committed to the
-                         * journal, a revoke record will be written.  And
-                         * revoke records must be emitted *before* clearing
-                         * this block's bit in the bitmaps.
-                         */
-                        ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
-                        /*
                         * Everything below this this pointer has been
                         * released.  Now let this top-of-subtree go.
                         *
@@ -4546,8 +4544,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                            blocks_for_truncate(inode));
                        }
+                        /*
+                         * The forget flag here is critical because if
+                         * we are journaling (and not doing data
+                         * journaling), we have to make sure a revoke
+                         * record is written to prevent the journal
+                         * replay from overwriting the (former)
+                         * indirect block if it gets reallocated as a
+                         * data block.  This must happen in the same
+                         * transaction where the data blocks are
+                         * actually freed.
+                         */
                        ext4_free_blocks(handle, inode, 0, nr, 1,
-                                         EXT4_FREE_BLOCKS_METADATA);
+                                         EXT4_FREE_BLOCKS_METADATA|
+                                         EXT4_FREE_BLOCKS_FORGET);
                        if (parent_bh) {
                                /*
@@ -4805,8 +4815,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
        bh = sb_getblk(sb, block);
        if (!bh) {
-                EXT4_ERROR_INODE(inode, "unable to read inode block - "
+                EXT4_ERROR_INODE_BLOCK(inode, block,
-                                 "block %llu", block);
+                                       "unable to read itable block");
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
@@ -4904,8 +4914,8 @@ make_io:
                submit_bh(READ_META, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
-                        EXT4_ERROR_INODE(inode, "unable to read inode "
+                        EXT4_ERROR_INODE_BLOCK(inode, block,
-                                         "block %llu", block);
+                                               "unable to read itable block");
                        brelse(bh);
                        return -EIO;
                }
@@ -4976,7 +4986,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                /* we are using combined 48 bit field */
                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                        le32_to_cpu(raw_inode->i_blocks_lo);
-                if (ei->i_flags & EXT4_HUGE_FILE_FL) {
+                if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
                        /* i_blocks represent file system block size */
                        return i_blocks  << (inode->i_blkbits - 9);
                } else {
@@ -5072,7 +5082,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                transaction_t *transaction;
                tid_t tid;
-                spin_lock(&journal->j_state_lock);
+                read_lock(&journal->j_state_lock);
                if (journal->j_running_transaction)
                        transaction = journal->j_running_transaction;
                else
@@ -5081,7 +5091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        tid = transaction->t_tid;
                else
                        tid = journal->j_commit_sequence;
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                ei->i_sync_tid = tid;
                ei->i_datasync_tid = tid;
        }
@@ -5126,7 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                                 ei->i_file_acl);
                ret = -EIO;
                goto bad_inode;
-        } else if (ei->i_flags & EXT4_EXTENTS_FL) {
+        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                    (S_ISLNK(inode->i_mode) &&
                     !ext4_inode_is_fast_symlink(inode)))
@@ -5406,9 +5416,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
                if (wbc->sync_mode == WB_SYNC_ALL)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-                        EXT4_ERROR_INODE(inode,
+                        EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
-                                "IO error syncing inode (block=%llu)",
+                                         "IO error syncing inode");
-                                (unsigned long long) iloc.bh->b_blocknr);
                        err = -EIO;
                }
                brelse(iloc.bh);
@@ -5483,10 +5492,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+                        if (attr->ia_size > sbi->s_bitmap_maxbytes)
-                                error = -EFBIG;
+                                return -EFBIG;
-                                goto err_out;
-                        }
                }
        }
@@ -5688,7 +5695,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do no need to journal data buffers.
@@ -5754,7 +5761,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
-        struct ext4_xattr_entry *entry;
        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
                return 0;
@@ -5762,7 +5768,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
        raw_inode = ext4_raw_inode(&iloc);
        header = IHDR(inode, raw_inode);
-        entry = IFIRST(header);
        /* No extended attributes present */
        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3bc026a68..4b4ad4b7ce57 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += first + i;
                        ext4_grp_locked_error(sb, e4b->bd_group,
-                                   __func__, "double-free of inode"
+                                              inode ? inode->i_ino : 0,
-                                   " %lu's block %llu(bit %u in group %u)",
+                                              blocknr,
-                                   inode ? inode->i_ino : 0, blocknr,
+                                              "freeing block already freed "
-                                   first + i, e4b->bd_group);
+                                              "(bit %u)",
+                                              first + i);
                }
                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
        }
@@ -712,9 +713,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
        grp->bb_fragments = fragments;
        if (free != grp->bb_free) {
-                ext4_grp_locked_error(sb, group,  __func__,
+                ext4_grp_locked_error(sb, group, 0, 0,
-                        "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
+                                      "%u blocks in bitmap, %u in gd",
-                        group, free, grp->bb_free);
+                                      free, grp->bb_free);
                /*
                 * If we intent to continue, we consider group descritor
                 * corrupt and update bb_free using bitmap value
@@ -1296,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += block;
                        ext4_grp_locked_error(sb, e4b->bd_group,
-                                   __func__, "double-free of inode"
+                                              inode ? inode->i_ino : 0,
-                                   " %lu's block %llu(bit %u in group %u)",
+                                              blocknr,
-                                   inode ? inode->i_ino : 0, blocknr, block,
+                                              "freeing already freed block "
-                                   e4b->bd_group);
+                                              "(bit %u)", block);
                }
                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
                e4b->bd_info->bb_counters[order]++;
@@ -1788,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                         * free blocks even though group info says we
                         * we have free blocks
                         */
-                        ext4_grp_locked_error(sb, e4b->bd_group,
+                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-                                        __func__, "%d free blocks as per "
+                                        "%d free blocks as per "
                                        "group info. But bitmap says 0",
                                        free);
                        break;
@@ -1798,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
                BUG_ON(ex.fe_len <= 0);
                if (free < ex.fe_len) {
-                        ext4_grp_locked_error(sb, e4b->bd_group,
+                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-                                        __func__, "%d free blocks as per "
+                                        "%d free blocks as per "
                                        "group info. But got %d blocks",
                                        free, ex.fe_len);
                        /*
@@ -1821,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 /*
 * This is a special case for storages like raid5
- * we try to find stripe-aligned chunks for stripe-size requests
+ * we try to find stripe-aligned chunks for stripe-size-multiple requests
- * XXX should do so at least for multiples of stripe size as well
 */
 static noinline_for_stack
 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1999,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
        ext4_group_t ngroups, group, i;
        int cr;
        int err = 0;
-        int bsbits;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
@@ -2041,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
                        ac->ac_2order = i - 1;
        }
-        bsbits = ac->ac_sb->s_blocksize_bits;
        /* if stream allocation is enabled, use global goal */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                /* TBD: may be hot point */
@@ -2094,8 +2091,8 @@ repeat:
                        ac->ac_groups_scanned++;
                        if (cr == 0)
                                ext4_mb_simple_scan_group(ac, &e4b);
-                        else if (cr == 1 &&
+                        else if (cr == 1 && sbi->s_stripe &&
-                                        ac->ac_g_ex.fe_len == sbi->s_stripe)
+                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
                                ext4_mb_scan_aligned(ac, &e4b);
                        else
                                ext4_mb_complex_scan_group(ac, &e4b);
@@ -2221,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
        rc = seq_open(file, &ext4_mb_seq_groups_ops);
        if (rc == 0) {
-                struct seq_file *m = (struct seq_file *)file->private_data;
+                struct seq_file *m = file->private_data;
                m->private = sb;
        }
        return rc;
@@ -2560,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb)
        return 0;
 }
+/*
+ * Issue a discard request for @count blocks starting at group-relative
+ * block @block of group @block_group.
+ *
+ * The group-relative block is converted to an absolute filesystem block
+ * before being traced and handed to sb_issue_discard().  If the request
+ * is rejected as unsupported, a warning is logged and the DISCARD mount
+ * option is cleared so that callers testing test_opt(sb, DISCARD) stop
+ * issuing further discards.
+ */
+static inline void ext4_issue_discard(struct super_block *sb,
+                ext4_group_t block_group, ext4_grpblk_t block, int count)
+{
+        int ret;
+        ext4_fsblk_t discard_block;
+        discard_block = block + ext4_group_first_block_no(sb, block_group);
+        trace_ext4_discard_blocks(sb,
+                        (unsigned long long) discard_block, count);
+        ret = sb_issue_discard(sb, discard_block, count);
+        /* Errors are reported as negative errno values. */
+        if (ret == -EOPNOTSUPP) {
+                ext4_warning(sb, "discard not supported, disabling");
+                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+        }
+}
 /*
 * This function is called by the jbd2 layer once the commit has finished,
 * so we know we can free the blocks that were released with that commit.
@@ -2579,22 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
-                if (test_opt(sb, DISCARD)) {
+                if (test_opt(sb, DISCARD))
-                        int ret;
+                        ext4_issue_discard(sb, entry->group,
-                        ext4_fsblk_t discard_block;
+                                        entry->start_blk, entry->count);
-                        discard_block = entry->start_blk +
-                                ext4_group_first_block_no(sb, entry->group);
-                        trace_ext4_discard_blocks(sb,
-                                        (unsigned long long)discard_block,
-                                        entry->count);
-                        ret = sb_issue_discard(sb, discard_block, entry->count);
-                        if (ret == EOPNOTSUPP) {
-                                ext4_warning(sb,
-                                        "discard not supported, disabling");
-                                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-                        }
-                }
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -2704,7 +2704,7 @@ void exit_ext4_mballoc(void)
 /*
- * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
 * Returns 0 if success or error code
 */
 static noinline_for_stack int
@@ -2712,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                                handle_t *handle, unsigned int reserv_blks)
 {
        struct buffer_head *bitmap_bh = NULL;
-        struct ext4_super_block *es;
        struct ext4_group_desc *gdp;
        struct buffer_head *gdp_bh;
        struct ext4_sb_info *sbi;
@@ -2725,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
-        es = sbi->s_es;
        err = -EIO;
        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2812,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 out_err:
-        sb->s_dirt = 1;
+        ext4_mark_super_dirty(sb);
        brelse(bitmap_bh);
        return err;
 }
@@ -2850,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        int bsbits, max;
        ext4_lblk_t end;
        loff_t size, orig_size, start_off;
-        ext4_lblk_t start, orig_start;
+        ext4_lblk_t start;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_prealloc_space *pa;
@@ -2881,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        size = size << bsbits;
        if (size < i_size_read(ac->ac_inode))
                size = i_size_read(ac->ac_inode);
+        orig_size = size;
        /* max size of free chunks */
        max = 2 << bsbits;
@@ -2922,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
                size      = ac->ac_o_ex.fe_len << bsbits;
        }
-        orig_size = size = size >> bsbits;
+        size = size >> bsbits;
-        orig_start = start = start_off >> bsbits;
+        start = start_off >> bsbits;
        /* don't cover already allocated blocks in selected range */
        if (ar->pleft && start <= ar->lleft) {
@@ -3547,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
        ext4_group_t group;
        ext4_grpblk_t bit;
        unsigned long long grp_blk_start;
-        sector_t start;
        int err = 0;
        int free = 0;
@@ -3567,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-                start = ext4_group_first_block_no(sb, group) + bit;
                mb_debug(1, "    free preallocated %u/%u in group %u\n",
-                                (unsigned) start, (unsigned) next - bit,
+                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
-                                (unsigned) group);
+                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;
                if (ac) {
@@ -3581,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        trace_ext4_mballoc_discard(ac);
                }
-                trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
+                trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
@@ -3591,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        pa, (unsigned long) pa->pa_lstart,
                        (unsigned long) pa->pa_pstart,
                        (unsigned long) pa->pa_len);
-                ext4_grp_locked_error(sb, group,
+                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
-                                        __func__, "free %u, pa_free %u",
                                        free, pa->pa_free);
                /*
                 * pa is already deleted so we use the value obtained
@@ -3613,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
        ext4_group_t group;
        ext4_grpblk_t bit;
-        trace_ext4_mb_release_group_pa(ac, pa);
+        trace_ext4_mb_release_group_pa(sb, ac, pa);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3889,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
        struct super_block *sb = ac->ac_sb;
        ext4_group_t ngroups, i;
+        if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+                return;
        printk(KERN_ERR "EXT4-fs: Can't allocate:"
                        " Allocation context details:\n");
        printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4255,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 * to usual allocation
 */
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
-                                 struct ext4_allocation_request *ar, int *errp)
+                                struct ext4_allocation_request *ar, int *errp)
 {
        int freed;
        struct ext4_allocation_context *ac = NULL;
@@ -4299,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                inquota = ar->len;
                if (ar->len == 0) {
                        *errp = -EDQUOT;
-                        goto out3;
+                        goto out;
                }
        }
@@ -4307,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        if (!ac) {
                ar->len = 0;
                *errp = -ENOMEM;
-                goto out1;
+                goto out;
        }
        *errp = ext4_mb_initialize_context(ac, ar);
        if (*errp) {
                ar->len = 0;
-                goto out2;
+                goto out;
        }
        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4322,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                ext4_mb_normalize_request(ac, ar);
 repeat:
                /* allocate space in core */
-                ext4_mb_regular_allocator(ac);
+                *errp = ext4_mb_regular_allocator(ac);
+                if (*errp)
+                        goto errout;
                /* as we've just preallocated more space than
                 * user requested orinally, we store allocated
@@ -4333,7 +4333,7 @@ repeat:
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
-                if (*errp ==  -EAGAIN) {
+                if (*errp == -EAGAIN) {
                        /*
                         * drop the reference that we took
                         * in ext4_mb_use_best_found
@@ -4344,12 +4344,10 @@ repeat:
                        ac->ac_b_ex.fe_len = 0;
                        ac->ac_status = AC_STATUS_CONTINUE;
                        goto repeat;
-                } else if (*errp) {
+                } else if (*errp)
+                errout:
                        ext4_discard_allocated_blocks(ac);
-                        ac->ac_b_ex.fe_len = 0;
+                else {
-                        ar->len = 0;
-                        ext4_mb_show_ac(ac);
-                } else {
                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                        ar->len = ac->ac_b_ex.fe_len;
                }
@@ -4358,19 +4356,19 @@ repeat:
                if (freed)
                        goto repeat;
                *errp = -ENOSPC;
+        }
+        if (*errp) {
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }
        ext4_mb_release_context(ac);
+out:
-out2:
+        if (ac)
-        kmem_cache_free(ext4_ac_cachep, ac);
+                kmem_cache_free(ext4_ac_cachep, ac);
-out1:
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, inquota - ar->len);
-out3:
        if (!ar->len) {
                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
                        /* release all the reserved blocks if non delalloc */
@@ -4402,6 +4400,7 @@ static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                      struct ext4_free_data *new_entry)
 {
+        ext4_group_t group = e4b->bd_group;
        ext4_grpblk_t block;
        struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
@@ -4434,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                else if (block >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else {
-                        ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+                        ext4_grp_locked_error(sb, group, 0,
-                                        "Double free of blocks %d (%d %d)",
+                                ext4_group_first_block_no(sb, group) + block,
-                                        block, entry->start_blk, entry->count);
+                                "Block already on to-be-freed list");
                        return 0;
                }
        }
@@ -4494,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        struct super_block *sb = inode->i_sb;
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
-        struct ext4_super_block *es;
        unsigned long freed = 0;
        unsigned int overflow;
        ext4_grpblk_t bit;
@@ -4513,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
        }
        sbi = EXT4_SB(sb);
-        es = EXT4_SB(sb)->s_es;
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_data_block_valid(sbi, block, count)) {
                ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4647,6 +4644,8 @@ do_more:
                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+                if (test_opt(sb, DISCARD))
+                        ext4_issue_discard(sb, block_group, bit, count);
        }
        ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4680,7 +4679,7 @@ do_more:
                put_bh(bitmap_bh);
                goto do_more;
        }
-        sb->s_dirt = 1;
+        ext4_mark_super_dirty(sb);
 error_return:
        if (freed)
                dquot_free_block(inode, freed);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6f3a27ec30bf..1765c2c50a9b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
         * We have the extent map build with the tmp inode.
         * Now copy the i_data across
         */
-        ei->i_flags |= EXT4_EXTENTS_FL;
+        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
        memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
        /*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 52abfa12762a..5f1ed9fc913c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 */
 static int
 mext_check_null_inode(struct inode *inode1, struct inode *inode2,
-                const char *function)
+                      const char *function, unsigned int line)
 {
        int ret = 0;
        if (inode1 == NULL) {
-                __ext4_error(inode2->i_sb, function,
+                __ext4_error(inode2->i_sb, function, line,
                        "Both inodes should not be NULL: "
                        "inode1 NULL inode2 %lu", inode2->i_ino);
                ret = -EIO;
        } else if (inode2 == NULL) {
-                __ext4_error(inode1->i_sb, function,
+                __ext4_error(inode1->i_sb, function, line,
                        "Both inodes should not be NULL: "
                        "inode1 %lu inode2 NULL", inode1->i_ino);
                ret = -EIO;
@@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
        BUG_ON(inode1 == NULL && inode2 == NULL);
-        ret = mext_check_null_inode(inode1, inode2, __func__);
+        ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
        if (ret < 0)
                goto out;
@@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
        BUG_ON(inode1 == NULL && inode2 == NULL);
-        ret = mext_check_null_inode(inode1, inode2, __func__);
+        ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
        if (ret < 0)
                goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a43e6617b351..314c0d3b3fa9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
-unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
-{
-        unsigned len = le16_to_cpu(dlen);
-        if (len == EXT4_MAX_REC_LEN || len == 0)
-                return blocksize;
-        return (len & 65532) | ((len & 3) << 16);
-}
-__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
-{
-        if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
-                BUG();
-        if (len < 65536)
-                return cpu_to_le16(len);
-        if (len == blocksize) {
-                if (blocksize == 65536)
-                        return cpu_to_le16(EXT4_MAX_REC_LEN);
-                else
-                        return cpu_to_le16(0);
-        }
-        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
-}
 /*
 * p is at least 6 bytes before the end of page
 */
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
-                if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
+                if (!ext4_check_dir_entry(dir, de, bh,
                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                                +((char *)de - bh->b_data))) {
                        /* On error, skip the f_pos to the next block. */
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
                if ((char *) de + namelen <= dlimit &&
                    ext4_match (namelen, name, de)) {
                        /* found a match - just to be sure, do a full check */
-                        if (!ext4_check_dir_entry("ext4_find_entry",
+                        if (!ext4_check_dir_entry(dir, de, bh, offset))
-                                                  dir, de, bh, offset))
                                return -1;
                        *res_dir = de;
                        return 1;
@@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
                                  + ((char *) de - bh->b_data);
-                        if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+                        if (!ext4_check_dir_entry(dir, de, bh, off)) {
                                brelse(bh);
                                *err = ERR_BAD_DX_DIR;
                                goto errout;
@@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 struct dentry *ext4_get_parent(struct dentry *child)
 {
        __u32 ino;
-        struct inode *inode;
        static const struct qstr dotdot = {
                .name = "..",
                .len = 2,
@@ -1097,7 +1071,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
        struct buffer_head *bh;
        bh = ext4_find_entry(child->d_inode, &dotdot, &de);
-        inode = NULL;
        if (!bh)
                return ERR_PTR(-ENOENT);
        ino = le32_to_cpu(de->inode);
@@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                de = (struct ext4_dir_entry_2 *)bh->b_data;
                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
-                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
+                        if (!ext4_check_dir_entry(dir, de, bh, offset))
-                                                  bh, offset))
                                return -EIO;
                        if (ext4_match(namelen, name, de))
                                return -EEXIST;
@@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle,
        pde = NULL;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        while (i < bh->b_size) {
-                if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
+                if (!ext4_check_dir_entry(dir, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
                        BUFFER_TRACE(bh, "get_write_access");
@@ -1956,7 +1928,7 @@ static int empty_dir(struct inode *inode)
                        }
                        de = (struct ext4_dir_entry_2 *) bh->b_data;
                }
-                if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+                if (!ext4_check_dir_entry(inode, de, bh, offset)) {
                        de = (struct ext4_dir_entry_2 *)(bh->b_data +
                                                         sb->s_blocksize);
                        offset = (offset | (sb->s_blocksize - 1)) + 1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6df797eb9aeb..ca5c8aa00a2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -921,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                           &sbi->s_flex_groups[flex_group].free_inodes);
        }
-        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+        ext4_handle_dirty_super(handle, sb);
-        sb->s_dirt = 1;
 exit_journal:
        mutex_unlock(&sbi->s_resize_lock);
@@ -953,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                      ext4_fsblk_t n_blocks_count)
 {
        ext4_fsblk_t o_blocks_count;
-        ext4_group_t o_groups_count;
        ext4_grpblk_t last;
        ext4_grpblk_t add;
        struct buffer_head *bh;
@@ -965,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
         * yet: we're going to revalidate es->s_blocks_count after
         * taking the s_resize_lock below. */
        o_blocks_count = ext4_blocks_count(es);
-        o_groups_count = EXT4_SB(sb)->s_groups_count;
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
@@ -1045,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                goto exit_put;
        }
        ext4_blocks_count_set(es, o_blocks_count + add);
-        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
-        sb->s_dirt = 1;
        mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        /* We add the blocks to the bitmap and set the group need init bit */
        ext4_add_groupblocks(handle, sb, o_blocks_count, add);
+        ext4_handle_dirty_super(handle, sb);
        ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        if ((err = ext4_journal_stop(handle)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4e8983a9811b..8d65575f8c8c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,14 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
-        vfs_check_frozen(sb, SB_FREEZE_WRITE);
+        vfs_check_frozen(sb, SB_FREEZE_TRANS);
        /* Special case here: if the journal has aborted behind our
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly. */
        journal = EXT4_SB(sb)->s_journal;
        if (journal) {
                if (is_journal_aborted(journal)) {
-                        ext4_abort(sb, __func__, "Detected aborted journal");
+                        ext4_abort(sb, "Detected aborted journal");
                        return ERR_PTR(-EROFS);
                }
                return jbd2_journal_start(journal, nblocks);
@@ -262,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 * that sync() will call the filesystem's write_super callback if
 * appropriate.
 */
-int __ext4_journal_stop(const char *where, handle_t *handle)
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 {
        struct super_block *sb;
        int err;
@@ -279,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
        if (!err)
                err = rc;
        if (err)
-                __ext4_std_error(sb, where, err);
+                __ext4_std_error(sb, where, line, err);
        return err;
 }
-void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
-                struct buffer_head *bh, handle_t *handle, int err)
+                               const char *err_fn, struct buffer_head *bh,
+                               handle_t *handle, int err)
 {
        char nbuf[16];
        const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -300,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
        if (is_handle_aborted(handle))
                return;
-        printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+        printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
-               caller, errstr, err_fn);
+               caller, line, errstr, err_fn);
        jbd2_journal_abort_handle(handle);
 }
+static void __save_error_info(struct super_block *sb, const char *func,
+                            unsigned int line)
+{
+        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+        es->s_last_error_time = cpu_to_le32(get_seconds());
+        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+        es->s_last_error_line = cpu_to_le32(line);
+        if (!es->s_first_error_time) {
+                es->s_first_error_time = es->s_last_error_time;
+                strncpy(es->s_first_error_func, func,
+                        sizeof(es->s_first_error_func));
+                es->s_first_error_line = cpu_to_le32(line);
+                es->s_first_error_ino = es->s_last_error_ino;
+                es->s_first_error_block = es->s_last_error_block;
+        }
+        /*
+         * Start the daily error reporting function if it hasn't been
+         * started already
+         */
+        if (!es->s_error_count)
+                mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
+        es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+}
+static void save_error_info(struct super_block *sb, const char *func,
+                            unsigned int line)
+{
+        __save_error_info(sb, func, line);
+        ext4_commit_super(sb, 1);
+}
 /* Deal with the reporting of failure conditions on a filesystem such as
 * inconsistencies detected or read IO failures.
 *
@@ -323,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 static void ext4_handle_error(struct super_block *sb)
 {
-        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        if (sb->s_flags & MS_RDONLY)
                return;
@@ -342,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb)
                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
                sb->s_flags |= MS_RDONLY;
        }
-        ext4_commit_super(sb, 1);
        if (test_opt(sb, ERRORS_PANIC))
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
 }
 void __ext4_error(struct super_block *sb, const char *function,
-                const char *fmt, ...)
+                  unsigned int line, const char *fmt, ...)
 {
        va_list args;
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
+               sb->s_id, function, line, current->comm);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
@@ -362,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function,
        ext4_handle_error(sb);
 }
-void ext4_error_inode(const char *function, struct inode *inode,
+void ext4_error_inode(struct inode *inode, const char *function,
+                      unsigned int line, ext4_fsblk_t block,
                      const char *fmt, ...)
 {
        va_list args;
+        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+        es->s_last_error_block = cpu_to_le64(block);
+        save_error_info(inode->i_sb, function, line);
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
-               inode->i_sb->s_id, function, inode->i_ino, current->comm);
+               inode->i_sb->s_id, function, line, inode->i_ino);
+        if (block)
+                printk("block %llu: ", block);
+        printk("comm %s: ", current->comm);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
@@ -377,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
        ext4_handle_error(inode->i_sb);
 }
-void ext4_error_file(const char *function, struct file *file,
+void ext4_error_file(struct file *file, const char *function,
-                     const char *fmt, ...)
+                     unsigned int line, const char *fmt, ...)
 {
        va_list args;
+        struct ext4_super_block *es;
        struct inode *inode = file->f_dentry->d_inode;
        char pathname[80], *path;
+        es = EXT4_SB(inode->i_sb)->s_es;
+        es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+        save_error_info(inode->i_sb, function, line);
        va_start(args, fmt);
        path = d_path(&(file->f_path), pathname, sizeof(pathname));
        if (!path)
                path = "(unknown)";
        printk(KERN_CRIT
-               "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+               "EXT4-fs error (device %s): %s:%d: inode #%lu "
-               inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+               "(comm %s path %s): ",
+               inode->i_sb->s_id, function, line, inode->i_ino,
+               current->comm, path);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
@@ -435,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 /* __ext4_std_error decodes expected errors from journaling functions
 * automatically and invokes the appropriate error response.  */
-void __ext4_std_error(struct super_block *sb, const char *function, int errno)
+void __ext4_std_error(struct super_block *sb, const char *function,
+                      unsigned int line, int errno)
 {
        char nbuf[16];
        const char *errstr;
@@ -448,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
                return;
        errstr = ext4_decode_error(sb, errno, nbuf);
-        printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
+        printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
-               sb->s_id, function, errstr);
+               sb->s_id, function, line, errstr);
+        save_error_info(sb, function, line);
        ext4_handle_error(sb);
 }
@@ -464,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
 * case we take the easy way out and panic immediately.
 */
-void ext4_abort(struct super_block *sb, const char *function,
+void __ext4_abort(struct super_block *sb, const char *function,
-                const char *fmt, ...)
+                unsigned int line, const char *fmt, ...)
 {
        va_list args;
+        save_error_info(sb, function, line);
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
+               function, line);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
+        if ((sb->s_flags & MS_RDONLY) == 0) {
+                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+                sb->s_flags |= MS_RDONLY;
+                EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+                if (EXT4_SB(sb)->s_journal)
+                        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+                save_error_info(sb, function, line);
+        }
        if (test_opt(sb, ERRORS_PANIC))
                panic("EXT4-fs panic from previous error\n");
-        if (sb->s_flags & MS_RDONLY)
-                return;
-        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-        sb->s_flags |= MS_RDONLY;
-        EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
-        if (EXT4_SB(sb)->s_journal)
-                jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 void ext4_msg (struct super_block * sb, const char *prefix,
@@ -502,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 }
 void __ext4_warning(struct super_block *sb, const char *function,
-                  const char *fmt, ...)
+                    unsigned int line, const char *fmt, ...)
 {
        va_list args;
        va_start(args, fmt);
-        printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
+        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
-               sb->s_id, function);
+               sb->s_id, function, line);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
 }
-void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+void __ext4_grp_locked_error(const char *function, unsigned int line,
-                           const char *function, const char *fmt, ...)
+                             struct super_block *sb, ext4_group_t grp,
+                             unsigned long ino, ext4_fsblk_t block,
+                             const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
        va_list args;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+        es->s_last_error_ino = cpu_to_le32(ino);
+        es->s_last_error_block = cpu_to_le64(block);
+        __save_error_info(sb, function, line);
        va_start(args, fmt);
-        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
+               sb->s_id, function, line, grp);
+        if (ino)
+                printk("inode %lu: ", ino);
+        if (block)
+                printk("block %llu:", (unsigned long long) block);
        vprintk(fmt, args);
        printk("\n");
        va_end(args);
        if (test_opt(sb, ERRORS_CONT)) {
-                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
                ext4_commit_super(sb, 0);
                return;
        }
        ext4_unlock_group(sb, grp);
        ext4_handle_error(sb);
        /*
@@ -660,8 +716,7 @@ static void ext4_put_super(struct super_block *sb)
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
                if (err < 0)
-                        ext4_abort(sb, __func__,
+                        ext4_abort(sb, "Couldn't clean up the journal");
-                                   "Couldn't clean up the journal");
        }
        ext4_release_system_zone(sb);
@@ -946,14 +1001,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",journal_async_commit");
        else if (test_opt(sb, JOURNAL_CHECKSUM))
                seq_puts(seq, ",journal_checksum");
-        if (test_opt(sb, NOBH))
-                seq_puts(seq, ",nobh");
        if (test_opt(sb, I_VERSION))
                seq_puts(seq, ",i_version");
-        if (!test_opt(sb, DELALLOC))
+        if (!test_opt(sb, DELALLOC) &&
+            !(def_mount_opts & EXT4_DEFM_NODELALLOC))
                seq_puts(seq, ",nodelalloc");
        if (sbi->s_stripe)
                seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
        /*
@@ -977,7 +1030,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, NO_AUTO_DA_ALLOC))
                seq_puts(seq, ",noauto_da_alloc");
-        if (test_opt(sb, DISCARD))
+        if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
                seq_puts(seq, ",discard");
        if (test_opt(sb, NOLOAD))
@@ -986,6 +1039,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, DIOREAD_NOLOCK))
                seq_puts(seq, ",dioread_nolock");
+        if (test_opt(sb, BLOCK_VALIDITY) &&
+            !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
+                seq_puts(seq, ",block_validity");
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -1065,6 +1122,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                                char *path);
+static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@ -1086,7 +1144,7 @@ static const struct dquot_operations ext4_quota_operations = {
 static const struct quotactl_ops ext4_qctl_operations = {
        .quota_on       = ext4_quota_on,
-        .quota_off      = dquot_quota_off,
+        .quota_off      = ext4_quota_off,
        .quota_sync     = dquot_quota_sync,
        .get_info       = dquot_get_dqinfo,
        .set_info       = dquot_set_dqinfo,
@@ -1624,10 +1682,12 @@ set_qf_format:
                        *n_blocks_count = option;
                        break;
                case Opt_nobh:
-                        set_opt(sbi->s_mount_opt, NOBH);
+                        ext4_msg(sb, KERN_WARNING,
+                                 "Ignoring deprecated nobh option");
                        break;
                case Opt_bh:
-                        clear_opt(sbi->s_mount_opt, NOBH);
+                        ext4_msg(sb, KERN_WARNING,
+                                 "Ignoring deprecated bh option");
                        break;
                case Opt_i_version:
                        set_opt(sbi->s_mount_opt, I_VERSION);
@@ -2249,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 {
        struct super_block *sb = sbi->s_buddy_cache->i_sb;
+        if (!sb->s_bdev->bd_part)
+                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%lu\n",
                        (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
                         sbi->s_sectors_written_start) >> 1);
@@ -2259,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 {
        struct super_block *sb = sbi->s_buddy_cache->i_sb;
+        if (!sb->s_bdev->bd_part)
+                return snprintf(buf, PAGE_SIZE, "0\n");
        return snprintf(buf, PAGE_SIZE, "%llu\n",
                        (unsigned long long)(sbi->s_kbytes_written +
                        ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2431,6 +2495,53 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
        return 1;
 }
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+        struct super_block *sb = (struct super_block *) arg;
+        struct ext4_sb_info *sbi;
+        struct ext4_super_block *es;
+        sbi = EXT4_SB(sb);
+        es = sbi->s_es;
+        if (es->s_error_count)
+                ext4_msg(sb, KERN_NOTICE, "error count: %u",
+                         le32_to_cpu(es->s_error_count));
+        if (es->s_first_error_time) {
+                printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+                       sb->s_id, le32_to_cpu(es->s_first_error_time),
+                       (int) sizeof(es->s_first_error_func),
+                       es->s_first_error_func,
+                       le32_to_cpu(es->s_first_error_line));
+                if (es->s_first_error_ino)
+                        printk(": inode %u",
+                               le32_to_cpu(es->s_first_error_ino));
+                if (es->s_first_error_block)
+                        printk(": block %llu", (unsigned long long)
+                               le64_to_cpu(es->s_first_error_block));
+                printk("\n");
+        }
+        if (es->s_last_error_time) {
+                printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+                       sb->s_id, le32_to_cpu(es->s_last_error_time),
+                       (int) sizeof(es->s_last_error_func),
+                       es->s_last_error_func,
+                       le32_to_cpu(es->s_last_error_line));
+                if (es->s_last_error_ino)
+                        printk(": inode %u",
+                               le32_to_cpu(es->s_last_error_ino));
+                if (es->s_last_error_block)
+                        printk(": block %llu", (unsigned long long)
+                               le64_to_cpu(es->s_last_error_block));
+                printk("\n");
+        }
+        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
+}
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2448,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        struct inode *root;
        char *cp;
        const char *descr;
-        int ret = -EINVAL;
+        int ret = -ENOMEM;
        int blocksize;
        unsigned int db_count;
        unsigned int i;
@@ -2459,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
-                return -ENOMEM;
+                goto out_free_orig;
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
        if (!sbi->s_blockgroup_lock) {
                kfree(sbi);
-                return -ENOMEM;
+                goto out_free_orig;
        }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
@@ -2473,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_resgid = EXT4_DEF_RESGID;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
-        sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+        if (sb->s_bdev->bd_part)
-                                                      sectors[1]);
+                sbi->s_sectors_written_start =
+                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
        unlock_kernel();
@@ -2482,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        for (cp = sb->s_id; (cp = strchr(cp, '/'));)
                *cp = '!';
+        ret = -EINVAL;
        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!blocksize) {
                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2546,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                set_opt(sbi->s_mount_opt, ERRORS_CONT);
        else
                set_opt(sbi->s_mount_opt, ERRORS_RO);
+        if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
+                set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+        if (def_mount_opts & EXT4_DEFM_DISCARD)
+                set_opt(sbi->s_mount_opt, DISCARD);
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2553,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-        set_opt(sbi->s_mount_opt, BARRIER);
+        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+                set_opt(sbi->s_mount_opt, BARRIER);
        /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
-        if (!IS_EXT3_SB(sb))
+        if (!IS_EXT3_SB(sb) &&
+            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                set_opt(sbi->s_mount_opt, DELALLOC);
+        if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+                           &journal_devnum, &journal_ioprio, NULL, 0)) {
+                ext4_msg(sb, KERN_WARNING,
+                         "failed to parse options in superblock: %s",
+                         sbi->s_es->s_mount_opts);
+        }
        if (!parse_options((char *) data, sb, &journal_devnum,
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
@@ -2912,18 +3037,7 @@ no_journal:
                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount_wq;
        }
-        if (test_opt(sb, NOBH)) {
-                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-                        ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
-                                "its supported only with writeback mode");
-                        clear_opt(sbi->s_mount_opt, NOBH);
-                }
-                if (test_opt(sb, DIOREAD_NOLOCK)) {
-                        ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
-                                "not supported with nobh mode");
-                        goto failed_mount_wq;
-                }
-        }
        EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
        if (!EXT4_SB(sb)->dio_unwritten_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3010,7 +3124,7 @@ no_journal:
        ext4_ext_init(sb);
        err = ext4_mb_init(sb, needs_recovery);
        if (err) {
-                ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
                goto failed_mount4;
        }
@@ -3043,7 +3157,14 @@ no_journal:
                descr = "out journal";
        ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-                "Opts: %s", descr, orig_data);
+                 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+                 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+        init_timer(&sbi->s_err_report);
+        sbi->s_err_report.function = print_daily_error_info;
+        sbi->s_err_report.data = (unsigned long) sb;
+        if (es->s_error_count)
+                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
        lock_kernel();
        kfree(orig_data);
@@ -3093,6 +3214,7 @@ out_fail:
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
+out_free_orig:
        kfree(orig_data);
        return ret;
 }
@@ -3110,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
        journal->j_min_batch_time = sbi->s_min_batch_time;
        journal->j_max_batch_time = sbi->s_max_batch_time;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        if (test_opt(sb, BARRIER))
                journal->j_flags |= JBD2_BARRIER;
        else
@@ -3119,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
                journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
        else
                journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 static journal_t *ext4_get_journal(struct super_block *sb,
@@ -3327,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb,
        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
                err = jbd2_journal_wipe(journal, !really_read_only);
-        if (!err)
+        if (!err) {
+                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+                if (save)
+                        memcpy(save, ((char *) es) +
+                               EXT4_S_ERR_START, EXT4_S_ERR_LEN);
                err = jbd2_journal_load(journal);
+                if (save)
+                        memcpy(((char *) es) + EXT4_S_ERR_START,
+                               save, EXT4_S_ERR_LEN);
+                kfree(save);
+        }
        if (err) {
                ext4_msg(sb, KERN_ERR, "error loading journal");
@@ -3384,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync)
         */
        if (!(sb->s_flags & MS_RDONLY))
                es->s_wtime = cpu_to_le32(get_seconds());
-        es->s_kbytes_written =
+        if (sb->s_bdev->bd_part)
-                cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+                es->s_kbytes_written =
+                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
                            ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
+        else
+                es->s_kbytes_written =
+                        cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3491,7 +3626,7 @@ int ext4_force_commit(struct super_block *sb)
        journal = EXT4_SB(sb)->s_journal;
        if (journal) {
-                vfs_check_frozen(sb, SB_FREEZE_WRITE);
+                vfs_check_frozen(sb, SB_FREEZE_TRANS);
                ret = ext4_journal_force_commit(journal);
        }
@@ -3616,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        }
        if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
-                ext4_abort(sb, __func__, "Abort forced by user");
+                ext4_abort(sb, "Abort forced by user");
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3981,6 +4116,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
        return err;
 }
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+        /* Force all delayed allocation blocks to be allocated */
+        if (test_opt(sb, DELALLOC)) {
+                down_read(&sb->s_umount);
+                sync_filesystem(sb);
+                up_read(&sb->s_umount);
+        }
+        return dquot_quota_off(sb, type);
+}
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
 * itself serializes the operations (and noone else should touch the files)
@@ -4030,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err = 0;
        int offset = off & (sb->s_blocksize - 1);
-        int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
@@ -4055,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        bh = ext4_bread(handle, inode, blk, 1, &err);
        if (!bh)
                goto out;
-        if (journal_quota) {
+        err = ext4_journal_get_write_access(handle, bh);
-                err = ext4_journal_get_write_access(handle, bh);
+        if (err) {
-                if (err) {
+                brelse(bh);
-                        brelse(bh);
+                goto out;
-                        goto out;
-                }
        }
        lock_buffer(bh);
        memcpy(bh->b_data+offset, data, len);
        flush_dcache_page(bh->b_page);
        unlock_buffer(bh);
-        if (journal_quota)
+        err = ext4_handle_dirty_metadata(handle, NULL, bh);
-                err = ext4_handle_dirty_metadata(handle, NULL, bh);
-        else {
-                /* Always do at least ordered writes for quotas */
-                err = ext4_jbd2_file_inode(handle, inode);
-                mark_buffer_dirty(bh);
-        }
        brelse(bh);
 out:
        if (err) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04338009793a..a6f314249574 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
        if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
                EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
-                sb->s_dirt = 1;
+                ext4_handle_dirty_super(handle, sb);
-                ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        }
 }
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d994..cccaead962c2 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -178,7 +178,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
        fdt->open_fds = (fd_set *)data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = (fd_set *)data;
-        INIT_RCU_HEAD(&fdt->rcu);
        fdt->next = NULL;
        return fdt;
@@ -312,7 +311,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
        new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
        new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
        new_fdt->fd = &newf->fd_array[0];
-        INIT_RCU_HEAD(&new_fdt->rcu);
        new_fdt->next = NULL;
        spin_lock(&oldf->file_lock);
@@ -430,7 +428,6 @@ struct files_struct init_files = {
                .fd             = &init_files.fd_array[0],
                .close_on_exec  = (fd_set *)&init_files.close_on_exec_init,
                .open_fds       = (fd_set *)&init_files.open_fds_init,
-                .rcu            = RCU_HEAD_INIT,
        },
        .file_lock      = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e4..5132c99b1ca2 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 }
 /**
- * vxfs_read_super - read superblock into memory and initalize filesystem
+ * vxfs_read_super - read superblock into memory and initialize filesystem
 * @sbp:                VFS superblock (to fill)
 * @dp:                 fs private mount data
 * @silent:             do not complain loudly when sth is wrong
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0609607d3955..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,43 +38,18 @@ int nr_pdflush_threads;
 /*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
-struct wb_writeback_args {
+struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
        enum writeback_sync_modes sync_mode;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
-};
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
        struct list_head list;          /* pending work list */
-        struct rcu_head rcu_head;       /* for RCU free/clear of work */
+        struct completion *done;        /* set if the caller waits */
-        unsigned long seen;             /* threads that have seen this work */
-        atomic_t pending;               /* number of threads still to do work */
-        struct wb_writeback_args args;  /* writeback arguments */
-        unsigned long state;            /* flag bits, see WS_* */
 };
-enum {
-        WS_INPROGRESS = 0,
-        WS_ONSTACK,
-};
-static inline void bdi_work_init(struct bdi_work *work,
-                                 struct wb_writeback_args *args)
-{
-        INIT_RCU_HEAD(&work->rcu_head);
-        work->args = *args;
-        __set_bit(WS_INPROGRESS, &work->state);
-}
 /**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
@@ -87,49 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
        return !list_empty(&bdi->work_list);
 }
-static void bdi_work_free(struct rcu_head *head)
+static void bdi_queue_work(struct backing_dev_info *bdi,
-{
+                struct wb_writeback_work *work)
-        struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
-        clear_bit(WS_INPROGRESS, &work->state);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&work->state, WS_INPROGRESS);
-        if (!test_bit(WS_ONSTACK, &work->state))
-                kfree(work);
-}
-static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 {
-        /*
-         * The caller has retrieved the work arguments from this work,
-         * drop our reference. If this is the last ref, delete and free it
-         */
-        if (atomic_dec_and_test(&work->pending)) {
-                struct backing_dev_info *bdi = wb->bdi;
-                spin_lock(&bdi->wb_lock);
-                list_del_rcu(&work->list);
-                spin_unlock(&bdi->wb_lock);
-                call_rcu(&work->rcu_head, bdi_work_free);
-        }
-}
-static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
-{
-        work->seen = bdi->wb_mask;
-        BUG_ON(!work->seen);
-        atomic_set(&work->pending, bdi->wb_cnt);
-        BUG_ON(!bdi->wb_cnt);
-        /*
-         * list_add_tail_rcu() contains the necessary barriers to
-         * make sure the above stores are seen before the item is
-         * noticed on the list
-         */
        spin_lock(&bdi->wb_lock);
-        list_add_tail_rcu(&work->list, &bdi->work_list);
+        list_add_tail(&work->list, &bdi->work_list);
        spin_unlock(&bdi->wb_lock);
        /*
@@ -146,55 +83,29 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
        }
 }
-/*
+static void
- * Used for on-stack allocated work items. The caller needs to wait until
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- * the wb threads have acked the work before it's safe to continue.
+                bool range_cyclic, bool for_background)
- */
-static void bdi_wait_on_work_done(struct bdi_work *work)
 {
-        wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait,
+        struct wb_writeback_work *work;
-                    TASK_UNINTERRUPTIBLE);
-}
-static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-                                 struct wb_writeback_args *args)
-{
-        struct bdi_work *work;
        /*
         * This is WB_SYNC_NONE writeback, so if allocation fails just
         * wakeup the thread for old dirty data writeback
         */
-        work = kmalloc(sizeof(*work), GFP_ATOMIC);
+        work = kzalloc(sizeof(*work), GFP_ATOMIC);
-        if (work) {
+        if (!work) {
-                bdi_work_init(work, args);
+                if (bdi->wb.task)
-                bdi_queue_work(bdi, work);
+                        wake_up_process(bdi->wb.task);
-        } else {
+                return;
-                struct bdi_writeback *wb = &bdi->wb;
-                if (wb->task)
-                        wake_up_process(wb->task);
        }
-}
-/**
+        work->sync_mode = WB_SYNC_NONE;
- * bdi_queue_work_onstack - start and wait for writeback
+        work->nr_pages  = nr_pages;
- * @sb: write inodes from this super_block
+        work->range_cyclic = range_cyclic;
- *
+        work->for_background = for_background;
- * Description:
- *   This function initiates writeback and waits for the operation to
- *   complete. Callers must hold the sb s_umount semaphore for
- *   reading, to avoid having the super disappear before we are done.
- */
-static void bdi_queue_work_onstack(struct wb_writeback_args *args)
-{
-        struct bdi_work work;
-        bdi_work_init(&work, args);
-        __set_bit(WS_ONSTACK, &work.state);
-        bdi_queue_work(args->sb->s_bdi, &work);
+        bdi_queue_work(bdi, work);
-        bdi_wait_on_work_done(&work);
 }
 /**
@@ -210,13 +121,7 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args)
 */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-        struct wb_writeback_args args = {
+        __bdi_start_writeback(bdi, nr_pages, true, false);
-                .sync_mode      = WB_SYNC_NONE,
-                .nr_pages       = nr_pages,
-                .range_cyclic   = 1,
-        };
-        bdi_alloc_queue_work(bdi, &args);
 }
 /**
@@ -230,13 +135,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-        struct wb_writeback_args args = {
+        __bdi_start_writeback(bdi, LONG_MAX, true, true);
-                .sync_mode      = WB_SYNC_NONE,
-                .nr_pages       = LONG_MAX,
-                .for_background = 1,
-                .range_cyclic   = 1,
-        };
-        bdi_alloc_queue_work(bdi, &args);
 }
 /*
@@ -554,29 +453,41 @@ static bool pin_sb_for_writeback(struct super_block *sb)
 /*
 * Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
+ *
 * Return 1, if the caller writeback routine should be
 * interrupted. Otherwise return 0.
 */
-static int writeback_sb_inodes(struct super_block *sb,
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-                               struct bdi_writeback *wb,
+                struct writeback_control *wbc, bool only_this_sb)
-                               struct writeback_control *wbc)
 {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
                struct inode *inode = list_entry(wb->b_io.prev,
                                                 struct inode, i_list);
-                if (wbc->sb && sb != inode->i_sb) {
-                        /* super block given and doesn't
+                if (inode->i_sb != sb) {
-                           match, skip this inode */
+                        if (only_this_sb) {
-                        redirty_tail(inode);
+                                /*
-                        continue;
+                                 * We only want to write back data for this
-                }
+                                 * superblock, move all inodes not belonging
-                if (sb != inode->i_sb)
+                                 * to it back onto the dirty list.
-                        /* finish with this superblock */
+                                 */
+                                redirty_tail(inode);
+                                continue;
+                        }
+                        /*
+                         * The inode belongs to a different superblock.
+                         * Bounce back to the caller to unpin this and
+                         * pin the next superblock.
+                         */
                        return 0;
+                }
                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
@@ -614,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb,
        return 1;
 }
-static void writeback_inodes_wb(struct bdi_writeback *wb,
+void writeback_inodes_wb(struct bdi_writeback *wb,
-                                struct writeback_control *wbc)
+                struct writeback_control *wbc)
 {
        int ret = 0;
@@ -629,29 +540,12 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                                                 struct inode, i_list);
                struct super_block *sb = inode->i_sb;
-                if (wbc->sb) {
+                if (!pin_sb_for_writeback(sb)) {
-                        /*
+                        requeue_io(inode);
-                         * We are requested to write out inodes for a specific
+                        continue;
-                         * superblock.  This means we already have s_umount
-                         * taken by the caller which also waits for us to
-                         * complete the writeout.
-                         */
-                        if (sb != wbc->sb) {
-                                redirty_tail(inode);
-                                continue;
-                        }
-                        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-                        ret = writeback_sb_inodes(sb, wb, wbc);
-                } else {
-                        if (!pin_sb_for_writeback(sb)) {
-                                requeue_io(inode);
-                                continue;
-                        }
-                        ret = writeback_sb_inodes(sb, wb, wbc);
-                        drop_super(sb);
                }
+                ret = writeback_sb_inodes(sb, wb, wbc, false);
+                drop_super(sb);
                if (ret)
                        break;
@@ -660,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
        /* Leave any unwritten inodes on b_io */
 }
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+                struct bdi_writeback *wb, struct writeback_control *wbc)
 {
-        struct backing_dev_info *bdi = wbc->bdi;
+        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        writeback_inodes_wb(&bdi->wb, wbc);
+        wbc->wb_start = jiffies; /* livelock avoidance */
+        spin_lock(&inode_lock);
+        if (!wbc->for_kupdate || list_empty(&wb->b_io))
+                queue_io(wb, wbc->older_than_this);
+        writeback_sb_inodes(sb, wb, wbc, true);
+        spin_unlock(&inode_lock);
 }
 /*
@@ -702,16 +602,14 @@ static inline bool over_bground_thresh(void)
 * all dirty pages if they are all attached to "old" mappings.
 */
 static long wb_writeback(struct bdi_writeback *wb,
-                         struct wb_writeback_args *args)
+                         struct wb_writeback_work *work)
 {
        struct writeback_control wbc = {
-                .bdi                    = wb->bdi,
+                .sync_mode              = work->sync_mode,
-                .sb                     = args->sb,
-                .sync_mode              = args->sync_mode,
                .older_than_this        = NULL,
-                .for_kupdate            = args->for_kupdate,
+                .for_kupdate            = work->for_kupdate,
-                .for_background         = args->for_background,
+                .for_background         = work->for_background,
-                .range_cyclic           = args->range_cyclic,
+                .range_cyclic           = work->range_cyclic,
        };
        unsigned long oldest_jif;
        long wrote = 0;
@@ -731,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
-                if (args->nr_pages <= 0)
+                if (work->nr_pages <= 0)
                        break;
                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
-                if (args->for_background && !over_bground_thresh())
+                if (work->for_background && !over_bground_thresh())
                        break;
                wbc.more_io = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
-                writeback_inodes_wb(wb, &wbc);
+                if (work->sb)
-                args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+                        __writeback_inodes_sb(work->sb, wb, &wbc);
+                else
+                        writeback_inodes_wb(wb, &wbc);
+                work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                /*
@@ -781,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 /*
- * Return the next bdi_work struct that hasn't been processed by this
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
 */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+static struct wb_writeback_work *
-                                           struct bdi_writeback *wb)
+get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
 {
-        struct bdi_work *work, *ret = NULL;
+        struct wb_writeback_work *work = NULL;
-        rcu_read_lock();
-        list_for_each_entry_rcu(work, &bdi->work_list, list) {
+        spin_lock(&bdi->wb_lock);
-                if (!test_bit(wb->nr, &work->seen))
+        if (!list_empty(&bdi->work_list)) {
-                        continue;
+                work = list_entry(bdi->work_list.next,
-                clear_bit(wb->nr, &work->seen);
+                                  struct wb_writeback_work, list);
+                list_del_init(&work->list);
-                ret = work;
-                break;
        }
+        spin_unlock(&bdi->wb_lock);
-        rcu_read_unlock();
+        return work;
-        return ret;
 }
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -830,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        if (nr_pages) {
-                struct wb_writeback_args args = {
+                struct wb_writeback_work work = {
                        .nr_pages       = nr_pages,
                        .sync_mode      = WB_SYNC_NONE,
                        .for_kupdate    = 1,
                        .range_cyclic   = 1,
                };
-                return wb_writeback(wb, &args);
+                return wb_writeback(wb, &work);
        }
        return 0;
@@ -849,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
        struct backing_dev_info *bdi = wb->bdi;
-        struct bdi_work *work;
+        struct wb_writeback_work *work;
        long wrote = 0;
        while ((work = get_next_work_item(bdi, wb)) != NULL) {
-                struct wb_writeback_args args = work->args;
                /*
                 * Override sync mode, in case we must wait for completion
+                 * because this thread is exiting now.
                 */
                if (force_wait)
-                        work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+                        work->sync_mode = WB_SYNC_ALL;
-                /*
-                 * If this isn't a data integrity operation, just notify
-                 * that we have seen this work and we are now starting it.
-                 */
-                if (!test_bit(WS_ONSTACK, &work->state))
-                        wb_clear_pending(wb, work);
-                wrote += wb_writeback(wb, &args);
+                wrote += wb_writeback(wb, work);
                /*
-                 * This is a data integrity writeback, so only do the
+                 * Notify the caller of completion if this is a synchronous
-                 * notification when we have completed the work.
+                 * work item, otherwise just free it.
                 */
-                if (test_bit(WS_ONSTACK, &work->state))
+                if (work->done)
-                        wb_clear_pending(wb, work);
+                        complete(work->done);
+                else
+                        kfree(work);
        }
        /*
@@ -938,14 +823,9 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 void wakeup_flusher_threads(long nr_pages)
 {
        struct backing_dev_info *bdi;
-        struct wb_writeback_args args = {
-                .sync_mode      = WB_SYNC_NONE,
-        };
-        if (nr_pages) {
+        if (!nr_pages) {
-                args.nr_pages = nr_pages;
+                nr_pages = global_page_state(NR_FILE_DIRTY) +
-        } else {
-                args.nr_pages = global_page_state(NR_FILE_DIRTY) +
                                global_page_state(NR_UNSTABLE_NFS);
        }
@@ -953,7 +833,7 @@ void wakeup_flusher_threads(long nr_pages)
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                if (!bdi_has_dirty_io(bdi))
                        continue;
-                bdi_alloc_queue_work(bdi, &args);
+                __bdi_start_writeback(bdi, nr_pages, false, false);
        }
        rcu_read_unlock();
 }
@@ -1162,17 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb)
 {
        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-        struct wb_writeback_args args = {
+        DECLARE_COMPLETION_ONSTACK(done);
+        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
+                .done           = &done,
        };
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        args.nr_pages = nr_dirty + nr_unstable +
+        work.nr_pages = nr_dirty + nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-        bdi_queue_work_onstack(&args);
+        bdi_queue_work(sb->s_bdi, &work);
+        wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
@@ -1204,16 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 */
 void sync_inodes_sb(struct super_block *sb)
 {
-        struct wb_writeback_args args = {
+        DECLARE_COMPLETION_ONSTACK(done);
+        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
                .range_cyclic   = 0,
+                .done           = &done,
        };
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-        bdi_queue_work_onstack(&args);
+        bdi_queue_work(sb->s_bdi, &work);
+        wait_for_completion(&done);
        wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f2..3f6dfa989881 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 config FSCACHE
        tristate "General filesystem local caching manager"
-        select SLOW_WORK
        help
          This option enables a generic filesystem caching manager that can be
          used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e5..6a026441c5a6 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+static inline bool fscache_object_congested(void)
+{
+        return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f04..f9d856773f79 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
                 "FS-Cache debugging mask");
 struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+                                     void __user *buffer,
+                                     size_t *lenp, loff_t *ppos)
+{
+        struct workqueue_struct **wqp = table->extra1;
+        unsigned int *datap = table->data;
+        int ret;
+        ret = proc_dointvec(table, write, buffer, lenp, ppos);
+        if (ret == 0)
+                workqueue_set_max_active(*wqp, *datap);
+        return ret;
+}
+ctl_table fscache_sysctls[] = {
+        {
+                .procname       = "object_max_active",
+                .data           = &fscache_object_max_active,
+                .maxlen         = sizeof(unsigned),
+                .mode           = 0644,
+                .proc_handler   = fscache_max_active_sysctl,
+                .extra1         = &fscache_object_wq,
+        },
+        {
+                .procname       = "operation_max_active",
+                .data           = &fscache_op_max_active,
+                .maxlen         = sizeof(unsigned),
+                .mode           = 0644,
+                .proc_handler   = fscache_max_active_sysctl,
+                .extra1         = &fscache_op_wq,
+        },
+        {}
+};
+ctl_table fscache_sysctls_root[] = {
+        {
+                .procname       = "fscache",
+                .mode           = 0555,
+                .child          = fscache_sysctls,
+        },
+        {}
+};
+#endif
 /*
 * initialise the fs caching module
 */
 static int __init fscache_init(void)
 {
+        unsigned int nr_cpus = num_possible_cpus();
+        unsigned int cpu;
        int ret;
-        ret = slow_work_register_user(THIS_MODULE);
+        fscache_object_max_active =
-        if (ret < 0)
+                clamp_val(nr_cpus,
-                goto error_slow_work;
+                          fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+        ret = -ENOMEM;
+        fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+                                            fscache_object_max_active);
+        if (!fscache_object_wq)
+                goto error_object_wq;
+        fscache_op_max_active =
+                clamp_val(fscache_object_max_active / 2,
+                          fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+        ret = -ENOMEM;
+        fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+                                        fscache_op_max_active);
+        if (!fscache_op_wq)
+                goto error_op_wq;
+        for_each_possible_cpu(cpu)
+                init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
        ret = fscache_proc_init();
        if (ret < 0)
                goto error_proc;
+#ifdef CONFIG_SYSCTL
+        ret = -ENOMEM;
+        fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+        if (!fscache_sysctl_header)
+                goto error_sysctl;
+#endif
        fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
                                               sizeof(struct fscache_cookie),
                                               0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
 error_kobj:
        kmem_cache_destroy(fscache_cookie_jar);
 error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+        unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
        fscache_proc_cleanup();
 error_proc:
-        slow_work_unregister_user(THIS_MODULE);
+        destroy_workqueue(fscache_op_wq);
-error_slow_work:
+error_op_wq:
+        destroy_workqueue(fscache_object_wq);
+error_object_wq:
        return ret;
 }
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
        kobject_put(fscache_root);
        kmem_cache_destroy(fscache_cookie_jar);
+#ifdef CONFIG_SYSCTL
+        unregister_sysctl_table(fscache_sysctl_header);
+#endif
        fscache_proc_cleanup();
-        slow_work_unregister_user(THIS_MODULE);
+        destroy_workqueue(fscache_op_wq);
+        destroy_workqueue(fscache_object_wq);
        printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 4a8eb31c5338..ebe29c581380 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
 #define FSCACHE_OBJLIST_CONFIG_NOREADS  0x00000200      /* show objects without active reads */
 #define FSCACHE_OBJLIST_CONFIG_EVENTS   0x00000400      /* show objects with events */
 #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800      /* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK     0x00001000      /* show objects with slow work */
+#define FSCACHE_OBJLIST_CONFIG_WORK     0x00001000      /* show objects with work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK   0x00002000      /* show objects without slow work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK   0x00002000      /* show objects without work */
        u8              buf[512];       /* key and aux data buffer */
 };
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
                       READS, NOREADS);
                FILTER(obj->events & obj->event_mask,
                       EVENTS, NOEVENTS);
-                FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
+                FILTER(work_busy(&obj->work), WORK, NOWORK);
-                       WORK, NOWORK);
        }
        seq_printf(m,
-                   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+                   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
                   obj->debug_id,
                   obj->parent ? obj->parent->debug_id : -1,
                   fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
                   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
                   obj->events,
                   obj->flags,
-                   obj->work.flags);
+                   work_busy(&obj->work));
        no_cookie = true;
        keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ffc..b6b897c550ac 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
-#include <linux/seq_file.h>
 #include "internal.h"
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
        [FSCACHE_OBJECT_DEAD]           = "DEAD",
 };
-static void fscache_object_slow_work_put_ref(struct slow_work *);
+static int  fscache_get_object(struct fscache_object *);
-static int  fscache_object_slow_work_get_ref(struct slow_work *);
+static void fscache_put_object(struct fscache_object *);
-static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
-#endif
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
-const struct slow_work_ops fscache_object_slow_work_ops = {
-        .owner          = THIS_MODULE,
-        .get_ref        = fscache_object_slow_work_get_ref,
-        .put_ref        = fscache_object_slow_work_put_ref,
-        .execute        = fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-        .desc           = fscache_object_slow_work_desc,
-#endif
-};
-EXPORT_SYMBOL(fscache_object_slow_work_ops);
 /*
 * we need to notify the parent when an op completes that we had outstanding
 * upon it
@@ -345,7 +329,7 @@ unsupported_event:
 /*
 * execute an object
 */
-static void fscache_object_slow_work_execute(struct slow_work *work)
+void fscache_object_work_func(struct work_struct *work)
 {
        struct fscache_object *object =
                container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
        if (object->events & object->event_mask)
                fscache_enqueue_object(object);
        clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+        fscache_put_object(object);
 }
+EXPORT_SYMBOL(fscache_object_work_func);
-/*
- * describe an object for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *work,
-                                          struct seq_file *m)
-{
-        struct fscache_object *object =
-                container_of(work, struct fscache_object, work);
-        seq_printf(m, "FSC: OBJ%x: %s",
-                   object->debug_id,
-                   fscache_object_states_short[object->state]);
-}
-#endif
 /*
 * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
        _enter("");
        ASSERT(object->cookie != NULL);
        ASSERT(object->cookie->parent != NULL);
-        ASSERT(list_empty(&object->work.link));
        if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
                              (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
                object->parent = NULL;
        }
-        /* this just shifts the object release to the slow work processor */
+        /* this just shifts the object release to the work processor */
-        fscache_stat(&fscache_n_cop_put_object);
+        fscache_put_object(object);
-        object->cache->ops->put_object(object);
-        fscache_stat_d(&fscache_n_cop_put_object);
        _leave("");
 }
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
 }
 /*
- * allow the slow work item processor to get a ref on an object
+ * get a ref on an object
 */
-static int fscache_object_slow_work_get_ref(struct slow_work *work)
+static int fscache_get_object(struct fscache_object *object)
 {
-        struct fscache_object *object =
-                container_of(work, struct fscache_object, work);
        int ret;
        fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 }
 /*
- * allow the slow work item processor to discard a ref on a work item
+ * discard a ref on a work item
 */
-static void fscache_object_slow_work_put_ref(struct slow_work *work)
+static void fscache_put_object(struct fscache_object *object)
 {
-        struct fscache_object *object =
-                container_of(work, struct fscache_object, work);
        fscache_stat(&fscache_n_cop_put_object);
        object->cache->ops->put_object(object);
        fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
 {
        _enter("{OBJ%x}", object->debug_id);
-        slow_work_enqueue(&object->work);
+        if (fscache_get_object(object) >= 0) {
+                wait_queue_head_t *cong_wq =
+                        &get_cpu_var(fscache_object_cong_wait);
+                if (queue_work(fscache_object_wq, &object->work)) {
+                        if (fscache_object_congested())
+                                wake_up(cong_wq);
+                } else
+                        fscache_put_object(object);
+                put_cpu_var(fscache_object_cong_wait);
+        }
+}
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timeoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+        wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+        DEFINE_WAIT(wait);
+        if (fscache_object_congested())
+                return true;
+        add_wait_queue_exclusive(cong_wq, &wait);
+        if (!fscache_object_congested())
+                *timeoutp = schedule_timeout(*timeoutp);
+        finish_wait(cong_wq, &wait);
+        return fscache_object_congested();
 }
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 /*
 * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
                /* sort onto appropriate lists */
                fscache_enqueue_object(dep);
-                fscache_stat(&fscache_n_cop_put_object);
+                fscache_put_object(dep);
-                dep->cache->ops->put_object(dep);
-                fscache_stat_d(&fscache_n_cop_put_object);
                if (!list_empty(&object->dependents))
                        cond_resched_lock(&object->lock);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae44..b9f34eaede09 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
        fscache_stat(&fscache_n_op_enqueue);
        switch (op->flags & FSCACHE_OP_TYPE) {
-        case FSCACHE_OP_FAST:
+        case FSCACHE_OP_ASYNC:
-                _debug("queue fast");
+                _debug("queue async");
                atomic_inc(&op->usage);
-                if (!schedule_work(&op->fast_work))
+                if (!queue_work(fscache_op_wq, &op->work))
                        fscache_put_operation(op);
                break;
-        case FSCACHE_OP_SLOW:
-                _debug("queue slow");
-                slow_work_enqueue(&op->slow_work);
-                break;
        case FSCACHE_OP_MYTHREAD:
                _debug("queue for caller's attention");
                break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
 }
 /*
- * allow the slow work item processor to get a ref on an operation
+ * execute an operation using fs_op_wq to provide processing context -
- */
+ * the caller holds a ref to this object, so we don't need to hold one
-static int fscache_op_get_ref(struct slow_work *work)
-{
-        struct fscache_operation *op =
-                container_of(work, struct fscache_operation, slow_work);
-        atomic_inc(&op->usage);
-        return 0;
-}
-/*
- * allow the slow work item processor to discard a ref on an operation
- */
-static void fscache_op_put_ref(struct slow_work *work)
-{
-        struct fscache_operation *op =
-                container_of(work, struct fscache_operation, slow_work);
-        fscache_put_operation(op);
-}
-/*
- * execute an operation using the slow thread pool to provide processing context
- * - the caller holds a ref to this object, so we don't need to hold one
 */
-static void fscache_op_execute(struct slow_work *work)
+void fscache_op_work_func(struct work_struct *work)
 {
        struct fscache_operation *op =
-                container_of(work, struct fscache_operation, slow_work);
+                container_of(work, struct fscache_operation, work);
        unsigned long start;
        _enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
        start = jiffies;
        op->processor(op);
        fscache_hist(fscache_ops_histogram, start);
+        fscache_put_operation(op);
        _leave("");
 }
-/*
- * describe an operation for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
-{
-        struct fscache_operation *op =
-                container_of(work, struct fscache_operation, slow_work);
-        seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
-                   op->object->debug_id, op->debug_id,
-                   op->name, op->state, op->flags);
-}
-#endif
-const struct slow_work_ops fscache_op_slow_work_ops = {
-        .owner          = THIS_MODULE,
-        .get_ref        = fscache_op_get_ref,
-        .put_ref        = fscache_op_put_ref,
-        .execute        = fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-        .desc           = fscache_op_desc,
-#endif
-};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 723b889fd219..41c441c2058d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 page_busy:
        /* we might want to wait here, but that could deadlock the allocator as
-         * the slow-work threads writing to the cache may all end up sleeping
+         * the work threads writing to the cache may all end up sleeping
         * on memory allocation */
        fscache_stat(&fscache_n_store_vmscan_busy);
        return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
                return -ENOMEM;
        }
-        fscache_operation_init(op, NULL);
+        fscache_operation_init(op, fscache_attr_changed_op, NULL);
-        fscache_operation_init_slow(op, fscache_attr_changed_op);
+        op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
-        op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
        fscache_set_op_name(op, "Attr");
        spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
 EXPORT_SYMBOL(__fscache_attr_changed);
 /*
- * handle secondary execution given to a retrieval op on behalf of the
- * cache
- */
-static void fscache_retrieval_work(struct work_struct *work)
-{
-        struct fscache_retrieval *op =
-                container_of(work, struct fscache_retrieval, op.fast_work);
-        unsigned long start;
-        _enter("{OP%x}", op->op.debug_id);
-        start = jiffies;
-        op->op.processor(&op->op);
-        fscache_hist(fscache_ops_histogram, start);
-        fscache_put_operation(&op->op);
-}
-/*
 * release a retrieval op reference
 */
 static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
                return NULL;
        }
-        fscache_operation_init(&op->op, fscache_release_retrieval_op);
+        fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
        op->op.flags    = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
        op->mapping     = mapping;
        op->end_io_func = end_io_func;
        op->context     = context;
        op->start_time  = jiffies;
-        INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
        INIT_LIST_HEAD(&op->to_do);
        fscache_set_op_name(&op->op, "Retr");
        return op;
@@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        if (!op)
                goto nomem;
-        fscache_operation_init(&op->op, fscache_release_write_op);
+        fscache_operation_init(&op->op, fscache_write_op,
-        fscache_operation_init_slow(&op->op, fscache_write_op);
+                               fscache_release_write_op);
-        op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+        op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
        fscache_set_op_name(&op->op, "Write1");
        ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
        fscache_stat(&fscache_n_store_ops);
        fscache_stat(&fscache_n_stores_ok);
-        /* the slow work queue now carries its own ref on the object */
+        /* the work queue now carries its own ref on the object */
        fscache_put_operation(&op->op);
        _leave(" = 0");
        return 0;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9424796d6634..69ad053ffd78 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-        req->in.h.unique = fuse_get_unique(fc);
        req->in.h.len = sizeof(struct fuse_in_header) +
                len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
        list_add_tail(&req->list, &fc->pending);
@@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
                req = list_entry(fc->bg_queue.next, struct fuse_req, list);
                list_del(&req->list);
                fc->active_background++;
+                req->in.h.unique = fuse_get_unique(fc);
                queue_request(fc, req);
        }
 }
@@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
        else if (fc->conn_error)
                req->out.h.error = -ECONNREFUSED;
        else {
+                req->in.h.unique = fuse_get_unique(fc);
                queue_request(fc, req);
                /* acquire extra reference, since request is still needed
                   after request_end() */
@@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 }
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+                                          struct fuse_req *req, u64 unique)
+{
+        int err = -ENODEV;
+        req->isreply = 0;
+        req->in.h.unique = unique;
+        spin_lock(&fc->lock);
+        if (fc->connected) {
+                queue_request(fc, req);
+                err = 0;
+        }
+        spin_unlock(&fc->lock);
+        return err;
+}
 /*
 * Called under fc->lock
 *
@@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
                if (!cs->write) {
                        buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
                } else {
-                        kunmap_atomic(cs->mapaddr, KM_USER0);
+                        kunmap(buf->page);
                        buf->len = PAGE_SIZE - cs->len;
                }
                cs->currbuf = NULL;
                cs->mapaddr = NULL;
        } else if (cs->mapaddr) {
-                kunmap_atomic(cs->mapaddr, KM_USER0);
+                kunmap(cs->pg);
                if (cs->write) {
                        flush_dcache_page(cs->pg);
                        set_page_dirty_lock(cs->pg);
@@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
                        BUG_ON(!cs->nr_segs);
                        cs->currbuf = buf;
-                        cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+                        cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
                        cs->len = buf->len;
                        cs->buf = cs->mapaddr + buf->offset;
                        cs->pipebufs++;
@@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
                        buf->len = 0;
                        cs->currbuf = buf;
-                        cs->mapaddr = kmap_atomic(page, KM_USER0);
+                        cs->mapaddr = kmap(page);
                        cs->buf = cs->mapaddr;
                        cs->len = PAGE_SIZE;
                        cs->pipebufs++;
@@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
                        return err;
                BUG_ON(err != 1);
                offset = cs->addr % PAGE_SIZE;
-                cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+                cs->mapaddr = kmap(cs->pg);
                cs->buf = cs->mapaddr + offset;
                cs->len = min(PAGE_SIZE - offset, cs->seglen);
                cs->seglen -= cs->len;
@@ -1231,6 +1249,199 @@ err:
        return err;
 }
+/*
+ * Handle a FUSE_NOTIFY_STORE message: copy outarg.size bytes of payload from
+ * @cs into the page cache of the inode named by outarg.nodeid, starting at
+ * file offset outarg.offset.
+ *
+ * @size is the total message length and must equal the header plus the
+ * payload length declared in the header.  If the stored range ends beyond the
+ * current file size, the file size is extended first.
+ *
+ * Returns 0 on success, -EINVAL for a malformed message, -ENOENT if there is
+ * no superblock or no such inode, -ENOMEM if a page could not be obtained, or
+ * the error from the copy.  fuse_copy_finish() is called on every path.
+ */
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+                             struct fuse_copy_state *cs)
+{
+        struct fuse_notify_store_out outarg;
+        struct inode *inode;
+        struct address_space *mapping;
+        u64 nodeid;
+        int err;
+        pgoff_t index;
+        unsigned int offset;
+        unsigned int num;
+        loff_t file_size;
+        loff_t end;
+        err = -EINVAL;
+        if (size < sizeof(outarg))
+                goto out_finish;
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto out_finish;
+        err = -EINVAL;
+        if (size - sizeof(outarg) != outarg.size)
+                goto out_finish;
+        nodeid = outarg.nodeid;
+        down_read(&fc->killsb);
+        err = -ENOENT;
+        if (!fc->sb)
+                goto out_up_killsb;
+        inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+        if (!inode)
+                goto out_up_killsb;
+        mapping = inode->i_mapping;
+        index = outarg.offset >> PAGE_CACHE_SHIFT;
+        offset = outarg.offset & ~PAGE_CACHE_MASK;
+        file_size = i_size_read(inode);
+        end = outarg.offset + outarg.size;
+        if (end > file_size) {
+                file_size = end;
+                fuse_write_update_size(inode, file_size);
+        }
+        num = outarg.size;
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+                err = -ENOMEM;
+                page = find_or_create_page(mapping, index,
+                                           mapping_gfp_mask(mapping));
+                if (!page)
+                        goto out_iput;
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                err = fuse_copy_page(cs, &page, offset, this_num, 0);
+                /*
+                 * Only mark the page up to date if it was filled from its
+                 * start and either completely, or up to the end of the file.
+                 * (num is always non-zero here, so it cannot be the test.)
+                 */
+                if (!err && offset == 0 &&
+                    (this_num == PAGE_CACHE_SIZE || file_size == end))
+                        SetPageUptodate(page);
+                unlock_page(page);
+                page_cache_release(page);
+                if (err)
+                        goto out_iput;
+                num -= this_num;
+                offset = 0;
+                index++;
+        }
+        err = 0;
+out_iput:
+        iput(inode);
+out_up_killsb:
+        up_read(&fc->killsb);
+out_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+        int i;
+        for (i = 0; i < req->num_pages; i++) {
+                struct page *page = req->pages[i];
+                page_cache_release(page);
+        }
+}
+/*
+ * Answer a retrieve notification by queueing a FUSE_NOTIFY_REPLY request
+ * that carries the page-cache pages of @inode covering the requested range
+ * (outarg->offset, outarg->size).
+ *
+ * The range is clamped to the current file size, and collection stops at the
+ * first page that is not present in the page cache, so the reply may be
+ * shorter than requested (possibly empty).  The page references taken here
+ * are dropped by fuse_retrieve_end(), which is installed as req->end and is
+ * also called directly when queueing fails.
+ *
+ * Returns 0 on success, the PTR_ERR() of a failed request allocation, or the
+ * error from fuse_request_send_notify_reply().
+ */
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+                         struct fuse_notify_retrieve_out *outarg)
+{
+        int err;
+        struct address_space *mapping = inode->i_mapping;
+        struct fuse_req *req;
+        pgoff_t index;
+        loff_t file_size;
+        unsigned int num;
+        unsigned int offset;
+        size_t total_len = 0;   /* accumulated with += in the loop below */
+        req = fuse_get_req(fc);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+        offset = outarg->offset & ~PAGE_CACHE_MASK;
+        req->in.h.opcode = FUSE_NOTIFY_REPLY;
+        req->in.h.nodeid = outarg->nodeid;
+        req->in.numargs = 2;
+        req->in.argpages = 1;
+        req->page_offset = offset;
+        req->end = fuse_retrieve_end;
+        index = outarg->offset >> PAGE_CACHE_SHIFT;
+        file_size = i_size_read(inode);
+        num = outarg->size;
+        if (outarg->offset > file_size)
+                num = 0;
+        else if (outarg->offset + num > file_size)
+                num = file_size - outarg->offset;
+        /*
+         * NOTE(review): nothing here bounds req->num_pages against the
+         * capacity of req->pages[] -- confirm whether the per-request page
+         * limit has to be enforced in this loop.
+         */
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+                page = find_get_page(mapping, index);
+                if (!page)
+                        break;
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                req->pages[req->num_pages] = page;
+                req->num_pages++;
+                /* only the first page can start mid-page; step to the next */
+                offset = 0;
+                num -= this_num;
+                total_len += this_num;
+                index++;
+        }
+        req->misc.retrieve_in.offset = outarg->offset;
+        req->misc.retrieve_in.size = total_len;
+        req->in.args[0].size = sizeof(req->misc.retrieve_in);
+        req->in.args[0].value = &req->misc.retrieve_in;
+        req->in.args[1].size = total_len;
+        err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+        if (err)
+                fuse_retrieve_end(fc, req);
+        return err;
+}
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+                                struct fuse_copy_state *cs)
+{
+        struct fuse_notify_retrieve_out outarg;
+        struct inode *inode;
+        int err;
+        err = -EINVAL;
+        if (size != sizeof(outarg))
+                goto copy_finish;
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto copy_finish;
+        fuse_copy_finish(cs);
+        down_read(&fc->killsb);
+        err = -ENOENT;
+        if (fc->sb) {
+                u64 nodeid = outarg.nodeid;
+                inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+                if (inode) {
+                        err = fuse_retrieve(fc, inode, &outarg);
+                        iput(inode);
+                }
+        }
+        up_read(&fc->killsb);
+        return err;
+copy_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1244,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
        case FUSE_NOTIFY_INVAL_ENTRY:
                return fuse_notify_inval_entry(fc, size, cs);
+        case FUSE_NOTIFY_STORE:
+                return fuse_notify_store(fc, size, cs);
+        case FUSE_NOTIFY_RETRIEVE:
+                return fuse_notify_retrieve(fc, size, cs);
        default:
                fuse_copy_finish(cs);
                return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3cdc5f78a406..431be0795b6b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
                   exist.  So if permissions are revoked this won't be
                   noticed immediately, only after the attribute
                   timeout has expired */
-        } else if (mask & MAY_ACCESS) {
+        } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
                err = fuse_access(inode, mask);
        } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                if (!(inode->i_mode & S_IXUGO)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ada0adeb3bb5..147c1f71bdb9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
        return 0;
 }
-static void fuse_write_update_size(struct inode *inode, loff_t pos)
+void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8f309f04064e..57d4a3a0f102 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,7 @@ struct fuse_req {
                        struct fuse_write_in in;
                        struct fuse_write_out out;
                } write;
+                struct fuse_notify_retrieve_in retrieve_in;
                struct fuse_lk_in lk_in;
        } misc;
@@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
+void fuse_write_update_size(struct inode *inode, loff_t pos);
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b43107112..cc9665522148 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,7 +7,6 @@ config GFS2_FS
        select IP_SCTP if DLM_SCTP
        select FS_POSIX_ACL
        select CRC32
-        select SLOW_WORK
        select QUOTACTL
        help
          A cluster filesystem.
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9f8b52500d63..5e96cbd8a454 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
        if (ret <= 0)
                return ret;
-        ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
+        return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
-        if (ret == -EAGAIN)
-                ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-        return ret;
 }
 /**
@@ -637,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                }
        }
-        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
+        alloc_required = gfs2_write_alloc_required(ip, pos, len);
-        if (error)
-                goto out_unlock;
        if (alloc_required || gfs2_is_jdata(ip))
                gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b402..6f482809d1a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1040,7 +1040,8 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
                goto out;
        if (gfs2_is_stuffed(ip)) {
-                u64 dsize = size + sizeof(struct gfs2_inode);
+                u64 dsize = size + sizeof(struct gfs2_dinode);
+                ip->i_disksize = size;
                ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
@@ -1243,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 * @ip: the file being written to
 * @offset: the offset to write to
 * @len: the number of bytes being written
- * @alloc_required: set to 1 if an alloc is required, 0 otherwise
 *
- * Returns: errno
+ * Returns: 1 if an alloc is required, 0 otherwise
 */
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-                              unsigned int len, int *alloc_required)
+                              unsigned int len)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct buffer_head bh;
@@ -1257,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
        u64 lblock, lblock_stop, size;
        u64 end_of_file;
-        *alloc_required = 0;
        if (!len)
                return 0;
        if (gfs2_is_stuffed(ip)) {
                if (offset + len >
                    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
-                        *alloc_required = 1;
+                        return 1;
                return 0;
        }
-        *alloc_required = 1;
        shift = sdp->sd_sb.sb_bsize_shift;
        BUG_ON(gfs2_is_dir(ip));
        end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
        lblock = offset >> shift;
        lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
        if (lblock_stop > end_of_file)
-                return 0;
+                return 1;
        size = (lblock_stop - lblock) << shift;
        do {
@@ -1284,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                bh.b_size = size;
                gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
                if (!buffer_mapped(&bh))
-                        return 0;
+                        return 1;
                size -= bh.b_size;
                lblock += (bh.b_size >> ip->i_inode.i_blkbits);
        } while(size > 0);
-        *alloc_required = 0;
        return 0;
 }
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05ac..a20a5213135a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
 int gfs2_truncatei_resume(struct gfs2_inode *ip);
 int gfs2_file_dealloc(struct gfs2_inode *ip);
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-                              unsigned int len, int *alloc_required);
+                              unsigned int len);
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a9..b9dd88a78dd4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
        unsigned totlen = be16_to_cpu(dent->de_rec_len);
        if (gfs2_dirent_sentinel(dent))
-                actual = GFS2_DIRENT_SIZE(0);
+                actual = 0;
        if (totlen - actual >= required)
                return 1;
        return 0;
@@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
        /* Change the pointers.
           Don't bother distinguishing stuffed from non-stuffed.
           This code is complicated enough already. */
-        lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL);
+        lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
+        if (!lp) {
+                error = -ENOMEM;
+                goto fail_brelse;
+        }
        /*  Change the pointers  */
        for (x = 0; x < half_len; x++)
                lp[x] = cpu_to_be64(bn);
@@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        /*  Allocate both the "from" and "to" buffers in one big chunk  */
-        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
+        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
+        if (!buf)
+                return -ENOMEM;
        for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
                error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
        return 0;
 }
+static void *gfs2_alloc_sort_buffer(unsigned size)
+{
+        void *ptr = NULL;
+        if (size < KMALLOC_MAX_SIZE)
+                ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
+        if (!ptr)
+                ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
+        return ptr;
+}
+static void gfs2_free_sort_buffer(void *ptr)
+{
+        if (is_vmalloc_addr(ptr))
+                vfree(ptr);
+        else
+                kfree(ptr);
+}
 static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                              filldir_t filldir, int *copied, unsigned *depth,
                              u64 leaf_no)
@@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
         * 99 is the maximum number of entries that can fit in a single
         * leaf block.
         */
-        larr = vmalloc((leaves + entries + 99) * sizeof(void *));
+        larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
        if (!larr)
                goto out;
        darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
        do {
                error = get_leaf(ip, lfn, &bh);
                if (error)
-                        goto out_kfree;
+                        goto out_free;
                lf = (struct gfs2_leaf *)bh->b_data;
                lfn = be64_to_cpu(lf->lf_next);
                if (lf->lf_entries) {
@@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                                                gfs2_dirent_gather, NULL, &g);
                        error = PTR_ERR(dent);
                        if (IS_ERR(dent))
-                                goto out_kfree;
+                                goto out_free;
                        if (entries2 != g.offset) {
                                fs_warn(sdp, "Number of entries corrupt in dir "
                                                "leaf %llu, entries2 (%u) != "
@@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
                                        entries2, g.offset);
                                        
                                error = -EIO;
-                                goto out_kfree;
+                                goto out_free;
                        }
                        error = 0;
                        larr[leaf++] = bh;
@@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
        BUG_ON(entries2 != entries);
        error = do_filldir_main(ip, offset, opaque, filldir, darr,
                                entries, copied);
-out_kfree:
+out_free:
        for(i = 0; i < leaf; i++)
                brelse(larr[i]);
-        vfree(larr);
+        gfs2_free_sort_buffer(larr);
 out:
        return error;
 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ed9a94f0ef15..4edd662c8232 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        unsigned long last_index;
        u64 pos = page->index << PAGE_CACHE_SHIFT;
        unsigned int data_blocks, ind_blocks, rblocks;
-        int alloc_required = 0;
        struct gfs2_holder gh;
        struct gfs2_alloc *al;
        int ret;
@@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
        set_bit(GIF_SW_PAGED, &ip->i_flags);
-        ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
+        if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
-        if (ret || !alloc_required)
                goto out_unlock;
        ret = -ENOMEM;
        al = gfs2_alloc_get(ip);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf493536..9adf8f924e08 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
 }
 /**
+ * do_error - Something unexpected has happened during a lock request
+ * @gl: The glock whose gl_holders queue is scanned
+ * @ret: Result flags of the lock request; only LM_OUT_ERROR is examined
+ *
+ * Entries with HIF_HOLDER set are skipped. If LM_OUT_ERROR is set in @ret,
+ * every remaining entry is failed with -EIO; otherwise only entries flagged
+ * LM_FLAG_TRY or LM_FLAG_TRY_1CB are failed, with GLR_TRYFAILED. Each failed
+ * entry is unlinked from the queue, traced and passed to gfs2_holder_wake().
+ */
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+        struct gfs2_holder *gh, *tmp;
+        /* _safe variant: entries are unlinked while the list is walked */
+        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+                        continue;
+                if (ret & LM_OUT_ERROR)
+                        gh->gh_error = -EIO;
+                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+                        gh->gh_error = GLR_TRYFAILED;
+                else
+                        continue; /* neither error nor "try": leave it queued */
+                list_del_init(&gh->gh_list);
+                trace_gfs2_glock_queue(gh, 0);
+                gfs2_holder_wake(gh);
+        }
+}
+/**
 * do_promote - promote as many requests as possible on the current queue
 * @gl: The glock
 * 
@@ -375,36 +399,13 @@ restart:
                }
                if (gh->gh_list.prev == &gl->gl_holders)
                        return 1;
+                do_error(gl, 0);
                break;
        }
        return 0;
 }
 /**
- * do_error - Something unexpected has happened during a lock request
- *
- */
-static inline void do_error(struct gfs2_glock *gl, const int ret)
-{
-        struct gfs2_holder *gh, *tmp;
-        list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
-                if (test_bit(HIF_HOLDER, &gh->gh_iflags))
-                        continue;
-                if (ret & LM_OUT_ERROR)
-                        gh->gh_error = -EIO;
-                else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
-                        gh->gh_error = GLR_TRYFAILED;
-                else
-                        continue;
-                list_del_init(&gh->gh_list);
-                trace_gfs2_glock_queue(gh, 0);
-                gfs2_holder_wake(gh);
-        }
-}
-/**
 * find_first_waiter - find the first gh that's waiting for the glock
 * @gl: the glock
 */
@@ -1062,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
        spin_lock(&gl->gl_spin);
        add_to_queue(gh);
+        if ((LM_FLAG_NOEXP & gh->gh_flags) &&
+            test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+                set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
        run_queue(gl, 1);
        spin_unlock(&gl->gl_spin);
@@ -1319,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 }
 /**
+ * gfs2_should_freeze - Figure out if glock should be frozen
+ * @gl: The glock in question
+ *
+ * Glocks are not frozen if (a) the result of the dlm operation is
+ * an error, (b) the locking operation was an unlock operation or
+ * (c) if there is a "noexp" flagged request anywhere in the queue
+ *
+ * Returns: 1 if freezing should occur, 0 otherwise
+ */
+static int gfs2_should_freeze(const struct gfs2_glock *gl)
+{
+        const struct gfs2_holder *gh;
+        /* Cases (a) and (b): reply bits outside the state mask, or an unlock */
+        if ((gl->gl_reply & ~LM_OUT_ST_MASK) ||
+            gl->gl_target == LM_ST_UNLOCKED)
+                return 0;
+        /* Case (c): any queued entry without HIF_HOLDER that carries NOEXP */
+        list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+                if (!test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+                    (gh->gh_flags & LM_FLAG_NOEXP))
+                        return 0;
+        }
+        return 1;
+}
+/**
 * gfs2_glock_complete - Callback used by locking
 * @gl: Pointer to the glock
 * @ret: The return value from the dlm
@@ -1328,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
        gl->gl_reply = ret;
        if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-                struct gfs2_holder *gh;
                spin_lock(&gl->gl_spin);
-                gh = find_first_waiter(gl);
+                if (gfs2_should_freeze(gl)) {
-                if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
-                     (gl->gl_target != LM_ST_UNLOCKED)) ||
-                    ((ret & ~LM_OUT_ST_MASK) != 0))
                        set_bit(GLF_FROZEN, &gl->gl_flags);
-                spin_unlock(&gl->gl_spin);
+                        spin_unlock(&gl->gl_spin);
-                if (test_bit(GLF_FROZEN, &gl->gl_flags))
                        return;
+                }
+                spin_unlock(&gl->gl_spin);
        }
        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
        gfs2_glock_hold(gl);
@@ -1348,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        struct gfs2_glock *gl;
        int may_demote;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d7363b22da..fdbf4b366fa5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
 #include <linux/fs.h>
 #include <linux/workqueue.h>
-#include <linux/slow-work.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
        struct list_head jd_list;
        struct list_head extent_list;
-        struct slow_work jd_work;
+        struct work_struct jd_work;
        struct inode *jd_inode;
        unsigned long jd_flags;
 #define JDF_RECOVERY 1
@@ -460,6 +459,7 @@ enum {
        SDF_NOBARRIERS          = 3,
        SDF_NORECOVERY          = 4,
        SDF_DEMOTE              = 5,
+        SDF_NOJOURNALID         = 6,
 };
 #define GFS2_FSNAME_LEN         256
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a5..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 {
        struct inode *inode;
        struct gfs2_inode *ip;
-        struct gfs2_glock *io_gl;
+        struct gfs2_glock *io_gl = NULL;
        int error;
        inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
                ip->i_iopen_gh.gh_gl->gl_object = ip;
                gfs2_glock_put(io_gl);
+                io_gl = NULL;
                if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
                        goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
 fail_glock:
        gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-        gfs2_glock_put(io_gl);
+        if (io_gl)
+                gfs2_glock_put(io_gl);
 fail_put:
        if (inode->i_state & I_NEW)
                ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
        struct gfs2_sbd *sdp;
        struct gfs2_inode *ip;
-        struct gfs2_glock *io_gl;
+        struct gfs2_glock *io_gl = NULL;
        int error;
        struct gfs2_holder gh;
        struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
        ip->i_iopen_gh.gh_gl->gl_object = ip;
        gfs2_glock_put(io_gl);
+        io_gl = NULL;
        inode->i_mode = DT2IF(DT_UNKNOWN);
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 fail_glock:
        gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-        gfs2_glock_put(io_gl);
+        if (io_gl)
+                gfs2_glock_put(io_gl);
 fail_put:
        ip->i_gl->gl_object = NULL;
        gfs2_glock_put(ip->i_gl);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index fb2a5f93b7c3..b1e9630eb46a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
 #include <asm/atomic.h>
-#include <linux/slow-work.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -24,6 +23,7 @@
 #include "util.h"
 #include "glock.h"
 #include "quota.h"
+#include "recovery.h"
 static struct shrinker qd_shrinker = {
        .shrink = gfs2_shrink_qd_memory,
@@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void)
        if (error)
                goto fail_unregister;
-        error = slow_work_register_user(THIS_MODULE);
+        error = -ENOMEM;
-        if (error)
+        gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-                goto fail_slow;
+                                          WQ_NON_REENTRANT | WQ_RESCUER, 0);
+        if (!gfs_recovery_wq)
+                goto fail_wq;
        gfs2_register_debugfs();
@@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void)
        return 0;
-fail_slow:
+fail_wq:
        unregister_filesystem(&gfs2meta_fs_type);
 fail_unregister:
        unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void)
        gfs2_unregister_debugfs();
        unregister_filesystem(&gfs2_fs_type);
        unregister_filesystem(&gfs2meta_fs_type);
-        slow_work_unregister_user(THIS_MODULE);
+        destroy_workqueue(gfs_recovery_wq);
        kmem_cache_destroy(gfs2_quotad_cachep);
        kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3593b3a7290e..4f44bdeb2f03 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/slow-work.h>
 #include <linux/quotaops.h>
 #include "gfs2.h"
@@ -76,7 +75,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        sb->s_fs_info = sdp;
        sdp->sd_vfs = sb;
+        set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
        gfs2_tune_init(&sdp->sd_tune);
        init_waitqueue_head(&sdp->sd_glock_wait);
@@ -673,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
                        break;
                INIT_LIST_HEAD(&jd->extent_list);
-                slow_work_init(&jd->jd_work, &gfs2_recover_ops);
+                INIT_WORK(&jd->jd_work, gfs2_recover_func);
                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
                        if (!jd->jd_inode)
@@ -782,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
        if (sdp->sd_lockstruct.ls_first) {
                unsigned int x;
                for (x = 0; x < sdp->sd_journals; x++) {
-                        error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+                        error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
+                                                     true);
                        if (error) {
                                fs_err(sdp, "error recovering journal %u: %d\n",
                                       x, error);
@@ -792,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
                gfs2_others_may_mount(sdp);
        } else if (!sdp->sd_args.ar_spectator) {
-                error = gfs2_recover_journal(sdp->sd_jdesc);
+                error = gfs2_recover_journal(sdp->sd_jdesc, true);
                if (error) {
                        fs_err(sdp, "error recovering my journal: %d\n", error);
                        goto fail_jinode_gh;
@@ -1050,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
                        ret = match_int(&tmp[0], &option);
                        if (ret || option < 0) 
                                goto hostdata_error;
-                        ls->ls_jid = option;
+                        if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+                                ls->ls_jid = option;
                        break;
                case Opt_id:
                        /* Obsolete, but left for backward compat purposes */
@@ -1102,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
                lm->lm_unmount(sdp);
 }
+/*
+ * Action routine handed to wait_on_bit() by wait_on_journal(). Returns
+ * -EINTR when the current task has a signal pending; otherwise calls
+ * schedule() and returns 0. The @word argument is not used.
+ */
+static int gfs2_journalid_wait(void *word)
+{
+        if (!signal_pending(current)) {
+                schedule();
+                return 0;
+        }
+        return -EINTR;
+}
+/*
+ * Wait for SDF_NOJOURNALID to be cleared in sdp->sd_flags. Returns 0 at
+ * once for spectator mounts and for lock protocols that provide no lm_mount
+ * operation; otherwise returns the result of an interruptible wait_on_bit()
+ * driven by gfs2_journalid_wait().
+ */
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+        if (sdp->sd_args.ar_spectator ||
+            sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+                return 0;
+        return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID,
+                           gfs2_journalid_wait, TASK_INTERRUPTIBLE);
+}
 void gfs2_online_uevent(struct gfs2_sbd *sdp)
 {
        struct super_block *sb = sdp->sd_vfs;
@@ -1194,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
        if (error)
                goto fail_locking;
+        error = wait_on_journal(sdp);
+        if (error)
+                goto fail_sb;
        error = init_inodes(sdp, DO);
        if (error)
                goto fail_sb;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769e..1bc6b5695e6d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        struct gfs2_quota_data *qd;
        struct gfs2_sbd *sdp;
@@ -694,10 +694,8 @@ get_a_page:
                if (!buffer_mapped(bh))
                        goto unlock_out;
                /* If it's a newly allocated disk block for quota, zero it */
-                if (buffer_new(bh)) {
+                if (buffer_new(bh))
-                        memset(bh->b_data, 0, bh->b_size);
+                        zero_user(page, pos - blocksize, bh->b_size);
-                        set_buffer_uptodate(bh);
-                }
        }
        if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
        /* If quota straddles page boundary, we need to update the rest of the
         * quota at the beginning of the next page */
-        if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+        if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
                ptr = ptr + nbytes;
                nbytes = sizeof(struct gfs2_quota) - nbytes;
                offset = 0;
@@ -789,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                goto out;
        for (x = 0; x < num_qd; x++) {
-                int alloc_required;
                offset = qd2offset(qda[x]);
-                error = gfs2_write_alloc_required(ip, offset,
+                if (gfs2_write_alloc_required(ip, offset,
-                                                  sizeof(struct gfs2_quota),
+                                              sizeof(struct gfs2_quota)))
-                                                  &alloc_required);
-                if (error)
-                        goto out_gunlock;
-                if (alloc_required)
                        nalloc++;
        }
@@ -1457,10 +1449,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
        switch (sdp->sd_args.ar_quota) {
        case GFS2_QUOTA_ON:
-                fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
+                fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
                /*FALLTHRU*/
        case GFS2_QUOTA_ACCOUNT:
-                fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+                fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
                break;
        case GFS2_QUOTA_OFF:
                break;
@@ -1506,7 +1498,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
        qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
        fdq->d_version = FS_DQUOT_VERSION;
-        fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+        fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
        fdq->d_id = id;
        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1541,12 +1533,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        switch(type) {
        case USRQUOTA:
                type = QUOTA_USER;
-                if (fdq->d_flags != XFS_USER_QUOTA)
+                if (fdq->d_flags != FS_USER_QUOTA)
                        return -EINVAL;
                break;
        case GRPQUOTA:
                type = QUOTA_GROUP;
-                if (fdq->d_flags != XFS_GROUP_QUOTA)
+                if (fdq->d_flags != FS_GROUP_QUOTA)
                        return -EINVAL;
                break;
        default:
@@ -1586,10 +1578,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                goto out_i;
        offset = qd2offset(qd);
-        error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
+        alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
-                                          &alloc_required);
-        if (error)
-                goto out_i;
        if (alloc_required) {
                al = gfs2_alloc_get(ip);
                if (al == NULL)
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
        return ret;
 }
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d437..f7f89a94a5a4 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/slow-work.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -28,6 +27,8 @@
 #include "util.h"
 #include "dir.h"
+struct workqueue_struct *gfs_recovery_wq;
 int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
                           struct buffer_head **bh)
 {
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
-static int gfs2_recover_get_ref(struct slow_work *work)
+void gfs2_recover_func(struct work_struct *work)
-{
-        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-        if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
-                return -EBUSY;
-        return 0;
-}
-static void gfs2_recover_put_ref(struct slow_work *work)
-{
-        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-        clear_bit(JDF_RECOVERY, &jd->jd_flags);
-        smp_mb__after_clear_bit();
-        wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
-}
-static void gfs2_recover_work(struct slow_work *work)
 {
        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work)
                gfs2_glock_dq_uninit(&j_gh);
        fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-        return;
+        goto done;
 fail_gunlock_tr:
        gfs2_glock_dq_uninit(&t_gh);
@@ -590,32 +575,35 @@ fail_gunlock_j:
        }
        fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
 fail:
        gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+done:
+        clear_bit(JDF_RECOVERY, &jd->jd_flags);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
 }
-struct slow_work_ops gfs2_recover_ops = {
-        .owner   = THIS_MODULE,
-        .get_ref = gfs2_recover_get_ref,
-        .put_ref = gfs2_recover_put_ref,
-        .execute = gfs2_recover_work,
-};
 static int gfs2_recovery_wait(void *word)
 {
        schedule();
        return 0;
 }
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 {
        int rv;
-        rv = slow_work_enqueue(&jd->jd_work);
-        if (rv)
+        if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
-                return rv;
+                return -EBUSY;
-        wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
+        /* we have JDF_RECOVERY, queue should always succeed */
+        rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+        BUG_ON(!rv);
+        if (wait)
+                wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
+                            TASK_UNINTERRUPTIBLE);
        return 0;
 }
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569a..2226136c7647 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
 #include "incore.h"
+extern struct workqueue_struct *gfs_recovery_wq;
 static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 {
        if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
                    struct gfs2_log_header_host *head);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
-extern struct slow_work_ops gfs2_recover_ops;
+extern void gfs2_recover_func(struct work_struct *work);
 #endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 4d1aad38f1b1..4140811a921c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-        int ar;
-        int error;
        if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
            (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
@@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
        }
        jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
-        error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
+        if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
-        if (!error && ar) {
                gfs2_consist_inode(ip);
-                error = -EIO;
+                return -EIO;
        }
-        return error;
+        return 0;
 }
 /**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 37f5393e68e6..ccacffd2faaa 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
 #include "quota.h"
 #include "util.h"
 #include "glops.h"
+#include "recovery.h"
 struct gfs2_attr {
        struct attribute attr;
@@ -325,6 +326,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
        return sprintf(buf, "%d\n", ls->ls_first);
 }
+/*
+ * sysfs store handler for the "first" attribute: parses 0 or 1 from @buf and
+ * records it in sd_lockstruct.ls_first. Under sd_jindex_spin it fails with
+ * -EBUSY once SDF_NOJOURNALID is no longer set, and with -EINVAL for
+ * spectator mounts or lock protocols without an lm_mount operation.
+ * Unparsable input or a value above 1 yields -EINVAL. Returns @len on success.
+ */
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        unsigned first;
+        int rv;
+        if (sscanf(buf, "%u", &first) != 1 || first > 1)
+                return -EINVAL;
+        spin_lock(&sdp->sd_jindex_spin);
+        if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
+                rv = -EBUSY;
+        } else if (sdp->sd_args.ar_spectator ||
+                   sdp->sd_lockstruct.ls_ops->lm_mount == NULL) {
+                rv = -EINVAL;
+        } else {
+                sdp->sd_lockstruct.ls_first = first;
+                rv = 0;
+        }
+        spin_unlock(&sdp->sd_jindex_spin);
+        return rv ? rv : len;
+}
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -352,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
                if (jd->jd_jid != jid)
                        continue;
-                rv = slow_work_enqueue(&jd->jd_work);
+                rv = gfs2_recover_journal(jd, false);
                break;
        }
 out:
@@ -377,14 +402,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
        return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
 }
+/*
+ * sysfs store handler for the "jid" attribute: parses an unsigned journal id
+ * from @buf and, while holding sd_jindex_spin, records it in
+ * sd_lockstruct.ls_jid and wakes anyone waiting on SDF_NOJOURNALID.
+ *
+ * Returns @len on success, -EINVAL for unparsable input, spectator mounts or
+ * lock protocols without an lm_mount operation, and -EBUSY when
+ * SDF_NOJOURNALID was already clear.
+ */
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+        unsigned jid;
+        int rv;
+        rv = sscanf(buf, "%u", &jid);
+        if (rv != 1)
+                return -EINVAL;
+        spin_lock(&sdp->sd_jindex_spin);
+        rv = -EINVAL;
+        if (sdp->sd_args.ar_spectator)
+                goto out;
+        if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+                goto out;
+        rv = -EBUSY;
+        /* Atomically claim the "no journal id yet" state; only one writer wins */
+        if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+                goto out;
+        /*
+         * NOTE(review): ls_jid is stored after the bit has been cleared, so
+         * code that tests the bit in between could see it clear while ls_jid
+         * still holds its previous value -- confirm whether any reader can
+         * run in that window.
+         */
+        sdp->sd_lockstruct.ls_jid = jid;
+        smp_mb__after_clear_bit();
+        wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+        rv = 0;
+out:
+        spin_unlock(&sdp->sd_jindex_spin);
+        return rv ? rv : len;
+}
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 GDLM_ATTR(proto_name,           0444, proto_name_show,          NULL);
 GDLM_ATTR(block,                0644, block_show,               block_store);
 GDLM_ATTR(withdraw,             0644, withdraw_show,            withdraw_store);
-GDLM_ATTR(jid,                  0444, jid_show,                 NULL);
+GDLM_ATTR(jid,                  0644, jid_show,                 jid_store);
-GDLM_ATTR(first,                0444, lkfirst_show,             NULL);
+GDLM_ATTR(first,                0644, lkfirst_show,             lkfirst_store);
 GDLM_ATTR(first_done,           0444, first_done_show,          NULL);
 GDLM_ATTR(recover,              0600, NULL,                     recover_store);
 GDLM_ATTR(recover_done,         0444, recover_done_show,        NULL);
@@ -564,7 +616,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
        add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
        add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
-        if (!sdp->sd_args.ar_spectator)
+        if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
                add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
        if (gfs2_uuid_valid(uuid))
                add_uevent_var(env, "UUID=%pUB", uuid);
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d65..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
 * This function is passed the number of inodes to scan, and it returns the
 * total number of remaining possibly-reclaimable inodes.
 */
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                /*
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 93d1e47647bd..f19ce94693d8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1281,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
 int journal_check_available_features (journal_t *journal, unsigned long compat,
                                      unsigned long ro, unsigned long incompat)
 {
-        journal_superblock_t *sb;
        if (!compat && !ro && !incompat)
                return 1;
-        sb = journal->j_superblock;
        /* We can support any known requested features iff the
         * superblock is in version 2.  Otherwise we fail to support any
         * extended sb features. */
@@ -1481,7 +1477,6 @@ int journal_flush(journal_t *journal)
 int journal_wipe(journal_t *journal, int write)
 {
-        journal_superblock_t *sb;
        int err = 0;
        J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1490,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write)
        if (err)
                return err;
-        sb = journal->j_superblock;
        if (!journal->j_tail)
                goto no_recovery;
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b17..81051dafebf5 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
 int journal_skip_recovery(journal_t *journal)
 {
        int                     err;
-        journal_superblock_t *  sb;
        struct recovery_info    info;
        memset (&info, 0, sizeof(info));
-        sb = journal->j_superblock;
        err = do_one_pass(journal, &info, PASS_SCAN);
@@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal)
                ++journal->j_transaction_sequence;
        } else {
 #ifdef CONFIG_JBD_DEBUG
-                int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+                int dropped = info.end_transaction -
+                              be32_to_cpu(journal->j_superblock->s_sequence);
 #endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
        unsigned int            sequence;
        int                     blocktype;
-        /* Precompute the maximum metadata descriptors in a descriptor block */
-        int                     MAX_BLOCKS_PER_DESC;
-        MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-                               / sizeof(journal_block_tag_t));
        /*
         * First thing is to establish what we expect to find in the log
         * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 076d1cc44f95..1c23a0f4e8a3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
        int nblocks, space_left;
-        assert_spin_locked(&journal->j_state_lock);
+        /* assert_spin_locked(&journal->j_state_lock); */
        nblocks = jbd_space_needed(journal);
        while (__jbd2_log_space_left(journal) < nblocks) {
                if (journal->j_flags & JBD2_ABORT)
                        return;
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                mutex_lock(&journal->j_checkpoint_mutex);
                /*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
                 * filesystem, so abort the journal and leave a stack
                 * trace for forensic evidence.
                 */
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
                spin_lock(&journal->j_list_lock);
                nblocks = jbd_space_needed(journal);
                space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
                        if (journal->j_committing_transaction)
                                tid = journal->j_committing_transaction->t_tid;
                        spin_unlock(&journal->j_list_lock);
-                        spin_unlock(&journal->j_state_lock);
+                        write_unlock(&journal->j_state_lock);
                        if (chkpt) {
                                jbd2_log_do_checkpoint(journal);
                        } else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
                                WARN_ON(1);
                                jbd2_journal_abort(journal, 0);
                        }
-                        spin_lock(&journal->j_state_lock);
+                        write_lock(&journal->j_state_lock);
                } else {
                        spin_unlock(&journal->j_list_lock);
                }
@@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
         * next transaction ID we will write, and where it will
         * start. */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        transaction = journal->j_checkpoint_transactions;
        if (transaction) {
@@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
        /* If the oldest pinned transaction is at the tail of the log
           already then there's not much we can do right now. */
        if (journal->j_tail_sequence == first_tid) {
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                return 1;
        }
@@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
        journal->j_free += freed;
        journal->j_tail_sequence = first_tid;
        journal->j_tail = blocknr;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        /*
         * If there is an external journal, we need to make sure that
@@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
        J_ASSERT(transaction->t_log_list == NULL);
        J_ASSERT(transaction->t_checkpoint_list == NULL);
        J_ASSERT(transaction->t_checkpoint_io_list == NULL);
-        J_ASSERT(transaction->t_updates == 0);
+        J_ASSERT(atomic_read(&transaction->t_updates) == 0);
        J_ASSERT(journal->j_committing_transaction != transaction);
        J_ASSERT(journal->j_running_transaction != transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 75716d3d2be0..f52e5e8049f1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -150,11 +150,11 @@ static int journal_submit_commit_record(journal_t *journal,
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                printk(KERN_WARNING
-                       "JBD: barrier-based sync failed on %s - "
+                       "JBD2: Disabling barriers on %s, "
-                       "disabling barriers\n", journal->j_devname);
+                       "not supported by device\n", journal->j_devname);
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                /* And try again, without the barrier */
                lock_buffer(bh);
@@ -180,11 +180,11 @@ retry:
        wait_on_buffer(bh);
        if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
                printk(KERN_WARNING
-                       "JBD2: wait_on_commit_record: sync failed on %s - "
+                       "JBD2: %s: disabling barriers on %s - not supported "
-                       "disabling barriers\n", journal->j_devname);
+                       "by device\n", __func__, journal->j_devname);
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                lock_buffer(bh);
                clear_buffer_dirty(bh);
@@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
        /*
@@ -417,23 +417,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                                              stats.run.rs_locked);
        spin_lock(&commit_transaction->t_handle_lock);
-        while (commit_transaction->t_updates) {
+        while (atomic_read(&commit_transaction->t_updates)) {
                DEFINE_WAIT(wait);
                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
-                if (commit_transaction->t_updates) {
+                if (atomic_read(&commit_transaction->t_updates)) {
                        spin_unlock(&commit_transaction->t_handle_lock);
-                        spin_unlock(&journal->j_state_lock);
+                        write_unlock(&journal->j_state_lock);
                        schedule();
-                        spin_lock(&journal->j_state_lock);
+                        write_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);
-        J_ASSERT (commit_transaction->t_outstanding_credits <=
+        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
                        journal->j_max_transaction_buffers);
        /*
@@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        jbd_debug (3, "JBD: commit phase 2\n");
@@ -519,19 +519,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        trace_jbd2_commit_logging(journal, commit_transaction);
        stats.run.rs_logging = jiffies;
        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                                               stats.run.rs_logging);
-        stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+        stats.run.rs_blocks =
+                atomic_read(&commit_transaction->t_outstanding_credits);
        stats.run.rs_blocks_logged = 0;
        J_ASSERT(commit_transaction->t_nr_buffers <=
-                 commit_transaction->t_outstanding_credits);
+                 atomic_read(&commit_transaction->t_outstanding_credits));
        err = 0;
        descriptor = NULL;
@@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
-                commit_transaction->t_outstanding_credits--;
+                atomic_dec(&commit_transaction->t_outstanding_credits);
                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
@@ -977,7 +978,7 @@ restart_loop:
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
@@ -985,7 +986,7 @@ restart_loop:
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                goto restart_loop;
        }
@@ -1003,7 +1004,8 @@ restart_loop:
         * File the transaction statistics
         */
        stats.ts_tid = commit_transaction->t_tid;
-        stats.run.rs_handle_count = commit_transaction->t_handle_count;
+        stats.run.rs_handle_count =
+                atomic_read(&commit_transaction->t_handle_count);
        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                             commit_transaction->t_tid, &stats.run);
@@ -1037,7 +1039,7 @@ restart_loop:
                                journal->j_average_commit_time*3) / 4;
        else
                journal->j_average_commit_time = commit_time;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff5932769..ad5866aaf0f9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
 #include <linux/hash.h>
 #include <linux/log2.h>
 #include <linux/vmalloc.h>
+#include <linux/backing-dev.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -48,8 +49,6 @@
 #include <asm/uaccess.h>
 #include <asm/page.h>
-EXPORT_SYMBOL(jbd2_journal_start);
-EXPORT_SYMBOL(jbd2_journal_restart);
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
 EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +142,7 @@ static int kjournald2(void *arg)
        /*
         * And now, wait forever for commit wakeup events.
         */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
 loop:
        if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +153,10 @@ loop:
        if (journal->j_commit_sequence != journal->j_commit_request) {
                jbd_debug(1, "OK, requests differ\n");
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                del_timer_sync(&journal->j_commit_timer);
                jbd2_journal_commit_transaction(journal);
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
                goto loop;
        }
@@ -169,9 +168,9 @@ loop:
                 * be already stopped.
                 */
                jbd_debug(1, "Now suspending kjournald2\n");
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                refrigerator();
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
        } else {
                /*
                 * We assume on resume that commits are already there,
@@ -191,9 +190,9 @@ loop:
                if (journal->j_flags & JBD2_UNMOUNT)
                        should_sleep = 0;
                if (should_sleep) {
-                        spin_unlock(&journal->j_state_lock);
+                        write_unlock(&journal->j_state_lock);
                        schedule();
-                        spin_lock(&journal->j_state_lock);
+                        write_lock(&journal->j_state_lock);
                }
                finish_wait(&journal->j_wait_commit, &wait);
        }
@@ -211,7 +210,7 @@ loop:
        goto loop;
 end_loop:
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        del_timer_sync(&journal->j_commit_timer);
        journal->j_task = NULL;
        wake_up(&journal->j_wait_done_commit);
@@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
 static void journal_kill_thread(journal_t *journal)
 {
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_UNMOUNT;
        while (journal->j_task) {
                wake_up(&journal->j_wait_commit);
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
        }
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 /*
@@ -297,7 +296,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
-        struct jbd2_buffer_trigger_type *triggers;
        journal_t *journal = transaction->t_journal;
        /*
@@ -311,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
         */
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+retry_alloc:
+        new_bh = alloc_buffer_head(GFP_NOFS);
+        if (!new_bh) {
+                /*
+                 * Failure is not an option, but __GFP_NOFAIL is going
+                 * away; so we retry ourselves here.
+                 */
+                congestion_wait(BLK_RW_ASYNC, HZ/50);
+                goto retry_alloc;
+        }
        /* keep subsequent assertions sane */
        new_bh->b_state = 0;
        init_buffer(new_bh, NULL, NULL);
@@ -328,21 +336,21 @@ repeat:
                done_copy_out = 1;
                new_page = virt_to_page(jh_in->b_frozen_data);
                new_offset = offset_in_page(jh_in->b_frozen_data);
-                triggers = jh_in->b_frozen_triggers;
        } else {
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-                triggers = jh_in->b_triggers;
        }
        mapped_data = kmap_atomic(new_page, KM_USER0);
        /*
-         * Fire any commit trigger.  Do this before checking for escaping,
+         * Fire data frozen trigger if data already wasn't frozen.  Do this
-         * as the trigger may modify the magic offset.  If a copy-out
+         * before checking for escaping, as the trigger may modify the magic
-         * happens afterwards, it will have the correct data in the buffer.
+         * offset.  If a copy-out happens afterwards, it will have the correct
+         * data in the buffer.
         */
-        jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+        if (!done_copy_out)
-                                   triggers);
+                jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+                                           jh_in->b_triggers);
        /*
         * Check for escaping
@@ -443,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal)
 {
        int left = journal->j_free;
-        assert_spin_locked(&journal->j_state_lock);
+        /* assert_spin_locked(&journal->j_state_lock); */
        /*
         * Be pessimistic here about the number of those free blocks which
@@ -488,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 {
        int ret;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        ret = __jbd2_log_start_commit(journal, tid);
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        return ret;
 }
@@ -509,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
        transaction_t *transaction = NULL;
        tid_t tid;
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction && !current->journal_info) {
                transaction = journal->j_running_transaction;
                __jbd2_log_start_commit(journal, transaction->t_tid);
@@ -517,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
                transaction = journal->j_committing_transaction;
        if (!transaction) {
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                return 0;       /* Nothing to retry */
        }
        tid = transaction->t_tid;
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        jbd2_log_wait_commit(journal, tid);
        return 1;
 }
@@ -536,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
        int ret = 0;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        if (journal->j_running_transaction) {
                tid_t tid = journal->j_running_transaction->t_tid;
@@ -555,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
                        *ptid = journal->j_committing_transaction->t_tid;
                ret = 1;
        }
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        return ret;
 }
@@ -567,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 {
        int err = 0;
+        read_lock(&journal->j_state_lock);
 #ifdef CONFIG_JBD2_DEBUG
-        spin_lock(&journal->j_state_lock);
        if (!tid_geq(journal->j_commit_request, tid)) {
                printk(KERN_EMERG
                       "%s: error: j_commit_request=%d, tid=%d\n",
                       __func__, journal->j_commit_request, tid);
        }
-        spin_unlock(&journal->j_state_lock);
 #endif
-        spin_lock(&journal->j_state_lock);
        while (tid_gt(tid, journal->j_commit_sequence)) {
                jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
                                  tid, journal->j_commit_sequence);
                wake_up(&journal->j_wait_commit);
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_done_commit,
                                !tid_gt(tid, journal->j_commit_sequence));
-                spin_lock(&journal->j_state_lock);
+                read_lock(&journal->j_state_lock);
        }
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        if (unlikely(is_journal_aborted(journal))) {
                printk(KERN_EMERG "journal commit I/O error\n");
@@ -603,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 {
        unsigned long blocknr;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        J_ASSERT(journal->j_free > 1);
        blocknr = journal->j_head;
@@ -611,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
        journal->j_free--;
        if (journal->j_head == journal->j_last)
                journal->j_head = journal->j_first;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        return jbd2_journal_bmap(journal, blocknr, retp);
 }
@@ -831,7 +837,7 @@ static journal_t * journal_init_common (void)
        mutex_init(&journal->j_checkpoint_mutex);
        spin_lock_init(&journal->j_revoke_lock);
        spin_lock_init(&journal->j_list_lock);
-        spin_lock_init(&journal->j_state_lock);
+        rwlock_init(&journal->j_state_lock);
        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
        journal->j_min_batch_time = 0;
@@ -1097,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
                set_buffer_uptodate(bh);
        }
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
                  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
        sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
        sb->s_start    = cpu_to_be32(journal->j_tail);
        sb->s_errno    = cpu_to_be32(journal->j_errno);
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        BUFFER_TRACE(bh, "marking dirty");
        mark_buffer_dirty(bh);
@@ -1125,12 +1131,12 @@ out:
         * any future commit will have to be careful to update the
         * superblock again to re-record the true start of the log. */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        if (sb->s_start)
                journal->j_flags &= ~JBD2_FLUSHED;
        else
                journal->j_flags |= JBD2_FLUSHED;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 /*
@@ -1392,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
                                      unsigned long ro, unsigned long incompat)
 {
-        journal_superblock_t *sb;
        if (!compat && !ro && !incompat)
                return 1;
-        sb = journal->j_superblock;
        /* We can support any known requested features iff the
         * superblock is in version 2.  Otherwise we fail to support any
         * extended sb features. */
@@ -1546,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal)
        transaction_t *transaction = NULL;
        unsigned long old_tail;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        /* Force everything buffered to the log... */
        if (journal->j_running_transaction) {
@@ -1559,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal)
        if (transaction) {
                tid_t tid = transaction->t_tid;
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                jbd2_log_wait_commit(journal, tid);
        } else {
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
        }
        /* ...and flush everything in the log out to disk. */
@@ -1586,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal)
         * the magic code for a fully-recovered superblock.  Any future
         * commits of data to the journal will restore the current
         * s_start value. */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        old_tail = journal->j_tail;
        journal->j_tail = 0;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        jbd2_journal_update_superblock(journal, 1);
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        journal->j_tail = old_tail;
        J_ASSERT(!journal->j_running_transaction);
@@ -1599,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal)
        J_ASSERT(!journal->j_checkpoint_transactions);
        J_ASSERT(journal->j_head == journal->j_tail);
        J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        return 0;
 }
@@ -1618,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal)
 int jbd2_journal_wipe(journal_t *journal, int write)
 {
-        journal_superblock_t *sb;
        int err = 0;
        J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1627,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
        if (err)
                return err;
-        sb = journal->j_superblock;
        if (!journal->j_tail)
                goto no_recovery;
@@ -1666,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
        printk(KERN_ERR "Aborting journal on device %s.\n",
               journal->j_devname);
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_ABORT;
        transaction = journal->j_running_transaction;
        if (transaction)
                __jbd2_log_start_commit(journal, transaction->t_tid);
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 /* Soft abort: record the abort error status in the journal superblock,
@@ -1756,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal)
 {
        int err;
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        if (journal->j_flags & JBD2_ABORT)
                err = -EROFS;
        else
                err = journal->j_errno;
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        return err;
 }
@@ -1776,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal)
 {
        int err = 0;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        if (journal->j_flags & JBD2_ABORT)
                err = -EROFS;
        else
                journal->j_errno = 0;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        return err;
 }
@@ -1794,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal)
 */
 void jbd2_journal_ack_err(journal_t *journal)
 {
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        if (journal->j_errno)
                journal->j_flags |= JBD2_ACK_ERR;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -2202,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
 void jbd2_journal_release_jbd_inode(journal_t *journal,
                                    struct jbd2_inode *jinode)
 {
-        int writeout = 0;
        if (!journal)
                return;
 restart:
@@ -2220,9 +2217,6 @@ restart:
                goto restart;
        }
-        /* Do we need to wait for data writeback? */
-        if (journal->j_committing_transaction == jinode->i_transaction)
-                writeout = 1;
        if (jinode->i_transaction) {
                list_del(&jinode->i_list);
                jinode->i_transaction = NULL;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb89..2bc4d5f116f1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
 int jbd2_journal_skip_recovery(journal_t *journal)
 {
        int                     err;
-        journal_superblock_t *  sb;
        struct recovery_info    info;
        memset (&info, 0, sizeof(info));
-        sb = journal->j_superblock;
        err = do_one_pass(journal, &info, PASS_SCAN);
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
                ++journal->j_transaction_sequence;
        } else {
 #ifdef CONFIG_JBD2_DEBUG
-                int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+                int dropped = info.end_transaction - 
+                        be32_to_cpu(journal->j_superblock->s_sequence);
 #endif
                jbd_debug(1,
                          "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
        int                     tag_bytes = journal_tag_bytes(journal);
        __u32                   crc32_sum = ~0; /* Transactional Checksums */
-        /* Precompute the maximum metadata descriptors in a descriptor block */
-        int                     MAX_BLOCKS_PER_DESC;
-        MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-                               / tag_bytes);
        /*
         * First thing is to establish what we expect to find in the log
         * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620ac..d95cc9d0401d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
+        atomic_set(&transaction->t_updates, 0);
+        atomic_set(&transaction->t_outstanding_credits, 0);
+        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);
@@ -83,65 +88,75 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 * transaction's buffer credits.
 */
-static int start_this_handle(journal_t *journal, handle_t *handle)
+static int start_this_handle(journal_t *journal, handle_t *handle,
+                             int gfp_mask)
 {
        transaction_t *transaction;
        int needed;
        int nblocks = handle->h_buffer_credits;
        transaction_t *new_transaction = NULL;
-        int ret = 0;
        unsigned long ts = jiffies;
        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
                       current->comm, nblocks,
                       journal->j_max_transaction_buffers);
-                ret = -ENOSPC;
+                return -ENOSPC;
-                goto out;
        }
 alloc_transaction:
        if (!journal->j_running_transaction) {
-                new_transaction = kzalloc(sizeof(*new_transaction),
+                new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
-                                                GFP_NOFS|__GFP_NOFAIL);
                if (!new_transaction) {
-                        ret = -ENOMEM;
+                        /*
-                        goto out;
+                         * If __GFP_FS is not present, then we may be
+                         * being called from inside the fs writeback
+                         * layer, so we MUST NOT fail.  Since
+                         * __GFP_NOFAIL is going away, we will arrange
+                         * to retry the allocation ourselves.
+                         */
+                        if ((gfp_mask & __GFP_FS) == 0) {
+                                congestion_wait(BLK_RW_ASYNC, HZ/50);
+                                goto alloc_transaction;
+                        }
+                        return -ENOMEM;
                }
        }
        jbd_debug(3, "New handle %p going live.\n", handle);
-repeat:
        /*
         * We need to hold j_state_lock until t_updates has been incremented,
         * for proper journal barrier handling
         */
-        spin_lock(&journal->j_state_lock);
+repeat:
-repeat_locked:
+        read_lock(&journal->j_state_lock);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
-                ret = -EROFS;
+                kfree(new_transaction);
-                goto out;
+                return -EROFS;
        }
        /* Wait on the journal's transaction barrier if necessary */
        if (journal->j_barrier_count) {
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_transaction_locked,
                                journal->j_barrier_count == 0);
                goto repeat;
        }
        if (!journal->j_running_transaction) {
-                if (!new_transaction) {
+                read_unlock(&journal->j_state_lock);
-                        spin_unlock(&journal->j_state_lock);
+                if (!new_transaction)
                        goto alloc_transaction;
+                write_lock(&journal->j_state_lock);
+                if (!journal->j_running_transaction) {
+                        jbd2_get_transaction(journal, new_transaction);
+                        new_transaction = NULL;
                }
-                jbd2_get_transaction(journal, new_transaction);
+                write_unlock(&journal->j_state_lock);
-                new_transaction = NULL;
+                goto repeat;
        }
        transaction = journal->j_running_transaction;
@@ -155,7 +170,7 @@ repeat_locked:
                prepare_to_wait(&journal->j_wait_transaction_locked,
                                        &wait, TASK_UNINTERRUPTIBLE);
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_transaction_locked, &wait);
                goto repeat;
@@ -166,8 +181,8 @@ repeat_locked:
         * buffers requested by this operation, we need to stall pending a log
         * checkpoint to free some more log space.
         */
-        spin_lock(&transaction->t_handle_lock);
+        needed = atomic_add_return(nblocks,
-        needed = transaction->t_outstanding_credits + nblocks;
+                                   &transaction->t_outstanding_credits);
        if (needed > journal->j_max_transaction_buffers) {
                /*
@@ -178,11 +193,11 @@ repeat_locked:
                DEFINE_WAIT(wait);
                jbd_debug(2, "Handle %p starting new commit...\n", handle);
-                spin_unlock(&transaction->t_handle_lock);
+                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
                                TASK_UNINTERRUPTIBLE);
                __jbd2_log_start_commit(journal, transaction->t_tid);
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_transaction_locked, &wait);
                goto repeat;
@@ -215,35 +230,48 @@ repeat_locked:
         */
        if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
                jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-                spin_unlock(&transaction->t_handle_lock);
+                atomic_sub(nblocks, &transaction->t_outstanding_credits);
-                __jbd2_log_wait_for_space(journal);
+                read_unlock(&journal->j_state_lock);
-                goto repeat_locked;
+                write_lock(&journal->j_state_lock);
+                if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
+                        __jbd2_log_wait_for_space(journal);
+                write_unlock(&journal->j_state_lock);
+                goto repeat;
        }
        /* OK, account for the buffers that this operation expects to
-         * use and add the handle to the running transaction. */
+         * use and add the handle to the running transaction. 
+         *
-        if (time_after(transaction->t_start, ts)) {
+         * In order for t_max_wait to be reliable, it must be
+         * protected by a lock.  But doing so will mean that
+         * start_this_handle() can not be run in parallel on SMP
+         * systems, which limits our scalability.  So we only enable
+         * it when debugging is enabled.  We may want to use a
+         * separate flag, eventually, so we can enable this
+         * independently of debugging.
+         */
+#ifdef CONFIG_JBD2_DEBUG
+        if (jbd2_journal_enable_debug &&
+            time_after(transaction->t_start, ts)) {
                ts = jbd2_time_diff(ts, transaction->t_start);
+                spin_lock(&transaction->t_handle_lock);
                if (ts > transaction->t_max_wait)
                        transaction->t_max_wait = ts;
+                spin_unlock(&transaction->t_handle_lock);
        }
+#endif
        handle->h_transaction = transaction;
-        transaction->t_outstanding_credits += nblocks;
+        atomic_inc(&transaction->t_updates);
-        transaction->t_updates++;
+        atomic_inc(&transaction->t_handle_count);
-        transaction->t_handle_count++;
        jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-                  handle, nblocks, transaction->t_outstanding_credits,
+                  handle, nblocks,
+                  atomic_read(&transaction->t_outstanding_credits),
                  __jbd2_log_space_left(journal));
-        spin_unlock(&transaction->t_handle_lock);
+        read_unlock(&journal->j_state_lock);
-        spin_unlock(&journal->j_state_lock);
        lock_map_acquire(&handle->h_lockdep_map);
-out:
+        kfree(new_transaction);
-        if (unlikely(new_transaction))          /* It's usually NULL */
+        return 0;
-                kfree(new_transaction);
-        return ret;
 }
 static struct lock_class_key jbd2_handle_key;
@@ -278,7 +306,7 @@ static handle_t *new_handle(int nblocks)
 *
 * Return a pointer to a newly allocated handle, or NULL on failure
 */
-handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
        handle_t *handle = journal_current_handle();
        int err;
@@ -298,7 +326,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
        current->journal_info = handle;
-        err = start_this_handle(journal, handle);
+        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0) {
                jbd2_free_handle(handle);
                current->journal_info = NULL;
@@ -308,6 +336,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 out:
        return handle;
 }
+EXPORT_SYMBOL(jbd2__journal_start);
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+        return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
 /**
 * int jbd2_journal_extend() - extend buffer credits.
@@ -342,7 +379,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
        result = 1;
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        /* Don't extend a locked-down transaction! */
        if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +389,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
        }
        spin_lock(&transaction->t_handle_lock);
-        wanted = transaction->t_outstanding_credits + nblocks;
+        wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
        if (wanted > journal->j_max_transaction_buffers) {
                jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +404,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
        }
        handle->h_buffer_credits += nblocks;
-        transaction->t_outstanding_credits += nblocks;
+        atomic_add(nblocks, &transaction->t_outstanding_credits);
        result = 0;
        jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
 unlock:
        spin_unlock(&transaction->t_handle_lock);
 error_out:
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
 out:
        return result;
 }
@@ -394,8 +431,7 @@ out:
 * transaction capabable of guaranteeing the requested number of
 * credits.
 */
+int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
-int jbd2_journal_restart(handle_t *handle, int nblocks)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
@@ -410,29 +446,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
-        J_ASSERT(transaction->t_updates > 0);
+        J_ASSERT(atomic_read(&transaction->t_updates) > 0);
        J_ASSERT(journal_current_handle() == handle);
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
-        transaction->t_outstanding_credits -= handle->h_buffer_credits;
+        atomic_sub(handle->h_buffer_credits,
-        transaction->t_updates--;
+                   &transaction->t_outstanding_credits);
+        if (atomic_dec_and_test(&transaction->t_updates))
-        if (!transaction->t_updates)
                wake_up(&journal->j_wait_updates);
        spin_unlock(&transaction->t_handle_lock);
        jbd_debug(2, "restarting handle %p\n", handle);
        __jbd2_log_start_commit(journal, transaction->t_tid);
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        lock_map_release(&handle->h_lockdep_map);
        handle->h_buffer_credits = nblocks;
-        ret = start_this_handle(journal, handle);
+        ret = start_this_handle(journal, handle, gfp_mask);
        return ret;
 }
+EXPORT_SYMBOL(jbd2__journal_restart);
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+        return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
 /**
 * void jbd2_journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
@@ -447,7 +489,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
 {
        DEFINE_WAIT(wait);
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        ++journal->j_barrier_count;
        /* Wait until there are no running updates */
@@ -458,19 +500,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
                        break;
                spin_lock(&transaction->t_handle_lock);
-                if (!transaction->t_updates) {
+                if (!atomic_read(&transaction->t_updates)) {
                        spin_unlock(&transaction->t_handle_lock);
                        break;
                }
                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                spin_unlock(&transaction->t_handle_lock);
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_updates, &wait);
-                spin_lock(&journal->j_state_lock);
+                write_lock(&journal->j_state_lock);
        }
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        /*
         * We have now established a barrier against other normal updates, but
@@ -494,9 +536,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
        J_ASSERT(journal->j_barrier_count != 0);
        mutex_unlock(&journal->j_barrier);
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        --journal->j_barrier_count;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_transaction_locked);
 }
@@ -725,6 +767,9 @@ done:
                page = jh2bh(jh)->b_page;
                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
                source = kmap_atomic(page, KM_USER0);
+                /* Fire data frozen trigger just before we copy the data */
+                jbd2_buffer_frozen_trigger(jh, source + offset,
+                                           jh->b_triggers);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
@@ -963,15 +1008,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
        jh->b_triggers = type;
 }
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
 {
        struct buffer_head *bh = jh2bh(jh);
-        if (!triggers || !triggers->t_commit)
+        if (!triggers || !triggers->t_frozen)
                return;
-        triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+        triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
 }
 void jbd2_buffer_abort_trigger(struct journal_head *jh,
@@ -1235,7 +1280,8 @@ int jbd2_journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int err;
+        int err, wait_for_commit = 0;
+        tid_t tid;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1243,7 +1289,7 @@ int jbd2_journal_stop(handle_t *handle)
        if (is_handle_aborted(handle))
                err = -EIO;
        else {
-                J_ASSERT(transaction->t_updates > 0);
+                J_ASSERT(atomic_read(&transaction->t_updates) > 0);
                err = 0;
        }
@@ -1288,9 +1334,9 @@ int jbd2_journal_stop(handle_t *handle)
                journal->j_last_sync_writer = pid;
-                spin_lock(&journal->j_state_lock);
+                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
-                spin_unlock(&journal->j_state_lock);
+                read_unlock(&journal->j_state_lock);
                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));
@@ -1311,14 +1357,8 @@ int jbd2_journal_stop(handle_t *handle)
        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
-        spin_lock(&transaction->t_handle_lock);
+        atomic_sub(handle->h_buffer_credits,
-        transaction->t_outstanding_credits -= handle->h_buffer_credits;
+                   &transaction->t_outstanding_credits);
-        transaction->t_updates--;
-        if (!transaction->t_updates) {
-                wake_up(&journal->j_wait_updates);
-                if (journal->j_barrier_count)
-                        wake_up(&journal->j_wait_transaction_locked);
-        }
        /*
         * If the handle is marked SYNC, we need to set another commit
@@ -1327,15 +1367,13 @@ int jbd2_journal_stop(handle_t *handle)
         * transaction is too old now.
         */
        if (handle->h_sync ||
-                        transaction->t_outstanding_credits >
+            (atomic_read(&transaction->t_outstanding_credits) >
-                                journal->j_max_transaction_buffers ||
+             journal->j_max_transaction_buffers) ||
-                        time_after_eq(jiffies, transaction->t_expires)) {
+            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                 * anything to disk. */
-                tid_t tid = transaction->t_tid;
-                spin_unlock(&transaction->t_handle_lock);
                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
@@ -1346,11 +1384,25 @@ int jbd2_journal_stop(handle_t *handle)
                 * to wait for the commit to complete.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-                        err = jbd2_log_wait_commit(journal, tid);
+                        wait_for_commit = 1;
-        } else {
-                spin_unlock(&transaction->t_handle_lock);
        }
+        /*
+         * Once we drop t_updates, if it goes to zero the transaction
+         * could start committing on us and eventually disappear.  So
+         * once we do this, we must not dereference the transaction
+         * pointer again.
+         */
+        tid = transaction->t_tid;
+        if (atomic_dec_and_test(&transaction->t_updates)) {
+                wake_up(&journal->j_wait_updates);
+                if (journal->j_barrier_count)
+                        wake_up(&journal->j_wait_transaction_locked);
+        }
+        if (wait_for_commit)
+                err = jbd2_log_wait_commit(journal, tid);
        lock_map_release(&handle->h_lockdep_map);
        jbd2_free_handle(handle);
@@ -1716,7 +1768,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                goto zap_buffer_unlocked;
        /* OK, we have data buffer in journaled mode */
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
@@ -1769,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                        jbd2_journal_put_journal_head(jh);
                        spin_unlock(&journal->j_list_lock);
                        jbd_unlock_bh_state(bh);
-                        spin_unlock(&journal->j_state_lock);
+                        write_unlock(&journal->j_state_lock);
                        return ret;
                } else {
                        /* There is no currently-running transaction. So the
@@ -1783,7 +1835,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                                jbd2_journal_put_journal_head(jh);
                                spin_unlock(&journal->j_list_lock);
                                jbd_unlock_bh_state(bh);
-                                spin_unlock(&journal->j_state_lock);
+                                write_unlock(&journal->j_state_lock);
                                return ret;
                        } else {
                                /* The orphan record's transaction has
@@ -1807,7 +1859,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
                jbd2_journal_put_journal_head(jh);
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
-                spin_unlock(&journal->j_state_lock);
+                write_unlock(&journal->j_state_lock);
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
@@ -1826,7 +1878,7 @@ zap_buffer:
 zap_buffer_no_jh:
        spin_unlock(&journal->j_list_lock);
        jbd_unlock_bh_state(bh);
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 zap_buffer_unlocked:
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2133,9 +2185,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
        /* Locks are here just to force reading of recent values, it is
         * enough that the transaction was not committing before we started
         * a transaction adding the inode to orphan list */
-        spin_lock(&journal->j_state_lock);
+        read_lock(&journal->j_state_lock);
        commit_trans = journal->j_committing_transaction;
-        spin_unlock(&journal->j_state_lock);
+        read_unlock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        inode_trans = jinode->i_transaction;
        spin_unlock(&journal->j_list_lock);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b4..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-        /* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+        /* success of check_xattr_ref_inode() means that inode (ic) does not have
         * duplicate name/value pairs. If duplicate name/value pair would be found,
         * one will be removed.
         */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
 * What the mbcache registers as to get shrunk dynamically.
 */
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 static struct shrinker mb_cache_shrinker = {
        .shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
 * This function is called by the kernel memory management when memory
 * gets low.
 *
+ * @shrink: (ignored)
 * @nr_to_scan: Number of objects to scan
 * @gfp_mask: (ignored)
 *
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(free_list);
        struct list_head *l, *ltmp;
diff --git a/fs/namei.c b/fs/namei.c
index 868d0cb9d473..42d2d28fb827 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
        if (retval)
                return retval;
-        return security_inode_permission(inode,
+        return security_inode_permission(inode, mask);
-                        mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
 }
 /**
@@ -1484,8 +1483,7 @@ static int handle_truncate(struct path *path)
         */
        error = locks_verify_locked(inode);
        if (!error)
-                error = security_path_truncate(path, 0,
+                error = security_path_truncate(path);
-                                       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
        if (!error) {
                error = do_truncate(path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa3385154023..1e634deff941 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -728,8 +728,8 @@ out_fput:
 out_bdi:
        /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
         * 
-         * The previously used put_filp(ncp_filp); was bogous, since
+         * The previously used put_filp(ncp_filp); was bogus, since
-         * it doesn't proper unlocking.
+         * it doesn't perform proper unlocking.
         */
        fput(ncp_filp);
 out:
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b924..cc1bb33b59b8 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,8 +61,8 @@ config NFS_V3_ACL
          If unsure, say N.
 config NFS_V4
-        bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
+        bool "NFS client support for NFS version 4"
-        depends on NFS_FS && EXPERIMENTAL
+        depends on NFS_FS
        select RPCSEC_GSS_KRB5
        help
          This option enables support for version 4 of the NFS protocol
@@ -72,16 +72,16 @@ config NFS_V4
          space programs which can be found in the Linux nfs-utils package,
          available from http://linux-nfs.org/.
-          If unsure, say N.
+          If unsure, say Y.
 config NFS_V4_1
-        bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
+        bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
        depends on NFS_V4 && EXPERIMENTAL
        help
          This option enables support for minor version 1 of the NFSv4 protocol
          (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
-          Unless you're an NFS developer, say N.
+          If unsure, say N.
 config ROOT_NFS
        bool "Root file system on NFS"
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e857..930d10fecdaf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
        if (inode == NULL)
                goto out_putclient;
        nfsi = NFS_I(inode);
-        down_read(&nfsi->rwsem);
+        rcu_read_lock();
-        delegation = nfsi->delegation;
+        delegation = rcu_dereference(nfsi->delegation);
        if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
                goto out_iput;
        res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
                args->bitmap[1];
        res->status = 0;
 out_iput:
-        up_read(&nfsi->rwsem);
+        rcu_read_unlock();
        iput(inode);
 out_putclient:
        nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
        return res->status;
 }
-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
-{
-#if defined(CONFIG_NFS_V4_1)
-        if (clp->cl_minorversion > 0)
-                return nfs41_validate_delegation_stateid;
-#endif
-        return nfs4_validate_delegation_stateid;
-}
 __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
        struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
                inode = nfs_delegation_find_inode(clp, &args->fh);
                if (inode != NULL) {
                        /* Set up a helper thread to actually return the delegation */
-                        switch (nfs_async_inode_return_delegation(inode, &args->stateid,
+                        switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
-                                                                  nfs_validate_delegation_stateid(clp))) {
                                case 0:
                                        res = 0;
                                        break;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d25b5257b7a1..4e7df2adb212 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        clp->cl_boot_time = CURRENT_TIME;
        clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
        clp->cl_minorversion = cl_init->minorversion;
+        clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 #endif
        cred = rpc_lookup_machine_cred();
        if (!IS_ERR(cred))
@@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
                clp->cl_session = NULL;
        }
-        clp->cl_call_sync = _nfs4_call_sync;
+        clp->cl_mvops = nfs_v4_minor_ops[0];
 #endif /* CONFIG_NFS_V4_1 */
 }
@@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 static void nfs4_destroy_callback(struct nfs_client *clp)
 {
        if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-                nfs_callback_down(clp->cl_minorversion);
+                nfs_callback_down(clp->cl_mvops->minor_version);
 }
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
                                return error;
                }
-                error = nfs_callback_up(clp->cl_minorversion,
+                error = nfs_callback_up(clp->cl_mvops->minor_version,
                                        clp->cl_rpcclient->cl_xprt);
                if (error < 0) {
                        dprintk("%s: failed to start callback. Error = %d\n",
@@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
 */
 static int nfs4_init_client_minor_version(struct nfs_client *clp)
 {
-        clp->cl_call_sync = _nfs4_call_sync;
 #if defined(CONFIG_NFS_V4_1)
-        if (clp->cl_minorversion) {
+        if (clp->cl_mvops->minor_version) {
                struct nfs4_session *session = NULL;
                /*
                 * Create the session and mark it expired.
@@ -1158,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
                        return -ENOMEM;
                clp->cl_session = session;
-                clp->cl_call_sync = _nfs4_call_sync_session;
+                /*
+                 * The create session reply races with the server back
+                 * channel probe. Mark the client NFS_CS_SESSION_INITING
+                 * so that the client back channel can find the
+                 * nfs_client struct.
+                 */
+                clp->cl_cons_state = NFS_CS_SESSION_INITING;
        }
 #endif /* CONFIG_NFS_V4_1 */
@@ -1454,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
                                data->authflavor,
                                parent_server->client->cl_xprt->prot,
                                parent_server->client->cl_timeout,
-                                parent_client->cl_minorversion);
+                                parent_client->cl_mvops->minor_version);
        if (error < 0)
                goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 301634543974..b9c3c43cea1d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -268,14 +268,6 @@ out:
        return status;
 }
-/* Sync all data to disk upon delegation return */
-static void nfs_msync_inode(struct inode *inode)
-{
-        filemap_fdatawrite(inode->i_mapping);
-        nfs_wb_all(inode);
-        filemap_fdatawait(inode->i_mapping);
-}
 /*
 * Basic procedure for returning a delegation to the server
 */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
                delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
                spin_unlock(&clp->cl_lock);
                if (delegation != NULL) {
-                        nfs_msync_inode(inode);
+                        nfs_wb_all(inode);
                        err = __nfs_inode_return_delegation(inode, delegation, 1);
                }
        }
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 /*
 * Asynchronous delegation recall!
 */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
-                                      int (*validate_stateid)(struct nfs_delegation *delegation,
-                                                              const nfs4_stateid *stateid))
 {
        struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
        struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
        rcu_read_lock();
        delegation = rcu_dereference(NFS_I(inode)->delegation);
-        if (!validate_stateid(delegation, stateid)) {
+        if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
                rcu_read_unlock();
                return -ENOENT;
        }
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b8140122..2026304bda19 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
-                                      int (*validate_stateid)(struct nfs_delegation *delegation,
-                                                              const nfs4_stateid *stateid));
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 782b431ef91c..29539ceeb745 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1652,16 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                }
        }
-        /*
-         * ... prune child dentries and writebacks if needed.
-         */
-        if (atomic_read(&old_dentry->d_count) > 1) {
-                if (S_ISREG(old_inode->i_mode))
-                        nfs_wb_all(old_inode);
-                shrink_dcache_parent(old_dentry);
-        }
        nfs_inode_return_delegation(old_inode);
        if (new_inode != NULL)
                nfs_inode_return_delegation(new_inode);
@@ -1710,7 +1701,7 @@ static void nfs_access_free_list(struct list_head *head)
        }
 }
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
        LIST_HEAD(head);
        struct nfs_inode *nfsi;
@@ -1953,7 +1944,7 @@ int nfs_permission(struct inode *inode, int mask)
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                goto out;
        /* Is this sys_access() ? */
-        if (mask & MAY_ACCESS)
+        if (mask & (MAY_ACCESS | MAY_CHDIR))
                goto force_lookup;
        switch (inode->i_mode & S_IFMT) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d6050..064a80961677 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
        /* I/O parameters */
        struct nfs_open_context *ctx;           /* file open context info */
+        struct nfs_lock_context *l_ctx;         /* Lock context info */
        struct kiocb *          iocb;           /* controlling i/o request */
        struct inode *          inode;          /* target file of i/o */
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
        INIT_LIST_HEAD(&dreq->rewrite_list);
        dreq->iocb = NULL;
        dreq->ctx = NULL;
+        dreq->l_ctx = NULL;
        spin_lock_init(&dreq->lock);
        atomic_set(&dreq->io_count, 0);
        dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
 {
        struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+        if (dreq->l_ctx != NULL)
+                nfs_put_lock_context(dreq->l_ctx);
        if (dreq->ctx != NULL)
                put_nfs_open_context(dreq->ctx);
        kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
+                data->args.lock_context = dreq->l_ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
                               unsigned long nr_segs, loff_t pos)
 {
-        ssize_t result = 0;
+        ssize_t result = -ENOMEM;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct nfs_direct_req *dreq;
        dreq = nfs_direct_req_alloc();
-        if (!dreq)
+        if (dreq == NULL)
-                return -ENOMEM;
+                goto out;
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+        dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+        if (dreq->l_ctx == NULL)
+                goto out_release;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
        result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
        if (!result)
                result = nfs_direct_wait(dreq);
+out_release:
        nfs_direct_req_release(dreq);
+out:
        return result;
 }
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
        data->args.offset = 0;
        data->args.count = 0;
        data->args.context = dreq->ctx;
+        data->args.lock_context = dreq->l_ctx;
        data->res.count = 0;
        data->res.fattr = &data->fattr;
        data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
                data->args.context = ctx;
+                data->args.lock_context = dreq->l_ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos,
                                size_t count)
 {
-        ssize_t result = 0;
+        ssize_t result = -ENOMEM;
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct nfs_direct_req *dreq;
        size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
        dreq = nfs_direct_req_alloc();
        if (!dreq)
-                return -ENOMEM;
+                goto out;
        nfs_alloc_commit_data(dreq);
        if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
        dreq->inode = inode;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+        dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+        if (dreq->l_ctx == NULL)
+                goto out_release;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
        result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
        if (!result)
                result = nfs_direct_wait(dreq);
+out_release:
        nfs_direct_req_release(dreq);
+out:
        return result;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 36a5e74f51b4..2d141a74ae82 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/aio.h>
 #include <linux/gfp.h>
+#include <linux/swap.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -202,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 }
 /*
- * Helper for nfs_file_flush() and nfs_file_fsync()
- *
- * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
- * disk, but it retrieves and clears ctx->error after synching, despite
- * the two being set at the same time in nfs_context_set_write_error().
- * This is because the former is used to notify the _next_ call to
- * nfs_file_write() that a write error occured, and hence cause it to
- * fall back to doing a synchronous write.
- */
-static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
-{
-        int have_error, status;
-        int ret = 0;
-        have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-        status = nfs_wb_all(inode);
-        have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-        if (have_error)
-                ret = xchg(&ctx->error, 0);
-        if (!ret)
-                ret = status;
-        return ret;
-}
-/*
 * Flush all dirty pages, and check for write errors.
 */
 static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
-        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct dentry   *dentry = file->f_path.dentry;
        struct inode    *inode = dentry->d_inode;
@@ -245,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
                return 0;
        /* Flush writes to the server and return any errors */
-        return nfs_do_fsync(ctx, inode);
+        return vfs_fsync(file, 0);
 }
 static ssize_t
@@ -320,6 +295,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 * Flush any dirty pages for this process, and check for write errors.
 * The return status from this call provides a reliable indication of
 * whether any write errors occurred for this process.
+ *
+ * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
+ * disk, but it retrieves and clears ctx->error after synching, despite
+ * the two being set at the same time in nfs_context_set_write_error().
+ * This is because the former is used to notify the _next_ call to
+ * nfs_file_write() that a write error occurred, and hence cause it to
+ * fall back to doing a synchronous write.
 */
 static int
 nfs_file_fsync(struct file *file, int datasync)
@@ -327,13 +309,23 @@ nfs_file_fsync(struct file *file, int datasync)
        struct dentry *dentry = file->f_path.dentry;
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode *inode = dentry->d_inode;
+        int have_error, status;
+        int ret = 0;
        dprintk("NFS: fsync file(%s/%s) datasync %d\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        datasync);
        nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-        return nfs_do_fsync(ctx, inode);
+        have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+        status = nfs_commit_inode(inode, FLUSH_SYNC);
+        have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+        if (have_error)
+                ret = xchg(&ctx->error, 0);
+        if (!ret)
+                ret = status;
+        return ret;
 }
 /*
@@ -493,11 +485,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
 */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
+        struct address_space *mapping = page->mapping;
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
        /* Only do I/O if gfp is a superset of GFP_KERNEL */
-        if ((gfp & GFP_KERNEL) == GFP_KERNEL)
+        if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
-                nfs_wb_page(page->mapping->host, page);
+                int how = FLUSH_SYNC;
+                /* Don't let kswapd deadlock waiting for OOM RPC calls */
+                if (current_is_kswapd())
+                        how = 0;
+                nfs_commit_inode(mapping->host, how);
+        }
        /* If PagePrivate() is set, then the page is not freeable */
        if (PagePrivate(page))
                return 0;
@@ -639,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
        /* Return error values for O_DSYNC and IS_SYNC() */
        if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
-                int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
+                int err = vfs_fsync(iocb->ki_filp, 0);
                if (err < 0)
                        result = err;
        }
@@ -675,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
                written = ret;
        if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
-                int err = nfs_do_fsync(nfs_file_open_context(filp), inode);
+                int err = vfs_fsync(filp, 0);
                if (err < 0)
                        ret = err;
        }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518feea..581d8f081e68 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -413,10 +413,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
                return 0;
        /* Write all dirty data */
-        if (S_ISREG(inode->i_mode)) {
+        if (S_ISREG(inode->i_mode))
-                filemap_write_and_wait(inode->i_mapping);
                nfs_wb_all(inode);
-        }
        fattr = nfs_alloc_fattr();
        if (fattr == NULL)
@@ -530,6 +528,68 @@ out:
        return err;
 }
+static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
+{
+        atomic_set(&l_ctx->count, 1);
+        l_ctx->lockowner = current->files;
+        l_ctx->pid = current->tgid;
+        INIT_LIST_HEAD(&l_ctx->list);
+}
+static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
+{
+        struct nfs_lock_context *pos;
+        list_for_each_entry(pos, &ctx->lock_context.list, list) {
+                if (pos->lockowner != current->files)
+                        continue;
+                if (pos->pid != current->tgid)
+                        continue;
+                atomic_inc(&pos->count);
+                return pos;
+        }
+        return NULL;
+}
+struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
+{
+        struct nfs_lock_context *res, *new = NULL;
+        struct inode *inode = ctx->path.dentry->d_inode;
+        spin_lock(&inode->i_lock);
+        res = __nfs_find_lock_context(ctx);
+        if (res == NULL) {
+                spin_unlock(&inode->i_lock);
+                new = kmalloc(sizeof(*new), GFP_KERNEL);
+                if (new == NULL)
+                        return NULL;
+                nfs_init_lock_context(new);
+                spin_lock(&inode->i_lock);
+                res = __nfs_find_lock_context(ctx);
+                if (res == NULL) {
+                        list_add_tail(&new->list, &ctx->lock_context.list);
+                        new->open_context = ctx;
+                        res = new;
+                        new = NULL;
+                }
+        }
+        spin_unlock(&inode->i_lock);
+        kfree(new);
+        return res;
+}
+void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
+{
+        struct nfs_open_context *ctx = l_ctx->open_context;
+        struct inode *inode = ctx->path.dentry->d_inode;
+        if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
+                return;
+        list_del(&l_ctx->list);
+        spin_unlock(&inode->i_lock);
+        kfree(l_ctx);
+}
 /**
 * nfs_close_context - Common close_context() routine NFSv2/v3
 * @ctx: pointer to context
@@ -566,11 +626,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
                path_get(&ctx->path);
                ctx->cred = get_rpccred(cred);
                ctx->state = NULL;
-                ctx->lockowner = current->files;
                ctx->flags = 0;
                ctx->error = 0;
                ctx->dir_cookie = 0;
-                atomic_set(&ctx->count, 1);
+                nfs_init_lock_context(&ctx->lock_context);
+                ctx->lock_context.open_context = ctx;
        }
        return ctx;
 }
@@ -578,7 +638,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
 struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
 {
        if (ctx != NULL)
-                atomic_inc(&ctx->count);
+                atomic_inc(&ctx->lock_context.count);
        return ctx;
 }
@@ -586,7 +646,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
        struct inode *inode = ctx->path.dentry->d_inode;
-        if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
+        if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
                return;
        list_del(&ctx->list);
        spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386c..4c2150d86714 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 /* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+                                        int nr_to_scan, gfp_t gfp_mask);
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
@@ -369,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 * Helper for restarting RPC calls in the possible presence of NFSv4.1
 * sessions.
 */
-static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
+static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
 {
        if (nfs4_has_session(clp))
-                rpc_restart_call_prepare(task);
+                return rpc_restart_call_prepare(task);
-        else
+        return rpc_restart_call(task);
-                rpc_restart_call(task);
 }
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf14257916..db8846a0e82e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
 static int
 nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        u32 offset = (u32)args->offset;
        u32 count = args->count;
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
 static int
 nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
 {
-        struct rpc_task *task = req->rq_task;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
-        struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
        unsigned int replen;
        u32 count = args->count;
@@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 static int
 nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 75dcfc7da365..9769704f8ce6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
 static int
 nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        u32 count = args->count;
@@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
 static int
 nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        u32 count = args->count;
@@ -675,7 +675,7 @@ static int
 nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
                    struct nfs3_getaclargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        p = xdr_encode_fhandle(p, args->fh);
@@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 static int
 nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
 {
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        unsigned int replen;
        p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c538c6106e16..311e15cc8af0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
        NFS4CLNT_RECLAIM_NOGRACE,
        NFS4CLNT_DELEGRETURN,
        NFS4CLNT_SESSION_RESET,
-        NFS4CLNT_SESSION_DRAINING,
        NFS4CLNT_RECALL_SLOT,
 };
+enum nfs4_session_state {
+        NFS4_SESSION_INITING,
+        NFS4_SESSION_DRAINING,
+};
+struct nfs4_minor_version_ops {
+        u32     minor_version;
+        int     (*call_sync)(struct nfs_server *server,
+                        struct rpc_message *msg,
+                        struct nfs4_sequence_args *args,
+                        struct nfs4_sequence_res *res,
+                        int cache_reply);
+        int     (*validate_stateid)(struct nfs_delegation *,
+                        const nfs4_stateid *);
+        const struct nfs4_state_recovery_ops *reboot_recovery_ops;
+        const struct nfs4_state_recovery_ops *nograce_recovery_ops;
+        const struct nfs4_state_maintenance_ops *state_renewal_ops;
+};
 /*
 * struct rpc_sequence ensures that RPC calls are sent in the exact
 * order that they appear on the list.
@@ -89,7 +108,6 @@ struct nfs_unique_id {
 */
 struct nfs4_state_owner {
        struct nfs_unique_id so_owner_id;
-        struct nfs_client    *so_client;
        struct nfs_server    *so_server;
        struct rb_node       so_client_node;
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
        atomic_t             so_count;
        unsigned long        so_flags;
        struct list_head     so_states;
-        struct list_head     so_delegations;
        struct nfs_seqid_counter so_seqid;
        struct rpc_sequence  so_sequence;
 };
@@ -125,10 +142,20 @@ enum {
 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
 */
+struct nfs4_lock_owner {
+        unsigned int lo_type;
+#define NFS4_ANY_LOCK_TYPE      (0U)
+#define NFS4_FLOCK_LOCK_TYPE    (1U << 0)
+#define NFS4_POSIX_LOCK_TYPE    (1U << 1)
+        union {
+                fl_owner_t posix_owner;
+                pid_t flock_owner;
+        } lo_u;
+};
 struct nfs4_lock_state {
        struct list_head        ls_locks;       /* Other lock stateids */
        struct nfs4_state *     ls_state;       /* Pointer to open state */
-        fl_owner_t              ls_owner;       /* POSIX lock owner */
 #define NFS_LOCK_INITIALIZED 1
        int                     ls_flags;
        struct nfs_seqid_counter        ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
        struct nfs_unique_id    ls_id;
        nfs4_stateid            ls_stateid;
        atomic_t                ls_count;
+        struct nfs4_lock_owner  ls_owner;
 };
 /* bits for nfs4_state->flags */
@@ -219,11 +247,15 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page);
+extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
 #if defined(CONFIG_NFS_V4_1)
-extern int nfs4_setup_sequence(struct nfs_client *clp,
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+        return server->nfs_client->cl_session;
+}
+extern int nfs4_setup_sequence(const struct nfs_server *server,
                struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
                int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
                struct nfs_fsinfo *fsinfo);
 #else /* CONFIG_NFS_v4_1 */
-static inline int nfs4_setup_sequence(struct nfs_client *clp,
+static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
+{
+        return NULL;
+}
+static inline int nfs4_setup_sequence(const struct nfs_server *server,
                struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
                int cache_reply, struct rpc_task *task)
 {
@@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
 }
 #endif /* CONFIG_NFS_V4_1 */
-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
+extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
@@ -284,7 +321,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
+extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
 extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 70015dd60a98..7ffbb98ddec3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -303,15 +303,19 @@ do_state_recovery:
 }
-static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
 {
-        struct nfs_client *clp = server->nfs_client;
        spin_lock(&clp->cl_lock);
        if (time_before(clp->cl_last_renewal,timestamp))
                clp->cl_last_renewal = timestamp;
        spin_unlock(&clp->cl_lock);
 }
+static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
+{
+        do_renew_lease(server->nfs_client, timestamp);
+}
 #if defined(CONFIG_NFS_V4_1)
 /*
@@ -356,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
 {
        struct rpc_task *task;
-        if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) {
+        if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
                task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
                if (task)
                        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -370,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
        complete(&ses->complete);
 }
-static void nfs41_sequence_free_slot(const struct nfs_client *clp,
+static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
-                              struct nfs4_sequence_res *res)
 {
        struct nfs4_slot_table *tbl;
-        tbl = &clp->cl_session->fc_slot_table;
+        tbl = &res->sr_session->fc_slot_table;
        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
                /* just wake up the next guy waiting since
                 * we may have not consumed a slot after all */
@@ -385,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
        spin_lock(&tbl->slot_tbl_lock);
        nfs4_free_slot(tbl, res->sr_slotid);
-        nfs41_check_drain_session_complete(clp->cl_session);
+        nfs41_check_drain_session_complete(res->sr_session);
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
-static void nfs41_sequence_done(struct nfs_client *clp,
+static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
-                                struct nfs4_sequence_res *res,
-                                int rpc_status)
 {
        unsigned long timestamp;
        struct nfs4_slot_table *tbl;
        struct nfs4_slot *slot;
+        struct nfs_client *clp;
        /*
         * sr_status remains 1 if an RPC level error occurred. The server
@@ -411,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp,
        if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
                goto out;
+        tbl = &res->sr_session->fc_slot_table;
+        slot = tbl->slots + res->sr_slotid;
        /* Check the SEQUENCE operation status */
-        if (res->sr_status == 0) {
+        switch (res->sr_status) {
-                tbl = &clp->cl_session->fc_slot_table;
+        case 0:
-                slot = tbl->slots + res->sr_slotid;
                /* Update the slot's sequence and clientid lease timer */
                ++slot->seq_nr;
                timestamp = res->sr_renewal_time;
-                spin_lock(&clp->cl_lock);
+                clp = res->sr_session->clp;
-                if (time_before(clp->cl_last_renewal, timestamp))
+                do_renew_lease(clp, timestamp);
-                        clp->cl_last_renewal = timestamp;
-                spin_unlock(&clp->cl_lock);
                /* Check sequence flags */
                if (atomic_read(&clp->cl_count) > 1)
                        nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+                break;
+        case -NFS4ERR_DELAY:
+                /* The server detected a resend of the RPC call and
+                 * returned NFS4ERR_DELAY as per Section 2.10.6.2
+                 * of RFC5661.
+                 */
+                dprintk("%s: slot=%d seq=%d: Operation in progress\n",
+                                __func__, res->sr_slotid, slot->seq_nr);
+                goto out_retry;
+        default:
+                /* Just update the slot sequence no. */
+                ++slot->seq_nr;
        }
 out:
        /* The session may be reset by one of the error handlers. */
        dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
-        nfs41_sequence_free_slot(clp, res);
+        nfs41_sequence_free_slot(res);
+        return 1;
+out_retry:
+        if (!rpc_restart_call(task))
+                goto out;
+        rpc_delay(task, NFS4_POLL_RETRY_MAX);
+        return 0;
+}
+static int nfs4_sequence_done(struct rpc_task *task,
+                               struct nfs4_sequence_res *res)
+{
+        if (res->sr_session == NULL)
+                return 1;
+        return nfs41_sequence_done(task, res);
 }
 /*
@@ -480,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
                return 0;
-        memset(res, 0, sizeof(*res));
        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
        tbl = &session->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
-        if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) &&
+        if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
            !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
                /*
                 * The state manager will wait until the slot table is empty.
@@ -525,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        res->sr_session = session;
        res->sr_slotid = slotid;
        res->sr_renewal_time = jiffies;
+        res->sr_status_flags = 0;
        /*
         * sr_status is only set in decode_sequence, and so will remain
         * set to 1 if an rpc level failure occurs.
@@ -533,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        return 0;
 }
-int nfs4_setup_sequence(struct nfs_client *clp,
+int nfs4_setup_sequence(const struct nfs_server *server,
                        struct nfs4_sequence_args *args,
                        struct nfs4_sequence_res *res,
                        int cache_reply,
                        struct rpc_task *task)
 {
+        struct nfs4_session *session = nfs4_get_session(server);
        int ret = 0;
+        if (session == NULL) {
+                args->sa_session = NULL;
+                res->sr_session = NULL;
+                goto out;
+        }
        dprintk("--> %s clp %p session %p sr_slotid %d\n",
-                __func__, clp, clp->cl_session, res->sr_slotid);
+                __func__, session->clp, session, res->sr_slotid);
-        if (!nfs4_has_session(clp))
+        ret = nfs41_setup_sequence(session, args, res, cache_reply,
-                goto out;
-        ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
                                   task);
-        if (ret && ret != -EAGAIN) {
-                /* terminate rpc task */
-                task->tk_status = ret;
-                task->tk_action = NULL;
-        }
 out:
        dprintk("<-- %s status=%d\n", __func__, ret);
        return ret;
 }
 struct nfs41_call_sync_data {
-        struct nfs_client *clp;
+        const struct nfs_server *seq_server;
        struct nfs4_sequence_args *seq_args;
        struct nfs4_sequence_res *seq_res;
        int cache_reply;
@@ -569,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs41_call_sync_data *data = calldata;
-        dprintk("--> %s data->clp->cl_session %p\n", __func__,
+        dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
-                data->clp->cl_session);
-        if (nfs4_setup_sequence(data->clp, data->seq_args,
+        if (nfs4_setup_sequence(data->seq_server, data->seq_args,
                                data->seq_res, data->cache_reply, task))
                return;
        rpc_call_start(task);
@@ -587,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 {
        struct nfs41_call_sync_data *data = calldata;
-        nfs41_sequence_done(data->clp, data->seq_res, task->tk_status);
+        nfs41_sequence_done(task, data->seq_res);
 }
 struct rpc_call_ops nfs41_call_sync_ops = {
@@ -600,8 +628,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
        .rpc_call_done = nfs41_call_sync_done,
 };
-static int nfs4_call_sync_sequence(struct nfs_client *clp,
+static int nfs4_call_sync_sequence(struct nfs_server *server,
-                                   struct rpc_clnt *clnt,
                                   struct rpc_message *msg,
                                   struct nfs4_sequence_args *args,
                                   struct nfs4_sequence_res *res,
@@ -611,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
        int ret;
        struct rpc_task *task;
        struct nfs41_call_sync_data data = {
-                .clp = clp,
+                .seq_server = server,
                .seq_args = args,
                .seq_res = res,
                .cache_reply = cache_reply,
        };
        struct rpc_task_setup task_setup = {
-                .rpc_client = clnt,
+                .rpc_client = server->client,
                .rpc_message = msg,
                .callback_ops = &nfs41_call_sync_ops,
                .callback_data = &data
@@ -642,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
                            struct nfs4_sequence_res *res,
                            int cache_reply)
 {
-        return nfs4_call_sync_sequence(server->nfs_client, server->client,
+        return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
-                                       msg, args, res, cache_reply, 0);
 }
+#else
+static int nfs4_sequence_done(struct rpc_task *task,
+                               struct nfs4_sequence_res *res)
+{
+        return 1;
+}
 #endif /* CONFIG_NFS_V4_1 */
 int _nfs4_call_sync(struct nfs_server *server,
@@ -659,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server,
 }
 #define nfs4_call_sync(server, msg, args, res, cache_reply) \
-        (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \
+        (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
                        &(res)->seq_res, (cache_reply))
-static void nfs4_sequence_done(const struct nfs_server *server,
-                               struct nfs4_sequence_res *res, int rpc_status)
-{
-#ifdef CONFIG_NFS_V4_1
-        if (nfs4_has_session(server->nfs_client))
-                nfs41_sequence_done(server->nfs_client, res, rpc_status);
-#endif /* CONFIG_NFS_V4_1 */
-}
 static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
        struct nfs_inode *nfsi = NFS_I(dir);
@@ -745,19 +768,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
        p->o_arg.server = server;
        p->o_arg.bitmask = server->attr_bitmask;
        p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
-        if (flags & O_EXCL) {
+        if (flags & O_CREAT) {
-                if (nfs4_has_persistent_session(server->nfs_client)) {
+                u32 *s;
-                        /* GUARDED */
-                        p->o_arg.u.attrs = &p->attrs;
-                        memcpy(&p->attrs, attrs, sizeof(p->attrs));
-                } else { /* EXCLUSIVE4_1 */
-                        u32 *s = (u32 *) p->o_arg.u.verifier.data;
-                        s[0] = jiffies;
-                        s[1] = current->pid;
-                }
-        } else if (flags & O_CREAT) {
                p->o_arg.u.attrs = &p->attrs;
                memcpy(&p->attrs, attrs, sizeof(p->attrs));
+                s = (u32 *) p->o_arg.u.verifier.data;
+                s[0] = jiffies;
+                s[1] = current->pid;
        }
        p->c_arg.fh = &p->o_res.fh;
        p->c_arg.stateid = &p->o_res.stateid;
@@ -1255,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
        struct nfs4_opendata *data = calldata;
        data->rpc_status = task->tk_status;
-        if (RPC_ASSASSINATED(task))
-                return;
        if (data->rpc_status == 0) {
                memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
                                sizeof(data->o_res.stateid.data));
@@ -1356,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        }
        /* Update sequence id. */
        data->o_arg.id = sp->so_owner_id.id;
-        data->o_arg.clientid = sp->so_client->cl_clientid;
+        data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
        if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
                nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
        }
        data->timestamp = jiffies;
-        if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
+        if (nfs4_setup_sequence(data->o_arg.server,
                                &data->o_arg.seq_args,
                                &data->o_res.seq_res, 1, task))
                return;
@@ -1385,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
        data->rpc_status = task->tk_status;
-        nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res,
+        if (!nfs4_sequence_done(task, &data->o_res.seq_res))
-                        task->tk_status);
-        if (RPC_ASSASSINATED(task))
                return;
        if (task->tk_status == 0) {
                switch (data->o_res.f_attr->mode & S_IFMT) {
                        case S_IFREG:
@@ -1773,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
                /* Use that stateid */
        } else if (state != NULL) {
-                nfs4_copy_stateid(&arg.stateid, state, current->files);
+                nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
        } else
                memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1838,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
        struct nfs4_state *state = calldata->state;
        struct nfs_server *server = NFS_SERVER(calldata->inode);
-        nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
+        if (!nfs4_sequence_done(task, &calldata->res.seq_res))
-        if (RPC_ASSASSINATED(task))
                return;
        /* hmm. we are done with the inode, and in the process of freeing
         * the state_owner. we keep this around to process errors
@@ -1903,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
        nfs_fattr_init(calldata->res.fattr);
        calldata->timestamp = jiffies;
-        if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
+        if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
                                &calldata->arg.seq_args, &calldata->res.seq_res,
                                1, task))
                return;
@@ -2648,7 +2661,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
        struct nfs_removeres *res = task->tk_msg.rpc_resp;
-        nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
+        if (!nfs4_sequence_done(task, &res->seq_res))
+                return 0;
        if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
                return 0;
        update_changeattr(dir, &res->cinfo);
@@ -3093,7 +3107,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
        dprintk("--> %s\n", __func__);
-        nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
+        if (!nfs4_sequence_done(task, &data->res.seq_res))
+                return -EAGAIN;
        if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
                nfs_restart_rpc(task, server->nfs_client);
@@ -3116,8 +3131,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        struct inode *inode = data->inode;
        
-        nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+        if (!nfs4_sequence_done(task, &data->res.seq_res))
-                           task->tk_status);
+                return -EAGAIN;
        if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
                nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3145,8 +3160,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        struct inode *inode = data->inode;
        
-        nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+        if (!nfs4_sequence_done(task, &data->res.seq_res))
-                           task->tk_status);
+                return -EAGAIN;
        if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
                nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
                return -EAGAIN;
@@ -3196,10 +3212,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
                        nfs4_schedule_state_recovery(clp);
                return;
        }
-        spin_lock(&clp->cl_lock);
+        do_renew_lease(clp, timestamp);
-        if (time_before(clp->cl_last_renewal,timestamp))
-                clp->cl_last_renewal = timestamp;
-        spin_unlock(&clp->cl_lock);
 }
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3240,10 +3253,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
        status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
        if (status < 0)
                return status;
-        spin_lock(&clp->cl_lock);
+        do_renew_lease(clp, now);
-        if (time_before(clp->cl_last_renewal,now))
-                clp->cl_last_renewal = now;
-        spin_unlock(&clp->cl_lock);
        return 0;
 }
@@ -3464,9 +3474,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 static int
-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
-        if (!clp || task->tk_status >= 0)
+        struct nfs_client *clp = server->nfs_client;
+        if (task->tk_status >= 0)
                return 0;
        switch(task->tk_status) {
                case -NFS4ERR_ADMIN_REVOKED:
@@ -3498,8 +3510,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                        return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
                case -NFS4ERR_DELAY:
-                        if (server)
+                        nfs_inc_server_stats(server, NFSIOS_DELAY);
-                                nfs_inc_server_stats(server, NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
                case -EKEYEXPIRED:
                        rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3520,12 +3531,6 @@ do_state_recovery:
        return -EAGAIN;
 }
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-{
-        return _nfs4_async_handle_error(task, server, server->nfs_client, state);
-}
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                unsigned short port, struct rpc_cred *cred,
                struct nfs4_setclientid_res *res)
@@ -3641,8 +3646,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_delegreturndata *data = calldata;
-        nfs4_sequence_done(data->res.server, &data->res.seq_res,
+        if (!nfs4_sequence_done(task, &data->res.seq_res))
-                        task->tk_status);
+                return;
        switch (task->tk_status) {
        case -NFS4ERR_STALE_STATEID:
@@ -3672,7 +3677,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
        d_data = (struct nfs4_delegreturndata *)data;
-        if (nfs4_setup_sequence(d_data->res.server->nfs_client,
+        if (nfs4_setup_sequence(d_data->res.server,
                                &d_data->args.seq_args,
                                &d_data->res.seq_res, 1, task))
                return;
@@ -3892,9 +3897,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
        struct nfs4_unlockdata *calldata = data;
-        nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
+        if (!nfs4_sequence_done(task, &calldata->res.seq_res))
-                           task->tk_status);
-        if (RPC_ASSASSINATED(task))
                return;
        switch (task->tk_status) {
                case 0:
@@ -3927,7 +3930,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
                return;
        }
        calldata->timestamp = jiffies;
-        if (nfs4_setup_sequence(calldata->server->nfs_client,
+        if (nfs4_setup_sequence(calldata->server,
                                &calldata->arg.seq_args,
                                &calldata->res.seq_res, 1, task))
                return;
@@ -4082,7 +4085,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
        } else
                data->arg.new_lock_owner = 0;
        data->timestamp = jiffies;
-        if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
+        if (nfs4_setup_sequence(data->server,
+                                &data->arg.seq_args,
                                &data->res.seq_res, 1, task))
                return;
        rpc_call_start(task);
@@ -4101,12 +4105,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
        dprintk("%s: begin!\n", __func__);
-        nfs4_sequence_done(data->server, &data->res.seq_res,
+        if (!nfs4_sequence_done(task, &data->res.seq_res))
-                        task->tk_status);
+                return;
        data->rpc_status = task->tk_status;
-        if (RPC_ASSASSINATED(task))
-                goto out;
        if (data->arg.new_lock_owner != 0) {
                if (data->rpc_status == 0)
                        nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4424,6 +4426,34 @@ out:
        return err;
 }
+static void nfs4_release_lockowner_release(void *calldata)
+{
+        kfree(calldata);
+}
+const struct rpc_call_ops nfs4_release_lockowner_ops = {
+        .rpc_release = nfs4_release_lockowner_release,
+};
+void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+{
+        struct nfs_server *server = lsp->ls_state->owner->so_server;
+        struct nfs_release_lockowner_args *args;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+        };
+        if (server->nfs_client->cl_mvops->minor_version != 0)
+                return;
+        args = kmalloc(sizeof(*args), GFP_NOFS);
+        if (!args)
+                return;
+        args->lock_owner.clientid = server->nfs_client->cl_clientid;
+        args->lock_owner.id = lsp->ls_id.id;
+        msg.rpc_argp = args;
+        rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+}
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4611,7 +4641,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
                        (struct nfs4_get_lease_time_data *)calldata;
        dprintk("--> %s\n", __func__);
-        nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+        if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+                return;
        switch (task->tk_status) {
        case -NFS4ERR_DELAY:
        case -NFS4ERR_GRACE:
@@ -4805,13 +4836,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
        if (!session)
                return NULL;
-        /*
-         * The create session reply races with the server back
-         * channel probe. Mark the client NFS_CS_SESSION_INITING
-         * so that the client back channel can find the
-         * nfs_client struct
-         */
-        clp->cl_cons_state = NFS_CS_SESSION_INITING;
        init_completion(&session->complete);
        tbl = &session->fc_slot_table;
@@ -4824,6 +4848,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
        spin_lock_init(&tbl->slot_tbl_lock);
        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+        session->session_state = 1<<NFS4_SESSION_INITING;
        session->clp = clp;
        return session;
 }
@@ -5040,6 +5066,10 @@ int nfs4_init_session(struct nfs_server *server)
        if (!nfs4_has_session(clp))
                return 0;
+        session = clp->cl_session;
+        if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+                return 0;
        rsize = server->rsize;
        if (rsize == 0)
                rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5047,7 +5077,6 @@ int nfs4_init_session(struct nfs_server *server)
        if (wsize == 0)
                wsize = NFS_MAX_FILE_IO_SIZE;
-        session = clp->cl_session;
        session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
        session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
@@ -5060,69 +5089,70 @@ int nfs4_init_session(struct nfs_server *server)
 /*
 * Renew the cl_session lease.
 */
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+struct nfs4_sequence_data {
-{
+        struct nfs_client *clp;
        struct nfs4_sequence_args args;
        struct nfs4_sequence_res res;
+};
-        struct rpc_message msg = {
-                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
-                .rpc_argp = &args,
-                .rpc_resp = &res,
-                .rpc_cred = cred,
-        };
-        args.sa_cache_this = 0;
-        return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
-                                       &res, args.sa_cache_this, 1);
-}
 static void nfs41_sequence_release(void *data)
 {
-        struct nfs_client *clp = (struct nfs_client *)data;
+        struct nfs4_sequence_data *calldata = data;
+        struct nfs_client *clp = calldata->clp;
        if (atomic_read(&clp->cl_count) > 1)
                nfs4_schedule_state_renewal(clp);
        nfs_put_client(clp);
+        kfree(calldata);
+}
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+        switch(task->tk_status) {
+        case -NFS4ERR_DELAY:
+        case -EKEYEXPIRED:
+                rpc_delay(task, NFS4_POLL_RETRY_MAX);
+                return -EAGAIN;
+        default:
+                nfs4_schedule_state_recovery(clp);
+        }
+        return 0;
 }
 static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 {
-        struct nfs_client *clp = (struct nfs_client *)data;
+        struct nfs4_sequence_data *calldata = data;
+        struct nfs_client *clp = calldata->clp;
-        nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
+        if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+                return;
        if (task->tk_status < 0) {
                dprintk("%s ERROR %d\n", __func__, task->tk_status);
                if (atomic_read(&clp->cl_count) == 1)
                        goto out;
-                if (_nfs4_async_handle_error(task, NULL, clp, NULL)
+                if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
-                                                                == -EAGAIN) {
+                        rpc_restart_call_prepare(task);
-                        nfs_restart_rpc(task, clp);
                        return;
                }
        }
        dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 out:
-        kfree(task->tk_msg.rpc_argp);
-        kfree(task->tk_msg.rpc_resp);
        dprintk("<-- %s\n", __func__);
 }
 static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 {
-        struct nfs_client *clp;
+        struct nfs4_sequence_data *calldata = data;
+        struct nfs_client *clp = calldata->clp;
        struct nfs4_sequence_args *args;
        struct nfs4_sequence_res *res;
-        clp = (struct nfs_client *)data;
        args = task->tk_msg.rpc_argp;
        res = task->tk_msg.rpc_resp;
-        if (nfs4_setup_sequence(clp, args, res, 0, task))
+        if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
                return;
        rpc_call_start(task);
 }
@@ -5133,32 +5163,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
        .rpc_release = nfs41_sequence_release,
 };
-static int nfs41_proc_async_sequence(struct nfs_client *clp,
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
-                                     struct rpc_cred *cred)
 {
-        struct nfs4_sequence_args *args;
+        struct nfs4_sequence_data *calldata;
-        struct nfs4_sequence_res *res;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
                .rpc_cred = cred,
        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = clp->cl_rpcclient,
+                .rpc_message = &msg,
+                .callback_ops = &nfs41_sequence_ops,
+                .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+        };
        if (!atomic_inc_not_zero(&clp->cl_count))
-                return -EIO;
+                return ERR_PTR(-EIO);
-        args = kzalloc(sizeof(*args), GFP_NOFS);
+        calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
-        res = kzalloc(sizeof(*res), GFP_NOFS);
+        if (calldata == NULL) {
-        if (!args || !res) {
-                kfree(args);
-                kfree(res);
                nfs_put_client(clp);
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
        }
-        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-        msg.rpc_argp = args;
+        msg.rpc_argp = &calldata->args;
-        msg.rpc_resp = res;
+        msg.rpc_resp = &calldata->res;
+        calldata->clp = clp;
+        task_setup_data.callback_data = calldata;
-        return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
+        return rpc_run_task(&task_setup_data);
-                              &nfs41_sequence_ops, (void *)clp);
+}
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+        struct rpc_task *task;
+        int ret = 0;
+        task = _nfs41_proc_sequence(clp, cred);
+        if (IS_ERR(task))
+                ret = PTR_ERR(task);
+        else
+                rpc_put_task(task);
+        dprintk("<-- %s status=%d\n", __func__, ret);
+        return ret;
+}
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+        struct rpc_task *task;
+        int ret;
+        task = _nfs41_proc_sequence(clp, cred);
+        if (IS_ERR(task)) {
+                ret = PTR_ERR(task);
+                goto out;
+        }
+        ret = rpc_wait_for_completion_task(task);
+        if (!ret)
+                ret = task->tk_status;
+        rpc_put_task(task);
+out:
+        dprintk("<-- %s status=%d\n", __func__, ret);
+        return ret;
 }
 struct nfs4_reclaim_complete_data {
@@ -5172,13 +5237,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
        struct nfs4_reclaim_complete_data *calldata = data;
        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-        if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+        if (nfs41_setup_sequence(calldata->clp->cl_session,
+                                &calldata->arg.seq_args,
                                &calldata->res.seq_res, 0, task))
                return;
        rpc_call_start(task);
 }
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+        switch(task->tk_status) {
+        case 0:
+        case -NFS4ERR_COMPLETE_ALREADY:
+        case -NFS4ERR_WRONG_CRED: /* What to do here? */
+                break;
+        case -NFS4ERR_DELAY:
+        case -EKEYEXPIRED:
+                rpc_delay(task, NFS4_POLL_RETRY_MAX);
+                return -EAGAIN;
+        default:
+                nfs4_schedule_state_recovery(clp);
+        }
+        return 0;
+}
 static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 {
        struct nfs4_reclaim_complete_data *calldata = data;
@@ -5186,32 +5269,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
        struct nfs4_sequence_res *res = &calldata->res.seq_res;
        dprintk("--> %s\n", __func__);
-        nfs41_sequence_done(clp, res, task->tk_status);
+        if (!nfs41_sequence_done(task, res))
-        switch (task->tk_status) {
+                return;
-        case 0:
-        case -NFS4ERR_COMPLETE_ALREADY:
-                break;
-        case -NFS4ERR_BADSESSION:
-        case -NFS4ERR_DEADSESSION:
-                /*
-                 * Handle the session error, but do not retry the operation, as
-                 * we have no way of telling whether the clientid had to be
-                 * reset before we got our reply.  If reset, a new wave of
-                 * reclaim operations will follow, containing their own reclaim
-                 * complete.  We don't want our retry to get on the way of
-                 * recovery by incorrectly indicating to the server that we're
-                 * done reclaiming state since the process had to be restarted.
-                 */
-                _nfs4_async_handle_error(task, NULL, clp, NULL);
-                break;
-        default:
-                if (_nfs4_async_handle_error(
-                                task, NULL, clp, NULL) == -EAGAIN) {
-                        rpc_restart_call_prepare(task);
-                        return;
-                }
-        }
+        if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+                rpc_restart_call_prepare(task);
+                return;
+        }
        dprintk("<-- %s\n", __func__);
 }
@@ -5325,28 +5389,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
-/*
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
- * Per minor version reboot and network partition recovery ops
+        .minor_version = 0,
- */
+        .call_sync = _nfs4_call_sync,
+        .validate_stateid = nfs4_validate_delegation_stateid,
-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
+        .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
-        &nfs40_reboot_recovery_ops,
+        .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
+        .state_renewal_ops = &nfs40_state_renewal_ops,
-        &nfs41_reboot_recovery_ops,
-#endif
 };
-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
-        &nfs40_nograce_recovery_ops,
 #if defined(CONFIG_NFS_V4_1)
-        &nfs41_nograce_recovery_ops,
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
-#endif
+        .minor_version = 1,
+        .call_sync = _nfs4_call_sync_session,
+        .validate_stateid = nfs41_validate_delegation_stateid,
+        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+        .state_renewal_ops = &nfs41_state_renewal_ops,
 };
+#endif
-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
-        &nfs40_state_renewal_ops,
+        [0] = &nfs_v4_0_minor_ops,
 #if defined(CONFIG_NFS_V4_1)
-        &nfs41_state_renewal_ops,
+        [1] = &nfs_v4_1_minor_ops,
 #endif
 };
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b72..72b6c580af13 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
-        struct nfs4_state_maintenance_ops *ops;
+        const struct nfs4_state_maintenance_ops *ops;
        struct nfs_client *clp =
                container_of(work, struct nfs_client, cl_renewd.work);
        struct rpc_cred *cred;
        long lease;
        unsigned long last, now;
-        ops = nfs4_state_renewal_ops[clp->cl_minorversion];
+        ops = clp->cl_mvops->state_renewal_ops;
        dprintk("%s: start\n", __func__);
        /* Are there any active superblocks? */
        if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 34acf5926fdc..3e2f19b04c06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -145,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
        struct nfs4_session *ses = clp->cl_session;
        int max_slots;
-        if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+        if (ses == NULL)
+                return;
+        if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
                spin_lock(&ses->fc_slot_table.slot_tbl_lock);
                max_slots = ses->fc_slot_table.max_slots;
                while (max_slots--) {
@@ -167,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
        struct nfs4_slot_table *tbl = &ses->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
-        set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+        set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
        if (tbl->highest_used_slotid != -1) {
                INIT_COMPLETION(ses->complete);
                spin_unlock(&tbl->slot_tbl_lock);
@@ -371,7 +373,6 @@ nfs4_alloc_state_owner(void)
                return NULL;
        spin_lock_init(&sp->so_lock);
        INIT_LIST_HEAD(&sp->so_states);
-        INIT_LIST_HEAD(&sp->so_delegations);
        rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
        sp->so_seqid.sequence = &sp->so_sequence;
        spin_lock_init(&sp->so_sequence.lock);
@@ -384,7 +385,7 @@ static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
        if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-                struct nfs_client *clp = sp->so_client;
+                struct nfs_client *clp = sp->so_server->nfs_client;
                spin_lock(&clp->cl_lock);
                rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -406,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
        new = nfs4_alloc_state_owner();
        if (new == NULL)
                return NULL;
-        new->so_client = clp;
        new->so_server = server;
        new->so_cred = cred;
        spin_lock(&clp->cl_lock);
@@ -423,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-        struct nfs_client *clp = sp->so_client;
+        struct nfs_client *clp = sp->so_server->nfs_client;
        struct rpc_cred *cred = sp->so_cred;
        if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -602,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 * that is compatible with current->files
 */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
        struct nfs4_lock_state *pos;
        list_for_each_entry(pos, &state->lock_states, ls_locks) {
-                if (pos->ls_owner != fl_owner)
+                if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
                        continue;
+                switch (pos->ls_owner.lo_type) {
+                case NFS4_POSIX_LOCK_TYPE:
+                        if (pos->ls_owner.lo_u.posix_owner != fl_owner)
+                                continue;
+                        break;
+                case NFS4_FLOCK_LOCK_TYPE:
+                        if (pos->ls_owner.lo_u.flock_owner != fl_pid)
+                                continue;
+                }
                atomic_inc(&pos->ls_count);
                return pos;
        }
@@ -619,10 +628,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 * exists, return an uninitialized one.
 *
 */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
        struct nfs4_lock_state *lsp;
-        struct nfs_client *clp = state->owner->so_client;
+        struct nfs_client *clp = state->owner->so_server->nfs_client;
        lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
        if (lsp == NULL)
@@ -633,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
        lsp->ls_seqid.sequence = &lsp->ls_sequence;
        atomic_set(&lsp->ls_count, 1);
        lsp->ls_state = state;
-        lsp->ls_owner = fl_owner;
+        lsp->ls_owner.lo_type = type;
+        switch (lsp->ls_owner.lo_type) {
+        case NFS4_FLOCK_LOCK_TYPE:
+                lsp->ls_owner.lo_u.flock_owner = fl_pid;
+                break;
+        case NFS4_POSIX_LOCK_TYPE:
+                lsp->ls_owner.lo_u.posix_owner = fl_owner;
+                break;
+        default:
+                kfree(lsp);
+                return NULL;
+        }
        spin_lock(&clp->cl_lock);
        nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
        spin_unlock(&clp->cl_lock);
@@ -643,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-        struct nfs_client *clp = lsp->ls_state->owner->so_client;
+        struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
        spin_lock(&clp->cl_lock);
        nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -657,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 * exists, return an uninitialized one.
 *
 */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
 {
        struct nfs4_lock_state *lsp, *new = NULL;
        
        for(;;) {
                spin_lock(&state->state_lock);
-                lsp = __nfs4_find_lock_state(state, owner);
+                lsp = __nfs4_find_lock_state(state, owner, pid, type);
                if (lsp != NULL)
                        break;
                if (new != NULL) {
@@ -674,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
                        break;
                }
                spin_unlock(&state->state_lock);
-                new = nfs4_alloc_lock_state(state, owner);
+                new = nfs4_alloc_lock_state(state, owner, pid, type);
                if (new == NULL)
                        return NULL;
        }
@@ -701,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
        if (list_empty(&state->lock_states))
                clear_bit(LK_STATE_IN_USE, &state->flags);
        spin_unlock(&state->state_lock);
+        if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+                nfs4_release_lockowner(lsp);
        nfs4_free_lock_state(lsp);
 }
@@ -728,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
        if (fl->fl_ops != NULL)
                return 0;
-        lsp = nfs4_get_lock_state(state, fl->fl_owner);
+        if (fl->fl_flags & FL_POSIX)
+                lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
+        else if (fl->fl_flags & FL_FLOCK)
+                lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+        else
+                return -EINVAL;
        if (lsp == NULL)
                return -ENOMEM;
        fl->fl_u.nfs4_fl.owner = lsp;
@@ -740,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 * Byte-range lock aware utility to initialize the stateid of read/write
 * requests.
 */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
 {
        struct nfs4_lock_state *lsp;
        int seq;
@@ -753,7 +780,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
                return;
        spin_lock(&state->state_lock);
-        lsp = __nfs4_find_lock_state(state, fl_owner);
+        lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
        if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
                memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
        spin_unlock(&state->state_lock);
@@ -1041,11 +1068,11 @@ restart:
                        case -NFS4ERR_BAD_STATEID:
                        case -NFS4ERR_RECLAIM_BAD:
                        case -NFS4ERR_RECLAIM_CONFLICT:
-                                nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+                                nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
                                break;
                        case -NFS4ERR_EXPIRED:
                        case -NFS4ERR_NO_GRACE:
-                                nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+                                nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
                        case -NFS4ERR_STALE_CLIENTID:
                        case -NFS4ERR_BADSESSION:
                        case -NFS4ERR_BADSLOT:
@@ -1120,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
        if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
                return;
-        nfs4_reclaim_complete(clp,
+        nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
-                nfs4_reboot_recovery_ops[clp->cl_minorversion]);
        for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
                sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1211,8 +1237,8 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
        struct rpc_cred *cred;
-        struct nfs4_state_maintenance_ops *ops =
+        const struct nfs4_state_maintenance_ops *ops =
-                nfs4_state_renewal_ops[clp->cl_minorversion];
+                clp->cl_mvops->state_renewal_ops;
        int status = -NFS4ERR_EXPIRED;
        /* Is the client already known to have an expired lease? */
@@ -1235,8 +1261,8 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
        struct rpc_cred *cred;
-        struct nfs4_state_recovery_ops *ops =
+        const struct nfs4_state_recovery_ops *ops =
-                nfs4_reboot_recovery_ops[clp->cl_minorversion];
+                clp->cl_mvops->reboot_recovery_ops;
        int status = -ENOENT;
        cred = ops->get_clid_cred(clp);
@@ -1444,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                /* First recover reboot state... */
                if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
                        status = nfs4_do_reclaim(clp,
-                                nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+                                clp->cl_mvops->reboot_recovery_ops);
                        if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
                            test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
                                continue;
@@ -1458,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                /* Now recover expired state... */
                if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
                        status = nfs4_do_reclaim(clp,
-                                nfs4_nograce_recovery_ops[clp->cl_minorversion]);
+                                clp->cl_mvops->nograce_recovery_ops);
                        if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
                            test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
                            test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c8dae4b267..08ef91291132 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int);
 #define encode_link_maxsz       (op_encode_hdr_maxsz + \
                                nfs4_name_maxsz)
 #define decode_link_maxsz       (op_decode_hdr_maxsz + decode_change_info_maxsz)
+#define encode_lockowner_maxsz  (7)
 #define encode_lock_maxsz       (op_encode_hdr_maxsz + \
                                 7 + \
-                                 1 + encode_stateid_maxsz + 8)
+                                 1 + encode_stateid_maxsz + 1 + \
+                                 encode_lockowner_maxsz)
 #define decode_lock_denied_maxsz \
                                (8 + decode_lockowner_maxsz)
 #define decode_lock_maxsz       (op_decode_hdr_maxsz + \
                                 decode_lock_denied_maxsz)
-#define encode_lockt_maxsz      (op_encode_hdr_maxsz + 12)
+#define encode_lockt_maxsz      (op_encode_hdr_maxsz + 5 + \
+                                encode_lockowner_maxsz)
 #define decode_lockt_maxsz      (op_decode_hdr_maxsz + \
                                 decode_lock_denied_maxsz)
 #define encode_locku_maxsz      (op_encode_hdr_maxsz + 3 + \
@@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int);
                                 4)
 #define decode_locku_maxsz      (op_decode_hdr_maxsz + \
                                 decode_stateid_maxsz)
+#define encode_release_lockowner_maxsz \
+                                (op_encode_hdr_maxsz + \
+                                 encode_lockowner_maxsz)
+#define decode_release_lockowner_maxsz \
+                                (op_decode_hdr_maxsz)
 #define encode_access_maxsz     (op_encode_hdr_maxsz + 1)
 #define decode_access_maxsz     (op_decode_hdr_maxsz + 2)
 #define encode_symlink_maxsz    (op_encode_hdr_maxsz + \
@@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int);
                                decode_sequence_maxsz + \
                                decode_putfh_maxsz + \
                                decode_locku_maxsz)
+#define NFS4_enc_release_lockowner_sz \
+                                (compound_encode_hdr_maxsz + \
+                                 encode_lockowner_maxsz)
+#define NFS4_dec_release_lockowner_sz \
+                                (compound_decode_hdr_maxsz + \
+                                 decode_lockowner_maxsz)
 #define NFS4_enc_access_sz      (compound_encode_hdr_maxsz + \
                                encode_sequence_maxsz + \
                                encode_putfh_maxsz + \
@@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
                                struct compound_hdr *hdr)
 {
        __be32 *p;
-        struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
+        struct rpc_auth *auth = req->rq_cred->cr_auth;
        /* initialize running count of expected bytes in reply.
         * NOTE: the replied tag SHOULD be the same is the one sent,
@@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
        return fl->fl_end - fl->fl_start + 1;
 }
+static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
+{
+        __be32 *p;
+        p = reserve_space(xdr, 28);
+        p = xdr_encode_hyper(p, lowner->clientid);
+        *p++ = cpu_to_be32(16);
+        p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+        xdr_encode_hyper(p, lowner->id);
+}
 /*
 * opcode,type,reclaim,offset,length,new_lock_owner = 32
 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
        p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
        *p = cpu_to_be32(args->new_lock_owner);
        if (args->new_lock_owner){
-                p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
+                p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
                *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
                p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
                *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
-                p = xdr_encode_hyper(p, args->lock_owner.clientid);
+                encode_lockowner(xdr, &args->lock_owner);
-                *p++ = cpu_to_be32(16);
-                p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-                xdr_encode_hyper(p, args->lock_owner.id);
        }
        else {
                p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
 {
        __be32 *p;
-        p = reserve_space(xdr, 52);
+        p = reserve_space(xdr, 24);
        *p++ = cpu_to_be32(OP_LOCKT);
        *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
        p = xdr_encode_hyper(p, args->fl->fl_start);
        p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
-        p = xdr_encode_hyper(p, args->lock_owner.clientid);
+        encode_lockowner(xdr, &args->lock_owner);
-        *p++ = cpu_to_be32(16);
-        p = xdr_encode_opaque_fixed(p, "lock id:", 8);
-        xdr_encode_hyper(p, args->lock_owner.id);
        hdr->nops++;
        hdr->replen += decode_lockt_maxsz;
 }
@@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
        hdr->replen += decode_locku_maxsz;
 }
+static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
+{
+        __be32 *p;
+        p = reserve_space(xdr, 4);
+        *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
+        encode_lockowner(xdr, lowner);
+        hdr->nops++;
+        hdr->replen += decode_release_lockowner_maxsz;
+}
 static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
 {
        int len = name->len;
@@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
                break;
        default:
                clp = arg->server->nfs_client;
-                if (clp->cl_minorversion > 0) {
+                if (clp->cl_mvops->minor_version > 0) {
                        if (nfs4_has_persistent_session(clp)) {
                                *p = cpu_to_be32(NFS4_CREATE_GUARDED);
                                encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
        hdr->replen += decode_putrootfh_maxsz;
 }
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
 {
        nfs4_stateid stateid;
        __be32 *p;
        p = reserve_space(xdr, NFS4_STATEID_SIZE);
        if (ctx->state != NULL) {
-                nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
+                nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
                xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
        } else
                xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(OP_READ);
-        encode_stateid(xdr, args->context);
+        encode_stateid(xdr, args->context, args->lock_context);
        p = reserve_space(xdr, 12);
        p = xdr_encode_hyper(p, args->offset);
@@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(OP_WRITE);
-        encode_stateid(xdr, args->context);
+        encode_stateid(xdr, args->context, args->lock_context);
        p = reserve_space(xdr, 16);
        p = xdr_encode_hyper(p, args->offset);
@@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
 {
 #if defined(CONFIG_NFS_V4_1)
        if (args->sa_session)
-                return args->sa_session->clp->cl_minorversion;
+                return args->sa_session->clp->cl_mvops->minor_version;
 #endif /* CONFIG_NFS_V4_1 */
        return 0;
 }
@@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
        return 0;
 }
+static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr = {
+                .minorversion = 0,
+        };
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+        encode_compound_hdr(&xdr, req, &hdr);
+        encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
+        encode_nops(&hdr);
+        return 0;
+}
 /*
 * Encode a READLINK request
 */
@@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .minorversion = args->client->cl_minorversion,
+                .minorversion = args->client->cl_mvops->minor_version,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .minorversion = args->client->cl_minorversion,
+                .minorversion = args->client->cl_mvops->minor_version,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
 {
        struct xdr_stream xdr;
        struct compound_hdr hdr = {
-                .minorversion = session->clp->cl_minorversion,
+                .minorversion = session->clp->cl_mvops->minor_version,
        };
        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
        return status;
 }
+static int decode_release_lockowner(struct xdr_stream *xdr)
+{
+        return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
+}
 static int decode_lookup(struct xdr_stream *xdr)
 {
        return decode_op_hdr(xdr, OP_LOOKUP);
@@ -5259,6 +5308,19 @@ out:
        return status;
 }
+static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
+{
+        struct xdr_stream xdr;
+        struct compound_hdr hdr;
+        int status;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        status = decode_compound_hdr(&xdr, &hdr);
+        if (!status)
+                status = decode_release_lockowner(&xdr);
+        return status;
+}
 /*
 * Decode READLINK response
 */
@@ -5866,6 +5928,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
  PROC(GETACL,          enc_getacl,     dec_getacl),
  PROC(SETACL,          enc_setacl,     dec_setacl),
  PROC(FS_LOCATIONS,    enc_fs_locations, dec_fs_locations),
+  PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
 #if defined(CONFIG_NFS_V4_1)
  PROC(EXCHANGE_ID,     enc_exchange_id,        dec_exchange_id),
  PROC(CREATE_SESSION,  enc_create_session,     dec_create_session),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 6bd19d843af7..df101d9f546a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = "";
 static __be32 servaddr __initdata = 0;
 /* Name of directory to mount */
-static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
 /* NFS-related data */
 static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a3654e57b589..919490232e17 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
        req->wb_pgbase  = offset;
        req->wb_bytes   = count;
        req->wb_context = get_nfs_open_context(ctx);
+        req->wb_lock_context = nfs_get_lock_context(ctx);
        kref_init(&req->wb_kref);
        return req;
 }
@@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req)
 {
        struct page *page = req->wb_page;
        struct nfs_open_context *ctx = req->wb_context;
+        struct nfs_lock_context *l_ctx = req->wb_lock_context;
        if (page != NULL) {
                page_cache_release(page);
                req->wb_page = NULL;
        }
+        if (l_ctx != NULL) {
+                nfs_put_lock_context(l_ctx);
+                req->wb_lock_context = NULL;
+        }
        if (ctx != NULL) {
                put_nfs_open_context(ctx);
                req->wb_context = NULL;
@@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 {
        if (req->wb_context->cred != prev->wb_context->cred)
                return 0;
-        if (req->wb_context->lockowner != prev->wb_context->lockowner)
+        if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
                return 0;
        if (req->wb_context->state != prev->wb_context->state)
                return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6e2b06e6ca79..87adc2744246 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
        data->args.pages  = data->pagevec;
        data->args.count  = count;
        data->args.context = get_nfs_open_context(req->wb_context);
+        data->args.lock_context = req->wb_lock_context;
        data->res.fattr   = &data->fattr;
        data->res.count   = count;
@@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs_read_data *data = calldata;
-        if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client,
+        if (nfs4_setup_sequence(NFS_SERVER(data->inode),
                                &data->args.seq_args, &data->res.seq_res,
                                0, task))
                return;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a56..f1ae39f6cb02 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 {
        struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
+        if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
+                return;
        switch (sap->sa_family) {
        case AF_INET: {
                struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options,
                 * can deal with.
                 */
                args->flags             = data->flags & NFS_MOUNT_FLAGMASK;
+                args->flags             |= NFS_MOUNT_LEGACY_INTERFACE;
                args->rsize             = data->rsize;
                args->wsize             = data->wsize;
                args->timeo             = data->timeo;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index a2242af6a17d..2f84adaad427 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
        struct nfs_unlinkdata *data = calldata;
        struct nfs_server *server = NFS_SERVER(data->dir);
-        if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args,
+        if (nfs4_setup_sequence(server, &data->args.seq_args,
                                &data->res.seq_res, 1, task))
                return;
        rpc_call_start(task);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 91679e2631ee..874972d9427c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page)
                clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
-static struct nfs_page *nfs_find_and_lock_request(struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
 {
        struct inode *inode = page->mapping->host;
        struct nfs_page *req;
@@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
                 *       request as dirty (in which case we don't care).
                 */
                spin_unlock(&inode->i_lock);
-                ret = nfs_wait_on_request(req);
+                if (!nonblock)
+                        ret = nfs_wait_on_request(req);
+                else
+                        ret = -EAGAIN;
                nfs_release_request(req);
                if (ret != 0)
                        return ERR_PTR(ret);
@@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
 * May return an error if the user signalled nfs_wait_on_request().
 */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-                                struct page *page)
+                                struct page *page, bool nonblock)
 {
        struct nfs_page *req;
        int ret = 0;
-        req = nfs_find_and_lock_request(page);
+        req = nfs_find_and_lock_request(page, nonblock);
        if (!req)
                goto out;
        ret = PTR_ERR(req);
@@ -283,12 +286,20 @@ out:
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
        struct inode *inode = page->mapping->host;
+        int ret;
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
        nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
        nfs_pageio_cond_complete(pgio, page->index);
-        return nfs_page_async_flush(pgio, page);
+        ret = nfs_page_async_flush(pgio, page,
+                        wbc->sync_mode == WB_SYNC_NONE ||
+                        wbc->nonblocking != 0);
+        if (ret == -EAGAIN) {
+                redirty_page_for_writepage(wbc, page);
+                ret = 0;
+        }
+        return ret;
 }
 /*
@@ -689,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
                req = nfs_page_find_request(page);
                if (req == NULL)
                        return 0;
-                do_flush = req->wb_page != page || req->wb_context != ctx;
+                do_flush = req->wb_page != page || req->wb_context != ctx ||
+                        req->wb_lock_context->lockowner != current->files ||
+                        req->wb_lock_context->pid != current->tgid;
                nfs_release_request(req);
                if (!do_flush)
                        return 0;
@@ -813,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
        data->args.pages  = data->pagevec;
        data->args.count  = count;
        data->args.context = get_nfs_open_context(req->wb_context);
+        data->args.lock_context = req->wb_lock_context;
        data->args.stable  = NFS_UNSTABLE;
        if (how & FLUSH_STABLE) {
                data->args.stable = NFS_DATA_SYNC;
@@ -1036,9 +1050,9 @@ out:
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
        struct nfs_write_data *data = calldata;
-        struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
-        if (nfs4_setup_sequence(clp, &data->args.seq_args,
+        if (nfs4_setup_sequence(NFS_SERVER(data->inode),
+                                &data->args.seq_args,
                                &data->res.seq_res, 1, task))
                return;
        rpc_call_start(task);
@@ -1379,7 +1393,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
        .rpc_release = nfs_commit_release,
 };
-static int nfs_commit_inode(struct inode *inode, int how)
+int nfs_commit_inode(struct inode *inode, int how)
 {
        LIST_HEAD(head);
        int may_wait = how & FLUSH_SYNC;
@@ -1443,11 +1457,6 @@ out_mark_dirty:
        return ret;
 }
 #else
-static int nfs_commit_inode(struct inode *inode, int how)
-{
-        return 0;
-}
 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
 {
        return 0;
@@ -1546,7 +1555,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
        nfs_fscache_release_page(page, GFP_KERNEL);
-        req = nfs_find_and_lock_request(page);
+        req = nfs_find_and_lock_request(page, false);
        ret = PTR_ERR(req);
        if (IS_ERR(req))
                goto out;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b9..5b7e3021e06b 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
        svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
        fh_copy(&resp->fh, &argp->fh);
-        nfserr = nfsd_read(rqstp, &resp->fh, NULL,
+        nfserr = nfsd_read(rqstp, &resp->fh,
                                  argp->offset,
                                  rqstp->rq_vec, argp->vlen,
                                  &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
        fh_init(&resp->fh, NFS3_FHSIZE);
        nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
                                    &argp->attrs, S_IFDIR, 0, &resp->fh);
+        fh_unlock(&resp->dirfh);
        RETURN_STATUS(nfserr);
 }
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
        type = nfs3_ftypes[argp->ftype];
        nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
                                    &argp->attrs, type, rdev, &resp->fh);
+        fh_unlock(&resp->dirfh);
        RETURN_STATUS(nfserr);
 }
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
        /* Unlink. -S_IFDIR means file must not be a directory */
        fh_copy(&resp->fh, &argp->fh);
        nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
+        fh_unlock(&resp->fh);
        RETURN_STATUS(nfserr);
 }
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
        fh_copy(&resp->fh, &argp->fh);
        nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
+        fh_unlock(&resp->fh);
        RETURN_STATUS(nfserr);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index eb78e7e22077..988cbb3a19b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -143,8 +143,6 @@ struct nfs4_cb_compound_hdr {
        u32             minorversion;
        /* res */
        int             status;
-        u32             taglen;
-        char            *tag;
 };
 static struct {
@@ -205,6 +203,16 @@ nfs_cb_stat_to_errno(int stat)
 */
 static void
+encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+{
+        __be32 *p;
+        RESERVE_SPACE(sizeof(stateid_t));
+        WRITE32(sid->si_generation);
+        WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+}
+static void
 encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 {
        __be32 * p;
@@ -229,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
        __be32 *p;
        int len = dp->dl_fh.fh_size;
-        RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len);
+        RESERVE_SPACE(4);
        WRITE32(OP_CB_RECALL);
-        WRITE32(dp->dl_stateid.si_generation);
+        encode_stateid(xdr, &dp->dl_stateid);
-        WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t));
+        RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
        WRITE32(0); /* truncate optimization not implemented */
        WRITE32(len);
        WRITEMEM(&dp->dl_fh.fh_base, len);
@@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
 static int
 decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
        __be32 *p;
+        u32 taglen;
        READ_BUF(8);
        READ32(hdr->status);
-        READ32(hdr->taglen);
+        /* We've got no use for the tag; ignore it: */
-        READ_BUF(hdr->taglen + 4);
+        READ32(taglen);
-        hdr->tag = (char *)p;
+        READ_BUF(taglen + 4);
-        p += XDR_QUADLEN(hdr->taglen);
+        p += XDR_QUADLEN(taglen);
        READ32(hdr->nops);
        return 0;
 }
@@ -667,28 +676,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
        }
        switch (task->tk_status) {
-        case -EIO:
+        case 0:
+                return;
+        case -EBADHANDLE:
+        case -NFS4ERR_BAD_STATEID:
+                /* Race: client probably got cb_recall
+                 * before open reply granting delegation */
+                break;
+        default:
                /* Network partition? */
                atomic_set(&clp->cl_cb_set, 0);
                warn_no_callback_path(clp, task->tk_status);
                if (current_rpc_client != task->tk_client) {
                        /* queue a callback on the new connection: */
+                        atomic_inc(&dp->dl_count);
                        nfsd4_cb_recall(dp);
                        return;
                }
-        case -EBADHANDLE:
-        case -NFS4ERR_BAD_STATEID:
-                /* Race: client probably got cb_recall
-                 * before open reply granting delegation */
-                break;
-        default:
-                /* success, or error we can't handle */
-                return;
        }
        if (dp->dl_retries--) {
                rpc_delay(task, 2*HZ);
                task->tk_status = 0;
-                rpc_restart_call(task);
+                rpc_restart_call_prepare(task);
                return;
        } else {
                atomic_set(&clp->cl_cb_set, 0);
@@ -752,18 +761,16 @@ static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
                .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
                .rpc_cred = callback_cred
        };
-        int status;
-        if (clnt == NULL)
+        if (clnt == NULL) {
+                nfs4_put_delegation(dp);
                return; /* Client is shutting down; give up. */
+        }
        args->args_op = dp;
        msg.rpc_argp = args;
        dp->dl_retries = 1;
-        status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
+        rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
-                                &nfsd4_cb_recall_ops, dp);
-        if (status)
-                nfs4_put_delegation(dp);
 }
 void nfsd4_do_callback_rpc(struct work_struct *w)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4a2734758778..2e7357104cfd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -51,7 +51,6 @@ static time_t boot_time;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
-static u32 nfs4_init;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
 static u64 current_sessionid = 1;
@@ -163,6 +162,46 @@ static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
 static struct list_head file_hashtbl[FILE_HASH_SIZE];
 static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
+static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+{
+        BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+        atomic_inc(&fp->fi_access[oflag]);
+}
+static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
+{
+        if (oflag == O_RDWR) {
+                __nfs4_file_get_access(fp, O_RDONLY);
+                __nfs4_file_get_access(fp, O_WRONLY);
+        } else
+                __nfs4_file_get_access(fp, oflag);
+}
+static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
+{
+        if (fp->fi_fds[oflag]) {
+                fput(fp->fi_fds[oflag]);
+                fp->fi_fds[oflag] = NULL;
+        }
+}
+static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+        if (atomic_dec_and_test(&fp->fi_access[oflag])) {
+                nfs4_file_put_fd(fp, O_RDWR);
+                nfs4_file_put_fd(fp, oflag);
+        }
+}
+static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
+{
+        if (oflag == O_RDWR) {
+                __nfs4_file_put_access(fp, O_RDONLY);
+                __nfs4_file_put_access(fp, O_WRONLY);
+        } else
+                __nfs4_file_put_access(fp, oflag);
+}
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
@@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
        dprintk("NFSD alloc_init_deleg\n");
+        /*
+         * Major work on the lease subsystem (for example, to support
+         * callbacks on stat) will be required before we can support
+         * write delegations properly.
+         */
+        if (type != NFS4_OPEN_DELEGATE_READ)
+                return NULL;
        if (fp->fi_had_conflict)
                return NULL;
        if (num_delegations > max_delegations)
@@ -185,9 +231,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
        dp->dl_client = clp;
        get_nfs4_file(fp);
        dp->dl_file = fp;
+        nfs4_file_get_access(fp, O_RDONLY);
        dp->dl_flock = NULL;
-        get_file(stp->st_vfs_file);
-        dp->dl_vfs_file = stp->st_vfs_file;
        dp->dl_type = type;
        dp->dl_ident = cb->cb_ident;
        dp->dl_stateid.si_boot = boot_time;
@@ -222,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 static void
 nfs4_close_delegation(struct nfs4_delegation *dp)
 {
-        struct file *filp = dp->dl_vfs_file;
+        struct file *filp = find_readable_file(dp->dl_file);
        dprintk("NFSD: close_delegation dp %p\n",dp);
-        dp->dl_vfs_file = NULL;
-        /* The following nfsd_close may not actually close the file,
-         * but we want to remove the lease in any case. */
        if (dp->dl_flock)
                vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
-        nfsd_close(filp);
+        nfs4_file_put_access(dp->dl_file, O_RDONLY);
 }
 /* Called under the state lock. */
@@ -302,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
 static void release_lock_stateid(struct nfs4_stateid *stp)
 {
+        struct file *file;
        unhash_generic_stateid(stp);
-        locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+        file = find_any_file(stp->st_file);
+        if (file)
+                locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
        free_generic_stateid(stp);
 }
@@ -341,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
        }
 }
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempts to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *      OPEN allow read, deny write
+ *      OPEN allow both, deny none
+ *      DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
+static void
+set_access(unsigned int *access, unsigned long bmap) {
+        int i;
+        *access = 0;
+        for (i = 1; i < 4; i++) {
+                if (test_bit(i, &bmap))
+                        *access |= i;
+        }
+}
+static void
+set_deny(unsigned int *deny, unsigned long bmap) {
+        int i;
+        *deny = 0;
+        for (i = 0; i < 4; i++) {
+                if (test_bit(i, &bmap))
+                        *deny |= i ;
+        }
+}
+static int
+test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
+        unsigned int access, deny;
+        set_access(&access, stp->st_access_bmap);
+        set_deny(&deny, stp->st_deny_bmap);
+        if ((access & open->op_share_deny) || (deny & open->op_share_access))
+                return 0;
+        return 1;
+}
+static int nfs4_access_to_omode(u32 access)
+{
+        switch (access) {
+        case NFS4_SHARE_ACCESS_READ:
+                return O_RDONLY;
+        case NFS4_SHARE_ACCESS_WRITE:
+                return O_WRONLY;
+        case NFS4_SHARE_ACCESS_BOTH:
+                return O_RDWR;
+        }
+        BUG();
+}
+static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
+{
+        unsigned int access;
+        set_access(&access, stp->st_access_bmap);
+        return nfs4_access_to_omode(access);
+}
 static void release_open_stateid(struct nfs4_stateid *stp)
 {
+        int oflag = nfs4_access_bmap_to_omode(stp);
        unhash_generic_stateid(stp);
        release_stateid_lockowners(stp);
-        nfsd_close(stp->st_vfs_file);
+        nfs4_file_put_access(stp->st_file, oflag);
        free_generic_stateid(stp);
 }
@@ -457,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
        spin_unlock(&nfsd_drc_lock);
        if (fchan->maxreqs == 0)
-                return nfserr_serverfault;
+                return nfserr_jukebox;
        fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
        return 0;
@@ -542,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
        BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
                     + sizeof(struct nfsd4_session) > PAGE_SIZE);
-        status = nfserr_serverfault;
+        status = nfserr_jukebox;
        /* allocate struct nfsd4_session and slot table pointers in one piece */
        slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
        new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
@@ -591,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
        dump_sessionid(__func__, sessionid);
        idx = hash_sessionid(sessionid);
-        dprintk("%s: idx is %d\n", __func__, idx);
        /* Search in the appropriate list */
        list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
-                dump_sessionid("list traversal", &elem->se_sessionid);
                if (!memcmp(elem->se_sessionid.data, sessionid->data,
                            NFS4_MAX_SESSIONID_LEN)) {
                        return elem;
@@ -714,7 +832,6 @@ release_session_client(struct nfsd4_session *session)
        } else
                renew_client_locked(clp);
        spin_unlock(&client_lock);
-        nfsd4_put_session(session);
 }
 /* must be called under the client_lock */
@@ -1220,7 +1337,7 @@ out_new:
        /* Normal case */
        new = create_client(exid->clname, dname, rqstp, &verf);
        if (new == NULL) {
-                status = nfserr_serverfault;
+                status = nfserr_jukebox;
                goto out;
        }
@@ -1760,6 +1877,8 @@ alloc_init_file(struct inode *ino)
                fp->fi_inode = igrab(ino);
                fp->fi_id = current_fileid++;
                fp->fi_had_conflict = false;
+                memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+                memset(fp->fi_access, 0, sizeof(fp->fi_access));
                spin_lock(&recall_lock);
                list_add(&fp->fi_hash, &file_hashtbl[hashval]);
                spin_unlock(&recall_lock);
@@ -1971,57 +2090,6 @@ static inline int deny_valid(u32 x)
 }
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *      OPEN allow read, deny write
- *      OPEN allow both, deny none
- *      DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
-        int i;
-        *access = 0;
-        for (i = 1; i < 4; i++) {
-                if (test_bit(i, &bmap))
-                        *access |= i;
-        }
-}
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-        int i;
-        *deny = 0;
-        for (i = 0; i < 4; i++) {
-                if (test_bit(i, &bmap))
-                        *deny |= i ;
-        }
-}
-static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
-        unsigned int access, deny;
-        set_access(&access, stp->st_access_bmap);
-        set_deny(&deny, stp->st_deny_bmap);
-        if ((access & open->op_share_deny) || (deny & open->op_share_access))
-                return 0;
-        return 1;
-}
-/*
 * Called to check deny when READ with all zero stateid or
 * WRITE with all zero or all one stateid
 */
@@ -2052,14 +2120,12 @@ out:
 }
 static inline void
-nfs4_file_downgrade(struct file *filp, unsigned int share_access)
+nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 {
-        if (share_access & NFS4_SHARE_ACCESS_WRITE) {
+        if (share_access & NFS4_SHARE_ACCESS_WRITE)
-                drop_file_write_access(filp);
+                nfs4_file_put_access(fp, O_WRONLY);
-                spin_lock(&filp->f_lock);
+        if (share_access & NFS4_SHARE_ACCESS_READ)
-                filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
+                nfs4_file_put_access(fp, O_RDONLY);
-                spin_unlock(&filp->f_lock);
-        }
 }
 /*
@@ -2255,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
        return NULL;
 }
+int share_access_to_flags(u32 share_access)
+{
+        share_access &= ~NFS4_SHARE_WANT_MASK;
+        return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
 static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
                struct nfs4_delegation **dp)
@@ -2265,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
        *dp = find_delegation_file(fp, &open->op_delegate_stateid);
        if (*dp == NULL)
                goto out;
-        flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ?
+        flags = share_access_to_flags(open->op_share_access);
-                                                RD_STATE : WR_STATE;
        status = nfs4_check_delegmode(*dp, flags);
        if (status)
                *dp = NULL;
@@ -2308,30 +2380,53 @@ nfs4_alloc_stateid(void)
        return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+        int flags = 0;
+        if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+                flags |= NFSD_MAY_READ;
+        if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+                flags |= NFSD_MAY_WRITE;
+        return flags;
+}
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
+*fp, struct svc_fh *cur_fh, u32 nfs4_access)
+{
+        __be32 status;
+        int oflag = nfs4_access_to_omode(nfs4_access);
+        int access = nfs4_access_to_access(nfs4_access);
+        if (!fp->fi_fds[oflag]) {
+                status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
+                        &fp->fi_fds[oflag]);
+                if (status == nfserr_dropit)
+                        status = nfserr_jukebox;
+                if (status)
+                        return status;
+        }
+        nfs4_file_get_access(fp, oflag);
+        return nfs_ok;
+}
 static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-                struct nfs4_delegation *dp,
+                struct nfs4_file *fp, struct svc_fh *cur_fh,
-                struct svc_fh *cur_fh, int flags)
+                struct nfsd4_open *open)
 {
        struct nfs4_stateid *stp;
+        __be32 status;
        stp = nfs4_alloc_stateid();
        if (stp == NULL)
                return nfserr_resource;
-        if (dp) {
+        status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
-                get_file(dp->dl_vfs_file);
+        if (status) {
-                stp->st_vfs_file = dp->dl_vfs_file;
+                kmem_cache_free(stateid_slab, stp);
-        } else {
+                return status;
-                __be32 status;
-                status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
-                                &stp->st_vfs_file);
-                if (status) {
-                        if (status == nfserr_dropit)
-                                status = nfserr_jukebox;
-                        kmem_cache_free(stateid_slab, stp);
-                        return status;
-                }
        }
        *stpp = stp;
        return 0;
@@ -2353,35 +2448,30 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-        struct file *filp = stp->st_vfs_file;
+        u32 op_share_access, new_access;
-        struct inode *inode = filp->f_path.dentry->d_inode;
-        unsigned int share_access, new_writer;
        __be32 status;
-        set_access(&share_access, stp->st_access_bmap);
+        set_access(&new_access, stp->st_access_bmap);
-        new_writer = (~share_access) & open->op_share_access
+        new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
-                        & NFS4_SHARE_ACCESS_WRITE;
+        if (new_access) {
-        if (new_writer) {
+                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
-                int err = get_write_access(inode);
+                if (status)
-                if (err)
+                        return status;
-                        return nfserrno(err);
-                err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
-                if (err)
-                        return nfserrno(err);
-                file_take_write(filp);
        }
        status = nfsd4_truncate(rqstp, cur_fh, open);
        if (status) {
-                if (new_writer)
+                if (new_access) {
-                        put_write_access(inode);
+                        int oflag = nfs4_access_to_omode(new_access);
+                        nfs4_file_put_access(fp, oflag);
+                }
                return status;
        }
        /* remember the open */
-        filp->f_mode |= open->op_share_access;
+        op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
-        __set_bit(open->op_share_access, &stp->st_access_bmap);
+        __set_bit(op_share_access, &stp->st_access_bmap);
        __set_bit(open->op_share_deny, &stp->st_deny_bmap);
        return nfs_ok;
@@ -2444,13 +2534,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
        fl.fl_end = OFFSET_MAX;
        fl.fl_owner =  (fl_owner_t)dp;
-        fl.fl_file = stp->st_vfs_file;
+        fl.fl_file = find_readable_file(stp->st_file);
+        BUG_ON(!fl.fl_file);
        fl.fl_pid = current->tgid;
        /* vfs_setlease checks to see if delegation should be handed out.
         * the lock_manager callbacks fl_mylease and fl_change are used
         */
-        if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) {
+        if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
                dprintk("NFSD: setlease failed [%d], no delegation\n", status);
                unhash_delegation(dp);
                flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2514,18 +2605,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
         */
        if (stp) {
                /* Stateid was found, this is an OPEN upgrade */
-                status = nfs4_upgrade_open(rqstp, current_fh, stp, open);
+                status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
                if (status)
                        goto out;
                update_stateid(&stp->st_stateid);
        } else {
-                /* Stateid was not found, this is a new OPEN */
+                status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
-                int flags = 0;
-                if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-                        flags |= NFSD_MAY_READ;
-                if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-                        flags |= NFSD_MAY_WRITE;
-                status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
                if (status)
                        goto out;
                init_stateid(stp, fp, open);
@@ -2727,7 +2812,7 @@ search_close_lru(u32 st_id, int flags)
 static inline int
 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 {
-        return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode;
+        return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
 }
 static int
@@ -2760,6 +2845,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
        __be32 status = nfserr_openmode;
+        /* For lock stateids, we test the parent open, not the lock: */
+        if (stp->st_openstp)
+                stp = stp->st_openstp;
        if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
                goto out;
        if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2872,7 +2960,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                        goto out;
                renew_client(dp->dl_client);
                if (filpp)
-                        *filpp = dp->dl_vfs_file;
+                        *filpp = find_readable_file(dp->dl_file);
+                BUG_ON(!*filpp);
        } else { /* open or lock stateid */
                stp = find_stateid(stateid, flags);
                if (!stp)
@@ -2889,8 +2978,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
                if (status)
                        goto out;
                renew_client(stp->st_stateowner->so_client);
-                if (filpp)
+                if (filpp) {
-                        *filpp = stp->st_vfs_file;
+                        if (flags & RD_STATE)
+                                *filpp = find_readable_file(stp->st_file);
+                        else
+                                *filpp = find_writeable_file(stp->st_file);
+                        BUG_ON(!*filpp); /* assured by check_openmode */
+                }
        }
        status = nfs_ok;
 out:
@@ -3126,8 +3220,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
                goto out;
        }
        set_access(&share_access, stp->st_access_bmap);
-        nfs4_file_downgrade(stp->st_vfs_file,
+        nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
-                            share_access & ~od->od_share_access);
        reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
        reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3346,11 +3439,9 @@ static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
        struct nfs4_stateowner *sop;
-        unsigned int hval;
        if (fl->fl_lmops == &nfsd_posix_mng_ops) {
                sop = (struct nfs4_stateowner *) fl->fl_owner;
-                hval = lockownerid_hashval(sop->so_id);
                kref_get(&sop->so_ref);
                deny->ld_sop = sop;
                deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3446,8 +3537,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
        stp->st_stateid.si_stateownerid = sop->so_id;
        stp->st_stateid.si_fileid = fp->fi_id;
        stp->st_stateid.si_generation = 0;
-        stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
-        stp->st_access_bmap = open_stp->st_access_bmap;
        stp->st_deny_bmap = open_stp->st_deny_bmap;
        stp->st_openstp = open_stp;
@@ -3547,7 +3636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                lock_sop = lock->lk_replay_owner;
        }
        /* lock->lk_replay_owner and lock_stp have been created or found */
-        filp = lock_stp->st_vfs_file;
        status = nfserr_grace;
        if (locks_in_grace() && !lock->lk_reclaim)
@@ -3560,11 +3648,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        switch (lock->lk_type) {
                case NFS4_READ_LT:
                case NFS4_READW_LT:
+                        filp = find_readable_file(lock_stp->st_file);
                        file_lock.fl_type = F_RDLCK;
                        cmd = F_SETLK;
                break;
                case NFS4_WRITE_LT:
                case NFS4_WRITEW_LT:
+                        filp = find_writeable_file(lock_stp->st_file);
                        file_lock.fl_type = F_WRLCK;
                        cmd = F_SETLK;
                break;
@@ -3572,6 +3662,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        status = nfserr_inval;
                goto out;
        }
+        if (!filp) {
+                status = nfserr_openmode;
+                goto out;
+        }
        file_lock.fl_owner = (fl_owner_t)lock_sop;
        file_lock.fl_pid = current->tgid;
        file_lock.fl_file = filp;
@@ -3740,7 +3834,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                                        &locku->lu_stateowner, &stp, NULL)))
                goto out;
-        filp = stp->st_vfs_file;
+        filp = find_any_file(stp->st_file);
+        if (!filp) {
+                status = nfserr_lock_range;
+                goto out;
+        }
        BUG_ON(!filp);
        locks_init_lock(&file_lock);
        file_lock.fl_type = F_UNLCK;
@@ -3787,10 +3885,10 @@ out_nfserr:
 *      0: no locks held by lockowner
 */
 static int
-check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 {
        struct file_lock **flpp;
-        struct inode *inode = filp->f_path.dentry->d_inode;
+        struct inode *inode = filp->fi_inode;
        int status = 0;
        lock_kernel();
@@ -3841,7 +3939,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
                                continue;
                        list_for_each_entry(stp, &sop->so_stateids,
                                        st_perstateowner) {
-                                if (check_for_locks(stp->st_vfs_file, sop))
+                                if (check_for_locks(stp->st_file, sop))
                                        goto out;
                                /* Note: so_perclient unused for lockowners,
                                 * so it's OK to fool with here. */
@@ -4066,16 +4164,8 @@ out_free_laundry:
 int
 nfs4_state_start(void)
 {
-        int ret;
-        if (nfs4_init)
-                return 0;
        nfsd4_load_reboot_recovery_data();
-        ret = __nfs4_state_start();
+        return __nfs4_state_start();
-        if (ret)
-                return ret;
-        nfs4_init = 1;
-        return 0;
 }
 static void
@@ -4110,7 +4200,6 @@ __nfs4_state_shutdown(void)
        }
        nfsd4_shutdown_recdir();
-        nfs4_init = 0;
 }
 void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ac17a7080239..f8931acb05f3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2630,7 +2630,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
        }
        read->rd_vlen = v;
-        nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+        nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
                        read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
                        &maxcount);
@@ -3325,6 +3325,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
                }
                /* Renew the clientid on success and on replay */
                release_session_client(cs->session);
+                nfsd4_put_session(cs->session);
        }
        return 1;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 508941c23af7..b53b1d042f1f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -949,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf)
        if (err != 0)
                return err;
-        err = lockd_up();
-        if (err != 0)
-                goto out;
        err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-        if (err < 0)
+        if (err < 0) {
-                lockd_down();
+                svc_destroy(nfsd_serv);
+                return err;
+        }
-out:
        /* Decrease the count, but don't shut down the service */
        nfsd_serv->sv_nrthreads--;
        return err;
@@ -978,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf)
        if (nfsd_serv != NULL)
                len = svc_sock_names(nfsd_serv, buf,
                                        SIMPLE_TRANSACTION_LIMIT, toclose);
-        if (len >= 0)
-                lockd_down();
        kfree(toclose);
        return len;
 }
@@ -1014,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf)
                                PF_INET6, port, SVC_SOCK_ANONYMOUS);
        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_close;
+        /* Decrease the count, but don't shut down the service */
+        nfsd_serv->sv_nrthreads--;
        return 0;
 out_close:
        xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1022,8 +1019,7 @@ out_close:
                svc_xprt_put(xprt);
        }
 out_err:
-        /* Decrease the count, but don't shut down the service */
+        svc_destroy(nfsd_serv);
-        nfsd_serv->sv_nrthreads--;
        return err;
 }
@@ -1194,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
                        bsize = NFSSVC_MAXBLKSIZE;
                bsize &= ~(1024-1);
                mutex_lock(&nfsd_mutex);
-                if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+                if (nfsd_serv) {
                        mutex_unlock(&nfsd_mutex);
                        return -EBUSY;
                }
@@ -1310,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
                        return -EINVAL;
                status = nfs4_reset_recoverydir(recdir);
+                if (status)
+                        return status;
        }
        return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 72377761270e..b76ac3a82e39 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -153,6 +153,7 @@ void		nfsd_lockd_shutdown(void);
 #define nfserr_bad_seqid        cpu_to_be32(NFSERR_BAD_SEQID)
 #define nfserr_symlink          cpu_to_be32(NFSERR_SYMLINK)
 #define nfserr_not_same         cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_lock_range       cpu_to_be32(NFSERR_LOCK_RANGE)
 #define nfserr_restorefh        cpu_to_be32(NFSERR_RESTOREFH)
 #define nfserr_attrnotsupp      cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define nfserr_bad_xdr          cpu_to_be32(NFSERR_BAD_XDR)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111ef..08e17264784b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
        svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
        resp->count = argp->count;
-        nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+        nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
                                  argp->offset,
                                  rqstp->rq_vec, argp->vlen,
                                  &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
         * gospel of sun micro
         */
        if (type != S_IFREG) {
-                int     is_borc = 0;
                if (type != S_IFBLK && type != S_IFCHR) {
                        rdev = 0;
                } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
                        type = S_IFIFO;
                } else {
                        /* Okay, char or block special */
-                        is_borc = 1;
                        if (!rdev)
                                rdev = wanted;
                }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 06b2a26edfe0..e2c43464f237 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -180,15 +180,80 @@ int nfsd_nrthreads(void)
        return rv;
 }
+/*
+ * Create the default "udp" and "tcp" PF_INET transports for nfsd_serv on
+ * @port.  Does nothing (returns 0) if nfsd_serv->sv_permsocks is already
+ * non-empty.  Returns 0 on success, or the negative value returned by the
+ * failing svc_create_xprt() call.
+ */
+static int nfsd_init_socks(int port)
+{
+        int error;
+        if (!list_empty(&nfsd_serv->sv_permsocks))
+                return 0;
+        error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+                                        SVC_SOCK_DEFAULTS);
+        if (error < 0)
+                return error;
+        error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+                                        SVC_SOCK_DEFAULTS);
+        if (error < 0)
+                return error;
+        return 0;
+}
+static bool nfsd_up = false;
+/*
+ * Bring up the pieces the server needs, guarded by nfsd_up: readahead
+ * cache, default sockets, lockd, then nfs4_state_start().  Returns 0
+ * immediately if nfsd_up is already set, 0 on success (setting nfsd_up),
+ * or the error of the first step that failed.  On failure lockd_down() is
+ * called if lockd had been brought up, and nfsd_racache_shutdown() is
+ * called; sockets created by nfsd_init_socks() are not torn down here.
+ */
+static int nfsd_startup(unsigned short port, int nrservs)
+{
+        int ret;
+        if (nfsd_up)
+                return 0;
+        /*
+         * Readahead param cache - will no-op if it already exists.
+         * (Note therefore results will be suboptimal if number of
+         * threads is modified after nfsd start.)
+         */
+        ret = nfsd_racache_init(2*nrservs);
+        if (ret)
+                return ret;
+        ret = nfsd_init_socks(port);
+        if (ret)
+                goto out_racache;
+        ret = lockd_up();
+        if (ret)
+                goto out_racache;
+        ret = nfs4_state_start();
+        if (ret)
+                goto out_lockd;
+        nfsd_up = true;
+        return 0;
+        /* Error unwind: labels fall through in reverse order of setup. */
+out_lockd:
+        lockd_down();
+out_racache:
+        nfsd_racache_shutdown();
+        return ret;
+}
+/*
+ * Counterpart of nfsd_startup(): calls nfs4_state_shutdown(), lockd_down()
+ * and nfsd_racache_shutdown() (the reverse of the startup order) and then
+ * clears nfsd_up.  Does nothing when nfsd_up is not set.
+ */
+static void nfsd_shutdown(void)
+{
+        /*
+         * write_ports can create the server without actually starting
+         * any threads--if we get shut down before any threads are
+         * started, then nfsd_last_thread will be run before any of this
+         * other initialization has been done.
+         */
+        if (!nfsd_up)
+                return;
+        nfs4_state_shutdown();
+        lockd_down();
+        nfsd_racache_shutdown();
+        nfsd_up = false;
+}
 static void nfsd_last_thread(struct svc_serv *serv)
 {
        /* When last nfsd thread exits we need to do some clean-up */
-        struct svc_xprt *xprt;
-        list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-                lockd_down();
        nfsd_serv = NULL;
-        nfsd_racache_shutdown();
+        nfsd_shutdown();
-        nfs4_state_shutdown();
        printk(KERN_WARNING "nfsd: last server has exited, flushing export "
                            "cache\n");
@@ -263,45 +328,18 @@ int nfsd_create_serv(void)
                       nfsd_max_blksize >= 8*1024*2)
                        nfsd_max_blksize /= 2;
        }
+        nfsd_reset_versions();
        nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
                                      nfsd_last_thread, nfsd, THIS_MODULE);
        if (nfsd_serv == NULL)
-                err = -ENOMEM;
+                return -ENOMEM;
-        else
-                set_max_drc();
+        set_max_drc();
        do_gettimeofday(&nfssvc_boot);          /* record boot time */
        return err;
 }
-static int nfsd_init_socks(int port)
-{
-        int error;
-        if (!list_empty(&nfsd_serv->sv_permsocks))
-                return 0;
-        error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
-                                        SVC_SOCK_DEFAULTS);
-        if (error < 0)
-                return error;
-        error = lockd_up();
-        if (error < 0)
-                return error;
-        error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
-                                        SVC_SOCK_DEFAULTS);
-        if (error < 0)
-                return error;
-        error = lockd_up();
-        if (error < 0)
-                return error;
-        return 0;
-}
 int nfsd_nrpools(void)
 {
        if (nfsd_serv == NULL)
@@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
        return err;
 }
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
        int     error;
+        bool    nfsd_up_before;
        mutex_lock(&nfsd_mutex);
        dprintk("nfsd: creating service\n");
@@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs)
        if (nrservs == 0 && nfsd_serv == NULL)
                goto out;
-        /* Readahead param cache - will no-op if it already exists */
+        error = nfsd_create_serv();
-        error = nfsd_racache_init(2*nrservs);
-        if (error<0)
-                goto out;
-        error = nfs4_state_start();
        if (error)
                goto out;
-        nfsd_reset_versions();
+        nfsd_up_before = nfsd_up;
-        error = nfsd_create_serv();
+        error = nfsd_startup(port, nrservs);
        if (error)
-                goto out;
+                goto out_destroy;
-        error = nfsd_init_socks(port);
-        if (error)
-                goto failure;
        error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
-        if (error == 0)
+        if (error)
-                /* We are holding a reference to nfsd_serv which
+                goto out_shutdown;
-                 * we don't want to count in the return value,
+        /* We are holding a reference to nfsd_serv which
-                 * so subtract 1
+         * we don't want to count in the return value,
-                 */
+         * so subtract 1
-                error = nfsd_serv->sv_nrthreads - 1;
+         */
- failure:
+        error = nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+        if (error < 0 && !nfsd_up_before)
+                nfsd_shutdown();
+out_destroy:
        svc_destroy(nfsd_serv);         /* Release server */
- out:
+out:
        mutex_unlock(&nfsd_mutex);
        return error;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 006c84230c7c..7731a75971dd 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -88,7 +88,6 @@ struct nfs4_delegation {
        struct nfs4_client      *dl_client;
        struct nfs4_file        *dl_file;
        struct file_lock        *dl_flock;
-        struct file             *dl_vfs_file;
        u32                     dl_type;
        time_t                  dl_time;
 /* For recall: */
@@ -342,12 +341,50 @@ struct nfs4_file {
        struct list_head        fi_hash;    /* hash by "struct inode *" */
        struct list_head        fi_stateids;
        struct list_head        fi_delegations;
+        /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+        struct file *           fi_fds[3];
+        /* One each for O_RDONLY, O_WRONLY: */
+        atomic_t                fi_access[2];
+        /*
+         * Each open stateid contributes 1 to either fi_readers or
+         * fi_writers, or both, depending on the open mode.  A
+         * delegation also takes an fi_readers reference.  Lock
+         * stateid's take none.
+         */
+        atomic_t                fi_readers;
+        atomic_t                fi_writers;
        struct inode            *fi_inode;
        u32                     fi_id;      /* used with stateowner->so_id 
                                             * for stateid_hashtbl hash */
        bool                    fi_had_conflict;
 };
+/* XXX: for first cut may fall back on returning file that doesn't work
+ * at all? */
+/*
+ * Pick a struct file from f->fi_fds[] that was opened for writing: the
+ * O_RDWR slot if it is set, otherwise whatever is in the O_WRONLY slot.
+ * The result is NULL when neither slot is set.
+ */
+static inline struct file *find_writeable_file(struct nfs4_file *f)
+{
+        if (f->fi_fds[O_RDWR])
+                return f->fi_fds[O_RDWR];
+        return f->fi_fds[O_WRONLY];
+}
+/*
+ * Pick a struct file from f->fi_fds[] that was opened for reading: the
+ * O_RDWR slot if it is set, otherwise whatever is in the O_RDONLY slot.
+ * The result is NULL when neither slot is set.
+ */
+static inline struct file *find_readable_file(struct nfs4_file *f)
+{
+        if (f->fi_fds[O_RDWR])
+                return f->fi_fds[O_RDWR];
+        return f->fi_fds[O_RDONLY];
+}
+/*
+ * Return any struct file stored in f->fi_fds[], preferring the O_RDWR
+ * slot, then O_WRONLY, then O_RDONLY.  The result is NULL when no slot
+ * is set.
+ *
+ * Each branch tests the same slot it returns; testing O_RDWR twice would
+ * make the O_WRONLY branch unreachable and hide write-only opens.
+ */
+static inline struct file *find_any_file(struct nfs4_file *f)
+{
+        if (f->fi_fds[O_RDWR])
+                return f->fi_fds[O_RDWR];
+        else if (f->fi_fds[O_WRONLY])
+                return f->fi_fds[O_WRONLY];
+        else
+                return f->fi_fds[O_RDONLY];
+}
 /*
 * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
 *
@@ -373,7 +410,6 @@ struct nfs4_stateid {
        struct nfs4_stateowner      * st_stateowner;
        struct nfs4_file            * st_file;
        stateid_t                     st_stateid;
-        struct file                 * st_vfs_file;
        unsigned long                 st_access_bmap;
        unsigned long                 st_deny_bmap;
        struct nfs4_stateid         * st_openstp;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b619..9df85a13af28 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -604,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
        return error;
 }
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 #ifdef CONFIG_NFSD_V3
 /*
@@ -903,7 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
              loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
        struct inode *inode;
-        struct raparms  *ra;
        mm_segment_t    oldfs;
        __be32          err;
        int             host_err;
@@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
        if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
                goto out;
-        /* Get readahead parameters */
-        ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-        if (ra && ra->p_set)
-                file->f_ra = ra->p_ra;
        if (file->f_op->splice_read && rqstp->rq_splice_ok) {
                struct splice_desc sd = {
                        .len            = 0,
@@ -937,16 +930,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                set_fs(oldfs);
        }
-        /* Write back readahead params */
-        if (ra) {
-                struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-                spin_lock(&rab->pb_lock);
-                ra->p_ra = file->f_ra;
-                ra->p_set = 1;
-                ra->p_count--;
-                spin_unlock(&rab->pb_lock);
-        }
        if (host_err >= 0) {
                nfsdstats.io_read += host_err;
                *count = host_err;
@@ -1086,8 +1069,45 @@ out:
 * on entry. On return, *count contains the number of bytes actually read.
 * N.B. After this call fhp needs an fh_put
 */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+        loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+        struct file *file;
+        struct inode *inode;
+        struct raparms  *ra;
+        __be32 err;
+        /* Open the file behind fhp for reading; it is closed again below. */
+        err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+        if (err)
+                return err;
+        inode = file->f_path.dentry->d_inode;
+        /* Get readahead parameters */
+        ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+        /* Seed the freshly opened file with previously saved state, if any. */
+        if (ra && ra->p_set)
+                file->f_ra = ra->p_ra;
+        err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+        /* Write back readahead params */
+        if (ra) {
+                struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+                /* Saved regardless of whether the read itself succeeded. */
+                spin_lock(&rab->pb_lock);
+                ra->p_ra = file->f_ra;
+                ra->p_set = 1;
+                ra->p_count--;
+                spin_unlock(&rab->pb_lock);
+        }
+        nfsd_close(file);
+        return err;
+}
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                loff_t offset, struct kvec *vec, int vlen,
                unsigned long *count)
 {
@@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
                if (err)
                        goto out;
                err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-        } else {
+        } else /* Note file may still be NULL in NFSv4 special stateid case: */
-                err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+                err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
-                if (err)
-                        goto out;
-                err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-                nfsd_close(file);
-        }
 out:
        return err;
 }
@@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                                char *name, int len, struct svc_fh *tfhp)
 {
        struct dentry   *ddir, *dnew, *dold;
-        struct inode    *dirp, *dest;
+        struct inode    *dirp;
        __be32          err;
        int             host_err;
@@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                goto out_nfserr;
        dold = tfhp->fh_dentry;
-        dest = dold->d_inode;
        host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
        if (host_err) {
@@ -2038,7 +2052,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                                        struct dentry *dentry, int acc)
 {
        struct inode    *inode = dentry->d_inode;
-        struct path     path;
        int             err;
        if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2124,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        if (err == -EACCES && S_ISREG(inode->i_mode) &&
            acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
                err = inode_permission(inode, MAY_EXEC);
-        if (err)
-                goto nfsd_out;
-        /* Do integrity (permission) checking now, but defer incrementing
-         * IMA counts to the actual file open.
-         */
-        path.mnt = exp->ex_path.mnt;
-        path.dentry = dentry;
-nfsd_out:
        return err? nfserrno(err) : 0;
 }
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 217a62c2a357..9a370a5e36b7 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -64,7 +64,9 @@ __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32          nfsd_open(struct svc_rqst *, struct svc_fh *, int,
                                int, struct file **);
 void            nfsd_close(struct file *);
-__be32          nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+__be32          nfsd_read(struct svc_rqst *, struct svc_fh *,
+                                loff_t, struct kvec *, int, unsigned long *);
+__be32          nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
                                loff_t, struct kvec *, int, unsigned long *);
 __be32          nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
                                loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c11..3dbdc1d356bf 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
 #include "nilfs.h"
 #include "bmap.h"
 #include "sb.h"
+#include "btree.h"
+#include "direct.h"
 #include "btnode.h"
 #include "mdt.h"
 #include "dat.h"
@@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
-        memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
+        memcpy(gcbmap, bmap, sizeof(*bmap));
        init_rwsem(&gcbmap->b_sem);
        lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
        gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
@@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
 {
-        memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
+        memcpy(bmap, gcbmap, sizeof(*bmap));
        init_rwsem(&bmap->b_sem);
        lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab91..a20569b19929 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
 #define NILFS_BMAP_INVALID_PTR  0
-#define nilfs_bmap_dkey_to_key(dkey)    le64_to_cpu(dkey)
-#define nilfs_bmap_key_to_dkey(key)     cpu_to_le64(key)
-#define nilfs_bmap_dptr_to_ptr(dptr)    le64_to_cpu(dptr)
-#define nilfs_bmap_ptr_to_dptr(ptr)     cpu_to_le64(ptr)
 #define nilfs_bmap_keydiff_abs(diff)    ((diff) < 0 ? -(diff) : (diff))
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
        int (*bop_delete)(struct nilfs_bmap *, __u64);
        void (*bop_clear)(struct nilfs_bmap *);
-        int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
+        int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
        void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
                                         struct list_head *);
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
 * @b_last_allocated_ptr: last allocated ptr for data block
 * @b_ptr_type: pointer type
 * @b_state: state
+ * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
 */
 struct nilfs_bmap {
        union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
        __u64 b_last_allocated_ptr;
        int b_ptr_type;
        int b_state;
+        __u16 b_nchildren_per_block;
 };
 /* pointer type */
@@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
                nilfs_dat_abort_end(dat, &req->bpr_req);
 }
+/*
+ * Record @key and @ptr in @bmap as b_last_allocated_key and
+ * b_last_allocated_ptr respectively.
+ */
+static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
+                                           __u64 ptr)
+{
+        bmap->b_last_allocated_key = key;
+        bmap->b_last_allocated_ptr = ptr;
+}
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
                              const struct buffer_head *);
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47b..000000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * bmap_union.h - NILFS block mapping.
- *
- * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Koji Sato <koji@osrg.net>.
- */
-#ifndef _NILFS_BMAP_UNION_H
-#define _NILFS_BMAP_UNION_H
-#include "bmap.h"
-#include "direct.h"
-#include "btree.h"
-/**
- * nilfs_bmap_union -
- * @bi_bmap: bmap structure
- * @bi_btree: direct map structure
- * @bi_direct: B-tree structure
- */
-union nilfs_bmap_union {
-        struct nilfs_bmap bi_bmap;
-        struct nilfs_direct bi_direct;
-        struct nilfs_btree bi_btree;
-};
-#endif  /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a3306..f78ab1044d1d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 }
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-                              sector_t pblocknr, struct buffer_head **pbh)
+                              sector_t pblocknr, int mode,
+                              struct buffer_head **pbh, sector_t *submit_ptr)
 {
        struct buffer_head *bh;
        struct inode *inode = NILFS_BTNC_I(btnc);
+        struct page *page;
        int err;
        bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
                return -ENOMEM;
        err = -EEXIST; /* internal code */
+        page = bh->b_page;
        if (buffer_uptodate(bh) || buffer_dirty(bh))
                goto found;
@@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
                        }
                }
        }
-        lock_buffer(bh);
+        if (mode == READA) {
+                if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
+                        err = -EBUSY; /* internal code */
+                        brelse(bh);
+                        goto out_locked;
+                }
+        } else { /* mode == READ */
+                lock_buffer(bh);
+        }
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                err = -EEXIST; /* internal code */
@@ -136,15 +148,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
        bh->b_blocknr = pblocknr; /* set block address for read */
        bh->b_end_io = end_buffer_read_sync;
        get_bh(bh);
-        submit_bh(READ, bh);
+        submit_bh(mode, bh);
        bh->b_blocknr = blocknr; /* set back to the given block address */
+        *submit_ptr = pblocknr;
        err = 0;
 found:
        *pbh = bh;
 out_locked:
-        unlock_page(bh->b_page);
+        unlock_page(page);
-        page_cache_release(bh->b_page);
+        page_cache_release(page);
        return err;
 }
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f07712..79037494f1e0 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
                                              __u64 blocknr);
-int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
-                              struct buffer_head **);
+                              struct buffer_head **, sector_t *);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
                                    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b27a342c5af6..300c2bc00c3f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
 /*
 * B-tree node operations
 */
-static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
-                                 struct buffer_head **bhp)
-{
-        struct address_space *btnc =
-                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-        int err;
-        err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
-        if (err)
-                return err == -EEXIST ? 0 : err;
-        wait_on_buffer(*bhp);
-        if (!buffer_uptodate(*bhp)) {
-                brelse(*bhp);
-                return -EIO;
-        }
-        return 0;
-}
-static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
                                     __u64 ptr, struct buffer_head **bhp)
 {
-        struct address_space *btnc =
+        struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
-                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
        struct buffer_head *bh;
        bh = nilfs_btnode_create_block(btnc, ptr);
@@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
        return 0;
 }
-static inline int
+static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
-nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
 {
        return node->bn_flags;
 }
-static inline void
+static void
 nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
 {
        node->bn_flags = flags;
 }
-static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
+static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
 {
        return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
 }
-static inline int
+static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
-nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
 {
        return node->bn_level;
 }
-static inline void
+static void
 nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
 {
        node->bn_level = level;
 }
-static inline int
+static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
-nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
 {
        return le16_to_cpu(node->bn_nchildren);
 }
-static inline void
+static void
 nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
 {
        node->bn_nchildren = cpu_to_le16(nchildren);
 }
-static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
+static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
 {
-        return 1 << btree->bt_bmap.b_inode->i_blkbits;
+        return 1 << btree->b_inode->i_blkbits;
 }
-static inline int
+static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
-nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
-                               const struct nilfs_btree *btree)
 {
-        return nilfs_btree_node_root(node) ?
+        return btree->b_nchildren_per_block;
-                NILFS_BTREE_ROOT_NCHILDREN_MIN :
-                NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
 }
-static inline int
+static __le64 *
-nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
-                               const struct nilfs_btree *btree)
-{
-        return nilfs_btree_node_root(node) ?
-                NILFS_BTREE_ROOT_NCHILDREN_MAX :
-                NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
-}
-static inline __le64 *
 nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
 {
        return (__le64 *)((char *)(node + 1) +
@@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
                           0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
 }
-static inline __le64 *
+static __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
-                       const struct nilfs_btree *btree)
 {
-        return (__le64 *)(nilfs_btree_node_dkeys(node) +
+        return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
-                          nilfs_btree_node_nchildren_max(node, btree));
 }
-static inline __u64
+static __u64
 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
 {
-        return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
+        return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
 }
-static inline void
+static void
 nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
 {
-        *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
+        *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
 }
-static inline __u64
+static __u64
-nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
+nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
-                         const struct nilfs_btree_node *node, int index)
+                         int ncmax)
 {
-        return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
+        return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
-                                        index));
 }
-static inline void
+static void
-nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
+nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
-                         struct nilfs_btree_node *node, int index, __u64 ptr)
+                         int ncmax)
 {
-        *(nilfs_btree_node_dptrs(node, btree) + index) =
+        *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
-                nilfs_bmap_ptr_to_dptr(ptr);
 }
-static void nilfs_btree_node_init(struct nilfs_btree *btree,
+static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
-                                  struct nilfs_btree_node *node,
+                                  int level, int nchildren, int ncmax,
-                                  int flags, int level, int nchildren,
                                  const __u64 *keys, const __u64 *ptrs)
 {
        __le64 *dkeys;
@@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
        nilfs_btree_node_set_nchildren(node, nchildren);
        dkeys = nilfs_btree_node_dkeys(node);
-        dptrs = nilfs_btree_node_dptrs(node, btree);
+        dptrs = nilfs_btree_node_dptrs(node, ncmax);
        for (i = 0; i < nchildren; i++) {
-                dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
+                dkeys[i] = cpu_to_le64(keys[i]);
-                dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
+                dptrs[i] = cpu_to_le64(ptrs[i]);
        }
 }
 /* Assume the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
+static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
-                                       struct nilfs_btree_node *left,
                                       struct nilfs_btree_node *right,
-                                       int n)
+                                       int n, int lncmax, int rncmax)
 {
        __le64 *ldkeys, *rdkeys;
        __le64 *ldptrs, *rdptrs;
        int lnchildren, rnchildren;
        ldkeys = nilfs_btree_node_dkeys(left);
-        ldptrs = nilfs_btree_node_dptrs(left, btree);
+        ldptrs = nilfs_btree_node_dptrs(left, lncmax);
        lnchildren = nilfs_btree_node_get_nchildren(left);
        rdkeys = nilfs_btree_node_dkeys(right);
-        rdptrs = nilfs_btree_node_dptrs(right, btree);
+        rdptrs = nilfs_btree_node_dptrs(right, rncmax);
        rnchildren = nilfs_btree_node_get_nchildren(right);
        memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
@@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
 }
 /* Assume that the buffer heads corresponding to left and right are locked. */
-static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
+static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
-                                        struct nilfs_btree_node *left,
                                        struct nilfs_btree_node *right,
-                                        int n)
+                                        int n, int lncmax, int rncmax)
 {
        __le64 *ldkeys, *rdkeys;
        __le64 *ldptrs, *rdptrs;
        int lnchildren, rnchildren;
        ldkeys = nilfs_btree_node_dkeys(left);
-        ldptrs = nilfs_btree_node_dptrs(left, btree);
+        ldptrs = nilfs_btree_node_dptrs(left, lncmax);
        lnchildren = nilfs_btree_node_get_nchildren(left);
        rdkeys = nilfs_btree_node_dkeys(right);
-        rdptrs = nilfs_btree_node_dptrs(right, btree);
+        rdptrs = nilfs_btree_node_dptrs(right, rncmax);
        rnchildren = nilfs_btree_node_get_nchildren(right);
        memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
 }
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_insert(struct nilfs_btree *btree,
+static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
-                                    struct nilfs_btree_node *node,
+                                    __u64 key, __u64 ptr, int ncmax)
-                                    __u64 key, __u64 ptr, int index)
 {
        __le64 *dkeys;
        __le64 *dptrs;
        int nchildren;
        dkeys = nilfs_btree_node_dkeys(node);
-        dptrs = nilfs_btree_node_dptrs(node, btree);
+        dptrs = nilfs_btree_node_dptrs(node, ncmax);
        nchildren = nilfs_btree_node_get_nchildren(node);
        if (index < nchildren) {
                memmove(dkeys + index + 1, dkeys + index,
@@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
                memmove(dptrs + index + 1, dptrs + index,
                        (nchildren - index) * sizeof(*dptrs));
        }
-        dkeys[index] = nilfs_bmap_key_to_dkey(key);
+        dkeys[index] = cpu_to_le64(key);
-        dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
+        dptrs[index] = cpu_to_le64(ptr);
        nchildren++;
        nilfs_btree_node_set_nchildren(node, nchildren);
 }
 /* Assume that the buffer head corresponding to node is locked. */
-static void nilfs_btree_node_delete(struct nilfs_btree *btree,
+static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
-                                    struct nilfs_btree_node *node,
+                                    __u64 *keyp, __u64 *ptrp, int ncmax)
-                                    __u64 *keyp, __u64 *ptrp, int index)
 {
        __u64 key;
        __u64 ptr;
@@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
        int nchildren;
        dkeys = nilfs_btree_node_dkeys(node);
-        dptrs = nilfs_btree_node_dptrs(node, btree);
+        dptrs = nilfs_btree_node_dptrs(node, ncmax);
-        key = nilfs_bmap_dkey_to_key(dkeys[index]);
+        key = le64_to_cpu(dkeys[index]);
-        ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
+        ptr = le64_to_cpu(dptrs[index]);
        nchildren = nilfs_btree_node_get_nchildren(node);
        if (keyp != NULL)
                *keyp = key;
@@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
        return s == 0;
 }
-static inline struct nilfs_btree_node *
+/**
-nilfs_btree_get_root(const struct nilfs_btree *btree)
+ * nilfs_btree_node_broken - verify consistency of btree node
+ * @node: btree node block to be examined
+ * @size: node size (in bytes)
+ * @blocknr: block number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
+                                   size_t size, sector_t blocknr)
 {
-        return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
+        int level, flags, nchildren;
+        int ret = 0;
+        level = nilfs_btree_node_get_level(node);
+        flags = nilfs_btree_node_get_flags(node);
+        nchildren = nilfs_btree_node_get_nchildren(node);
+        if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+                     level >= NILFS_BTREE_LEVEL_MAX ||
+                     (flags & NILFS_BTREE_NODE_ROOT) ||
+                     nchildren < 0 ||
+                     nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
+                printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
+                       "level = %d, flags = 0x%x, nchildren = %d\n",
+                       (unsigned long long)blocknr, level, flags, nchildren);
+                ret = 1;
+        }
+        return ret;
 }
-static inline struct nilfs_btree_node *
+int nilfs_btree_broken_node_block(struct buffer_head *bh)
+{
+        int ret;
+        if (buffer_nilfs_checked(bh))
+                return 0;
+        ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
+                                       bh->b_size, bh->b_blocknr);
+        if (likely(!ret))
+                set_buffer_nilfs_checked(bh);
+        return ret;
+}
+static struct nilfs_btree_node *
+nilfs_btree_get_root(const struct nilfs_bmap *btree)
+{
+        return (struct nilfs_btree_node *)btree->b_u.u_data;
+}
+static struct nilfs_btree_node *
 nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
 {
        return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
 }
-static inline struct nilfs_btree_node *
+static struct nilfs_btree_node *
 nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
 {
        return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
 }
-static inline int nilfs_btree_height(const struct nilfs_btree *btree)
+static int nilfs_btree_height(const struct nilfs_bmap *btree)
 {
        return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
 }
-static inline struct nilfs_btree_node *
+static struct nilfs_btree_node *
-nilfs_btree_get_node(const struct nilfs_btree *btree,
+nilfs_btree_get_node(const struct nilfs_bmap *btree,
                     const struct nilfs_btree_path *path,
-                     int level)
+                     int level, int *ncmaxp)
 {
-        return (level == nilfs_btree_height(btree) - 1) ?
+        struct nilfs_btree_node *node;
-                nilfs_btree_get_root(btree) :
-                nilfs_btree_get_nonroot_node(path, level);
+        if (level == nilfs_btree_height(btree) - 1) {
+                node = nilfs_btree_get_root(btree);
+                *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
+        } else {
+                node = nilfs_btree_get_nonroot_node(path, level);
+                *ncmaxp = nilfs_btree_nchildren_per_block(btree);
+        }
+        return node;
 }
-static inline int
+static int
 nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
 {
        if (unlikely(nilfs_btree_node_get_level(node) != level)) {
@@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
        return 0;
 }
-static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
+struct nilfs_btree_readahead_info {
+        struct nilfs_btree_node *node;  /* parent node */
+        int max_ra_blocks;              /* max nof blocks to read ahead */
+        int index;                      /* current index on the parent node */
+        int ncmax;                      /* max nof children the parent node can hold */
+};
+static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+                                   struct buffer_head **bhp,
+                                   const struct nilfs_btree_readahead_info *ra)
+{
+        struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+        struct buffer_head *bh, *ra_bh;
+        sector_t submit_ptr = 0;
+        int ret;
+        ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
+        if (ret) {
+                if (ret != -EEXIST)
+                        return ret;
+                goto out_check;
+        }
+        if (ra) {
+                int i, n;
+                __u64 ptr2;
+                /* read ahead sibling nodes */
+                for (n = ra->max_ra_blocks, i = ra->index + 1;
+                     n > 0 && i < ra->ncmax; n--, i++) {
+                        ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
+                        ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
+                                                        &ra_bh, &submit_ptr);
+                        if (likely(!ret || ret == -EEXIST))
+                                brelse(ra_bh);
+                        else if (ret != -EBUSY)
+                                break;
+                        if (!buffer_locked(bh))
+                                goto out_no_wait;
+                }
+        }
+        wait_on_buffer(bh);
+ out_no_wait:
+        if (!buffer_uptodate(bh)) {
+                brelse(bh);
+                return -EIO;
+        }
+ out_check:
+        if (nilfs_btree_broken_node_block(bh)) {
+                clear_buffer_uptodate(bh);
+                brelse(bh);
+                return -EINVAL;
+        }
+        *bhp = bh;
+        return 0;
+}
+static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
+                                   struct buffer_head **bhp)
+{
+        return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
+}
+static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
                                 struct nilfs_btree_path *path,
-                                 __u64 key, __u64 *ptrp, int minlevel)
+                                 __u64 key, __u64 *ptrp, int minlevel,
+                                 int readahead)
 {
        struct nilfs_btree_node *node;
+        struct nilfs_btree_readahead_info p, *ra;
        __u64 ptr;
-        int level, index, found, ret;
+        int level, index, found, ncmax, ret;
        node = nilfs_btree_get_root(btree);
        level = nilfs_btree_node_get_level(node);
@@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
                return -ENOENT;
        found = nilfs_btree_node_lookup(node, key, &index);
-        ptr = nilfs_btree_node_get_ptr(btree, node, index);
+        ptr = nilfs_btree_node_get_ptr(node, index,
+                                       NILFS_BTREE_ROOT_NCHILDREN_MAX);
        path[level].bp_bh = NULL;
        path[level].bp_index = index;
-        for (level--; level >= minlevel; level--) {
+        ncmax = nilfs_btree_nchildren_per_block(btree);
-                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
+        while (--level >= minlevel) {
+                ra = NULL;
+                if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
+                        p.node = nilfs_btree_get_node(btree, path, level + 1,
+                                                      &p.ncmax);
+                        p.index = index;
+                        p.max_ra_blocks = 7;
+                        ra = &p;
+                }
+                ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
+                                              ra);
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(path, level);
                if (nilfs_btree_bad_node(node, level))
                        return -EINVAL;
@@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
                        found = nilfs_btree_node_lookup(node, key, &index);
                else
                        index = 0;
-                if (index < nilfs_btree_node_nchildren_max(node, btree))
+                if (index < ncmax) {
-                        ptr = nilfs_btree_node_get_ptr(btree, node, index);
+                        ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
-                else {
+                } else {
                        WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
                        /* insert */
                        ptr = NILFS_BMAP_INVALID_PTR;
@@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
        return 0;
 }
-static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
+static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
                                      __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node;
        __u64 ptr;
-        int index, level, ret;
+        int index, level, ncmax, ret;
        node = nilfs_btree_get_root(btree);
        index = nilfs_btree_node_get_nchildren(node) - 1;
        if (index < 0)
                return -ENOENT;
        level = nilfs_btree_node_get_level(node);
-        ptr = nilfs_btree_node_get_ptr(btree, node, index);
+        ptr = nilfs_btree_node_get_ptr(node, index,
+                                       NILFS_BTREE_ROOT_NCHILDREN_MAX);
        path[level].bp_bh = NULL;
        path[level].bp_index = index;
+        ncmax = nilfs_btree_nchildren_per_block(btree);
        for (level--; level > 0; level--) {
                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
                if (nilfs_btree_bad_node(node, level))
                        return -EINVAL;
                index = nilfs_btree_node_get_nchildren(node) - 1;
-                ptr = nilfs_btree_node_get_ptr(btree, node, index);
+                ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
                path[level].bp_index = index;
        }
@@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
        return 0;
 }
-static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
                              __u64 key, int level, __u64 *ptrp)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
-        __u64 ptr;
        int ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
-        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+        ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
-        if (ptrp != NULL)
-                *ptrp = ptr;
        nilfs_btree_free_path(path);
        return ret;
 }
-static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
                                     __u64 key, __u64 *ptrp, unsigned maxblocks)
 {
-        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
        struct nilfs_btree_path *path;
        struct nilfs_btree_node *node;
        struct inode *dat = NULL;
        __u64 ptr, ptr2;
        sector_t blocknr;
        int level = NILFS_BTREE_LEVEL_NODE_MIN;
-        int ret, cnt, index, maxlevel;
+        int ret, cnt, index, maxlevel, ncmax;
+        struct nilfs_btree_readahead_info p;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
-        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
        if (ret < 0)
                goto out;
-        if (NILFS_BMAP_USE_VBN(bmap)) {
+        if (NILFS_BMAP_USE_VBN(btree)) {
-                dat = nilfs_bmap_get_dat(bmap);
+                dat = nilfs_bmap_get_dat(btree);
                ret = nilfs_dat_translate(dat, ptr, &blocknr);
                if (ret < 0)
                        goto out;
@@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
                goto end;
        maxlevel = nilfs_btree_height(btree) - 1;
-        node = nilfs_btree_get_node(btree, path, level);
+        node = nilfs_btree_get_node(btree, path, level, &ncmax);
        index = path[level].bp_index + 1;
        for (;;) {
                while (index < nilfs_btree_node_get_nchildren(node)) {
                        if (nilfs_btree_node_get_key(node, index) !=
                            key + cnt)
                                goto end;
-                        ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                        ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
                        if (dat) {
                                ret = nilfs_dat_translate(dat, ptr2, &blocknr);
                                if (ret < 0)
@@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
                        break;
                /* look-up right sibling node */
-                node = nilfs_btree_get_node(btree, path, level + 1);
+                p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
-                index = path[level + 1].bp_index + 1;
+                p.index = path[level + 1].bp_index + 1;
-                if (index >= nilfs_btree_node_get_nchildren(node) ||
+                p.max_ra_blocks = 7;
-                    nilfs_btree_node_get_key(node, index) != key + cnt)
+                if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
+                    nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
                        break;
-                ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
-                path[level + 1].bp_index = index;
+                path[level + 1].bp_index = p.index;
                brelse(path[level].bp_bh);
                path[level].bp_bh = NULL;
-                ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+                ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
+                                              &p);
                if (ret < 0)
                        goto out;
                node = nilfs_btree_get_nonroot_node(path, level);
+                ncmax = nilfs_btree_nchildren_per_block(btree);
                index = 0;
                path[level].bp_index = index;
        }
@@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
        return ret;
 }
-static void nilfs_btree_promote_key(struct nilfs_btree *btree,
+static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 key)
 {
@@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
        }
 }
-static void nilfs_btree_do_insert(struct nilfs_btree *btree,
+static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
                                  struct nilfs_btree_path *path,
                                  int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node;
+        int ncblk;
        if (level < nilfs_btree_height(btree) - 1) {
                node = nilfs_btree_get_nonroot_node(path, level);
-                nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
+                ncblk = nilfs_btree_nchildren_per_block(btree);
-                                        path[level].bp_index);
+                nilfs_btree_node_insert(node, path[level].bp_index,
+                                        *keyp, *ptrp, ncblk);
                if (!buffer_dirty(path[level].bp_bh))
                        nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
                                                                         0));
        } else {
                node = nilfs_btree_get_root(btree);
-                nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
+                nilfs_btree_node_insert(node, path[level].bp_index,
-                                        path[level].bp_index);
+                                        *keyp, *ptrp,
+                                        NILFS_BTREE_ROOT_NCHILDREN_MAX);
        }
 }
-static void nilfs_btree_carry_left(struct nilfs_btree *btree,
+static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
                                   struct nilfs_btree_path *path,
                                   int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *left;
-        int nchildren, lnchildren, n, move;
+        int nchildren, lnchildren, n, move, ncblk;
        node = nilfs_btree_get_nonroot_node(path, level);
        left = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
        lnchildren = nilfs_btree_node_get_nchildren(left);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        move = 0;
        n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
                move = 1;
        }
-        nilfs_btree_node_move_left(btree, left, node, n);
+        nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
        nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
-static void nilfs_btree_carry_right(struct nilfs_btree *btree,
+static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *right;
-        int nchildren, rnchildren, n, move;
+        int nchildren, rnchildren, n, move, ncblk;
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
        rnchildren = nilfs_btree_node_get_nchildren(right);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        move = 0;
        n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
                move = 1;
        }
-        nilfs_btree_node_move_right(btree, node, right, n);
+        nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
        nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
 }
-static void nilfs_btree_split(struct nilfs_btree *btree,
+static void nilfs_btree_split(struct nilfs_bmap *btree,
                              struct nilfs_btree_path *path,
                              int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *right;
        __u64 newkey;
        __u64 newptr;
-        int nchildren, n, move;
+        int nchildren, n, move, ncblk;
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        move = 0;
        n = (nchildren + 1) / 2;
@@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
                move = 1;
        }
-        nilfs_btree_node_move_right(btree, node, right, n);
+        nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
        if (move) {
                path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
-                nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
+                nilfs_btree_node_insert(right, path[level].bp_index,
-                                        path[level].bp_index);
+                                        *keyp, *ptrp, ncblk);
                *keyp = nilfs_btree_node_get_key(right, 0);
                *ptrp = path[level].bp_newreq.bpr_ptr;
@@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
        path[level + 1].bp_index++;
 }
-static void nilfs_btree_grow(struct nilfs_btree *btree,
+static void nilfs_btree_grow(struct nilfs_bmap *btree,
                             struct nilfs_btree_path *path,
                             int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *root, *child;
-        int n;
+        int n, ncblk;
        root = nilfs_btree_get_root(btree);
        child = nilfs_btree_get_sib_node(path, level);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        n = nilfs_btree_node_get_nchildren(root);
-        nilfs_btree_node_move_right(btree, root, child, n);
+        nilfs_btree_node_move_right(root, child, n,
+                                    NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
        nilfs_btree_node_set_level(root, level + 1);
        if (!buffer_dirty(path[level].bp_sib_bh))
@@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
        *ptrp = path[level].bp_newreq.bpr_ptr;
 }
-static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
                                   const struct nilfs_btree_path *path)
 {
        struct nilfs_btree_node *node;
-        int level;
+        int level, ncmax;
        if (path == NULL)
                return NILFS_BMAP_INVALID_PTR;
@@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
        /* left sibling */
        level = NILFS_BTREE_LEVEL_NODE_MIN;
        if (path[level].bp_index > 0) {
-                node = nilfs_btree_get_node(btree, path, level);
+                node = nilfs_btree_get_node(btree, path, level, &ncmax);
-                return nilfs_btree_node_get_ptr(btree, node,
+                return nilfs_btree_node_get_ptr(node,
-                                                path[level].bp_index - 1);
+                                                path[level].bp_index - 1,
+                                                ncmax);
        }
        /* parent */
        level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
        if (level <= nilfs_btree_height(btree) - 1) {
-                node = nilfs_btree_get_node(btree, path, level);
+                node = nilfs_btree_get_node(btree, path, level, &ncmax);
-                return nilfs_btree_node_get_ptr(btree, node,
+                return nilfs_btree_node_get_ptr(node, path[level].bp_index,
-                                                path[level].bp_index);
+                                                ncmax);
        }
        return NILFS_BMAP_INVALID_PTR;
 }
-static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
+static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
                                       const struct nilfs_btree_path *path,
                                       __u64 key)
 {
        __u64 ptr;
-        ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
+        ptr = nilfs_bmap_find_target_seq(btree, key);
        if (ptr != NILFS_BMAP_INVALID_PTR)
                /* sequential access */
                return ptr;
@@ -877,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
                        return ptr;
        }
        /* block group */
-        return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
+        return nilfs_bmap_find_target_in_group(btree);
-}
-static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
-                                     __u64 ptr)
-{
-        btree->bt_bmap.b_last_allocated_key = key;
-        btree->bt_bmap.b_last_allocated_ptr = ptr;
 }
-static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
                                      int *levelp, __u64 key, __u64 ptr,
                                      struct nilfs_bmap_stats *stats)
@@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        struct buffer_head *bh;
        struct nilfs_btree_node *node, *parent, *sib;
        __u64 sibptr;
-        int pindex, level, ret;
+        int pindex, level, ncmax, ncblk, ret;
        struct inode *dat = NULL;
        stats->bs_nblocks = 0;
        level = NILFS_BTREE_LEVEL_DATA;
        /* allocate a new ptr for data block */
-        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
+        if (NILFS_BMAP_USE_VBN(btree)) {
                path[level].bp_newreq.bpr_ptr =
                        nilfs_btree_find_target_v(btree, path, key);
-                dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+                dat = nilfs_bmap_get_dat(btree);
        }
-        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+        ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
-                                           &path[level].bp_newreq, dat);
        if (ret < 0)
                goto err_out_data;
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        for (level = NILFS_BTREE_LEVEL_NODE_MIN;
             level < nilfs_btree_height(btree) - 1;
             level++) {
                node = nilfs_btree_get_nonroot_node(path, level);
-                if (nilfs_btree_node_get_nchildren(node) <
+                if (nilfs_btree_node_get_nchildren(node) < ncblk) {
-                    nilfs_btree_node_nchildren_max(node, btree)) {
                        path[level].bp_op = nilfs_btree_do_insert;
                        stats->bs_nblocks++;
                        goto out;
                }
-                parent = nilfs_btree_get_node(btree, path, level + 1);
+                parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
                pindex = path[level + 1].bp_index;
                /* left sibling */
                if (pindex > 0) {
-                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
+                        sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
-                                                          pindex - 1);
+                                                          ncmax);
                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
-                        if (nilfs_btree_node_get_nchildren(sib) <
+                        if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
-                            nilfs_btree_node_nchildren_max(sib, btree)) {
                                path[level].bp_sib_bh = bh;
                                path[level].bp_op = nilfs_btree_carry_left;
                                stats->bs_nblocks++;
                                goto out;
-                        } else
+                        } else {
                                brelse(bh);
+                        }
                }
                /* right sibling */
-                if (pindex <
+                if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
-                    nilfs_btree_node_get_nchildren(parent) - 1) {
+                        sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
-                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
+                                                          ncmax);
-                                                          pindex + 1);
                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
-                        if (nilfs_btree_node_get_nchildren(sib) <
+                        if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
-                            nilfs_btree_node_nchildren_max(sib, btree)) {
                                path[level].bp_sib_bh = bh;
                                path[level].bp_op = nilfs_btree_carry_right;
                                stats->bs_nblocks++;
                                goto out;
-                        } else
+                        } else {
                                brelse(bh);
+                        }
                }
                /* split */
                path[level].bp_newreq.bpr_ptr =
                        path[level - 1].bp_newreq.bpr_ptr + 1;
-                ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+                ret = nilfs_bmap_prepare_alloc_ptr(btree,
                                                   &path[level].bp_newreq, dat);
                if (ret < 0)
                        goto err_out_child_node;
@@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                stats->bs_nblocks++;
-                nilfs_btree_node_init(btree,
+                sib = (struct nilfs_btree_node *)bh->b_data;
-                                      (struct nilfs_btree_node *)bh->b_data,
+                nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
-                                      0, level, 0, NULL, NULL);
                path[level].bp_sib_bh = bh;
                path[level].bp_op = nilfs_btree_split;
        }
@@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* root */
        node = nilfs_btree_get_root(btree);
        if (nilfs_btree_node_get_nchildren(node) <
-            nilfs_btree_node_nchildren_max(node, btree)) {
+            NILFS_BTREE_ROOT_NCHILDREN_MAX) {
                path[level].bp_op = nilfs_btree_do_insert;
                stats->bs_nblocks++;
                goto out;
@@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* grow */
        path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+        ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
-                                           &path[level].bp_newreq, dat);
        if (ret < 0)
                goto err_out_child_node;
        ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        if (ret < 0)
                goto err_out_curr_node;
-        nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
+        nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
-                              0, level, 0, NULL, NULL);
+                              0, level, 0, ncblk, NULL, NULL);
        path[level].bp_sib_bh = bh;
        path[level].bp_op = nilfs_btree_grow;
@@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+        nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
-                                   dat);
 err_out_child_node:
        for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
                nilfs_btnode_delete(path[level].bp_sib_bh);
-                nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
+                nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
-                                           &path[level].bp_newreq, dat);
        }
-        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+        nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
-                                   dat);
 err_out_data:
        *levelp = level;
        stats->bs_nblocks = 0;
        return ret;
 }
-static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
+static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
                                      int maxlevel, __u64 key, __u64 ptr)
 {
@@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
        set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
        ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
+        if (NILFS_BMAP_USE_VBN(btree)) {
-                nilfs_btree_set_target_v(btree, key, ptr);
+                nilfs_bmap_set_target_v(btree, key, ptr);
-                dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+                dat = nilfs_bmap_get_dat(btree);
        }
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+                nilfs_bmap_commit_alloc_ptr(btree,
                                            &path[level - 1].bp_newreq, dat);
                path[level].bp_op(btree, path, level, &key, &ptr);
        }
-        if (!nilfs_bmap_dirty(&btree->bt_bmap))
+        if (!nilfs_bmap_dirty(btree))
-                nilfs_bmap_set_dirty(&btree->bt_bmap);
+                nilfs_bmap_set_dirty(btree);
 }
-static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        struct nilfs_bmap_stats stats;
        int level, ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
        ret = nilfs_btree_do_lookup(btree, path, key, NULL,
-                                    NILFS_BTREE_LEVEL_NODE_MIN);
+                                    NILFS_BTREE_LEVEL_NODE_MIN, 0);
        if (ret != -ENOENT) {
                if (ret == 0)
                        ret = -EEXIST;
@@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
        if (ret < 0)
                goto out;
        nilfs_btree_commit_insert(btree, path, level, key, ptr);
-        nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+        nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
 out:
        nilfs_btree_free_path(path);
        return ret;
 }
-static void nilfs_btree_do_delete(struct nilfs_btree *btree,
+static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
                                  struct nilfs_btree_path *path,
                                  int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node;
+        int ncblk;
        if (level < nilfs_btree_height(btree) - 1) {
                node = nilfs_btree_get_nonroot_node(path, level);
-                nilfs_btree_node_delete(btree, node, keyp, ptrp,
+                ncblk = nilfs_btree_nchildren_per_block(btree);
-                                        path[level].bp_index);
+                nilfs_btree_node_delete(node, path[level].bp_index,
+                                        keyp, ptrp, ncblk);
                if (!buffer_dirty(path[level].bp_bh))
                        nilfs_btnode_mark_dirty(path[level].bp_bh);
                if (path[level].bp_index == 0)
@@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
                                nilfs_btree_node_get_key(node, 0));
        } else {
                node = nilfs_btree_get_root(btree);
-                nilfs_btree_node_delete(btree, node, keyp, ptrp,
+                nilfs_btree_node_delete(node, path[level].bp_index,
-                                        path[level].bp_index);
+                                        keyp, ptrp,
+                                        NILFS_BTREE_ROOT_NCHILDREN_MAX);
        }
 }
-static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *left;
-        int nchildren, lnchildren, n;
+        int nchildren, lnchildren, n, ncblk;
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
@@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        left = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
        lnchildren = nilfs_btree_node_get_nchildren(left);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        n = (nchildren + lnchildren) / 2 - nchildren;
-        nilfs_btree_node_move_right(btree, left, node, n);
+        nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        path[level].bp_index += n;
 }
-static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
+static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
                                     struct nilfs_btree_path *path,
                                     int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *right;
-        int nchildren, rnchildren, n;
+        int nchildren, rnchildren, n, ncblk;
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
@@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
        right = nilfs_btree_get_sib_node(path, level);
        nchildren = nilfs_btree_node_get_nchildren(node);
        rnchildren = nilfs_btree_node_get_nchildren(right);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        n = (nchildren + rnchildren) / 2 - nchildren;
-        nilfs_btree_node_move_left(btree, node, right, n);
+        nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
        path[level].bp_sib_bh = NULL;
 }
-static void nilfs_btree_concat_left(struct nilfs_btree *btree,
+static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *left;
-        int n;
+        int n, ncblk;
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
        node = nilfs_btree_get_nonroot_node(path, level);
        left = nilfs_btree_get_sib_node(path, level);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        n = nilfs_btree_node_get_nchildren(node);
-        nilfs_btree_node_move_left(btree, left, node, n);
+        nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_sib_bh))
                nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
        path[level].bp_index += nilfs_btree_node_get_nchildren(left);
 }
-static void nilfs_btree_concat_right(struct nilfs_btree *btree,
+static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
                                     struct nilfs_btree_path *path,
                                     int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *right;
-        int n;
+        int n, ncblk;
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
        node = nilfs_btree_get_nonroot_node(path, level);
        right = nilfs_btree_get_sib_node(path, level);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        n = nilfs_btree_node_get_nchildren(right);
-        nilfs_btree_node_move_left(btree, node, right, n);
+        nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
        if (!buffer_dirty(path[level].bp_bh))
                nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
        path[level + 1].bp_index++;
 }
-static void nilfs_btree_shrink(struct nilfs_btree *btree,
+static void nilfs_btree_shrink(struct nilfs_bmap *btree,
                               struct nilfs_btree_path *path,
                               int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *root, *child;
-        int n;
+        int n, ncblk;
        nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
        root = nilfs_btree_get_root(btree);
        child = nilfs_btree_get_nonroot_node(path, level);
+        ncblk = nilfs_btree_nchildren_per_block(btree);
-        nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
+        nilfs_btree_node_delete(root, 0, NULL, NULL,
+                                NILFS_BTREE_ROOT_NCHILDREN_MAX);
        nilfs_btree_node_set_level(root, level);
        n = nilfs_btree_node_get_nchildren(child);
-        nilfs_btree_node_move_left(btree, root, child, n);
+        nilfs_btree_node_move_left(root, child, n,
+                                   NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = NULL;
 }
-static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
                                      int *levelp,
                                      struct nilfs_bmap_stats *stats,
@@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        struct buffer_head *bh;
        struct nilfs_btree_node *node, *parent, *sib;
        __u64 sibptr;
-        int pindex, level, ret;
+        int pindex, level, ncmin, ncmax, ncblk, ret;
        ret = 0;
        stats->bs_nblocks = 0;
+        ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+        ncblk = nilfs_btree_nchildren_per_block(btree);
        for (level = NILFS_BTREE_LEVEL_NODE_MIN;
             level < nilfs_btree_height(btree) - 1;
             level++) {
                node = nilfs_btree_get_nonroot_node(path, level);
                path[level].bp_oldreq.bpr_ptr =
-                        nilfs_btree_node_get_ptr(btree, node,
+                        nilfs_btree_node_get_ptr(node, path[level].bp_index,
-                                                 path[level].bp_index);
+                                                 ncblk);
-                ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+                ret = nilfs_bmap_prepare_end_ptr(btree,
                                                 &path[level].bp_oldreq, dat);
                if (ret < 0)
                        goto err_out_child_node;
-                if (nilfs_btree_node_get_nchildren(node) >
+                if (nilfs_btree_node_get_nchildren(node) > ncmin) {
-                    nilfs_btree_node_nchildren_min(node, btree)) {
                        path[level].bp_op = nilfs_btree_do_delete;
                        stats->bs_nblocks++;
                        goto out;
                }
-                parent = nilfs_btree_get_node(btree, path, level + 1);
+                parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
                pindex = path[level + 1].bp_index;
                if (pindex > 0) {
                        /* left sibling */
-                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
+                        sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
-                                                          pindex - 1);
+                                                          ncmax);
                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
-                        if (nilfs_btree_node_get_nchildren(sib) >
+                        if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
-                            nilfs_btree_node_nchildren_min(sib, btree)) {
                                path[level].bp_sib_bh = bh;
                                path[level].bp_op = nilfs_btree_borrow_left;
                                stats->bs_nblocks++;
@@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                } else if (pindex <
                           nilfs_btree_node_get_nchildren(parent) - 1) {
                        /* right sibling */
-                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
+                        sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
-                                                          pindex + 1);
+                                                          ncmax);
                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
-                        if (nilfs_btree_node_get_nchildren(sib) >
+                        if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
-                            nilfs_btree_node_nchildren_min(sib, btree)) {
                                path[level].bp_sib_bh = bh;
                                path[level].bp_op = nilfs_btree_borrow_right;
                                stats->bs_nblocks++;
@@ -1349,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        node = nilfs_btree_get_root(btree);
        path[level].bp_oldreq.bpr_ptr =
-                nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
+                nilfs_btree_node_get_ptr(node, path[level].bp_index,
+                                         NILFS_BTREE_ROOT_NCHILDREN_MAX);
-        ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+        ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
-                                         &path[level].bp_oldreq, dat);
        if (ret < 0)
                goto err_out_child_node;
@@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
+        nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
 err_out_child_node:
        for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
                brelse(path[level].bp_sib_bh);
-                nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
+                nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
-                                         &path[level].bp_oldreq, dat);
        }
        *levelp = level;
        stats->bs_nblocks = 0;
        return ret;
 }
-static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
+static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
                                      struct nilfs_btree_path *path,
                                      int maxlevel, struct inode *dat)
 {
        int level;
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
+                nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
-                                          &path[level].bp_oldreq, dat);
                path[level].bp_op(btree, path, level, NULL, NULL);
        }
-        if (!nilfs_bmap_dirty(&btree->bt_bmap))
+        if (!nilfs_bmap_dirty(btree))
-                nilfs_bmap_set_dirty(&btree->bt_bmap);
+                nilfs_bmap_set_dirty(btree);
 }
-static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
+static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        struct nilfs_bmap_stats stats;
        struct inode *dat;
        int level, ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
        ret = nilfs_btree_do_lookup(btree, path, key, NULL,
-                                    NILFS_BTREE_LEVEL_NODE_MIN);
+                                    NILFS_BTREE_LEVEL_NODE_MIN, 0);
        if (ret < 0)
                goto out;
-        dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+        dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
-                nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
        ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
        if (ret < 0)
                goto out;
        nilfs_btree_commit_delete(btree, path, level, dat);
-        nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+        nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
 out:
        nilfs_btree_free_path(path);
        return ret;
 }
-static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        int ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
@@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
        return ret;
 }
-static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
+static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
 {
        struct buffer_head *bh;
-        struct nilfs_btree *btree;
        struct nilfs_btree_node *root, *node;
        __u64 maxkey, nextmaxkey;
        __u64 ptr;
        int nchildren, ret;
-        btree = (struct nilfs_btree *)bmap;
        root = nilfs_btree_get_root(btree);
        switch (nilfs_btree_height(btree)) {
        case 2:
@@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
                nchildren = nilfs_btree_node_get_nchildren(root);
                if (nchildren > 1)
                        return 0;
-                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+                ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+                                               NILFS_BTREE_ROOT_NCHILDREN_MAX);
                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
@@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
        return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
-static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
                                   __u64 *keys, __u64 *ptrs, int nitems)
 {
        struct buffer_head *bh;
-        struct nilfs_btree *btree;
        struct nilfs_btree_node *node, *root;
        __le64 *dkeys;
        __le64 *dptrs;
        __u64 ptr;
-        int nchildren, i, ret;
+        int nchildren, ncmax, i, ret;
-        btree = (struct nilfs_btree *)bmap;
        root = nilfs_btree_get_root(btree);
        switch (nilfs_btree_height(btree)) {
        case 2:
                bh = NULL;
                node = root;
+                ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
                break;
        case 3:
                nchildren = nilfs_btree_node_get_nchildren(root);
                WARN_ON(nchildren > 1);
-                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+                ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
+                                               NILFS_BTREE_ROOT_NCHILDREN_MAX);
                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
                node = (struct nilfs_btree_node *)bh->b_data;
+                ncmax = nilfs_btree_nchildren_per_block(btree);
                break;
        default:
                node = NULL;
@@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
        if (nchildren < nitems)
                nitems = nchildren;
        dkeys = nilfs_btree_node_dkeys(node);
-        dptrs = nilfs_btree_node_dptrs(node, btree);
+        dptrs = nilfs_btree_node_dptrs(node, ncmax);
        for (i = 0; i < nitems; i++) {
-                keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
+                keys[i] = le64_to_cpu(dkeys[i]);
-                ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
+                ptrs[i] = le64_to_cpu(dptrs[i]);
        }
        if (bh != NULL)
@@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 }
 static int
-nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
+nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
                                       union nilfs_bmap_ptr_req *dreq,
                                       union nilfs_bmap_ptr_req *nreq,
                                       struct buffer_head **bhp,
                                       struct nilfs_bmap_stats *stats)
 {
        struct buffer_head *bh;
-        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
        struct inode *dat = NULL;
        int ret;
@@ -1551,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* for data */
        /* cannot find near ptr */
-        if (NILFS_BMAP_USE_VBN(bmap)) {
+        if (NILFS_BMAP_USE_VBN(btree)) {
                dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
-                dat = nilfs_bmap_get_dat(bmap);
+                dat = nilfs_bmap_get_dat(btree);
        }
-        ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
+        ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
        if (ret < 0)
                return ret;
@@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        stats->bs_nblocks++;
        if (nreq != NULL) {
                nreq->bpr_ptr = dreq->bpr_ptr + 1;
-                ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
+                ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
                if (ret < 0)
                        goto err_out_dreq;
@@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* error */
 err_out_nreq:
-        nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
+        nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
 err_out_dreq:
-        nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
+        nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
        stats->bs_nblocks = 0;
        return ret;
 }
 static void
-nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
+nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
                                      __u64 key, __u64 ptr,
                                      const __u64 *keys, const __u64 *ptrs,
                                      int n,
@@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                                      union nilfs_bmap_ptr_req *nreq,
                                      struct buffer_head *bh)
 {
-        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
        struct nilfs_btree_node *node;
        struct inode *dat;
        __u64 tmpptr;
+        int ncblk;
        /* free resources */
-        if (bmap->b_ops->bop_clear != NULL)
+        if (btree->b_ops->bop_clear != NULL)
-                bmap->b_ops->bop_clear(bmap);
+                btree->b_ops->bop_clear(btree);
        /* ptr must be a pointer to a buffer head. */
        set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
        /* convert and insert */
-        dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+        dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
-        nilfs_btree_init(bmap);
+        nilfs_btree_init(btree);
        if (nreq != NULL) {
-                nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+                nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
-                nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
+                nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
                /* create child node at level 1 */
                node = (struct nilfs_btree_node *)bh->b_data;
-                nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
+                ncblk = nilfs_btree_nchildren_per_block(btree);
-                nilfs_btree_node_insert(btree, node,
+                nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
-                                        key, dreq->bpr_ptr, n);
+                nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
                if (!buffer_dirty(bh))
                        nilfs_btnode_mark_dirty(bh);
-                if (!nilfs_bmap_dirty(bmap))
+                if (!nilfs_bmap_dirty(btree))
-                        nilfs_bmap_set_dirty(bmap);
+                        nilfs_bmap_set_dirty(btree);
                brelse(bh);
                /* create root node at level 2 */
                node = nilfs_btree_get_root(btree);
                tmpptr = nreq->bpr_ptr;
-                nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
+                nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
-                                      2, 1, &keys[0], &tmpptr);
+                                      NILFS_BTREE_ROOT_NCHILDREN_MAX,
+                                      &keys[0], &tmpptr);
        } else {
-                nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+                nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
                /* create root node at level 1 */
                node = nilfs_btree_get_root(btree);
-                nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
+                nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
-                                      1, n, keys, ptrs);
+                                      NILFS_BTREE_ROOT_NCHILDREN_MAX,
-                nilfs_btree_node_insert(btree, node,
+                                      keys, ptrs);
-                                        key, dreq->bpr_ptr, n);
+                nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
-                if (!nilfs_bmap_dirty(bmap))
+                                        NILFS_BTREE_ROOT_NCHILDREN_MAX);
-                        nilfs_bmap_set_dirty(bmap);
+                if (!nilfs_bmap_dirty(btree))
+                        nilfs_bmap_set_dirty(btree);
        }
-        if (NILFS_BMAP_USE_VBN(bmap))
+        if (NILFS_BMAP_USE_VBN(btree))
-                nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
+                nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
 }
 /**
@@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 * @ptrs:
 * @n:
 */
-int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
                                   __u64 key, __u64 ptr,
                                   const __u64 *keys, const __u64 *ptrs, int n)
 {
@@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
                di = &dreq;
                ni = NULL;
        } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
-                           1 << bmap->b_inode->i_blkbits)) {
+                           1 << btree->b_inode->i_blkbits)) {
                di = &dreq;
                ni = &nreq;
        } else {
@@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
                BUG();
        }
-        ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
+        ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
                                                     &stats);
        if (ret < 0)
                return ret;
-        nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
+        nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
                                              di, ni, bh);
-        nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+        nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
        return 0;
 }
-static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
+static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
                                   struct nilfs_btree_path *path,
                                   int level,
                                   struct buffer_head *bh)
@@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
        return 0;
 }
-static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
                                        struct nilfs_btree_path *path,
                                        int level, struct inode *dat)
 {
        struct nilfs_btree_node *parent;
-        int ret;
+        int ncmax, ret;
-        parent = nilfs_btree_get_node(btree, path, level + 1);
+        parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
        path[level].bp_oldreq.bpr_ptr =
-                nilfs_btree_node_get_ptr(btree, parent,
+                nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
-                                         path[level + 1].bp_index);
+                                         ncmax);
        path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
        ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
                                       &path[level].bp_newreq.bpr_req);
@@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
                path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
                path[level].bp_ctxt.bh = path[level].bp_bh;
                ret = nilfs_btnode_prepare_change_key(
-                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+                        &NILFS_BMAP_I(btree)->i_btnode_cache,
                        &path[level].bp_ctxt);
                if (ret < 0) {
                        nilfs_dat_abort_update(dat,
@@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
        return 0;
 }
-static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
+static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
                                        struct nilfs_btree_path *path,
                                        int level, struct inode *dat)
 {
        struct nilfs_btree_node *parent;
+        int ncmax;
        nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
                                &path[level].bp_newreq.bpr_req,
-                                btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
+                                btree->b_ptr_type == NILFS_BMAP_PTR_VS);
        if (buffer_nilfs_node(path[level].bp_bh)) {
                nilfs_btnode_commit_change_key(
-                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+                        &NILFS_BMAP_I(btree)->i_btnode_cache,
                        &path[level].bp_ctxt);
                path[level].bp_bh = path[level].bp_ctxt.bh;
        }
        set_buffer_nilfs_volatile(path[level].bp_bh);
-        parent = nilfs_btree_get_node(btree, path, level + 1);
+        parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
-        nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
+        nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
-                                 path[level].bp_newreq.bpr_ptr);
+                                 path[level].bp_newreq.bpr_ptr, ncmax);
 }
-static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
+static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
                                       struct nilfs_btree_path *path,
                                       int level, struct inode *dat)
 {
@@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
                               &path[level].bp_newreq.bpr_req);
        if (buffer_nilfs_node(path[level].bp_bh))
                nilfs_btnode_abort_change_key(
-                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+                        &NILFS_BMAP_I(btree)->i_btnode_cache,
                        &path[level].bp_ctxt);
 }
-static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
+static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
                                           struct nilfs_btree_path *path,
                                           int minlevel, int *maxlevelp,
                                           struct inode *dat)
@@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
        return ret;
 }
-static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
+static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
                                           struct nilfs_btree_path *path,
                                           int minlevel, int maxlevel,
                                           struct buffer_head *bh,
@@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
                nilfs_btree_commit_update_v(btree, path, level, dat);
 }
-static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
+static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
                                   struct nilfs_btree_path *path,
                                   int level, struct buffer_head *bh)
 {
        int maxlevel = 0, ret;
        struct nilfs_btree_node *parent;
-        struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+        struct inode *dat = nilfs_bmap_get_dat(btree);
        __u64 ptr;
+        int ncmax;
        get_bh(bh);
        path[level].bp_bh = bh;
@@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
                goto out;
        if (buffer_nilfs_volatile(path[level].bp_bh)) {
-                parent = nilfs_btree_get_node(btree, path, level + 1);
+                parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
-                ptr = nilfs_btree_node_get_ptr(btree, parent,
+                ptr = nilfs_btree_node_get_ptr(parent,
-                                               path[level + 1].bp_index);
+                                               path[level + 1].bp_index,
+                                               ncmax);
                ret = nilfs_dat_mark_dirty(dat, ptr);
                if (ret < 0)
                        goto out;
@@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
        return ret;
 }
-static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_btree_propagate(struct nilfs_bmap *btree,
                                 struct buffer_head *bh)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        struct nilfs_btree_node *node;
        __u64 key;
@@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
        WARN_ON(!buffer_dirty(bh));
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
@@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
                key = nilfs_btree_node_get_key(node, 0);
                level = nilfs_btree_node_get_level(node);
        } else {
-                key = nilfs_bmap_data_get_key(bmap, bh);
+                key = nilfs_bmap_data_get_key(btree, bh);
                level = NILFS_BTREE_LEVEL_DATA;
        }
-        ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+        ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
        if (ret < 0) {
                if (unlikely(ret == -ENOENT))
                        printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
                goto out;
        }
-        ret = NILFS_BMAP_USE_VBN(bmap) ?
+        ret = NILFS_BMAP_USE_VBN(btree) ?
                nilfs_btree_propagate_v(btree, path, level, bh) :
                nilfs_btree_propagate_p(btree, path, level, bh);
@@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
        return ret;
 }
-static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
+static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
                                    struct buffer_head *bh)
 {
-        return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
+        return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
 }
-static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
+static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
                                         struct list_head *lists,
                                         struct buffer_head *bh)
 {
@@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
        node = (struct nilfs_btree_node *)bh->b_data;
        key = nilfs_btree_node_get_key(node, 0);
        level = nilfs_btree_node_get_level(node);
+        if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
+            level >= NILFS_BTREE_LEVEL_MAX) {
+                dump_stack();
+                printk(KERN_WARNING
+                       "%s: invalid btree level: %d (key=%llu, ino=%lu, "
+                       "blocknr=%llu)\n",
+                       __func__, level, (unsigned long long)key,
+                       NILFS_BMAP_I(btree)->vfs_inode.i_ino,
+                       (unsigned long long)bh->b_blocknr);
+                return;
+        }
        list_for_each(head, &lists[level]) {
                cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
                cnode = (struct nilfs_btree_node *)cbh->b_data;
@@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
        list_add_tail(&bh->b_assoc_buffers, head);
 }
-static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
+static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
                                             struct list_head *listp)
 {
-        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+        struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
-        struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
        struct list_head lists[NILFS_BTREE_LEVEL_MAX];
        struct pagevec pvec;
        struct buffer_head *bh, *head;
@@ -1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
                list_splice_tail(&lists[level], listp);
 }
-static int nilfs_btree_assign_p(struct nilfs_btree *btree,
+static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
                                struct nilfs_btree_path *path,
                                int level,
                                struct buffer_head **bh,
@@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
        struct nilfs_btree_node *parent;
        __u64 key;
        __u64 ptr;
-        int ret;
+        int ncmax, ret;
-        parent = nilfs_btree_get_node(btree, path, level + 1);
+        parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
-        ptr = nilfs_btree_node_get_ptr(btree, parent,
+        ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
-                                       path[level + 1].bp_index);
+                                       ncmax);
        if (buffer_nilfs_node(*bh)) {
                path[level].bp_ctxt.oldkey = ptr;
                path[level].bp_ctxt.newkey = blocknr;
                path[level].bp_ctxt.bh = *bh;
                ret = nilfs_btnode_prepare_change_key(
-                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+                        &NILFS_BMAP_I(btree)->i_btnode_cache,
                        &path[level].bp_ctxt);
                if (ret < 0)
                        return ret;
                nilfs_btnode_commit_change_key(
-                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+                        &NILFS_BMAP_I(btree)->i_btnode_cache,
                        &path[level].bp_ctxt);
                *bh = path[level].bp_ctxt.bh;
        }
-        nilfs_btree_node_set_ptr(btree, parent,
+        nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
-                                 path[level + 1].bp_index, blocknr);
+                                 ncmax);
        key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
        /* on-disk format */
-        binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+        binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
        binfo->bi_dat.bi_level = level;
        return 0;
 }
-static int nilfs_btree_assign_v(struct nilfs_btree *btree,
+static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
                                struct nilfs_btree_path *path,
                                int level,
                                struct buffer_head **bh,
@@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
                                union nilfs_binfo *binfo)
 {
        struct nilfs_btree_node *parent;
-        struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+        struct inode *dat = nilfs_bmap_get_dat(btree);
        __u64 key;
        __u64 ptr;
        union nilfs_bmap_ptr_req req;
-        int ret;
+        int ncmax, ret;
-        parent = nilfs_btree_get_node(btree, path, level + 1);
+        parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
-        ptr = nilfs_btree_node_get_ptr(btree, parent,
+        ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
-                                       path[level + 1].bp_index);
+                                       ncmax);
        req.bpr_ptr = ptr;
        ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
        if (ret < 0)
@@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
        key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
        /* on-disk format */
-        binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+        binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
-        binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+        binfo->bi_v.bi_blkoff = cpu_to_le64(key);
        return 0;
 }
-static int nilfs_btree_assign(struct nilfs_bmap *bmap,
+static int nilfs_btree_assign(struct nilfs_bmap *btree,
                              struct buffer_head **bh,
                              sector_t blocknr,
                              union nilfs_binfo *binfo)
 {
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        struct nilfs_btree_node *node;
        __u64 key;
        int level, ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
@@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
                key = nilfs_btree_node_get_key(node, 0);
                level = nilfs_btree_node_get_level(node);
        } else {
-                key = nilfs_bmap_data_get_key(bmap, *bh);
+                key = nilfs_bmap_data_get_key(btree, *bh);
                level = NILFS_BTREE_LEVEL_DATA;
        }
-        ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+        ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
        if (ret < 0) {
                WARN_ON(ret == -ENOENT);
                goto out;
        }
-        ret = NILFS_BMAP_USE_VBN(bmap) ?
+        ret = NILFS_BMAP_USE_VBN(btree) ?
                nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
                nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
@@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
        return ret;
 }
-static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
+static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
                                 struct buffer_head **bh,
                                 sector_t blocknr,
                                 union nilfs_binfo *binfo)
@@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
        __u64 key;
        int ret;
-        ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+        ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
                             blocknr);
        if (ret < 0)
                return ret;
@@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
                node = (struct nilfs_btree_node *)(*bh)->b_data;
                key = nilfs_btree_node_get_key(node, 0);
        } else
-                key = nilfs_bmap_data_get_key(bmap, *bh);
+                key = nilfs_bmap_data_get_key(btree, *bh);
        /* on-disk format */
        binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
-        binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+        binfo->bi_v.bi_blkoff = cpu_to_le64(key);
        return 0;
 }
-static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
+static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
 {
        struct buffer_head *bh;
-        struct nilfs_btree *btree;
        struct nilfs_btree_path *path;
        __u64 ptr;
        int ret;
-        btree = (struct nilfs_btree *)bmap;
        path = nilfs_btree_alloc_path();
        if (path == NULL)
                return -ENOMEM;
-        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
+        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
        if (ret < 0) {
                WARN_ON(ret == -ENOENT);
                goto out;
@@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
        if (!buffer_dirty(bh))
                nilfs_btnode_mark_dirty(bh);
        brelse(bh);
-        if (!nilfs_bmap_dirty(&btree->bt_bmap))
+        if (!nilfs_bmap_dirty(btree))
-                nilfs_bmap_set_dirty(&btree->bt_bmap);
+                nilfs_bmap_set_dirty(btree);
 out:
        nilfs_btree_free_path(path);
@@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 int nilfs_btree_init(struct nilfs_bmap *bmap)
 {
        bmap->b_ops = &nilfs_btree_ops;
+        bmap->b_nchildren_per_block =
+                NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
        return 0;
 }
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
        bmap->b_ops = &nilfs_btree_ops_gc;
+        bmap->b_nchildren_per_block =
+                NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 43c8c5b541fd..22c02e35b6ef 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -31,14 +31,6 @@
 #include "bmap.h"
 /**
- * struct nilfs_btree - B-tree structure
- * @bt_bmap: bmap base structure
- */
-struct nilfs_btree {
-        struct nilfs_bmap bt_bmap;
-};
-/**
 * struct nilfs_btree_path - A path on which B-tree operations are executed
 * @bp_bh: buffer head of node block
 * @bp_sib_bh: buffer head of sibling node block
@@ -54,7 +46,7 @@ struct nilfs_btree_path {
        union nilfs_bmap_ptr_req bp_oldreq;
        union nilfs_bmap_ptr_req bp_newreq;
        struct nilfs_btnode_chkey_ctxt bp_ctxt;
-        void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+        void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
                      int, __u64 *, __u64 *);
 };
@@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
                                   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
+int nilfs_btree_broken_node_block(struct buffer_head *bh);
 #endif  /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f0..b60277b44468 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page)
        }
        for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
                p = (struct nilfs_dir_entry *)(kaddr + offs);
-                rec_len = le16_to_cpu(p->rec_len);
+                rec_len = nilfs_rec_len_from_disk(p->rec_len);
                if (rec_len < NILFS_DIR_REC_LEN(1))
                        goto Eshort;
@@ -199,13 +199,10 @@ fail:
 static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
 {
        struct address_space *mapping = dir->i_mapping;
-        struct page *page = read_cache_page(mapping, n,
+        struct page *page = read_mapping_page(mapping, n, NULL);
-                                (filler_t *)mapping->a_ops->readpage, NULL);
        if (!IS_ERR(page)) {
-                wait_on_page_locked(page);
                kmap(page);
-                if (!PageUptodate(page))
-                        goto fail;
                if (!PageChecked(page))
                        nilfs_check_page(page);
                if (PageError(page))
@@ -238,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
 */
 static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
 {
-        return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+        return (struct nilfs_dir_entry *)((char *)p +
+                                          nilfs_rec_len_from_disk(p->rec_len));
 }
 static unsigned char
@@ -329,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                                        goto success;
                                }
                        }
-                        filp->f_pos += le16_to_cpu(de->rec_len);
+                        filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
                }
                nilfs_put_page(page);
        }
@@ -444,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
                    struct page *page, struct inode *inode)
 {
        unsigned from = (char *) de - (char *) page_address(page);
-        unsigned to = from + le16_to_cpu(de->rec_len);
+        unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
        struct address_space *mapping = page->mapping;
        int err;
@@ -500,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
                                /* We hit i_size */
                                name_len = 0;
                                rec_len = chunk_size;
-                                de->rec_len = cpu_to_le16(chunk_size);
+                                de->rec_len = nilfs_rec_len_to_disk(chunk_size);
                                de->inode = 0;
                                goto got_it;
                        }
@@ -514,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
                        if (nilfs_match(namelen, name, de))
                                goto out_unlock;
                        name_len = NILFS_DIR_REC_LEN(de->name_len);
-                        rec_len = le16_to_cpu(de->rec_len);
+                        rec_len = nilfs_rec_len_from_disk(de->rec_len);
                        if (!de->inode && rec_len >= reclen)
                                goto got_it;
                        if (rec_len >= name_len + reclen)
@@ -537,8 +535,8 @@ got_it:
                struct nilfs_dir_entry *de1;
                de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
-                de1->rec_len = cpu_to_le16(rec_len - name_len);
+                de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
-                de->rec_len = cpu_to_le16(name_len);
+                de->rec_len = nilfs_rec_len_to_disk(name_len);
                de = de1;
        }
        de->name_len = namelen;
@@ -569,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
        struct inode *inode = mapping->host;
        char *kaddr = page_address(page);
        unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-        unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+        unsigned to = ((char *)dir - kaddr) +
+                nilfs_rec_len_from_disk(dir->rec_len);
        struct nilfs_dir_entry *pde = NULL;
        struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
        int err;
@@ -590,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
        err = nilfs_prepare_chunk(page, mapping, from, to);
        BUG_ON(err);
        if (pde)
-                pde->rec_len = cpu_to_le16(to - from);
+                pde->rec_len = nilfs_rec_len_to_disk(to - from);
        dir->inode = 0;
        nilfs_commit_chunk(page, mapping, from, to);
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -624,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
        memset(kaddr, 0, chunk_size);
        de = (struct nilfs_dir_entry *)kaddr;
        de->name_len = 1;
-        de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+        de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
        memcpy(de->name, ".\0\0", 4);
        de->inode = cpu_to_le64(inode->i_ino);
        nilfs_set_de_type(de, inode);
        de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
        de->name_len = 2;
-        de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+        de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
        de->inode = cpu_to_le64(parent->i_ino);
        memcpy(de->name, "..\0", 4);
        nilfs_set_de_type(de, inode);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cdf..324d80c57518 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
 #include "alloc.h"
 #include "dat.h"
-static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
 {
        return (__le64 *)
-                ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+                ((struct nilfs_direct_node *)direct->b_u.u_data + 1);
 }
 static inline __u64
-nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
 {
-        return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
+        return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
 }
-static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
                                        __u64 key, __u64 ptr)
 {
-        *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
+        *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
 }
-static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
                               __u64 key, int level, __u64 *ptrp)
 {
-        struct nilfs_direct *direct;
        __u64 ptr;
-        direct = (struct nilfs_direct *)bmap;  /* XXX: use macro for level 1 */
        if (key > NILFS_DIRECT_KEY_MAX || level != 1)
                return -ENOENT;
        ptr = nilfs_direct_get_ptr(direct, key);
        if (ptr == NILFS_BMAP_INVALID_PTR)
                return -ENOENT;
-        if (ptrp != NULL)
+        *ptrp = ptr;
-                *ptrp = ptr;
        return 0;
 }
-static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
                                      __u64 key, __u64 *ptrp,
                                      unsigned maxblocks)
 {
-        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
        struct inode *dat = NULL;
        __u64 ptr, ptr2;
        sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
        if (ptr == NILFS_BMAP_INVALID_PTR)
                return -ENOENT;
-        if (NILFS_BMAP_USE_VBN(bmap)) {
+        if (NILFS_BMAP_USE_VBN(direct)) {
-                dat = nilfs_bmap_get_dat(bmap);
+                dat = nilfs_bmap_get_dat(direct);
                ret = nilfs_dat_translate(dat, ptr, &blocknr);
                if (ret < 0)
                        return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 }
 static __u64
-nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
 {
        __u64 ptr;
-        ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
+        ptr = nilfs_bmap_find_target_seq(direct, key);
        if (ptr != NILFS_BMAP_INVALID_PTR)
                /* sequential access */
                return ptr;
        else
                /* block group */
-                return nilfs_bmap_find_target_in_group(&direct->d_bmap);
+                return nilfs_bmap_find_target_in_group(direct);
-}
-static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
-                                      __u64 key, __u64 ptr)
-{
-        direct->d_bmap.b_last_allocated_key = key;
-        direct->d_bmap.b_last_allocated_ptr = ptr;
 }
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
        union nilfs_bmap_ptr_req req;
        struct inode *dat = NULL;
        struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
        if (key > NILFS_DIRECT_KEY_MAX)
                return -ENOENT;
-        if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+        if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
                return -EEXIST;
        if (NILFS_BMAP_USE_VBN(bmap)) {
-                req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+                req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
                dat = nilfs_bmap_get_dat(bmap);
        }
        ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
                set_buffer_nilfs_volatile(bh);
                nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
-                nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
+                nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
                if (!nilfs_bmap_dirty(bmap))
                        nilfs_bmap_set_dirty(bmap);
                if (NILFS_BMAP_USE_VBN(bmap))
-                        nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
+                        nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
                nilfs_bmap_add_blocks(bmap, 1);
        }
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
        union nilfs_bmap_ptr_req req;
        struct inode *dat;
        int ret;
        if (key > NILFS_DIRECT_KEY_MAX ||
-            nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+            nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
                return -ENOENT;
        dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-        req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
+        req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
        ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
        if (!ret) {
                nilfs_bmap_commit_end_ptr(bmap, &req, dat);
-                nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+                nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
                nilfs_bmap_sub_blocks(bmap, 1);
        }
        return ret;
 }
-static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
 {
-        struct nilfs_direct *direct;
        __u64 key, lastkey;
-        direct = (struct nilfs_direct *)bmap;
        lastkey = NILFS_DIRECT_KEY_MAX + 1;
        for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
                if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
        return key > NILFS_DIRECT_KEY_MAX;
 }
-static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
                                    __u64 *keys, __u64 *ptrs, int nitems)
 {
-        struct nilfs_direct *direct;
        __u64 key;
        __u64 ptr;
        int n;
-        direct = (struct nilfs_direct *)bmap;
        if (nitems > NILFS_DIRECT_NBLOCKS)
                nitems = NILFS_DIRECT_NBLOCKS;
        n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
                                    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
-        struct nilfs_direct *direct;
        __le64 *dptrs;
        int ret, i, j;
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
                bmap->b_ops->bop_clear(bmap);
        /* convert */
-        direct = (struct nilfs_direct *)bmap;
+        dptrs = nilfs_direct_dptrs(bmap);
-        dptrs = nilfs_direct_dptrs(direct);
        for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
                if ((j < n) && (i == keys[j])) {
                        dptrs[i] = (i != key) ?
-                                nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+                                cpu_to_le64(ptrs[j]) :
                                NILFS_BMAP_INVALID_PTR;
                        j++;
                } else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
        return 0;
 }
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
                                  struct buffer_head *bh)
 {
-        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
        struct nilfs_palloc_req oldreq, newreq;
        struct inode *dat;
        __u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
        dat = nilfs_bmap_get_dat(bmap);
        key = nilfs_bmap_data_get_key(bmap, bh);
-        ptr = nilfs_direct_get_ptr(direct, key);
+        ptr = nilfs_direct_get_ptr(bmap, key);
        if (!buffer_nilfs_volatile(bh)) {
                oldreq.pr_entry_nr = ptr;
                newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
                nilfs_dat_commit_update(dat, &oldreq, &newreq,
                                        bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
                set_buffer_nilfs_volatile(bh);
-                nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
+                nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
        } else
                ret = nilfs_dat_mark_dirty(dat, ptr);
        return ret;
 }
-static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
                                 __u64 key, __u64 ptr,
                                 struct buffer_head **bh,
                                 sector_t blocknr,
                                 union nilfs_binfo *binfo)
 {
-        struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
+        struct inode *dat = nilfs_bmap_get_dat(direct);
        union nilfs_bmap_ptr_req req;
        int ret;
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
        ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
        if (!ret) {
                nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
-                binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+                binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
-                binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+                binfo->bi_v.bi_blkoff = cpu_to_le64(key);
        }
        return ret;
 }
-static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
                                 __u64 key, __u64 ptr,
                                 struct buffer_head **bh,
                                 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
 {
        nilfs_direct_set_ptr(direct, key, blocknr);
-        binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+        binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
        binfo->bi_dat.bi_level = 0;
        return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
                               sector_t blocknr,
                               union nilfs_binfo *binfo)
 {
-        struct nilfs_direct *direct;
        __u64 key;
        __u64 ptr;
-        direct = (struct nilfs_direct *)bmap;
        key = nilfs_bmap_data_get_key(bmap, *bh);
        if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
                printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
                       (unsigned long long)key);
                return -EINVAL;
        }
-        ptr = nilfs_direct_get_ptr(direct, key);
+        ptr = nilfs_direct_get_ptr(bmap, key);
        if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
                printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
                       (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
        }
        return NILFS_BMAP_USE_VBN(bmap) ?
-                nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+                nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
-                nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
+                nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
 }
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d0..dc643de20a25 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
 #include "bmap.h"
-struct nilfs_direct;
 /**
 * struct nilfs_direct_node - direct node
 * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
        __u8 pad[7];
 };
-/**
- * struct nilfs_direct - direct mapping
- * @d_bmap: bmap structure
- */
-struct nilfs_direct {
-        struct nilfs_bmap d_bmap;
-};
 #define NILFS_DIRECT_NBLOCKS    (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
 #define NILFS_DIRECT_KEY_MIN    0
 #define NILFS_DIRECT_KEY_MAX    (NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3e..bed3a783129b 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,6 +48,8 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "mdt.h"
 #include "dat.h"
@@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
                                   __u64 vbn, struct buffer_head **out_bh)
 {
-        int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+        int ret;
-                                            vbn ? : pbn, pbn, out_bh);
+        ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+                                        vbn ? : pbn, pbn, READ, out_bh, &pbn);
        if (ret == -EEXIST) /* internal code (cache hit) */
                ret = 0;
        return ret;
@@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
        if (buffer_dirty(bh))
                return -EEXIST;
-        if (buffer_nilfs_node(bh))
+        if (buffer_nilfs_node(bh)) {
+                if (nilfs_btree_broken_node_block(bh)) {
+                        clear_buffer_uptodate(bh);
+                        return -EIO;
+                }
                nilfs_btnode_mark_dirty(bh);
-        else
+        } else {
                nilfs_mdt_mark_buffer_dirty(bh);
+        }
        return 0;
 }
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb6..d01aff4957d9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 47d6d7928122..0842d775b3e0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 /*
 * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
        __u32 i_flags;
        unsigned long  i_state;         /* Dynamic state flags */
        struct nilfs_bmap *i_bmap;
-        union nilfs_bmap_union i_bmap_union;
+        struct nilfs_bmap i_bmap_data;
        __u64 i_xattr;  /* sector_t ??? */
        __u32 i_dir_start_lookup;
        __u64 i_cno;            /* check point number for GC inode */
@@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-        return container_of((union nilfs_bmap_union *)bmap,
+        return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
-                            struct nilfs_inode_info,
-                            i_bmap_union);
 }
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -107,6 +104,14 @@ enum {
 };
 /*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+        NILFS_SB_COMMIT = 0,    /* Commit a super block alternately */
+        NILFS_SB_COMMIT_ALL     /* Commit both super blocks */
+};
+/*
 * Macros to check inode numbers
 */
 #define NILFS_MDT_INO_BITS   \
@@ -270,7 +275,14 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
                                        struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+                                             struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+                                 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+                                                      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
 extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e48130..aab11db2cb08 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 #define NILFS_BUFFER_INHERENT_BITS  \
        ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-         (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+         (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+         (1UL << BH_NILFS_Checked))
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
        lock_buffer(bh);
        clear_buffer_nilfs_volatile(bh);
+        clear_buffer_nilfs_checked(bh);
        clear_buffer_dirty(bh);
        if (nilfs_page_buffers_clean(page))
                __nilfs_clear_page_dirty(page);
@@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                clear_buffer_nilfs_volatile(bh);
+                                clear_buffer_nilfs_checked(bh);
                                clear_buffer_uptodate(bh);
                                clear_buffer_mapped(bh);
                                unlock_buffer(bh);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f8..f53d8da41ed7 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,13 @@ enum {
        BH_NILFS_Allocated = BH_PrivateStart,
        BH_NILFS_Node,
        BH_NILFS_Volatile,
+        BH_NILFS_Checked,
 };
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)    /* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)              /* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)        /* buffer is verified */
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index bae2a516b4ee..83e3d8c61a01 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err)
        return -EINVAL;
 }
-static void store_segsum_info(struct nilfs_segsum_info *ssi,
-                              struct nilfs_segment_summary *sum,
-                              unsigned int blocksize)
-{
-        ssi->flags = le16_to_cpu(sum->ss_flags);
-        ssi->seg_seq = le64_to_cpu(sum->ss_seq);
-        ssi->ctime = le64_to_cpu(sum->ss_create);
-        ssi->next = le64_to_cpu(sum->ss_next);
-        ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
-        ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
-        ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
-        ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
-        ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
-        /* need to verify ->ss_bytes field if read ->ss_cno */
-}
 /**
- * calc_crc_cont - check CRC of blocks continuously
+ * nilfs_compute_checksum - compute checksum of blocks continuously
- * @sbi: nilfs_sb_info
+ * @nilfs: nilfs object
 * @bhs: buffer head of start block
 * @sum: place to store result
 * @offset: offset bytes in the first block
@@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 * @start: DBN of start block
 * @nblock: number of blocks to be checked
 */
-static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
-                         u32 *sum, unsigned long offset, u64 check_bytes,
+                                  struct buffer_head *bhs, u32 *sum,
-                         sector_t start, unsigned long nblock)
+                                  unsigned long offset, u64 check_bytes,
+                                  sector_t start, unsigned long nblock)
 {
-        unsigned long blocksize = sbi->s_super->s_blocksize;
+        unsigned int blocksize = nilfs->ns_blocksize;
        unsigned long size;
        u32 crc;
        BUG_ON(offset >= blocksize);
        check_bytes -= offset;
        size = min_t(u64, check_bytes, blocksize - offset);
-        crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+        crc = crc32_le(nilfs->ns_crc_seed,
                       (unsigned char *)bhs->b_data + offset, size);
        if (--nblock > 0) {
                do {
-                        struct buffer_head *bh
+                        struct buffer_head *bh;
-                                = sb_bread(sbi->s_super, ++start);
+                        bh = __bread(nilfs->ns_bdev, ++start, blocksize);
                        if (!bh)
                                return -EIO;
                        check_bytes -= size;
@@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
 /**
 * nilfs_read_super_root_block - read super root block
- * @sb: super_block
+ * @nilfs: nilfs object
 * @sr_block: disk block number of the super root block
 * @pbh: address of a buffer_head pointer to return super root buffer
 * @check: CRC check flag
 */
-int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
                                struct buffer_head **pbh, int check)
 {
        struct buffer_head *bh_sr;
@@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
        int ret;
        *pbh = NULL;
-        bh_sr = sb_bread(sb, sr_block);
+        bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
        if (unlikely(!bh_sr)) {
                ret = NILFS_SEG_FAIL_IO;
                goto failed;
@@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
        if (check) {
                unsigned bytes = le16_to_cpu(sr->sr_bytes);
-                if (bytes == 0 || bytes > sb->s_blocksize) {
+                if (bytes == 0 || bytes > nilfs->ns_blocksize) {
                        ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
                        goto failed_bh;
                }
-                if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
+                if (nilfs_compute_checksum(
-                                  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+                            nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+                            sr_block, 1)) {
                        ret = NILFS_SEG_FAIL_IO;
                        goto failed_bh;
                }
@@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 }
 /**
- * load_segment_summary - read segment summary of the specified partial segment
+ * nilfs_read_log_header - read summary header of the specified log
- * @sbi: nilfs_sb_info
+ * @nilfs: nilfs object
- * @pseg_start: start disk block number of partial segment
+ * @start_blocknr: start block number of the log
- * @seg_seq: sequence number requested
+ * @sum: pointer to return segment summary structure
- * @ssi: pointer to nilfs_segsum_info struct to store information
 */
-static int
+static struct buffer_head *
-load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
-                     u64 seg_seq, struct nilfs_segsum_info *ssi)
+                      struct nilfs_segment_summary **sum)
 {
        struct buffer_head *bh_sum;
-        struct nilfs_segment_summary *sum;
+        bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+        if (bh_sum)
+                *sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+        return bh_sum;
+}
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+                              struct buffer_head *bh_sum,
+                              struct nilfs_segment_summary *sum)
+{
        unsigned long nblock;
        u32 crc;
-        int ret = NILFS_SEG_FAIL_IO;
+        int ret;
-        bh_sum = sb_bread(sbi->s_super, pseg_start);
+        ret = NILFS_SEG_FAIL_MAGIC;
-        if (!bh_sum)
+        if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
                goto out;
-        sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+        ret = NILFS_SEG_FAIL_SEQ;
+        if (le64_to_cpu(sum->ss_seq) != seg_seq)
-        /* Check consistency of segment summary */
+                goto out;
-        if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
-                ret = NILFS_SEG_FAIL_MAGIC;
-                goto failed;
-        }
-        store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
-        if (seg_seq != ssi->seg_seq) {
-                ret = NILFS_SEG_FAIL_SEQ;
-                goto failed;
-        }
-        nblock = ssi->nblocks;
+        nblock = le32_to_cpu(sum->ss_nblocks);
-        if (unlikely(nblock == 0 ||
+        ret = NILFS_SEG_FAIL_CONSISTENCY;
-                     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+        if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
                /* This limits the number of blocks read in the CRC check */
-                ret = NILFS_SEG_FAIL_CONSISTENCY;
+                goto out;
-                goto failed;
-        }
+        ret = NILFS_SEG_FAIL_IO;
-        if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
+        if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
-                          ((u64)nblock << sbi->s_super->s_blocksize_bits),
+                                   ((u64)nblock << nilfs->ns_blocksize_bits),
-                          pseg_start, nblock)) {
+                                   bh_sum->b_blocknr, nblock))
-                ret = NILFS_SEG_FAIL_IO;
+                goto out;
-                goto failed;
-        }
+        ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
-        if (crc == le32_to_cpu(sum->ss_datasum))
+        if (crc != le32_to_cpu(sum->ss_datasum))
-                ret = 0;
+                goto out;
-        else
+        ret = 0;
-                ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+out:
- failed:
-        brelse(bh_sum);
- out:
        return ret;
 }
-static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
+/**
-                        unsigned int *offset, unsigned int bytes)
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+                                     struct buffer_head **pbh,
+                                     unsigned int *offset, unsigned int bytes)
 {
        void *ptr;
        sector_t blocknr;
@@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
        if (bytes > (*pbh)->b_size - *offset) {
                blocknr = (*pbh)->b_blocknr;
                brelse(*pbh);
-                *pbh = sb_bread(sb, blocknr + 1);
+                *pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+                               nilfs->ns_blocksize);
                if (unlikely(!*pbh))
                        return NULL;
                *offset = 0;
@@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
        return ptr;
 }
-static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
+/**
-                        unsigned int *offset, unsigned int bytes,
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
-                        unsigned long count)
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+                                    struct buffer_head **pbh,
+                                    unsigned int *offset, unsigned int bytes,
+                                    unsigned long count)
 {
        unsigned int rest_item_in_current_block
                = ((*pbh)->b_size - *offset) / bytes;
@@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
                *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
                brelse(*pbh);
-                *pbh = sb_bread(sb, blocknr + bcnt);
+                *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+                               nilfs->ns_blocksize);
        }
 }
-static int
+/**
-collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
-                           struct nilfs_segsum_info *ssi,
+ * @nilfs: nilfs object
-                           struct list_head *head)
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to which nilfs_recovery_block structs are added
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+                                struct nilfs_segment_summary *sum,
+                                struct list_head *head)
 {
        struct buffer_head *bh;
        unsigned int offset;
-        unsigned long nfinfo = ssi->nfinfo;
+        u32 nfinfo, sumbytes;
-        sector_t blocknr = sum_blocknr + ssi->nsumblk;
+        sector_t blocknr;
        ino_t ino;
        int err = -EIO;
+        nfinfo = le32_to_cpu(sum->ss_nfinfo);
        if (!nfinfo)
                return 0;
-        bh = sb_bread(sbi->s_super, sum_blocknr);
+        sumbytes = le32_to_cpu(sum->ss_sumbytes);
+        blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+        bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
        if (unlikely(!bh))
                goto out;
-        offset = le16_to_cpu(
+        offset = le16_to_cpu(sum->ss_bytes);
-                ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
        for (;;) {
                unsigned long nblocks, ndatablk, nnodeblk;
                struct nilfs_finfo *finfo;
-                finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+                finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+                                                sizeof(*finfo));
                if (unlikely(!finfo))
                        goto out;
@@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
                        struct nilfs_recovery_block *rb;
                        struct nilfs_binfo_v *binfo;
-                        binfo = segsum_get(sbi->s_super, &bh, &offset,
+                        binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
-                                           sizeof(*binfo));
+                                                        sizeof(*binfo));
                        if (unlikely(!binfo))
                                goto out;
@@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
                }
                if (--nfinfo == 0)
                        break;
-                blocknr += nnodeblk; /* always 0 for the data sync segments */
+                blocknr += nnodeblk; /* always 0 for data sync logs */
-                segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
+                nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
-                            nnodeblk);
+                                        nnodeblk);
                if (unlikely(!bh))
                        goto out;
        }
@@ -467,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
        return err;
 }
-static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
                                     struct nilfs_recovery_block *rb,
                                     struct page *page)
 {
        struct buffer_head *bh_org;
        void *kaddr;
-        bh_org = sb_bread(sbi->s_super, rb->blocknr);
+        bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
        if (unlikely(!bh_org))
                return -EIO;
@@ -485,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
        return 0;
 }
-static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
-                                struct list_head *head,
+                                      struct nilfs_sb_info *sbi,
-                                unsigned long *nr_salvaged_blocks)
+                                      struct list_head *head,
+                                      unsigned long *nr_salvaged_blocks)
 {
        struct inode *inode;
        struct nilfs_recovery_block *rb, *n;
-        unsigned blocksize = sbi->s_super->s_blocksize;
+        unsigned blocksize = nilfs->ns_blocksize;
        struct page *page;
        loff_t pos;
        int err = 0, err2 = 0;
@@ -511,7 +529,7 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
                if (unlikely(err))
                        goto failed_inode;
-                err = nilfs_recovery_copy_block(sbi, rb, page);
+                err = nilfs_recovery_copy_block(nilfs, rb, page);
                if (unlikely(err))
                        goto failed_page;
@@ -551,18 +569,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 /**
 * nilfs_do_roll_forward - salvage logical segments newer than the latest
 * checkpoint
+ * @nilfs: nilfs object
 * @sbi: nilfs_sb_info
- * @nilfs: the_nilfs
 * @ri: pointer to a nilfs_recovery_info
 */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                                 struct nilfs_sb_info *sbi,
                                 struct nilfs_recovery_info *ri)
 {
-        struct nilfs_segsum_info ssi;
+        struct buffer_head *bh_sum = NULL;
+        struct nilfs_segment_summary *sum;
        sector_t pseg_start;
        sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
        unsigned long nsalvaged_blocks = 0;
+        unsigned int flags;
        u64 seg_seq;
        __u64 segnum, nextnum = 0;
        int empty_seg = 0;
@@ -581,8 +601,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
        nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
        while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+                brelse(bh_sum);
+                bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+                if (!bh_sum) {
+                        err = -EIO;
+                        goto failed;
+                }
-                ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+                ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
                if (ret) {
                        if (ret == NILFS_SEG_FAIL_IO) {
                                err = -EIO;
@@ -590,33 +616,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                        }
                        goto strayed;
                }
-                if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+                flags = le16_to_cpu(sum->ss_flags);
+                if (flags & NILFS_SS_SR)
                        goto confused;
                /* Found a valid partial segment; do recovery actions */
-                nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+                nextnum = nilfs_get_segnum_of_block(nilfs,
+                                                    le64_to_cpu(sum->ss_next));
                empty_seg = 0;
-                nilfs->ns_ctime = ssi.ctime;
+                nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
-                if (!(ssi.flags & NILFS_SS_GC))
+                if (!(flags & NILFS_SS_GC))
-                        nilfs->ns_nongc_ctime = ssi.ctime;
+                        nilfs->ns_nongc_ctime = nilfs->ns_ctime;
                switch (state) {
                case RF_INIT_ST:
-                        if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+                        if (!(flags & NILFS_SS_LOGBGN) ||
+                            !(flags & NILFS_SS_SYNDT))
                                goto try_next_pseg;
                        state = RF_DSYNC_ST;
                        /* Fall through */
                case RF_DSYNC_ST:
-                        if (!NILFS_SEG_DSYNC(&ssi))
+                        if (!(flags & NILFS_SS_SYNDT))
                                goto confused;
-                        err = collect_blocks_from_segsum(
+                        err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
-                                sbi, pseg_start, &ssi, &dsync_blocks);
+                                                   &dsync_blocks);
                        if (unlikely(err))
                                goto failed;
-                        if (NILFS_SEG_LOGEND(&ssi)) {
+                        if (flags & NILFS_SS_LOGEND) {
-                                err = recover_dsync_blocks(
+                                err = nilfs_recover_dsync_blocks(
-                                        sbi, &dsync_blocks, &nsalvaged_blocks);
+                                        nilfs, sbi, &dsync_blocks,
+                                        &nsalvaged_blocks);
                                if (unlikely(err))
                                        goto failed;
                                state = RF_INIT_ST;
@@ -627,7 +658,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 try_next_pseg:
                if (pseg_start == ri->ri_lsegs_end)
                        break;
-                pseg_start += ssi.nblocks;
+                pseg_start += le32_to_cpu(sum->ss_nblocks);
                if (pseg_start < seg_end)
                        continue;
                goto feed_segment;
@@ -652,8 +683,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
        }
 out:
+        brelse(bh_sum);
        dispose_recovery_list(&dsync_blocks);
-        nilfs_detach_writer(sbi->s_nilfs, sbi);
+        nilfs_detach_writer(nilfs, sbi);
        return err;
 confused:
@@ -667,7 +699,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 }
 static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
-                                      struct nilfs_sb_info *sbi,
                                      struct nilfs_recovery_info *ri)
 {
        struct buffer_head *bh;
@@ -677,7 +708,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
            nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
                return;
-        bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+        bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
        BUG_ON(!bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_dirty(bh);
@@ -690,9 +721,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 }
 /**
- * nilfs_recover_logical_segments - salvage logical segments written after
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
- * the latest super root
+ * @nilfs: nilfs object
- * @nilfs: the_nilfs
 * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
@@ -709,9 +739,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 *
 * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
-                                   struct nilfs_sb_info *sbi,
+                              struct nilfs_sb_info *sbi,
-                                   struct nilfs_recovery_info *ri)
+                              struct nilfs_recovery_info *ri)
 {
        int err;
@@ -751,7 +781,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
                        goto failed;
                }
-                nilfs_finish_roll_forward(nilfs, sbi, ri);
+                nilfs_finish_roll_forward(nilfs, ri);
        }
 failed:
@@ -762,7 +792,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 /**
 * nilfs_search_super_root - search the latest valid super root
 * @nilfs: the_nilfs
- * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
 * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -775,14 +804,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 * %-EINVAL - No valid segment found
 *
 * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
+int nilfs_search_super_root(struct the_nilfs *nilfs,
                            struct nilfs_recovery_info *ri)
 {
-        struct nilfs_segsum_info ssi;
+        struct buffer_head *bh_sum = NULL;
+        struct nilfs_segment_summary *sum;
        sector_t pseg_start, pseg_end, sr_pseg_start = 0;
        sector_t seg_start, seg_end; /* range of full segment (block number) */
        sector_t b, end;
+        unsigned long nblocks;
+        unsigned int flags;
        u64 seg_seq;
        __u64 segnum, nextnum = 0;
        __u64 cno;
@@ -801,17 +835,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        /* Read ahead segment */
        b = seg_start;
        while (b <= seg_end)
-                sb_breadahead(sbi->s_super, b++);
+                __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
        for (;;) {
-                /* Load segment summary */
+                brelse(bh_sum);
-                ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+                ret = NILFS_SEG_FAIL_IO;
+                bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+                if (!bh_sum)
+                        goto failed;
+                ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
                if (ret) {
                        if (ret == NILFS_SEG_FAIL_IO)
                                goto failed;
                        goto strayed;
                }
-                pseg_end = pseg_start + ssi.nblocks - 1;
+                nblocks = le32_to_cpu(sum->ss_nblocks);
+                pseg_end = pseg_start + nblocks - 1;
                if (unlikely(pseg_end > seg_end)) {
                        ret = NILFS_SEG_FAIL_CONSISTENCY;
                        goto strayed;
@@ -821,11 +862,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                ri->ri_pseg_start = pseg_start;
                ri->ri_seq = seg_seq;
                ri->ri_segnum = segnum;
-                nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+                nextnum = nilfs_get_segnum_of_block(nilfs,
+                                                    le64_to_cpu(sum->ss_next));
                ri->ri_nextnum = nextnum;
                empty_seg = 0;
-                if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+                flags = le16_to_cpu(sum->ss_flags);
+                if (!(flags & NILFS_SS_SR) && !scan_newer) {
                        /* This will never happen because a superblock
                           (last_segment) always points to a pseg
                           having a super root. */
@@ -836,14 +879,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                if (pseg_start == seg_start) {
                        nilfs_get_segment_range(nilfs, nextnum, &b, &end);
                        while (b <= end)
-                                sb_breadahead(sbi->s_super, b++);
+                                __breadahead(nilfs->ns_bdev, b++,
+                                             nilfs->ns_blocksize);
                }
-                if (!NILFS_SEG_HAS_SR(&ssi)) {
+                if (!(flags & NILFS_SS_SR)) {
-                        if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+                        if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
                                ri->ri_lsegs_start = pseg_start;
                                ri->ri_lsegs_start_seq = seg_seq;
                        }
-                        if (NILFS_SEG_LOGEND(&ssi))
+                        if (flags & NILFS_SS_LOGEND)
                                ri->ri_lsegs_end = pseg_start;
                        goto try_next_pseg;
                }
@@ -854,12 +898,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
                nilfs_dispose_segment_list(&segments);
-                nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
+                sr_pseg_start = pseg_start;
-                        + ssi.nblocks - seg_start;
+                nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
                nilfs->ns_seg_seq = seg_seq;
                nilfs->ns_segnum = segnum;
                nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
-                nilfs->ns_ctime = ssi.ctime;
+                nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
                nilfs->ns_nextnum = nextnum;
                if (scan_newer)
@@ -870,15 +914,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                        scan_newer = 1;
                }
-                /* reset region for roll-forward */
-                pseg_start += ssi.nblocks;
-                if (pseg_start < seg_end)
-                        continue;
-                goto feed_segment;
 try_next_pseg:
                /* Standing on a course, or met an inconsistent state */
-                pseg_start += ssi.nblocks;
+                pseg_start += nblocks;
                if (pseg_start < seg_end)
                        continue;
                goto feed_segment;
@@ -909,6 +947,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 super_root_found:
        /* Updating pointers relating to the latest checkpoint */
+        brelse(bh_sum);
        list_splice_tail(&segments, &ri->ri_used_segments);
        nilfs->ns_last_pseg = sr_pseg_start;
        nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -916,6 +955,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        return 0;
 failed:
+        brelse(bh_sum);
        nilfs_dispose_segment_list(&segments);
        return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
 }
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 85fbb66455e2..b04f08cc2397 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -54,17 +54,6 @@ struct nilfs_segsum_info {
        sector_t                next;
 };
-/* macro for the flags */
-#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
-#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
-#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
-#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
-#define NILFS_SEG_SIMPLEX(sum) \
-        (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
-         (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
-#define NILFS_SEG_EMPTY(sum)    ((sum)->nblocks == (sum)->nsumblk)
 /**
 * struct nilfs_segment_buffer - Segment buffer
 * @sb_super: back pointer to a superblock struct
@@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
                                struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
+/**
+ * nilfs_segbuf_simplex - test whether both log-boundary flags are set
+ * @segbuf: segment buffer whose summary flags are examined
+ *
+ * Returns nonzero only when both NILFS_SS_LOGBGN and NILFS_SS_LOGEND are
+ * set in @segbuf->sb_sum.flags, and zero otherwise.
+ */
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+        unsigned int flags = segbuf->sb_sum.flags;
+        return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+                (NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+/**
+ * nilfs_segbuf_empty - compare total and summary block counts of a buffer
+ * @segbuf: segment buffer to be checked
+ *
+ * Returns nonzero when @segbuf->sb_sum.nblocks equals
+ * @segbuf->sb_sum.nsumblk, and zero otherwise.
+ */
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+        return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
                               struct buffer_head *bh)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9201649cc49..9fd051a33c4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1914,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
                        }
                }
-                if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
+                if (!nilfs_segbuf_simplex(segbuf)) {
-                        if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+                        if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
                                set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
                                sci->sc_lseg_stime = jiffies;
                        }
-                        if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+                        if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
                                clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
                }
        }
@@ -1951,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
        if (update_sr) {
                nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
                                       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-                set_nilfs_sb_dirty(nilfs);
                clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
                clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2082,7 +2081,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                /* Avoid empty segment */
                if (sci->sc_stage.scnt == NILFS_ST_DONE &&
-                    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
+                    nilfs_segbuf_empty(sci->sc_curseg)) {
                        nilfs_segctor_abort_construction(sci, nilfs, 1);
                        goto out;
                }
@@ -2408,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
        struct nilfs_sb_info *sbi = sci->sc_sbi;
        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct nilfs_super_block **sbp;
        int err = 0;
        nilfs_segctor_accept(sci);
@@ -2423,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
                if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
                    nilfs_discontinued(nilfs)) {
                        down_write(&nilfs->ns_sem);
-                        err = nilfs_commit_super(
+                        err = -EIO;
-                                sbi, nilfs_altsb_need_update(nilfs));
+                        sbp = nilfs_prepare_super(sbi,
+                                                  nilfs_sb_will_flip(nilfs));
+                        if (likely(sbp)) {
+                                nilfs_set_log_cursor(sbp[0], nilfs);
+                                err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+                        }
                        up_write(&nilfs->ns_sem);
                }
        }
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 01e20dbb217d..17c487bd8152 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -234,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 /* recovery.c */
-extern int nilfs_read_super_root_block(struct super_block *, sector_t,
+extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
                                       struct buffer_head **, int);
-extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
+extern int nilfs_search_super_root(struct the_nilfs *,
                                   struct nilfs_recovery_info *);
-extern int nilfs_recover_logical_segments(struct the_nilfs *,
+extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
-                                          struct nilfs_sb_info *,
+                                     struct nilfs_sb_info *,
-                                          struct nilfs_recovery_info *);
+                                     struct nilfs_recovery_info *);
 extern void nilfs_dispose_segment_list(struct list_head *);
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 414ef68931cf..26078b3407c9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -55,6 +55,8 @@
 #include "nilfs.h"
 #include "mdt.h"
 #include "alloc.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "cpfile.h"
 #include "ifile.h"
@@ -74,6 +76,25 @@ struct kmem_cache *nilfs_btree_path_cache;
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+/**
+ * nilfs_set_error - record the error state in memory and in the super blocks
+ * @sbi: nilfs_sb_info
+ *
+ * Takes nilfs->ns_sem for writing.  If NILFS_ERROR_FS is not yet set in
+ * nilfs->ns_mount_state, the flag is set there first; then, provided
+ * nilfs_prepare_super() returns super block pointers, the flag is ORed
+ * into s_state of sbp[0] (and of sbp[1] when it is non-NULL) and
+ * nilfs_commit_super() is called with NILFS_SB_COMMIT_ALL.  The result
+ * of nilfs_commit_super() is not checked here.
+ */
+static void nilfs_set_error(struct nilfs_sb_info *sbi)
+{
+        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct nilfs_super_block **sbp;
+        down_write(&nilfs->ns_sem);
+        if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+                nilfs->ns_mount_state |= NILFS_ERROR_FS;
+                sbp = nilfs_prepare_super(sbi, 0);
+                if (likely(sbp)) {
+                        sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+                        if (sbp[1])
+                                sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+                        nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+                }
+        }
+        up_write(&nilfs->ns_sem);
+}
 /**
 * nilfs_error() - report failure condition on a filesystem
 *
@@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
        va_end(args);
        if (!(sb->s_flags & MS_RDONLY)) {
-                struct the_nilfs *nilfs = sbi->s_nilfs;
+                nilfs_set_error(sbi);
-                down_write(&nilfs->ns_sem);
-                if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
-                        nilfs->ns_mount_state |= NILFS_ERROR_FS;
-                        nilfs->ns_sbp[0]->s_state |=
-                                cpu_to_le16(NILFS_ERROR_FS);
-                        nilfs_commit_super(sbi, 1);
-                }
-                up_write(&nilfs->ns_sem);
                if (nilfs_test_opt(sbi, ERRORS_RO)) {
                        printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -176,7 +188,7 @@ static void nilfs_clear_inode(struct inode *inode)
        nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }
-static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
+static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
        int err;
@@ -202,12 +214,20 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
                printk(KERN_ERR
                       "NILFS: unable to write superblock (err=%d)\n", err);
                if (err == -EIO && nilfs->ns_sbh[1]) {
+                        /*
+                         * sbp[0] points to a newer log than sbp[1],
+                         * so copy sbp[0] to sbp[1] to take over sbp[0].
+                         */
+                        memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
+                               nilfs->ns_sbsize);
                        nilfs_fall_back_super_block(nilfs);
                        goto retry;
                }
        } else {
                struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
+                nilfs->ns_sbwcount++;
                /*
                 * The latest segment becomes trailable from the position
                 * written in superblock.
@@ -216,66 +236,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
                /* update GC protection for recent segments */
                if (nilfs->ns_sbh[1]) {
-                        sbp = NULL;
+                        if (flag == NILFS_SB_COMMIT_ALL) {
-                        if (dupsb) {
                                set_buffer_dirty(nilfs->ns_sbh[1]);
-                                if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
+                                if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
-                                        sbp = nilfs->ns_sbp[1];
+                                        goto out;
                        }
+                        if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
+                            le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
+                                sbp = nilfs->ns_sbp[1];
                }
-                if (sbp) {
-                        spin_lock(&nilfs->ns_last_segment_lock);
-                        nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
-                        spin_unlock(&nilfs->ns_last_segment_lock);
-                }
-        }
+                spin_lock(&nilfs->ns_last_segment_lock);
+                nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+                spin_unlock(&nilfs->ns_last_segment_lock);
+        }
+ out:
        return err;
 }
-int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
+void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
+                          struct the_nilfs *nilfs)
+{
+        sector_t nfreeblocks;
+        /* nilfs->ns_sem must be locked by the caller. */
+        nilfs_count_free_blocks(nilfs, &nfreeblocks);
+        sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+        spin_lock(&nilfs->ns_last_segment_lock);
+        sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+        sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+        sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+        spin_unlock(&nilfs->ns_last_segment_lock);
+}
+struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
+                                               int flip)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
        struct nilfs_super_block **sbp = nilfs->ns_sbp;
-        sector_t nfreeblocks;
-        time_t t;
-        int err;
-        /* nilfs->sem must be locked by the caller. */
+        /* nilfs->ns_sem must be locked by the caller. */
        if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
-                if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
+                if (sbp[1] &&
-                        nilfs_swap_super_block(nilfs);
+                    sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
-                else {
+                        memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+                } else {
                        printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
                               sbi->s_super->s_id);
-                        return -EIO;
+                        return NULL;
                }
+        } else if (sbp[1] &&
+                   sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+                        memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
        }
-        err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
-        if (unlikely(err)) {
-                printk(KERN_ERR "NILFS: failed to count free blocks\n");
-                return err;
-        }
-        spin_lock(&nilfs->ns_last_segment_lock);
-        sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
-        sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
-        sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
-        spin_unlock(&nilfs->ns_last_segment_lock);
+        if (flip && sbp[1])
+                nilfs_swap_super_block(nilfs);
+        return sbp;
+}
+int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
+{
+        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct nilfs_super_block **sbp = nilfs->ns_sbp;
+        time_t t;
+        /* nilfs->ns_sem must be locked by the caller. */
        t = get_seconds();
-        nilfs->ns_sbwtime[0] = t;
+        nilfs->ns_sbwtime = t;
-        sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
        sbp[0]->s_wtime = cpu_to_le64(t);
        sbp[0]->s_sum = 0;
        sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
                                             (unsigned char *)sbp[0],
                                             nilfs->ns_sbsize));
-        if (dupsb && sbp[1]) {
+        if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
-                memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+                sbp[1]->s_wtime = sbp[0]->s_wtime;
-                nilfs->ns_sbwtime[1] = t;
+                sbp[1]->s_sum = 0;
+                sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+                                            (unsigned char *)sbp[1],
+                                            nilfs->ns_sbsize));
        }
        clear_nilfs_sb_dirty(nilfs);
-        return nilfs_sync_super(sbi, dupsb);
+        return nilfs_sync_super(sbi, flag);
+}
+/**
+ * nilfs_cleanup_super() - write filesystem state for cleanup
+ * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
+ *
+ * This function restores state flags in the on-disk super block.
+ * This will set the "clean" flag (i.e. NILFS_VALID_FS) unless the
+ * filesystem was not clean previously.
+ */
+int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
+{
+        struct nilfs_super_block **sbp;
+        int flag = NILFS_SB_COMMIT;
+        int ret = -EIO;
+        sbp = nilfs_prepare_super(sbi, 0);
+        if (sbp) {
+                sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
+                nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
+                if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
+                        /*
+                         * also apply the "clean" flag to the opposite
+                         * super block if both super blocks point to
+                         * the same checkpoint.
+                         */
+                        sbp[1]->s_state = sbp[0]->s_state;
+                        flag = NILFS_SB_COMMIT_ALL;
+                }
+                ret = nilfs_commit_super(sbi, flag);
+        }
+        return ret;
 }
 static void nilfs_put_super(struct super_block *sb)
@@ -289,8 +365,7 @@ static void nilfs_put_super(struct super_block *sb)
        if (!(sb->s_flags & MS_RDONLY)) {
                down_write(&nilfs->ns_sem);
-                nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+                nilfs_cleanup_super(sbi);
-                nilfs_commit_super(sbi, 1);
                up_write(&nilfs->ns_sem);
        }
        down_write(&nilfs->ns_super_sem);
@@ -311,6 +386,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
+        struct nilfs_super_block **sbp;
        int err = 0;
        /* This function is called when super block should be written back */
@@ -318,8 +394,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
                err = nilfs_construct_segment(sb);
        down_write(&nilfs->ns_sem);
-        if (nilfs_sb_dirty(nilfs))
+        if (nilfs_sb_dirty(nilfs)) {
-                nilfs_commit_super(sbi, 1);
+                sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
+                if (likely(sbp)) {
+                        nilfs_set_log_cursor(sbp[0], nilfs);
+                        nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+                }
+        }
        up_write(&nilfs->ns_sem);
        return err;
@@ -442,20 +523,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        if (!nilfs_test_opt(sbi, BARRIER))
-                seq_printf(seq, ",nobarrier");
+                seq_puts(seq, ",nobarrier");
        if (nilfs_test_opt(sbi, SNAPSHOT))
                seq_printf(seq, ",cp=%llu",
                           (unsigned long long int)sbi->s_snapshot_cno);
        if (nilfs_test_opt(sbi, ERRORS_PANIC))
-                seq_printf(seq, ",errors=panic");
+                seq_puts(seq, ",errors=panic");
        if (nilfs_test_opt(sbi, ERRORS_CONT))
-                seq_printf(seq, ",errors=continue");
+                seq_puts(seq, ",errors=continue");
        if (nilfs_test_opt(sbi, STRICT_ORDER))
-                seq_printf(seq, ",order=strict");
+                seq_puts(seq, ",order=strict");
        if (nilfs_test_opt(sbi, NORECOVERY))
-                seq_printf(seq, ",norecovery");
+                seq_puts(seq, ",norecovery");
        if (nilfs_test_opt(sbi, DISCARD))
-                seq_printf(seq, ",discard");
+                seq_puts(seq, ",discard");
        return 0;
 }
@@ -524,23 +605,25 @@ static const struct export_operations nilfs_export_ops = {
 enum {
        Opt_err_cont, Opt_err_panic, Opt_err_ro,
-        Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
+        Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-        Opt_discard, Opt_err,
+        Opt_discard, Opt_nodiscard, Opt_err,
 };
 static match_table_t tokens = {
        {Opt_err_cont, "errors=continue"},
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
+        {Opt_barrier, "barrier"},
        {Opt_nobarrier, "nobarrier"},
        {Opt_snapshot, "cp=%u"},
        {Opt_order, "order=%s"},
        {Opt_norecovery, "norecovery"},
        {Opt_discard, "discard"},
+        {Opt_nodiscard, "nodiscard"},
        {Opt_err, NULL}
 };
-static int parse_options(char *options, struct super_block *sb)
+static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        char *p;
@@ -557,6 +640,9 @@ static int parse_options(char *options, struct super_block *sb)
                token = match_token(p, tokens, args);
                switch (token) {
+                case Opt_barrier:
+                        nilfs_set_opt(sbi, BARRIER);
+                        break;
                case Opt_nobarrier:
                        nilfs_clear_opt(sbi, BARRIER);
                        break;
@@ -582,8 +668,26 @@ static int parse_options(char *options, struct super_block *sb)
                case Opt_snapshot:
                        if (match_int(&args[0], &option) || option <= 0)
                                return 0;
-                        if (!(sb->s_flags & MS_RDONLY))
+                        if (is_remount) {
+                                if (!nilfs_test_opt(sbi, SNAPSHOT)) {
+                                        printk(KERN_ERR
+                                               "NILFS: cannot change regular "
+                                               "mount to snapshot.\n");
+                                        return 0;
+                                } else if (option != sbi->s_snapshot_cno) {
+                                        printk(KERN_ERR
+                                               "NILFS: cannot remount to a "
+                                               "different snapshot.\n");
+                                        return 0;
+                                }
+                                break;
+                        }
+                        if (!(sb->s_flags & MS_RDONLY)) {
+                                printk(KERN_ERR "NILFS: cannot mount snapshot "
+                                       "read/write.  A read-only option is "
+                                       "required.\n");
                                return 0;
+                        }
                        sbi->s_snapshot_cno = option;
                        nilfs_set_opt(sbi, SNAPSHOT);
                        break;
@@ -593,6 +697,9 @@ static int parse_options(char *options, struct super_block *sb)
                case Opt_discard:
                        nilfs_set_opt(sbi, DISCARD);
                        break;
+                case Opt_nodiscard:
+                        nilfs_clear_opt(sbi, DISCARD);
+                        break;
                default:
                        printk(KERN_ERR
                               "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -613,11 +720,18 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 {
        struct the_nilfs *nilfs = sbi->s_nilfs;
-        struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
+        struct nilfs_super_block **sbp;
-        int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
+        int max_mnt_count;
-        int mnt_count = le16_to_cpu(sbp->s_mnt_count);
+        int mnt_count;
+        /* nilfs->ns_sem must be locked by the caller. */
+        sbp = nilfs_prepare_super(sbi, 0);
+        if (!sbp)
+                return -EIO;
+        max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
+        mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
-        /* nilfs->sem must be locked by the caller. */
        if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
                printk(KERN_WARNING
                       "NILFS warning: mounting fs with errors\n");
@@ -628,12 +742,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 #endif
        }
        if (!max_mnt_count)
-                sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+                sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
-        sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
+        sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
-        sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
+        sbp[0]->s_state =
-        sbp->s_mtime = cpu_to_le64(get_seconds());
+                cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
-        return nilfs_commit_super(sbi, 1);
+        sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+        /* synchronize sbp[1] with sbp[0] */
+        memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+        return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -670,7 +787,31 @@ int nilfs_store_magic_and_option(struct super_block *sb,
        sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
        sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
-        return !parse_options(data, sb) ? -EINVAL : 0 ;
+        return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+}
+int nilfs_check_feature_compatibility(struct super_block *sb,
+                                      struct nilfs_super_block *sbp)
+{
+        __u64 features;
+        features = le64_to_cpu(sbp->s_feature_incompat) &
+                ~NILFS_FEATURE_INCOMPAT_SUPP;
+        if (features) {
+                printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+                       "optional features (%llx)\n",
+                       (unsigned long long)features);
+                return -EINVAL;
+        }
+        features = le64_to_cpu(sbp->s_feature_compat_ro) &
+                ~NILFS_FEATURE_COMPAT_RO_SUPP;
+        if (!(sb->s_flags & MS_RDONLY) && features) {
+                printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
+                       "unsupported optional features (%llx)\n",
+                       (unsigned long long)features);
+                return -EINVAL;
+        }
+        return 0;
 }
 /**
@@ -819,7 +960,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
-        struct nilfs_super_block *sbp;
        struct the_nilfs *nilfs = sbi->s_nilfs;
        unsigned long old_sb_flags;
        struct nilfs_mount_options old_opts;
@@ -833,32 +973,17 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        old_opts.snapshot_cno = sbi->s_snapshot_cno;
        was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
-        if (!parse_options(data, sb)) {
+        if (!parse_options(data, sb, 1)) {
                err = -EINVAL;
                goto restore_opts;
        }
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
        err = -EINVAL;
-        if (was_snapshot) {
+        if (was_snapshot && !(*flags & MS_RDONLY)) {
-                if (!(*flags & MS_RDONLY)) {
+                printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
-                        printk(KERN_ERR "NILFS (device %s): cannot remount "
+                       "read/write.\n", sb->s_id);
-                               "snapshot read/write.\n",
+                goto restore_opts;
-                               sb->s_id);
-                        goto restore_opts;
-                } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
-                        printk(KERN_ERR "NILFS (device %s): cannot "
-                               "remount to a different snapshot.\n",
-                               sb->s_id);
-                        goto restore_opts;
-                }
-        } else {
-                if (nilfs_test_opt(sbi, SNAPSHOT)) {
-                        printk(KERN_ERR "NILFS (device %s): cannot change "
-                               "a regular mount to a snapshot.\n",
-                               sb->s_id);
-                        goto restore_opts;
-                }
        }
        if (!nilfs_valid_fs(nilfs)) {
@@ -880,19 +1005,29 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                 * the RDONLY flag and then mark the partition as valid again.
                 */
                down_write(&nilfs->ns_sem);
-                sbp = nilfs->ns_sbp[0];
+                nilfs_cleanup_super(sbi);
-                if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
-                    (nilfs->ns_mount_state & NILFS_VALID_FS))
-                        sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
-                sbp->s_mtime = cpu_to_le64(get_seconds());
-                nilfs_commit_super(sbi, 1);
                up_write(&nilfs->ns_sem);
        } else {
+                __u64 features;
                /*
                 * Mounting a RDONLY partition read-write, so reread and
                 * store the current valid flag.  (It may have been changed
                 * by fsck since we originally mounted the partition.)
                 */
+                down_read(&nilfs->ns_sem);
+                features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+                        ~NILFS_FEATURE_COMPAT_RO_SUPP;
+                up_read(&nilfs->ns_sem);
+                if (features) {
+                        printk(KERN_WARNING "NILFS (device %s): couldn't "
+                               "remount RDWR because of unsupported optional "
+                               "features (%llx)\n",
+                               sb->s_id, (unsigned long long)features);
+                        err = -EROFS;
+                        goto restore_opts;
+                }
                sb->s_flags &= ~MS_RDONLY;
                err = nilfs_attach_segment_constructor(sbi);
@@ -1119,7 +1254,7 @@ static void nilfs_inode_init_once(void *obj)
        init_rwsem(&ii->xattr_sem);
 #endif
        nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-        ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
+        ii->i_bmap = &ii->i_bmap_data;
        inode_init_once(&ii->vfs_inode);
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8c1097327abc..37de1f062d81 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -38,6 +38,8 @@
 static LIST_HEAD(nilfs_objects);
 static DEFINE_SPINLOCK(nilfs_lock);
+static int nilfs_valid_sb(struct nilfs_super_block *sbp);
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
                            sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -45,6 +47,16 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
        nilfs->ns_last_pseg = start_blocknr;
        nilfs->ns_last_seq = seq;
        nilfs->ns_last_cno = cno;
+        if (!nilfs_sb_dirty(nilfs)) {
+                if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
+                        goto stay_cursor;
+                set_nilfs_sb_dirty(nilfs);
+        }
+        nilfs->ns_prev_seq = nilfs->ns_last_seq;
+ stay_cursor:
        spin_unlock(&nilfs->ns_last_segment_lock);
 }
@@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs)
        kfree(nilfs);
 }
-static int nilfs_load_super_root(struct the_nilfs *nilfs,
+static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
-                                 struct nilfs_sb_info *sbi, sector_t sr_block)
 {
        struct buffer_head *bh_sr;
        struct nilfs_super_root *raw_sr;
@@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
        unsigned inode_size;
        int err;
-        err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
+        err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
        if (unlikely(err))
                return err;
@@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
 }
 /**
+ * nilfs_store_log_cursor - load log cursor from a super block
+ * @nilfs: nilfs object
+ * @sbp: buffer storing super block to be read
+ *
+ * nilfs_store_log_cursor() reads the last position of the log
+ * containing a super root from a given super block, and initializes
+ * relevant information on the nilfs object preparatory for log
+ * scanning and recovery.
+ */
+static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
+                                  struct nilfs_super_block *sbp)
+{
+        int ret = 0;
+        nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+        nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+        nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+        nilfs->ns_prev_seq = nilfs->ns_last_seq;
+        nilfs->ns_seg_seq = nilfs->ns_last_seq;
+        nilfs->ns_segnum =
+                nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+        nilfs->ns_cno = nilfs->ns_last_cno + 1;
+        if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+                printk(KERN_ERR "NILFS invalid last segment number.\n");
+                ret = -EINVAL;
+        }
+        return ret;
+}
+/**
 * load_nilfs - load and recover the nilfs
 * @nilfs: the_nilfs structure to be released
 * @sbi: nilfs_sb_info used to recover past segment
@@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        nilfs_init_recovery_info(&ri);
-        err = nilfs_search_super_root(nilfs, sbi, &ri);
+        err = nilfs_search_super_root(nilfs, &ri);
        if (unlikely(err)) {
-                printk(KERN_ERR "NILFS: error searching super root.\n");
+                struct nilfs_super_block **sbp = nilfs->ns_sbp;
-                goto failed;
+                int blocksize;
+                if (err != -EINVAL)
+                        goto scan_error;
+                if (!nilfs_valid_sb(sbp[1])) {
+                        printk(KERN_WARNING
+                               "NILFS warning: unable to fall back to spare"
+                               "super block\n");
+                        goto scan_error;
+                }
+                printk(KERN_INFO
+                       "NILFS: try rollback from an earlier position\n");
+                /*
+                 * restore super block with its spare and reconfigure
+                 * relevant states of the nilfs object.
+                 */
+                memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+                nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
+                nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+                /* verify consistency between two super blocks */
+                blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+                if (blocksize != nilfs->ns_blocksize) {
+                        printk(KERN_WARNING
+                               "NILFS warning: blocksize differs between "
+                               "two super blocks (%d != %d)\n",
+                               blocksize, nilfs->ns_blocksize);
+                        goto scan_error;
+                }
+                err = nilfs_store_log_cursor(nilfs, sbp[0]);
+                if (err)
+                        goto scan_error;
+                /* drop clean flag to allow roll-forward and recovery */
+                nilfs->ns_mount_state &= ~NILFS_VALID_FS;
+                valid_fs = 0;
+                err = nilfs_search_super_root(nilfs, &ri);
+                if (err)
+                        goto scan_error;
        }
-        err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
+        err = nilfs_load_super_root(nilfs, ri.ri_super_root);
        if (unlikely(err)) {
                printk(KERN_ERR "NILFS: error loading super root.\n");
                goto failed;
@@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
                goto skip_recovery;
        if (s_flags & MS_RDONLY) {
+                __u64 features;
                if (nilfs_test_opt(sbi, NORECOVERY)) {
                        printk(KERN_INFO "NILFS: norecovery option specified. "
                               "skipping roll-forward recovery\n");
                        goto skip_recovery;
                }
+                features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+                        ~NILFS_FEATURE_COMPAT_RO_SUPP;
+                if (features) {
+                        printk(KERN_ERR "NILFS: couldn't proceed with "
+                               "recovery because of unsupported optional "
+                               "features (%llx)\n",
+                               (unsigned long long)features);
+                        err = -EROFS;
+                        goto failed_unload;
+                }
                if (really_read_only) {
                        printk(KERN_ERR "NILFS: write access "
                               "unavailable, cannot proceed.\n");
@@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
                goto failed_unload;
        }
-        err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+        err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
        if (err)
                goto failed_unload;
        down_write(&nilfs->ns_sem);
-        nilfs->ns_mount_state |= NILFS_VALID_FS;
+        nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
-        nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+        err = nilfs_cleanup_super(sbi);
-        err = nilfs_commit_super(sbi, 1);
        up_write(&nilfs->ns_sem);
        if (err) {
@@ -343,6 +438,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        sbi->s_super->s_flags = s_flags;
        return 0;
+ scan_error:
+        printk(KERN_ERR "NILFS: error searching super root.\n");
+        goto failed;
 failed_unload:
        nilfs_mdt_destroy(nilfs->ns_cpfile);
        nilfs_mdt_destroy(nilfs->ns_sufile);
@@ -515,8 +614,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
                nilfs_swap_super_block(nilfs);
        }
-        nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
+        nilfs->ns_sbwcount = 0;
-        nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
+        nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
        nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
        *sbpp = sbp[0];
        return 0;
@@ -557,6 +656,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
                if (err)
                        goto out;
+                err = nilfs_check_feature_compatibility(sb, sbp);
+                if (err)
+                        goto out;
                blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
                if (sb->s_blocksize != blocksize &&
                    !sb_set_blocksize(sb, blocksize)) {
@@ -568,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
                goto out;
        }
-        blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+        blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
        if (!blocksize) {
                printk(KERN_ERR "NILFS: unable to set blocksize\n");
                err = -EINVAL;
@@ -582,7 +685,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
        if (err)
                goto failed_sbh;
+        err = nilfs_check_feature_compatibility(sb, sbp);
+        if (err)
+                goto failed_sbh;
        blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+        if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+            blocksize > NILFS_MAX_BLOCK_SIZE) {
+                printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+                       "filesystem blocksize %d\n", blocksize);
+                err = -EINVAL;
+                goto failed_sbh;
+        }
        if (sb->s_blocksize != blocksize) {
                int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
@@ -604,6 +718,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
                           when reloading fails. */
        }
        nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+        nilfs->ns_blocksize = blocksize;
        err = nilfs_store_disk_layout(nilfs, sbp);
        if (err)
@@ -616,23 +731,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
        bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
        nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
-        /* Finding last segment */
+        err = nilfs_store_log_cursor(nilfs, sbp);
-        nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+        if (err)
-        nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
-        nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
-        nilfs->ns_seg_seq = nilfs->ns_last_seq;
-        nilfs->ns_segnum =
-                nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
-        nilfs->ns_cno = nilfs->ns_last_cno + 1;
-        if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
-                printk(KERN_ERR "NILFS invalid last segment number.\n");
-                err = -EINVAL;
                goto failed_sbh;
-        }
-        /* Dummy values  */
-        nilfs->ns_free_segments_count =
-                nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
        /* Initialize gcinode cache */
        err = nilfs_init_gccache(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab974533697..f785a7b0ab99 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -57,7 +57,8 @@ enum {
 * @ns_current: back pointer to current mount
 * @ns_sbh: buffer heads of on-disk super blocks
 * @ns_sbp: pointers to super block data
- * @ns_sbwtime: previous write time of super blocks
+ * @ns_sbwtime: previous write time of super block
+ * @ns_sbwcount: write count of super block
 * @ns_sbsize: size of valid data in super block
 * @ns_supers: list of nilfs super block structs
 * @ns_seg_seq: segment sequence counter
@@ -73,7 +74,7 @@ enum {
 * @ns_last_seq: sequence value of the latest segment
 * @ns_last_cno: checkpoint number of the latest segment
 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
- * @ns_free_segments_count: counter of free segments
+ * @ns_prev_seq: base sequence number for deciding whether to advance log cursor
 * @ns_segctor_sem: segment constructor semaphore
 * @ns_dat: DAT file inode
 * @ns_cpfile: checkpoint file inode
@@ -82,6 +83,7 @@ enum {
 * @ns_gc_inodes: dummy inodes to keep live blocks
 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
 * @ns_blocksize_bits: bit length of block size
+ * @ns_blocksize: block size
 * @ns_nsegments: number of segments in filesystem
 * @ns_blocks_per_segment: number of blocks per segment
 * @ns_r_segments_percentage: reserved segments percentage
@@ -119,7 +121,8 @@ struct the_nilfs {
         */
        struct buffer_head     *ns_sbh[2];
        struct nilfs_super_block *ns_sbp[2];
-        time_t                  ns_sbwtime[2];
+        time_t                  ns_sbwtime;
+        unsigned                ns_sbwcount;
        unsigned                ns_sbsize;
        unsigned                ns_mount_state;
@@ -149,7 +152,7 @@ struct the_nilfs {
        u64                     ns_last_seq;
        __u64                   ns_last_cno;
        u64                     ns_prot_seq;
-        unsigned long           ns_free_segments_count;
+        u64                     ns_prev_seq;
        struct rw_semaphore     ns_segctor_sem;
@@ -168,6 +171,7 @@ struct the_nilfs {
        /* Disk layout information (static) */
        unsigned int            ns_blocksize_bits;
+        unsigned int            ns_blocksize;
        unsigned long           ns_nsegments;
        unsigned long           ns_blocks_per_segment;
        unsigned long           ns_r_segments_percentage;
@@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ           10
-#define NILFS_ALTSB_FREQ        60  /* spare superblock */
 static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
 {
        u64 t = get_seconds();
-        return t < nilfs->ns_sbwtime[0] ||
-                 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+        /*
+         * No update is needed only while the current time lies within
+         * NILFS_SB_FREQ seconds after the recorded write time.
+         */
+        return !(t >= nilfs->ns_sbwtime &&
+                 t <= nilfs->ns_sbwtime + NILFS_SB_FREQ);
 }
-static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+/*
+ * Returns nonzero unless the low four bits of the super block write
+ * count are 0x08 or 0x0F.
+ */
+static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
 {
-        u64 t = get_seconds();
-        struct nilfs_super_block **sbp = nilfs->ns_sbp;
-        return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+        unsigned int phase = nilfs->ns_sbwcount & 0x0F;
+        return !(phase == 0x08 || phase == 0x0F);
 }
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..96337a4fbbdf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
                        dump_stack();
                        goto bail;
                }
-                past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-                mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-                     (unsigned long long)past_eof);
-                if (create && (iblock >= past_eof))
-                        set_buffer_new(bh_result);
        }
+        past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+        mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+             (unsigned long long)past_eof);
+        if (create && (iblock >= past_eof))
+                set_buffer_new(bh_result);
 bail:
        if (err < 0)
                err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers(	handle_t *handle,
        return ret;
 }
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
-                                                         struct page *page,
-                                                         unsigned from,
-                                                         unsigned to)
-{
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        handle_t *handle;
-        int ret = 0;
-        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
-                mlog_errno(ret);
-                goto out;
-        }
-        if (ocfs2_should_order_data(inode)) {
-                ret = ocfs2_jbd2_file_inode(handle, inode);
-                if (ret < 0)
-                        mlog_errno(ret);
-        }
-out:
-        if (ret) {
-                if (!IS_ERR(handle))
-                        ocfs2_commit_trans(osb, handle);
-                handle = ERR_PTR(ret);
-        }
-        return handle;
-}
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
        sector_t status;
@@ -609,7 +578,9 @@ bail:
 static void ocfs2_dio_end_io(struct kiocb *iocb,
                             loff_t offset,
                             ssize_t bytes,
-                             void *private)
+                             void *private,
+                             int ret,
+                             bool is_async)
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        int level;
@@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
        if (!level)
                up_read(&inode->i_alloc_sem);
        ocfs2_rw_unlock(inode, level);
+        if (is_async)
+                aio_complete(iocb, ret, 0);
 }
 /*
@@ -1131,23 +1105,37 @@ out:
 */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                                      struct ocfs2_write_ctxt *wc,
-                                      u32 cpos, loff_t user_pos, int new,
+                                      u32 cpos, loff_t user_pos,
+                                      unsigned user_len, int new,
                                      struct page *mmap_page)
 {
        int ret = 0, i;
-        unsigned long start, target_index, index;
+        unsigned long start, target_index, end_index, index;
        struct inode *inode = mapping->host;
+        loff_t last_byte;
        target_index = user_pos >> PAGE_CACHE_SHIFT;
        /*
         * Figure out how many pages we'll be manipulating here. For
         * non allocating write, we just change the one
-         * page. Otherwise, we'll need a whole clusters worth.
+         * page. Otherwise, we'll need a whole cluster's worth.  If we're
+         * writing past i_size, we only need enough pages to cover the
+         * last page of the write.
         */
        if (new) {
                wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
                start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+                /*
+                 * We need the index *past* the last page we could possibly
+                 * touch.  This is the page past the end of the write or
+                 * i_size, whichever is greater.
+                 */
+                last_byte = max(user_pos + user_len, i_size_read(inode));
+                BUG_ON(last_byte < 1);
+                end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+                if ((start + wc->w_num_pages) > end_index)
+                        wc->w_num_pages = end_index - start;
        } else {
                wc->w_num_pages = 1;
                start = target_index;
@@ -1620,21 +1608,20 @@ out:
 * write path can treat it as an non-allocating write, which has no
 * special case code for sparse/nonsparse files.
 */
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
-                                        unsigned len,
+                                        struct buffer_head *di_bh,
+                                        loff_t pos, unsigned len,
                                        struct ocfs2_write_ctxt *wc)
 {
        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        loff_t newsize = pos + len;
-        if (ocfs2_sparse_alloc(osb))
+        BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
-                return 0;
        if (newsize <= i_size_read(inode))
                return 0;
-        ret = ocfs2_extend_no_holes(inode, newsize, pos);
+        ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
        if (ret)
                mlog_errno(ret);
@@ -1644,6 +1631,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
        return ret;
 }
+/*
+ * Sparse-alloc only (enforced by the BUG_ON): when @pos lies beyond the
+ * current i_size, hand the gap to ocfs2_zero_extend(); otherwise there is
+ * nothing to do and 0 is returned.
+ */
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+                           loff_t pos)
+{
+        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+        if (pos <= i_size_read(inode))
+                return 0;
+        return ocfs2_zero_extend(inode, di_bh, pos);
+}
 int ocfs2_write_begin_nolock(struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned flags,
                             struct page **pagep, void **fsdata,
@@ -1679,7 +1678,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                }
        }
-        ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+        if (ocfs2_sparse_alloc(osb))
+                ret = ocfs2_zero_tail(inode, di_bh, pos);
+        else
+                ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+                                                   wc);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -1789,7 +1792,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
         * that we can zero and flush if we error after adding the
         * extent.
         */
-        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
                                         cluster_of_pages, mmap_page);
        if (ret) {
                mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf54396..5efdd37dfe48 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
 static int debug_buffer_release(struct inode *inode, struct file *file)
 {
-        struct debug_buffer *db = (struct debug_buffer *)file->private_data;
+        struct debug_buffer *db = file->private_data;
        if (db)
                kfree(db->buf);
@@ -715,7 +715,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
                goto bail;
        }
-        seq = (struct seq_file *) file->private_data;
+        seq = file->private_data;
        seq->private = dl;
        dlm_grab(dlm);
@@ -731,7 +731,7 @@ bail:
 static int debug_lockres_release(struct inode *inode, struct file *file)
 {
-        struct seq_file *seq = (struct seq_file *)file->private_data;
+        struct seq_file *seq = file->private_data;
        struct debug_lockres *dl = (struct debug_lockres *)seq->private;
        if (dl->dl_res)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e1749..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
        struct dlm_ctxt *dlm = NULL;
        struct dlm_ctxt *new_ctxt = NULL;
-        if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+        if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
                ret = -ENAMETOOLONG;
                mlog(ML_ERROR, "domain name length too long\n");
                goto leave;
@@ -1709,6 +1709,7 @@ retry:
                }
                if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+                        spin_unlock(&dlm_domain_lock);
                        mlog(ML_ERROR,
                             "Requested locking protocol version is not "
                             "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e314..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
                mlog(0, "trying again...\n");
                goto again;
        }
-        /* now that we are sure the MIGRATING state is there, drop
-         * the unneded state which blocked threads trying to DIRTY */
-        spin_lock(&res->spinlock);
-        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
-        BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
-        res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
-        spin_unlock(&res->spinlock);
+        ret = 0;
        /* did the target go down or die? */
        spin_lock(&dlm->spinlock);
        if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
        spin_unlock(&dlm->spinlock);
        /*
+         * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+         * another try; otherwise, we are sure the MIGRATING state is there,
+         * drop the unneeded state which blocked threads trying to DIRTY
+         */
+        spin_lock(&res->spinlock);
+        BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+        res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+        if (!ret)
+                BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+        spin_unlock(&res->spinlock);
+        /*
         * at this point:
         *
-         *   o the DLM_LOCK_RES_MIGRATING flag is set
+         *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
         *   o there are no pending asts on this lockres
         *   o all processes trying to reserve an ast on this
         *     lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be70..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
        if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
                int bit;
-                bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+                bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit >= O2NM_MAX_NODES || bit < 0)
                        dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
                else
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f5..bef34d0528d5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode,
 {
        int level, status;
        struct dlmfs_inode_private *ip = DLMFS_I(inode);
-        struct dlmfs_filp_private *fp =
+        struct dlmfs_filp_private *fp = file->private_data;
-                (struct dlmfs_filp_private *) file->private_data;
        if (S_ISDIR(inode->i_mode))
                BUG();
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 39eb16ac5f98..5e02a893f46e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
 {
-        struct seq_file *seq = (struct seq_file *) file->private_data;
+        struct seq_file *seq = file->private_data;
        struct ocfs2_dlm_seq_priv *priv = seq->private;
        struct ocfs2_lock_res *res = &priv->p_iter_res;
@@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
                goto out;
        }
-        seq = (struct seq_file *) file->private_data;
+        seq = file->private_data;
        seq->private = priv;
        ocfs2_add_lockres_tracking(&priv->p_iter_res,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c447..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
        return status;
 }
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ *
+ * Returns NULL when ocfs2_should_order_data() reports that no ordering is
+ * required, a started handle (which the caller must commit) once the inode
+ * has been passed to ocfs2_jbd2_file_inode(), or an ERR_PTR() on failure.
+ * On failure any handle that was started has already been committed here.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *handle = NULL;
+        int ret = 0;
+        if (!ocfs2_should_order_data(inode))
+                goto out;
+        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+        if (IS_ERR(handle)) {
+                /* Report the real failure rather than masking it as -ENOMEM */
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_jbd2_file_inode(handle, inode);
+        if (ret < 0)
+                mlog_errno(ret);
+out:
+        if (ret) {
+                /* Only a handle that was actually started needs committing */
+                if (!IS_ERR(handle))
+                        ocfs2_commit_trans(osb, handle);
+                handle = ERR_PTR(ret);
+        }
+        return handle;
+}
 /* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
-                                 u64 size)
+                                 u64 abs_to)
 {
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
-        unsigned long index;
+        unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-        unsigned int offset;
        handle_t *handle = NULL;
-        int ret;
+        int ret = 0;
+        unsigned zero_from, zero_to, block_start, block_end;
-        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
+        BUG_ON(abs_from >= abs_to);
-        /* ugh.  in prepare/commit_write, if from==to==start of block, we
+        BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
-        ** skip the prepare.  make sure we never send an offset for the start
+        BUG_ON(abs_from & (inode->i_blkbits - 1));
-        ** of a block
-        */
-        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-                offset++;
-        }
-        index = size >> PAGE_CACHE_SHIFT;
        page = grab_cache_page(mapping, index);
        if (!page) {
@@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
+        /* Get the offsets within the page that we want to zero */
-        if (ret < 0) {
+        zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
-                mlog_errno(ret);
+        zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
-                goto out_unlock;
+        if (!zero_to)
-        }
+                zero_to = PAGE_CACHE_SIZE;
-        if (ocfs2_should_order_data(inode)) {
+        mlog(0,
-                handle = ocfs2_start_walk_page_trans(inode, page, offset,
+             "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-                                                     offset);
+             (unsigned long long)abs_from, (unsigned long long)abs_to,
-                if (IS_ERR(handle)) {
+             index, zero_from, zero_to);
-                        ret = PTR_ERR(handle);
-                        handle = NULL;
+        /* We know that zero_from is block aligned */
+        for (block_start = zero_from; block_start < zero_to;
+             block_start = block_end) {
+                block_end = block_start + (1 << inode->i_blkbits);
+                /*
+                 * block_start is block-aligned.  Bump it by one to
+                 * force ocfs2_{prepare,commit}_write() to zero the
+                 * whole block.
+                 */
+                ret = ocfs2_prepare_write_nolock(inode, page,
+                                                 block_start + 1,
+                                                 block_start + 1);
+                if (ret < 0) {
+                        mlog_errno(ret);
                        goto out_unlock;
                }
-        }
-        /* must not update i_size! */
+                if (!handle) {
-        ret = block_commit_write(page, offset, offset);
+                        handle = ocfs2_zero_start_ordered_transaction(inode);
-        if (ret < 0)
+                        if (IS_ERR(handle)) {
-                mlog_errno(ret);
+                                ret = PTR_ERR(handle);
-        else
+                                handle = NULL;
-                ret = 0;
+                                break;
+                        }
+                }
+                /* must not update i_size! */
+                ret = block_commit_write(page, block_start + 1,
+                                         block_start + 1);
+                if (ret < 0)
+                        mlog_errno(ret);
+                else
+                        ret = 0;
+        }
        if (handle)
                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out_unlock:
        unlock_page(page);
        page_cache_release(page);
@@ -786,22 +838,114 @@ out:
        return ret;
 }
-static int ocfs2_zero_extend(struct inode *inode,
+/*
-                             u64 zero_to_size)
+ * Find the next range to zero.  We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache.  We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed.  range_start and range_end return the next zeroing
+ * range.  A subsequent call should pass the previous range_end as its
+ * zero_start.  If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over.  Refcounted extents are CoWd.
+ *
+ * Returns 0 on success, otherwise the error reported by
+ * ocfs2_get_clusters() or ocfs2_refcount_cow().  *range_start is only
+ * written when a range is found; on the "nothing to do" path only
+ * *range_end is set.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+                                       struct buffer_head *di_bh,
+                                       u64 zero_start, u64 zero_end,
+                                       u64 *range_start, u64 *range_end)
 {
-        int ret = 0;
+        int rc = 0, needs_cow = 0;
-        u64 start_off;
+        u32 p_cpos, zero_clusters = 0;
-        struct super_block *sb = inode->i_sb;
+        u32 zero_cpos =
+                zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+        u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+        unsigned int num_clusters = 0;
+        unsigned int ext_flags = 0;
-        start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+        /*
+         * Step 1: advance zero_cpos to the first extent before last_cpos
+         * that has a nonzero p_cpos (presumably: is allocated, not a
+         * hole -- confirm against ocfs2_get_clusters()) and is not
+         * flagged unwritten.
+         */
+        while (zero_cpos < last_cpos) {
-        while (start_off < zero_to_size) {
+                rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
-                ret = ocfs2_write_zero_page(inode, start_off);
+                                        &num_clusters, &ext_flags);
-                if (ret < 0) {
+                if (rc) {
-                        mlog_errno(ret);
+                        mlog_errno(rc);
+                        goto out;
+                }
+                if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+                        zero_clusters = num_clusters;
+                        if (ext_flags & OCFS2_EXT_REFCOUNTED)
+                                needs_cow = 1;
+                        break;
+                }
+                zero_cpos += num_clusters;
+        }
+        /* No candidate extent was found: tell the caller there is no work */
+        if (!zero_clusters) {
+                *range_end = 0;
+                goto out;
+        }
+        /*
+         * Step 2: grow the range over the following extents, stopping at
+         * the first one with a zero p_cpos or the unwritten flag.
+         */
+        while ((zero_cpos + zero_clusters) < last_cpos) {
+                rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+                                        &p_cpos, &num_clusters,
+                                        &ext_flags);
+                if (rc) {
+                        mlog_errno(rc);
                        goto out;
                }
-                start_off += sb->s_blocksize;
+                if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+                        break;
+                if (ext_flags & OCFS2_EXT_REFCOUNTED)
+                        needs_cow = 1;
+                zero_clusters += num_clusters;
+        }
+        /* The last extent may run past last_cpos; clamp the cluster count */
+        if ((zero_cpos + zero_clusters) > last_cpos)
+                zero_clusters = last_cpos - zero_cpos;
+        /* At least one extent in the range carried the refcounted flag */
+        if (needs_cow) {
+                rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+                                        UINT_MAX);
+                if (rc) {
+                        mlog_errno(rc);
+                        goto out;
+                }
+        }
+        /* Convert the cluster range back to byte offsets for the caller */
+        *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+        *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+                                             zero_cpos + zero_clusters);
+out:
+        return rc;
+}
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+                                   u64 range_end)
+{
+        int rc = 0;
+        u64 next_pos;
+        u64 zero_pos = range_start;
+        mlog(0, "range_start = %llu, range_end = %llu\n",
+             (unsigned long long)range_start,
+             (unsigned long long)range_end);
+        BUG_ON(range_start >= range_end);
+        while (zero_pos < range_end) {
+                next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+                if (next_pos > range_end)
+                        next_pos = range_end;
+                rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+                if (rc < 0) {
+                        mlog_errno(rc);
+                        break;
+                }
+                zero_pos = next_pos;
                /*
                 * Very large extends have the potential to lock up
@@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
                cond_resched();
        }
-out:
+        return rc;
+}
+/*
+ * Walk from the block-aligned i_size up to @zero_to_size, repeatedly asking
+ * ocfs2_zero_extend_get_range() for the next range and passing it, clamped
+ * to the region still outstanding, to ocfs2_zero_extend_range().  Stops at
+ * the first error or when no further range is reported.
+ */
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+                      loff_t zero_to_size)
+{
+        int ret = 0;
+        u64 pos, lo = 0, hi = 0;
+        pos = ocfs2_align_bytes_to_blocks(inode->i_sb, i_size_read(inode));
+        mlog(0, "zero_start %llu for i_size %llu\n",
+             (unsigned long long)pos,
+             (unsigned long long)i_size_read(inode));
+        /* Each pass resumes at the end of the range just zeroed */
+        for (; pos < zero_to_size; pos = hi) {
+                ret = ocfs2_zero_extend_get_range(inode, di_bh, pos,
+                                                  zero_to_size, &lo, &hi);
+                if (ret) {
+                        mlog_errno(ret);
+                        break;
+                }
+                /* A zero range end means there is nothing left to zero */
+                if (hi == 0)
+                        break;
+                /* Clamp the reported range to [pos, zero_to_size] */
+                if (pos > lo)
+                        lo = pos;
+                if (hi > zero_to_size)
+                        hi = zero_to_size;
+                ret = ocfs2_zero_extend_range(inode, lo, hi);
+                if (ret) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
        return ret;
 }
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+                          u64 new_i_size, u64 zero_to)
 {
        int ret;
        u32 clusters_to_add;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
+        /*
+         * Only quota files call this without a bh, and they can't be
+         * refcounted.
+         */
+        BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+        BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
        if (clusters_to_add < oi->ip_clusters)
                clusters_to_add = 0;
@@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
         * still need to zero the area between the old i_size and the
         * new i_size.
         */
-        ret = ocfs2_zero_extend(inode, zero_to);
+        ret = ocfs2_zero_extend(inode, di_bh, zero_to);
        if (ret < 0)
                mlog_errno(ret);
@@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
                goto out;
        if (i_size_read(inode) == new_i_size)
-                goto out;
+                goto out;
        BUG_ON(new_i_size < i_size_read(inode));
        /*
-         * Fall through for converting inline data, even if the fs
-         * supports sparse files.
-         *
-         * The check for inline data here is legal - nobody can add
-         * the feature since we have i_mutex. We must check it again
-         * after acquiring ip_alloc_sem though, as paths like mmap
-         * might have raced us to converting the inode to extents.
-         */
-        if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-            && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-                goto out_update_size;
-        /*
         * The alloc sem blocks people in read/write from reading our
         * allocation until we're done changing it. We depend on
         * i_mutex to block other extend/truncate calls while we're
-         * here.
+         * here.  We even have to hold it for sparse files because there
+         * might be some tail zeroing.
         */
        down_write(&oi->ip_alloc_sem);
@@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
                if (ret) {
                        up_write(&oi->ip_alloc_sem);
                        mlog_errno(ret);
                        goto out;
                }
        }
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-                ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+                ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+        else
+                ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+                                            new_i_size);
        up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 int ocfs2_simple_size_update(struct inode *inode,
                             struct buffer_head *di_bh,
                             u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
-                          u64 zero_to);
+                          u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+                      loff_t zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
                  struct kstat *stat);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..9b57c0350ff9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
        return container_of(triggers, struct ocfs2_triggers, ot_triggers);
 }
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
 * Quota blocks have their own trigger because the struct ocfs2_block_check
 * offset depends on the blocksize.
 */
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
 * Directory blocks also have their own trigger because the
 * struct ocfs2_block_check offset depends on the blocksize.
 */
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                 struct buffer_head *bh,
                                 void *data, size_t size)
 {
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 static struct ocfs2_triggers di_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
 static struct ocfs2_triggers eb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
 static struct ocfs2_triggers rb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
 static struct ocfs2_triggers gd_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
 static struct ocfs2_triggers db_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_db_commit_trigger,
+                .t_frozen = ocfs2_db_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
 };
 static struct ocfs2_triggers xb_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
 static struct ocfs2_triggers dq_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_dq_commit_trigger,
+                .t_frozen = ocfs2_dq_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
 };
 static struct ocfs2_triggers dr_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
 static struct ocfs2_triggers dl_triggers = {
        .ot_triggers = {
-                .t_commit = ocfs2_commit_trigger,
+                .t_frozen = ocfs2_frozen_trigger,
                .t_abort = ocfs2_abort_trigger,
        },
        .ot_offset      = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
        if (osb->osb_commit_interval)
                commit_interval = osb->osb_commit_interval;
-        spin_lock(&journal->j_state_lock);
+        write_lock(&journal->j_state_lock);
        journal->j_commit_interval = commit_interval;
        if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
                journal->j_flags |= JBD2_BARRIER;
        else
                journal->j_flags &= ~JBD2_BARRIER;
-        spin_unlock(&journal->j_state_lock);
+        write_unlock(&journal->j_state_lock);
 }
 int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
        mutex_lock(&os->os_lock);
        ocfs2_queue_orphan_scan(osb);
        if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
-                schedule_delayed_work(&os->os_orphan_scan_work,
+                queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
                                      ocfs2_orphan_scan_timeout());
        mutex_unlock(&os->os_lock);
 }
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
        else {
                atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
-                schedule_delayed_work(&os->os_orphan_scan_work,
+                queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
-                                      ocfs2_orphan_scan_timeout());
+                                   ocfs2_orphan_scan_timeout());
        }
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc0..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 {
        unsigned int la_mb;
        unsigned int gd_mb;
+        unsigned int la_max_mb;
        unsigned int megs_per_slot;
        struct super_block *sb = osb->sb;
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
        if (megs_per_slot < la_mb)
                la_mb = megs_per_slot;
+        /* We can't store more bits than we can in a block. */
+        la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+                                                ocfs2_local_alloc_size(sb) * 8);
+        if (la_mb > la_max_mb)
+                la_mb = la_max_mb;
        return la_mb;
 }
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
                 * locking allocators ranks above a transaction start
                 */
                WARN_ON(journal_current_handle());
-                status = ocfs2_extend_no_holes(gqinode,
+                status = ocfs2_extend_no_holes(gqinode, NULL,
                        gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
                        gqinode->i_size);
                if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
        u64 p_blkno;
        /* We are protected by dqio_sem so no locking needed */
-        status = ocfs2_extend_no_holes(lqinode,
+        status = ocfs2_extend_no_holes(lqinode, NULL,
                                       lqinode->i_size + 2 * sb->s_blocksize,
                                       lqinode->i_size);
        if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
                return ocfs2_local_quota_add_chunk(sb, type, offset);
        /* We are protected by dqio_sem so no locking needed */
-        status = ocfs2_extend_no_holes(lqinode,
+        status = ocfs2_extend_no_holes(lqinode, NULL,
                                       lqinode->i_size + sb->s_blocksize,
                                       lqinode->i_size);
        if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
        end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+        /*
+         * We only duplicate pages up to the page that contains
+         * i_size - 1, so trim 'end' to i_size.
+         */
+        if (end > i_size_read(context->inode))
+                end = i_size_read(context->inode);
        while (offset < end) {
                page_index = offset >> PAGE_CACHE_SHIFT;
@@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
        struct inode *inode = old_dentry->d_inode;
        struct buffer_head *new_bh = NULL;
+        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+                ret = -EINVAL;
+                mlog_errno(ret);
+                goto out;
+        }
        ret = filemap_fdatawrite(inode->i_mapping);
        if (ret) {
                mlog_errno(ret);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4d..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                     le16_to_cpu(bg->bg_free_bits_count));
        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
                     le16_to_cpu(bg->bg_bits));
-        cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg->bg_blkno);
+        cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
                le16_add_cpu(&cl->cl_next_free_rec, 1);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0eaa929a4dbf..03a799fdd740 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2472,7 +2472,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
        kfree(osb->slot_recovery_generations);
        /* FIXME
         * This belongs in journal shutdown, but because we have to
-         * allocate osb->journal at the start of ocfs2_initalize_osb(),
+         * allocate osb->journal at the start of ocfs2_initialize_osb(),
         * we free it here.
         */
        kfree(osb->journal);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cfe..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
                                         struct ocfs2_xattr_value_buf *vb,
                                         struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int status = 0;
+        int status = 0, credits;
        handle_t *handle = ctxt->handle;
        enum ocfs2_alloc_restarted why;
        u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
        ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
-        status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+        while (clusters_to_add) {
-                              OCFS2_JOURNAL_ACCESS_WRITE);
+                status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-        if (status < 0) {
+                                       OCFS2_JOURNAL_ACCESS_WRITE);
-                mlog_errno(status);
+                if (status < 0) {
-                goto leave;
+                        mlog_errno(status);
-        }
+                        break;
+                }
-        prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+                prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-        status = ocfs2_add_clusters_in_btree(handle,
+                status = ocfs2_add_clusters_in_btree(handle,
-                                             &et,
+                                                     &et,
-                                             &logical_start,
+                                                     &logical_start,
-                                             clusters_to_add,
+                                                     clusters_to_add,
-                                             0,
+                                                     0,
-                                             ctxt->data_ac,
+                                                     ctxt->data_ac,
-                                             ctxt->meta_ac,
+                                                     ctxt->meta_ac,
-                                             &why);
+                                                     &why);
-        if (status < 0) {
+                if ((status < 0) && (status != -EAGAIN)) {
-                mlog_errno(status);
+                        if (status != -ENOSPC)
-                goto leave;
+                                mlog_errno(status);
-        }
+                        break;
+                }
-        ocfs2_journal_dirty(handle, vb->vb_bh);
+                ocfs2_journal_dirty(handle, vb->vb_bh);
-        clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+                clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+                                         prev_clusters;
-        /*
+                if (why != RESTART_NONE && clusters_to_add) {
-         * We should have already allocated enough space before the transaction,
+                        /*
-         * so no need to restart.
+                         * We can only fail in case the alloc file doesn't give
-         */
+                         * up enough clusters.
-        BUG_ON(why != RESTART_NONE || clusters_to_add);
+                         */
+                        BUG_ON(why == RESTART_META);
-leave:
+                        mlog(0, "restarting xattr value extension for %u"
+                             " clusters,.\n", clusters_to_add);
+                        credits = ocfs2_calc_extend_credits(inode->i_sb,
+                                                            &vb->vb_xv->xr_list,
+                                                            clusters_to_add);
+                        status = ocfs2_extend_trans(handle, credits);
+                        if (status < 0) {
+                                status = -ENOMEM;
+                                mlog_errno(status);
+                                break;
+                        }
+                }
+        }
        return status;
 }
@@ -6788,16 +6804,15 @@ out:
        return ret;
 }
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
                                u64 blkno, u64 new_blkno, u32 clusters,
+                                u32 *cpos, int num_buckets,
                                struct ocfs2_alloc_context *meta_ac,
                                struct ocfs2_alloc_context *data_ac,
                                struct ocfs2_reflink_xattr_tree_args *args)
 {
        int i, j, ret = 0;
        struct super_block *sb = args->reflink->old_inode->i_sb;
-        u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-        u32 num_buckets = clusters * bpc;
        int bpb = args->old_bucket->bu_blocks;
        struct ocfs2_xattr_value_buf vb = {
                .vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                        break;
                }
-                /*
-                 * The real bucket num in this series of blocks is stored
-                 * in the 1st bucket.
-                 */
-                if (i == 0)
-                        num_buckets = le16_to_cpu(
-                                bucket_xh(args->old_bucket)->xh_num_buckets);
                ret = ocfs2_xattr_bucket_journal_access(handle,
                                                args->new_bucket,
                                                OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                               bucket_block(args->old_bucket, j),
                               sb->s_blocksize);
+                /*
+                 * Record the start cpos so that we can use it to initialize
+                 * our xattr tree; we also set xh_num_buckets for the new
+                 * bucket.
+                 */
+                if (i == 0) {
+                        *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+                                            xh_entries[0].xe_name_hash);
+                        bucket_xh(args->new_bucket)->xh_num_buckets =
+                                cpu_to_le16(num_buckets);
+                }
                ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
                ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
                }
                ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
                ocfs2_xattr_bucket_relse(args->old_bucket);
                ocfs2_xattr_bucket_relse(args->new_bucket);
        }
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
        ocfs2_xattr_bucket_relse(args->new_bucket);
        return ret;
 }
+/*
+ * Reflink the xattr buckets of one extent record, starting at @blkno.
+ *
+ * The bucket count is read from xh_num_buckets of the bucket at @blkno.
+ * Clusters are then claimed from @data_ac in as many rounds as needed;
+ * each round copies the matching buckets with ocfs2_reflink_xattr_bucket()
+ * and inserts the newly claimed clusters into @et.  The first extent is
+ * inserted at @cpos; later ones use the cpos reported back by
+ * ocfs2_reflink_xattr_bucket().
+ *
+ * Returns 0 on success, or the first error reported by a callee.  We stop
+ * at the first failure so that a later successful round can never hide an
+ * earlier error from the caller.
+ */
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+                                struct inode *inode,
+                                struct ocfs2_reflink_xattr_tree_args *args,
+                                struct ocfs2_extent_tree *et,
+                                struct ocfs2_alloc_context *meta_ac,
+                                struct ocfs2_alloc_context *data_ac,
+                                u64 blkno, u32 cpos, u32 len)
+{
+        int ret, first_inserted = 0;
+        u32 p_cluster, num_clusters, reflink_cpos = 0;
+        u64 new_blkno;
+        unsigned int num_buckets, reflink_buckets;
+        unsigned int bpc =
+                ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+        /* The number of buckets to copy is stored in the first bucket. */
+        ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+        ocfs2_xattr_bucket_relse(args->old_bucket);
+        while (len && num_buckets) {
+                ret = ocfs2_claim_clusters(handle, data_ac,
+                                           1, &p_cluster, &num_clusters);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+                /* Never copy more buckets than the claimed clusters hold. */
+                reflink_buckets = min(num_buckets, bpc * num_clusters);
+                ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+                                                 new_blkno, num_clusters,
+                                                 &reflink_cpos, reflink_buckets,
+                                                 meta_ac, data_ac, args);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /*
+                 * For the 1st allocated cluster, we make it use the same cpos
+                 * so that the xattr tree looks the same as the original one
+                 * in most cases.
+                 */
+                if (!first_inserted) {
+                        reflink_cpos = cpos;
+                        first_inserted = 1;
+                }
+                ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+                                          num_clusters, 0, meta_ac);
+                if (ret) {
+                        /*
+                         * Do not keep claiming clusters: continuing could
+                         * overwrite 'ret' with success and leave the new
+                         * tree silently missing this extent.
+                         */
+                        mlog_errno(ret);
+                        goto out;
+                }
+                mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+                     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+                len -= num_clusters;
+                blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+                num_buckets -= reflink_buckets;
+        }
+out:
+        return ret;
+}
 /*
 * Create the same xattr extent record in the new inode's xattr tree.
 */
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
                                   void *para)
 {
        int ret, credits = 0;
-        u32 p_cluster, num_clusters;
-        u64 new_blkno;
        handle_t *handle;
        struct ocfs2_reflink_xattr_tree_args *args =
                        (struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_extent_tree et;
+        mlog(0, "reflink xattr buckets %llu len %u\n",
+             (unsigned long long)blkno, len);
        ocfs2_init_xattr_tree_extent_tree(&et,
                                          INODE_CACHE(args->reflink->new_inode),
                                          args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_claim_clusters(handle, data_ac,
+        ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
-                                   len, &p_cluster, &num_clusters);
+                                          meta_ac, data_ac,
-        if (ret) {
+                                          blkno, cpos, len);
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-        mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-             (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-        ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-                                          meta_ac, data_ac, args);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-             (unsigned long long)new_blkno, len, cpos);
-        ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-                                  len, 0, meta_ac);
        if (ret)
                mlog_errno(ret);
-out_commit:
        ocfs2_commit_trans(osb, handle);
 out:
diff --git a/fs/open.c b/fs/open.c
index 5463266db9e6..0d1fa3dc0efb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -110,7 +110,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
        error = locks_verify_truncate(inode, NULL, length);
        if (!error)
-                error = security_path_truncate(&path, length, 0);
+                error = security_path_truncate(&path);
        if (!error)
                error = do_truncate(path.dentry, length, 0, NULL);
@@ -165,8 +165,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
        error = locks_verify_truncate(inode, file, length);
        if (!error)
-                error = security_path_truncate(&file->f_path, length,
+                error = security_path_truncate(&file->f_path);
-                                               ATTR_MTIME|ATTR_CTIME);
        if (!error)
                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
@@ -367,7 +366,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
        if (error)
                goto out;
-        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto dput_and_out;
@@ -396,7 +395,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
        if (!S_ISDIR(inode->i_mode))
                goto out_putf;
-        error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+        error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
        if (!error)
                set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -414,7 +413,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
        if (error)
                goto out;
-        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+        error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto dput_and_out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5dcd4b0c5533..72c52656dc2e 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -459,7 +459,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        }
        /* everything is up and running, commence */
-        INIT_RCU_HEAD(&p->rcu_head);
        rcu_assign_pointer(ptbl->part[partno], p);
        /* suppress uevent if the disk supresses it */
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967ff..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
        } *label;
        unsigned char *data;
        Sector sect;
+        sector_t labelsect;
        res = 0;
        blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
                goto out_freeall;
        /*
+         * Special case for FBA disks: label sector does not depend on
+         * blocksize.
+         */
+        if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+            (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+                labelsect = info->label_block;
+        else
+                labelsect = info->label_block * (blocksize >> 9);
+        /*
         * Get volume label, extract name and type.
         */
-        data = read_part_sector(state, info->label_block*(blocksize/512),
+        data = read_part_sector(state, labelsect, &sect);
-                                &sect);
        if (data == NULL)
                goto out_readerr;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b58d38bc911..fff6572676ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                if (tracer)
                        tpid = task_pid_nr_ns(tracer, ns);
        }
-        cred = get_cred((struct cred *) __task_cred(p));
+        cred = get_task_cred(p);
        seq_printf(m,
                "State:\t%s\n"
                "Tgid:\t%d\n"
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 12c233da1b6b..ef72b1699429 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
+/*
+ * Print a rate-limited KERN_ERR quota message for @sb.  The output is
+ * prefixed with the device id (sb->s_id) and @func, followed by the
+ * caller's printf-style @fmt with its arguments and a trailing newline.
+ */
+void __quota_error(struct super_block *sb, const char *func,
+                  const char *fmt, ...)
+{
+        va_list ap;
+        /* Drop the message entirely while printk is being rate limited. */
+        if (!printk_ratelimit())
+                return;
+        printk(KERN_ERR "Quota error (device %s): %s: ",
+               sb->s_id, func);
+        va_start(ap, fmt);
+        vprintk(fmt, ap);
+        va_end(ap);
+        printk("\n");
+}
+EXPORT_SYMBOL(__quota_error);
 #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
 #endif
@@ -676,7 +692,7 @@ static void prune_dqcache(int count)
 * This is called from kswapd when we think we need some
 * more memory
 */
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        if (nr) {
                spin_lock(&dq_list_lock);
@@ -705,11 +721,8 @@ void dqput(struct dquot *dquot)
                return;
 #ifdef CONFIG_QUOTA_DEBUG
        if (!atomic_read(&dquot->dq_count)) {
-                printk("VFS: dqput: trying to free free dquot\n");
+                quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
-                printk("VFS: device %s, dquot of %s %d\n",
+                            quotatypes[dquot->dq_type], dquot->dq_id);
-                        dquot->dq_sb->s_id,
-                        quotatypes[dquot->dq_type],
-                        dquot->dq_id);
                BUG();
        }
 #endif
@@ -732,9 +745,9 @@ we_slept:
                /* Commit dquot before releasing */
                ret = dquot->dq_sb->dq_op->write_dquot(dquot);
                if (ret < 0) {
-                        printk(KERN_ERR "VFS: cannot write quota structure on "
+                        quota_error(dquot->dq_sb, "Can't write quota structure"
-                                "device %s (error %d). Quota may get out of "
+                                    " (error %d). Quota may get out of sync!",
-                                "sync!\n", dquot->dq_sb->s_id, ret);
+                                    ret);
                        /*
                         * We clear dirty bit anyway, so that we avoid
                         * infinite loop here
@@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
 #ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
-                printk(KERN_WARNING "VFS (%s): Writes happened before quota"
+                quota_error(sb, "Writes happened before quota was turned on "
-                        " was turned on thus quota information is probably "
+                        "thus quota information is probably inconsistent. "
-                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
+                        "Please run quotacheck(8)");
        }
 #endif
 }
@@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
                if (dqput_blocks(dquot)) {
 #ifdef CONFIG_QUOTA_DEBUG
                        if (atomic_read(&dquot->dq_count) != 1)
-                                printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
+                                quota_error(inode->i_sb, "Adding dquot with "
+                                            "dq_count %d to dispose list",
+                                            atomic_read(&dquot->dq_count));
 #endif
                        spin_lock(&dq_list_lock);
                        /* As dquot must have currently users it can't be on
@@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
                struct list_head *tofree_head)
 {
        struct inode *inode;
+        int reserved = 0;
        spin_lock(&inode_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
                 *  only quota pointers and these have separate locking
                 *  (dqptr_sem).
                 */
-                if (!IS_NOQUOTA(inode))
+                if (!IS_NOQUOTA(inode)) {
+                        if (unlikely(inode_get_rsv_space(inode) > 0))
+                                reserved = 1;
                        remove_inode_dquot_ref(inode, type, tofree_head);
+                }
        }
        spin_unlock(&inode_lock);
+#ifdef CONFIG_QUOTA_DEBUG
+        if (reserved) {
+                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
+                        " was disabled thus quota information is probably "
+                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
+        }
+#endif
 }
 /* Gather all references from inodes and drop them */
@@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
        return QUOTA_NL_NOWARN;
 }
+/*
+ * Tell the callers in this file whether quota work is needed for @inode.
+ *
+ * Returns 0 when IS_NOQUOTA() is true for the inode.  Otherwise returns the
+ * value of sb_any_quota_loaded() with the bits reported by
+ * sb_any_quota_suspended() cleared, i.e. non-zero only when something is
+ * loaded that is not also suspended.
+ * NOTE(review): presumably both helpers return per-quota-type bitmasks -
+ * confirm against their definitions.
+ */
+static int dquot_active(const struct inode *inode)
+{
+        struct super_block *sb = inode->i_sb;
+        if (IS_NOQUOTA(inode))
+                return 0;
+        return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
+}
 /*
 * Initialize quota pointers in inode
 *
@@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+        if (!dquot_active(inode))
                return;
        /* First get references to structures we might need. */
@@ -1507,7 +1542,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
         * First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex
         */
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+        if (!dquot_active(inode)) {
                inode_incr_space(inode, number, reserve);
                goto out;
        }
@@ -1559,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode)
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+        if (!dquot_active(inode))
                return 0;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1596,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
        int cnt;
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+        if (!dquot_active(inode)) {
                inode_claim_rsv_space(inode, number);
                return 0;
        }
@@ -1629,7 +1664,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) {
+        if (!dquot_active(inode)) {
                inode_decr_space(inode, number, reserve);
                return;
        }
@@ -1667,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode)
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
-        if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode))
+        if (!dquot_active(inode))
                return;
        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1790,7 +1825,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
        struct super_block *sb = inode->i_sb;
        int ret;
-        if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
+        if (!dquot_active(inode))
                return 0;
        if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
@@ -1957,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
                                truncate_inode_pages(&toputinode[cnt]->i_data,
                                                     0);
                                mutex_unlock(&toputinode[cnt]->i_mutex);
-                                mark_inode_dirty(toputinode[cnt]);
+                                mark_inode_dirty_sync(toputinode[cnt]);
                        }
                        mutex_unlock(&dqopt->dqonoff_mutex);
                }
@@ -2270,7 +2305,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
        memset(di, 0, sizeof(*di));
        di->d_version = FS_DQUOT_VERSION;
        di->d_flags = dquot->dq_type == USRQUOTA ?
-                        XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+                        FS_USER_QUOTA : FS_GROUP_QUOTA;
        di->d_id = dquot->dq_id;
        spin_lock(&dq_data_lock);
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 24f03407eeb5..9e48874eabcc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
        ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
               info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
        if (ret != info->dqi_usable_bs) {
-                q_warn(KERN_WARNING "VFS: dquota write failed on "
+                quota_error(sb, "dquota write failed");
-                        "dev %s\n", sb->s_id);
                if (ret >= 0)
                        ret = -EIO;
        }
@@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
        dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
        /* No matter whether write succeeds block is out of list */
        if (write_blk(info, blk, buf) < 0)
-                q_warn(KERN_ERR
+                quota_error(info->dqi_sb, "Can't write block (%u) "
-                       "VFS: Can't write block (%u) with free entries.\n",
+                            "with free entries", blk);
-                       blk);
        return 0;
 out_buf:
        kfree(tmpbuf);
@@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
        if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
                *err = remove_free_dqentry(info, buf, blk);
                if (*err < 0) {
-                        q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
+                        quota_error(dquot->dq_sb, "Can't remove block (%u) "
-                               "remove block (%u) from entry free list.\n",
+                                    "from entry free list", blk);
-                               blk);
                        goto out_buf;
                }
        }
@@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
        }
 #ifdef __QUOTA_QT_PARANOIA
        if (i == qtree_dqstr_in_blk(info)) {
-                printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+                quota_error(dquot->dq_sb, "Data block full but it shouldn't");
-                                "but it shouldn't.\n");
                *err = -EIO;
                goto out_buf;
        }
 #endif
        *err = write_blk(info, blk, buf);
        if (*err < 0) {
-                q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+                quota_error(dquot->dq_sb, "Can't write quota data block %u",
-                                "data block %u.\n", blk);
+                            blk);
                goto out_buf;
        }
        dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
        } else {
                ret = read_blk(info, *treeblk, buf);
                if (ret < 0) {
-                        q_warn(KERN_ERR "VFS: Can't read tree quota block "
+                        quota_error(dquot->dq_sb, "Can't read tree quota "
-                                        "%u.\n", *treeblk);
+                                    "block %u", *treeblk);
                        goto out_buf;
                }
        }
@@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
        if (depth == info->dqi_qtree_depth - 1) {
 #ifdef __QUOTA_QT_PARANOIA
                if (newblk) {
-                        printk(KERN_ERR "VFS: Inserting already present quota "
+                        quota_error(dquot->dq_sb, "Inserting already present "
-                                        "entry (block %u).\n",
+                                    "quota entry (block %u)",
-                               le32_to_cpu(ref[get_index(info,
+                                    le32_to_cpu(ref[get_index(info,
                                                dquot->dq_id, depth)]));
                        ret = -EIO;
                        goto out_buf;
@@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
        if (!dquot->dq_off) {
                ret = dq_insert_tree(info, dquot);
                if (ret < 0) {
-                        q_warn(KERN_ERR "VFS: Error %zd occurred while "
+                        quota_error(sb, "Error %zd occurred while creating "
-                                        "creating quota.\n", ret);
+                                    "quota", ret);
                        kfree(ddquot);
                        return ret;
                }
@@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
        ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
                                    dquot->dq_off);
        if (ret != info->dqi_entry_size) {
-                q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+                quota_error(sb, "dquota write failed");
-                       sb->s_id);
                if (ret >= 0)
                        ret = -ENOSPC;
        } else {
@@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
        if (!buf)
                return -ENOMEM;
        if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
-                q_warn(KERN_ERR "VFS: Quota structure has offset to other "
+                quota_error(dquot->dq_sb, "Quota structure has offset to "
-                  "block (%u) than it should (%u).\n", blk,
+                        "other block (%u) than it should (%u)", blk,
-                  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+                        (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
                goto out_buf;
        }
        ret = read_blk(info, blk, buf);
        if (ret < 0) {
-                q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+                quota_error(dquot->dq_sb, "Can't read quota data block %u",
+                            blk);
                goto out_buf;
        }
        dh = (struct qt_disk_dqdbheader *)buf;
@@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                if (ret >= 0)
                        ret = put_free_dqblk(info, buf, blk);
                if (ret < 0) {
-                        q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
+                        quota_error(dquot->dq_sb, "Can't move quota data block "
-                          "to free list.\n", blk);
+                                    "(%u) to free list", blk);
                        goto out_buf;
                }
        } else {
@@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                        /* Insert will write block itself */
                        ret = insert_free_dqentry(info, buf, blk);
                        if (ret < 0) {
-                                q_warn(KERN_ERR "VFS: Can't insert quota data "
+                                quota_error(dquot->dq_sb, "Can't insert quota "
-                                       "block (%u) to free entry list.\n", blk);
+                                    "data block (%u) to free entry list", blk);
                                goto out_buf;
                        }
                } else {
                        ret = write_blk(info, blk, buf);
                        if (ret < 0) {
-                                q_warn(KERN_ERR "VFS: Can't write quota data "
+                                quota_error(dquot->dq_sb, "Can't write quota "
-                                  "block %u\n", blk);
+                                            "data block %u", blk);
                                goto out_buf;
                        }
                }
@@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                return -ENOMEM;
        ret = read_blk(info, *blk, buf);
        if (ret < 0) {
-                q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+                quota_error(dquot->dq_sb, "Can't read quota data "
+                            "block %u", *blk);
                goto out_buf;
        }
        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                } else {
                        ret = write_blk(info, *blk, buf);
                        if (ret < 0)
-                                q_warn(KERN_ERR "VFS: Can't write quota tree "
+                                quota_error(dquot->dq_sb, "Can't write quota "
-                                  "block %u.\n", *blk);
+                                            "tree block %u", *blk);
                }
        }
 out_buf:
@@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
                return -ENOMEM;
        ret = read_blk(info, blk, buf);
        if (ret < 0) {
-                q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+                quota_error(dquot->dq_sb, "Can't read quota tree "
+                            "block %u", blk);
                goto out_buf;
        }
        ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
                ddquot += info->dqi_entry_size;
        }
        if (i == qtree_dqstr_in_blk(info)) {
-                q_warn(KERN_ERR "VFS: Quota for id %u referenced "
+                quota_error(dquot->dq_sb, "Quota for id %u referenced "
-                  "but not present.\n", dquot->dq_id);
+                            "but not present", dquot->dq_id);
                ret = -EIO;
                goto out_buf;
        } else {
@@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
                return -ENOMEM;
        ret = read_blk(info, blk, buf);
        if (ret < 0) {
-                q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+                quota_error(dquot->dq_sb, "Can't read quota tree block %u",
+                            blk);
                goto out_buf;
        }
        ret = 0;
@@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 #ifdef __QUOTA_QT_PARANOIA
        /* Invalidated quota? */
        if (!sb_dqopt(dquot->dq_sb)->files[type]) {
-                printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+                quota_error(sb, "Quota invalidated while reading!");
                return -EIO;
        }
 #endif
@@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
                offset = find_dqentry(info, dquot);
                if (offset <= 0) {      /* Entry not present? */
                        if (offset < 0)
-                                q_warn(KERN_ERR "VFS: Can't read quota "
+                                quota_error(sb, "Can't read quota structure "
-                                  "structure for id %u.\n", dquot->dq_id);
+                                            "for id %u", dquot->dq_id);
                        dquot->dq_off = 0;
                        set_bit(DQ_FAKE_B, &dquot->dq_flags);
                        memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
        if (ret != info->dqi_entry_size) {
                if (ret >= 0)
                        ret = -EIO;
-                q_warn(KERN_ERR "VFS: Error while reading quota "
+                quota_error(sb, "Error while reading quota structure for id %u",
-                                "structure for id %u.\n", dquot->dq_id);
+                            dquot->dq_id);
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
                memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
                kfree(ddquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index ccc3e71fb1d8..a1ab8db81a51 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,10 +22,4 @@ struct qt_disk_dqdbheader {
 #define QT_TREEOFF      1               /* Offset of tree in file in blocks */
-#define q_warn(fmt, args...) \
-do { \
-        if (printk_ratelimit()) \
-                printk(fmt, ## args); \
-} while(0)
 #endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 4af344c5852a..34b37a67bb16 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
                        (char *)&dqblk, sizeof(struct v1_disk_dqblk),
                        v1_dqoff(dquot->dq_id));
        if (ret != sizeof(struct v1_disk_dqblk)) {
-                printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+                quota_error(dquot->dq_sb, "dquota write failed");
-                        dquot->dq_sb->s_id);
                if (ret >= 0)
                        ret = -EIO;
                goto out;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 135206af1458..65444d29406b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
        size = sb->s_op->quota_read(sb, type, (char *)dqhead,
                                    sizeof(struct v2_disk_dqheader), 0);
        if (size != sizeof(struct v2_disk_dqheader)) {
-                q_warn(KERN_WARNING "quota_v2: Failed header read:"
+                quota_error(sb, "Failed header read: expected=%zu got=%zd",
-                       " expected=%zd got=%zd\n",
+                            sizeof(struct v2_disk_dqheader), size);
-                        sizeof(struct v2_disk_dqheader), size);
                return 0;
        }
        return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
        size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
               sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
        if (size != sizeof(struct v2_disk_dqinfo)) {
-                q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
+                quota_error(sb, "Can't read info structure");
-                        sb->s_id);
                return -1;
        }
        info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
        size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
               sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
        if (size != sizeof(struct v2_disk_dqinfo)) {
-                q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
+                quota_error(sb, "Can't write info structure");
-                        sb->s_id);
                return -1;
        }
        return 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0f22fdaf54ac..29db72203bde 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1221,7 +1221,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
                inode_set_bytes(inode,
                                to_real_used_space(inode, inode->i_blocks,
                                                   SD_V2_SIZE));
-                /* read persistent inode attributes from sd and initalise
+                /* read persistent inode attributes from sd and initialise
                   generic inode flags from them */
                REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
                sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
diff --git a/fs/splice.c b/fs/splice.c
index 740e6b9faf7a..efdbfece9932 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 {
        struct file *file = sd->u.file;
-        return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
+        return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+                              sd->flags);
 }
 /**
@@ -1371,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
-                        if (!out->f_op || !out->f_op->llseek ||
+                        if (!(out->f_mode & FMODE_PWRITE))
-                            out->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_out, sizeof(loff_t)))
                                return -EFAULT;
@@ -1392,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
-                        if (!in->f_op || !in->f_op->llseek ||
+                        if (!(in->f_mode & FMODE_PREAD))
-                            in->f_op->llseek == no_llseek)
                                return -EINVAL;
                        if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                                return -EFAULT;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1beaa739d0a6..1b27b5688f62 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
 * @mode: file permissions.
 *
 */
-int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
+int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
+                     mode_t mode)
 {
        struct sysfs_dirent *sd;
        struct iattr newattrs;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index f71246bebfe4..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
        struct sysfs_dirent *target_sd = NULL;
        struct sysfs_dirent *sd = NULL;
        struct sysfs_addrm_cxt acxt;
+        enum kobj_ns_type ns_type;
        int error;
        BUG_ON(!name);
@@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
        if (!sd)
                goto out_put;
-        if (sysfs_ns_type(parent_sd))
+        ns_type = sysfs_ns_type(parent_sd);
+        if (ns_type)
                sd->s_ns = target->ktype->namespace(target);
        sd->s_symlink.target_sd = target_sd;
        target_sd = NULL;       /* reference is now owned by the symlink */
        sysfs_addrm_start(&acxt, parent_sd);
-        if (warn)
+        /* Symlinks must be between directories with the same ns_type */
-                error = sysfs_add_one(&acxt, sd);
+        if (!ns_type ||
-        else
+            (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
-                error = __sysfs_add_one(&acxt, sd);
+                if (warn)
+                        error = sysfs_add_one(&acxt, sd);
+                else
+                        error = __sysfs_add_one(&acxt, sd);
+        } else {
+                error = -EINVAL;
+                WARN(1, KERN_WARNING
+                        "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
+                        parent_sd->s_name,
+                        sd->s_name,
+                        sd->s_symlink.target_sd->s_parent->s_name,
+                        sd->s_symlink.target_sd->s_name);
+        }
        sysfs_addrm_finish(&acxt);
        if (error)
@@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 {
        const void *ns = NULL;
        spin_lock(&sysfs_assoc_lock);
-        if (targ->sd)
+        if (targ->sd && sysfs_ns_type(kobj->sd))
                ns = targ->sd->s_ns;
        spin_unlock(&sysfs_assoc_lock);
        sysfs_hash_and_remove(kobj->sd, ns, name);
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827ea..0084a33c4c69 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
                shft -= UBIFS_LPT_FANOUT_SHIFT;
                nnode = ubifs_get_nnode(c, nnode, iip);
                if (IS_ERR(nnode))
-                        return ERR_PTR(PTR_ERR(nnode));
+                        return ERR_CAST(nnode);
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
        shft -= UBIFS_LPT_FANOUT_SHIFT;
        pnode = ubifs_get_pnode(c, nnode, iip);
        if (IS_ERR(pnode))
-                return ERR_PTR(PTR_ERR(pnode));
+                return ERR_CAST(pnode);
        iip = (i & (UBIFS_LPT_FANOUT - 1));
        dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
               pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
        nnode = c->nroot;
        nnode = dirty_cow_nnode(c, nnode);
        if (IS_ERR(nnode))
-                return ERR_PTR(PTR_ERR(nnode));
+                return ERR_CAST(nnode);
        i = lnum - c->main_first;
        shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
        for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
                shft -= UBIFS_LPT_FANOUT_SHIFT;
                nnode = ubifs_get_nnode(c, nnode, iip);
                if (IS_ERR(nnode))
-                        return ERR_PTR(PTR_ERR(nnode));
+                        return ERR_CAST(nnode);
                nnode = dirty_cow_nnode(c, nnode);
                if (IS_ERR(nnode))
-                        return ERR_PTR(PTR_ERR(nnode));
+                        return ERR_CAST(nnode);
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
        shft -= UBIFS_LPT_FANOUT_SHIFT;
        pnode = ubifs_get_pnode(c, nnode, iip);
        if (IS_ERR(pnode))
-                return ERR_PTR(PTR_ERR(pnode));
+                return ERR_CAST(pnode);
        pnode = dirty_cow_pnode(c, pnode);
        if (IS_ERR(pnode))
-                return ERR_PTR(PTR_ERR(pnode));
+                return ERR_CAST(pnode);
        iip = (i & (UBIFS_LPT_FANOUT - 1));
        dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
               pnode->lprops[iip].free, pnode->lprops[iip].dirty,
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237bf..d12535b7fc78 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
                shft -= UBIFS_LPT_FANOUT_SHIFT;
                nnode = ubifs_get_nnode(c, nnode, iip);
                if (IS_ERR(nnode))
-                        return ERR_PTR(PTR_ERR(nnode));
+                        return ERR_CAST(nnode);
        }
        iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
        return ubifs_get_pnode(c, nnode, iip);
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb5..daae9e1f5382 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
 * This file implements functions needed to recover from unclean un-mounts.
 * When UBIFS is mounted, it checks a flag on the master node to determine if
 * an un-mount was completed successfully. If not, the process of mounting
- * incorparates additional checking and fixing of on-flash data structures.
+ * incorporates additional checking and fixing of on-flash data structures.
 * UBIFS always cleans away all remnants of an unclean un-mount, so that
 * errors do not accumulate. However UBIFS defers recovery if it is mounted
 * read-only, and the flash is not modified in that case.
@@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
        }
        err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
        if (err) {
-                if (err == -ENOSPC)
+                /*
-                        dbg_err("could not find a dirty LEB");
+                 * There are no dirty or empty LEBs subject to here being
+                 * enough for the index. Try to use
+                 * 'ubifs_find_free_leb_for_idx()', which will return any empty
+                 * LEBs (ignoring index requirements). If the index then
+                 * doesn't have enough LEBs the recovery commit will fail -
+                 * which is the same result anyway i.e. recovery fails. So
+                 * there is no problem ignoring index requirements and just
+                 * grabbing a free LEB since we have already established there
+                 * is not a dirty LEB we could have used instead.
+                 */
+                if (err == -ENOSPC) {
+                        dbg_rcvry("could not find a dirty LEB");
+                        goto find_free;
+                }
                return err;
        }
        ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 find_free:
        /*
         * There is no GC head LEB or the free space in the GC head LEB is too
-         * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
+         * small, or there are no dirty LEBs. Allocate gc_lnum by calling
-         * GC is not run.
+         * 'ubifs_find_free_leb_for_idx()' so GC is not run.
         */
        lnum = ubifs_find_free_leb_for_idx(c);
        if (lnum < 0) {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
        return 0;
 }
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
        int freed, contention = 0;
        long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3f..5fc5a0988970 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1307,6 +1307,8 @@ static int mount_ubifs(struct ubifs_info *c)
                        if (err)
                                goto out_orphans;
                        err = ubifs_rcvry_gc_commit(c);
+                        if (err)
+                                goto out_orphans;
                } else {
                        err = take_gc_lnum(c);
                        if (err)
@@ -1318,7 +1320,7 @@ static int mount_ubifs(struct ubifs_info *c)
                         */
                        err = ubifs_leb_unmap(c, c->gc_lnum);
                        if (err)
-                                return err;
+                                goto out_orphans;
                }
                err = dbg_check_lprops(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2eef553d50c8..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 /* commit.c */
 int ubifs_bg_thread(void *info);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 94e06d6bddbd..6e450e01a1bb 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,7 +36,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
-#include <linux/smp_lock.h>
 #include "udf_i.h"
 #include "udf_sb.h"
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 612d1e2e285a..12bb651e5400 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1579,9 +1579,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
 {
        struct anchorVolDescPtr *anchor;
        long main_s, main_e, reserve_s, reserve_e;
-        struct udf_sb_info *sbi;
-        sbi = UDF_SB(sb);
        anchor = (struct anchorVolDescPtr *)bh->b_data;
        /* Locate the main sequence */
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c8fb13f83b3f..0dce969d6cad 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -87,11 +87,9 @@ xfs-y				+= xfs_alloc.o \
                                   xfs_trans_buf.o \
                                   xfs_trans_extfree.o \
                                   xfs_trans_inode.o \
-                                   xfs_trans_item.o \
                                   xfs_utils.o \
                                   xfs_vnodeops.o \
-                                   xfs_rw.o \
+                                   xfs_rw.o
-                                   xfs_dmops.o
 xfs-$(CONFIG_XFS_TRACE)         += xfs_btree_trace.o
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 9f769b5b38fc..b2771862fd3d 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
        struct posix_acl *acl;
        int error = -EAGAIN;
-        xfs_itrace_entry(ip);
+        trace_xfs_check_acl(ip);
        /*
         * If there is no attribute fork no ACL exists on this inode and
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34640d6dbdcb..d24e78f32f3e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_trans.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
 #include "xfs_iomap.h"
@@ -92,18 +85,15 @@ void
 xfs_count_page_state(
        struct page             *page,
        int                     *delalloc,
-        int                     *unmapped,
        int                     *unwritten)
 {
        struct buffer_head      *bh, *head;
-        *delalloc = *unmapped = *unwritten = 0;
+        *delalloc = *unwritten = 0;
        bh = head = page_buffers(page);
        do {
-                if (buffer_uptodate(bh) && !buffer_mapped(bh))
+                if (buffer_unwritten(bh))
-                        (*unmapped) = 1;
-                else if (buffer_unwritten(bh))
                        (*unwritten) = 1;
                else if (buffer_delay(bh))
                        (*delalloc) = 1;
@@ -212,23 +202,17 @@ xfs_setfilesize(
 }
 /*
- * Schedule IO completion handling on a xfsdatad if this was
+ * Schedule IO completion handling on the final put of an ioend.
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
 */
 STATIC void
 xfs_finish_ioend(
-        xfs_ioend_t     *ioend,
+        struct xfs_ioend        *ioend)
-        int             wait)
 {
        if (atomic_dec_and_test(&ioend->io_remaining)) {
-                struct workqueue_struct *wq;
+                if (ioend->io_type == IO_UNWRITTEN)
+                        queue_work(xfsconvertd_workqueue, &ioend->io_work);
-                wq = (ioend->io_type == IO_UNWRITTEN) ?
+                else
-                        xfsconvertd_workqueue : xfsdatad_workqueue;
+                        queue_work(xfsdatad_workqueue, &ioend->io_work);
-                queue_work(wq, &ioend->io_work);
-                if (wait)
-                        flush_workqueue(wq);
        }
 }
@@ -272,11 +256,25 @@ xfs_end_io(
         */
        if (error == EAGAIN) {
                atomic_inc(&ioend->io_remaining);
-                xfs_finish_ioend(ioend, 0);
+                xfs_finish_ioend(ioend);
                /* ensure we don't spin on blocked ioends */
                delay(1);
-        } else
+        } else {
+                if (ioend->io_iocb)
+                        aio_complete(ioend->io_iocb, ioend->io_result, 0);
                xfs_destroy_ioend(ioend);
+        }
+}
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+        struct xfs_ioend        *ioend)
+{
+        if (atomic_dec_and_test(&ioend->io_remaining))
+                xfs_end_io(&ioend->io_work);
 }
 /*
@@ -309,6 +307,8 @@ xfs_alloc_ioend(
        atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
        ioend->io_offset = 0;
        ioend->io_size = 0;
+        ioend->io_iocb = NULL;
+        ioend->io_result = 0;
        INIT_WORK(&ioend->io_work, xfs_end_io);
        return ioend;
@@ -358,7 +358,7 @@ xfs_end_bio(
        bio->bi_end_io = NULL;
        bio_put(bio);
-        xfs_finish_ioend(ioend, 0);
+        xfs_finish_ioend(ioend);
 }
 STATIC void
@@ -500,7 +500,7 @@ xfs_submit_ioend(
                }
                if (bio)
                        xfs_submit_ioend_bio(wbc, ioend, bio);
-                xfs_finish_ioend(ioend, 0);
+                xfs_finish_ioend(ioend);
        } while ((ioend = next) != NULL);
 }
@@ -614,31 +614,30 @@ xfs_map_at_offset(
 STATIC unsigned int
 xfs_probe_page(
        struct page             *page,
-        unsigned int            pg_offset,
+        unsigned int            pg_offset)
-        int                     mapped)
 {
+        struct buffer_head      *bh, *head;
        int                     ret = 0;
        if (PageWriteback(page))
                return 0;
+        if (!PageDirty(page))
+                return 0;
+        if (!page->mapping)
+                return 0;
+        if (!page_has_buffers(page))
+                return 0;
-        if (page->mapping && PageDirty(page)) {
+        bh = head = page_buffers(page);
-                if (page_has_buffers(page)) {
+        do {
-                        struct buffer_head      *bh, *head;
+                if (!buffer_uptodate(bh))
+                        break;
-                        bh = head = page_buffers(page);
+                if (!buffer_mapped(bh))
-                        do {
+                        break;
-                                if (!buffer_uptodate(bh))
+                ret += bh->b_size;
-                                        break;
+                if (ret >= pg_offset)
-                                if (mapped != buffer_mapped(bh))
+                        break;
-                                        break;
+        } while ((bh = bh->b_this_page) != head);
-                                ret += bh->b_size;
-                                if (ret >= pg_offset)
-                                        break;
-                        } while ((bh = bh->b_this_page) != head);
-                } else
-                        ret = mapped ? 0 : PAGE_CACHE_SIZE;
-        }
        return ret;
 }
@@ -648,8 +647,7 @@ xfs_probe_cluster(
        struct inode            *inode,
        struct page             *startpage,
        struct buffer_head      *bh,
-        struct buffer_head      *head,
+        struct buffer_head      *head)
-        int                     mapped)
 {
        struct pagevec          pvec;
        pgoff_t                 tindex, tlast, tloff;
@@ -658,7 +656,7 @@ xfs_probe_cluster(
        /* First sum forwards in this page */
        do {
-                if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+                if (!buffer_uptodate(bh) || !buffer_mapped(bh))
                        return total;
                total += bh->b_size;
        } while ((bh = bh->b_this_page) != head);
@@ -692,7 +690,7 @@ xfs_probe_cluster(
                                pg_offset = PAGE_CACHE_SIZE;
                        if (page->index == tindex && trylock_page(page)) {
-                                pg_len = xfs_probe_page(page, pg_offset, mapped);
+                                pg_len = xfs_probe_page(page, pg_offset);
                                unlock_page(page);
                        }
@@ -761,7 +759,6 @@ xfs_convert_page(
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
        struct writeback_control *wbc,
-        int                     startio,
        int                     all_bh)
 {
        struct buffer_head      *bh, *head;
@@ -832,19 +829,14 @@ xfs_convert_page(
                        ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
                        xfs_map_at_offset(inode, bh, imap, offset);
-                        if (startio) {
+                        xfs_add_to_ioend(inode, bh, offset, type,
-                                xfs_add_to_ioend(inode, bh, offset,
+                                         ioendp, done);
-                                                type, ioendp, done);
-                        } else {
-                                set_buffer_dirty(bh);
-                                unlock_buffer(bh);
-                                mark_buffer_dirty(bh);
-                        }
                        page_dirty--;
                        count++;
                } else {
                        type = IO_NEW;
-                        if (buffer_mapped(bh) && all_bh && startio) {
+                        if (buffer_mapped(bh) && all_bh) {
                                lock_buffer(bh);
                                xfs_add_to_ioend(inode, bh, offset,
                                                type, ioendp, done);
@@ -859,14 +851,12 @@ xfs_convert_page(
        if (uptodate && bh == head)
                SetPageUptodate(page);
-        if (startio) {
+        if (count) {
-                if (count) {
+                wbc->nr_to_write--;
-                        wbc->nr_to_write--;
+                if (wbc->nr_to_write <= 0)
-                        if (wbc->nr_to_write <= 0)
+                        done = 1;
-                                done = 1;
-                }
-                xfs_start_page_writeback(page, !page_dirty, count);
        }
+        xfs_start_page_writeback(page, !page_dirty, count);
        return done;
 fail_unlock_page:
@@ -886,7 +876,6 @@ xfs_cluster_write(
        struct xfs_bmbt_irec    *imap,
        xfs_ioend_t             **ioendp,
        struct writeback_control *wbc,
-        int                     startio,
        int                     all_bh,
        pgoff_t                 tlast)
 {
@@ -902,7 +891,7 @@ xfs_cluster_write(
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                        imap, ioendp, wbc, startio, all_bh);
+                                        imap, ioendp, wbc, all_bh);
                        if (done)
                                break;
                }
@@ -981,7 +970,7 @@ xfs_aops_discard_page(
                 */
                error = xfs_bmapi(NULL, ip, offset_fsb, 1,
                                XFS_BMAPI_ENTIRE,  NULL, 0, &imap,
-                                &nimaps, NULL, NULL);
+                                &nimaps, NULL);
                if (error) {
                        /* something screwed, just bail */
@@ -1009,7 +998,7 @@ xfs_aops_discard_page(
                 */
                xfs_bmap_init(&flist, &firstblock);
                error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
-                                        &flist, NULL, &done);
+                                        &flist, &done);
                ASSERT(!flist.xbf_count && !flist.xbf_first);
                if (error) {
@@ -1032,50 +1021,66 @@ out_invalidate:
 }
 /*
- * Calling this without startio set means we are being asked to make a dirty
+ * Write out a dirty page.
- * page ready for freeing it's buffers.  When called with startio set then
+ *
- * we are coming from writepage.
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
 *
- * When called with startio set it is important that we write the WHOLE
+ * If we detect that a transaction would be required to flush the page, we
- * page if possible.
+ * have to check the process flags first, if we are already in a transaction
- * The bh->b_state's cannot know if any of the blocks or which block for
+ * or disk I/O during allocations is off, we need to fail the writepage and
- * that matter are dirty due to mmap writes, and therefore bh uptodate is
+ * redirty the page.
- * only valid if the page itself isn't completely uptodate.  Some layers
- * may clear the page dirty flag prior to calling write page, under the
- * assumption the entire page will be written out; by not writing out the
- * whole page the page can be reused before all valid dirty data is
- * written out.  Note: in the case of a page that has been dirty'd by
- * mapwrite and but partially setup by block_prepare_write the
- * bh->b_states's will not agree and only ones setup by BPW/BCW will have
- * valid state, thus the whole page must be written out thing.
 */
 STATIC int
-xfs_page_state_convert(
+xfs_vm_writepage(
-        struct inode    *inode,
+        struct page             *page,
-        struct page     *page,
+        struct writeback_control *wbc)
-        struct writeback_control *wbc,
-        int             startio,
-        int             unmapped) /* also implies page uptodate */
 {
+        struct inode            *inode = page->mapping->host;
+        int                     delalloc, unwritten;
        struct buffer_head      *bh, *head;
        struct xfs_bmbt_irec    imap;
        xfs_ioend_t             *ioend = NULL, *iohead = NULL;
        loff_t                  offset;
-        unsigned long           p_offset = 0;
        unsigned int            type;
        __uint64_t              end_offset;
        pgoff_t                 end_index, last_index;
        ssize_t                 size, len;
        int                     flags, err, imap_valid = 0, uptodate = 1;
-        int                     page_dirty, count = 0;
+        int                     count = 0;
-        int                     trylock = 0;
+        int                     all_bh = 0;
-        int                     all_bh = unmapped;
-        if (startio) {
+        trace_xfs_writepage(inode, page, 0);
-                if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
-                        trylock |= BMAPI_TRYLOCK;
+        ASSERT(page_has_buffers(page));
-        }
+        /*
+         * Refuse to write the page out if we are called from reclaim context.
+         *
+         * This avoids stack overflows when called from deeply used stacks in
+         * random callers for direct reclaim or memcg reclaim.  We explicitly
+         * allow reclaim from kswapd as the stack usage there is relatively low.
+         *
+         * This should really be done by the core VM, but until that happens
+         * filesystems like XFS, btrfs and ext4 have to take care of this
+         * by themselves.
+         */
+        if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+                goto out_fail;
+        /*
+         * We need a transaction if there are delalloc or unwritten buffers
+         * on the page.
+         *
+         * If we need a transaction and the process flags say we are already
+         * in a transaction, or no IO is allowed then mark the page dirty
+         * again and leave the page as is.
+         */
+        xfs_count_page_state(page, &delalloc, &unwritten);
+        if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
+                goto out_fail;
        /* Is this page beyond the end of the file? */
        offset = i_size_read(inode);
@@ -1084,50 +1089,33 @@ xfs_page_state_convert(
        if (page->index >= end_index) {
                if ((page->index >= end_index + 1) ||
                    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-                        if (startio)
+                        unlock_page(page);
-                                unlock_page(page);
                        return 0;
                }
        }
-        /*
-         * page_dirty is initially a count of buffers on the page before
-         * EOF and is decremented as we move each into a cleanable state.
-         *
-         * Derivation:
-         *
-         * End offset is the highest offset that this page should represent.
-         * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-         * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-         * hence give us the correct page_dirty count. On any other page,
-         * it will be zero and in that case we need page_dirty to be the
-         * count of buffers on the page.
-         */
        end_offset = min_t(unsigned long long,
-                        (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
+                        (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+                        offset);
        len = 1 << inode->i_blkbits;
-        p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-                                        PAGE_CACHE_SIZE);
-        p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-        page_dirty = p_offset / len;
        bh = head = page_buffers(page);
        offset = page_offset(page);
        flags = BMAPI_READ;
        type = IO_NEW;
-        /* TODO: cleanup count and page_dirty */
        do {
                if (offset >= end_offset)
                        break;
                if (!buffer_uptodate(bh))
                        uptodate = 0;
-                if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
-                        /*
+                /*
-                         * the iomap is actually still valid, but the ioend
+                 * A hole may still be marked uptodate because discard_buffer
-                         * isn't.  shouldn't happen too often.
+                 * leaves the flag set.
-                         */
+                 */
+                if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+                        ASSERT(!buffer_dirty(bh));
                        imap_valid = 0;
                        continue;
                }
@@ -1135,19 +1123,7 @@ xfs_page_state_convert(
                if (imap_valid)
                        imap_valid = xfs_imap_valid(inode, &imap, offset);
-                /*
+                if (buffer_unwritten(bh) || buffer_delay(bh)) {
-                 * First case, map an unwritten extent and prepare for
-                 * extent state conversion transaction on completion.
-                 *
-                 * Second case, allocate space for a delalloc buffer.
-                 * We can return EAGAIN here in the release page case.
-                 *
-                 * Third case, an unmapped buffer was found, and we are
-                 * in a path where we need to write the whole page out.
-                 */
-                if (buffer_unwritten(bh) || buffer_delay(bh) ||
-                    ((buffer_uptodate(bh) || PageUptodate(page)) &&
-                     !buffer_mapped(bh) && (unmapped || startio))) {
                        int new_ioend = 0;
                        /*
@@ -1161,15 +1137,16 @@ xfs_page_state_convert(
                                flags = BMAPI_WRITE | BMAPI_IGNSTATE;
                        } else if (buffer_delay(bh)) {
                                type = IO_DELAY;
-                                flags = BMAPI_ALLOCATE | trylock;
+                                flags = BMAPI_ALLOCATE;
-                        } else {
-                                type = IO_NEW;
+                                if (wbc->sync_mode == WB_SYNC_NONE &&
-                                flags = BMAPI_WRITE | BMAPI_MMAP;
+                                    wbc->nonblocking)
+                                        flags |= BMAPI_TRYLOCK;
                        }
                        if (!imap_valid) {
                                /*
-                                 * if we didn't have a valid mapping then we
+                                 * If we didn't have a valid mapping then we
                                 * need to ensure that we put the new mapping
                                 * in a new ioend structure. This needs to be
                                 * done to ensure that the ioends correctly
@@ -1177,14 +1154,7 @@ xfs_page_state_convert(
                                 * for unwritten extent conversion.
                                 */
                                new_ioend = 1;
-                                if (type == IO_NEW) {
+                                err = xfs_map_blocks(inode, offset, len,
-                                        size = xfs_probe_cluster(inode,
-                                                        page, bh, head, 0);
-                                } else {
-                                        size = len;
-                                }
-                                err = xfs_map_blocks(inode, offset, size,
                                                &imap, flags);
                                if (err)
                                        goto error;
@@ -1193,19 +1163,11 @@ xfs_page_state_convert(
                        }
                        if (imap_valid) {
                                xfs_map_at_offset(inode, bh, &imap, offset);
-                                if (startio) {
+                                xfs_add_to_ioend(inode, bh, offset, type,
-                                        xfs_add_to_ioend(inode, bh, offset,
+                                                 &ioend, new_ioend);
-                                                        type, &ioend,
-                                                        new_ioend);
-                                } else {
-                                        set_buffer_dirty(bh);
-                                        unlock_buffer(bh);
-                                        mark_buffer_dirty(bh);
-                                }
-                                page_dirty--;
                                count++;
                        }
-                } else if (buffer_uptodate(bh) && startio) {
+                } else if (buffer_uptodate(bh)) {
                        /*
                         * we got here because the buffer is already mapped.
                         * That means it must already have extents allocated
@@ -1213,8 +1175,7 @@ xfs_page_state_convert(
                         */
                        if (!imap_valid || flags != BMAPI_READ) {
                                flags = BMAPI_READ;
-                                size = xfs_probe_cluster(inode, page, bh,
+                                size = xfs_probe_cluster(inode, page, bh, head);
-                                                                head, 1);
                                err = xfs_map_blocks(inode, offset, size,
                                                &imap, flags);
                                if (err)
@@ -1233,18 +1194,16 @@ xfs_page_state_convert(
                         */
                        type = IO_NEW;
                        if (trylock_buffer(bh)) {
-                                ASSERT(buffer_mapped(bh));
                                if (imap_valid)
                                        all_bh = 1;
                                xfs_add_to_ioend(inode, bh, offset, type,
                                                &ioend, !imap_valid);
-                                page_dirty--;
                                count++;
                        } else {
                                imap_valid = 0;
                        }
-                } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
+                } else if (PageUptodate(page)) {
-                           (unmapped || startio)) {
+                        ASSERT(buffer_mapped(bh));
                        imap_valid = 0;
                }
@@ -1256,8 +1215,7 @@ xfs_page_state_convert(
        if (uptodate && bh == head)
                SetPageUptodate(page);
-        if (startio)
+        xfs_start_page_writeback(page, 1, count);
-                xfs_start_page_writeback(page, 1, count);
        if (ioend && imap_valid) {
                xfs_off_t               end_index;
@@ -1275,131 +1233,27 @@ xfs_page_state_convert(
                        end_index = last_index;
                xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                        wbc, startio, all_bh, end_index);
+                                        wbc, all_bh, end_index);
        }
        if (iohead)
                xfs_submit_ioend(wbc, iohead);
-        return page_dirty;
+        return 0;
 error:
        if (iohead)
                xfs_cancel_ioend(iohead);
-        /*
+        xfs_aops_discard_page(page);
-         * If it's delalloc and we have nowhere to put it,
+        ClearPageUptodate(page);
-         * throw it away, unless the lower layers told
+        unlock_page(page);
-         * us to try again.
-         */
-        if (err != -EAGAIN) {
-                if (!unmapped)
-                        xfs_aops_discard_page(page);
-                ClearPageUptodate(page);
-        }
        return err;
-}
-/*
- * writepage: Called from one of two places:
- *
- * 1. we are flushing a delalloc buffer head.
- *
- * 2. we are writing out a dirty page. Typically the page dirty
- *    state is cleared before we get here. In this case is it
- *    conceivable we have no buffer heads.
- *
- * For delalloc space on the page we need to allocate space and
- * flush it. For unmapped buffer heads on the page we should
- * allocate space if the page is uptodate. For any other dirty
- * buffer heads on the page we should flush them.
- *
- * If we detect that a transaction would be required to flush
- * the page, we have to check the process flags first, if we
- * are already in a transaction or disk I/O during allocations
- * is off, we need to fail the writepage and redirty the page.
- */
-STATIC int
-xfs_vm_writepage(
-        struct page             *page,
-        struct writeback_control *wbc)
-{
-        int                     error;
-        int                     need_trans;
-        int                     delalloc, unmapped, unwritten;
-        struct inode            *inode = page->mapping->host;
-        trace_xfs_writepage(inode, page, 0);
-        /*
-         * Refuse to write the page out if we are called from reclaim context.
-         *
-         * This is primarily to avoid stack overflows when called from deep
-         * used stacks in random callers for direct reclaim, but disabling
-         * reclaim for kswap is a nice side-effect as kswapd causes rather
-         * suboptimal I/O patters, too.
-         *
-         * This should really be done by the core VM, but until that happens
-         * filesystems like XFS, btrfs and ext4 have to take care of this
-         * by themselves.
-         */
-        if (current->flags & PF_MEMALLOC)
-                goto out_fail;
-        /*
-         * We need a transaction if:
-         *  1. There are delalloc buffers on the page
-         *  2. The page is uptodate and we have unmapped buffers
-         *  3. The page is uptodate and we have no buffers
-         *  4. There are unwritten buffers on the page
-         */
-        if (!page_has_buffers(page)) {
-                unmapped = 1;
-                need_trans = 1;
-        } else {
-                xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-                if (!PageUptodate(page))
-                        unmapped = 0;
-                need_trans = delalloc + unmapped + unwritten;
-        }
-        /*
-         * If we need a transaction and the process flags say
-         * we are already in a transaction, or no IO is allowed
-         * then mark the page dirty again and leave the page
-         * as is.
-         */
-        if (current_test_flags(PF_FSTRANS) && need_trans)
-                goto out_fail;
-        /*
-         * Delay hooking up buffer heads until we have
-         * made our go/no-go decision.
-         */
-        if (!page_has_buffers(page))
-                create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-        /*
-         * Convert delayed allocate, unwritten or unmapped space
-         * to real space and flush out to disk.
-         */
-        error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
-        if (error == -EAGAIN)
-                goto out_fail;
-        if (unlikely(error < 0))
-                goto out_unlock;
-        return 0;
 out_fail:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
-out_unlock:
-        unlock_page(page);
-        return error;
 }
 STATIC int
@@ -1413,65 +1267,27 @@ xfs_vm_writepages(
 /*
 * Called to move a page into cleanable state - and from there
- * to be released. Possibly the page is already clean. We always
+ * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
- * Returns 0 if the page is ok to release, 1 otherwise.
+ * Returns 1 if the page is ok to release, 0 otherwise.
- *
- * Possible scenarios are:
- *
- * 1. We are being called to release a page which has been written
- *    to via regular I/O. buffer heads will be dirty and possibly
- *    delalloc. If no delalloc buffer heads in this case then we
- *    can just return zero.
- *
- * 2. We are called to release a page which has been written via
- *    mmap, all we need to do is ensure there is no delalloc
- *    state in the buffer heads, if not we can let the caller
- *    free them and we should come back later via writepage.
 */
 STATIC int
 xfs_vm_releasepage(
        struct page             *page,
        gfp_t                   gfp_mask)
 {
-        struct inode            *inode = page->mapping->host;
+        int                     delalloc, unwritten;
-        int                     dirty, delalloc, unmapped, unwritten;
-        struct writeback_control wbc = {
-                .sync_mode = WB_SYNC_ALL,
-                .nr_to_write = 1,
-        };
-        trace_xfs_releasepage(inode, page, 0);
+        trace_xfs_releasepage(page->mapping->host, page, 0);
-        if (!page_has_buffers(page))
-                return 0;
-        xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
+        xfs_count_page_state(page, &delalloc, &unwritten);
-        if (!delalloc && !unwritten)
-                goto free_buffers;
-        if (!(gfp_mask & __GFP_FS))
+        if (WARN_ON(delalloc))
                return 0;
+        if (WARN_ON(unwritten))
-        /* If we are already inside a transaction or the thread cannot
-         * do I/O, we cannot release this page.
-         */
-        if (current_test_flags(PF_FSTRANS))
                return 0;
-        /*
-         * Convert delalloc space to real space, do not flush the
-         * data out to disk, that will be done by the caller.
-         * Never need to allocate space here - we will always
-         * come back to writepage in that case.
-         */
-        dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
-        if (dirty == 0 && !unwritten)
-                goto free_buffers;
-        return 0;
-free_buffers:
        return try_to_free_buffers(page);
 }
@@ -1481,9 +1297,9 @@ __xfs_get_blocks(
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create,
-        int                     direct,
+        int                     direct)
-        bmapi_flags_t           flags)
 {
+        int                     flags = create ? BMAPI_WRITE : BMAPI_READ;
        struct xfs_bmbt_irec    imap;
        xfs_off_t               offset;
        ssize_t                 size;
@@ -1498,8 +1314,11 @@ __xfs_get_blocks(
        if (!create && direct && offset >= i_size_read(inode))
                return 0;
-        error = xfs_iomap(XFS_I(inode), offset, size,
+        if (direct && create)
-                             create ? flags : BMAPI_READ, &imap, &nimap, &new);
+                flags |= BMAPI_DIRECT;
+        error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
+                          &new);
        if (error)
                return -error;
        if (nimap == 0)
@@ -1579,8 +1398,7 @@ xfs_get_blocks(
        struct buffer_head      *bh_result,
        int                     create)
 {
-        return __xfs_get_blocks(inode, iblock,
+        return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
-                                bh_result, create, 0, BMAPI_WRITE);
 }
 STATIC int
@@ -1590,61 +1408,59 @@ xfs_get_blocks_direct(
        struct buffer_head      *bh_result,
        int                     create)
 {
-        return __xfs_get_blocks(inode, iblock,
+        return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
-                                bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents.  In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done.  But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions.  In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
 STATIC void
-xfs_end_io_direct(
+xfs_end_io_direct_write(
-        struct kiocb    *iocb,
+        struct kiocb            *iocb,
-        loff_t          offset,
+        loff_t                  offset,
-        ssize_t         size,
+        ssize_t                 size,
-        void            *private)
+        void                    *private,
+        int                     ret,
+        bool                    is_async)
 {
-        xfs_ioend_t     *ioend = iocb->private;
+        struct xfs_ioend        *ioend = iocb->private;
        /*
-         * Non-NULL private data means we need to issue a transaction to
+         * blockdev_direct_IO can return an error even after the I/O
-         * convert a range from unwritten to written extents.  This needs
+         * completion handler was called.  Thus we need to protect
-         * to happen from process context but aio+dio I/O completion
+         * against double-freeing.
-         * happens from irq context so we need to defer it to a workqueue.
-         * This is not necessary for synchronous direct I/O, but we do
-         * it anyway to keep the code uniform and simpler.
-         *
-         * Well, if only it were that simple. Because synchronous direct I/O
-         * requires extent conversion to occur *before* we return to userspace,
-         * we have to wait for extent conversion to complete. Look at the
-         * iocb that has been passed to us to determine if this is AIO or
-         * not. If it is synchronous, tell xfs_finish_ioend() to kick the
-         * workqueue and wait for it to complete.
-         *
-         * The core direct I/O code might be changed to always call the
-         * completion handler in the future, in which case all this can
-         * go away.
         */
+        iocb->private = NULL;
        ioend->io_offset = offset;
        ioend->io_size = size;
-        if (ioend->io_type == IO_READ) {
+        if (private && size > 0)
-                xfs_finish_ioend(ioend, 0);
+                ioend->io_type = IO_UNWRITTEN;
-        } else if (private && size > 0) {
-                xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
+        if (is_async) {
-        } else {
                /*
-                 * A direct I/O write ioend starts it's life in unwritten
+                 * If we are converting an unwritten extent we need to delay
-                 * state in case they map an unwritten extent.  This write
+                 * the AIO completion until after the unwritten extent
-                 * didn't map an unwritten extent so switch it's completion
+                 * conversion has completed, otherwise do it ASAP.
-                 * handler.
                 */
-                ioend->io_type = IO_NEW;
+                if (ioend->io_type == IO_UNWRITTEN) {
-                xfs_finish_ioend(ioend, 0);
+                        ioend->io_iocb = iocb;
+                        ioend->io_result = ret;
+                } else {
+                        aio_complete(iocb, ret, 0);
+                }
+                xfs_finish_ioend(ioend);
+        } else {
+                xfs_finish_ioend_sync(ioend);
        }
-        /*
-         * blockdev_direct_IO can return an error even after the I/O
-         * completion handler was called.  Thus we need to protect
-         * against double-freeing.
-         */
-        iocb->private = NULL;
 }
 STATIC ssize_t
@@ -1655,23 +1471,26 @@ xfs_vm_direct_IO(
        loff_t                  offset,
        unsigned long           nr_segs)
 {
-        struct file     *file = iocb->ki_filp;
+        struct inode            *inode = iocb->ki_filp->f_mapping->host;
-        struct inode    *inode = file->f_mapping->host;
+        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
-        struct block_device *bdev;
+        ssize_t                 ret;
-        ssize_t         ret;
+        if (rw & WRITE) {
-        bdev = xfs_find_bdev_for_inode(inode);
+                iocb->private = xfs_alloc_ioend(inode, IO_NEW);
-        iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+                ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
-                                        IO_UNWRITTEN : IO_READ);
+                                                    offset, nr_segs,
+                                                    xfs_get_blocks_direct,
-        ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+                                                    xfs_end_io_direct_write);
-                                            offset, nr_segs,
+                if (ret != -EIOCBQUEUED && iocb->private)
-                                            xfs_get_blocks_direct,
+                        xfs_destroy_ioend(iocb->private);
-                                            xfs_end_io_direct);
+        } else {
+                ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+                                                    offset, nr_segs,
+                                                    xfs_get_blocks_direct,
+                                                    NULL);
+        }
-        if (unlikely(ret != -EIOCBQUEUED && iocb->private))
-                xfs_destroy_ioend(iocb->private);
        return ret;
 }
@@ -1686,8 +1505,8 @@ xfs_vm_write_begin(
        void                    **fsdata)
 {
        *pagep = NULL;
-        return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        return block_write_begin(file, mapping, pos, len, flags | AOP_FLAG_NOFS,
-                                                                xfs_get_blocks);
+                                 pagep, fsdata, xfs_get_blocks);
 }
 STATIC sector_t
@@ -1698,7 +1517,7 @@ xfs_vm_bmap(
        struct inode            *inode = (struct inode *)mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        xfs_itrace_entry(XFS_I(inode));
+        trace_xfs_vm_bmap(XFS_I(inode));
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df8..c5057fb6237a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
        struct work_struct      io_work;        /* xfsdatad work queue */
+        struct kiocb            *io_iocb;
+        int                     io_result;
 } xfs_ioend_t;
 extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 extern void xfs_ioend_init(void);
 extern void xfs_ioend_wait(struct xfs_inode *);
-extern void xfs_count_page_state(struct page *, int *, int *, int *);
+extern void xfs_count_page_state(struct page *, int *, int *);
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef598..ea79072f5210 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -39,13 +39,12 @@
 #include "xfs_inum.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trace.h"
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 static struct shrinker xfs_buf_shake = {
        .shrink = xfsbufd_wakeup,
@@ -340,7 +339,7 @@ _xfs_buf_lookup_pages(
                                        __func__, gfp_mask);
                        XFS_STATS_INC(xb_page_retries);
-                        xfsbufd_wakeup(0, gfp_mask);
+                        xfsbufd_wakeup(NULL, 0, gfp_mask);
                        congestion_wait(BLK_RW_ASYNC, HZ/50);
                        goto retry;
                }
@@ -579,9 +578,9 @@ _xfs_buf_read(
                        XBF_READ_AHEAD | _XBF_RUN_QUEUES);
        status = xfs_buf_iorequest(bp);
-        if (!status && !(flags & XBF_ASYNC))
+        if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
-                status = xfs_buf_iowait(bp);
+                return status;
-        return status;
+        return xfs_buf_iowait(bp);
 }
 xfs_buf_t *
@@ -897,36 +896,6 @@ xfs_buf_unlock(
        trace_xfs_buf_unlock(bp, _RET_IP_);
 }
-/*
- *      Pinning Buffer Storage in Memory
- *      Ensure that no attempt to force a buffer to disk will succeed.
- */
-void
-xfs_buf_pin(
-        xfs_buf_t               *bp)
-{
-        trace_xfs_buf_pin(bp, _RET_IP_);
-        atomic_inc(&bp->b_pin_count);
-}
-void
-xfs_buf_unpin(
-        xfs_buf_t               *bp)
-{
-        trace_xfs_buf_unpin(bp, _RET_IP_);
-        if (atomic_dec_and_test(&bp->b_pin_count))
-                wake_up_all(&bp->b_waiters);
-}
-int
-xfs_buf_ispin(
-        xfs_buf_t               *bp)
-{
-        return atomic_read(&bp->b_pin_count);
-}
 STATIC void
 xfs_buf_wait_unpin(
        xfs_buf_t               *bp)
@@ -1018,13 +987,12 @@ xfs_bwrite(
 {
        int                     error;
-        bp->b_strat = xfs_bdstrat_cb;
        bp->b_mount = mp;
        bp->b_flags |= XBF_WRITE;
        bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
        xfs_buf_delwri_dequeue(bp);
-        xfs_buf_iostrategy(bp);
+        xfs_bdstrat_cb(bp);
        error = xfs_buf_iowait(bp);
        if (error)
@@ -1040,7 +1008,6 @@ xfs_bdwrite(
 {
        trace_xfs_buf_bdwrite(bp, _RET_IP_);
-        bp->b_strat = xfs_bdstrat_cb;
        bp->b_mount = mp;
        bp->b_flags &= ~XBF_READ;
@@ -1075,7 +1042,6 @@ xfs_bioerror(
        XFS_BUF_UNDONE(bp);
        XFS_BUF_STALE(bp);
-        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
        xfs_biodone(bp);
        return EIO;
@@ -1105,7 +1071,6 @@ xfs_bioerror_relse(
        XFS_BUF_DONE(bp);
        XFS_BUF_STALE(bp);
        XFS_BUF_CLR_IODONE_FUNC(bp);
-        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
        if (!(fl & XBF_ASYNC)) {
                /*
                 * Mark b_error and B_ERROR _both_.
@@ -1311,8 +1276,19 @@ submit_io:
                if (size)
                        goto next_chunk;
        } else {
-                bio_put(bio);
+                /*
+                 * if we get here, no pages were added to the bio. However,
+                 * we can't just error out here - if the pages are locked then
+                 * we have to unlock them otherwise we can hang on a later
+                 * access to the page.
+                 */
                xfs_buf_ioerror(bp, EIO);
+                if (bp->b_flags & _XBF_PAGE_LOCKED) {
+                        int i;
+                        for (i = 0; i < bp->b_page_count; i++)
+                                unlock_page(bp->b_pages[i]);
+                }
+                bio_put(bio);
        }
 }
@@ -1762,6 +1738,7 @@ xfs_buf_runall_queues(
 STATIC int
 xfsbufd_wakeup(
+        struct shrinker         *shrink,
        int                     priority,
        gfp_t                   mask)
 {
@@ -1803,7 +1780,7 @@ xfs_buf_delwri_split(
                trace_xfs_buf_delwri_split(bp, _RET_IP_);
                ASSERT(bp->b_flags & XBF_DELWRI);
-                if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
+                if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
                        if (!force &&
                            time_before(jiffies, bp->b_queuetime + age)) {
                                xfs_buf_unlock(bp);
@@ -1888,7 +1865,7 @@ xfsbufd(
                        struct xfs_buf *bp;
                        bp = list_first_entry(&tmp, struct xfs_buf, b_list);
                        list_del_init(&bp->b_list);
-                        xfs_buf_iostrategy(bp);
+                        xfs_bdstrat_cb(bp);
                        count++;
                }
                if (count)
@@ -1935,7 +1912,7 @@ xfs_flush_buftarg(
                        bp->b_flags &= ~XBF_ASYNC;
                        list_add(&bp->b_list, &wait_list);
                }
-                xfs_buf_iostrategy(bp);
+                xfs_bdstrat_cb(bp);
        }
        if (wait) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 5fbecefa5dfd..d072e5ff923b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,57 @@ typedef enum {
        XBRW_ZERO = 3,                  /* Zero target memory */
 } xfs_buf_rw_t;
-typedef enum {
+#define XBF_READ        (1 << 0) /* buffer intended for reading from device */
-        XBF_READ = (1 << 0),    /* buffer intended for reading from device */
+#define XBF_WRITE       (1 << 1) /* buffer intended for writing to device */
-        XBF_WRITE = (1 << 1),   /* buffer intended for writing to device   */
+#define XBF_MAPPED      (1 << 2) /* buffer mapped (b_addr valid) */
-        XBF_MAPPED = (1 << 2),  /* buffer mapped (b_addr valid)            */
+#define XBF_ASYNC       (1 << 4) /* initiator will not wait for completion */
-        XBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
+#define XBF_DONE        (1 << 5) /* all pages in the buffer uptodate */
-        XBF_DONE = (1 << 5),    /* all pages in the buffer uptodate        */
+#define XBF_DELWRI      (1 << 6) /* buffer has dirty pages */
-        XBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
+#define XBF_STALE       (1 << 7) /* buffer has been staled, do not find it */
-        XBF_STALE = (1 << 7),   /* buffer has been staled, do not find it  */
+#define XBF_FS_MANAGED  (1 << 8) /* filesystem controls freeing memory */
-        XBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
+#define XBF_ORDERED     (1 << 11)/* use ordered writes */
-        XBF_ORDERED = (1 << 11),    /* use ordered writes                  */
+#define XBF_READ_AHEAD  (1 << 12)/* asynchronous read-ahead */
-        XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead             */
+#define XBF_LOG_BUFFER  (1 << 13)/* this is a buffer used for the log */
-        XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log   */
+/* flags used only as arguments to access routines */
-        /* flags used only as arguments to access routines */
+#define XBF_LOCK        (1 << 14)/* lock requested */
-        XBF_LOCK = (1 << 14),       /* lock requested                      */
+#define XBF_TRYLOCK     (1 << 15)/* lock requested, but do not wait */
-        XBF_TRYLOCK = (1 << 15),    /* lock requested, but do not wait     */
+#define XBF_DONT_BLOCK  (1 << 16)/* do not block in current thread */
-        XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread      */
+/* flags used only internally */
-        /* flags used only internally */
+#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
-        _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache                 */
+#define _XBF_PAGES      (1 << 18)/* backed by refcounted pages */
-        _XBF_PAGES = (1 << 18),     /* backed by refcounted pages          */
+#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
-        _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue         */
+#define _XBF_DELWRI_Q   (1 << 21)/* buffer on delwri queue */
-        _XBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue             */
+/*
-        /*
+ * Special flag for supporting metadata blocks smaller than a FSB.
-         * Special flag for supporting metadata blocks smaller than a FSB.
+ *
-         *
+ * In this case we can have multiple xfs_buf_t on a single page and
-         * In this case we can have multiple xfs_buf_t on a single page and
+ * need to lock out concurrent xfs_buf_t readers as they only
-         * need to lock out concurrent xfs_buf_t readers as they only
+ * serialise access to the buffer.
-         * serialise access to the buffer.
+ *
-         *
+ * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
-         * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+ * between reads of the page. Hence we can have one thread read the
-         * between reads of the page. Hence we can have one thread read the
+ * page and modify it, but then race with another thread that thinks
-         * page and modify it, but then race with another thread that thinks
+ * the page is not up-to-date and hence reads it again.
-         * the page is not up-to-date and hence reads it again.
+ *
-         *
+ * The result is that the first modification to the page is lost.
-         * The result is that the first modifcation to the page is lost.
+ * This sort of AGF/AGI reading race can happen when unlinking inodes
-         * This sort of AGF/AGI reading race can happen when unlinking inodes
+ * that require truncation and results in the AGI unlinked list
-         * that require truncation and results in the AGI unlinked list
+ * modifications being lost.
-         * modifications being lost.
+ */
-         */
+#define _XBF_PAGE_LOCKED        (1 << 22)
-        _XBF_PAGE_LOCKED = (1 << 22),
+/*
-        /*
+ * If we try a barrier write, but it fails we have to communicate
-         * If we try a barrier write, but it fails we have to communicate
+ * this to the upper layers.  Unfortunately b_error gets overwritten
-         * this to the upper layers.  Unfortunately b_error gets overwritten
+ * when the buffer is re-issued so we have to add another flag to
-         * when the buffer is re-issued so we have to add another flag to
+ * keep this information.
-         * keep this information.
+ */
-         */
+#define _XFS_BARRIER_FAILED     (1 << 23)
-        _XFS_BARRIER_FAILED = (1 << 23),
-} xfs_buf_flags_t;
+typedef unsigned int xfs_buf_flags_t;
 #define XFS_BUF_FLAGS \
        { XBF_READ,             "READ" }, \
@@ -187,7 +187,6 @@ typedef struct xfs_buf {
        atomic_t                b_io_remaining; /* #outstanding I/O requests */
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
        xfs_buf_relse_t         b_relse;        /* releasing function */
-        xfs_buf_bdstrat_t       b_strat;        /* pre-write function */
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
@@ -245,11 +244,6 @@ extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
                                xfs_buf_rw_t);
-static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-{
-        return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
-}
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
        return bp ? bp->b_error : ENOMEM;
@@ -258,11 +252,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
 /* Buffer Utility Routines */
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
-/* Pinning Buffer Storage in Memory */
-extern void xfs_buf_pin(xfs_buf_t *);
-extern void xfs_buf_unpin(xfs_buf_t *);
-extern int xfs_buf_ispin(xfs_buf_t *);
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
 extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -326,8 +315,6 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_IODONE_FUNC(bp)                 ((bp)->b_iodone)
 #define XFS_BUF_SET_IODONE_FUNC(bp, func)       ((bp)->b_iodone = (func))
 #define XFS_BUF_CLR_IODONE_FUNC(bp)             ((bp)->b_iodone = NULL)
-#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func)      ((bp)->b_strat = (func))
-#define XFS_BUF_CLR_BDSTRAT_FUNC(bp)            ((bp)->b_strat = NULL)
 #define XFS_BUF_FSPRIVATE(bp, type)             ((type)(bp)->b_fspriv)
 #define XFS_BUF_SET_FSPRIVATE(bp, val)          ((bp)->b_fspriv = (void*)(val))
@@ -351,7 +338,7 @@ extern void xfs_buf_terminate(void);
 #define XFS_BUF_SET_VTYPE(bp, type)             do { } while (0)
 #define XFS_BUF_SET_REF(bp, ref)                do { } while (0)
-#define XFS_BUF_ISPINNED(bp)    xfs_buf_ispin(bp)
+#define XFS_BUF_ISPINNED(bp)    atomic_read(&((bp)->b_pin_count))
 #define XFS_BUF_VALUSEMA(bp)    xfs_buf_lock_value(bp)
 #define XFS_BUF_CPSEMA(bp)      (xfs_buf_cond_lock(bp) == 0)
@@ -370,8 +357,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
        xfs_buf_rele(bp);
 }
-#define xfs_bpin(bp)            xfs_buf_pin(bp)
-#define xfs_bunpin(bp)          xfs_buf_unpin(bp)
 #define xfs_biodone(bp)         xfs_buf_ioend(bp, 0)
 #define xfs_biomove(bp, off, len, data, rw) \
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685eed..000000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_PRIV_H__
-#define __XFS_DMAPI_PRIV_H__
-/*
- *      Based on IO_ISDIRECT, decide which i_ flag is set.
- */
-#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
-                              DM_FLAGS_IMUX : 0)
-#define DM_SEM_FLAG_WR  (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
-#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index e7839ee49e43..3764d74790ec 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
 #include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 /*
 * Note that we only accept fileids which are long enough rather than allow
@@ -132,8 +132,7 @@ xfs_nfs_get_inode(
         * fine and not an indication of a corrupted filesystem as clients can
         * send invalid file handles and we have to handle it gracefully..
         */
-        error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED,
+        error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
-                         XFS_ILOCK_SHARED, &ip);
        if (error) {
                /*
                 * EINVAL means the inode cluster doesn't exist anymore.
@@ -148,11 +147,10 @@ xfs_nfs_get_inode(
        }
        if (ip->i_d.di_gen != generation) {
-                xfs_iput_new(ip, XFS_ILOCK_SHARED);
+                IRELE(ip);
                return ERR_PTR(-ENOENT);
        }
-        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        return VFS_I(ip);
 }
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 257a56b127cf..ba8ad422a165 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_trans.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
 #include "xfs_ioctl.h"
@@ -108,7 +100,7 @@ xfs_file_fsync(
        int                     error = 0;
        int                     log_flushed = 0;
-        xfs_itrace_entry(ip);
+        trace_xfs_file_fsync(ip);
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -XFS_ERROR(EIO);
@@ -166,8 +158,7 @@ xfs_file_fsync(
                 * transaction.  So we play it safe and fire off the
                 * transaction anyway.
                 */
-                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                xfs_trans_set_sync(tp);
                error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -275,20 +266,6 @@ xfs_file_aio_read(
                mutex_lock(&inode->i_mutex);
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
-        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-                int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-                int iolock = XFS_IOLOCK_SHARED;
-                ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
-                                        dmflags, &iolock);
-                if (ret) {
-                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-                        if (unlikely(ioflags & IO_ISDIRECT))
-                                mutex_unlock(&inode->i_mutex);
-                        return ret;
-                }
-        }
        if (unlikely(ioflags & IO_ISDIRECT)) {
                if (inode->i_mapping->nrpages) {
                        ret = -xfs_flushinval_pages(ip,
@@ -321,7 +298,6 @@ xfs_file_splice_read(
        unsigned int            flags)
 {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-        struct xfs_mount        *mp = ip->i_mount;
        int                     ioflags = 0;
        ssize_t                 ret;
@@ -335,18 +311,6 @@ xfs_file_splice_read(
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
-        if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
-                int iolock = XFS_IOLOCK_SHARED;
-                int error;
-                error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
-                                        FILP_DELAY_FLAG(infilp), &iolock);
-                if (error) {
-                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-                        return -error;
-                }
-        }
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
        ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -367,7 +331,6 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        struct xfs_mount        *mp = ip->i_mount;
        xfs_fsize_t             isize, new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
@@ -382,18 +345,6 @@ xfs_file_splice_write(
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-        if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
-                int iolock = XFS_IOLOCK_EXCL;
-                int error;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
-                                        FILP_DELAY_FLAG(outfilp), &iolock);
-                if (error) {
-                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                        return -error;
-                }
-        }
        new_size = *ppos + count;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -463,7 +414,7 @@ xfs_zero_last_block(
        last_fsb = XFS_B_TO_FSBT(mp, isize);
        nimaps = 1;
        error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-                          &nimaps, NULL, NULL);
+                          &nimaps, NULL);
        if (error) {
                return error;
        }
@@ -558,7 +509,7 @@ xfs_zero_eof(
                nimaps = 1;
                zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
                error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-                                  0, NULL, 0, &imap, &nimaps, NULL, NULL);
+                                  0, NULL, 0, &imap, &nimaps, NULL);
                if (error) {
                        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
                        return error;
@@ -627,7 +578,6 @@ xfs_file_aio_write(
        int                     ioflags = 0;
        xfs_fsize_t             isize, new_size;
        int                     iolock;
-        int                     eventsent = 0;
        size_t                  ocount = 0, count;
        int                     need_i_mutex;
@@ -673,33 +623,6 @@ start:
                goto out_unlock_mutex;
        }
-        if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
-            !(ioflags & IO_INVIS) && !eventsent)) {
-                int             dmflags = FILP_DELAY_FLAG(file);
-                if (need_i_mutex)
-                        dmflags |= DM_FLAGS_IMUX;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
-                                      pos, count, dmflags, &iolock);
-                if (error) {
-                        goto out_unlock_internal;
-                }
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                eventsent = 1;
-                /*
-                 * The iolock was dropped and reacquired in XFS_SEND_DATA
-                 * so we have to recheck the size when appending.
-                 * We will only "goto start;" once, since having sent the
-                 * event prevents another call to XFS_SEND_DATA, which is
-                 * what allows the size to change in the first place.
-                 */
-                if ((file->f_flags & O_APPEND) && pos != ip->i_size)
-                        goto start;
-        }
        if (ioflags & IO_ISDIRECT) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
@@ -830,22 +753,6 @@ write_retry:
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
-        if (ret == -ENOSPC &&
-            DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
-                xfs_iunlock(ip, iolock);
-                if (need_i_mutex)
-                        mutex_unlock(&inode->i_mutex);
-                error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
-                                DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
-                                0, 0, 0); /* Delay flag intentionally  unused */
-                if (need_i_mutex)
-                        mutex_lock(&inode->i_mutex);
-                xfs_ilock(ip, iolock);
-                if (error)
-                        goto out_unlock_internal;
-                goto start;
-        }
        error = -ret;
        if (ret <= 0)
                goto out_unlock_internal;
@@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = {
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
-        .open_exec      = xfs_file_open_exec,
-#endif
 };
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7b..1f279b012f94 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
 #include "xfs_inode.h"
 #include "xfs_trace.h"
-int  fs_noerr(void) { return 0; }
-int  fs_nosys(void) { return ENOSYS; }
-void fs_noval(void) { return; }
 /*
 * note: all filemap functions return negative error codes. These
 * need to be inverted before returning to the xfs core functions.
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599e..000000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-extern int  fs_noerr(void);
-extern int  fs_nosys(void);
-extern void fs_noval(void);
-#endif  /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e59a81062830..237f5ffb2ee8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ioctl.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
@@ -908,7 +899,7 @@ xfs_ioctl_setattr(
        struct xfs_dquot        *olddquot = NULL;
        int                     code;
-        xfs_itrace_entry(ip);
+        trace_xfs_ioctl_setattr(ip);
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return XFS_ERROR(EROFS);
@@ -1043,8 +1034,7 @@ xfs_ioctl_setattr(
                }
        }
-        xfs_trans_ijoin(tp, ip, lock_flags);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        /*
         * Change file ownership.  Must be the owner or privileged.
@@ -1116,16 +1106,7 @@ xfs_ioctl_setattr(
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
-        if (code)
+        return code;
-                return code;
-        if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
-                XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-                                NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
-                                (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
-        }
-        return 0;
 error_return:
        xfs_qm_dqrele(udqp);
@@ -1301,7 +1282,7 @@ xfs_file_ioctl(
        if (filp->f_mode & FMODE_NOCMTIME)
                ioflags |= IO_INVIS;
-        xfs_itrace_entry(ip);
+        trace_xfs_file_ioctl(ip);
        switch (cmd) {
        case XFS_IOC_ALLOCSP:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 52ed49e6465c..6c83f7f62dc9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_vnode.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -544,7 +540,7 @@ xfs_file_compat_ioctl(
        if (filp->f_mode & FMODE_NOCMTIME)
                ioflags |= IO_INVIS;
-        xfs_itrace_entry(ip);
+        trace_xfs_file_compat_ioctl(ip);
        switch (cmd) {
        /* No size or alignment issues on any arch */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 44f0b2de153e..536b81e63a3d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
@@ -496,7 +488,7 @@ xfs_vn_getattr(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-        xfs_itrace_entry(ip);
+        trace_xfs_getattr(ip);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a706..998a9d7fb9c8 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -87,7 +87,6 @@
 #include <xfs_aops.h>
 #include <xfs_super.h>
 #include <xfs_globals.h>
-#include <xfs_fs_subr.h>
 #include <xfs_buf.h>
 /*
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 067cafbfc635..29b9d642e93d 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,7 +16,6 @@
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
 #include "xfs.h"
-#include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_log.h"
@@ -69,15 +68,15 @@ xfs_fs_set_xstate(
        if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
                return -ENOSYS;
-        if (uflags & XFS_QUOTA_UDQ_ACCT)
+        if (uflags & FS_QUOTA_UDQ_ACCT)
                flags |= XFS_UQUOTA_ACCT;
-        if (uflags & XFS_QUOTA_PDQ_ACCT)
+        if (uflags & FS_QUOTA_PDQ_ACCT)
                flags |= XFS_PQUOTA_ACCT;
-        if (uflags & XFS_QUOTA_GDQ_ACCT)
+        if (uflags & FS_QUOTA_GDQ_ACCT)
                flags |= XFS_GQUOTA_ACCT;
-        if (uflags & XFS_QUOTA_UDQ_ENFD)
+        if (uflags & FS_QUOTA_UDQ_ENFD)
                flags |= XFS_UQUOTA_ENFD;
-        if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+        if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
                flags |= XFS_OQUOTA_ENFD;
        switch (op) {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c9165..758df94690ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -43,7 +40,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_BARRIER  "barrier"       /* use writer barriers for log write and
                                         * unwritten extent conversion */
 #define MNTOPT_NOBARRIER "nobarrier"    /* .. disable */
-#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"   /* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP    "ikeep"         /* do not free empty inode clusters */
 #define MNTOPT_NOIKEEP  "noikeep"       /* free empty inode clusters */
@@ -116,9 +111,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
 #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
 #define MNTOPT_QUOTANOENF  "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DMAPI    "dmapi"         /* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_XDSM     "xdsm"          /* DMI enabled (DMAPI / XDSM) */
-#define MNTOPT_DMI      "dmi"           /* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DELAYLOG   "delaylog"    /* Delayed loging enabled */
 #define MNTOPT_NODELAYLOG "nodelaylog"  /* Delayed loging disabled */
@@ -172,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
 STATIC int
 xfs_parseargs(
        struct xfs_mount        *mp,
-        char                    *options,
+        char                    *options)
-        char                    **mtpt)
 {
        struct super_block      *sb = mp->m_super;
        char                    *this_char, *value, *eov;
        int                     dsunit = 0;
        int                     dswidth = 0;
        int                     iosize = 0;
-        int                     dmapi_implies_ikeep = 1;
        __uint8_t               iosizelog = 0;
        /*
@@ -243,15 +233,10 @@ xfs_parseargs(
                        if (!mp->m_logname)
                                return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_MTPT)) {
-                        if (!value || !*value) {
+                        cmn_err(CE_WARN,
-                                cmn_err(CE_WARN,
+                                "XFS: %s option not allowed on this system",
-                                        "XFS: %s option requires an argument",
+                                this_char);
-                                        this_char);
+                        return EINVAL;
-                                return EINVAL;
-                        }
-                        *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
-                        if (!*mtpt)
-                                return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -288,8 +273,6 @@ xfs_parseargs(
                        mp->m_flags &= ~XFS_MOUNT_GRPID;
                } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
                        mp->m_flags |= XFS_MOUNT_WSYNC;
-                } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
-                        mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
                } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
                        mp->m_flags |= XFS_MOUNT_NORECOVERY;
                } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -329,7 +312,6 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
                        mp->m_flags |= XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
-                        dmapi_implies_ikeep = 0;
                        mp->m_flags &= ~XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
                        mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -370,12 +352,6 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
                        mp->m_qflags &= ~XFS_OQUOTA_ENFD;
-                } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
-                        mp->m_flags |= XFS_MOUNT_DMAPI;
-                } else if (!strcmp(this_char, MNTOPT_XDSM)) {
-                        mp->m_flags |= XFS_MOUNT_DMAPI;
-                } else if (!strcmp(this_char, MNTOPT_DMI)) {
-                        mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
                        mp->m_flags |= XFS_MOUNT_DELAYLOG;
                        cmn_err(CE_WARN,
@@ -387,9 +363,11 @@ xfs_parseargs(
                        cmn_err(CE_WARN,
        "XFS: ihashsize no longer used, option is deprecated.");
                } else if (!strcmp(this_char, "osyncisdsync")) {
-                        /* no-op, this is now the default */
                        cmn_err(CE_WARN,
-        "XFS: osyncisdsync is now the default, option is deprecated.");
+        "XFS: osyncisdsync has no effect, option is deprecated.");
+                } else if (!strcmp(this_char, "osyncisosync")) {
+                        cmn_err(CE_WARN,
+        "XFS: osyncisosync has no effect, option is deprecated.");
                } else if (!strcmp(this_char, "irixsgid")) {
                        cmn_err(CE_WARN,
        "XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
@@ -430,12 +408,6 @@ xfs_parseargs(
                return EINVAL;
        }
-        if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
-                printk("XFS: %s option needs the mount point option as well\n",
-                        MNTOPT_DMAPI);
-                return EINVAL;
-        }
        if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
                cmn_err(CE_WARN,
                        "XFS: sunit and swidth must be specified together");
@@ -449,18 +421,6 @@ xfs_parseargs(
                return EINVAL;
        }
-        /*
-         * Applications using DMI filesystems often expect the
-         * inode generation number to be monotonically increasing.
-         * If we delete inode chunks we break this assumption, so
-         * keep unused inode chunks on disk for DMI filesystems
-         * until we come up with a better solution.
-         * Note that if "ikeep" or "noikeep" mount options are
-         * supplied, then they are honored.
-         */
-        if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
-                mp->m_flags |= XFS_MOUNT_IKEEP;
 done:
        if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
                /*
@@ -539,10 +499,8 @@ xfs_showargs(
                { XFS_MOUNT_SWALLOC,            "," MNTOPT_SWALLOC },
                { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
                { XFS_MOUNT_NORECOVERY,         "," MNTOPT_NORECOVERY },
-                { XFS_MOUNT_OSYNCISOSYNC,       "," MNTOPT_OSYNCISOSYNC },
                { XFS_MOUNT_ATTR2,              "," MNTOPT_ATTR2 },
                { XFS_MOUNT_FILESTREAMS,        "," MNTOPT_FILESTREAM },
-                { XFS_MOUNT_DMAPI,              "," MNTOPT_DMAPI },
                { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
                { XFS_MOUNT_DELAYLOG,           "," MNTOPT_DELAYLOG },
                { 0, NULL }
@@ -947,7 +905,7 @@ xfs_fs_destroy_inode(
 {
        struct xfs_inode        *ip = XFS_I(inode);
-        xfs_itrace_entry(ip);
+        trace_xfs_destroy_inode(ip);
        XFS_STATS_INC(vn_reclaim);
@@ -1063,10 +1021,8 @@ xfs_log_inode(
         * an inode in another recent transaction.  So we play it safe and
         * fire off the transaction anyway.
         */
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        xfs_trans_set_sync(tp);
        error = xfs_trans_commit(tp, 0);
        xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
@@ -1082,27 +1038,18 @@ xfs_fs_write_inode(
        struct xfs_mount        *mp = ip->i_mount;
        int                     error = EAGAIN;
-        xfs_itrace_entry(ip);
+        trace_xfs_write_inode(ip);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
        if (wbc->sync_mode == WB_SYNC_ALL) {
                /*
-                 * Make sure the inode has hit stable storage.  By using the
+                 * Make sure the inode has made it into the log.  Instead
-                 * log and the fsync transactions we reduce the IOs we have
+                 * of forcing it all the way to stable storage using a
-                 * to do here from two (log and inode) to just the log.
+                 * synchronous transaction we let the log force inside the
-                 *
+                 * ->sync_fs call do that for us, which reduces the number
-                 * Note: We still need to do a delwri write of the inode after
+                 * of synchronous log forces dramatically.
-                 * this to flush it to the backing buffer so that bulkstat
-                 * works properly if this is the first time the inode has been
-                 * written.  Because we hold the ilock atomically over the
-                 * transaction commit and the inode flush we are guaranteed
-                 * that the inode is not pinned when it returns. If the flush
-                 * lock is already held, then the inode has already been
-                 * flushed once and we don't need to flush it again.  Hence
-                 * the code will only flush the inode if it isn't already
-                 * being flushed.
                 */
                xfs_ioend_wait(ip);
                xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1116,27 +1063,29 @@ xfs_fs_write_inode(
                 * We make this non-blocking if the inode is contended, return
                 * EAGAIN to indicate to the caller that they did not succeed.
                 * This prevents the flush path from blocking on inodes inside
-                 * another operation right now, they get caught later by xfs_sync.
+                 * another operation right now, they get caught later by
+                 * xfs_sync.
                 */
                if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
                        goto out;
-        }
-        if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+                if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-                goto out_unlock;
+                        goto out_unlock;
-        /*
+                /*
-         * Now we have the flush lock and the inode is not pinned, we can check
+                 * Now we have the flush lock and the inode is not pinned, we
-         * if the inode is really clean as we know that there are no pending
+                 * can check if the inode is really clean as we know that
-         * transaction completions, it is not waiting on the delayed write
+                 * there are no pending transaction completions, it is not
-         * queue and there is no IO in progress.
+                 * waiting on the delayed write queue and there is no IO in
-         */
+                 * progress.
-        if (xfs_inode_clean(ip)) {
+                 */
-                xfs_ifunlock(ip);
+                if (xfs_inode_clean(ip)) {
-                error = 0;
+                        xfs_ifunlock(ip);
-                goto out_unlock;
+                        error = 0;
+                        goto out_unlock;
+                }
+                error = xfs_iflush(ip, 0);
        }
-        error = xfs_iflush(ip, 0);
 out_unlock:
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1156,7 +1105,8 @@ xfs_fs_clear_inode(
 {
        xfs_inode_t             *ip = XFS_I(inode);
-        xfs_itrace_entry(ip);
+        trace_xfs_clear_inode(ip);
        XFS_STATS_INC(vn_rele);
        XFS_STATS_INC(vn_remove);
        XFS_STATS_DEC(vn_active);
@@ -1193,22 +1143,13 @@ xfs_fs_put_super(
 {
        struct xfs_mount        *mp = XFS_M(sb);
+        /*
+         * Unregister the memory shrinker before we tear down the mount
+         * structure so we don't have memory reclaim racing with us here.
+         */
+        xfs_inode_shrinker_unregister(mp);
        xfs_syncd_stop(mp);
-        if (!(sb->s_flags & MS_RDONLY)) {
-                /*
-                 * XXX(hch): this should be SYNC_WAIT.
-                 *
-                 * Or more likely not needed at all because the VFS is already
-                 * calling ->sync_fs after shutting down all filestem
-                 * operations and just before calling ->put_super.
-                 */
-                xfs_sync_data(mp, 0);
-                xfs_sync_attr(mp, 0);
-        }
-        XFS_SEND_PREUNMOUNT(mp);
        /*
         * Blow away any referenced inode in the filestreams cache.
         * This can and will cause log traffic as inodes go inactive
@@ -1218,14 +1159,10 @@ xfs_fs_put_super(
        XFS_bflush(mp->m_ddev_targp);
-        XFS_SEND_UNMOUNT(mp);
        xfs_unmountfs(mp);
        xfs_freesb(mp);
-        xfs_inode_shrinker_unregister(mp);
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
-        xfs_dmops_put(mp);
        xfs_free_fsname(mp);
        kfree(mp);
 }
@@ -1543,7 +1480,6 @@ xfs_fs_fill_super(
        struct inode            *root;
        struct xfs_mount        *mp = NULL;
        int                     flags = 0, error = ENOMEM;
-        char                    *mtpt = NULL;
        mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
        if (!mp)
@@ -1559,7 +1495,7 @@ xfs_fs_fill_super(
        mp->m_super = sb;
        sb->s_fs_info = mp;
-        error = xfs_parseargs(mp, (char *)data, &mtpt);
+        error = xfs_parseargs(mp, (char *)data);
        if (error)
                goto out_free_fsname;
@@ -1571,16 +1507,12 @@ xfs_fs_fill_super(
 #endif
        sb->s_op = &xfs_super_operations;
-        error = xfs_dmops_get(mp);
-        if (error)
-                goto out_free_fsname;
        if (silent)
                flags |= XFS_MFSI_QUIET;
        error = xfs_open_devices(mp);
        if (error)
-                goto out_put_dmops;
+                goto out_free_fsname;
        if (xfs_icsb_init_counters(mp))
                mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1608,8 +1540,6 @@ xfs_fs_fill_super(
        if (error)
                goto out_filestream_unmount;
-        XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
        sb->s_magic = XFS_SB_MAGIC;
        sb->s_blocksize = mp->m_sb.sb_blocksize;
        sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1638,7 +1568,6 @@ xfs_fs_fill_super(
        xfs_inode_shrinker_register(mp);
-        kfree(mtpt);
        return 0;
 out_filestream_unmount:
@@ -1648,11 +1577,8 @@ xfs_fs_fill_super(
 out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
- out_put_dmops:
-        xfs_dmops_put(mp);
 out_free_fsname:
        xfs_free_fsname(mp);
-        kfree(mtpt);
        kfree(mp);
 out:
        return -error;
@@ -1759,6 +1685,12 @@ xfs_init_zones(void)
        if (!xfs_trans_zone)
                goto out_destroy_ifork_zone;
+        xfs_log_item_desc_zone =
+                kmem_zone_init(sizeof(struct xfs_log_item_desc),
+                               "xfs_log_item_desc");
+        if (!xfs_log_item_desc_zone)
+                goto out_destroy_trans_zone;
        /*
         * The size of the zone allocated buf log item is the maximum
         * size possible under XFS.  This wastes a little bit of memory,
@@ -1768,7 +1700,7 @@ xfs_init_zones(void)
                                (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
                                  NBWORD) * sizeof(int))), "xfs_buf_item");
        if (!xfs_buf_item_zone)
-                goto out_destroy_trans_zone;
+                goto out_destroy_log_item_desc_zone;
        xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
                        ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1805,6 +1737,8 @@ xfs_init_zones(void)
        kmem_zone_destroy(xfs_efd_zone);
 out_destroy_buf_item_zone:
        kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+        kmem_zone_destroy(xfs_log_item_desc_zone);
 out_destroy_trans_zone:
        kmem_zone_destroy(xfs_trans_zone);
 out_destroy_ifork_zone:
@@ -1835,6 +1769,7 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_efi_zone);
        kmem_zone_destroy(xfs_efd_zone);
        kmem_zone_destroy(xfs_buf_item_zone);
+        kmem_zone_destroy(xfs_log_item_desc_zone);
        kmem_zone_destroy(xfs_trans_zone);
        kmem_zone_destroy(xfs_ifork_zone);
        kmem_zone_destroy(xfs_dabuf_zone);
@@ -1883,7 +1818,6 @@ init_xfs_fs(void)
                goto out_cleanup_procfs;
        vfs_initquota();
-        xfs_inode_shrinker_init();
        error = register_filesystem(&xfs_fs_type);
        if (error)
@@ -1911,7 +1845,6 @@ exit_xfs_fs(void)
 {
        vfs_exitquota();
        unregister_filesystem(&xfs_fs_type);
-        xfs_inode_shrinker_destroy();
        xfs_sysctl_unregister();
        xfs_cleanup_procfs();
        xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 519618e9279e..1ef4a4d2d997 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
 # define XFS_BIGFS_STRING
 #endif
-#ifdef CONFIG_XFS_DMAPI
-# define XFS_DMAPI_STRING       "dmapi support, "
-#else
-# define XFS_DMAPI_STRING
-#endif
 #ifdef DEBUG
 # define XFS_DBG_STRING         "debug"
 #else
@@ -72,7 +66,6 @@ extern void xfs_qm_exit(void);
                                XFS_SECURITY_STRING \
                                XFS_REALTIME_STRING \
                                XFS_BIGFS_STRING \
-                                XFS_DMAPI_STRING \
                                XFS_DBG_STRING /* DBG must be last */
 struct xfs_inode;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ef7f0218bccb..dfcbd98d1599 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,25 +24,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_inode.h"
 #include "xfs_dinode.h"
 #include "xfs_error.h"
-#include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
-#include "xfs_utils.h"
-#include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
-#include "xfs_rw.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
@@ -144,6 +133,41 @@ restart:
        return last_error;
 }
+/*
+ * Select the next per-ag structure to iterate during the walk. The reclaim
+ * walk is optimised only to walk AGs with reclaimable inodes in them.
+ *
+ * @mp:    mount whose per-ag structures are being walked.
+ * @first: in/out cursor. On entry it is the lowest AG number to consider;
+ *         on a successful return it has been advanced past the AG that was
+ *         selected, so the function can be called repeatedly in a loop.
+ * @tag:   radix tree tag driving the walk.
+ *
+ * When @tag is XFS_ICI_RECLAIM_TAG the per-ag radix tree (mp->m_perag_tree)
+ * is searched, under mp->m_perag_lock, for a single entry carrying that tag
+ * at index *first or above. If none is found NULL is returned and *first is
+ * left untouched. Otherwise *first is set to one past the AG number of the
+ * entry found and the entry's pag_ref count is incremented before the lock
+ * is dropped.
+ *
+ * For any other tag the per-ag structure for AG *first is obtained with
+ * xfs_perag_get() and *first is incremented unconditionally. The value
+ * returned is whatever xfs_perag_get() returned.
+ * NOTE(review): presumably xfs_perag_get() returns NULL once *first runs
+ * past the last AG, which is what terminates the caller's loop - confirm.
+ *
+ * The caller in this file, xfs_inode_ag_iterator(), releases each returned
+ * structure with xfs_perag_put().
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          *first,
+        int                     tag)
+{
+        struct xfs_perag        *pag = NULL;
+        if (tag == XFS_ICI_RECLAIM_TAG) {
+                int found;
+                int ref;
+                spin_lock(&mp->m_perag_lock);
+                found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                (void **)&pag, *first, 1, tag);
+                if (found <= 0) {
+                        spin_unlock(&mp->m_perag_lock);
+                        return NULL;
+                }
+                *first = pag->pag_agno + 1;
+                /*
+                 * open coded pag reference increment
+                 *
+                 * The count is bumped while m_perag_lock is still held, and
+                 * the new value is kept only so it can be reported by the
+                 * trace point below.
+                 * NOTE(review): presumably this mirrors what xfs_perag_get()
+                 * does internally - confirm against its definition.
+                 */
+                ref = atomic_inc_return(&pag->pag_ref);
+                spin_unlock(&mp->m_perag_lock);
+                trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
+        } else {
+                pag = xfs_perag_get(mp, *first);
+                (*first)++;
+        }
+        return pag;
+}
 int
 xfs_inode_ag_iterator(
        struct xfs_mount        *mp,
@@ -154,16 +178,15 @@ xfs_inode_ag_iterator(
        int                     exclusive,
        int                     *nr_to_scan)
 {
+        struct xfs_perag        *pag;
        int                     error = 0;
        int                     last_error = 0;
        xfs_agnumber_t          ag;
        int                     nr;
        nr = nr_to_scan ? *nr_to_scan : INT_MAX;
-        for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+        ag = 0;
-                struct xfs_perag        *pag;
+        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
-                pag = xfs_perag_get(mp, ag);
                error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
                                                exclusive, &nr);
                xfs_perag_put(pag);
@@ -285,7 +308,7 @@ xfs_sync_inode_attr(
 /*
 * Write out pagecache data for the whole filesystem.
 */
-int
+STATIC int
 xfs_sync_data(
        struct xfs_mount        *mp,
        int                     flags)
@@ -306,7 +329,7 @@ xfs_sync_data(
 /*
 * Write out inode metadata (attributes) for the whole filesystem.
 */
-int
+STATIC int
 xfs_sync_attr(
        struct xfs_mount        *mp,
        int                     flags)
@@ -339,8 +362,7 @@ xfs_commit_dummy_trans(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        error = xfs_trans_commit(tp, 0);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -640,6 +662,17 @@ __xfs_inode_set_reclaim_tag(
        radix_tree_tag_set(&pag->pag_ici_root,
                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
                           XFS_ICI_RECLAIM_TAG);
+        if (!pag->pag_ici_reclaimable) {
+                /* propagate the reclaim tag up into the perag radix tree */
+                spin_lock(&ip->i_mount->m_perag_lock);
+                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                XFS_ICI_RECLAIM_TAG);
+                spin_unlock(&ip->i_mount->m_perag_lock);
+                trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+                                                        -1, _RET_IP_);
+        }
        pag->pag_ici_reclaimable++;
 }
@@ -674,6 +707,16 @@ __xfs_inode_clear_reclaim_tag(
        radix_tree_tag_clear(&pag->pag_ici_root,
                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        pag->pag_ici_reclaimable--;
+        if (!pag->pag_ici_reclaimable) {
+                /* clear the reclaim tag from the perag radix tree */
+                spin_lock(&ip->i_mount->m_perag_lock);
+                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                XFS_ICI_RECLAIM_TAG);
+                spin_unlock(&ip->i_mount->m_perag_lock);
+                trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+                                                        -1, _RET_IP_);
+        }
 }
 /*
@@ -812,7 +855,36 @@ out:
 reclaim:
        xfs_ifunlock(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        xfs_ireclaim(ip);
+        XFS_STATS_INC(xs_ig_reclaims);
+        /*
+         * Remove the inode from the per-AG radix tree.
+         *
+         * Because radix_tree_delete won't complain even if the item was never
+         * added to the tree, assert that it's been there before to catch
+         * problems with the inode lifetime early on.
+         */
+        write_lock(&pag->pag_ici_lock);
+        if (!radix_tree_delete(&pag->pag_ici_root,
+                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                ASSERT(0);
+        write_unlock(&pag->pag_ici_lock);
+        /*
+         * Here we do an (almost) spurious inode lock in order to coordinate
+         * with inode cache radix tree lookups.  This is because the lookup
+         * can reference the inodes in the cache without taking references.
+         *
+         * We make that OK here by ensuring that we wait until the inode is
+         * unlocked after the lookup before we go ahead and free it.  We get
+         * both the ilock and the iolock because the code may need to drop the
+         * ilock one but will still hold the iolock.
+         */
+        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+        xfs_qm_dqdetach(ip);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+        xfs_inode_free(ip);
        return error;
 }
@@ -828,83 +900,52 @@ xfs_reclaim_inodes(
 /*
 * Shrinker infrastructure.
- *
- * This is all far more complex than it needs to be. It adds a global list of
- * mounts because the shrinkers can only call a global context. We need to make
- * the shrinkers pass a context to avoid the need for global state.
 */
-static LIST_HEAD(xfs_mount_list);
-static struct rw_semaphore xfs_mount_list_lock;
 static int
 xfs_reclaim_inode_shrink(
+        struct shrinker *shrink,
        int             nr_to_scan,
        gfp_t           gfp_mask)
 {
        struct xfs_mount *mp;
        struct xfs_perag *pag;
        xfs_agnumber_t  ag;
-        int             reclaimable = 0;
+        int             reclaimable;
+        mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
        if (nr_to_scan) {
                if (!(gfp_mask & __GFP_FS))
                        return -1;
-                down_read(&xfs_mount_list_lock);
+                xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
-                list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
-                        xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
                                        XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
-                        if (nr_to_scan <= 0)
+                /* if we don't exhaust the scan, don't bother coming back */
-                                break;
+                if (nr_to_scan > 0)
-                }
+                        return -1;
-                up_read(&xfs_mount_list_lock);
+       }
-        }
-        down_read(&xfs_mount_list_lock);
+        reclaimable = 0;
-        list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+        ag = 0;
-                for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+        while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
-                        pag = xfs_perag_get(mp, ag);
+                                        XFS_ICI_RECLAIM_TAG))) {
-                        reclaimable += pag->pag_ici_reclaimable;
+                reclaimable += pag->pag_ici_reclaimable;
-                        xfs_perag_put(pag);
+                xfs_perag_put(pag);
-                }
        }
-        up_read(&xfs_mount_list_lock);
        return reclaimable;
 }
-static struct shrinker xfs_inode_shrinker = {
-        .shrink = xfs_reclaim_inode_shrink,
-        .seeks = DEFAULT_SEEKS,
-};
-void __init
-xfs_inode_shrinker_init(void)
-{
-        init_rwsem(&xfs_mount_list_lock);
-        register_shrinker(&xfs_inode_shrinker);
-}
-void
-xfs_inode_shrinker_destroy(void)
-{
-        ASSERT(list_empty(&xfs_mount_list));
-        unregister_shrinker(&xfs_inode_shrinker);
-}
 void
 xfs_inode_shrinker_register(
        struct xfs_mount        *mp)
 {
-        down_write(&xfs_mount_list_lock);
+        mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
-        list_add_tail(&mp->m_mplist, &xfs_mount_list);
+        mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
-        up_write(&xfs_mount_list_lock);
+        register_shrinker(&mp->m_inode_shrink);
 }
 void
 xfs_inode_shrinker_unregister(
        struct xfs_mount        *mp)
 {
-        down_write(&xfs_mount_list_lock);
+        unregister_shrinker(&mp->m_inode_shrink);
-        list_del(&mp->m_mplist);
-        up_write(&xfs_mount_list_lock);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..fe78726196f8 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
-int xfs_sync_attr(struct xfs_mount *mp, int flags);
-int xfs_sync_data(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
@@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
        int flags, int tag, int write_lock, int *nr_to_scan);
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index d12be8470cba..88d25d4aa56e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,17 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa117384..c657cdca2cd2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name,	\
                 unsigned long caller_ip),                                      \
        TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 TRACE_EVENT(xfs_attr_list_node_descend,
        TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -314,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
 DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_pin);
-DEFINE_BUF_EVENT(xfs_buf_unpin);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -538,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
 DEFINE_LOCK_EVENT(xfs_ilock_demote);
 DEFINE_LOCK_EVENT(xfs_iunlock);
-DECLARE_EVENT_CLASS(xfs_iget_class,
+DECLARE_EVENT_CLASS(xfs_inode_class,
        TP_PROTO(struct xfs_inode *ip),
        TP_ARGS(ip),
        TP_STRUCT__entry(
@@ -554,16 +555,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
                  __entry->ino)
 )
-#define DEFINE_IGET_EVENT(name) \
+#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_iget_class, name, \
+DEFINE_EVENT(xfs_inode_class, name, \
        TP_PROTO(struct xfs_inode *ip), \
        TP_ARGS(ip))
-DEFINE_IGET_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_skip);
-DEFINE_IGET_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
-DEFINE_IGET_EVENT(xfs_iget_found);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
-DEFINE_IGET_EVENT(xfs_iget_alloc);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
-DECLARE_EVENT_CLASS(xfs_inode_class,
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_clear_inode);
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+DECLARE_EVENT_CLASS(xfs_iref_class,
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
        TP_ARGS(ip, caller_ip),
        TP_STRUCT__entry(
@@ -588,20 +611,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
                  (char *)__entry->caller_ip)
 )
-#define DEFINE_INODE_EVENT(name) \
+#define DEFINE_IREF_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
+DEFINE_EVENT(xfs_iref_class, name, \
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
        TP_ARGS(ip, caller_ip))
-DEFINE_INODE_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_ihold);
-DEFINE_INODE_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_irele);
-DEFINE_INODE_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_pin);
-DEFINE_INODE_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
-DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+        TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+        TP_ARGS(dp, name),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(xfs_ino_t, dp_ino)
+                __dynamic_array(char, name, name->len)
+        ),
+        TP_fast_assign(
+                __entry->dev = VFS_I(dp)->i_sb->s_dev;
+                __entry->dp_ino = dp->i_ino;
+                memcpy(__get_str(name), name->name, name->len);
+        ),
+        TP_printk("dev %d:%d dp ino 0x%llx name %s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->dp_ino,
+                  __get_str(name))
+)
-/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
+#define DEFINE_NAMESPACE_EVENT(name) \
-DEFINE_INODE_EVENT(xfs_inode);
+DEFINE_EVENT(xfs_namespace_class, name, \
-#define xfs_itrace_entry(ip)    \
+        TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
-        trace_xfs_inode(ip, _THIS_IP_)
+        TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+TRACE_EVENT(xfs_rename,
+        TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+                 struct xfs_name *src_name, struct xfs_name *target_name),
+        TP_ARGS(src_dp, target_dp, src_name, target_name),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(xfs_ino_t, src_dp_ino)
+                __field(xfs_ino_t, target_dp_ino)
+                __dynamic_array(char, src_name, src_name->len)
+                __dynamic_array(char, target_name, target_name->len)
+        ),
+        TP_fast_assign(
+                __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+                __entry->src_dp_ino = src_dp->i_ino;
+                __entry->target_dp_ino = target_dp->i_ino;
+                memcpy(__get_str(src_name), src_name->name, src_name->len);
+                memcpy(__get_str(target_name), target_name->name, target_name->len);
+        ),
+        TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+                  " src name %s target name %s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->src_dp_ino,
+                  __entry->target_dp_ino,
+                  __get_str(src_name),
+                  __get_str(target_name))
+)
 DECLARE_EVENT_CLASS(xfs_dquot_class,
        TP_PROTO(struct xfs_dquot *dqp),
@@ -681,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
 DEFINE_DQUOT_EVENT(xfs_dqflush_force);
 DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-/* not really iget events, but we re-use the format */
-DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
-DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
 DECLARE_EVENT_CLASS(xfs_loggrant_class,
        TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -831,33 +902,29 @@ DECLARE_EVENT_CLASS(xfs_page_class,
                __field(loff_t, size)
                __field(unsigned long, offset)
                __field(int, delalloc)
-                __field(int, unmapped)
                __field(int, unwritten)
        ),
        TP_fast_assign(
-                int delalloc = -1, unmapped = -1, unwritten = -1;
+                int delalloc = -1, unwritten = -1;
                if (page_has_buffers(page))
-                        xfs_count_page_state(page, &delalloc,
+                        xfs_count_page_state(page, &delalloc, &unwritten);
-                                             &unmapped, &unwritten);
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = XFS_I(inode)->i_ino;
                __entry->pgoff = page_offset(page);
                __entry->size = i_size_read(inode);
                __entry->offset = off;
                __entry->delalloc = delalloc;
-                __entry->unmapped = unmapped;
                __entry->unwritten = unwritten;
        ),
        TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-                  "delalloc %d unmapped %d unwritten %d",
+                  "delalloc %d unwritten %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->pgoff,
                  __entry->size,
                  __entry->offset,
                  __entry->delalloc,
-                  __entry->unmapped,
                  __entry->unwritten)
 )
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 585e7633dfc7..e1a2f6800e01 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
   flush lock - ditto.
 */
-STATIC void             xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
 #ifdef DEBUG
 xfs_buftarg_t *xfs_dqerror_target;
 int xfs_do_dqerror;
@@ -390,21 +378,14 @@ xfs_qm_dqalloc(
                return (ESRCH);
        }
-        /*
+        xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
-         * xfs_trans_commit normally decrements the vnode ref count
-         * when it unlocks the inode. Since we want to keep the quota
-         * inode around, we bump the vnode ref count now.
-         */
-        IHOLD(quotip);
-        xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
        nmaps = 1;
        if ((error = xfs_bmapi(tp, quotip,
                              offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
                              XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
                              &firstblock,
                              XFS_QM_DQALLOC_SPACE_RES(mp),
-                              &map, &nmaps, &flist, NULL))) {
+                              &map, &nmaps, &flist))) {
                goto error0;
        }
        ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -520,7 +501,7 @@ xfs_qm_dqtobp(
                error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
                                  XFS_DQUOT_CLUSTER_SIZE_FSB,
                                  XFS_BMAPI_METADATA,
-                                  NULL, 0, &map, &nmaps, NULL, NULL);
+                                  NULL, 0, &map, &nmaps, NULL);
                xfs_iunlock(quotip, XFS_ILOCK_SHARED);
                if (error)
@@ -1141,6 +1122,46 @@ xfs_qm_dqrele(
        xfs_qm_dqput(dqp);
 }
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes.
+ */
+STATIC void
+xfs_qm_dqflush_done(
+        struct xfs_buf          *bp,
+        struct xfs_log_item     *lip)
+{
+        xfs_dq_logitem_t        *qip = (struct xfs_dq_logitem *)lip;
+        xfs_dquot_t             *dqp = qip->qli_dquot;
+        struct xfs_ail          *ailp = lip->li_ailp;
+        /*
+         * We only want to pull the item from the AIL if its
+         * location in the log has not changed since we started the flush.
+         * Thus, we only bother if the dquot's lsn has
+         * not changed. First we check the lsn outside the lock
+         * since it's cheaper, and then we recheck while
+         * holding the lock before removing the dquot from the AIL.
+         */
+        if ((lip->li_flags & XFS_LI_IN_AIL) &&
+            lip->li_lsn == qip->qli_flush_lsn) {
+                /* xfs_trans_ail_delete() drops the AIL lock. */
+                spin_lock(&ailp->xa_lock);
+                if (lip->li_lsn == qip->qli_flush_lsn)
+                        xfs_trans_ail_delete(ailp, lip);
+                else
+                        spin_unlock(&ailp->xa_lock);
+        }
+        /*
+         * Release the dq's flush lock since we're done with it.
+         */
+        xfs_dqfunlock(dqp);
+}
 /*
 * Write a modified dquot to disk.
@@ -1222,8 +1243,9 @@ xfs_qm_dqflush(
         * Attach an iodone routine so that we can remove this dquot from the
         * AIL and release the flush lock once the dquot is synced to disk.
         */
-        xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
+        xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
-                              xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
+                                  &dqp->q_logitem.qli_item);
        /*
         * If the buffer is pinned then push on the log so we won't
         * get stuck waiting in the write for too long.
@@ -1247,50 +1269,6 @@ xfs_qm_dqflush(
 }
-/*
- * This is the dquot flushing I/O completion routine.  It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk.  It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_dqflush_done(
-        xfs_buf_t               *bp,
-        xfs_dq_logitem_t        *qip)
-{
-        xfs_dquot_t             *dqp;
-        struct xfs_ail          *ailp;
-        dqp = qip->qli_dquot;
-        ailp = qip->qli_item.li_ailp;
-        /*
-         * We only want to pull the item from the AIL if its
-         * location in the log has not changed since we started the flush.
-         * Thus, we only bother if the dquot's lsn has
-         * not changed. First we check the lsn outside the lock
-         * since it's cheaper, and then we recheck while
-         * holding the lock before removing the dquot from the AIL.
-         */
-        if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-            qip->qli_item.li_lsn == qip->qli_flush_lsn) {
-                /* xfs_trans_ail_delete() drops the AIL lock. */
-                spin_lock(&ailp->xa_lock);
-                if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
-                        xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
-                else
-                        spin_unlock(&ailp->xa_lock);
-        }
-        /*
-         * Release the dq's flush lock since we're done with it.
-         */
-        xfs_dqfunlock(dqp);
-}
 int
 xfs_qm_dqlock_nowait(
        xfs_dquot_t *dqp)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 8d89a24ae324..2a1f3dc10a02 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+        return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
 /*
 * returns the number of iovecs needed to log the given dquot item.
 */
-/* ARGSUSED */
 STATIC uint
 xfs_qm_dquot_logitem_size(
-        xfs_dq_logitem_t        *logitem)
+        struct xfs_log_item     *lip)
 {
        /*
         * we need only two iovecs, one for the format, one for the real thing
         */
-        return (2);
+        return 2;
 }
 /*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
 */
 STATIC void
 xfs_qm_dquot_logitem_format(
-        xfs_dq_logitem_t        *logitem,
+        struct xfs_log_item     *lip,
-        xfs_log_iovec_t         *logvec)
+        struct xfs_log_iovec    *logvec)
 {
-        ASSERT(logitem);
+        struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
-        ASSERT(logitem->qli_dquot);
-        logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
+        logvec->i_addr = &qlip->qli_format;
        logvec->i_len  = sizeof(xfs_dq_logformat_t);
        logvec->i_type = XLOG_REG_TYPE_QFORMAT;
        logvec++;
-        logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
+        logvec->i_addr = &qlip->qli_dquot->q_core;
        logvec->i_len  = sizeof(xfs_disk_dquot_t);
        logvec->i_type = XLOG_REG_TYPE_DQUOT;
-        ASSERT(2 == logitem->qli_item.li_desc->lid_size);
+        ASSERT(2 == lip->li_desc->lid_size);
-        logitem->qli_format.qlf_size = 2;
+        qlip->qli_format.qlf_size = 2;
 }
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
 */
 STATIC void
 xfs_qm_dquot_logitem_pin(
-        xfs_dq_logitem_t *logitem)
+        struct xfs_log_item     *lip)
 {
-        xfs_dquot_t *dqp = logitem->qli_dquot;
+        struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        atomic_inc(&dqp->q_pincount);
@@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin(
 * dquot must have been previously pinned with a call to
 * xfs_qm_dquot_logitem_pin().
 */
-/* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_unpin(
-        xfs_dq_logitem_t *logitem)
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        xfs_dquot_t *dqp = logitem->qli_dquot;
+        struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
        ASSERT(atomic_read(&dqp->q_pincount) > 0);
        if (atomic_dec_and_test(&dqp->q_pincount))
                wake_up(&dqp->q_pinwait);
 }
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin_remove(
-        xfs_dq_logitem_t *logitem,
-        xfs_trans_t      *tp)
-{
-        xfs_qm_dquot_logitem_unpin(logitem);
-}
 /*
 * Given the logitem, this writes the corresponding dquot entry to disk
 * asynchronously. This is called with the dquot entry securely locked;
@@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
 */
 STATIC void
 xfs_qm_dquot_logitem_push(
-        xfs_dq_logitem_t        *logitem)
+        struct xfs_log_item     *lip)
 {
-        xfs_dquot_t     *dqp;
+        struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-        int             error;
+        int                     error;
-        dqp = logitem->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        ASSERT(!completion_done(&dqp->q_flush));
@@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push(
        xfs_dqunlock(dqp);
 }
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_qm_dquot_logitem_committed(
-        xfs_dq_logitem_t        *l,
+        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
        /*
         * We always re-log the entire dquot when it becomes dirty,
         * so, the latest copy _is_ the only one that matters.
         */
-        return (lsn);
+        return lsn;
 }
 /*
 * This is called to wait for the given dquot to be unpinned.
 * Most of these pin/unpin routines are plagiarized from inode code.
 */
 void
 xfs_qm_dqunpin_wait(
-        xfs_dquot_t     *dqp)
+        struct xfs_dquot        *dqp)
 {
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        if (atomic_read(&dqp->q_pincount) == 0)
@@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait(
 */
 STATIC void
 xfs_qm_dquot_logitem_pushbuf(
-        xfs_dq_logitem_t    *qip)
+        struct xfs_log_item     *lip)
 {
-        xfs_dquot_t     *dqp;
+        struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
-        xfs_mount_t     *mp;
+        struct xfs_dquot        *dqp = qlip->qli_dquot;
-        xfs_buf_t       *bp;
+        struct xfs_buf          *bp;
-        dqp = qip->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        /*
@@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
         * inode flush completed and the inode was taken off the AIL.
         * So, just get out.
         */
-        if (completion_done(&dqp->q_flush)  ||
+        if (completion_done(&dqp->q_flush) ||
-            ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+            !(lip->li_flags & XFS_LI_IN_AIL)) {
                xfs_dqunlock(dqp);
                return;
        }
-        mp = dqp->q_mount;
-        bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
+        bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
-                        mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+                        dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
        xfs_dqunlock(dqp);
        if (!bp)
                return;
        if (XFS_BUF_ISDELAYWRITE(bp))
                xfs_buf_delwri_promote(bp);
        xfs_buf_relse(bp);
-        return;
 }
 /*
@@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
 */
 STATIC uint
 xfs_qm_dquot_logitem_trylock(
-        xfs_dq_logitem_t        *qip)
+        struct xfs_log_item     *lip)
 {
-        xfs_dquot_t             *dqp;
+        struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-        dqp = qip->qli_dquot;
        if (atomic_read(&dqp->q_pincount) > 0)
                return XFS_ITEM_PINNED;
-        if (! xfs_qm_dqlock_nowait(dqp))
+        if (!xfs_qm_dqlock_nowait(dqp))
                return XFS_ITEM_LOCKED;
        if (!xfs_dqflock_nowait(dqp)) {
@@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
                return XFS_ITEM_PUSHBUF;
        }
-        ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
+        ASSERT(lip->li_flags & XFS_LI_IN_AIL);
        return XFS_ITEM_SUCCESS;
 }
 /*
 * Unlock the dquot associated with the log item.
 * Clear the fields of the dquot and dquot log item that
@@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
 */
 STATIC void
 xfs_qm_dquot_logitem_unlock(
-        xfs_dq_logitem_t    *ql)
+        struct xfs_log_item     *lip)
 {
-        xfs_dquot_t     *dqp;
+        struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-        ASSERT(ql != NULL);
-        dqp = ql->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
        /*
@@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock(
        xfs_dqunlock(dqp);
 }
 /*
 * this needs to stamp an lsn into the dquot, I think.
 * rpc's that look at user dquot's would then have to
 * push on the dependency recorded in the dquot
 */
-/* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_committing(
-        xfs_dq_logitem_t        *l,
+        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
-        return;
 }
 /*
 * This is the ops vector for dquots
 */
 static struct xfs_item_ops xfs_dquot_item_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
+        .iop_size       = xfs_qm_dquot_logitem_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_qm_dquot_logitem_format,
-                                        xfs_qm_dquot_logitem_format,
+        .iop_pin        = xfs_qm_dquot_logitem_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
+        .iop_unpin      = xfs_qm_dquot_logitem_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
+        .iop_trylock    = xfs_qm_dquot_logitem_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+        .iop_unlock     = xfs_qm_dquot_logitem_unlock,
-                                        xfs_qm_dquot_logitem_unpin_remove,
+        .iop_committed  = xfs_qm_dquot_logitem_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))
+        .iop_push       = xfs_qm_dquot_logitem_push,
-                                        xfs_qm_dquot_logitem_trylock,
+        .iop_pushbuf    = xfs_qm_dquot_logitem_pushbuf,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
+        .iop_committing = xfs_qm_dquot_logitem_committing
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_dquot_logitem_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
-        .iop_pushbuf    = (void(*)(xfs_log_item_t*))
-                                        xfs_qm_dquot_logitem_pushbuf,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_dquot_logitem_committing
 };
 /*
@@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
 */
 void
 xfs_qm_dquot_logitem_init(
-        struct xfs_dquot *dqp)
+        struct xfs_dquot        *dqp)
 {
-        xfs_dq_logitem_t  *lp;
+        struct xfs_dq_logitem   *lp = &dqp->q_logitem;
-        lp = &dqp->q_logitem;
        xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
                                        &xfs_dquot_item_ops);
@@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init(
 /*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+        return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
 /*
 * This returns the number of iovecs needed to log the given quotaoff item.
 * We only need 1 iovec for an quotaoff item.  It just logs the
 * quotaoff_log_format structure.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_size(
+        struct xfs_log_item     *lip)
 {
-        return (1);
+        return 1;
 }
 /*
@@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
 * slots in the quotaoff item have been filled.
 */
 STATIC void
-xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t   *qf,
+xfs_qm_qoff_logitem_format(
-                           xfs_log_iovec_t      *log_vector)
+        struct xfs_log_item     *lip,
+        struct xfs_log_iovec    *log_vector)
 {
-        ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
+        struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
+        ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-        log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
+        log_vector->i_addr = &qflip->qql_format;
        log_vector->i_len = sizeof(xfs_qoff_logitem_t);
        log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
-        qf->qql_format.qf_size = 1;
+        qflip->qql_format.qf_size = 1;
 }
 /*
 * Pinning has no meaning for an quotaoff item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_pin(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
 * Since pinning has no meaning for an quotaoff item, unpinning does
 * not either.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unpin(
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        return;
-}
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
-{
-        return;
 }
 /*
 * Quotaoff items have no locking, so just return success.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_trylock(
+        struct xfs_log_item     *lip)
 {
        return XFS_ITEM_LOCKED;
 }
@@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
 * Quotaoff items have no locking or pushing, so return failure
 * so that the caller doesn't bother with us.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_unlock(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
 * The quotaoff-start-item is logged only once and cannot be moved in the log,
 * so simply return the lsn at which it's been logged.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
+xfs_qm_qoff_logitem_committed(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn)
 {
-        return (lsn);
+        return lsn;
 }
 /*
 * There isn't much you can do to push on an quotaoff item.  It is simply
 * stuck waiting for the log to be flushed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
+xfs_qm_qoff_logitem_push(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_qm_qoffend_logitem_committed(
-        xfs_qoff_logitem_t *qfe,
+        struct xfs_log_item     *lip,
-        xfs_lsn_t lsn)
+        xfs_lsn_t               lsn)
 {
-        xfs_qoff_logitem_t      *qfs;
+        struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
-        struct xfs_ail          *ailp;
+        struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
+        struct xfs_ail          *ailp = qfs->qql_item.li_ailp;
-        qfs = qfe->qql_start_lip;
-        ailp = qfs->qql_item.li_ailp;
-        spin_lock(&ailp->xa_lock);
        /*
         * Delete the qoff-start logitem from the AIL.
         * xfs_trans_ail_delete() drops the AIL lock.
         */
+        spin_lock(&ailp->xa_lock);
        xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
        kmem_free(qfs);
        kmem_free(qfe);
        return (xfs_lsn_t)-1;
@@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed(
 * (truly makes the quotaoff irrevocable).  If we do something else,
 * then maybe we don't need two.
 */
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
-        return;
-}
-/* ARGSUSED */
 STATIC void
-xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+xfs_qm_qoff_logitem_committing(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               commit_lsn)
 {
-        return;
 }
 static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+        .iop_size       = xfs_qm_qoff_logitem_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_qm_qoff_logitem_format,
-                                        xfs_qm_qoff_logitem_format,
+        .iop_pin        = xfs_qm_qoff_logitem_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+        .iop_unpin      = xfs_qm_qoff_logitem_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+        .iop_trylock    = xfs_qm_qoff_logitem_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+        .iop_unlock     = xfs_qm_qoff_logitem_unlock,
-                                        xfs_qm_qoff_logitem_unpin_remove,
+        .iop_committed  = xfs_qm_qoffend_logitem_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+        .iop_push       = xfs_qm_qoff_logitem_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+        .iop_committing = xfs_qm_qoff_logitem_committing
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_qoffend_logitem_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-        .iop_pushbuf    = NULL,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_qoffend_logitem_committing
 };
 /*
 * This is the ops vector shared by all quotaoff-start log items.
 */
 static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+        .iop_size       = xfs_qm_qoff_logitem_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_qm_qoff_logitem_format,
-                                        xfs_qm_qoff_logitem_format,
+        .iop_pin        = xfs_qm_qoff_logitem_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+        .iop_unpin      = xfs_qm_qoff_logitem_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+        .iop_trylock    = xfs_qm_qoff_logitem_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+        .iop_unlock     = xfs_qm_qoff_logitem_unlock,
-                                        xfs_qm_qoff_logitem_unpin_remove,
+        .iop_committed  = xfs_qm_qoff_logitem_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+        .iop_push       = xfs_qm_qoff_logitem_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+        .iop_committing = xfs_qm_qoff_logitem_committing
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_qoff_logitem_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-        .iop_pushbuf    = NULL,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_qm_qoff_logitem_committing
 };
 /*
 * Allocate and initialize an quotaoff item of the correct quota type(s).
 */
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
 xfs_qm_qoff_logitem_init(
-        struct xfs_mount *mp,
+        struct xfs_mount        *mp,
-        xfs_qoff_logitem_t *start,
+        struct xfs_qoff_logitem *start,
-        uint flags)
+        uint                    flags)
 {
-        xfs_qoff_logitem_t      *qf;
+        struct xfs_qoff_logitem *qf;
-        qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
+        qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
        xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
                        &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
@@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init(
        qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
        qf->qql_format.qf_flags = flags;
        qf->qql_start_lip = start;
-        return (qf);
+        return qf;
 }
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8c117ff2e3ab..9a92407109a1 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -69,7 +62,7 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 STATIC int      xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int      xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int      xfs_qm_shake(int, gfp_t);
+STATIC int      xfs_qm_shake(struct shrinker *, int, gfp_t);
 static struct shrinker xfs_qm_shaker = {
        .shrink = xfs_qm_shake,
@@ -1497,7 +1490,7 @@ xfs_qm_dqiterate(
                                  maxlblkcnt - lblkno,
                                  XFS_BMAPI_METADATA,
                                  NULL,
-                                  0, map, &nmaps, NULL, NULL);
+                                  0, map, &nmaps, NULL);
                xfs_iunlock(qip, XFS_ILOCK_SHARED);
                if (error)
                        break;
@@ -1669,7 +1662,8 @@ xfs_qm_dqusage_adjust(
         * making us disable quotas for the file system.
         */
        if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
-                xfs_iput(ip, XFS_ILOCK_EXCL);
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                IRELE(ip);
                *res = BULKSTAT_RV_GIVEUP;
                return error;
        }
@@ -1682,7 +1676,8 @@ xfs_qm_dqusage_adjust(
                 * Walk thru the extent list and count the realtime blocks.
                 */
                if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
-                        xfs_iput(ip, XFS_ILOCK_EXCL);
+                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                        IRELE(ip);
                        if (udqp)
                                xfs_qm_dqput(udqp);
                        if (gdqp)
@@ -2117,7 +2112,10 @@ xfs_qm_shake_freelist(
 */
 /* ARGSUSED */
 STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+        struct shrinker *shrink,
+        int             nr_to_scan,
+        gfp_t           gfp_mask)
 {
        int     ndqused, nfree, n;
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c12794..bea02d786c5d 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 3d1fc79532e2..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index b4487764e923..45e5849df238 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -248,40 +238,74 @@ out_unlock:
        return error;
 }
+STATIC int
+xfs_qm_scall_trunc_qfile(
+        struct xfs_mount        *mp,
+        xfs_ino_t               ino)
+{
+        struct xfs_inode        *ip;
+        struct xfs_trans        *tp;
+        int                     error;
+        if (ino == NULLFSINO)
+                return 0;
+        error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+        if (error)
+                return error;
+        xfs_ilock(ip, XFS_IOLOCK_EXCL);
+        tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+        error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+                                  XFS_TRANS_PERM_LOG_RES,
+                                  XFS_ITRUNCATE_LOG_COUNT);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                goto out_put;
+        }
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
+        error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
+        if (error) {
+                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+                                     XFS_TRANS_ABORT);
+                goto out_unlock;
+        }
+        xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+out_unlock:
+        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+        IRELE(ip);
+        return error;
+}
 int
 xfs_qm_scall_trunc_qfiles(
        xfs_mount_t     *mp,
        uint            flags)
 {
        int             error = 0, error2 = 0;
-        xfs_inode_t     *qip;
        if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
                qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
                return XFS_ERROR(EINVAL);
        }
-        if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
+        if (flags & XFS_DQ_USER)
-                error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip);
+                error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-                if (!error) {
+        if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
-                        error = xfs_truncate_file(mp, qip);
+                error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
-                        IRELE(qip);
-                }
-        }
-        if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
-            mp->m_sb.sb_gquotino != NULLFSINO) {
-                error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip);
-                if (!error2) {
-                        error2 = xfs_truncate_file(mp, qip);
-                        IRELE(qip);
-                }
-        }
        return error ? error : error2;
 }
 /*
 * Switch on (a given) quota enforcement for a filesystem.  This takes
 * effect immediately.
@@ -786,9 +810,9 @@ xfs_qm_export_dquot(
        }
 #ifdef DEBUG
-        if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) ||
+        if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
             (XFS_IS_OQUOTA_ENFORCED(mp) &&
-                        (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) &&
+                        (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
            dst->d_id != 0) {
                if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
                    (dst->d_blk_softlimit > 0)) {
@@ -809,17 +833,17 @@ xfs_qm_export_qtype_flags(
        /*
         * Can't be more than one, or none.
         */
-        ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) !=
+        ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
-                (XFS_PROJ_QUOTA | XFS_USER_QUOTA));
+                (FS_PROJ_QUOTA | FS_USER_QUOTA));
-        ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) !=
+        ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
-                (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA));
+                (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
-        ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) !=
+        ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
-                (XFS_USER_QUOTA | XFS_GROUP_QUOTA));
+                (FS_USER_QUOTA | FS_GROUP_QUOTA));
-        ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0);
+        ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
        return (flags & XFS_DQ_USER) ?
-                XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
+                FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-                        XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
+                        FS_PROJ_QUOTA : FS_GROUP_QUOTA;
 }
 STATIC uint
@@ -830,16 +854,16 @@ xfs_qm_export_flags(
        uflags = 0;
        if (flags & XFS_UQUOTA_ACCT)
-                uflags |= XFS_QUOTA_UDQ_ACCT;
+                uflags |= FS_QUOTA_UDQ_ACCT;
        if (flags & XFS_PQUOTA_ACCT)
-                uflags |= XFS_QUOTA_PDQ_ACCT;
+                uflags |= FS_QUOTA_PDQ_ACCT;
        if (flags & XFS_GQUOTA_ACCT)
-                uflags |= XFS_QUOTA_GDQ_ACCT;
+                uflags |= FS_QUOTA_GDQ_ACCT;
        if (flags & XFS_UQUOTA_ENFD)
-                uflags |= XFS_QUOTA_UDQ_ENFD;
+                uflags |= FS_QUOTA_UDQ_ENFD;
        if (flags & (XFS_OQUOTA_ENFD)) {
                uflags |= (flags & XFS_GQUOTA_ACCT) ?
-                        XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD;
+                        FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
        }
        return (uflags);
 }
@@ -875,8 +899,9 @@ xfs_dqrele_inode(
                xfs_qm_dqrele(ip->i_gdquot);
                ip->i_gdquot = NULL;
        }
-        xfs_iput(ip, XFS_ILOCK_EXCL);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        IRELE(ip);
        return 0;
 }
@@ -1143,7 +1168,8 @@ xfs_qm_internalqcheck_adjust(
         * of those now.
         */
        if (! ipreleased) {
-                xfs_iput(ip, lock_flags);
+                xfs_iunlock(ip, lock_flags);
+                IRELE(ip);
                ipreleased = B_TRUE;
                goto again;
        }
@@ -1160,7 +1186,8 @@ xfs_qm_internalqcheck_adjust(
                ASSERT(gd);
                xfs_qm_internalqcheck_dqadjust(ip, gd);
        }
-        xfs_iput(ip, lock_flags);
+        xfs_iunlock(ip, lock_flags);
+        IRELE(ip);
        *res = BULKSTAT_RV_DIDONE;
        return (0);
 }
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 061d827da33c..7de91d1b75c0 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -59,16 +49,14 @@ xfs_trans_dqjoin(
        xfs_trans_t     *tp,
        xfs_dquot_t     *dqp)
 {
-        xfs_dq_logitem_t    *lp = &dqp->q_logitem;
        ASSERT(dqp->q_transp != tp);
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-        ASSERT(lp->qli_dquot == dqp);
+        ASSERT(dqp->q_logitem.qli_dquot == dqp);
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
+        xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
        /*
         * Initialize i_transp so we can later determine if this dquot is
@@ -93,16 +81,11 @@ xfs_trans_log_dquot(
        xfs_trans_t     *tp,
        xfs_dquot_t     *dqp)
 {
-        xfs_log_item_desc_t     *lidp;
        ASSERT(dqp->q_transp == tp);
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 }
 /*
@@ -874,9 +857,8 @@ xfs_trans_get_qoff_item(
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
+        xfs_trans_add_item(tp, &q->qql_item);
+        return q;
-        return (q);
 }
@@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item(
        xfs_trans_t             *tp,
        xfs_qoff_logitem_t      *qlp)
 {
-        xfs_log_item_desc_t     *lidp;
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 }
 STATIC void
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee05..975aa10e1a47 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
 #include "xfs_sb.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a7fbe8a99b12..af168faccc7a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -688,8 +683,6 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   ltbno;          /* start bno of left side entry */
        xfs_agblock_t   ltbnoa;         /* aligned ... */
        xfs_extlen_t    ltdiff;         /* difference to left side entry */
-        /*REFERENCED*/
-        xfs_agblock_t   ltend;          /* end bno of left side entry */
        xfs_extlen_t    ltlen;          /* length of left side entry */
        xfs_extlen_t    ltlena;         /* aligned ... */
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
@@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near(
                if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                ltend = ltbno + ltlen;
+                ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-                ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
                args->len = blen;
                if (!xfs_alloc_fix_minleft(args)) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near(
                 */
                args->agbno = bnew;
                ASSERT(bnew >= ltbno);
-                ASSERT(bnew + blen <= ltend);
+                ASSERT(bnew + blen <= ltbno + ltlen);
                /*
                 * Set up a cursor for the by-bno tree.
                 */
@@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
        /*
         * Fix up the length and compute the useful address.
         */
-        ltend = ltbno + ltlen;
        args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
        xfs_alloc_fix_len(args);
        if (!xfs_alloc_fix_minleft(args)) {
@@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
        (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
                ltlen, &ltnew);
        ASSERT(ltnew >= ltbno);
-        ASSERT(ltnew + rlen <= ltend);
+        ASSERT(ltnew + rlen <= ltbno + ltlen);
        ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
        args->agbno = ltnew;
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 6d05199b667c..895009a97271 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -27,16 +27,16 @@ struct xfs_busy_extent;
 /*
 * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
 */
-typedef enum xfs_alloctype
+#define XFS_ALLOCTYPE_ANY_AG    0x01    /* allocate anywhere, use rotor */
-{
+#define XFS_ALLOCTYPE_FIRST_AG  0x02    /* ... start at ag 0 */
-        XFS_ALLOCTYPE_ANY_AG,           /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_START_AG  0x04    /* anywhere, start in this a.g. */
-        XFS_ALLOCTYPE_FIRST_AG,         /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_THIS_AG   0x08    /* anywhere in this a.g. */
-        XFS_ALLOCTYPE_START_AG,         /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10    /* near this block else anywhere */
-        XFS_ALLOCTYPE_THIS_AG,          /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_NEAR_BNO  0x20    /* in this a.g. and near this block */
-        XFS_ALLOCTYPE_START_BNO,        /* near this block else anywhere */
+#define XFS_ALLOCTYPE_THIS_BNO  0x40    /* at exactly this block */
-        XFS_ALLOCTYPE_NEAR_BNO,         /* in this a.g. and near this block */
-        XFS_ALLOCTYPE_THIS_BNO          /* at exactly this block */
+/* this should become an enum again when the tracing code is fixed */
-} xfs_alloctype_t;
+typedef unsigned int xfs_alloctype_t;
 #define XFS_ALLOC_TYPES \
        { XFS_ALLOCTYPE_ANY_AG,         "ANY_AG" }, \
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 83f494218759..97f7328967fd 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c42..c2568242a901 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
                return (error);
        }
-        xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(args.trans, dp);
-        xfs_trans_ihold(args.trans, dp);
        /*
         * If the attribute list is non-existent or a shortform list,
@@ -396,10 +389,8 @@ xfs_attr_set_int(
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args.trans, dp);
-                        xfs_trans_ihold(args.trans, dp);
-                }
                /*
                 * Commit the leaf transformation.  We'll need another (linked)
@@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
         * No need to make quota reservations here. We expect to release some
         * blocks not allocate in the common case.
         */
-        xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(args.trans, dp);
-        xfs_trans_ihold(args.trans, dp);
        /*
         * Decide on what work routines to call based on the inode size.
@@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
         * No need to make quota reservations here. We expect to release some
         * blocks, not allocate, in the common case.
         */
-        xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(trans, dp);
-        xfs_trans_ihold(trans, dp);
        /*
         * Decide on what work routines to call based on the inode size.
@@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, dp);
-                        xfs_trans_ihold(args->trans, dp);
-                }
                /*
                 * Commit the current trans (including the inode) and start
@@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                         * and started a new one.  We need the inode to be
                         * in all transactions.
                         */
-                        if (committed) {
+                        if (committed)
-                                xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                                xfs_trans_ijoin(args->trans, dp);
-                                xfs_trans_ihold(args->trans, dp);
-                        }
                } else
                        xfs_da_buf_done(bp);
@@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, dp);
-                        xfs_trans_ihold(args->trans, dp);
-                }
        } else
                xfs_da_buf_done(bp);
        return(0);
@@ -1317,10 +1300,8 @@ restart:
                         * and started a new one.  We need the inode to be
                         * in all transactions.
                         */
-                        if (committed) {
+                        if (committed)
-                                xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                                xfs_trans_ijoin(args->trans, dp);
-                                xfs_trans_ihold(args->trans, dp);
-                        }
                        /*
                         * Commit the node conversion and start the next
@@ -1356,10 +1337,8 @@ restart:
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, dp);
-                        xfs_trans_ihold(args->trans, dp);
-                }
        } else {
                /*
                 * Addition succeeded, update Btree hashvals.
@@ -1470,10 +1449,8 @@ restart:
                         * and started a new one.  We need the inode to be
                         * in all transactions.
                         */
-                        if (committed) {
+                        if (committed)
-                                xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                                xfs_trans_ijoin(args->trans, dp);
-                                xfs_trans_ihold(args->trans, dp);
-                        }
                }
                /*
@@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, dp);
-                        xfs_trans_ihold(args->trans, dp);
-                }
                /*
                 * Commit the Btree join operation and start a new trans.
@@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                         * and started a new one.  We need the inode to be
                         * in all transactions.
                         */
-                        if (committed) {
+                        if (committed)
-                                xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                                xfs_trans_ijoin(args->trans, dp);
-                                xfs_trans_ihold(args->trans, dp);
-                        }
                } else
                        xfs_da_brelse(args->trans, bp);
        }
@@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
                error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
                                  args->rmtblkcnt,
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                  NULL, 0, map, &nmap, NULL, NULL);
+                                  NULL, 0, map, &nmap, NULL);
                if (error)
                        return(error);
                ASSERT(nmap >= 1);
@@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
                                                        XFS_BMAPI_WRITE,
                                  args->firstblock, args->total, &map, &nmap,
-                                  args->flist, NULL);
+                                  args->flist);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
                                                &committed);
@@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, dp);
-                        xfs_trans_ihold(args->trans, dp);
-                }
                ASSERT(nmap == 1);
                ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                                  args->rmtblkcnt,
                                  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
                                  args->firstblock, 0, &map, &nmap,
-                                  NULL, NULL);
+                                  NULL);
                if (error) {
                        return(error);
                }
@@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                                        args->rmtblkcnt,
                                        XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
                                        args->firstblock, 0, &map, &nmap,
-                                        args->flist, NULL);
+                                        args->flist);
                if (error) {
                        return(error);
                }
@@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
                                    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
                                    1, args->firstblock, args->flist,
-                                    NULL, &done);
+                                    &done);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
                                                &committed);
@@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                 * bmap_finish() may have committed the last trans and started
                 * a new one.  We need the inode to be in all transactions.
                 */
-                if (committed) {
+                if (committed)
-                        xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(args->trans, args->dp);
-                        xfs_trans_ihold(args->trans, args->dp);
-                }
                /*
                 * Close out trans and start the next one in the chain.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc256..a6cff8edcdb6 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
                nmap = 1;
                error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
                                        XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                        NULL, 0, &map, &nmap, NULL, NULL);
+                                        NULL, 0, &map, &nmap, NULL);
                if (error) {
                        return(error);
                }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 99587ded043f..23f14e595c18 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd);  /* OK to allocate reserved blocks */
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd);  /* OK to allocate reserved blocks */
 /*
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp,/* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd);  /* OK to allocate reserved blocks */
 /*
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork); /* data or attr fork */
 /*
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
-        int                     *logflagsp, /* inode logging flags */
+        int                     *logflagsp); /* inode logging flags */
-        xfs_extdelta_t          *delta); /* Change made to incore extents */
 /*
 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp,/* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd);   /* OK to allocate reserved blocks */
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd)   /* OK to use reserved data blocks */
 {
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
                        logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
                } else
                        logflags = 0;
-                /* DELTA: single new extent */
-                if (delta) {
-                        if (delta->xed_startoff > new->br_startoff)
-                                delta->xed_startoff = new->br_startoff;
-                        if (delta->xed_blockcount <
-                                        new->br_startoff + new->br_blockcount)
-                                delta->xed_blockcount = new->br_startoff +
-                                                new->br_blockcount;
-                }
        }
        /*
         * Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
                        ASSERT((cur->bc_private.b.flags &
                                XFS_BTCUR_BPRV_WASDEL) == 0);
                if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-                                &logflags, delta, rsvd)))
+                                &logflags, rsvd)))
                        goto done;
        }
        /*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
                        ASSERT((cur->bc_private.b.flags &
                                XFS_BTCUR_BPRV_WASDEL) == 0);
                if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-                                &logflags, delta, whichfork)))
+                                &logflags, whichfork)))
                        goto done;
        } else {
                xfs_bmbt_irec_t prev;   /* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
                                                XFS_BTCUR_BPRV_WASDEL);
                                if ((error = xfs_bmap_add_extent_delay_real(ip,
                                        idx, &cur, new, &da_new, first, flist,
-                                        &logflags, delta, rsvd)))
+                                        &logflags, rsvd)))
                                        goto done;
                        } else if (new->br_state == XFS_EXT_NORM) {
                                ASSERT(new->br_state == XFS_EXT_NORM);
                                if ((error = xfs_bmap_add_extent_unwritten_real(
-                                        ip, idx, &cur, new, &logflags, delta)))
+                                        ip, idx, &cur, new, &logflags)))
                                        goto done;
                        } else {
                                ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
                                if ((error = xfs_bmap_add_extent_unwritten_real(
-                                        ip, idx, &cur, new, &logflags, delta)))
+                                        ip, idx, &cur, new, &logflags)))
                                        goto done;
                        }
                        ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
                                ASSERT((cur->bc_private.b.flags &
                                        XFS_BTCUR_BPRV_WASDEL) == 0);
                        if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-                                        new, &logflags, delta, whichfork)))
+                                        new, &logflags, whichfork)))
                                goto done;
                }
        }
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
        xfs_fsblock_t           *first, /* pointer to firstblock variable */
        xfs_bmap_free_t         *flist, /* list of extents to be freed */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd)   /* OK to use reserved data block allocation */
 {
        xfs_btree_cur_t         *cur;   /* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
-                /* DELTA: Three in-core extents are replaced by one. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
-                /* DELTA: Two in-core extents are replaced by one. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                *dnew = 0;
-                /* DELTA: Two in-core extents are replaced by one. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
                *dnew = 0;
-                /* DELTA: The in-core extent described by new changed type. */
-                temp = new->br_startoff;
-                temp2 = new->br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
                *dnew = temp;
-                /* DELTA: The boundary between two in-core extents moved. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
                *dnew = temp;
-                /* DELTA: One in-core extent is split in two. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
                *dnew = temp;
-                /* DELTA: The boundary between two in-core extents moved. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
                *dnew = temp;
-                /* DELTA: One in-core extent is split in two. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case 0:
@@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real(
                        nullstartblock((int)temp2));
                trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
                *dnew = temp + temp2;
-                /* DELTA: One in-core extent is split in three. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real(
                ASSERT(0);
        }
        *curp = cur;
-        if (delta) {
-                temp2 += temp;
-                if (delta->xed_startoff > temp)
-                        delta->xed_startoff = temp;
-                if (delta->xed_blockcount < temp2)
-                        delta->xed_blockcount = temp2;
-        }
 done:
        *logflagsp = rval;
        return error;
@@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
-        int                     *logflagsp, /* inode logging flags */
+        int                     *logflagsp) /* inode logging flags */
-        xfs_extdelta_t          *delta) /* Change made to incore extents */
 {
        xfs_btree_cur_t         *cur;   /* btree cursor */
        xfs_bmbt_rec_host_t     *ep;    /* extent entry for idx */
@@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real(
                                        /* left is 0, right is 1, prev is 2 */
        int                     rval=0; /* return value (logging flags) */
        int                     state = 0;/* state bits, accessed thru macros */
-        xfs_filblks_t           temp=0;
-        xfs_filblks_t           temp2=0;
 #define LEFT            r[0]
 #define RIGHT           r[1]
@@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real(
                                RIGHT.br_blockcount, LEFT.br_state)))
                                goto done;
                }
-                /* DELTA: Three in-core extents are replaced by one. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real(
                                LEFT.br_state)))
                                goto done;
                }
-                /* DELTA: Two in-core extents are replaced by one. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
-                /* DELTA: Two in-core extents are replaced by one. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
-                /* DELTA: The in-core extent described by new changed type. */
-                temp = new->br_startoff;
-                temp2 = new->br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real(
                                LEFT.br_state))
                                goto done;
                }
-                /* DELTA: The boundary between two in-core extents moved. */
-                temp = LEFT.br_startoff;
-                temp2 = LEFT.br_blockcount +
-                        PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING:
@@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-                /* DELTA: One in-core extent is split in two. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real(
                                newext)))
                                goto done;
                }
-                /* DELTA: The boundary between two in-core extents moved. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount +
-                        RIGHT.br_blockcount;
                break;
        case BMAP_RIGHT_FILLING:
@@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-                /* DELTA: One in-core extent is split in two. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case 0:
@@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-                /* DELTA: One in-core extent is split in three. */
-                temp = PREV.br_startoff;
-                temp2 = PREV.br_blockcount;
                break;
        case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real(
                ASSERT(0);
        }
        *curp = cur;
-        if (delta) {
-                temp2 += temp;
-                if (delta->xed_startoff > temp)
-                        delta->xed_startoff = temp;
-                if (delta->xed_blockcount < temp2)
-                        delta->xed_blockcount = temp2;
-        }
 done:
        *logflagsp = rval;
        return error;
@@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay(
        xfs_extnum_t            idx,    /* extent number to update/insert */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     rsvd)           /* OK to allocate reserved blocks */
 {
        xfs_bmbt_rec_host_t     *ep;    /* extent record for idx */
@@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay(
        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
        int                     state;  /* state bits, accessed thru macros */
        xfs_filblks_t           temp=0; /* temp for indirect calculations */
-        xfs_filblks_t           temp2=0;
        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay(
                xfs_iext_remove(ip, idx, 1, state);
                ip->i_df.if_lastex = idx - 1;
-                /* DELTA: Two in-core extents were replaced by one. */
-                temp2 = temp;
-                temp = left.br_startoff;
                break;
        case BMAP_LEFT_CONTIG:
@@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay(
                trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
                ip->i_df.if_lastex = idx - 1;
-                /* DELTA: One in-core extent grew into a hole. */
-                temp2 = temp;
-                temp = left.br_startoff;
                break;
        case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay(
                trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
                ip->i_df.if_lastex = idx;
-                /* DELTA: One in-core extent grew into a hole. */
-                temp2 = temp;
-                temp = new->br_startoff;
                break;
        case 0:
@@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay(
                oldlen = newlen = 0;
                xfs_iext_insert(ip, idx, 1, new, state);
                ip->i_df.if_lastex = idx;
-                /* DELTA: A new in-core extent was added in a hole. */
-                temp2 = new->br_blockcount;
-                temp = new->br_startoff;
                break;
        }
        if (oldlen != newlen) {
@@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay(
                 * Nothing to do for disk quota accounting here.
                 */
        }
-        if (delta) {
-                temp2 += temp;
-                if (delta->xed_startoff > temp)
-                        delta->xed_startoff = temp;
-                if (delta->xed_blockcount < temp2)
-                        delta->xed_blockcount = temp2;
-        }
        *logflagsp = 0;
        return 0;
 }
@@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork) /* data or attr fork */
 {
        xfs_bmbt_rec_host_t     *ep;    /* pointer to extent entry ins. point */
@@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real(
        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
        int                     rval=0; /* return value (logging flags) */
        int                     state;  /* state bits, accessed thru macros */
-        xfs_filblks_t           temp=0;
-        xfs_filblks_t           temp2=0;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real(
                                        left.br_state)))
                                goto done;
                }
-                /* DELTA: Two in-core extents were replaced by one. */
-                temp = left.br_startoff;
-                temp2 = left.br_blockcount +
-                        new->br_blockcount +
-                        right.br_blockcount;
                break;
        case BMAP_LEFT_CONTIG:
@@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real(
                                        left.br_state)))
                                goto done;
                }
-                /* DELTA: One in-core extent grew. */
-                temp = left.br_startoff;
-                temp2 = left.br_blockcount +
-                        new->br_blockcount;
                break;
        case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real(
                                        right.br_state)))
                                goto done;
                }
-                /* DELTA: One in-core extent grew. */
-                temp = new->br_startoff;
-                temp2 = new->br_blockcount +
-                        right.br_blockcount;
                break;
        case 0:
@@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-                /* DELTA: A new extent was added in a hole. */
-                temp = new->br_startoff;
-                temp2 = new->br_blockcount;
                break;
        }
-        if (delta) {
-                temp2 += temp;
-                if (delta->xed_startoff > temp)
-                        delta->xed_startoff = temp;
-                if (delta->xed_blockcount < temp2)
-                        delta->xed_blockcount = temp2;
-        }
 done:
        *logflagsp = rval;
        return error;
@@ -2959,7 +2809,6 @@ xfs_bmap_del_extent(
        xfs_btree_cur_t         *cur,   /* if null, not a btree */
        xfs_bmbt_irec_t         *del,   /* data to remove from extents */
        int                     *logflagsp, /* inode logging flags */
-        xfs_extdelta_t          *delta, /* Change made to incore extents */
        int                     whichfork, /* data or attr fork */
        int                     rsvd)   /* OK to allocate reserved blocks */
 {
@@ -3265,14 +3114,6 @@ xfs_bmap_del_extent(
        if (da_old > da_new)
                xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
                        rsvd);
-        if (delta) {
-                /* DELTA: report the original extent. */
-                if (delta->xed_startoff > got.br_startoff)
-                        delta->xed_startoff = got.br_startoff;
-                if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
-                        delta->xed_blockcount = got.br_startoff +
-                                                        got.br_blockcount;
-        }
 done:
        *logflagsp = flags;
        return error;
@@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork(
                ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        }
        ASSERT(ip->i_d.di_anextents == 0);
-        IHOLD(ip);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_DEV:
                ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -4483,8 +4325,7 @@ xfs_bmapi(
        xfs_extlen_t    total,          /* total blocks needed */
        xfs_bmbt_irec_t *mval,          /* output: map values */
        int             *nmap,          /* i/o: mval size/count */
-        xfs_bmap_free_t *flist,         /* i/o: list extents to free */
+        xfs_bmap_free_t *flist)         /* i/o: list extents to free */
-        xfs_extdelta_t  *delta)         /* o: change made to incore extents */
 {
        xfs_fsblock_t   abno;           /* allocated block number */
        xfs_extlen_t    alen;           /* allocated extent length */
@@ -4596,10 +4437,7 @@ xfs_bmapi(
        end = bno + len;
        obno = bno;
        bma.ip = NULL;
-        if (delta) {
-                delta->xed_startoff = NULLFILEOFF;
-                delta->xed_blockcount = 0;
-        }
        while (bno < end && n < *nmap) {
                /*
                 * Reading past eof, act as though there's a hole
@@ -4620,19 +4458,13 @@ xfs_bmapi(
                         * allocate the stuff asked for in this bmap call
                         * but that wouldn't be as good.
                         */
-                        if (wasdelay && !(flags & XFS_BMAPI_EXACT)) {
+                        if (wasdelay) {
                                alen = (xfs_extlen_t)got.br_blockcount;
                                aoff = got.br_startoff;
                                if (lastx != NULLEXTNUM && lastx) {
                                        ep = xfs_iext_get_ext(ifp, lastx - 1);
                                        xfs_bmbt_get_all(ep, &prev);
                                }
-                        } else if (wasdelay) {
-                                alen = (xfs_extlen_t)
-                                        XFS_FILBLKS_MIN(len,
-                                                (got.br_startoff +
-                                                 got.br_blockcount) - bno);
-                                aoff = bno;
                        } else {
                                alen = (xfs_extlen_t)
                                        XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4831,7 +4663,7 @@ xfs_bmapi(
                                        got.br_state = XFS_EXT_UNWRITTEN;
                        }
                        error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-                                firstblock, flist, &tmp_logflags, delta,
+                                firstblock, flist, &tmp_logflags,
                                whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
                        logflags |= tmp_logflags;
                        if (error)
@@ -4927,7 +4759,7 @@ xfs_bmapi(
                        }
                        mval->br_state = XFS_EXT_NORM;
                        error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-                                firstblock, flist, &tmp_logflags, delta,
+                                firstblock, flist, &tmp_logflags,
                                whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
                        logflags |= tmp_logflags;
                        if (error)
@@ -5017,14 +4849,6 @@ xfs_bmapi(
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
               XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
        error = 0;
-        if (delta && delta->xed_startoff != NULLFILEOFF) {
-                /* A change was actually made.
-                 * Note that delta->xed_blockount is an offset at this
-                 * point and needs to be converted to a block count.
-                 */
-                ASSERT(delta->xed_blockcount > delta->xed_startoff);
-                delta->xed_blockcount -= delta->xed_startoff;
-        }
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
@@ -5136,8 +4960,6 @@ xfs_bunmapi(
        xfs_fsblock_t           *firstblock,    /* first allocated block
                                                   controls a.g. for allocs */
        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-        xfs_extdelta_t          *delta,         /* o: change made to incore
-                                                   extents */
        int                     *done)          /* set if not done yet */
 {
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
@@ -5196,10 +5018,7 @@ xfs_bunmapi(
        bno = start + len - 1;
        ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
                &prev);
-        if (delta) {
-                delta->xed_startoff = NULLFILEOFF;
-                delta->xed_blockcount = 0;
-        }
        /*
         * Check to see if the given block number is past the end of the
         * file, back up to the last block if so...
@@ -5297,7 +5116,7 @@ xfs_bunmapi(
                        }
                        del.br_state = XFS_EXT_UNWRITTEN;
                        error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-                                firstblock, flist, &logflags, delta,
+                                firstblock, flist, &logflags,
                                XFS_DATA_FORK, 0);
                        if (error)
                                goto error0;
@@ -5352,7 +5171,7 @@ xfs_bunmapi(
                                prev.br_state = XFS_EXT_UNWRITTEN;
                                error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
                                        &prev, firstblock, flist, &logflags,
-                                        delta, XFS_DATA_FORK, 0);
+                                        XFS_DATA_FORK, 0);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5361,7 +5180,7 @@ xfs_bunmapi(
                                del.br_state = XFS_EXT_UNWRITTEN;
                                error = xfs_bmap_add_extent(ip, lastx, &cur,
                                        &del, firstblock, flist, &logflags,
-                                        delta, XFS_DATA_FORK, 0);
+                                        XFS_DATA_FORK, 0);
                                if (error)
                                        goto error0;
                                goto nodelete;
@@ -5414,7 +5233,7 @@ xfs_bunmapi(
                        goto error0;
                }
                error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-                                &tmp_logflags, delta, whichfork, rsvd);
+                                &tmp_logflags, whichfork, rsvd);
                logflags |= tmp_logflags;
                if (error)
                        goto error0;
@@ -5471,14 +5290,6 @@ nodelete:
        ASSERT(ifp->if_ext_max ==
               XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
        error = 0;
-        if (delta && delta->xed_startoff != NULLFILEOFF) {
-                /* A change was actually made.
-                 * Note that delta->xed_blockount is an offset at this
-                 * point and needs to be converted to a block count.
-                 */
-                ASSERT(delta->xed_blockcount > delta->xed_startoff);
-                delta->xed_blockcount -= delta->xed_startoff;
-        }
 error0:
        /*
         * Log everything.  Do this after conversion, there's no point in
@@ -5605,28 +5416,6 @@ xfs_getbmap(
                prealloced = 0;
                fixlen = 1LL << 32;
        } else {
-                /*
-                 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
-                 * not generate a DMAPI read event.  Otherwise, if the
-                 * DM_EVENT_READ bit is set for the file, generate a read
-                 * event in order that the DMAPI application may do its thing
-                 * before we return the extents.  Usually this means restoring
-                 * user file data to regions of the file that look like holes.
-                 *
-                 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-                 * BMV_IF_NO_DMAPI_READ so that read events are generated.
-                 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
-                 * could misinterpret holes in a DMAPI file as true holes,
-                 * when in fact they may represent offline user data.
-                 */
-                if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-                    !(iflags & BMV_IF_NO_DMAPI_READ)) {
-                        error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
-                                              0, 0, 0, NULL);
-                        if (error)
-                                return XFS_ERROR(error);
-                }
                if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
                    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
                    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5502,7 @@ xfs_getbmap(
                error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
                                  XFS_BB_TO_FSB(mp, bmv->bmv_length),
                                  bmapi_flags, NULL, 0, map, &nmap,
-                                  NULL, NULL);
+                                  NULL);
                if (error)
                        goto out_free_map;
                ASSERT(nmap <= subnex);
@@ -5859,66 +5648,34 @@ xfs_bmap_eof(
 }
 #ifdef DEBUG
-STATIC
+STATIC struct xfs_buf *
-xfs_buf_t *
 xfs_bmap_get_bp(
-        xfs_btree_cur_t         *cur,
+        struct xfs_btree_cur    *cur,
        xfs_fsblock_t           bno)
 {
-        int i;
+        struct xfs_log_item_desc *lidp;
-        xfs_buf_t *bp;
+        int                     i;
        if (!cur)
-                return(NULL);
+                return NULL;
-        bp = NULL;
-        for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-                bp = cur->bc_bufs[i];
-                if (!bp) break;
-                if (XFS_BUF_ADDR(bp) == bno)
-                        break;  /* Found it */
-        }
-        if (i == XFS_BTREE_MAXLEVELS)
-                bp = NULL;
-        if (!bp) { /* Chase down all the log items to see if the bp is there */
-                xfs_log_item_chunk_t    *licp;
-                xfs_trans_t             *tp;
-                tp = cur->bc_tp;
-                licp = &tp->t_items;
-                while (!bp && licp != NULL) {
-                        if (xfs_lic_are_all_free(licp)) {
-                                licp = licp->lic_next;
-                                continue;
-                        }
-                        for (i = 0; i < licp->lic_unused; i++) {
-                                xfs_log_item_desc_t     *lidp;
-                                xfs_log_item_t          *lip;
-                                xfs_buf_log_item_t      *bip;
-                                xfs_buf_t               *lbp;
-                                if (xfs_lic_isfree(licp, i)) {
-                                        continue;
-                                }
-                                lidp = xfs_lic_slot(licp, i);
-                                lip = lidp->lid_item;
-                                if (lip->li_type != XFS_LI_BUF)
-                                        continue;
-                                bip = (xfs_buf_log_item_t *)lip;
+        for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-                                lbp = bip->bli_buf;
+                if (!cur->bc_bufs[i])
+                        break;
+                if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+                        return cur->bc_bufs[i];
+        }
-                                if (XFS_BUF_ADDR(lbp) == bno) {
+        /* Chase down all the log items to see if the bp is there */
-                                        bp = lbp;
+        list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
-                                        break; /* Found it */
+                struct xfs_buf_log_item *bip;
-                                }
+                bip = (struct xfs_buf_log_item *)lidp->lid_item;
-                        }
+                if (bip->bli_item.li_type == XFS_LI_BUF &&
-                        licp = licp->lic_next;
+                    XFS_BUF_ADDR(bip->bli_buf) == bno)
-                }
+                        return bip->bli_buf;
        }
-        return(bp);
+        return NULL;
 }
 STATIC void
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87d..b13569a6179b 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
 extern kmem_zone_t      *xfs_bmap_free_item_zone;
 /*
- * DELTA: describe a change to the in-core extent list.
- *
- * Internally the use of xed_blockount is somewhat funky.
- * xed_blockcount contains an offset much of the time because this
- * makes merging changes easier.  (xfs_fileoff_t and xfs_filblks_t are
- * the same underlying type).
- */
-typedef struct xfs_extdelta
-{
-        xfs_fileoff_t           xed_startoff;   /* offset of range */
-        xfs_filblks_t           xed_blockcount; /* blocks in range */
-} xfs_extdelta_t;
-/*
 * List of extents to be free "later".
 * The list is kept sorted on xbf_startblock.
 */
@@ -82,16 +68,13 @@ typedef	struct xfs_bmap_free
 #define XFS_BMAPI_DELAY         0x002   /* delayed write operation */
 #define XFS_BMAPI_ENTIRE        0x004   /* return entire extent, not trimmed */
 #define XFS_BMAPI_METADATA      0x008   /* mapping metadata not user data */
-#define XFS_BMAPI_EXACT         0x010   /* allocate only to spec'd bounds */
+#define XFS_BMAPI_ATTRFORK      0x010   /* use attribute fork not data */
-#define XFS_BMAPI_ATTRFORK      0x020   /* use attribute fork not data */
+#define XFS_BMAPI_RSVBLOCKS     0x020   /* OK to alloc. reserved data blocks */
-#define XFS_BMAPI_ASYNC         0x040   /* bunmapi xactions can be async */
+#define XFS_BMAPI_PREALLOC      0x040   /* preallocation op: unwritten space */
-#define XFS_BMAPI_RSVBLOCKS     0x080   /* OK to alloc. reserved data blocks */
+#define XFS_BMAPI_IGSTATE       0x080   /* Ignore state - */
-#define XFS_BMAPI_PREALLOC      0x100   /* preallocation op: unwritten space */
-#define XFS_BMAPI_IGSTATE       0x200   /* Ignore state - */
                                        /* combine contig. space */
-#define XFS_BMAPI_CONTIG        0x400   /* must allocate only one extent */
+#define XFS_BMAPI_CONTIG        0x100   /* must allocate only one extent */
-/*      XFS_BMAPI_DIRECT_IO     0x800   */
+#define XFS_BMAPI_CONVERT       0x200   /* unwritten extent conversion - */
-#define XFS_BMAPI_CONVERT       0x1000  /* unwritten extent conversion - */
                                        /* need write cache flushing and no */
                                        /* additional allocation alignments */
@@ -100,9 +83,7 @@ typedef	struct xfs_bmap_free
        { XFS_BMAPI_DELAY,      "DELAY" }, \
        { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
        { XFS_BMAPI_METADATA,   "METADATA" }, \
-        { XFS_BMAPI_EXACT,      "EXACT" }, \
        { XFS_BMAPI_ATTRFORK,   "ATTRFORK" }, \
-        { XFS_BMAPI_ASYNC,      "ASYNC" }, \
        { XFS_BMAPI_RSVBLOCKS,  "RSVBLOCKS" }, \
        { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
        { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
@@ -310,9 +291,7 @@ xfs_bmapi(
        xfs_extlen_t            total,          /* total blocks needed */
        struct xfs_bmbt_irec    *mval,          /* output: map values */
        int                     *nmap,          /* i/o: mval size/count */
-        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+        xfs_bmap_free_t         *flist);        /* i/o: list extents to free */
-        xfs_extdelta_t          *delta);        /* o: change made to incore
-                                                   extents */
 /*
 * Map file blocks to filesystem blocks, simple version.
@@ -346,8 +325,6 @@ xfs_bunmapi(
        xfs_fsblock_t           *firstblock,    /* first allocated block
                                                   controls a.g. for allocs */
        xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-        xfs_extdelta_t          *delta,         /* o: change made to incore
-                                                   extents */
        int                     *done);         /* set if not done yet */
 /*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b83..87d3c10b6954 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f2496..829af92f0fba 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 02a80984aa05..1b09d7a280df 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
 kmem_zone_t     *xfs_buf_item_zone;
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+        return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
 #ifdef XFS_TRANS_DEBUG
 /*
 * This function uses an alternate strategy for tracking the bytes
@@ -151,12 +156,13 @@ STATIC void	xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
 */
 STATIC uint
 xfs_buf_item_size(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        uint            nvecs;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-        int             next_bit;
+        struct xfs_buf          *bp = bip->bli_buf;
-        int             last_bit;
+        uint                    nvecs;
-        xfs_buf_t       *bp;
+        int                     next_bit;
+        int                     last_bit;
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        if (bip->bli_flags & XFS_BLI_STALE) {
@@ -170,7 +176,6 @@ xfs_buf_item_size(
                return 1;
        }
-        bp = bip->bli_buf;
        ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
        nvecs = 1;
        last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -219,13 +224,13 @@ xfs_buf_item_size(
 */
 STATIC void
 xfs_buf_item_format(
-        xfs_buf_log_item_t      *bip,
+        struct xfs_log_item     *lip,
-        xfs_log_iovec_t         *log_vector)
+        struct xfs_log_iovec    *vecp)
 {
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+        struct xfs_buf  *bp = bip->bli_buf;
        uint            base_size;
        uint            nvecs;
-        xfs_log_iovec_t *vecp;
-        xfs_buf_t       *bp;
        int             first_bit;
        int             last_bit;
        int             next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_STALE));
-        bp = bip->bli_buf;
-        vecp = log_vector;
        /*
         * The size of the base structure is the size of the
@@ -248,7 +251,7 @@ xfs_buf_item_format(
        base_size =
                (uint)(sizeof(xfs_buf_log_format_t) +
                       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
-        vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
+        vecp->i_addr = &bip->bli_format;
        vecp->i_len = base_size;
        vecp->i_type = XLOG_REG_TYPE_BFORMAT;
        vecp++;
@@ -263,7 +266,7 @@ xfs_buf_item_format(
         */
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
                if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
-                      xfs_log_item_in_current_chkpt(&bip->bli_item)))
+                      xfs_log_item_in_current_chkpt(lip)))
                        bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }
@@ -356,66 +359,90 @@ xfs_buf_item_format(
 /*
 * This is called to pin the buffer associated with the buf log item in memory
- * so it cannot be written out.  Simply call bpin() on the buffer to do this.
+ * so it cannot be written out.
 *
 * We also always take a reference to the buffer log item here so that the bli
 * is held while the item is pinned in memory. This means that we can
 * unconditionally drop the reference count a transaction holds when the
 * transaction is completed.
 */
 STATIC void
 xfs_buf_item_pin(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        xfs_buf_t       *bp;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-        bp = bip->bli_buf;
+        ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
-        ASSERT(XFS_BUF_ISBUSY(bp));
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_STALE));
-        atomic_inc(&bip->bli_refcount);
        trace_xfs_buf_item_pin(bip);
-        xfs_bpin(bp);
-}
+        atomic_inc(&bip->bli_refcount);
+        atomic_inc(&bip->bli_buf->b_pin_count);
+}
 /*
 * This is called to unpin the buffer associated with the buf log
 * item which was previously pinned with a call to xfs_buf_item_pin().
- * Just call bunpin() on the buffer to do this.
 *
 * Also drop the reference to the buf item for the current transaction.
 * If the XFS_BLI_STALE flag is set and we are the last reference,
 * then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path.  If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
 */
 STATIC void
 xfs_buf_item_unpin(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        struct xfs_ail  *ailp;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-        xfs_buf_t       *bp;
+        xfs_buf_t       *bp = bip->bli_buf;
-        int             freed;
+        struct xfs_ail  *ailp = lip->li_ailp;
        int             stale = bip->bli_flags & XFS_BLI_STALE;
+        int             freed;
-        bp = bip->bli_buf;
-        ASSERT(bp != NULL);
        ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        trace_xfs_buf_item_unpin(bip);
        freed = atomic_dec_and_test(&bip->bli_refcount);
-        ailp = bip->bli_item.li_ailp;
-        xfs_bunpin(bp);
+        if (atomic_dec_and_test(&bp->b_pin_count))
+                wake_up_all(&bp->b_waiters);
        if (freed && stale) {
                ASSERT(bip->bli_flags & XFS_BLI_STALE);
                ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
                ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
                ASSERT(XFS_BUF_ISSTALE(bp));
                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
                trace_xfs_buf_item_unpin_stale(bip);
+                if (remove) {
+                        /*
+                         * We have to remove the log item from the transaction
+                         * as we are about to release our reference to the
+                         * buffer.  If we don't, the unlock that occurs later
+                         * in xfs_trans_uncommit() will try to reference the
+                         * buffer which we no longer have a hold on.
+                         */
+                        xfs_trans_del_item(lip);
+                        /*
+                         * Since the transaction no longer refers to the buffer,
+                         * the buffer should no longer refer to the transaction.
+                         */
+                        XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+                }
                /*
                 * If we get called here because of an IO error, we may
                 * or may not have the item on the AIL. xfs_trans_ail_delete()
@@ -437,48 +464,6 @@ xfs_buf_item_unpin(
 }
 /*
- * this is called from uncommit in the forced-shutdown path.
- * we need to check to see if the reference count on the log item
- * is going to drop to zero.  If so, unpin will free the log item
- * so we need to free the item's descriptor (that points to the item)
- * in the transaction.
- */
-STATIC void
-xfs_buf_item_unpin_remove(
-        xfs_buf_log_item_t      *bip,
-        xfs_trans_t             *tp)
-{
-        /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
-        if ((atomic_read(&bip->bli_refcount) == 1) &&
-            (bip->bli_flags & XFS_BLI_STALE)) {
-                /*
-                 * yes -- We can safely do some work here and then call
-                 * buf_item_unpin to do the rest because we are
-                 * are holding the buffer locked so no one else will be
-                 * able to bump up the refcount. We have to remove the
-                 * log item from the transaction as we are about to release
-                 * our reference to the buffer. If we don't, the unlock that
-                 * occurs later in the xfs_trans_uncommit() will try to
-                 * reference the buffer which we no longer have a hold on.
-                 */
-                struct xfs_log_item_desc *lidp;
-                ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
-                trace_xfs_buf_item_unpin_stale(bip);
-                lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
-                xfs_trans_free_item(tp, lidp);
-                /*
-                 * Since the transaction no longer refers to the buffer, the
-                 * buffer should no longer refer to the transaction.
-                 */
-                XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
-        }
-        xfs_buf_item_unpin(bip);
-}
-/*
 * This is called to attempt to lock the buffer associated with this
 * buf log item.  Don't sleep on the buffer lock.  If we can't get
 * the lock right away, return 0.  If we can get the lock, take a
@@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove(
 */
 STATIC uint
 xfs_buf_item_trylock(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        xfs_buf_t       *bp;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+        struct xfs_buf          *bp = bip->bli_buf;
-        bp = bip->bli_buf;
        if (XFS_BUF_ISPINNED(bp))
                return XFS_ITEM_PINNED;
        if (!XFS_BUF_CPSEMA(bp))
@@ -529,13 +514,12 @@ xfs_buf_item_trylock(
 */
 STATIC void
 xfs_buf_item_unlock(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        int             aborted;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-        xfs_buf_t       *bp;
+        struct xfs_buf          *bp = bip->bli_buf;
-        uint            hold;
+        int                     aborted;
+        uint                    hold;
-        bp = bip->bli_buf;
        /* Clear the buffer's association with this transaction. */
        XFS_BUF_SET_FSPRIVATE2(bp, NULL);
@@ -546,7 +530,7 @@ xfs_buf_item_unlock(
         * (cancelled) buffers at unpin time, but we'll never go through the
         * pin/unpin cycle if we abort inside commit.
         */
-        aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+        aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
        /*
         * Before possibly freeing the buf item, determine if we should
@@ -607,16 +591,16 @@ xfs_buf_item_unlock(
 */
 STATIC xfs_lsn_t
 xfs_buf_item_committed(
-        xfs_buf_log_item_t      *bip,
+        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        trace_xfs_buf_item_committed(bip);
-        if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+        if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
-            (bip->bli_item.li_lsn != 0)) {
+                return lip->li_lsn;
-                return bip->bli_item.li_lsn;
+        return lsn;
-        }
-        return (lsn);
 }
 /*
@@ -626,15 +610,16 @@ xfs_buf_item_committed(
 */
 STATIC void
 xfs_buf_item_push(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        xfs_buf_t       *bp;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+        struct xfs_buf          *bp = bip->bli_buf;
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+        ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
        trace_xfs_buf_item_push(bip);
-        bp = bip->bli_buf;
-        ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
        xfs_buf_relse(bp);
 }
@@ -646,22 +631,24 @@ xfs_buf_item_push(
 */
 STATIC void
 xfs_buf_item_pushbuf(
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        xfs_buf_t       *bp;
+        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+        struct xfs_buf          *bp = bip->bli_buf;
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+        ASSERT(XFS_BUF_ISDELAYWRITE(bp));
        trace_xfs_buf_item_pushbuf(bip);
-        bp = bip->bli_buf;
-        ASSERT(XFS_BUF_ISDELAYWRITE(bp));
        xfs_buf_delwri_promote(bp);
        xfs_buf_relse(bp);
 }
-/* ARGSUSED */
 STATIC void
-xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+xfs_buf_item_committing(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               commit_lsn)
 {
 }
@@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
 * This is the ops vector shared by all buf log items.
 */
 static struct xfs_item_ops xfs_buf_item_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
+        .iop_size       = xfs_buf_item_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_buf_item_format,
-                                        xfs_buf_item_format,
+        .iop_pin        = xfs_buf_item_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
+        .iop_unpin      = xfs_buf_item_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
+        .iop_trylock    = xfs_buf_item_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+        .iop_unlock     = xfs_buf_item_unlock,
-                                        xfs_buf_item_unpin_remove,
+        .iop_committed  = xfs_buf_item_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
+        .iop_push       = xfs_buf_item_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
+        .iop_pushbuf    = xfs_buf_item_pushbuf,
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+        .iop_committing = xfs_buf_item_committing
-                                        xfs_buf_item_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-        .iop_pushbuf    = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_buf_item_committing
 };
@@ -712,7 +694,6 @@ xfs_buf_item_init(
         */
        if (bp->b_mount != mp)
                bp->b_mount = mp;
-        XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                if (lip->li_type == XFS_LI_BUF) {
@@ -1098,15 +1079,14 @@ xfs_buf_error_relse(
 * It is called by xfs_buf_iodone_callbacks() above which will take
 * care of cleaning up the buffer itself.
 */
-/* ARGSUSED */
 void
 xfs_buf_iodone(
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
-        xfs_buf_log_item_t      *bip)
+        struct xfs_log_item     *lip)
 {
-        struct xfs_ail          *ailp = bip->bli_item.li_ailp;
+        struct xfs_ail          *ailp = lip->li_ailp;
-        ASSERT(bip->bli_buf == bp);
+        ASSERT(BUF_ITEM(lip)->bli_buf == bp);
        xfs_buf_rele(bp);
@@ -1120,6 +1100,6 @@ xfs_buf_iodone(
         * Either way, AIL is useless if we're forcing a shutdown.
         */
        spin_lock(&ailp->xa_lock);
-        xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
+        xfs_trans_ail_delete(ailp, lip);
-        xfs_buf_item_free(bip);
+        xfs_buf_item_free(BUF_ITEM(lip));
 }
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f20bb472d582..0e2ed43f16c7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -124,7 +124,7 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,
                              void(*)(struct xfs_buf *, xfs_log_item_t *),
                              xfs_log_item_t *);
 void    xfs_buf_iodone_callbacks(struct xfs_buf *);
-void    xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+void    xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
 #ifdef XFS_TRANS_DEBUG
 void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf31..30fa0e206fba 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
        xfs_da_intnode_t *node;
        xfs_da_node_entry_t *btree;
        int tmp;
-        xfs_mount_t *mp;
        node = oldblk->bp->data;
-        mp = state->mp;
        ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
        ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
        ASSERT(newblk->blkno != 0);
        if (state->args->whichfork == XFS_DATA_FORK)
-                ASSERT(newblk->blkno >= mp->m_dirleafblk &&
+                ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
-                       newblk->blkno < mp->m_dirfreeblk);
+                       newblk->blkno < state->mp->m_dirfreeblk);
        /*
         * We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
                        xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
                        XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
-                        args->flist, NULL))) {
+                        args->flist))) {
                return error;
        }
        ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
                                        xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
                                        XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
-                                        &mapp[mapi], &nmap, args->flist,
+                                        &mapp[mapi], &nmap, args->flist))) {
-                                        NULL))) {
                                kmem_free(mapp);
                                return error;
                        }
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
                 */
                if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
                                xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-                                0, args->firstblock, args->flist, NULL,
+                                0, args->firstblock, args->flist,
                                &done)) == ENOSPC) {
                        if (w != XFS_DATA_FORK)
                                break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
                                        nfsb,
                                        XFS_BMAPI_METADATA |
                                                xfs_bmapi_aflag(whichfork),
-                                        NULL, 0, mapp, &nmap, NULL, NULL)))
+                                        NULL, 0, mapp, &nmap, NULL)))
                                goto exit0;
                }
        } else {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7f159d2a429a..3b9582c60a22 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dfrag.h"
 #include "xfs_error.h"
-#include "xfs_rw.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
@@ -425,11 +416,8 @@ xfs_swap_extents(
        }
-        IHOLD(ip);
+        xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+        xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        IHOLD(tip);
-        xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_log_inode(tp, ip,  ilf_fields);
        xfs_trans_log_inode(tp, tip, tilf_fields);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f041265..a1321bc7f192 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
        int             rval;           /* return value */
        int             v;              /* type-checking value */
-        xfs_itrace_entry(dp);
+        trace_xfs_readdir(dp);
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
        if ((error = xfs_bmapi(tp, dp, bno, count,
                        XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
-                        args->flist, NULL)))
+                        args->flist)))
                return error;
        ASSERT(nmap <= 1);
        if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
                        if ((error = xfs_bmapi(tp, dp, b, c,
                                        XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
-                                        &mapp[mapi], &nmap, args->flist,
+                                        &mapp[mapi], &nmap, args->flist))) {
-                                        NULL))) {
                                kmem_free(mapp);
                                return error;
                        }
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
         */
        if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
                        XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-                        NULL, &done))) {
+                        &done))) {
                /*
                 * ENOSPC actually can happen if we're in a removename with
                 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a84..580d99cef9e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
         */
        buf_len = dp->i_df.if_bytes;
-        buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+        buf = kmem_alloc(buf_len, KM_SLEEP);
-        memcpy(buf, sfp, dp->i_df.if_bytes);
+        memcpy(buf, sfp, buf_len);
-        xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+        xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
        dp->i_d.di_size = 0;
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        /*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d694330..921595b84f5b 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9e..504be8640e91 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
                                        xfs_dir2_byte_to_da(mp,
                                                XFS_DIR2_LEAF_OFFSET) - map_off,
                                        XFS_BMAPI_METADATA, NULL, 0,
-                                        &map[map_valid], &nmap, NULL, NULL);
+                                        &map[map_valid], &nmap, NULL);
                                /*
                                 * Don't know if we should ignore this or
                                 * try to return an error.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae756..f9a0864b696a 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463a..b1bae6b1eed9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd72375..000000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-/*      Values used to define the on-disk version of dm_attrname_t. All
- *      on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- *      In the on-disk inode, DMAPI attribute names consist of the user-provided
- *      name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
- *      changed.
- */
-#define DMATTR_PREFIXLEN        8
-#define DMATTR_PREFIXSTRING     "SGI_DMI_"
-typedef enum {
-        DM_EVENT_INVALID        = -1,
-        DM_EVENT_CANCEL         = 0,            /* not supported */
-        DM_EVENT_MOUNT          = 1,
-        DM_EVENT_PREUNMOUNT     = 2,
-        DM_EVENT_UNMOUNT        = 3,
-        DM_EVENT_DEBUT          = 4,            /* not supported */
-        DM_EVENT_CREATE         = 5,
-        DM_EVENT_CLOSE          = 6,            /* not supported */
-        DM_EVENT_POSTCREATE     = 7,
-        DM_EVENT_REMOVE         = 8,
-        DM_EVENT_POSTREMOVE     = 9,
-        DM_EVENT_RENAME         = 10,
-        DM_EVENT_POSTRENAME     = 11,
-        DM_EVENT_LINK           = 12,
-        DM_EVENT_POSTLINK       = 13,
-        DM_EVENT_SYMLINK        = 14,
-        DM_EVENT_POSTSYMLINK    = 15,
-        DM_EVENT_READ           = 16,
-        DM_EVENT_WRITE          = 17,
-        DM_EVENT_TRUNCATE       = 18,
-        DM_EVENT_ATTRIBUTE      = 19,
-        DM_EVENT_DESTROY        = 20,
-        DM_EVENT_NOSPACE        = 21,
-        DM_EVENT_USER           = 22,
-        DM_EVENT_MAX            = 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-typedef enum {
-        DM_RIGHT_NULL,
-        DM_RIGHT_SHARED,
-        DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define DM_EVENT_ENABLED(ip, event) ( \
-        unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
-                ( ((ip)->i_d.di_dmevmask & (1 << event)) || \
-                  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
-        )
-#else
-#define DM_EVENT_ENABLED(ip, event)     (0)
-#endif
-#define DM_XFS_VALID_FS_EVENTS          ( \
-        (1 << DM_EVENT_PREUNMOUNT)      | \
-        (1 << DM_EVENT_UNMOUNT)         | \
-        (1 << DM_EVENT_NOSPACE)         | \
-        (1 << DM_EVENT_DEBUT)           | \
-        (1 << DM_EVENT_CREATE)          | \
-        (1 << DM_EVENT_POSTCREATE)      | \
-        (1 << DM_EVENT_REMOVE)          | \
-        (1 << DM_EVENT_POSTREMOVE)      | \
-        (1 << DM_EVENT_RENAME)          | \
-        (1 << DM_EVENT_POSTRENAME)      | \
-        (1 << DM_EVENT_LINK)            | \
-        (1 << DM_EVENT_POSTLINK)        | \
-        (1 << DM_EVENT_SYMLINK)         | \
-        (1 << DM_EVENT_POSTSYMLINK)     | \
-        (1 << DM_EVENT_ATTRIBUTE)       | \
-        (1 << DM_EVENT_DESTROY)         )
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a regular file or a symlink.  These events are persistent.
-*/
-#define DM_XFS_VALID_FILE_EVENTS        ( \
-        (1 << DM_EVENT_ATTRIBUTE)       | \
-        (1 << DM_EVENT_DESTROY)         )
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a directory.  These events are persistent.
-*/
-#define DM_XFS_VALID_DIRECTORY_EVENTS   ( \
-        (1 << DM_EVENT_CREATE)          | \
-        (1 << DM_EVENT_POSTCREATE)      | \
-        (1 << DM_EVENT_REMOVE)          | \
-        (1 << DM_EVENT_POSTREMOVE)      | \
-        (1 << DM_EVENT_RENAME)          | \
-        (1 << DM_EVENT_POSTRENAME)      | \
-        (1 << DM_EVENT_LINK)            | \
-        (1 << DM_EVENT_POSTLINK)        | \
-        (1 << DM_EVENT_SYMLINK)         | \
-        (1 << DM_EVENT_POSTSYMLINK)     | \
-        (1 << DM_EVENT_ATTRIBUTE)       | \
-        (1 << DM_EVENT_DESTROY)         )
-/* Events supported by the XFS filesystem. */
-#define DM_XFS_SUPPORTED_EVENTS         ( \
-        (1 << DM_EVENT_MOUNT)           | \
-        (1 << DM_EVENT_PREUNMOUNT)      | \
-        (1 << DM_EVENT_UNMOUNT)         | \
-        (1 << DM_EVENT_NOSPACE)         | \
-        (1 << DM_EVENT_CREATE)          | \
-        (1 << DM_EVENT_POSTCREATE)      | \
-        (1 << DM_EVENT_REMOVE)          | \
-        (1 << DM_EVENT_POSTREMOVE)      | \
-        (1 << DM_EVENT_RENAME)          | \
-        (1 << DM_EVENT_POSTRENAME)      | \
-        (1 << DM_EVENT_LINK)            | \
-        (1 << DM_EVENT_POSTLINK)        | \
-        (1 << DM_EVENT_SYMLINK)         | \
-        (1 << DM_EVENT_POSTSYMLINK)     | \
-        (1 << DM_EVENT_READ)            | \
-        (1 << DM_EVENT_WRITE)           | \
-        (1 << DM_EVENT_TRUNCATE)        | \
-        (1 << DM_EVENT_ATTRIBUTE)       | \
-        (1 << DM_EVENT_DESTROY)         )
-/*
- *      Definitions used for the flags field on dm_send_*_event().
- */
-#define DM_FLAGS_NDELAY         0x001   /* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED       0x002   /* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX           0x004   /* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD   0x010   /* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR   0x020   /* thread holds i_alloc_sem wr */
-/*
- *      Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-/*
- *      Macros to turn caller specified delay/block flags into
- *      dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
-                        DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-#endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c3..000000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-static struct xfs_dmops xfs_dmcore_stub = {
-        .xfs_send_data          = (xfs_send_data_t)fs_nosys,
-        .xfs_send_mmap          = (xfs_send_mmap_t)fs_noerr,
-        .xfs_send_destroy       = (xfs_send_destroy_t)fs_nosys,
-        .xfs_send_namesp        = (xfs_send_namesp_t)fs_nosys,
-        .xfs_send_mount         = (xfs_send_mount_t)fs_nosys,
-        .xfs_send_unmount       = (xfs_send_unmount_t)fs_noerr,
-};
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
-        if (mp->m_flags & XFS_MOUNT_DMAPI) {
-                cmn_err(CE_WARN,
-                        "XFS: dmapi support not available in this kernel.");
-                return EINVAL;
-        }
-        mp->m_dm_ops = &xfs_dmcore_stub;
-        return 0;
-}
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 047b8a8e5c29..ed9990267661 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_utils.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 409fe81585fd..a55e687bf562 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
 kmem_zone_t     *xfs_efi_zone;
 kmem_zone_t     *xfs_efd_zone;
-STATIC void     xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+        return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
 void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+        struct xfs_efi_log_item *efip)
 {
-        int nexts = efip->efi_format.efi_nextents;
+        if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
-        if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
                kmem_free(efip);
-        } else {
+        else
                kmem_zone_free(xfs_efi_zone, efip);
-        }
 }
 /*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
 * We only need 1 iovec for an efi item.  It just logs the efi_log_format
 * structure.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
+        struct xfs_log_item     *lip)
 {
        return 1;
 }
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
 * slots in the efi item have been filled.
 */
 STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t  *efip,
+xfs_efi_item_format(
-                    xfs_log_iovec_t     *log_vector)
+        struct xfs_log_item     *lip,
+        struct xfs_log_iovec    *log_vector)
 {
-        uint    size;
+        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+        uint                    size;
        ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
        size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
        efip->efi_format.efi_size = 1;
-        log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+        log_vector->i_addr = &efip->efi_format;
        log_vector->i_len = size;
        log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
        ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 /*
 * Pinning has no meaning for an efi item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
 * While EFIs cannot really be pinned, the unpin operation is the
 * last place at which the EFI is manipulated during a transaction.
 * Here we coordinate with xfs_efi_cancel() to determine who gets to
 * free the EFI.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
-{
-        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
-        spin_lock(&ailp->xa_lock);
-        if (efip->efi_flags & XFS_EFI_CANCELED) {
-                /* xfs_trans_ail_delete() drops the AIL lock. */
-                xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-                xfs_efi_item_free(efip);
-        } else {
-                efip->efi_flags |= XFS_EFI_COMMITTED;
-                spin_unlock(&ailp->xa_lock);
-        }
-}
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item.  This routine duplicates
- * unpin because efi_flags is protected by the AIL lock.  Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
 STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_ail          *ailp = lip->li_ailp;
        spin_lock(&ailp->xa_lock);
        if (efip->efi_flags & XFS_EFI_CANCELED) {
-                /*
+                if (remove)
-                 * free the xaction descriptor pointing to this item
+                        xfs_trans_del_item(lip);
-                 */
-                lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
-                xfs_trans_free_item(tp, lidp);
                /* xfs_trans_ail_delete() drops the AIL lock. */
-                xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+                xfs_trans_ail_delete(ailp, lip);
                xfs_efi_item_free(efip);
        } else {
                efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
 * XFS_ITEM_PINNED so that the caller will eventually flush the log.
 * This should help in getting the EFI out of the AIL.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+        struct xfs_log_item     *lip)
 {
        return XFS_ITEM_PINNED;
 }
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
 /*
 * Efi items have no locking, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+        struct xfs_log_item     *lip)
 {
-        if (efip->efi_item.li_flags & XFS_LI_ABORTED)
+        if (lip->li_flags & XFS_LI_ABORTED)
-                xfs_efi_item_free(efip);
+                xfs_efi_item_free(EFI_ITEM(lip));
-        return;
 }
 /*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
 * flag is not paid any attention here.  Checking for that is delayed
 * until the EFI is unpinned.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn)
 {
        return lsn;
 }
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
 * stuck waiting for all of its corresponding efd items to be
 * committed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
@@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
 * example, for inodes, the inode is locked throughout the extent freeing
 * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn)
 {
-        return;
 }
 /*
 * This is the ops vector shared by all efi log items.
 */
 static struct xfs_item_ops xfs_efi_item_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
+        .iop_size       = xfs_efi_item_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_efi_item_format,
-                                        xfs_efi_item_format,
+        .iop_pin        = xfs_efi_item_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
+        .iop_unpin      = xfs_efi_item_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
+        .iop_trylock    = xfs_efi_item_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+        .iop_unlock     = xfs_efi_item_unlock,
-                                        xfs_efi_item_unpin_remove,
+        .iop_committed  = xfs_efi_item_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
+        .iop_push       = xfs_efi_item_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
+        .iop_committing = xfs_efi_item_committing
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_efi_item_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-        .iop_pushbuf    = NULL,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_efi_item_committing
 };
 /*
 * Allocate and initialize an efi item with the given number of extents.
 */
-xfs_efi_log_item_t *
+struct xfs_efi_log_item *
-xfs_efi_init(xfs_mount_t        *mp,
+xfs_efi_init(
-             uint               nextents)
+        struct xfs_mount        *mp,
+        uint                    nextents)
 {
-        xfs_efi_log_item_t      *efip;
+        struct xfs_efi_log_item *efip;
        uint                    size;
        ASSERT(nextents > 0);
        if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
                size = (uint)(sizeof(xfs_efi_log_item_t) +
                        ((nextents - 1) * sizeof(xfs_extent_t)));
-                efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+                efip = kmem_zalloc(size, KM_SLEEP);
        } else {
-                efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
+                efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
-                                                             KM_SLEEP);
        }
        xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
-        return (efip);
+        return efip;
 }
 /*
@@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t	*mp,
 int
 xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
-        xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+        xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
        uint i;
        uint len = sizeof(xfs_efi_log_format_t) + 
                (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);  
@@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
                memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
                return 0;
        } else if (buf->i_len == len32) {
-                xfs_efi_log_format_32_t *src_efi_fmt_32 =
+                xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
-                        (xfs_efi_log_format_32_t *)buf->i_addr;
                dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
                dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
@@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
                }
                return 0;
        } else if (buf->i_len == len64) {
-                xfs_efi_log_format_64_t *src_efi_fmt_64 =
+                xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
-                        (xfs_efi_log_format_64_t *)buf->i_addr;
                dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
                dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
@@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
        }
 }
-STATIC void
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
 {
-        int nexts = efdp->efd_format.efd_nextents;
+        return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
-        if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+        if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
                kmem_free(efdp);
-        } else {
+        else
                kmem_zone_free(xfs_efd_zone, efdp);
-        }
 }
 /*
@@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
 * We only need 1 iovec for an efd item.  It just logs the efd_log_format
 * structure.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+        struct xfs_log_item     *lip)
 {
        return 1;
 }
@@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
 * slots in the efd item have been filled.
 */
 STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t  *efdp,
+xfs_efd_item_format(
-                    xfs_log_iovec_t     *log_vector)
+        struct xfs_log_item     *lip,
+        struct xfs_log_iovec    *log_vector)
 {
-        uint    size;
+        struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+        uint                    size;
        ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
@@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
        size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
        efdp->efd_format.efd_size = 1;
-        log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+        log_vector->i_addr = &efdp->efd_format;
        log_vector->i_len = size;
        log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
        ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 /*
 * Pinning has no meaning for an efd item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
 * Since pinning has no meaning for an efd item, unpinning does
 * not either.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
-{
-        return;
-}
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        return;
 }
 /*
 * Efd items have no locking, so just return success.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
+        struct xfs_log_item     *lip)
 {
        return XFS_ITEM_LOCKED;
 }
@@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
 * Efd items have no locking or pushing, so return failure
 * so that the caller doesn't bother with us.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+        struct xfs_log_item     *lip)
 {
-        if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
+        if (lip->li_flags & XFS_LI_ABORTED)
-                xfs_efd_item_free(efdp);
+                xfs_efd_item_free(EFD_ITEM(lip));
-        return;
 }
 /*
@@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
 * return -1 to keep the transaction code from further referencing
 * this item.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn)
 {
+        struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
        /*
         * If we got a log I/O error, it's always the case that the LR with the
         * EFI got unpinned and freed before the EFD got aborted.
         */
-        if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+        if (!(lip->li_flags & XFS_LI_ABORTED))
                xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
        xfs_efd_item_free(efdp);
@@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
 * There isn't much you can do to push on an efd item.  It is simply
 * stuck waiting for the log to be flushed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+        struct xfs_log_item     *lip)
 {
-        return;
 }
 /*
@@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
 * example, for inodes, the inode is locked throughout the extent freeing
 * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+        struct xfs_log_item     *lip,
+        xfs_lsn_t               lsn)
 {
-        return;
 }
 /*
 * This is the ops vector shared by all efd log items.
 */
 static struct xfs_item_ops xfs_efd_item_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
+        .iop_size       = xfs_efd_item_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_efd_item_format,
-                                        xfs_efd_item_format,
+        .iop_pin        = xfs_efd_item_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
+        .iop_unpin      = xfs_efd_item_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
+        .iop_trylock    = xfs_efd_item_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+        .iop_unlock     = xfs_efd_item_unlock,
-                                        xfs_efd_item_unpin_remove,
+        .iop_committed  = xfs_efd_item_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
+        .iop_push       = xfs_efd_item_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
+        .iop_committing = xfs_efd_item_committing
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_efd_item_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-        .iop_pushbuf    = NULL,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_efd_item_committing
 };
 /*
 * Allocate and initialize an efd item with the given number of extents.
 */
-xfs_efd_log_item_t *
+struct xfs_efd_log_item *
-xfs_efd_init(xfs_mount_t        *mp,
+xfs_efd_init(
-             xfs_efi_log_item_t *efip,
+        struct xfs_mount        *mp,
-             uint               nextents)
+        struct xfs_efi_log_item *efip,
+        uint                    nextents)
 {
-        xfs_efd_log_item_t      *efdp;
+        struct xfs_efd_log_item *efdp;
        uint                    size;
        ASSERT(nextents > 0);
        if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
                size = (uint)(sizeof(xfs_efd_log_item_t) +
                        ((nextents - 1) * sizeof(xfs_extent_t)));
-                efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+                efdp = kmem_zalloc(size, KM_SLEEP);
        } else {
-                efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
+                efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
-                                                             KM_SLEEP);
        }
        xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
@@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t	*mp,
        efdp->efd_format.efd_nextents = nextents;
        efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
-        return (efdp);
+        return efdp;
 }
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee6603..9b715dce5699 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
 #include "xfs.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
        xfs_inode_t     *pip;   /* Parent directory inode pointer. */
 } fstrm_item_t;
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following functions should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+/*
+ * Read the filestreams counter (pagf_fstrms) of allocation group @agno on
+ * mount @mp and return its current value.  The counter is not modified.
+ *
+ * The per-ag structure is obtained with xfs_perag_get() and handed back with
+ * xfs_perag_put() before returning, so the caller holds nothing afterwards.
+ * NOTE(review): the only caller visible in this patch is the TRACE_FREE()
+ * invocation in xfs_fstrm_free_func() -- confirm whether non-tracing users
+ * exist before relying on this in new code.
+ */
+static int
+xfs_filestream_peek_ag(
+        xfs_mount_t     *mp,
+        xfs_agnumber_t  agno)
+{
+        struct xfs_perag *pag;
+        int             ret;
+        pag = xfs_perag_get(mp, agno);
+        ret = atomic_read(&pag->pagf_fstrms);
+        xfs_perag_put(pag);
+        return ret;
+}
+/*
+ * Atomically increment the filestreams counter (pagf_fstrms) of allocation
+ * group @agno on mount @mp and return the result of atomic_inc_return(),
+ * i.e. the counter value after the increment.
+ *
+ * The per-ag structure is obtained with xfs_perag_get() and handed back with
+ * xfs_perag_put() before returning; only the counter change persists.
+ */
+static int
+xfs_filestream_get_ag(
+        xfs_mount_t     *mp,
+        xfs_agnumber_t  agno)
+{
+        struct xfs_perag *pag;
+        int             ret;
+        pag = xfs_perag_get(mp, agno);
+        ret = atomic_inc_return(&pag->pagf_fstrms);
+        xfs_perag_put(pag);
+        return ret;
+}
+/*
+ * Atomically decrement the filestreams counter (pagf_fstrms) of allocation
+ * group @agno on mount @mp.  No value is returned, so callers cannot observe
+ * the resulting count through this function.
+ *
+ * The per-ag structure is obtained with xfs_perag_get() and handed back with
+ * xfs_perag_put() before returning; only the counter change persists.
+ */
+static void
+xfs_filestream_put_ag(
+        xfs_mount_t     *mp,
+        xfs_agnumber_t  agno)
+{
+        struct xfs_perag *pag;
+        pag = xfs_perag_get(mp, agno);
+        atomic_dec(&pag->pagf_fstrms);
+        xfs_perag_put(pag);
+}
 /*
 * Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
 {
        fstrm_item_t    *item  = (fstrm_item_t *)data;
        xfs_inode_t     *ip = item->ip;
-        int ref;
        ASSERT(ip->i_ino == ino);
        xfs_iflags_clear(ip, XFS_IFILESTREAM);
        /* Drop the reference taken on the AG when the item was added. */
-        ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+        xfs_filestream_put_ag(ip->i_mount, item->ag);
-        ASSERT(ref >= 0);
        TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
                xfs_filestream_peek_ag(ip->i_mount, item->ag));
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5d..09dd9af45434 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
 #endif
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all.  The race condition this addresses is the following:
- *
- *  - The work queue scheduler fires and pulls a filestream directory cache
- *    element off the LRU end of the cache for deletion, then gets pre-empted.
- *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
- *    remaining items from the cache and reallocates the mount point's per-ag
- *    array, resetting all the counters to zero.
- *  - The work queue thread resumes and calls the free function for the element
- *    it started cleaning up earlier.  In the process it decrements the
- *    filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
-        xfs_mount_t     *mp,
-        xfs_agnumber_t  agno)
-{
-        struct xfs_perag *pag;
-        int             ret;
-        pag = xfs_perag_get(mp, agno);
-        ret = atomic_read(&pag->pagf_fstrms);
-        xfs_perag_put(pag);
-        return ret;
-}
-static inline int
-xfs_filestream_get_ag(
-        xfs_mount_t     *mp,
-        xfs_agnumber_t  agno)
-{
-        struct xfs_perag *pag;
-        int             ret;
-        pag = xfs_perag_get(mp, agno);
-        ret = atomic_inc_return(&pag->pagf_fstrms);
-        xfs_perag_put(pag);
-        return ret;
-}
-static inline int
-xfs_filestream_put_ag(
-        xfs_mount_t     *mp,
-        xfs_agnumber_t  agno)
-{
-        struct xfs_perag *pag;
-        int             ret;
-        pag = xfs_perag_get(mp, agno);
-        ret = atomic_dec_return(&pag->pagf_fstrms);
-        xfs_perag_put(pag);
-        return ret;
-}
 /* allocation selection flags */
 typedef enum xfs_fstrm_alloc {
        XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b6..dbca5f5c37ba 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -626,8 +622,7 @@ xfs_fs_log_dummy(
        ip = mp->m_rootip;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        xfs_trans_set_sync(tp);
        error = xfs_trans_commit(tp, 0);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c7142a064c48..abf80ae1e95b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af5393..d352862cefa0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8f8b91be2c99..b1ecc6f97ade 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
        return ip;
 }
-STATIC void
+void
 xfs_inode_free(
        struct xfs_inode        *ip)
 {
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
                        ip->i_flags &= ~XFS_INEW;
                        ip->i_flags |= XFS_IRECLAIMABLE;
                        __xfs_inode_set_reclaim_tag(pag, ip);
-                        trace_xfs_iget_reclaim(ip);
+                        trace_xfs_iget_reclaim_fail(ip);
                        goto out_error;
                }
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
        } else {
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
+                        trace_xfs_iget_skip(ip);
                        error = EAGAIN;
                        goto out_error;
                }
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
                read_unlock(&pag->pag_ici_lock);
+                trace_xfs_iget_hit(ip);
        }
        if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
        xfs_iflags_clear(ip, XFS_ISTALE);
        XFS_STATS_INC(xs_ig_found);
-        trace_xfs_iget_found(ip);
        return 0;
 out_error:
@@ -264,7 +261,6 @@ xfs_iget_cache_miss(
 {
        struct xfs_inode        *ip;
        int                     error;
-        unsigned long           first_index, mask;
        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
        ip = xfs_inode_alloc(mp, ino);
@@ -275,7 +271,7 @@ xfs_iget_cache_miss(
        if (error)
                goto out_destroy;
-        xfs_itrace_entry(ip);
+        trace_xfs_iget_miss(ip);
        if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
                error = ENOENT;
@@ -301,8 +297,6 @@ xfs_iget_cache_miss(
                        BUG();
        }
-        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
-        first_index = agino & mask;
        write_lock(&pag->pag_ici_lock);
        /* insert the new inode */
@@ -321,7 +315,6 @@ xfs_iget_cache_miss(
        write_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
-        trace_xfs_iget_alloc(ip);
        *ipp = ip;
        return 0;
@@ -422,97 +415,6 @@ out_error_or_again:
 }
 /*
- * Decrement reference count of an inode structure and unlock it.
- *
- * ip -- the inode being released
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be released.  See the comment on xfs_iunlock() for a list
- *       of valid values.
- */
-void
-xfs_iput(xfs_inode_t    *ip,
-         uint           lock_flags)
-{
-        xfs_itrace_entry(ip);
-        xfs_iunlock(ip, lock_flags);
-        IRELE(ip);
-}
-/*
- * Special iput for brand-new inodes that are still locked
- */
-void
-xfs_iput_new(
-        xfs_inode_t     *ip,
-        uint            lock_flags)
-{
-        struct inode    *inode = VFS_I(ip);
-        xfs_itrace_entry(ip);
-        if ((ip->i_d.di_mode == 0)) {
-                ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-                make_bad_inode(inode);
-        }
-        if (inode->i_state & I_NEW)
-                unlock_new_inode(inode);
-        if (lock_flags)
-                xfs_iunlock(ip, lock_flags);
-        IRELE(ip);
-}
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot.  It must also free the lock
- * associated with the inode.
- *
- * Note: because we don't initialise everything on reallocation out
- * of the zone, we must ensure we nullify everything correctly before
- * freeing the structure.
- */
-void
-xfs_ireclaim(
-        struct xfs_inode        *ip)
-{
-        struct xfs_mount        *mp = ip->i_mount;
-        struct xfs_perag        *pag;
-        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
-        XFS_STATS_INC(xs_ig_reclaims);
-        /*
-         * Remove the inode from the per-AG radix tree.
-         *
-         * Because radix_tree_delete won't complain even if the item was never
-         * added to the tree assert that it's been there before to catch
-         * problems with the inode life time early on.
-         */
-        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-        write_lock(&pag->pag_ici_lock);
-        if (!radix_tree_delete(&pag->pag_ici_root, agino))
-                ASSERT(0);
-        write_unlock(&pag->pag_ici_lock);
-        xfs_perag_put(pag);
-        /*
-         * Here we do an (almost) spurious inode lock in order to coordinate
-         * with inode cache radix tree lookups.  This is because the lookup
-         * can reference the inodes in the cache without taking references.
-         *
-         * We make that OK here by ensuring that we wait until the inode is
-         * unlocked after the lookup before we go ahead and free it.  We get
-         * both the ilock and the iolock because the code may need to drop the
-         * ilock one but will still hold the iolock.
-         */
-        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        xfs_qm_dqdetach(ip);
-        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        xfs_inode_free(ip);
-}
-/*
 * This is a wrapper routine around the xfs_ilock() routine
 * used to centralize some grungy code.  It is used in places
 * that wish to lock the inode solely for reading the extents.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b76a829d7e20..68415cb4f23c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -44,7 +41,6 @@
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_utils.h"
 #include "xfs_quota.h"
@@ -426,7 +422,7 @@ xfs_iformat(
        if (!XFS_DFORK_Q(dip))
                return 0;
        ASSERT(ip->i_afp == NULL);
-        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
        ip->i_afp->if_ext_max =
                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
        else {
                real_size = roundup(size, 4);
-                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
        }
        ifp->if_bytes = size;
        ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
        }
        ifp->if_broot_bytes = size;
-        ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+        ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
        ASSERT(ifp->if_broot != NULL);
        /*
         * Copy and convert from the on-disk structure
@@ -922,7 +918,6 @@ xfs_iread_extents(
        int             error;
        xfs_ifork_t     *ifp;
        xfs_extnum_t    nextents;
-        size_t          size;
        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -930,7 +925,6 @@ xfs_iread_extents(
                return XFS_ERROR(EFSCORRUPTED);
        }
        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-        size = nextents * sizeof(xfs_bmbt_rec_t);
        ifp = XFS_IFORK_PTR(ip, whichfork);
        /*
@@ -1226,7 +1220,7 @@ xfs_isize_check(
                                       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
                          map_first),
                         XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-                         NULL, NULL))
+                         NULL))
            return;
        ASSERT(nimaps == 1);
        ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1460,7 +1454,7 @@ xfs_itruncate_finish(
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(ip->i_transp == *tp);
        ASSERT(ip->i_itemp != NULL);
-        ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+        ASSERT(ip->i_itemp->ili_lock_flags == 0);
        ntp = *tp;
@@ -1589,11 +1583,10 @@ xfs_itruncate_finish(
                xfs_bmap_init(&free_list, &first_block);
                error = xfs_bunmapi(ntp, ip,
                                    first_unmap_block, unmap_len,
-                                    xfs_bmapi_aflag(fork) |
+                                    xfs_bmapi_aflag(fork),
-                                      (sync ? 0 : XFS_BMAPI_ASYNC),
                                    XFS_ITRUNC_MAX_EXTENTS,
                                    &first_block, &free_list,
-                                    NULL, &done);
+                                    &done);
                if (error) {
                        /*
                         * If the bunmapi call encounters an error,
@@ -1612,12 +1605,8 @@ xfs_itruncate_finish(
                 */
                error = xfs_bmap_finish(tp, &free_list, &committed);
                ntp = *tp;
-                if (committed) {
+                if (committed)
-                        /* link the inode into the next xact in the chain */
+                        xfs_trans_ijoin(ntp, ip);
-                        xfs_trans_ijoin(ntp, ip,
-                                        XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-                        xfs_trans_ihold(ntp, ip);
-                }
                if (error) {
                        /*
@@ -1646,9 +1635,7 @@ xfs_itruncate_finish(
                error = xfs_trans_commit(*tp, 0);
                *tp = ntp;
-                /* link the inode into the next transaction in the chain */
+                xfs_trans_ijoin(ntp, ip);
-                xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-                xfs_trans_ihold(ntp, ip);
                if (error)
                        return error;
@@ -1985,7 +1972,7 @@ xfs_ifree_cluster(
                        if (lip->li_type == XFS_LI_INODE) {
                                iip = (xfs_inode_log_item_t *)lip;
                                ASSERT(iip->ili_logged == 1);
-                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
+                                lip->li_cb = xfs_istale_done;
                                xfs_trans_ail_copy_lsn(mp->m_ail,
                                                        &iip->ili_flush_lsn,
                                                        &iip->ili_item.li_lsn);
@@ -2055,9 +2042,8 @@ xfs_ifree_cluster(
                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
                                                &iip->ili_item.li_lsn);
-                        xfs_buf_attach_iodone(bp,
+                        xfs_buf_attach_iodone(bp, xfs_istale_done,
-                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
+                                                  &iip->ili_item);
-                                xfs_istale_done, (xfs_log_item_t *)iip);
                        if (ip != free_ip)
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2203,7 +2189,7 @@ xfs_iroot_realloc(
                 */
                if (ifp->if_broot_bytes == 0) {
                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
-                        ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
+                        ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
                        ifp->if_broot_bytes = (int)new_size;
                        return;
                }
@@ -2219,7 +2205,7 @@ xfs_iroot_realloc(
                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
                ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
-                                KM_SLEEP);
+                                KM_SLEEP | KM_NOFS);
                op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
                np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2245,7 +2231,7 @@ xfs_iroot_realloc(
        else
                new_size = 0;
        if (new_size > 0) {
-                new_broot = kmem_alloc(new_size, KM_SLEEP);
+                new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
                /*
                 * First copy over the btree block header.
                 */
@@ -2349,7 +2335,8 @@ xfs_idata_realloc(
                real_size = roundup(new_size, 4);
                if (ifp->if_u1.if_data == NULL) {
                        ASSERT(ifp->if_real_bytes == 0);
-                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+                        ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                        KM_SLEEP | KM_NOFS);
                } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
                        /*
                         * Only do the realloc if the underlying size
@@ -2360,11 +2347,12 @@ xfs_idata_realloc(
                                        kmem_realloc(ifp->if_u1.if_data,
                                                        real_size,
                                                        ifp->if_real_bytes,
-                                                        KM_SLEEP);
+                                                        KM_SLEEP | KM_NOFS);
                        }
                } else {
                        ASSERT(ifp->if_real_bytes == 0);
-                        ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+                        ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                        KM_SLEEP | KM_NOFS);
                        memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
                                ifp->if_bytes);
                }
@@ -2731,7 +2719,6 @@ cluster_corrupt_out:
                 * mark it as stale and brelse.
                 */
                if (XFS_BUF_IODONE_FUNC(bp)) {
-                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
                        XFS_BUF_UNDONE(bp);
                        XFS_BUF_STALE(bp);
                        XFS_BUF_ERROR(bp,EIO);
@@ -3069,8 +3056,7 @@ xfs_iflush_int(
                 * and unlock the inode's flush lock when the inode is
                 * completely written to disk.
                 */
-                xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
+                xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
-                                      xfs_iflush_done, (xfs_log_item_t *)iip);
                ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
                ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3514,13 +3500,11 @@ xfs_iext_remove_indirect(
        xfs_extnum_t    ext_diff;       /* extents to remove in current list */
        xfs_extnum_t    nex1;           /* number of extents before idx */
        xfs_extnum_t    nex2;           /* extents after idx + count */
-        int             nlists;         /* entries in indirection array */
        int             page_idx = idx; /* index in target extent list */
        ASSERT(ifp->if_flags & XFS_IFEXTIREC);
        erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
        ASSERT(erp != NULL);
-        nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
        nex1 = page_idx;
        ext_cnt = count;
        while (ext_cnt) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 78550df13cd6..0898c5417d12 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -443,8 +443,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 */
 int             xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                         uint, uint, xfs_inode_t **);
-void            xfs_iput(xfs_inode_t *, uint);
-void            xfs_iput_new(xfs_inode_t *, uint);
 void            xfs_ilock(xfs_inode_t *, uint);
 int             xfs_ilock_nowait(xfs_inode_t *, uint);
 void            xfs_iunlock(xfs_inode_t *, uint);
@@ -452,7 +450,7 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int             xfs_isilocked(xfs_inode_t *, uint);
 uint            xfs_ilock_map_shared(xfs_inode_t *);
 void            xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void            xfs_ireclaim(xfs_inode_t *);
+void            xfs_inode_free(struct xfs_inode *ip);
 /*
 * xfs_inode.c prototypes.
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cf8249a60004..fe00777e2796 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rw.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
 kmem_zone_t     *xfs_ili_zone;          /* inode log item zone */
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+        return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
 /*
 * This returns the number of iovecs needed to log the given inode item.
 *
@@ -55,13 +51,11 @@ kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
 */
 STATIC uint
 xfs_inode_item_size(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        uint            nvecs;
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-        xfs_inode_t     *ip;
+        struct xfs_inode        *ip = iip->ili_inode;
+        uint                    nvecs = 2;
-        ip = iip->ili_inode;
-        nvecs = 2;
        /*
         * Only log the data/extents/b-tree root if there is something
@@ -212,21 +206,17 @@ xfs_inode_item_size(
 */
 STATIC void
 xfs_inode_item_format(
-        xfs_inode_log_item_t    *iip,
+        struct xfs_log_item     *lip,
-        xfs_log_iovec_t         *log_vector)
+        struct xfs_log_iovec    *vecp)
 {
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode        *ip = iip->ili_inode;
        uint                    nvecs;
-        xfs_log_iovec_t         *vecp;
-        xfs_inode_t             *ip;
        size_t                  data_bytes;
        xfs_bmbt_rec_t          *ext_buffer;
-        int                     nrecs;
        xfs_mount_t             *mp;
-        ip = iip->ili_inode;
+        vecp->i_addr = &iip->ili_format;
-        vecp = log_vector;
-        vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
        vecp->i_len  = sizeof(xfs_inode_log_format_t);
        vecp->i_type = XLOG_REG_TYPE_IFORMAT;
        vecp++;
@@ -277,7 +267,7 @@ xfs_inode_item_format(
         */
        xfs_synchronize_times(ip);
-        vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+        vecp->i_addr = &ip->i_d;
        vecp->i_len  = sizeof(struct xfs_icdinode);
        vecp->i_type = XLOG_REG_TYPE_ICORE;
        vecp++;
@@ -323,18 +313,17 @@ xfs_inode_item_format(
                        ASSERT(ip->i_df.if_u1.if_extents != NULL);
                        ASSERT(ip->i_d.di_nextents > 0);
                        ASSERT(iip->ili_extents_buf == NULL);
-                        nrecs = ip->i_df.if_bytes /
+                        ASSERT((ip->i_df.if_bytes /
-                                (uint)sizeof(xfs_bmbt_rec_t);
+                                (uint)sizeof(xfs_bmbt_rec_t)) > 0);
-                        ASSERT(nrecs > 0);
 #ifdef XFS_NATIVE_HOST
-                        if (nrecs == ip->i_d.di_nextents) {
+                       if (ip->i_d.di_nextents == ip->i_df.if_bytes /
+                                               (uint)sizeof(xfs_bmbt_rec_t)) {
                                /*
                                 * There are no delayed allocation
                                 * extents, so just point to the
                                 * real extents array.
                                 */
-                                vecp->i_addr =
+                                vecp->i_addr = ip->i_df.if_u1.if_extents;
-                                        (char *)(ip->i_df.if_u1.if_extents);
                                vecp->i_len = ip->i_df.if_bytes;
                                vecp->i_type = XLOG_REG_TYPE_IEXT;
                        } else
@@ -352,7 +341,7 @@ xfs_inode_item_format(
                                ext_buffer = kmem_alloc(ip->i_df.if_bytes,
                                        KM_SLEEP);
                                iip->ili_extents_buf = ext_buffer;
-                                vecp->i_addr = (xfs_caddr_t)ext_buffer;
+                                vecp->i_addr = ext_buffer;
                                vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
                                                XFS_DATA_FORK);
                                vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +360,7 @@ xfs_inode_item_format(
                if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
                        ASSERT(ip->i_df.if_broot_bytes > 0);
                        ASSERT(ip->i_df.if_broot != NULL);
-                        vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+                        vecp->i_addr = ip->i_df.if_broot;
                        vecp->i_len = ip->i_df.if_broot_bytes;
                        vecp->i_type = XLOG_REG_TYPE_IBROOT;
                        vecp++;
@@ -389,7 +378,7 @@ xfs_inode_item_format(
                        ASSERT(ip->i_df.if_u1.if_data != NULL);
                        ASSERT(ip->i_d.di_size > 0);
-                        vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+                        vecp->i_addr = ip->i_df.if_u1.if_data;
                        /*
                         * Round i_bytes up to a word boundary.
                         * The underlying memory is guaranteed to
@@ -437,7 +426,7 @@ xfs_inode_item_format(
         * Assert that no attribute-related log flags are set.
         */
        if (!XFS_IFORK_Q(ip)) {
-                ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+                ASSERT(nvecs == lip->li_desc->lid_size);
                iip->ili_format.ilf_size = nvecs;
                ASSERT(!(iip->ili_format.ilf_fields &
                         (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +438,21 @@ xfs_inode_item_format(
                ASSERT(!(iip->ili_format.ilf_fields &
                         (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
                if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-                        ASSERT(ip->i_afp->if_bytes > 0);
-                        ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-                        ASSERT(ip->i_d.di_anextents > 0);
 #ifdef DEBUG
-                        nrecs = ip->i_afp->if_bytes /
+                        int nrecs = ip->i_afp->if_bytes /
                                (uint)sizeof(xfs_bmbt_rec_t);
-#endif
                        ASSERT(nrecs > 0);
                        ASSERT(nrecs == ip->i_d.di_anextents);
+                        ASSERT(ip->i_afp->if_bytes > 0);
+                        ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+                        ASSERT(ip->i_d.di_anextents > 0);
+#endif
 #ifdef XFS_NATIVE_HOST
                        /*
                         * There are not delayed allocation extents
                         * for attributes, so just point at the array.
                         */
-                        vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+                        vecp->i_addr = ip->i_afp->if_u1.if_extents;
                        vecp->i_len = ip->i_afp->if_bytes;
 #else
                        ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +462,7 @@ xfs_inode_item_format(
                        ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
                                KM_SLEEP);
                        iip->ili_aextents_buf = ext_buffer;
-                        vecp->i_addr = (xfs_caddr_t)ext_buffer;
+                        vecp->i_addr = ext_buffer;
                        vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
                                        XFS_ATTR_FORK);
 #endif
@@ -490,7 +479,7 @@ xfs_inode_item_format(
                if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
                        ASSERT(ip->i_afp->if_broot_bytes > 0);
                        ASSERT(ip->i_afp->if_broot != NULL);
-                        vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+                        vecp->i_addr = ip->i_afp->if_broot;
                        vecp->i_len = ip->i_afp->if_broot_bytes;
                        vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
                        vecp++;
@@ -506,7 +495,7 @@ xfs_inode_item_format(
                        ASSERT(ip->i_afp->if_bytes > 0);
                        ASSERT(ip->i_afp->if_u1.if_data != NULL);
-                        vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+                        vecp->i_addr = ip->i_afp->if_u1.if_data;
                        /*
                         * Round i_bytes up to a word boundary.
                         * The underlying memory is guaranteed to
@@ -528,7 +517,7 @@ xfs_inode_item_format(
                break;
        }
-        ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+        ASSERT(nvecs == lip->li_desc->lid_size);
        iip->ili_format.ilf_size = nvecs;
 }
@@ -539,12 +528,14 @@ xfs_inode_item_format(
 */
 STATIC void
 xfs_inode_item_pin(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+        struct xfs_inode        *ip = INODE_ITEM(lip)->ili_inode;
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
+        trace_xfs_inode_pin(ip, _RET_IP_);
-        atomic_inc(&iip->ili_inode->i_pincount);
+        atomic_inc(&ip->i_pincount);
 }
@@ -554,12 +545,12 @@ xfs_inode_item_pin(
 *
 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
 */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_unpin(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip,
+        int                     remove)
 {
-        struct xfs_inode        *ip = iip->ili_inode;
+        struct xfs_inode        *ip = INODE_ITEM(lip)->ili_inode;
        trace_xfs_inode_unpin(ip, _RET_IP_);
        ASSERT(atomic_read(&ip->i_pincount) > 0);
@@ -567,15 +558,6 @@ xfs_inode_item_unpin(
                wake_up(&ip->i_ipin_wait);
 }
-/* ARGSUSED */
-STATIC void
-xfs_inode_item_unpin_remove(
-        xfs_inode_log_item_t    *iip,
-        xfs_trans_t             *tp)
-{
-        xfs_inode_item_unpin(iip);
-}
 /*
 * This is called to attempt to lock the inode associated with this
 * inode log item, in preparation for the push routine which does the actual
@@ -591,19 +573,16 @@ xfs_inode_item_unpin_remove(
 */
 STATIC uint
 xfs_inode_item_trylock(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        register xfs_inode_t    *ip;
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode        *ip = iip->ili_inode;
-        ip = iip->ili_inode;
-        if (xfs_ipincount(ip) > 0) {
+        if (xfs_ipincount(ip) > 0)
                return XFS_ITEM_PINNED;
-        }
-        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+        if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
                return XFS_ITEM_LOCKED;
-        }
        if (!xfs_iflock_nowait(ip)) {
                /*
@@ -629,7 +608,7 @@ xfs_inode_item_trylock(
        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                ASSERT(iip->ili_format.ilf_fields != 0);
                ASSERT(iip->ili_logged == 0);
-                ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+                ASSERT(lip->li_flags & XFS_LI_IN_AIL);
        }
 #endif
        return XFS_ITEM_SUCCESS;
@@ -643,26 +622,18 @@ xfs_inode_item_trylock(
 */
 STATIC void
 xfs_inode_item_unlock(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        uint            hold;
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-        uint            iolocked;
+        struct xfs_inode        *ip = iip->ili_inode;
-        uint            lock_flags;
+        unsigned short          lock_flags;
-        xfs_inode_t     *ip;
-        ASSERT(iip != NULL);
        ASSERT(iip->ili_inode->i_itemp != NULL);
        ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
-        ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-                  XFS_ILI_IOLOCKED_EXCL)) ||
-               xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
-        ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
-                  XFS_ILI_IOLOCKED_SHARED)) ||
-               xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
        /*
         * Clear the transaction pointer in the inode.
         */
-        ip = iip->ili_inode;
        ip->i_transp = NULL;
        /*
@@ -686,34 +657,11 @@ xfs_inode_item_unlock(
                iip->ili_aextents_buf = NULL;
        }
-        /*
+        lock_flags = iip->ili_lock_flags;
-         * Figure out if we should unlock the inode or not.
+        iip->ili_lock_flags = 0;
-         */
+        if (lock_flags) {
-        hold = iip->ili_flags & XFS_ILI_HOLD;
+                xfs_iunlock(iip->ili_inode, lock_flags);
+                IRELE(iip->ili_inode);
-        /*
-         * Before clearing out the flags, remember whether we
-         * are holding the inode's IO lock.
-         */
-        iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
-        /*
-         * Clear out the fields of the inode log item particular
-         * to the current transaction.
-         */
-        iip->ili_flags = 0;
-        /*
-         * Unlock the inode if XFS_ILI_HOLD was not set.
-         */
-        if (!hold) {
-                lock_flags = XFS_ILOCK_EXCL;
-                if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
-                        lock_flags |= XFS_IOLOCK_EXCL;
-                } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
-                        lock_flags |= XFS_IOLOCK_SHARED;
-                }
-                xfs_iput(iip->ili_inode, lock_flags);
        }
 }
@@ -725,13 +673,12 @@ xfs_inode_item_unlock(
 * is the only one that matters.  Therefore, simply return the
 * given lsn.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
 xfs_inode_item_committed(
-        xfs_inode_log_item_t    *iip,
+        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
-        return (lsn);
+        return lsn;
 }
 /*
@@ -743,13 +690,12 @@ xfs_inode_item_committed(
 */
 STATIC void
 xfs_inode_item_pushbuf(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        xfs_inode_t     *ip;
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-        xfs_mount_t     *mp;
+        struct xfs_inode        *ip = iip->ili_inode;
-        xfs_buf_t       *bp;
+        struct xfs_buf          *bp;
-        ip = iip->ili_inode;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
        /*
@@ -757,14 +703,13 @@ xfs_inode_item_pushbuf(
         * inode was taken off the AIL. So, just get out.
         */
        if (completion_done(&ip->i_flush) ||
-            ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+            !(lip->li_flags & XFS_LI_IN_AIL)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return;
        }
-        mp = ip->i_mount;
+        bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
-        bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
+                        iip->ili_format.ilf_len, XBF_TRYLOCK);
-                    iip->ili_format.ilf_len, XBF_TRYLOCK);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        if (!bp)
@@ -772,10 +717,8 @@ xfs_inode_item_pushbuf(
        if (XFS_BUF_ISDELAYWRITE(bp))
                xfs_buf_delwri_promote(bp);
        xfs_buf_relse(bp);
-        return;
 }
 /*
 * This is called to asynchronously write the inode associated with this
 * inode log item out to disk. The inode will already have been locked by
@@ -783,14 +726,14 @@ xfs_inode_item_pushbuf(
 */
 STATIC void
 xfs_inode_item_push(
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        xfs_inode_t     *ip;
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+        struct xfs_inode        *ip = iip->ili_inode;
-        ip = iip->ili_inode;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
        ASSERT(!completion_done(&ip->i_flush));
        /*
         * Since we were able to lock the inode's flush lock and
         * we found it on the AIL, the inode must be dirty.  This
@@ -813,43 +756,34 @@ xfs_inode_item_push(
         */
        (void) xfs_iflush(ip, 0);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        return;
 }
 /*
 * XXX rcc - this one really has to do something.  Probably needs
 * to stamp in a new field in the incore inode.
 */
-/* ARGSUSED */
 STATIC void
 xfs_inode_item_committing(
-        xfs_inode_log_item_t    *iip,
+        struct xfs_log_item     *lip,
        xfs_lsn_t               lsn)
 {
-        iip->ili_last_lsn = lsn;
+        INODE_ITEM(lip)->ili_last_lsn = lsn;
-        return;
 }
 /*
 * This is the ops vector shared by all buf log items.
 */
 static struct xfs_item_ops xfs_inode_item_ops = {
-        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
+        .iop_size       = xfs_inode_item_size,
-        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+        .iop_format     = xfs_inode_item_format,
-                                        xfs_inode_item_format,
+        .iop_pin        = xfs_inode_item_pin,
-        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
+        .iop_unpin      = xfs_inode_item_unpin,
-        .iop_unpin      = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
+        .iop_trylock    = xfs_inode_item_trylock,
-        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+        .iop_unlock     = xfs_inode_item_unlock,
-                                        xfs_inode_item_unpin_remove,
+        .iop_committed  = xfs_inode_item_committed,
-        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
+        .iop_push       = xfs_inode_item_push,
-        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
+        .iop_pushbuf    = xfs_inode_item_pushbuf,
-        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+        .iop_committing = xfs_inode_item_committing
-                                        xfs_inode_item_committed,
-        .iop_push       = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
-        .iop_pushbuf    = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
-        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-                                        xfs_inode_item_committing
 };
@@ -858,10 +792,10 @@ static struct xfs_item_ops xfs_inode_item_ops = {
 */
 void
 xfs_inode_item_init(
-        xfs_inode_t     *ip,
+        struct xfs_inode        *ip,
-        xfs_mount_t     *mp)
+        struct xfs_mount        *mp)
 {
-        xfs_inode_log_item_t    *iip;
+        struct xfs_inode_log_item *iip;
        ASSERT(ip->i_itemp == NULL);
        iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
@@ -899,14 +833,14 @@ xfs_inode_item_destroy(
 * from the AIL if it has not been re-logged, and unlocking the inode's
 * flush lock.
 */
-/*ARGSUSED*/
 void
 xfs_iflush_done(
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
+        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
        xfs_inode_t             *ip = iip->ili_inode;
-        struct xfs_ail          *ailp = iip->ili_item.li_ailp;
+        struct xfs_ail          *ailp = lip->li_ailp;
        /*
         * We only want to pull the item from the AIL if it is
@@ -917,12 +851,11 @@ xfs_iflush_done(
         * the lock since it's cheaper, and then we recheck while
         * holding the lock before removing the inode from the AIL.
         */
-        if (iip->ili_logged &&
+        if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
-            (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
                spin_lock(&ailp->xa_lock);
-                if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+                if (lip->li_lsn == iip->ili_flush_lsn) {
                        /* xfs_trans_ail_delete() drops the AIL lock. */
-                        xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
+                        xfs_trans_ail_delete(ailp, lip);
                } else {
                        spin_unlock(&ailp->xa_lock);
                }
@@ -940,8 +873,6 @@ xfs_iflush_done(
         * Release the inode's flush lock since we're done with it.
         */
        xfs_ifunlock(ip);
-        return;
 }
 /*
@@ -957,10 +888,8 @@ xfs_iflush_abort(
        xfs_inode_t             *ip)
 {
        xfs_inode_log_item_t    *iip = ip->i_itemp;
-        xfs_mount_t             *mp;
        iip = ip->i_itemp;
-        mp = ip->i_mount;
        if (iip) {
                struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -991,10 +920,10 @@ xfs_iflush_abort(
 void
 xfs_istale_done(
-        xfs_buf_t               *bp,
+        struct xfs_buf          *bp,
-        xfs_inode_log_item_t    *iip)
+        struct xfs_log_item     *lip)
 {
-        xfs_iflush_abort(iip->ili_inode);
+        xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
 }
 /*
@@ -1007,9 +936,8 @@ xfs_inode_item_format_convert(
        xfs_inode_log_format_t  *in_f)
 {
        if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
-                xfs_inode_log_format_32_t *in_f32;
+                xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
-                in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
                in_f->ilf_type = in_f32->ilf_type;
                in_f->ilf_size = in_f32->ilf_size;
                in_f->ilf_fields = in_f32->ilf_fields;
@@ -1025,9 +953,8 @@ xfs_inode_item_format_convert(
                in_f->ilf_boffset = in_f32->ilf_boffset;
                return 0;
        } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
-                xfs_inode_log_format_64_t *in_f64;
+                xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
-                in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
                in_f->ilf_type = in_f64->ilf_type;
                in_f->ilf_size = in_f64->ilf_size;
                in_f->ilf_fields = in_f64->ilf_fields;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecdd..d3dee61e6d91 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
                                 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
                                 XFS_ILOG_ABROOT)
-#define XFS_ILI_HOLD            0x1
-#define XFS_ILI_IOLOCKED_EXCL   0x2
-#define XFS_ILI_IOLOCKED_SHARED 0x4
-#define XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
 static inline int xfs_ilog_fbroot(int w)
 {
        return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
        struct xfs_inode        *ili_inode;        /* inode ptr */
        xfs_lsn_t               ili_flush_lsn;     /* lsn at last flush */
        xfs_lsn_t               ili_last_lsn;      /* lsn at last transaction */
-        unsigned short          ili_flags;         /* misc flags */
+        unsigned short          ili_lock_flags;    /* lock flags */
        unsigned short          ili_logged;        /* flushed logged data */
        unsigned int            ili_last_fields;   /* fields when flushed */
        struct xfs_bmbt_rec     *ili_extents_buf;  /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
                                         xfs_inode_log_format_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ef14943829da..20576146369f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
-#include "xfs_dmapi.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_ialloc.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -123,7 +118,7 @@ xfs_iomap(
        error = xfs_bmapi(NULL, ip, offset_fsb,
                        (xfs_filblks_t)(end_fsb - offset_fsb),
                        bmapi_flags,  NULL, 0, imap,
-                        nimaps, NULL, NULL);
+                        nimaps, NULL);
        if (error)
                goto out;
@@ -138,7 +133,7 @@ xfs_iomap(
                        break;
                }
-                if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+                if (flags & BMAPI_DIRECT) {
                        error = xfs_iomap_write_direct(ip, offset, count, flags,
                                                       imap, nimaps);
                } else {
@@ -247,7 +242,7 @@ xfs_iomap_write_direct(
        xfs_off_t       offset,
        size_t          count,
        int             flags,
-        xfs_bmbt_irec_t *ret_imap,
+        xfs_bmbt_irec_t *imap,
        int             *nmaps)
 {
        xfs_mount_t     *mp = ip->i_mount;
@@ -261,7 +256,6 @@ xfs_iomap_write_direct(
        int             quota_flag;
        int             rt;
        xfs_trans_t     *tp;
-        xfs_bmbt_irec_t imap;
        xfs_bmap_free_t free_list;
        uint            qblocks, resblks, resrtextents;
        int             committed;
@@ -285,10 +279,10 @@ xfs_iomap_write_direct(
                if (error)
                        goto error_out;
        } else {
-                if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
+                if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
                        last_fsb = MIN(last_fsb, (xfs_fileoff_t)
-                                        ret_imap->br_blockcount +
+                                        imap->br_blockcount +
-                                        ret_imap->br_startoff);
+                                        imap->br_startoff);
        }
        count_fsb = last_fsb - offset_fsb;
        ASSERT(count_fsb > 0);
@@ -334,20 +328,22 @@ xfs_iomap_write_direct(
        if (error)
                goto error1;
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        bmapi_flag = XFS_BMAPI_WRITE;
        if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
                bmapi_flag |= XFS_BMAPI_PREALLOC;
        /*
-         * Issue the xfs_bmapi() call to allocate the blocks
+         * Issue the xfs_bmapi() call to allocate the blocks.
+         *
+         * From this point onwards we overwrite the extent record that the
+         * caller's imap pointer refers to.
         */
        xfs_bmap_init(&free_list, &firstfsb);
        nimaps = 1;
        error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-                &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+                &firstfsb, 0, imap, &nimaps, &free_list);
        if (error)
                goto error0;
@@ -369,12 +365,11 @@ xfs_iomap_write_direct(
                goto error_out;
        }
-        if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+        if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-                error = xfs_cmn_err_fsblock_zero(ip, &imap);
+                error = xfs_cmn_err_fsblock_zero(ip, imap);
                goto error_out;
        }
-        *ret_imap = imap;
        *nmaps = 1;
        return 0;
@@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
                imaps = nimaps;
                firstblock = NULLFSBLOCK;
                error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-                                  &firstblock, 0, imap, &imaps, NULL, NULL);
+                                  &firstblock, 0, imap, &imaps, NULL);
                if (error)
                        return error;
                for (n = 0; n < imaps; n++) {
@@ -500,7 +495,7 @@ retry:
                          (xfs_filblks_t)(last_fsb - offset_fsb),
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-                          &nimaps, NULL, NULL);
+                          &nimaps, NULL);
        if (error && (error != ENOSPC))
                return XFS_ERROR(error);
@@ -548,7 +543,7 @@ xfs_iomap_write_allocate(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
        size_t          count,
-        xfs_bmbt_irec_t *map,
+        xfs_bmbt_irec_t *imap,
        int             *retmap)
 {
        xfs_mount_t     *mp = ip->i_mount;
@@ -557,7 +552,6 @@ xfs_iomap_write_allocate(
        xfs_fsblock_t   first_block;
        xfs_bmap_free_t free_list;
        xfs_filblks_t   count_fsb;
-        xfs_bmbt_irec_t imap;
        xfs_trans_t     *tp;
        int             nimaps, committed;
        int             error = 0;
@@ -573,8 +567,8 @@ xfs_iomap_write_allocate(
                return XFS_ERROR(error);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
-        count_fsb = map->br_blockcount;
+        count_fsb = imap->br_blockcount;
-        map_start_fsb = map->br_startoff;
+        map_start_fsb = imap->br_startoff;
        XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
@@ -602,8 +596,7 @@ xfs_iomap_write_allocate(
                                return XFS_ERROR(error);
                        }
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                        xfs_trans_ijoin(tp, ip);
-                        xfs_trans_ihold(tp, ip);
                        xfs_bmap_init(&free_list, &first_block);
@@ -654,10 +647,15 @@ xfs_iomap_write_allocate(
                                }
                        }
-                        /* Go get the actual blocks */
+                        /*
+                         * Go get the actual blocks.
+                         *
+                         * From this point onwards we overwrite the extent
+                         * record that the caller's imap pointer refers to.
+                         */
                        error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
                                        XFS_BMAPI_WRITE, &first_block, 1,
-                                        &imap, &nimaps, &free_list, NULL);
+                                        imap, &nimaps, &free_list);
                        if (error)
                                goto trans_cancel;
@@ -676,13 +674,12 @@ xfs_iomap_write_allocate(
                 * See if we were able to allocate an extent that
                 * covers at least part of the callers request
                 */
-                if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+                if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
-                        return xfs_cmn_err_fsblock_zero(ip, &imap);
+                        return xfs_cmn_err_fsblock_zero(ip, imap);
-                if ((offset_fsb >= imap.br_startoff) &&
+                if ((offset_fsb >= imap->br_startoff) &&
-                    (offset_fsb < (imap.br_startoff +
+                    (offset_fsb < (imap->br_startoff +
-                                   imap.br_blockcount))) {
+                                   imap->br_blockcount))) {
-                        *map = imap;
                        *retmap = 1;
                        XFS_STATS_INC(xs_xstrat_quick);
                        return 0;
@@ -692,8 +689,8 @@ xfs_iomap_write_allocate(
                 * So far we have not mapped the requested part of the
                 * file, just surrounding data, try again.
                 */
-                count_fsb -= imap.br_blockcount;
+                count_fsb -= imap->br_blockcount;
-                map_start_fsb = imap.br_startoff + imap.br_blockcount;
+                map_start_fsb = imap->br_startoff + imap->br_blockcount;
        }
 trans_cancel:
@@ -766,8 +763,7 @@ xfs_iomap_write_unwritten(
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                /*
                 * Modify the unwritten extent state of the buffer.
@@ -776,7 +772,7 @@ xfs_iomap_write_unwritten(
                nimaps = 1;
                error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
                                  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-                                  1, &imap, &nimaps, &free_list, NULL);
+                                  1, &imap, &nimaps, &free_list);
                if (error)
                        goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 81ac4afd45b3..7748a430f50d 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,17 +18,16 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
-typedef enum {
+/* base extent manipulation calls */
-        /* base extent manipulation calls */
+#define BMAPI_READ      (1 << 0)        /* read extents */
-        BMAPI_READ = (1 << 0),          /* read extents */
+#define BMAPI_WRITE     (1 << 1)        /* create extents */
-        BMAPI_WRITE = (1 << 1),         /* create extents */
+#define BMAPI_ALLOCATE  (1 << 2)        /* delayed allocate to real extents */
-        BMAPI_ALLOCATE = (1 << 2),      /* delayed allocate to real extents */
-        /* modifiers */
+/* modifiers */
-        BMAPI_IGNSTATE = (1 << 4),      /* ignore unwritten state on read */
+#define BMAPI_IGNSTATE  (1 << 4)        /* ignore unwritten state on read */
-        BMAPI_DIRECT = (1 << 5),        /* direct instead of buffered write */
+#define BMAPI_DIRECT    (1 << 5)        /* direct instead of buffered write */
-        BMAPI_MMAP = (1 << 6),          /* allocate for mmap write */
+#define BMAPI_MMA       (1 << 6)        /* allocate for mmap write */
-        BMAPI_TRYLOCK = (1 << 7),       /* non-blocking request */
+#define BMAPI_TRYLOCK   (1 << 7)        /* non-blocking request */
-} bmapi_flags_t;
 #define BMAPI_FLAGS \
        { BMAPI_READ,           "READ" }, \
@@ -36,7 +35,6 @@ typedef enum {
        { BMAPI_ALLOCATE,       "ALLOCATE" }, \
        { BMAPI_IGNSTATE,       "IGNSTATE" }, \
        { BMAPI_DIRECT,         "DIRECT" }, \
-        { BMAPI_MMAP,           "MMAP" }, \
        { BMAPI_TRYLOCK,        "TRYLOCK" }
 struct xfs_inode;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2b86f8610512..7e3626e5925c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_btree.h"
+#include "xfs_trace.h"
 STATIC int
 xfs_internal_inum(
@@ -143,7 +140,8 @@ xfs_bulkstat_one_int(
                buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
                break;
        }
-        xfs_iput(ip, XFS_ILOCK_SHARED);
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        IRELE(ip);
        error = formatter(buffer, ubsize, ubused, buf);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5215abc8023a..925d572bf0f4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_rw.h"
@@ -337,7 +333,6 @@ xfs_log_reserve(
        int                     retval = 0;
        ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-        ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
        if (XLOG_FORCED_SHUTDOWN(log))
                return XFS_ERROR(EIO);
@@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                                .magic = XLOG_UNMOUNT_TYPE,
                        };
                        struct xfs_log_iovec reg = {
-                                .i_addr = (void *)&magic,
+                                .i_addr = &magic,
                                .i_len = sizeof(magic),
                                .i_type = XLOG_REG_TYPE_UNMOUNT,
                        };
@@ -1047,7 +1042,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
        xlog_in_core_t          *iclog, *prev_iclog=NULL;
        xfs_buf_t               *bp;
        int                     i;
-        int                     iclogsize;
        int                     error = ENOMEM;
        uint                    log2_size = 0;
@@ -1127,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
         * with different amounts of memory.  See the definition of
         * xlog_in_core_t in xfs_log_priv.h for details.
         */
-        iclogsize = log->l_iclog_size;
        ASSERT(log->l_iclog_size >= 4096);
        for (i=0; i < log->l_iclog_bufs; i++) {
                *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1428,11 +1421,8 @@ xlog_sync(xlog_t		*log,
        XFS_BUF_BUSY(bp);
        XFS_BUF_ASYNC(bp);
        bp->b_flags |= XBF_LOG_BUFFER;
-        /*
-         * Do an ordered write for the log block.
+        if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
-         * Its unnecessary to flush the first split block in the log wrap case.
-         */
-        if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
                XFS_BUF_ORDERED(bp);
        ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 04c78e642cc8..916eb7db14d9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -55,14 +55,10 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 /*
 * Flags to xfs_log_reserve()
 *
- *      XFS_LOG_SLEEP:   If space is not available, sleep (default)
- *      XFS_LOG_NOSLEEP: If space is not available, return error
 *      XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
 *              performed against this type of reservation, the reservation
 *              is not decreased.  Long running transactions should use this.
 */
-#define XFS_LOG_SLEEP           0x0
-#define XFS_LOG_NOSLEEP         0x1
 #define XFS_LOG_PERM_RESERV     0x2
 /*
@@ -104,7 +100,7 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_MAX               19
 typedef struct xfs_log_iovec {
-        xfs_caddr_t     i_addr;         /* beginning address of region */
+        void            *i_addr;        /* beginning address of region */
        int             i_len;          /* length in bytes of region */
        uint            i_type;         /* type of region */
 } xfs_log_iovec_t;
@@ -201,9 +197,4 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool    xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 #endif
-extern int xlog_debug;          /* set to 1 to enable real log */
 #endif  /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index bb17cc044bf3..31e4ea2d19ac 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -26,8 +26,6 @@
 #include "xfs_log_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_alloc.h"
@@ -554,7 +552,7 @@ xlog_cil_push(
        thdr.th_type = XFS_TRANS_CHECKPOINT;
        thdr.th_tid = tic->t_tid;
        thdr.th_num_items = num_iovecs;
-        lhdr.i_addr = (xfs_caddr_t)&thdr;
+        lhdr.i_addr = &thdr;
        lhdr.i_len = sizeof(xfs_trans_header_t);
        lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
        tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9ac5cfab27b9..6f3f5fa37acf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -1565,9 +1561,7 @@ xlog_recover_reorder_trans(
        list_splice_init(&trans->r_itemq, &sort_list);
        list_for_each_entry_safe(item, n, &sort_list, ri_list) {
-                xfs_buf_log_format_t    *buf_f;
+                xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
-                buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
                switch (ITEM_TYPE(item)) {
                case XFS_LI_BUF:
@@ -1892,9 +1886,8 @@ xlog_recover_do_inode_buffer(
                 * current di_next_unlinked field.  Extract its value
                 * and copy it to the buffer copy.
                 */
-                logged_nextp = (xfs_agino_t *)
+                logged_nextp = item->ri_buf[item_index].i_addr +
-                               ((char *)(item->ri_buf[item_index].i_addr) +
+                                next_unlinked_offset - reg_buf_offset;
-                                (next_unlinked_offset - reg_buf_offset));
                if (unlikely(*logged_nextp == 0)) {
                        xfs_fs_cmn_err(CE_ALERT, mp,
                                "bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1973,8 +1966,7 @@ xlog_recover_do_reg_buffer(
                                        item->ri_buf[i].i_len, __func__);
                                goto next;
                        }
-                        error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
+                        error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
-                                               item->ri_buf[i].i_addr,
                                               -1, 0, XFS_QMOPT_DOWARN,
                                               "dquot_buf_recover");
                        if (error)
@@ -2187,7 +2179,7 @@ xlog_recover_do_buffer_trans(
        xlog_recover_item_t     *item,
        int                     pass)
 {
-        xfs_buf_log_format_t    *buf_f;
+        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
        xfs_mount_t             *mp;
        xfs_buf_t               *bp;
        int                     error;
@@ -2197,8 +2189,6 @@ xlog_recover_do_buffer_trans(
        ushort                  flags;
        uint                    buf_flags;
-        buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
        if (pass == XLOG_RECOVER_PASS1) {
                /*
                 * In this pass we're only looking for buf items
@@ -2319,10 +2309,9 @@ xlog_recover_do_inode_trans(
        }
        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
-                in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+                in_f = item->ri_buf[0].i_addr;
        } else {
-                in_f = (xfs_inode_log_format_t *)kmem_alloc(
+                in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
-                        sizeof(xfs_inode_log_format_t), KM_SLEEP);
                need_free = 1;
                error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
                if (error)
@@ -2370,7 +2359,7 @@ xlog_recover_do_inode_trans(
                error = EFSCORRUPTED;
                goto error;
        }
-        dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
+        dicp = item->ri_buf[1].i_addr;
        if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2461,7 +2450,7 @@ xlog_recover_do_inode_trans(
        }
        /* The core is in in-core format */
-        xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
+        xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
        /* the rest is in on-disk format */
        if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2578,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
                return (0);
        }
-        qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
+        qoff_f = item->ri_buf[0].i_addr;
        ASSERT(qoff_f);
        /*
@@ -2622,9 +2611,8 @@ xlog_recover_do_dquot_trans(
        if (mp->m_qflags == 0)
                return (0);
-        recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
+        recddq = item->ri_buf[1].i_addr;
+        if (recddq == NULL) {
-        if (item->ri_buf[1].i_addr == NULL) {
                cmn_err(CE_ALERT,
                        "XFS: NULL dquot in %s.", __func__);
                return XFS_ERROR(EIO);
@@ -2654,7 +2642,7 @@ xlog_recover_do_dquot_trans(
         * The other possibility, of course, is that the quota subsystem was
         * removed since the last mount - ENOSYS.
         */
-        dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
+        dq_f = item->ri_buf[0].i_addr;
        ASSERT(dq_f);
        if ((error = xfs_qm_dqcheck(recddq,
                           dq_f->qlf_id,
@@ -2721,7 +2709,7 @@ xlog_recover_do_efi_trans(
                return 0;
        }
-        efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
+        efi_formatp = item->ri_buf[0].i_addr;
        mp = log->l_mp;
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2767,7 +2755,7 @@ xlog_recover_do_efd_trans(
                return;
        }
-        efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+        efd_formatp = item->ri_buf[0].i_addr;
        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
               (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 69f62d8b2816..aeb9d72ebf6e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4eda..622da2179a57 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,65 +66,6 @@ struct xfs_nameops;
 struct xfs_ail;
 struct xfs_quotainfo;
-/*
- * Prototypes and functions for the Data Migration subsystem.
- */
-typedef int     (*xfs_send_data_t)(int, struct xfs_inode *,
-                        xfs_off_t, size_t, int, int *);
-typedef int     (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int     (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
-typedef int     (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
-                        struct xfs_inode *, dm_right_t,
-                        struct xfs_inode *, dm_right_t,
-                        const unsigned char *, const unsigned char *,
-                        mode_t, int, int);
-typedef int     (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
-                        char *, char *);
-typedef void    (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
-                        dm_right_t, mode_t, int, int);
-typedef struct xfs_dmops {
-        xfs_send_data_t         xfs_send_data;
-        xfs_send_mmap_t         xfs_send_mmap;
-        xfs_send_destroy_t      xfs_send_destroy;
-        xfs_send_namesp_t       xfs_send_namesp;
-        xfs_send_mount_t        xfs_send_mount;
-        xfs_send_unmount_t      xfs_send_unmount;
-} xfs_dmops_t;
-#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
-        (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
-#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
-        (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
-#define XFS_SEND_MMAP(mp, vma,fl) \
-        (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
-#define XFS_SEND_DESTROY(mp, ip,right) \
-        (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
-#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
-        (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
-#define XFS_SEND_MOUNT(mp,right,path,name) \
-        (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
-#define XFS_SEND_PREUNMOUNT(mp) \
-do { \
-        if (mp->m_flags & XFS_MOUNT_DMAPI) { \
-                (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
-                        (mp)->m_rootip, DM_RIGHT_NULL, \
-                        (mp)->m_rootip, DM_RIGHT_NULL, \
-                        NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
-        } \
-} while (0)
-#define XFS_SEND_UNMOUNT(mp) \
-do { \
-        if (mp->m_flags & XFS_MOUNT_DMAPI) { \
-                (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
-                        DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
-        } \
-} while (0)
 #ifdef HAVE_PERCPU_SB
 /*
@@ -241,8 +182,6 @@ typedef struct xfs_mount {
        uint                    m_chsize;       /* size of next field */
        struct xfs_chash        *m_chash;       /* fs private inode per-cluster
                                                 * hash table */
-        struct xfs_dmops        *m_dm_ops;      /* vector of DMI ops */
-        struct xfs_qmops        *m_qm_ops;      /* vector of XQM ops */
        atomic_t                m_active_trans; /* number trans frozen */
 #ifdef HAVE_PERCPU_SB
        xfs_icsb_cnts_t __percpu *m_sb_cnts;    /* per-cpu superblock counters */
@@ -259,7 +198,7 @@ typedef struct xfs_mount {
        wait_queue_head_t       m_wait_single_sync_task;
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
-        struct list_head        m_mplist;       /* inode shrinker mount list */
+        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
 } xfs_mount_t;
 /*
@@ -269,7 +208,6 @@ typedef struct xfs_mount {
                                                   must be synchronous except
                                                   for space allocations */
 #define XFS_MOUNT_DELAYLOG      (1ULL << 1)     /* delayed logging is enabled */
-#define XFS_MOUNT_DMAPI         (1ULL << 2)     /* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN     (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN   (1ULL << 4)     /* atomic stop of all filesystem
                                                   operations, typically for
@@ -282,8 +220,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_GRPID         (1ULL << 9)     /* group-ID assigned from directory */
 #define XFS_MOUNT_NORECOVERY    (1ULL << 10)    /* no recovery - dirty fs */
 #define XFS_MOUNT_DFLT_IOSIZE   (1ULL << 12)    /* set default i/o size */
-#define XFS_MOUNT_OSYNCISOSYNC  (1ULL << 13)    /* o_sync is REALLY o_sync */
-                                                /* osyncisdsync is now default*/
 #define XFS_MOUNT_32BITINODES   (1ULL << 14)    /* do not create inodes above
                                                 * 32 bits in size */
 #define XFS_MOUNT_SMALL_INUMS   (1ULL << 15)    /* users wants 32bit inodes */
@@ -440,11 +376,6 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 extern int      xfs_dev_is_read_only(struct xfs_mount *, char *);
-extern int      xfs_dmops_get(struct xfs_mount *);
-extern void     xfs_dmops_put(struct xfs_mount *);
-extern struct xfs_dmops xfs_dmcore_xfs;
 #endif  /* __KERNEL__ */
 extern void     xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b817..8fca957200df 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
        int             spaceres;
        int             num_inodes;
-        xfs_itrace_entry(src_dp);
+        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
-        xfs_itrace_entry(target_dp);
-        if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
-            DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
-                                        src_dp, DM_RIGHT_NULL,
-                                        target_dp, DM_RIGHT_NULL,
-                                        src_name->name, target_name->name,
-                                        0, 0, 0);
-                if (error)
-                        return error;
-        }
-        /* Return through std_return after this point. */
        new_parent = (src_dp != target_dp);
        src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
        /*
         * Join all the inodes to the transaction. From this point on,
         * we can rely on either trans_commit or trans_cancel to unlock
-         * them.  Note that we need to add a vnode reference to the
+         * them.
-         * directories since trans_commit & trans_cancel will decrement
-         * them when they unlock the inodes.  Also, we need to be careful
-         * not to add an inode to the transaction more than once.
         */
-        IHOLD(src_dp);
+        xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+        if (new_parent)
+                xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
-        if (new_parent) {
+        xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
-                IHOLD(target_dp);
+        if (target_ip)
-                xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
-        }
-        IHOLD(src_ip);
-        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
-        if (target_ip) {
-                IHOLD(target_ip);
-                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-        }
        /*
         * If we are using project inheritance, we only allow renames
@@ -369,26 +341,13 @@ xfs_rename(
         * trans_commit will unlock src_ip, target_ip & decrement
         * the vnode references.
         */
-        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+        return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        /* Fall through to std_return with error = 0 or errno from
-         * xfs_trans_commit      */
-std_return:
-        if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
-            DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
-                (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
-                                        src_dp, DM_RIGHT_NULL,
-                                        target_dp, DM_RIGHT_NULL,
-                                        src_name->name, target_name->name,
-                                        0, error, 0);
-        }
-        return error;
 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
-        /* FALLTHROUGH */
 error_return:
        xfs_bmap_cancel(&free_list);
        xfs_trans_cancel(tp, cancel_flags);
-        goto std_return;
+ std_return:
+        return error;
 }
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a2d32ce335aa..891260fea11e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -129,7 +122,7 @@ xfs_growfs_rt_alloc(
                cancelflags |= XFS_TRANS_ABORT;
                error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
                        XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-                        resblks, &map, &nmap, &flist, NULL);
+                        resblks, &map, &nmap, &flist);
                if (!error && nmap < 1)
                        error = XFS_ERROR(ENOSPC);
                if (error)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a4..56861d5daaef 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
 #include "xfs_error.h"
-#include "xfs_buf_item.h"
 #include "xfs_rw.h"
-#include "xfs_trace.h"
 /*
 * Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 28547dfce037..fdca7416c754 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_error.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -47,6 +44,7 @@
 #include "xfs_trace.h"
 kmem_zone_t     *xfs_trans_zone;
+kmem_zone_t     *xfs_log_item_desc_zone;
 /*
@@ -597,8 +595,7 @@ _xfs_trans_alloc(
        tp->t_magic = XFS_TRANS_MAGIC;
        tp->t_type = type;
        tp->t_mountp = mp;
-        tp->t_items_free = XFS_LIC_NUM_SLOTS;
+        INIT_LIST_HEAD(&tp->t_items);
-        xfs_lic_init(&(tp->t_items));
        INIT_LIST_HEAD(&tp->t_busy);
        return tp;
 }
@@ -643,8 +640,7 @@ xfs_trans_dup(
        ntp->t_magic = XFS_TRANS_MAGIC;
        ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
-        ntp->t_items_free = XFS_LIC_NUM_SLOTS;
+        INIT_LIST_HEAD(&ntp->t_items);
-        xfs_lic_init(&(ntp->t_items));
        INIT_LIST_HEAD(&ntp->t_busy);
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1124,6 +1120,108 @@ xfs_trans_unreserve_and_mod_sb(
 }
 /*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+        struct xfs_trans        *tp,
+        struct xfs_log_item     *lip)
+{
+        struct xfs_log_item_desc *lidp;
+        /*
+         * The item must already belong to the same mount and AIL as the
+         * transaction it is being added to.
+         */
+        ASSERT(lip->li_mountp == tp->t_mountp);
+        ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+        lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+        lidp->lid_item = lip;
+        lidp->lid_flags = 0;
+        lidp->lid_size = 0;
+        /* Queue the descriptor at the tail of the transaction's item list. */
+        list_add_tail(&lidp->lid_trans, &tp->t_items);
+        lip->li_desc = lidp;
+}
+/*
+ * Remove the descriptor from the list it is linked on via lid_trans and
+ * return it to xfs_log_item_desc_zone.
+ */
+STATIC void
+xfs_trans_free_item_desc(
+        struct xfs_log_item_desc *lidp)
+{
+        list_del_init(&lidp->lid_trans);
+        kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+/*
+ * Unlink and free the descriptor attached to the given log item, then
+ * clear the item's li_desc pointer so it no longer refers to the freed
+ * descriptor.
+ */
+void
+xfs_trans_del_item(
+        struct xfs_log_item     *lip)
+{
+        xfs_trans_free_item_desc(lip->li_desc);
+        lip->li_desc = NULL;
+}
+/*
+ * Unlock all of the items of a transaction and free all the descriptors
+ * of that transaction.
+ *
+ * If commit_lsn is not NULLCOMMITLSN, IOP_COMMITTING is called on each item
+ * with that LSN.  If XFS_TRANS_ABORT is set in flags, each item is marked
+ * XFS_LI_ABORTED before it is unlocked.
+ */
+STATIC void
+xfs_trans_free_items(
+        struct xfs_trans        *tp,
+        xfs_lsn_t               commit_lsn,
+        int                     flags)
+{
+        struct xfs_log_item_desc *lidp, *next;
+        /* _safe iteration: the current descriptor is freed inside the loop. */
+        list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+                struct xfs_log_item     *lip = lidp->lid_item;
+                lip->li_desc = NULL;
+                if (commit_lsn != NULLCOMMITLSN)
+                        IOP_COMMITTING(lip, commit_lsn);
+                if (flags & XFS_TRANS_ABORT)
+                        lip->li_flags |= XFS_LI_ABORTED;
+                IOP_UNLOCK(lip);
+                xfs_trans_free_item_desc(lidp);
+        }
+}
+/*
+ * Unlock the items associated with a transaction.
+ *
+ * Items which were not logged should be freed.  Those which were logged must
+ * still be tracked so they can be unpinned when the transaction commits.
+ *
+ * If commit_lsn is not NULLCOMMITLSN, IOP_COMMITTING is called on each item
+ * with that LSN before the item is unlocked.
+ */
+STATIC void
+xfs_trans_unlock_items(
+        struct xfs_trans        *tp,
+        xfs_lsn_t               commit_lsn)
+{
+        struct xfs_log_item_desc *lidp, *next;
+        /* _safe iteration: clean descriptors are freed inside the loop. */
+        list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+                struct xfs_log_item     *lip = lidp->lid_item;
+                /*
+                 * The item's descriptor pointer is cleared for every item,
+                 * including dirty ones whose descriptor stays on the list.
+                 */
+                lip->li_desc = NULL;
+                if (commit_lsn != NULLCOMMITLSN)
+                        IOP_COMMITTING(lip, commit_lsn);
+                IOP_UNLOCK(lip);
+                /*
+                 * Free the descriptor if the item is not dirty
+                 * within this transaction.
+                 */
+                if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                        xfs_trans_free_item_desc(lidp);
+        }
+}
+/*
 * Total up the number of log iovecs needed to commit this
 * transaction.  The transaction itself needs one for the
 * transaction header.  Ask each dirty item in turn how many
@@ -1134,30 +1232,27 @@ xfs_trans_count_vecs(
        struct xfs_trans        *tp)
 {
        int                     nvecs;
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_item_desc *lidp;
        nvecs = 1;
-        lidp = xfs_trans_first_item(tp);
-        ASSERT(lidp != NULL);
        /* In the non-debug case we need to start bailing out if we
         * didn't find a log_item here, return zero and let trans_commit
         * deal with it.
         */
-        if (lidp == NULL)
+        if (list_empty(&tp->t_items)) {
+                ASSERT(0);
                return 0;
+        }
-        while (lidp != NULL) {
+        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                /*
                 * Skip items which aren't dirty in this transaction.
                 */
-                if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+                if (!(lidp->lid_flags & XFS_LID_DIRTY))
-                        lidp = xfs_trans_next_item(tp, lidp);
                        continue;
-                }
                lidp->lid_size = IOP_SIZE(lidp->lid_item);
                nvecs += lidp->lid_size;
-                lidp = xfs_trans_next_item(tp, lidp);
        }
        return nvecs;
@@ -1177,7 +1272,7 @@ xfs_trans_fill_vecs(
        struct xfs_trans        *tp,
        struct xfs_log_iovec    *log_vector)
 {
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_item_desc *lidp;
        struct xfs_log_iovec    *vecp;
        uint                    nitems;
@@ -1188,14 +1283,11 @@ xfs_trans_fill_vecs(
        vecp = log_vector + 1;
        nitems = 0;
-        lidp = xfs_trans_first_item(tp);
+        ASSERT(!list_empty(&tp->t_items));
-        ASSERT(lidp);
+        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-        while (lidp) {
                /* Skip items which aren't dirty in this transaction. */
-                if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+                if (!(lidp->lid_flags & XFS_LID_DIRTY))
-                        lidp = xfs_trans_next_item(tp, lidp);
                        continue;
-                }
                /*
                 * The item may be marked dirty but not log anything.  This can
@@ -1206,7 +1298,6 @@ xfs_trans_fill_vecs(
                IOP_FORMAT(lidp->lid_item, vecp);
                vecp += lidp->lid_size;
                IOP_PIN(lidp->lid_item);
-                lidp = xfs_trans_next_item(tp, lidp);
        }
        /*
@@ -1284,7 +1375,7 @@ xfs_trans_item_committed(
         * log item flags, if anyone else stales the buffer we do not want to
         * pay any attention to it.
         */
-        IOP_UNPIN(lip);
+        IOP_UNPIN(lip, 0);
 }
 /*
@@ -1301,24 +1392,15 @@ xfs_trans_committed(
        struct xfs_trans        *tp,
        int                     abortflag)
 {
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_item_desc *lidp, *next;
-        xfs_log_item_chunk_t    *licp;
-        xfs_log_item_chunk_t    *next_licp;
        /* Call the transaction's completion callback if there is one. */
        if (tp->t_callback != NULL)
                tp->t_callback(tp, tp->t_callarg);
-        for (lidp = xfs_trans_first_item(tp);
+        list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
-             lidp != NULL;
-             lidp = xfs_trans_next_item(tp, lidp)) {
                xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
-        }
+                xfs_trans_free_item_desc(lidp);
-        /* free the item chunks, ignoring the embedded chunk */
-        for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
-                next_licp = licp->lic_next;
-                kmem_free(licp);
        }
        xfs_trans_free(tp);
@@ -1333,16 +1415,14 @@ xfs_trans_uncommit(
        struct xfs_trans        *tp,
        uint                    flags)
 {
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_item_desc *lidp;
-        for (lidp = xfs_trans_first_item(tp);
+        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-             lidp != NULL;
-             lidp = xfs_trans_next_item(tp, lidp)) {
                /*
                 * Unpin all but those that aren't dirty.
                 */
                if (lidp->lid_flags & XFS_LID_DIRTY)
-                        IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+                        IOP_UNPIN(lidp->lid_item, 1);
        }
        xfs_trans_unreserve_and_mod_sb(tp);
@@ -1508,33 +1588,28 @@ STATIC struct xfs_log_vec *
 xfs_trans_alloc_log_vecs(
        xfs_trans_t     *tp)
 {
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_item_desc *lidp;
        struct xfs_log_vec      *lv = NULL;
        struct xfs_log_vec      *ret_lv = NULL;
-        lidp = xfs_trans_first_item(tp);
        /* Bail out if we didn't find a log item.  */
-        if (!lidp) {
+        if (list_empty(&tp->t_items)) {
                ASSERT(0);
                return NULL;
        }
-        while (lidp != NULL) {
+        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                struct xfs_log_vec *new_lv;
                /* Skip items which aren't dirty in this transaction. */
-                if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+                if (!(lidp->lid_flags & XFS_LID_DIRTY))
-                        lidp = xfs_trans_next_item(tp, lidp);
                        continue;
-                }
                /* Skip items that do not have any vectors for writing */
                lidp->lid_size = IOP_SIZE(lidp->lid_item);
-                if (!lidp->lid_size) {
+                if (!lidp->lid_size)
-                        lidp = xfs_trans_next_item(tp, lidp);
                        continue;
-                }
                new_lv = kmem_zalloc(sizeof(*new_lv) +
                                lidp->lid_size * sizeof(struct xfs_log_iovec),
@@ -1549,7 +1624,6 @@ xfs_trans_alloc_log_vecs(
                else
                        lv->lv_next = new_lv;
                lv = new_lv;
-                lidp = xfs_trans_next_item(tp, lidp);
        }
        return ret_lv;
@@ -1708,12 +1782,6 @@ xfs_trans_cancel(
        int                     flags)
 {
        int                     log_flags;
-#ifdef DEBUG
-        xfs_log_item_chunk_t    *licp;
-        xfs_log_item_desc_t     *lidp;
-        xfs_log_item_t          *lip;
-        int                     i;
-#endif
        xfs_mount_t             *mp = tp->t_mountp;
        /*
@@ -1732,21 +1800,11 @@ xfs_trans_cancel(
                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
        }
 #ifdef DEBUG
-        if (!(flags & XFS_TRANS_ABORT)) {
+        if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
-                licp = &(tp->t_items);
+                struct xfs_log_item_desc *lidp;
-                while (licp != NULL) {
-                        lidp = licp->lic_descs;
+                list_for_each_entry(lidp, &tp->t_items, lid_trans)
-                        for (i = 0; i < licp->lic_unused; i++, lidp++) {
+                        ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
-                                if (xfs_lic_isfree(licp, i)) {
-                                        continue;
-                                }
-                                lip = lidp->lid_item;
-                                if (!XFS_FORCED_SHUTDOWN(mp))
-                                        ASSERT(!(lip->li_type == XFS_LI_EFD));
-                        }
-                        licp = licp->lic_next;
-                }
        }
 #endif
        xfs_trans_unreserve_and_mod_sb(tp);
@@ -1834,7 +1892,6 @@ xfs_trans_roll(
        if (error)
                return error;
-        xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(trans, dp);
-        xfs_trans_ihold(trans, dp);
        return 0;
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e639e8e9a2a9..c13c0f97b494 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -161,105 +161,14 @@ typedef struct xfs_trans_header {
 * the amount of space needed to log the item it describes
 * once we get to commit processing (see xfs_trans_commit()).
 */
-typedef struct xfs_log_item_desc {
+struct xfs_log_item_desc {
        struct xfs_log_item     *lid_item;
-        ushort          lid_size;
+        ushort                  lid_size;
-        unsigned char   lid_flags;
+        unsigned char           lid_flags;
-        unsigned char   lid_index;
+        struct list_head        lid_trans;
-} xfs_log_item_desc_t;
+};
 #define XFS_LID_DIRTY           0x1
-#define XFS_LID_PINNED          0x2
-/*
- * This structure is used to maintain a chunk list of log_item_desc
- * structures. The free field is a bitmask indicating which descriptors
- * in this chunk's array are free.  The unused field is the first value
- * not used since this chunk was allocated.
- */
-#define XFS_LIC_NUM_SLOTS       15
-typedef struct xfs_log_item_chunk {
-        struct xfs_log_item_chunk       *lic_next;
-        ushort                          lic_free;
-        ushort                          lic_unused;
-        xfs_log_item_desc_t             lic_descs[XFS_LIC_NUM_SLOTS];
-} xfs_log_item_chunk_t;
-#define XFS_LIC_MAX_SLOT        (XFS_LIC_NUM_SLOTS - 1)
-#define XFS_LIC_FREEMASK        ((1 << XFS_LIC_NUM_SLOTS) - 1)
-/*
- * Initialize the given chunk.  Set the chunk's free descriptor mask
- * to indicate that all descriptors are free.  The caller gets to set
- * lic_unused to the right value (0 matches all free).  The
- * lic_descs.lid_index values are set up as each desc is allocated.
- */
-static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
-{
-        cp->lic_free = XFS_LIC_FREEMASK;
-}
-static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
-{
-        cp->lic_descs[slot].lid_index = (unsigned char)(slot);
-}
-static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
-{
-        return cp->lic_free & XFS_LIC_FREEMASK;
-}
-static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
-{
-        cp->lic_free = XFS_LIC_FREEMASK;
-}
-static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
-{
-        return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
-}
-static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
-{
-        return (cp->lic_free & (1 << slot));
-}
-static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
-{
-        cp->lic_free &= ~(1 << slot);
-}
-static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
-{
-        cp->lic_free |= 1 << slot;
-}
-static inline xfs_log_item_desc_t *
-xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
-{
-        return &(cp->lic_descs[slot]);
-}
-static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
-{
-        return (uint)dp->lid_index;
-}
-/*
- * Calculate the address of a chunk given a descriptor pointer:
- * dp - dp->lid_index give the address of the start of the lic_descs array.
- * From this we subtract the offset of the lic_descs field in a chunk.
- * All of this yields the address of the chunk, which is
- * cast to a chunk pointer.
- */
-static inline xfs_log_item_chunk_t *
-xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
-{
-        return (xfs_log_item_chunk_t*) \
-                (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
-                (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
-}
 #define XFS_TRANS_MAGIC         0x5452414E      /* 'TRAN' */
 /*
@@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
 * Values for call flags parameter.
 */
-#define XFS_TRANS_NOSLEEP               0x1
-#define XFS_TRANS_WAIT                  0x2
 #define XFS_TRANS_RELEASE_LOG_RES       0x4
 #define XFS_TRANS_ABORT                 0x8
@@ -438,8 +345,7 @@ typedef struct xfs_item_ops {
        uint (*iop_size)(xfs_log_item_t *);
        void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
        void (*iop_pin)(xfs_log_item_t *);
-        void (*iop_unpin)(xfs_log_item_t *);
+        void (*iop_unpin)(xfs_log_item_t *, int remove);
-        void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
        uint (*iop_trylock)(xfs_log_item_t *);
        void (*iop_unlock)(xfs_log_item_t *);
        xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -451,8 +357,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)            (*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)       (*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)             (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip)           (*(ip)->li_ops->iop_unpin)(ip)
+#define IOP_UNPIN(ip, remove)   (*(ip)->li_ops->iop_unpin)(ip, remove)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
 #define IOP_TRYLOCK(ip)         (*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)          (*(ip)->li_ops->iop_unlock)(ip)
 #define IOP_COMMITTED(ip, lsn)  (*(ip)->li_ops->iop_committed)(ip, lsn)
@@ -516,8 +421,7 @@ typedef struct xfs_trans {
        int64_t                 t_rblocks_delta;/* superblock rblocks change */
        int64_t                 t_rextents_delta;/* superblocks rextents chg */
        int64_t                 t_rextslog_delta;/* superblocks rextslog chg */
-        unsigned int            t_items_free;   /* log item descs free */
+        struct list_head        t_items;        /* log item descriptors */
-        xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
        xfs_trans_header_t      t_header;       /* header for in-log trans */
        struct list_head        t_busy;         /* list of busy extents */
        unsigned long           t_pflags;       /* saved process flags state */
@@ -569,8 +473,8 @@ void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void            xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 int             xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
                               xfs_ino_t , uint, uint, struct xfs_inode **);
-void            xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
+void            xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
-void            xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
+void            xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
 void            xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void            xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -595,6 +499,7 @@ int		xfs_trans_ail_init(struct xfs_mount *);
 void            xfs_trans_ail_destroy(struct xfs_mount *);
 extern kmem_zone_t      *xfs_trans_zone;
+extern kmem_zone_t      *xfs_log_item_desc_zone;
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f7245..dc9069568ff7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 63d81a22f4fd..90af025e6839 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_buf_item.h"
@@ -51,36 +47,17 @@ xfs_trans_buf_item_match(
        xfs_daddr_t             blkno,
        int                     len)
 {
-        xfs_log_item_chunk_t    *licp;
+        struct xfs_log_item_desc *lidp;
-        xfs_log_item_desc_t     *lidp;
+        struct xfs_buf_log_item *blip;
-        xfs_buf_log_item_t      *blip;
-        int                     i;
        len = BBTOB(len);
-        for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
+        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-                if (xfs_lic_are_all_free(licp)) {
+                blip = (struct xfs_buf_log_item *)lidp->lid_item;
-                        ASSERT(licp == &tp->t_items);
+                if (blip->bli_item.li_type == XFS_LI_BUF &&
-                        ASSERT(licp->lic_next == NULL);
+                    XFS_BUF_TARGET(blip->bli_buf) == target &&
-                        return NULL;
+                    XFS_BUF_ADDR(blip->bli_buf) == blkno &&
-                }
+                    XFS_BUF_COUNT(blip->bli_buf) == len)
+                        return blip->bli_buf;
-                for (i = 0; i < licp->lic_unused; i++) {
-                        /*
-                         * Skip unoccupied slots.
-                         */
-                        if (xfs_lic_isfree(licp, i))
-                                continue;
-                        lidp = xfs_lic_slot(licp, i);
-                        blip = (xfs_buf_log_item_t *)lidp->lid_item;
-                        if (blip->bli_item.li_type != XFS_LI_BUF)
-                                continue;
-                        if (XFS_BUF_TARGET(blip->bli_buf) == target &&
-                            XFS_BUF_ADDR(blip->bli_buf) == blkno &&
-                            XFS_BUF_COUNT(blip->bli_buf) == len)
-                                return blip->bli_buf;
-                }
        }
        return NULL;
@@ -127,7 +104,7 @@ _xfs_trans_bjoin(
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+        xfs_trans_add_item(tp, &bip->bli_item);
        /*
         * Initialize b_fsprivate2 so we can find it with incore_match()
@@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t	*tp,
 {
        xfs_buf_log_item_t      *bip;
        xfs_log_item_t          *lip;
-        xfs_log_item_desc_t     *lidp;
        /*
         * Default to a normal brelse() call if the tp is NULL.
@@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t	*tp,
        ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
-        /*
-         * Find the item descriptor pointing to this buffer's
-         * log item.  It must be there.
-         */
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-        ASSERT(lidp != NULL);
        trace_xfs_trans_brelse(bip);
        /*
@@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
         * If the buffer is dirty within this transaction, we can't
         * release it until we commit.
         */
-        if (lidp->lid_flags & XFS_LID_DIRTY)
+        if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
                return;
        /*
@@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
        /*
         * Free up the log item descriptor tracking the released item.
         */
-        xfs_trans_free_item(tp, lidp);
+        xfs_trans_del_item(&bip->bli_item);
        /*
         * Clear the hold flag in the buf log item if it is set.
@@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
                  uint          last)
 {
        xfs_buf_log_item_t      *bip;
-        xfs_log_item_desc_t     *lidp;
        ASSERT(XFS_BUF_ISBUSY(bp));
        ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
-        bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
+        bip->bli_item.li_cb = xfs_buf_iodone;
        trace_xfs_trans_log_buf(bip);
@@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
                bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
        }
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
        bip->bli_flags |= XFS_BLI_LOGGED;
        xfs_buf_item_log(bip, first, last);
 }
@@ -740,7 +705,6 @@ xfs_trans_binval(
        xfs_trans_t     *tp,
        xfs_buf_t       *bp)
 {
-        xfs_log_item_desc_t     *lidp;
        xfs_buf_log_item_t      *bip;
        ASSERT(XFS_BUF_ISBUSY(bp));
@@ -748,8 +712,6 @@ xfs_trans_binval(
        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
-        ASSERT(lidp != NULL);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        trace_xfs_trans_binval(bip);
@@ -764,7 +726,7 @@ xfs_trans_binval(
                ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
                ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
-                ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
+                ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
                ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
                return;
        }
@@ -797,7 +759,7 @@ xfs_trans_binval(
        bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
        memset((char *)(bip->bli_format.blf_data_map), 0,
              (bip->bli_format.blf_map_size * sizeof(uint)));
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
        tp->t_flags |= XFS_TRANS_DIRTY;
 }
@@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf(
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        bip->bli_flags |= XFS_BLI_STALE_INODE;
-        bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))
+        bip->bli_item.li_cb = xfs_buf_iodone;
-                xfs_buf_iodone;
 }
 /*
 * Mark the buffer as being one which contains newly allocated
 * inodes.  We need to make sure that even if this buffer is
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e9..f783d5e9fa70 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t	*tp,
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
+        xfs_trans_add_item(tp, &efip->efi_item);
+        return efip;
-        return (efip);
 }
 /*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t		*tp,
                         xfs_fsblock_t          start_block,
                         xfs_extlen_t           ext_len)
 {
-        xfs_log_item_desc_t     *lidp;
        uint                    next_extent;
        xfs_extent_t            *extp;
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
        next_extent = efip->efi_next_extent;
        ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t		*tp,
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
+        xfs_trans_add_item(tp, &efdp->efd_item);
+        return efdp;
-        return (efdp);
 }
 /*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t		*tp,
                         xfs_fsblock_t          start_block,
                         xfs_extlen_t           ext_len)
 {
-        xfs_log_item_desc_t     *lidp;
        uint                    next_extent;
        xfs_extent_t            *extp;
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
        next_extent = efdp->efd_next_extent;
        ASSERT(next_extent < efdp->efd_format.efd_nextents);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2559dfec946b..cdc53a1050c5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 #ifdef XFS_TRANS_DEBUG
 STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
 #define xfs_trans_inode_broot_debug(ip)
 #endif
 /*
 * Get an inode and join it to the transaction.
 */
@@ -63,77 +58,65 @@ xfs_trans_iget(
        int                     error;
        error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
-        if (!error && tp)
+        if (!error && tp) {
-                xfs_trans_ijoin(tp, *ipp, lock_flags);
+                xfs_trans_ijoin(tp, *ipp);
+                (*ipp)->i_itemp->ili_lock_flags = lock_flags;
+        }
        return error;
 }
 /*
- * Add the locked inode to the transaction.
+ * Add a locked inode to the transaction.
- * The inode must be locked, and it cannot be associated with any
+ *
- * transaction.  The caller must specify the locks already held
+ * The inode must be locked, and it cannot be associated with any transaction.
- * on the inode.
 */
 void
 xfs_trans_ijoin(
-        xfs_trans_t     *tp,
+        struct xfs_trans        *tp,
-        xfs_inode_t     *ip,
+        struct xfs_inode        *ip)
-        uint            lock_flags)
 {
        xfs_inode_log_item_t    *iip;
        ASSERT(ip->i_transp == NULL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        ASSERT(lock_flags & XFS_ILOCK_EXCL);
        if (ip->i_itemp == NULL)
                xfs_inode_item_init(ip, ip->i_mount);
        iip = ip->i_itemp;
-        ASSERT(iip->ili_flags == 0);
+        ASSERT(iip->ili_lock_flags == 0);
        /*
         * Get a log_item_desc to point at the new item.
         */
-        (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
+        xfs_trans_add_item(tp, &iip->ili_item);
        xfs_trans_inode_broot_debug(ip);
        /*
-         * If the IO lock is already held, mark that in the inode log item.
-         */
-        if (lock_flags & XFS_IOLOCK_EXCL) {
-                iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
-        } else if (lock_flags & XFS_IOLOCK_SHARED) {
-                iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
-        }
-        /*
         * Initialize i_transp so we can find it with xfs_inode_incore()
         * in xfs_trans_iget() above.
         */
        ip->i_transp = tp;
 }
 /*
- * Mark the inode as not needing to be unlocked when the inode item's
+ * Add a locked inode to the transaction.
- * IOP_UNLOCK() routine is called.  The inode must already be locked
+ *
- * and associated with the given transaction.
+ *
+ * Grabs a reference to the inode which will be dropped when the transaction
+ * is committed.  The inode will also be unlocked at that point.  The inode
+ * must be locked, and it cannot be associated with any transaction.
 */
-/*ARGSUSED*/
 void
-xfs_trans_ihold(
+xfs_trans_ijoin_ref(
-        xfs_trans_t     *tp,
+        struct xfs_trans        *tp,
-        xfs_inode_t     *ip)
+        struct xfs_inode        *ip,
+        uint                    lock_flags)
 {
-        ASSERT(ip->i_transp == tp);
+        xfs_trans_ijoin(tp, ip);
-        ASSERT(ip->i_itemp != NULL);
+        IHOLD(ip);
-        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+        ip->i_itemp->ili_lock_flags = lock_flags;
-        ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
 }
 /*
 * This is called to mark the fields indicated in fieldmask as needing
 * to be logged when the transaction is committed.  The inode must
@@ -149,17 +132,12 @@ xfs_trans_log_inode(
        xfs_inode_t     *ip,
        uint            flags)
 {
-        xfs_log_item_desc_t     *lidp;
        ASSERT(ip->i_transp == tp);
        ASSERT(ip->i_itemp != NULL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
-        ASSERT(lidp != NULL);
        tp->t_flags |= XFS_TRANS_DIRTY;
-        lidp->lid_flags |= XFS_LID_DIRTY;
+        ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
        /*
         * Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index f11d37d06dcc..000000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has its own ailp */
-#include "xfs_bit.h"
-#include "xfs_buf_item.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-STATIC int      xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
-                                        int, int, xfs_lsn_t);
-/*
- * This is called to add the given log item to the transaction's
- * list of log items.  It must find a free log item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to item descriptor used to point
- * to the new item.  The log item will now point to its new descriptor
- * with its li_desc field.
- */
-xfs_log_item_desc_t *
-xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
-{
-        xfs_log_item_desc_t     *lidp;
-        xfs_log_item_chunk_t    *licp;
-        int                     i=0;
-        /*
-         * If there are no free descriptors, allocate a new chunk
-         * of them and put it at the front of the chunk list.
-         */
-        if (tp->t_items_free == 0) {
-                licp = (xfs_log_item_chunk_t*)
-                       kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
-                ASSERT(licp != NULL);
-                /*
-                 * Initialize the chunk, and then
-                 * claim the first slot in the newly allocated chunk.
-                 */
-                xfs_lic_init(licp);
-                xfs_lic_claim(licp, 0);
-                licp->lic_unused = 1;
-                xfs_lic_init_slot(licp, 0);
-                lidp = xfs_lic_slot(licp, 0);
-                /*
-                 * Link in the new chunk and update the free count.
-                 */
-                licp->lic_next = tp->t_items.lic_next;
-                tp->t_items.lic_next = licp;
-                tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
-                /*
-                 * Initialize the descriptor and the generic portion
-                 * of the log item.
-                 *
-                 * Point the new slot at this item and return it.
-                 * Also point the log item at its currently active
-                 * descriptor and set the item's mount pointer.
-                 */
-                lidp->lid_item = lip;
-                lidp->lid_flags = 0;
-                lidp->lid_size = 0;
-                lip->li_desc = lidp;
-                lip->li_mountp = tp->t_mountp;
-                lip->li_ailp = tp->t_mountp->m_ail;
-                return lidp;
-        }
-        /*
-         * Find the free descriptor. It is somewhere in the chunklist
-         * of descriptors.
-         */
-        licp = &tp->t_items;
-        while (licp != NULL) {
-                if (xfs_lic_vacancy(licp)) {
-                        if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
-                                i = licp->lic_unused;
-                                ASSERT(xfs_lic_isfree(licp, i));
-                                break;
-                        }
-                        for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
-                                if (xfs_lic_isfree(licp, i))
-                                        break;
-                        }
-                        ASSERT(i <= XFS_LIC_MAX_SLOT);
-                        break;
-                }
-                licp = licp->lic_next;
-        }
-        ASSERT(licp != NULL);
-        /*
-         * If we find a free descriptor, claim it,
-         * initialize it, and return it.
-         */
-        xfs_lic_claim(licp, i);
-        if (licp->lic_unused <= i) {
-                licp->lic_unused = i + 1;
-                xfs_lic_init_slot(licp, i);
-        }
-        lidp = xfs_lic_slot(licp, i);
-        tp->t_items_free--;
-        lidp->lid_item = lip;
-        lidp->lid_flags = 0;
-        lidp->lid_size = 0;
-        lip->li_desc = lidp;
-        lip->li_mountp = tp->t_mountp;
-        lip->li_ailp = tp->t_mountp->m_ail;
-        return lidp;
-}
-/*
- * Free the given descriptor.
- *
- * This requires setting the bit in the chunk's free mask corresponding
- * to the given slot.
- */
-void
-xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
-{
-        uint                    slot;
-        xfs_log_item_chunk_t    *licp;
-        xfs_log_item_chunk_t    **licpp;
-        slot = xfs_lic_desc_to_slot(lidp);
-        licp = xfs_lic_desc_to_chunk(lidp);
-        xfs_lic_relse(licp, slot);
-        lidp->lid_item->li_desc = NULL;
-        tp->t_items_free++;
-        /*
-         * If there are no more used items in the chunk and this is not
-         * the chunk embedded in the transaction structure, then free
-         * the chunk. First pull it from the chunk list and then
-         * free it back to the heap.  We didn't bother with a doubly
-         * linked list here because the lists should be very short
-         * and this is not a performance path.  It's better to save
-         * the memory of the extra pointer.
-         *
-         * Also decrement the transaction structure's count of free items
-         * by the number in a chunk since we are freeing an empty chunk.
-         */
-        if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
-                licpp = &(tp->t_items.lic_next);
-                while (*licpp != licp) {
-                        ASSERT(*licpp != NULL);
-                        licpp = &((*licpp)->lic_next);
-                }
-                *licpp = licp->lic_next;
-                kmem_free(licp);
-                tp->t_items_free -= XFS_LIC_NUM_SLOTS;
-        }
-}
-/*
- * This is called to find the descriptor corresponding to the given
- * log item.  It returns a pointer to the descriptor.
- * The log item MUST have a corresponding descriptor in the given
- * transaction.  This routine does not return NULL, it panics.
- *
- * The descriptor pointer is kept in the log item's li_desc field.
- * Just return it.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
-{
-        ASSERT(lip->li_desc != NULL);
-        return lip->li_desc;
-}
-/*
- * Return a pointer to the first descriptor in the chunk list.
- * This does not return NULL if there are none, it panics.
- *
- * The first descriptor must be in either the first or second chunk.
- * This is because the only chunk allowed to be empty is the first.
- * All others are freed when they become empty.
- *
- * At some point this and xfs_trans_next_item() should be optimized
- * to quickly look at the mask to determine if there is anything to
- * look at.
- */
-xfs_log_item_desc_t *
-xfs_trans_first_item(xfs_trans_t *tp)
-{
-        xfs_log_item_chunk_t    *licp;
-        int                     i;
-        licp = &tp->t_items;
-        /*
-         * If it's not in the first chunk, skip to the second.
-         */
-        if (xfs_lic_are_all_free(licp)) {
-                licp = licp->lic_next;
-        }
-        /*
-         * Return the first non-free descriptor in the chunk.
-         */
-        ASSERT(!xfs_lic_are_all_free(licp));
-        for (i = 0; i < licp->lic_unused; i++) {
-                if (xfs_lic_isfree(licp, i)) {
-                        continue;
-                }
-                return xfs_lic_slot(licp, i);
-        }
-        cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
-        return NULL;
-}
-/*
- * Given a descriptor, return the next descriptor in the chunk list.
- * This returns NULL if there are no more used descriptors in the list.
- *
- * We do this by first locating the chunk in which the descriptor resides,
- * and then scanning forward in the chunk and the list for the next
- * used descriptor.
- */
-/*ARGSUSED*/
-xfs_log_item_desc_t *
-xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
-{
-        xfs_log_item_chunk_t    *licp;
-        int                     i;
-        licp = xfs_lic_desc_to_chunk(lidp);
-        /*
-         * First search the rest of the chunk. The for loop keeps us
-         * from referencing things beyond the end of the chunk.
-         */
-        for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
-                if (xfs_lic_isfree(licp, i)) {
-                        continue;
-                }
-                return xfs_lic_slot(licp, i);
-        }
-        /*
-         * Now search the next chunk.  It must be there, because the
-         * next chunk would have been freed if it were empty.
-         * If there is no next chunk, return NULL.
-         */
-        if (licp->lic_next == NULL) {
-                return NULL;
-        }
-        licp = licp->lic_next;
-        ASSERT(!xfs_lic_are_all_free(licp));
-        for (i = 0; i < licp->lic_unused; i++) {
-                if (xfs_lic_isfree(licp, i)) {
-                        continue;
-                }
-                return xfs_lic_slot(licp, i);
-        }
-        ASSERT(0);
-        /* NOTREACHED */
-        return NULL; /* keep gcc quite */
-}
-/*
- * This is called to unlock all of the items of a transaction and to free
- * all the descriptors of that transaction.
- *
- * It walks the list of descriptors and unlocks each item.  It frees
- * each chunk except that embedded in the transaction as it goes along.
- */
-void
-xfs_trans_free_items(
-        xfs_trans_t     *tp,
-        xfs_lsn_t       commit_lsn,
-        int             flags)
-{
-        xfs_log_item_chunk_t    *licp;
-        xfs_log_item_chunk_t    *next_licp;
-        int                     abort;
-        abort = flags & XFS_TRANS_ABORT;
-        licp = &tp->t_items;
-        /*
-         * Special case the embedded chunk so we don't free it below.
-         */
-        if (!xfs_lic_are_all_free(licp)) {
-                (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
-                xfs_lic_all_free(licp);
-                licp->lic_unused = 0;
-        }
-        licp = licp->lic_next;
-        /*
-         * Unlock each item in each chunk and free the chunks.
-         */
-        while (licp != NULL) {
-                ASSERT(!xfs_lic_are_all_free(licp));
-                (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
-                next_licp = licp->lic_next;
-                kmem_free(licp);
-                licp = next_licp;
-        }
-        /*
-         * Reset the transaction structure's free item count.
-         */
-        tp->t_items_free = XFS_LIC_NUM_SLOTS;
-        tp->t_items.lic_next = NULL;
-}
-/*
- * This is called to unlock the items associated with a transaction.
- * Items which were not logged should be freed.
- * Those which were logged must still be tracked so they can be unpinned
- * when the transaction commits.
- */
-void
-xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
-{
-        xfs_log_item_chunk_t    *licp;
-        xfs_log_item_chunk_t    *next_licp;
-        xfs_log_item_chunk_t    **licpp;
-        int                     freed;
-        freed = 0;
-        licp = &tp->t_items;
-        /*
-         * Special case the embedded chunk so we don't free.
-         */
-        if (!xfs_lic_are_all_free(licp)) {
-                freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
-        }
-        licpp = &(tp->t_items.lic_next);
-        licp = licp->lic_next;
-        /*
-         * Unlock each item in each chunk, free non-dirty descriptors,
-         * and free empty chunks.
-         */
-        while (licp != NULL) {
-                ASSERT(!xfs_lic_are_all_free(licp));
-                freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
-                next_licp = licp->lic_next;
-                if (xfs_lic_are_all_free(licp)) {
-                        *licpp = next_licp;
-                        kmem_free(licp);
-                        freed -= XFS_LIC_NUM_SLOTS;
-                } else {
-                        licpp = &(licp->lic_next);
-                }
-                ASSERT(*licpp == next_licp);
-                licp = next_licp;
-        }
-        /*
-         * Fix the free descriptor count in the transaction.
-         */
-        tp->t_items_free += freed;
-}
-/*
- * Unlock each item pointed to by a descriptor in the given chunk.
- * Stamp the commit lsn into each item if necessary.
- * Free descriptors pointing to items which are not dirty if freeing_chunk
- * is zero. If freeing_chunk is non-zero, then we need to unlock all
- * items in the chunk.
- * 
- * Return the number of descriptors freed.
- */
-STATIC int
-xfs_trans_unlock_chunk(
-        xfs_log_item_chunk_t    *licp,
-        int                     freeing_chunk,
-        int                     abort,
-        xfs_lsn_t               commit_lsn)
-{
-        xfs_log_item_desc_t     *lidp;
-        xfs_log_item_t          *lip;
-        int                     i;
-        int                     freed;
-        freed = 0;
-        lidp = licp->lic_descs;
-        for (i = 0; i < licp->lic_unused; i++, lidp++) {
-                if (xfs_lic_isfree(licp, i)) {
-                        continue;
-                }
-                lip = lidp->lid_item;
-                lip->li_desc = NULL;
-                if (commit_lsn != NULLCOMMITLSN)
-                        IOP_COMMITTING(lip, commit_lsn);
-                if (abort)
-                        lip->li_flags |= XFS_LI_ABORTED;
-                IOP_UNLOCK(lip);
-                /*
-                 * Free the descriptor if the item is not dirty
-                 * within this transaction and the caller is not
-                 * going to just free the entire thing regardless.
-                 */
-                if (!(freeing_chunk) &&
-                    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
-                        xfs_lic_relse(licp, i);
-                        freed++;
-                }
-        }
-        return freed;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c6e4f2c8de6e..e2d93d8ead7b 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,22 +23,8 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
-/*
+void    xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
- * From xfs_trans_item.c
+void    xfs_trans_del_item(struct xfs_log_item *);
- */
-struct xfs_log_item_desc        *xfs_trans_add_item(struct xfs_trans *,
-                                            struct xfs_log_item *);
-void                            xfs_trans_free_item(struct xfs_trans *,
-                                            struct xfs_log_item_desc *);
-struct xfs_log_item_desc        *xfs_trans_find_item(struct xfs_trans *,
-                                             struct xfs_log_item *);
-struct xfs_log_item_desc        *xfs_trans_first_item(struct xfs_trans *);
-struct xfs_log_item_desc        *xfs_trans_next_item(struct xfs_trans *,
-                                             struct xfs_log_item_desc *);
-void    xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
-void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-                                int flags);
 void    xfs_trans_item_committed(struct xfs_log_item *lip,
                                xfs_lsn_t commit_lsn, int aborted);
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde91..b7d5769d2df0 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_rw.h"
 #include "xfs_itable.h"
 #include "xfs_utils.h"
@@ -324,86 +320,3 @@ xfs_bumplink(
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        return 0;
 }
-/*
- * Try to truncate the given file to 0 length.  Currently called
- * only out of xfs_remove when it has to truncate a file to free
- * up space for the remove to proceed.
- */
-int
-xfs_truncate_file(
-        xfs_mount_t     *mp,
-        xfs_inode_t     *ip)
-{
-        xfs_trans_t     *tp;
-        int             error;
-#ifdef QUOTADEBUG
-        /*
-         * This is called to truncate the quotainodes too.
-         */
-        if (XFS_IS_UQUOTA_ON(mp)) {
-                if (ip->i_ino != mp->m_sb.sb_uquotino)
-                        ASSERT(ip->i_udquot);
-        }
-        if (XFS_IS_OQUOTA_ON(mp)) {
-                if (ip->i_ino != mp->m_sb.sb_gquotino)
-                        ASSERT(ip->i_gdquot);
-        }
-#endif
-        /*
-         * Make the call to xfs_itruncate_start before starting the
-         * transaction, because we cannot make the call while we're
-         * in a transaction.
-         */
-        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-        error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
-        if (error) {
-                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                return error;
-        }
-        tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                      XFS_TRANS_PERM_LOG_RES,
-                                      XFS_ITRUNCATE_LOG_COUNT))) {
-                xfs_trans_cancel(tp, 0);
-                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                return error;
-        }
-        /*
-         * Follow the normal truncate locking protocol.  Since we
-         * hold the inode in the transaction, we know that its number
-         * of references will stay constant.
-         */
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        xfs_trans_ihold(tp, ip);
-        /*
-         * Signal a sync xaction.  The only case where that isn't
-         * the case is if we're truncating an already unlinked file
-         * on a wsync fs.  In that case, we know the blocks can't
-         * reappear in the file because the links to file are
-         * permanently toast.  Currently, we're always going to
-         * want a sync transaction because this code is being
-         * called from places where nlink is guaranteed to be 1
-         * but I'm leaving the tests in to protect against future
-         * changes -- rcc.
-         */
-        error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
-                                     XFS_DATA_FORK,
-                                     ((ip->i_d.di_nlink != 0 ||
-                                       !(mp->m_flags & XFS_MOUNT_WSYNC))
-                                      ? 1 : 0));
-        if (error) {
-                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                 XFS_TRANS_ABORT);
-        } else {
-                xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        return error;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d269..f55b9678264f 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_UTILS_H__
 #define __XFS_UTILS_H__
-extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
                                xfs_dev_t, cred_t *, prid_t, int,
                                xfs_inode_t **, int *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c1646838898f..3ac137dd531b 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_itable.h"
-#include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
        struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
        int                     need_iolock = 1;
-        xfs_itrace_entry(ip);
+        trace_xfs_setattr(ip);
        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return XFS_ERROR(EROFS);
@@ -143,16 +138,6 @@ xfs_setattr(
                        goto error_return;
                }
        } else {
-                if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
-                    !(flags & XFS_ATTR_DMI)) {
-                        int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
-                        code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
-                                iattr->ia_size, 0, dmflags, NULL);
-                        if (code) {
-                                lock_flags = 0;
-                                goto error_return;
-                        }
-                }
                if (need_iolock)
                        lock_flags |= XFS_IOLOCK_EXCL;
        }
@@ -283,8 +268,7 @@ xfs_setattr(
                commit_flags = XFS_TRANS_RELEASE_LOG_RES;
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                xfs_trans_ijoin(tp, ip, lock_flags);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                /*
                 * Only change the c/mtime if we are changing the size
@@ -334,8 +318,7 @@ xfs_setattr(
                        xfs_iflags_set(ip, XFS_ITRUNCATED);
                }
        } else if (tp) {
-                xfs_trans_ijoin(tp, ip, lock_flags);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
        }
        /*
@@ -470,17 +453,10 @@ xfs_setattr(
                        return XFS_ERROR(code);
        }
-        if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-            !(flags & XFS_ATTR_DMI)) {
-                (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-                                        NULL, DM_RIGHT_NULL, NULL, NULL,
-                                        0, 0, AT_DELAY_FLAG(flags));
-        }
        return 0;
 abort_return:
        commit_flags |= XFS_TRANS_ABORT;
-        /* FALLTHROUGH */
 error_return:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
@@ -516,7 +492,7 @@ xfs_readlink_bmap(
        int             error = 0;
        error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-                        mval, &nmaps, NULL, NULL);
+                        mval, &nmaps, NULL);
        if (error)
                goto out;
@@ -557,7 +533,7 @@ xfs_readlink(
        int             pathlen;
        int             error = 0;
-        xfs_itrace_entry(ip);
+        trace_xfs_readlink(ip);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
@@ -613,14 +589,14 @@ xfs_free_eofblocks(
         */
        end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
        last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-        map_len = last_fsb - end_fsb;
+        if (last_fsb <= end_fsb)
-        if (map_len <= 0)
                return 0;
+        map_len = last_fsb - end_fsb;
        nimaps = 1;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-                          NULL, 0, &imap, &nimaps, NULL, NULL);
+                          NULL, 0, &imap, &nimaps, NULL);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        if (!error && (nimaps != 0) &&
@@ -675,10 +651,7 @@ xfs_free_eofblocks(
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                xfs_trans_ijoin(tp, ip,
+                xfs_trans_ijoin(tp, ip);
-                                XFS_IOLOCK_EXCL |
-                                XFS_ILOCK_EXCL);
-                xfs_trans_ihold(tp, ip);
                error = xfs_itruncate_finish(&tp, ip,
                                             ip->i_size,
@@ -750,8 +723,7 @@ xfs_inactive_symlink_rmt(
        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        size = (int)ip->i_d.di_size;
        ip->i_d.di_size = 0;
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Find the block(s) so we can inval and unmap them.
@@ -761,7 +733,7 @@ xfs_inactive_symlink_rmt(
        nmaps = ARRAY_SIZE(mval);
        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-                        &free_list, NULL)))
+                        &free_list)))
                goto error0;
        /*
         * Invalidate the block(s).
@@ -776,7 +748,7 @@ xfs_inactive_symlink_rmt(
         * Unmap the dead block(s) to the free_list.
         */
        if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-                        &first_block, &free_list, NULL, &done)))
+                        &first_block, &free_list, &done)))
                goto error1;
        ASSERT(done);
        /*
@@ -795,8 +767,7 @@ xfs_inactive_symlink_rmt(
         * Mark it dirty so it will be logged and moved forward in the log as
         * part of every commit.
         */
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Get a new, empty transaction to return to our caller.
@@ -929,8 +900,7 @@ xfs_inactive_attrs(
                goto error_cancel;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        xfs_idestroy_fork(ip, XFS_ATTR_FORK);
        ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1005,6 @@ xfs_inactive(
        int             error;
        int             truncate;
-        xfs_itrace_entry(ip);
        /*
         * If the inode is already free, then there can be nothing
         * to clean up here.
@@ -1060,9 +1028,6 @@ xfs_inactive(
        mp = ip->i_mount;
-        if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-                XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
        error = 0;
        /* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1085,7 @@ xfs_inactive(
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                /*
                 * normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1118,7 @@ xfs_inactive(
                        return VN_INACTIVE_CACHE;
                }
-                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
        } else {
                error = xfs_trans_reserve(tp, 0,
                                          XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1131,7 @@ xfs_inactive(
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-                xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
        }
        /*
@@ -1257,7 +1219,7 @@ xfs_lookup(
        int                     error;
        uint                    lock_mode;
-        xfs_itrace_entry(dp);
+        trace_xfs_lookup(dp, name);
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return XFS_ERROR(EIO);
@@ -1309,21 +1271,11 @@ xfs_create(
        uint                    log_res;
        uint                    log_count;
-        xfs_itrace_entry(dp);
+        trace_xfs_create(dp, name);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-                                dp, DM_RIGHT_NULL, NULL,
-                                DM_RIGHT_NULL, name->name, NULL,
-                                mode, 0, 0);
-                if (error)
-                        return error;
-        }
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else
@@ -1427,8 +1379,7 @@ xfs_create(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-        IHOLD(dp);
+        xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1487,16 +1438,7 @@ xfs_create(
        xfs_qm_dqrele(gdqp);
        *ipp = ip;
+        return 0;
-        /* Fallthrough to std_return with error = 0  */
- std_return:
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-                XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
-                                ip, DM_RIGHT_NULL, name->name, NULL, mode,
-                                error, 0);
-        }
-        return error;
 out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
@@ -1510,8 +1452,8 @@ xfs_create(
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
-        goto std_return;
+        return error;
 out_abort_rele:
        /*
@@ -1726,20 +1668,11 @@ xfs_remove(
        uint                    resblks;
        uint                    log_count;
-        xfs_itrace_entry(dp);
+        trace_xfs_remove(dp, name);
-        xfs_itrace_entry(ip);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-                                        NULL, DM_RIGHT_NULL, name->name, NULL,
-                                        ip->i_d.di_mode, 0, 0);
-                if (error)
-                        return error;
-        }
        error = xfs_qm_dqattach(dp, 0);
        if (error)
                goto std_return;
@@ -1782,15 +1715,8 @@ xfs_remove(
        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-        /*
+        xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-         * At this point, we've gotten both the directory and the entry
+        xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
-         * inodes locked.
-         */
-        IHOLD(ip);
-        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-        IHOLD(dp);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        /*
         * If we're removing a directory perform some additional validation.
@@ -1877,21 +1803,15 @@ xfs_remove(
        if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
                xfs_filestream_deassociate(ip);
- std_return:
+        return 0;
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-                XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
-                                NULL, DM_RIGHT_NULL, name->name, NULL,
-                                ip->i_d.di_mode, error, 0);
-        }
-        return error;
 out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
        cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
        xfs_trans_cancel(tp, cancel_flags);
-        goto std_return;
+ std_return:
+        return error;
 }
 int
@@ -1909,25 +1829,13 @@ xfs_link(
        int                     committed;
        int                     resblks;
-        xfs_itrace_entry(tdp);
+        trace_xfs_link(tdp, target_name);
-        xfs_itrace_entry(sip);
        ASSERT(!S_ISDIR(sip->i_d.di_mode));
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-                                        tdp, DM_RIGHT_NULL,
-                                        sip, DM_RIGHT_NULL,
-                                        target_name->name, NULL, 0, 0, 0);
-                if (error)
-                        return error;
-        }
-        /* Return through std_return after this point. */
        error = xfs_qm_dqattach(sip, 0);
        if (error)
                goto std_return;
@@ -1953,15 +1861,8 @@ xfs_link(
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-        /*
+        xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
-         * Increment vnode ref counts since xfs_trans_commit &
+        xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
-         * xfs_trans_cancel will both unlock the inodes and
-         * decrement the associated ref counts.
-         */
-        IHOLD(sip);
-        IHOLD(tdp);
-        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
        /*
         * If the source has too many links, we can't make any more to it.
@@ -2014,27 +1915,14 @@ xfs_link(
                goto abort_return;
        }
-        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+        return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        if (error)
-                goto std_return;
-        /* Fall through to std_return with error = 0. */
-std_return:
-        if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
-                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-                                tdp, DM_RIGHT_NULL,
-                                sip, DM_RIGHT_NULL,
-                                target_name->name, NULL, 0, error, 0);
-        }
-        return error;
 abort_return:
        cancel_flags |= XFS_TRANS_ABORT;
-        /* FALLTHROUGH */
 error_return:
        xfs_trans_cancel(tp, cancel_flags);
-        goto std_return;
+ std_return:
+        return error;
 }
 int
@@ -2074,7 +1962,7 @@ xfs_symlink(
        ip = NULL;
        tp = NULL;
-        xfs_itrace_entry(dp);
+        trace_xfs_symlink(dp, link_name);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
@@ -2086,17 +1974,6 @@ xfs_symlink(
        if (pathlen >= MAXPATHLEN)      /* total string too long */
                return XFS_ERROR(ENAMETOOLONG);
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-                                        DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-                                        link_name->name,
-                                        (unsigned char *)target_path, 0, 0, 0);
-                if (error)
-                        return error;
-        }
-        /* Return through std_return after this point. */
        udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
@@ -2180,8 +2057,7 @@ xfs_symlink(
         * transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-        IHOLD(dp);
+        xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
        /*
@@ -2215,7 +2091,7 @@ xfs_symlink(
                error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
                                  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
                                  &first_block, resblks, mval, &nmaps,
-                                  &free_list, NULL);
+                                  &free_list);
                if (error) {
                        goto error1;
                }
@@ -2278,21 +2154,8 @@ xfs_symlink(
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
-        /* Fall through to std_return with error = 0 or errno from
+        *ipp = ip;
-         * xfs_trans_commit     */
+        return 0;
-std_return:
-        if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
-                (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-                                        dp, DM_RIGHT_NULL,
-                                        error ? NULL : ip,
-                                        DM_RIGHT_NULL, link_name->name,
-                                        (unsigned char *)target_path,
-                                        0, error, 0);
-        }
-        if (!error)
-                *ipp = ip;
-        return error;
 error2:
        IRELE(ip);
@@ -2306,8 +2169,8 @@ std_return:
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
-        goto std_return;
+        return error;
 }
 int
@@ -2333,13 +2196,12 @@ xfs_set_dmattrs(
                return error;
        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
        ip->i_d.di_dmevmask = evmask;
        ip->i_d.di_dmstate  = state;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        IHOLD(ip);
        error = xfs_trans_commit(tp, 0);
        return error;
@@ -2390,7 +2252,7 @@ xfs_alloc_file_space(
        int                     committed;
        int                     error;
-        xfs_itrace_entry(ip);
+        trace_xfs_alloc_file_space(ip);
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
@@ -2412,25 +2274,9 @@ xfs_alloc_file_space(
        startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
        allocatesize_fsb = XFS_B_TO_FSB(mp, count);
-        /*      Generate a DMAPI event if needed.       */
-        if (alloc_type != 0 && offset < ip->i_size &&
-                        (attr_flags & XFS_ATTR_DMI) == 0  &&
-                        DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-                xfs_off_t           end_dmi_offset;
-                end_dmi_offset = offset+len;
-                if (end_dmi_offset > ip->i_size)
-                        end_dmi_offset = ip->i_size;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-                                      end_dmi_offset - offset, 0, NULL);
-                if (error)
-                        return error;
-        }
        /*
         * Allocate file space until done or until there is an error
         */
-retry:
        while (allocatesize_fsb && !error) {
                xfs_fileoff_t   s, e;
@@ -2488,8 +2334,7 @@ retry:
                if (error)
                        goto error1;
-                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                /*
                 * Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2343,7 @@ retry:
                error = xfs_bmapi(tp, ip, startoffset_fsb,
                                  allocatesize_fsb, bmapi_flag,
                                  &firstfsb, 0, imapp, &nimaps,
-                                  &free_list, NULL);
+                                  &free_list);
                if (error) {
                        goto error0;
                }
@@ -2527,17 +2372,6 @@ retry:
                startoffset_fsb += allocated_fsb;
                allocatesize_fsb -= allocated_fsb;
        }
-dmapi_enospc_check:
-        if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
-            DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
-                error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-                                ip, DM_RIGHT_NULL,
-                                ip, DM_RIGHT_NULL,
-                                NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
-                if (error == 0)
-                        goto retry;     /* Maybe DMAPI app. has made space */
-                /* else fall through with error from XFS_SEND_DATA */
-        }
        return error;
@@ -2548,7 +2382,7 @@ error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        goto dmapi_enospc_check;
+        return error;
 }
 /*
@@ -2598,7 +2432,7 @@ xfs_zero_remaining_bytes(
                offset_fsb = XFS_B_TO_FSBT(mp, offset);
                nimap = 1;
                error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-                        NULL, 0, &imap, &nimap, NULL, NULL);
+                        NULL, 0, &imap, &nimap, NULL);
                if (error || nimap < 1)
                        break;
                ASSERT(imap.br_blockcount >= 1);
@@ -2661,7 +2495,6 @@ xfs_free_file_space(
 {
        int                     committed;
        int                     done;
-        xfs_off_t               end_dmi_offset;
        xfs_fileoff_t           endoffset_fsb;
        int                     error;
        xfs_fsblock_t           firstfsb;
@@ -2680,7 +2513,7 @@ xfs_free_file_space(
        mp = ip->i_mount;
-        xfs_itrace_entry(ip);
+        trace_xfs_free_file_space(ip);
        error = xfs_qm_dqattach(ip, 0);
        if (error)
@@ -2691,19 +2524,7 @@ xfs_free_file_space(
                return error;
        rt = XFS_IS_REALTIME_INODE(ip);
        startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-        end_dmi_offset = offset + len;
+        endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-        endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-        if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
-            DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-                if (end_dmi_offset > ip->i_size)
-                        end_dmi_offset = ip->i_size;
-                error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
-                                offset, end_dmi_offset - offset,
-                                AT_DELAY_FLAG(attr_flags), NULL);
-                if (error)
-                        return error;
-        }
        if (attr_flags & XFS_ATTR_NOLOCK)
                need_iolock = 0;
@@ -2731,7 +2552,7 @@ xfs_free_file_space(
        if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
                nimap = 1;
                error = xfs_bmapi(NULL, ip, startoffset_fsb,
-                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+                        1, 0, NULL, 0, &imap, &nimap, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2567,7 @@ xfs_free_file_space(
                }
                nimap = 1;
                error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-                        1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+                        1, 0, NULL, 0, &imap, &nimap, NULL);
                if (error)
                        goto out_unlock_iolock;
                ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2635,7 @@ xfs_free_file_space(
                if (error)
                        goto error1;
-                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                xfs_trans_ijoin(tp, ip);
-                xfs_trans_ihold(tp, ip);
                /*
                 * issue the bunmapi() call to free the blocks
@@ -2823,7 +2643,7 @@ xfs_free_file_space(
                xfs_bmap_init(&free_list, &firstfsb);
                error = xfs_bunmapi(tp, ip, startoffset_fsb,
                                  endoffset_fsb - startoffset_fsb,
-                                  0, 2, &firstfsb, &free_list, NULL, &done);
+                                  0, 2, &firstfsb, &free_list, &done);
                if (error) {
                        goto error0;
                }
@@ -2883,8 +2703,6 @@ xfs_change_file_space(
        xfs_trans_t     *tp;
        struct iattr    iattr;
-        xfs_itrace_entry(ip);
        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);
@@ -2985,8 +2803,7 @@ xfs_change_file_space(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip);
-        xfs_trans_ihold(tp, ip);
        if ((attr_flags & XFS_ATTR_DMI) == 0) {
                ip->i_d.di_mode &= ~S_ISUID;