author    Andrea Bastoni <bastoni@cs.unc.edu> 2010-10-23 01:01:49 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu> 2010-10-23 01:01:49 -0400
commit    3dd41424090a0ca3a660218d06afe6ff4441bad3 (patch)
tree      511ef1bb1799027fc5aad574adce49120ecadd87 /fs
parent    5c5456402d467969b217d7fdd6670f8c8600f5a8 (diff)
parent    f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)

Merge commit 'v2.6.36' into wip-merge-2.6.36
Conflicts:
	Makefile
	arch/x86/include/asm/unistd_32.h
	arch/x86/kernel/syscall_table_32.S
	kernel/sched.c
	kernel/time/tick-sched.c

Relevant API and function changes (resolved in this commit):
- (API) .enqueue_task() (enqueue_task_litmus) and .dequeue_task() (dequeue_task_litmus) [litmus/sched_litmus.c]
- (API) .select_task_rq() (select_task_rq_litmus) [litmus/sched_litmus.c]
- (API) sysrq_dump_trace_buffer() and sysrq_handle_kill_rt_tasks() [litmus/sched_trace.c]
- struct kfifo internal buffer name changed (buffer -> buf) [litmus/sched_trace.c]
- add_wait_queue_exclusive_locked -> __add_wait_queue_tail_exclusive [litmus/fmlp.c]
- syscall numbers updated for both x86_32 and x86_64
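One of the conflict fixes listed above is purely mechanical: the locked exclusive wait-queue helper was renamed upstream between 2.6.33 and 2.6.36. Below is a minimal sketch of the before/after call; the wait queue and function here (demo_queue, enqueue_exclusive_waiter) are illustrative and not taken from the LITMUS sources.

#include <linux/wait.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_queue);

static void enqueue_exclusive_waiter(wait_queue_t *wait)
{
	unsigned long flags;

	/* Both spellings expect the queue lock to be held by the caller,
	 * so take it explicitly around the insertion. */
	spin_lock_irqsave(&demo_queue.lock, flags);
	/* pre-rename spelling, removed upstream:
	 *	add_wait_queue_exclusive_locked(&demo_queue, wait);
	 * spelling used after this merge: */
	__add_wait_queue_tail_exclusive(&demo_queue, wait);
	spin_unlock_irqrestore(&demo_queue.lock, flags);
}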
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile4
-rw-r--r--fs/9p/fid.c114
-rw-r--r--fs/9p/v9fs.c3
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/v9fs_vfs.h5
-rw-r--r--fs/9p/vfs_dir.c148
-rw-r--r--fs/9p/vfs_file.c43
-rw-r--r--fs/9p/vfs_inode.c877
-rw-r--r--fs/9p/vfs_super.c119
-rw-r--r--fs/9p/xattr.c160
-rw-r--r--fs/9p/xattr.h27
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c17
-rw-r--r--fs/affs/affs.h5
-rw-r--r--fs/affs/file.c15
-rw-r--r--fs/affs/inode.c38
-rw-r--r--fs/affs/namei.c2
-rw-r--r--fs/affs/super.c32
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/cell.c96
-rw-r--r--fs/afs/dir.c53
-rw-r--r--fs/afs/file.c64
-rw-r--r--fs/afs/inode.c91
-rw-r--r--fs/afs/internal.h28
-rw-r--r--fs/afs/main.c9
-rw-r--r--fs/afs/mntpt.c84
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/rxrpc.c1
-rw-r--r--fs/afs/server.c5
-rw-r--r--fs/afs/super.c22
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/aio.c105
-rw-r--r--fs/attr.c84
-rw-r--r--fs/autofs/root.c68
-rw-r--r--fs/autofs4/dev-ioctl.c18
-rw-r--r--fs/autofs4/root.c74
-rw-r--r--fs/bad_inode.c10
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c6
-rw-r--r--fs/bfs/file.c17
-rw-r--r--fs/bfs/inode.c116
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf_fdpic.c26
-rw-r--r--fs/binfmt_flat.c27
-rw-r--r--fs/binfmt_misc.c9
-rw-r--r--fs/binfmt_script.c3
-rw-r--r--fs/bio-integrity.c4
-rw-r--r--fs/bio.c5
-rw-r--r--fs/block_dev.c407
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c238
-rw-r--r--fs/btrfs/ctree.h169
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c188
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2258
-rw-r--r--fs/btrfs/extent_io.c87
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c181
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1762
-rw-r--r--fs/btrfs/ioctl.c222
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1974
-rw-r--r--fs/btrfs/root-tree.c26
-rw-r--r--fs/btrfs/super.c43
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c35
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/buffer.c202
-rw-r--r--fs/cachefiles/bind.c2
-rw-r--r--fs/cachefiles/daemon.c38
-rw-r--r--fs/cachefiles/internal.h13
-rw-r--r--fs/cachefiles/namei.c13
-rw-r--r--fs/cachefiles/rdwr.c4
-rw-r--r--fs/ceph/Kconfig3
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c44
-rw-r--r--fs/ceph/armor.c6
-rw-r--r--fs/ceph/auth.c14
-rw-r--r--fs/ceph/auth.h8
-rw-r--r--fs/ceph/auth_none.c9
-rw-r--r--fs/ceph/auth_x.c57
-rw-r--r--fs/ceph/buffer.c16
-rw-r--r--fs/ceph/caps.c528
-rw-r--r--fs/ceph/ceph_frag.h4
-rw-r--r--fs/ceph/ceph_fs.c50
-rw-r--r--fs/ceph/ceph_fs.h144
-rw-r--r--fs/ceph/ceph_hash.h4
-rw-r--r--fs/ceph/ceph_strings.c19
-rw-r--r--fs/ceph/crush/crush.h4
-rw-r--r--fs/ceph/crush/hash.h4
-rw-r--r--fs/ceph/crush/mapper.c41
-rw-r--r--fs/ceph/crush/mapper.h4
-rw-r--r--fs/ceph/crypto.c27
-rw-r--r--fs/ceph/crypto.h4
-rw-r--r--fs/ceph/debugfs.c40
-rw-r--r--fs/ceph/decode.h6
-rw-r--r--fs/ceph/dir.c83
-rw-r--r--fs/ceph/export.c37
-rw-r--r--fs/ceph/file.c55
-rw-r--r--fs/ceph/inode.c143
-rw-r--r--fs/ceph/ioctl.c26
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c260
-rw-r--r--fs/ceph/mds_client.c807
-rw-r--r--fs/ceph/mds_client.h42
-rw-r--r--fs/ceph/mdsmap.c6
-rw-r--r--fs/ceph/mdsmap.h8
-rw-r--r--fs/ceph/messenger.c181
-rw-r--r--fs/ceph/messenger.h11
-rw-r--r--fs/ceph/mon_client.c409
-rw-r--r--fs/ceph/mon_client.h32
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h25
-rw-r--r--fs/ceph/osd_client.c125
-rw-r--r--fs/ceph/osdmap.c65
-rw-r--r--fs/ceph/pagelist.c14
-rw-r--r--fs/ceph/rados.h36
-rw-r--r--fs/ceph/snap.c179
-rw-r--r--fs/ceph/super.c228
-rw-r--r--fs/ceph/super.h89
-rw-r--r--fs/ceph/xattr.c38
-rw-r--r--fs/char_dev.c5
-rw-r--r--fs/cifs/Kconfig27
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README15
-rw-r--r--fs/cifs/asn1.c103
-rw-r--r--fs/cifs/cache.c331
-rw-r--r--fs/cifs/cifs_debug.c73
-rw-r--r--fs/cifs/cifs_debug.h42
-rw-r--r--fs/cifs/cifs_dfs_ref.c67
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_spnego.c13
-rw-r--r--fs/cifs/cifs_unicode.c5
-rw-r--r--fs/cifs/cifs_unicode.h18
-rw-r--r--fs/cifs/cifs_uniupr.h16
-rw-r--r--fs/cifs/cifsacl.c76
-rw-r--r--fs/cifs/cifsencrypt.c67
-rw-r--r--fs/cifs/cifsfs.c213
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h57
-rw-r--r--fs/cifs/cifsproto.h36
-rw-r--r--fs/cifs/cifssmb.c486
-rw-r--r--fs/cifs/connect.c874
-rw-r--r--fs/cifs/dir.c274
-rw-r--r--fs/cifs/dns_resolve.c170
-rw-r--r--fs/cifs/dns_resolve.h2
-rw-r--r--fs/cifs/export.c2
-rw-r--r--fs/cifs/file.c416
-rw-r--r--fs/cifs/fscache.c236
-rw-r--r--fs/cifs/fscache.h136
-rw-r--r--fs/cifs/inode.c312
-rw-r--r--fs/cifs/ioctl.c13
-rw-r--r--fs/cifs/link.c10
-rw-r--r--fs/cifs/misc.c97
-rw-r--r--fs/cifs/netmisc.c81
-rw-r--r--fs/cifs/readdir.c90
-rw-r--r--fs/cifs/sess.c91
-rw-r--r--fs/cifs/smberr.h1
-rw-r--r--fs/cifs/transport.c92
-rw-r--r--fs/cifs/xattr.c40
-rw-r--r--fs/coda/coda_int.h3
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/inode.c8
-rw-r--r--fs/coda/pioctl.c76
-rw-r--r--fs/coda/psdev.c21
-rw-r--r--fs/coda/upcall.c12
-rw-r--r--fs/compat.c182
-rw-r--r--fs/compat_ioctl.c51
-rw-r--r--fs/configfs/inode.c14
-rw-r--r--fs/cramfs/inode.c88
-rw-r--r--fs/dcache.c310
-rw-r--r--fs/debugfs/file.c21
-rw-r--r--fs/devpts/inode.c9
-rw-r--r--fs/direct-io.c105
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/netlink.c15
-rw-r--r--fs/dlm/user.c88
-rw-r--r--fs/drop_caches.c26
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c66
-rw-r--r--fs/ecryptfs/inode.c175
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/main.c166
-rw-r--r--fs/ecryptfs/messaging.c21
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/ecryptfs/mmap.c19
-rw-r--r--fs/ecryptfs/read_write.c13
-rw-r--r--fs/ecryptfs/super.c36
-rw-r--r--fs/eventpoll.c3
-rw-r--r--fs/exec.c293
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/exofs.h3
-rw-r--r--fs/exofs/file.c33
-rw-r--r--fs/exofs/inode.c177
-rw-r--r--fs/exofs/ios.c46
-rw-r--r--fs/exofs/super.c3
-rw-r--r--fs/ext2/acl.c5
-rw-r--r--fs/ext2/balloc.c17
-rw-r--r--fs/ext2/dir.c23
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/file.c7
-rw-r--r--fs/ext2/ialloc.c34
-rw-r--r--fs/ext2/inode.c209
-rw-r--r--fs/ext2/super.c131
-rw-r--r--fs/ext2/xattr.c37
-rw-r--r--fs/ext2/xattr.h12
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c2
-rw-r--r--fs/ext2/xattr_user.c2
-rw-r--r--fs/ext3/Kconfig1
-rw-r--r--fs/ext3/acl.c5
-rw-r--r--fs/ext3/balloc.c6
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/fsync.c27
-rw-r--r--fs/ext3/ialloc.c25
-rw-r--r--fs/ext3/inode.c145
-rw-r--r--fs/ext3/namei.c3
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/ext3/super.c144
-rw-r--r--fs/ext3/xattr.c22
-rw-r--r--fs/ext3/xattr.h12
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c2
-rw-r--r--fs/ext3/xattr_user.c2
-rw-r--r--fs/ext4/acl.c5
-rw-r--r--fs/ext4/balloc.c11
-rw-r--r--fs/ext4/block_validity.c12
-rw-r--r--fs/ext4/dir.c41
-rw-r--r--fs/ext4/ext4.h320
-rw-r--r--fs/ext4/ext4_jbd2.c71
-rw-r--r--fs/ext4/ext4_jbd2.h64
-rw-r--r--fs/ext4/extents.c435
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/fsync.c47
-rw-r--r--fs/ext4/ialloc.c105
-rw-r--r--fs/ext4/inode.c977
-rw-r--r--fs/ext4/ioctl.c27
-rw-r--r--fs/ext4/mballoc.c261
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/move_extent.c26
-rw-r--r--fs/ext4/namei.c97
-rw-r--r--fs/ext4/resize.c11
-rw-r--r--fs/ext4/super.c451
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/xattr.c64
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/ext4/xattr_trusted.c2
-rw-r--r--fs/ext4/xattr_user.c2
-rw-r--r--fs/fat/cache.c13
-rw-r--r--fs/fat/dir.c28
-rw-r--r--fs/fat/fat.h21
-rw-r--r--fs/fat/file.c72
-rw-r--r--fs/fat/inode.c61
-rw-r--r--fs/fat/misc.c26
-rw-r--r--fs/fcntl.c99
-rw-r--r--fs/file.c60
-rw-r--r--fs/file_table.c164
-rw-r--r--fs/freevxfs/vxfs_extern.h2
-rw-r--r--fs/freevxfs/vxfs_inode.c8
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/freevxfs/vxfs_super.c4
-rw-r--r--fs/fs-writeback.c722
-rw-r--r--fs/fs_struct.c39
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/internal.h22
-rw-r--r--fs/fscache/main.c106
-rw-r--r--fs/fscache/object-list.c13
-rw-r--r--fs/fscache/object.c106
-rw-r--r--fs/fscache/operation.c67
-rw-r--r--fs/fscache/page.c72
-rw-r--r--fs/fuse/dev.c791
-rw-r--r--fs/fuse/dir.c24
-rw-r--r--fs/fuse/file.c58
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/fuse/inode.c6
-rw-r--r--fs/generic_acl.c5
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c31
-rw-r--r--fs/gfs2/bmap.c31
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/dir.c46
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/file.c15
-rw-r--r--fs/gfs2/glock.c100
-rw-r--r--fs/gfs2/incore.h15
-rw-r--r--fs/gfs2/inode.c136
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/log.c164
-rw-r--r--fs/gfs2/log.h30
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c16
-rw-r--r--fs/gfs2/meta_io.c13
-rw-r--r--fs/gfs2/ops_fstype.c56
-rw-r--r--fs/gfs2/ops_inode.c19
-rw-r--r--fs/gfs2/quota.c135
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/recovery.c54
-rw-r--r--fs/gfs2/recovery.h6
-rw-r--r--fs/gfs2/rgrp.c81
-rw-r--r--fs/gfs2/super.c63
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/trans.c18
-rw-r--r--fs/gfs2/xattr.c30
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c70
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h4
-rw-r--r--fs/hfsplus/inode.c79
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/hfsplus/super.c10
-rw-r--r--fs/hostfs/hostfs.h22
-rw-r--r--fs/hostfs/hostfs_kern.c521
-rw-r--r--fs/hostfs/hostfs_user.c112
-rw-r--r--fs/hpfs/file.c15
-rw-r--r--fs/hpfs/hpfs_fn.h4
-rw-r--r--fs/hpfs/inode.c24
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hppfs/hppfs.c10
-rw-r--r--fs/hugetlbfs/inode.c43
-rw-r--r--fs/inode.c215
-rw-r--r--fs/internal.h9
-rw-r--r--fs/ioctl.c33
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jbd/checkpoint.c4
-rw-r--r--fs/jbd/commit.c57
-rw-r--r--fs/jbd/journal.c42
-rw-r--r--fs/jbd/recovery.c11
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd2/checkpoint.c25
-rw-r--r--fs/jbd2/commit.c89
-rw-r--r--fs/jbd2/journal.c140
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c265
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/background.c4
-rw-r--r--fs/jffs2/build.c1
-rw-r--r--fs/jffs2/compr.c5
-rw-r--r--fs/jffs2/compr.h1
-rw-r--r--fs/jffs2/compr_lzo.c1
-rw-r--r--fs/jffs2/compr_rtime.c1
-rw-r--r--fs/jffs2/compr_rubin.c1
-rw-r--r--fs/jffs2/compr_zlib.c1
-rw-r--r--fs/jffs2/debug.c1
-rw-r--r--fs/jffs2/debug.h1
-rw-r--r--fs/jffs2/dir.c124
-rw-r--r--fs/jffs2/erase.c13
-rw-r--r--fs/jffs2/file.c5
-rw-r--r--fs/jffs2/fs.c28
-rw-r--r--fs/jffs2/gc.c18
-rw-r--r--fs/jffs2/ioctl.c1
-rw-r--r--fs/jffs2/jffs2_fs_i.h1
-rw-r--r--fs/jffs2/jffs2_fs_sb.h1
-rw-r--r--fs/jffs2/nodelist.h11
-rw-r--r--fs/jffs2/nodemgmt.c28
-rw-r--r--fs/jffs2/os-linux.h7
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/security.c2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jffs2/wbuf.c8
-rw-r--r--fs/jffs2/xattr.c12
-rw-r--r--fs/jffs2/xattr.h8
-rw-r--r--fs/jffs2/xattr_trusted.c2
-rw-r--r--fs/jffs2/xattr_user.c2
-rw-r--r--fs/jfs/file.c20
-rw-r--r--fs/jfs/inode.c63
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/jfs/jfs_inode.c12
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/super.c24
-rw-r--r--fs/jfs/xattr.c87
-rw-r--r--fs/libfs.c104
-rw-r--r--fs/logfs/dev_bdev.c6
-rw-r--r--fs/logfs/dev_mtd.c26
-rw-r--r--fs/logfs/dir.c9
-rw-r--r--fs/logfs/file.c44
-rw-r--r--fs/logfs/gc.c49
-rw-r--r--fs/logfs/inode.c66
-rw-r--r--fs/logfs/journal.c9
-rw-r--r--fs/logfs/logfs.h24
-rw-r--r--fs/logfs/logfs_abi.h10
-rw-r--r--fs/logfs/readwrite.c81
-rw-r--r--fs/logfs/segment.c8
-rw-r--r--fs/logfs/super.c31
-rw-r--r--fs/mbcache.c201
-rw-r--r--fs/minix/bitmap.c11
-rw-r--r--fs/minix/dir.c32
-rw-r--r--fs/minix/file.c24
-rw-r--r--fs/minix/inode.c35
-rw-r--r--fs/minix/itree_v2.c27
-rw-r--r--fs/minix/minix.h6
-rw-r--r--fs/minix/namei.c11
-rw-r--r--fs/namei.c149
-rw-r--r--fs/namespace.c226
-rw-r--r--fs/ncpfs/dir.c3
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c40
-rw-r--r--fs/ncpfs/ioctl.c26
-rw-r--r--fs/nfs/Kconfig29
-rw-r--r--fs/nfs/callback.c11
-rw-r--r--fs/nfs/callback_proc.c19
-rw-r--r--fs/nfs/client.c160
-rw-r--r--fs/nfs/delegation.c18
-rw-r--r--fs/nfs/delegation.h4
-rw-r--r--fs/nfs/dir.c170
-rw-r--r--fs/nfs/direct.c29
-rw-r--r--fs/nfs/dns_resolve.c24
-rw-r--r--fs/nfs/dns_resolve.h12
-rw-r--r--fs/nfs/file.c88
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/nfs/getroot.c191
-rw-r--r--fs/nfs/inode.c145
-rw-r--r--fs/nfs/internal.h18
-rw-r--r--fs/nfs/iostat.h6
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs2xdr.c7
-rw-r--r--fs/nfs/nfs3acl.c23
-rw-r--r--fs/nfs/nfs3proc.c128
-rw-r--r--fs/nfs/nfs3xdr.c10
-rw-r--r--fs/nfs/nfs4_fs.h65
-rw-r--r--fs/nfs/nfs4namespace.c12
-rw-r--r--fs/nfs/nfs4proc.c653
-rw-r--r--fs/nfs/nfs4renewd.c4
-rw-r--r--fs/nfs/nfs4state.c118
-rw-r--r--fs/nfs/nfs4xdr.c133
-rw-r--r--fs/nfs/nfsroot.c16
-rw-r--r--fs/nfs/pagelist.c22
-rw-r--r--fs/nfs/proc.c144
-rw-r--r--fs/nfs/read.c7
-rw-r--r--fs/nfs/super.c196
-rw-r--r--fs/nfs/unlink.c6
-rw-r--r--fs/nfs/write.c59
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/export.c44
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs4callback.c181
-rw-r--r--fs/nfsd/nfs4proc.c50
-rw-r--r--fs/nfsd/nfs4recover.c87
-rw-r--r--fs/nfsd/nfs4state.c763
-rw-r--r--fs/nfsd/nfs4xdr.c36
-rw-r--r--fs/nfsd/nfsctl.c92
-rw-r--r--fs/nfsd/nfsd.h7
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/nfsproc.c4
-rw-r--r--fs/nfsd/nfssvc.c153
-rw-r--r--fs/nfsd/state.h87
-rw-r--r--fs/nfsd/vfs.c113
-rw-r--r--fs/nfsd/vfs.h5
-rw-r--r--fs/nfsd/xdr4.h11
-rw-r--r--fs/nilfs2/alloc.c154
-rw-r--r--fs/nilfs2/alloc.h7
-rw-r--r--fs/nilfs2/bmap.c6
-rw-r--r--fs/nilfs2/bmap.h16
-rw-r--r--fs/nilfs2/bmap_union.h42
-rw-r--r--fs/nilfs2/btnode.c23
-rw-r--r--fs/nilfs2/btnode.h4
-rw-r--r--fs/nilfs2/btree.c1005
-rw-r--r--fs/nilfs2/btree.h29
-rw-r--r--fs/nilfs2/dir.c58
-rw-r--r--fs/nilfs2/direct.c96
-rw-r--r--fs/nilfs2/direct.h11
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/gcdat.c2
-rw-r--r--fs/nilfs2/gcinode.c17
-rw-r--r--fs/nilfs2/inode.c93
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/nilfs2/nilfs.h26
-rw-r--r--fs/nilfs2/page.c5
-rw-r--r--fs/nilfs2/page.h2
-rw-r--r--fs/nilfs2/recovery.c357
-rw-r--r--fs/nilfs2/segbuf.c72
-rw-r--r--fs/nilfs2/segbuf.h36
-rw-r--r--fs/nilfs2/segment.c176
-rw-r--r--fs/nilfs2/segment.h18
-rw-r--r--fs/nilfs2/super.c551
-rw-r--r--fs/nilfs2/the_nilfs.c182
-rw-r--r--fs/nilfs2/the_nilfs.h23
-rw-r--r--fs/notify/Kconfig1
-rw-r--r--fs/notify/Makefile4
-rw-r--r--fs/notify/dnotify/dnotify.c213
-rw-r--r--fs/notify/fanotify/Kconfig26
-rw-r--r--fs/notify/fanotify/Makefile1
-rw-r--r--fs/notify/fanotify/fanotify.c209
-rw-r--r--fs/notify/fanotify/fanotify_user.c787
-rw-r--r--fs/notify/fsnotify.c205
-rw-r--r--fs/notify/fsnotify.h27
-rw-r--r--fs/notify/group.c182
-rw-r--r--fs/notify/inode_mark.c337
-rw-r--r--fs/notify/inotify/Kconfig15
-rw-r--r--fs/notify/inotify/Makefile1
-rw-r--r--fs/notify/inotify/inotify.c933
-rw-r--r--fs/notify/inotify/inotify.h7
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c151
-rw-r--r--fs/notify/inotify/inotify_user.c369
-rw-r--r--fs/notify/mark.c371
-rw-r--r--fs/notify/notification.c209
-rw-r--r--fs/notify/vfsmount_mark.c187
-rw-r--r--fs/ntfs/dir.c5
-rw-r--r--fs/ntfs/file.c37
-rw-r--r--fs/ntfs/inode.c10
-rw-r--r--fs/ntfs/inode.h2
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/acl.c40
-rw-r--r--fs/ocfs2/alloc.c910
-rw-r--r--fs/ocfs2/alloc.h12
-rw-r--r--fs/ocfs2/aops.c113
-rw-r--r--fs/ocfs2/blockcheck.c8
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/tcp.c22
-rw-r--r--fs/ocfs2/dir.c99
-rw-r--r--fs/ocfs2/dlm/dlmast.c8
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h5
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c15
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c32
-rw-r--r--fs/ocfs2/dlm/dlmlock.c6
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c101
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c51
-rw-r--r--fs/ocfs2/dlm/dlmthread.c130
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c3
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c18
-rw-r--r--fs/ocfs2/dlmglue.c7
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c591
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/inode.c80
-rw-r--r--fs/ocfs2/inode.h7
-rw-r--r--fs/ocfs2/journal.c60
-rw-r--r--fs/ocfs2/journal.h15
-rw-r--r--fs/ocfs2/localalloc.c282
-rw-r--r--fs/ocfs2/localalloc.h3
-rw-r--r--fs/ocfs2/mmap.c56
-rw-r--r--fs/ocfs2/namei.c394
-rw-r--r--fs/ocfs2/ocfs2.h22
-rw-r--r--fs/ocfs2/ocfs2_fs.h181
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h8
-rw-r--r--fs/ocfs2/quota.h12
-rw-r--r--fs/ocfs2/quota_global.c351
-rw-r--r--fs/ocfs2/quota_local.c187
-rw-r--r--fs/ocfs2/refcounttree.c116
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/reservations.c844
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c19
-rw-r--r--fs/ocfs2/suballoc.c887
-rw-r--r--fs/ocfs2/suballoc.h42
-rw-r--r--fs/ocfs2/super.c147
-rw-r--r--fs/ocfs2/super.h7
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c315
-rw-r--r--fs/ocfs2/xattr.h12
-rw-r--r--fs/omfs/dir.c22
-rw-r--r--fs/omfs/file.c48
-rw-r--r--fs/omfs/inode.c58
-rw-r--r--fs/omfs/omfs.h1
-rw-r--r--fs/omfs/omfs_fs.h1
-rw-r--r--fs/open.c188
-rw-r--r--fs/partitions/acorn.c103
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c33
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c20
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c107
-rw-r--r--fs/partitions/check.h18
-rw-r--r--fs/partitions/efi.c95
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c48
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c6
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c111
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c154
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c6
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c8
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c8
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c15
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c6
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c170
-rw-r--r--fs/pnode.c11
-rw-r--r--fs/proc/Makefile2
-rw-r--r--fs/proc/array.c6
-rw-r--r--fs/proc/base.c174
-rw-r--r--fs/proc/generic.c33
-rw-r--r--fs/proc/inode.c23
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_devtree.c3
-rw-r--r--fs/proc/proc_sysctl.c15
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/proc/task_nommu.c20
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/qnx4/dir.c3
-rw-r--r--fs/qnx4/inode.c11
-rw-r--r--fs/quota/dquot.c487
-rw-r--r--fs/quota/quota.c99
-rw-r--r--fs/quota/quota_tree.c95
-rw-r--r--fs/quota/quota_v1.c7
-rw-r--r--fs/quota/quota_v2.c11
-rw-r--r--fs/ramfs/file-mmu.c3
-rw-r--r--fs/ramfs/file-nommu.c12
-rw-r--r--fs/ramfs/inode.c22
-rw-r--r--fs/read_write.c25
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/reiserfs/dir.c9
-rw-r--r--fs/reiserfs/file.c58
-rw-r--r--fs/reiserfs/inode.c140
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/reiserfs/journal.c3
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/super.c58
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/smbfs/dir.c3
-rw-r--r--fs/smbfs/file.c5
-rw-r--r--fs/smbfs/inode.c12
-rw-r--r--fs/smbfs/ioctl.c10
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c176
-rw-r--r--fs/squashfs/Kconfig34
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/decompressor.c6
-rw-r--r--fs/squashfs/inode.c92
-rw-r--r--fs/squashfs/lzo_wrapper.c136
-rw-r--r--fs/squashfs/namei.c6
-rw-r--r--fs/squashfs/squashfs.h15
-rw-r--r--fs/squashfs/squashfs_fs.h92
-rw-r--r--fs/squashfs/squashfs_fs_i.h3
-rw-r--r--fs/squashfs/squashfs_fs_sb.h3
-rw-r--r--fs/squashfs/super.c30
-rw-r--r--fs/squashfs/symlink.c11
-rw-r--r--fs/squashfs/xattr.c323
-rw-r--r--fs/squashfs/xattr.h46
-rw-r--r--fs/squashfs/xattr_id.c100
-rw-r--r--fs/stat.c29
-rw-r--r--fs/statfs.c243
-rw-r--r--fs/super.c375
-rw-r--r--fs/sync.c113
-rw-r--r--fs/sysfs/bin.c26
-rw-r--r--fs/sysfs/dir.c114
-rw-r--r--fs/sysfs/file.c22
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysfs/inode.c20
-rw-r--r--fs/sysfs/mount.c97
-rw-r--r--fs/sysfs/symlink.c58
-rw-r--r--fs/sysfs/sysfs.h36
-rw-r--r--fs/sysv/dir.c23
-rw-r--r--fs/sysv/file.c24
-rw-r--r--fs/sysv/ialloc.c18
-rw-r--r--fs/sysv/inode.c20
-rw-r--r--fs/sysv/itree.c19
-rw-r--r--fs/sysv/super.c74
-rw-r--r--fs/sysv/sysv.h4
-rw-r--r--fs/timerfd.c25
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/ubifs/file.c26
-rw-r--r--fs/ubifs/io.c1
-rw-r--r--fs/ubifs/lpt.c14
-rw-r--r--fs/ubifs/lpt_commit.c2
-rw-r--r--fs/ubifs/recovery.c23
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c16
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/balloc.c43
-rw-r--r--fs/udf/dir.c5
-rw-r--r--fs/udf/file.c66
-rw-r--r--fs/udf/ialloc.c34
-rw-r--r--fs/udf/inode.c64
-rw-r--r--fs/udf/namei.c30
-rw-r--r--fs/udf/super.c18
-rw-r--r--fs/udf/udfdecl.h7
-rw-r--r--fs/ufs/balloc.c48
-rw-r--r--fs/ufs/dir.c15
-rw-r--r--fs/ufs/file.c5
-rw-r--r--fs/ufs/ialloc.c43
-rw-r--r--fs/ufs/inode.c67
-rw-r--r--fs/ufs/namei.c18
-rw-r--r--fs/ufs/super.c114
-rw-r--r--fs/ufs/symlink.c8
-rw-r--r--fs/ufs/truncate.c48
-rw-r--r--fs/ufs/ufs.h4
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/ufs/util.c20
-rw-r--r--fs/ufs/util.h7
-rw-r--r--fs/utimes.c7
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/Makefile5
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c856
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c114
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h122
-rw-r--r--fs/xfs/linux-2.6/xfs_dmapi_priv.h28
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c116
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.h25
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c47
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c25
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c53
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c21
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c213
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h9
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c322
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h5
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h700
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/quota/xfs_dquot.c313
-rw-r--r--fs/xfs/quota/xfs_dquot.h35
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c317
-rw-r--r--fs/xfs/quota/xfs_qm.c653
-rw-r--r--fs/xfs/quota/xfs_qm.h23
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c10
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c12
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c306
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h102
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c60
-rw-r--r--fs/xfs/support/debug.c1
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h25
-rw-r--r--fs/xfs/xfs_alloc.c372
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c7
-rw-r--r--fs/xfs/xfs_attr.c91
-rw-r--r--fs/xfs/xfs_attr_leaf.c5
-rw-r--r--fs/xfs/xfs_bmap.c343
-rw-r--r--fs/xfs/xfs_bmap.h37
-rw-r--r--fs/xfs/xfs_bmap_btree.c5
-rw-r--r--fs/xfs/xfs_btree.c5
-rw-r--r--fs/xfs/xfs_buf_item.c393
-rw-r--r--fs/xfs/xfs_buf_item.h22
-rw-r--r--fs/xfs/xfs_da_btree.c20
-rw-r--r--fs/xfs/xfs_dfrag.c21
-rw-r--r--fs/xfs/xfs_dir2.c11
-rw-r--r--fs/xfs/xfs_dir2_block.c8
-rw-r--r--fs/xfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/xfs_dir2_node.c2
-rw-r--r--fs/xfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/xfs_dmapi.h170
-rw-r--r--fs/xfs/xfs_dmops.c55
-rw-r--r--fs/xfs/xfs_error.c36
-rw-r--r--fs/xfs/xfs_error.h9
-rw-r--r--fs/xfs/xfs_extfree_item.c288
-rw-r--r--fs/xfs/xfs_filestream.c84
-rw-r--r--fs/xfs/xfs_filestream.h82
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c36
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c150
-rw-r--r--fs/xfs/xfs_ialloc_btree.c4
-rw-r--r--fs/xfs/xfs_iget.c147
-rw-r--r--fs/xfs/xfs_inode.c222
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c286
-rw-r--r--fs/xfs/xfs_inode_item.h12
-rw-r--r--fs/xfs/xfs_iomap.c195
-rw-r--r--fs/xfs/xfs_iomap.h67
-rw-r--r--fs/xfs/xfs_itable.c293
-rw-r--r--fs/xfs/xfs_itable.h17
-rw-r--r--fs/xfs/xfs_log.c811
-rw-r--r--fs/xfs/xfs_log.h36
-rw-r--r--fs/xfs/xfs_log_cil.c780
-rw-r--r--fs/xfs/xfs_log_priv.h144
-rw-r--r--fs/xfs/xfs_log_recover.c400
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.c80
-rw-r--r--fs/xfs/xfs_mount.h72
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rename.c63
-rw-r--r--fs/xfs/xfs_rtalloc.c17
-rw-r--r--fs/xfs/xfs_rtalloc.h11
-rw-r--r--fs/xfs/xfs_rw.c15
-rw-r--r--fs/xfs/xfs_trans.c1382
-rw-r--r--fs/xfs/xfs_trans.h582
-rw-r--r--fs/xfs/xfs_trans_ail.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c250
-rw-r--r--fs/xfs/xfs_trans_extfree.c23
-rw-r--r--fs/xfs/xfs_trans_inode.c76
-rw-r--r--fs/xfs/xfs_trans_item.c549
-rw-r--r--fs/xfs/xfs_trans_priv.h26
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_utils.c87
-rw-r--r--fs/xfs/xfs_utils.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c346
834 files changed, 41646 insertions, 29632 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index 1a940ec7af61..91fba025fcbe 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o
+	fid.o \
+	xattr.o \
+	xattr_user.o
 
 9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 7317b39b2815..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	return ret;
 }
 
+/*
+ * We need to hold v9ses->rename_sem as long as we hold references
+ * to returned path array. Array element contain pointers to
+ * dentry names.
+ */
+static int build_path_from_dentry(struct v9fs_session_info *v9ses,
+				struct dentry *dentry, char ***names)
+{
+	int n = 0, i;
+	char **wnames;
+	struct dentry *ds;
+
+	for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
+		n++;
+
+	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
+	if (!wnames)
+		goto err_out;
+
+	for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent)
+		wnames[i] = (char *)ds->d_name.name;
+
+	*names = wnames;
+	return n;
+err_out:
+	return -ENOMEM;
+}
+
 /**
  * v9fs_fid_lookup - lookup for a fid, try to walk if not found
  * @dentry: dentry to look for fid in
@@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	int i, n, l, clone, any, access;
 	u32 uid;
 	struct p9_fid *fid, *old_fid = NULL;
-	struct dentry *d, *ds;
+	struct dentry *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
 
@@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
 		return fid;
-
+	/*
+	 * we don't have a matching fid. To do a TWALK we need
+	 * parent fid. We need to prevent rename when we want to
+	 * look at the parent.
+	 */
+	down_read(&v9ses->rename_sem);
 	ds = dentry->d_parent;
 	fid = v9fs_fid_find(ds, uid, any);
-	if (!fid) { /* walk from the root */
-		n = 0;
-		for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent)
-			n++;
+	if (fid) {
+		/* Found the parent fid do a lookup with that */
+		fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1);
+		goto fid_out;
+	}
+	up_read(&v9ses->rename_sem);
 
-		fid = v9fs_fid_find(ds, uid, any);
-		if (!fid) { /* the user is not attached to the fs yet */
-			if (access == V9FS_ACCESS_SINGLE)
-				return ERR_PTR(-EPERM);
+	/* start from the root and try to do a lookup */
+	fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+	if (!fid) {
+		/* the user is not attached to the fs yet */
+		if (access == V9FS_ACCESS_SINGLE)
+			return ERR_PTR(-EPERM);
 
-			if (v9fs_proto_dotu(v9ses))
-				uname = NULL;
-			else
-				uname = v9ses->uname;
+		if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses))
+			uname = NULL;
+		else
+			uname = v9ses->uname;
 
-			fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
-								v9ses->aname);
-
-			if (IS_ERR(fid))
-				return fid;
-
-			v9fs_fid_add(ds, fid);
-		}
-	} else /* walk from the parent */
-		n = 1;
+		fid = p9_client_attach(v9ses->clnt, NULL, uname, uid,
+				       v9ses->aname);
+		if (IS_ERR(fid))
+			return fid;
 
-	if (ds == dentry)
+		v9fs_fid_add(dentry->d_sb->s_root, fid);
+	}
+	/* If we are root ourself just return that */
+	if (dentry->d_sb->s_root == dentry)
 		return fid;
-
-	wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL);
-	if (!wnames)
-		return ERR_PTR(-ENOMEM);
-
-	for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent)
-		wnames[i] = (char *) d->d_name.name;
-
+	/*
+	 * Do a multipath walk with attached root.
+	 * When walking parent we need to make sure we
+	 * don't have a parallel rename happening
+	 */
+	down_read(&v9ses->rename_sem);
+	n = build_path_from_dentry(v9ses, dentry, &wnames);
+	if (n < 0) {
+		fid = ERR_PTR(n);
+		goto err_out;
+	}
 	clone = 1;
 	i = 0;
 	while (i < n) {
 		l = min(n - i, P9_MAXWELEM);
+		/*
+		 * We need to hold rename lock when doing a multipath
+		 * walk to ensure none of the patch component change
+		 */
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
 			if (old_fid) {
@@ -193,15 +234,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 				p9_client_clunk(old_fid);
 			}
 			kfree(wnames);
-			return fid;
+			goto err_out;
 		}
 		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
-
 	kfree(wnames);
-	v9fs_fid_add(dentry, fid);
+fid_out:
+	if (!IS_ERR(fid))
+		v9fs_fid_add(dentry, fid);
+err_out:
+	up_read(&v9ses->rename_sem);
 	return fid;
 }
 
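The fid.c changes above all revolve around v9ses->rename_sem, the reader/writer semaphore declared in v9fs.h and initialized in v9fs.c below: a path walk holds pointers into dentry names, so it is only safe while no rename runs concurrently. A minimal sketch of the locking protocol follows; the reader side mirrors v9fs_fid_lookup() above, while the writer side is an assumption about what the (not shown) v9fs rename path does, and demo_session/demo_walk/demo_rename are hypothetical names.

#include <linux/rwsem.h>

struct demo_session {
	struct rw_semaphore rename_sem;	/* mirrors v9fs_session_info */
};

/* Reader side: walks that dereference d_name pointers, as in fid.c. */
static void demo_walk(struct demo_session *s)
{
	down_read(&s->rename_sem);
	/* build_path_from_dentry() + the p9_client_walk() loop go here */
	up_read(&s->rename_sem);
}

/* Assumed writer side: a rename must exclude concurrent multipath walks. */
static void demo_rename(struct demo_session *s)
{
	down_write(&s->rename_sem);
	/* issue the 9P rename and fix up the dcache here */
	up_write(&s->rename_sem);
}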
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f8b86e92cd66..38dc0e067599 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		__putname(v9ses->uname);
 		return ERR_PTR(-ENOMEM);
 	}
+	init_rwsem(&v9ses->rename_sem);
 
 	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
 	if (rc) {
@@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
-	if (!v9fs_proto_dotu(v9ses) &&
+	if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) &&
 		((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) {
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bec4d0bcb458..4c963c9fc41f 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -104,6 +104,7 @@ struct v9fs_session_info {
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
+	struct rw_semaphore rename_sem;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..88418c419ea7 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
@@ -50,9 +52,10 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
-void v9fs_clear_inode(struct inode *inode);
+void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..899f168fd19c 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf)
 }
 
 /**
- * v9fs_dir_readdir - read a directory
+ * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
  * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * @buflen: Length in bytes of buffer to allocate
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	int over;
-	struct p9_wstat st;
-	int err = 0;
-	struct p9_fid *fid;
-	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
+	struct p9_fid *fid;
+	int err = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
-
-	buflen = fid->clnt->msize - P9_IOHDRSZ;
-
-	/* allocate rdir on demand */
 	if (!fid->rdir) {
 		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
 
@@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		spin_unlock(&filp->f_dentry->d_lock);
 		kfree(rdir);
 	}
+exit:
+	return err;
+}
+
+/**
+ * v9fs_dir_readdir - read a directory
+ * @filp: opened file structure
+ * @dirent: directory structure ???
+ * @filldir: function to populate directory structure ???
+ *
+ */
+
+static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	int over;
+	struct p9_wstat st;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	int reclen = 0;
+	struct p9_rdir *rdir;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
@@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		while (rdir->head < rdir->tail) {
 			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
-				buflen - rdir->head, &st,
+				rdir->tail - rdir->head, &st,
 				fid->clnt->proto_version);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
@@ -176,6 +196,88 @@ exit:
 	return err;
 }
 
+/**
+ * v9fs_dir_readdir_dotl - read a directory
+ * @filp: opened file structure
+ * @dirent: buffer to fill dirent structures
+ * @filldir: function to populate dirent structures
+ *
+ */
+static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
+						filldir_t filldir)
+{
+	int over;
+	int err = 0;
+	struct p9_fid *fid;
+	int buflen;
+	struct p9_rdir *rdir;
+	struct p9_dirent curdirent;
+	u64 oldoffset = 0;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	fid = filp->private_data;
+
+	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
+
+	err = v9fs_alloc_rdir_buf(filp, buflen);
+	if (err)
+		goto exit;
+	rdir = (struct p9_rdir *) fid->rdir;
+
+	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
+
+	while (err == 0) {
+		if (rdir->tail == rdir->head) {
+			err = p9_client_readdir(fid, rdir->buf, buflen,
+						filp->f_pos);
+			if (err <= 0)
+				goto unlock_and_exit;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+
+			err = p9dirent_read(rdir->buf + rdir->head,
+					buflen - rdir->head, &curdirent,
+					fid->clnt->proto_version);
+			if (err < 0) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+				err = -EIO;
+				goto unlock_and_exit;
+			}
+
+			/* d_off in dirent structure tracks the offset into
+			 * the next dirent in the dir. However, filldir()
+			 * expects offset into the current dirent. Hence
+			 * while calling filldir send the offset from the
+			 * previous dirent structure.
+			 */
+			over = filldir(dirent, curdirent.d_name,
+					strlen(curdirent.d_name),
+					oldoffset, v9fs_qid2ino(&curdirent.qid),
+					curdirent.d_type);
+			oldoffset = curdirent.d_off;
+
+			if (over) {
+				err = 0;
+				goto unlock_and_exit;
+			}
+
+			filp->f_pos = curdirent.d_off;
+			rdir->head += err;
+		}
+	}
+
+unlock_and_exit:
+	mutex_unlock(&rdir->mutex);
+exit:
+	return err;
+}
+
 
 /**
  * v9fs_dir_release - close a directory
@@ -190,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 	fid = filp->private_data;
 	P9_DPRINTK(P9_DEBUG_VFS,
-			"inode: %p filp: %p fid: %d\n", inode, filp, fid->fid);
+			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
+			inode, filp, fid ? fid->fid : -1);
 	filemap_write_and_wait(inode->i_mapping);
-	p9_client_clunk(fid);
+	if (fid)
+		p9_client_clunk(fid);
 	return 0;
 }
 
@@ -203,3 +307,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir_dotl,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..e97c92bd6f16 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct p9_fid *fid;
 	int omode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9ses = v9fs_inode2v9ses(inode);
-	omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses));
+	if (v9fs_proto_dotl(v9ses))
+		omode = file->f_flags;
+	else
+		omode = v9fs_uflags2omode(file->f_flags,
+					v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
 		fid = v9fs_fid_clone(file->f_path.dentry);
@@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			p9_client_clunk(fid);
 			return err;
 		}
-		if (omode & P9_OTRUNC) {
+		if (file->f_flags & O_TRUNC) {
 			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
-		if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses)))
+		if ((file->f_flags & O_APPEND) &&
+			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
 	}
 
@@ -139,7 +144,7 @@ ssize_t
 v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 	       u64 offset)
 {
-	int n, total;
+	int n, total, size;
 	struct p9_fid *fid = filp->private_data;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
@@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 
 	n = 0;
 	total = 0;
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
 	do {
 		n = p9_client_read(fid, data, udata, offset, count);
 		if (n <= 0)
@@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 		offset += n;
 		count -= n;
 		total += n;
-	} while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+	} while (count > 0 && n == size);
 
 	if (n < 0)
 		total = n;
@@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 {
 	int ret;
 	struct p9_fid *fid;
+	size_t size;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
-	if (count > (fid->clnt->msize - P9_IOHDRSZ))
+	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
+	if (count > size)
 		ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
 	else
 		ret = p9_client_read(fid, NULL, udata, *offset, count);
@@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	fid = filp->private_data;
 	clnt = fid->clnt;
 
-	rsize = fid->iounit;
-	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
-		rsize = clnt->msize - P9_IOHDRSZ;
+	rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ;
 
 	do {
 		if (count < rsize)
@@ -257,15 +263,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-					int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-						dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -296,3 +300,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..9e670d527646 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -35,6 +35,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/xattr.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -42,11 +43,15 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 #include "cache.h"
+#include "xattr.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -233,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode)
 #endif
 
 /**
+ * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a
+ * new file system object. This checks the S_ISGID to determine the owning
+ * group of the new file system object.
+ */
+
+static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+{
+	BUG_ON(dir_inode == NULL);
+
+	if (dir_inode->i_mode & S_ISGID) {
+		/* set_gid bit is set.*/
+		return dir_inode->i_gid;
+	}
+	return current_fsgid();
+}
+
+/**
+ * v9fs_dentry_from_dir_inode - helper function to get the dentry from
+ * dir inode.
+ *
+ */
+
+static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	/* Directory should have only one entry. */
+	BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
+	dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	spin_unlock(&dcache_lock);
+	return dentry;
+}
+
+/**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
  * @mode: mode to setup inode with
@@ -253,9 +293,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -266,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFBLK:
 	case S_IFCHR:
 	case S_IFSOCK:
-		if (!v9fs_proto_dotu(v9ses)) {
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else if (v9fs_proto_dotu(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		} else {
 			P9_DPRINTK(P9_DEBUG_ERROR,
 				   "special files without extended mode\n");
 			err = -EINVAL;
@@ -275,25 +319,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -367,8 +430,10 @@ error:
  * @inode: inode to release
  *
  */
-void v9fs_clear_inode(struct inode *inode)
+void v9fs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(inode->i_mapping, 0);
+	end_writeback(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
 #ifdef CONFIG_9P_FSCACHE
@@ -376,23 +441,14 @@ void v9fs_evict_inode(struct inode *inode)
 #endif
 }
 
-/**
- * v9fs_inode_from_fid - populate an inode by issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-
 static struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
 	int err, umode;
-	struct inode *ret;
+	struct inode *ret = NULL;
 	struct p9_wstat *st;
 
-	ret = NULL;
 	st = p9_client_stat(fid);
 	if (IS_ERR(st))
 		return ERR_CAST(st);
@@ -413,15 +469,62 @@ v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 #endif
 	p9stat_free(st);
 	kfree(st);
-
 	return ret;
-
 error:
 	p9stat_free(st);
 	kfree(st);
 	return ERR_PTR(err);
 }
 
+static struct inode *
+v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	struct super_block *sb)
+{
+	struct inode *ret = NULL;
+	int err;
+	struct p9_stat_dotl *st;
+
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	ret = v9fs_get_inode(sb, st->st_mode);
+	if (IS_ERR(ret)) {
+		err = PTR_ERR(ret);
+		goto error;
+	}
+
+	v9fs_stat2inode_dotl(st, ret);
+	ret->i_ino = v9fs_qid2ino(&st->qid);
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_cache_inode_get_cookie(ret);
+#endif
+	kfree(st);
+	return ret;
+error:
+	kfree(st);
+	return ERR_PTR(err);
+}
+
+/**
+ * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	struct super_block *sb)
+{
+	if (v9fs_proto_dotl(v9ses))
+		return v9fs_inode_dotl(v9ses, fid, sb);
+	else
+		return v9fs_inode(v9ses, fid, sb);
+}
+
 /**
  * v9fs_remove - helper function to remove files and directories
  * @dir: directory inode that is being deleted
@@ -434,14 +537,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
434{ 537{
435 int retval; 538 int retval;
436 struct inode *file_inode; 539 struct inode *file_inode;
437 struct v9fs_session_info *v9ses;
438 struct p9_fid *v9fid; 540 struct p9_fid *v9fid;
439 541
440 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 542 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
441 rmdir); 543 rmdir);
442 544
443 file_inode = file->d_inode; 545 file_inode = file->d_inode;
444 v9ses = v9fs_inode2v9ses(file_inode);
445 v9fid = v9fs_fid_clone(file); 546 v9fid = v9fs_fid_clone(file);
446 if (IS_ERR(v9fid)) 547 if (IS_ERR(v9fid))
447 return PTR_ERR(v9fid); 548 return PTR_ERR(v9fid);
@@ -484,12 +585,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
484 ofid = NULL; 585 ofid = NULL;
485 fid = NULL; 586 fid = NULL;
486 name = (char *) dentry->d_name.name; 587 name = (char *) dentry->d_name.name;
487 dfid = v9fs_fid_clone(dentry->d_parent); 588 dfid = v9fs_fid_lookup(dentry->d_parent);
488 if (IS_ERR(dfid)) { 589 if (IS_ERR(dfid)) {
489 err = PTR_ERR(dfid); 590 err = PTR_ERR(dfid);
490 P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
491 dfid = NULL; 592 return ERR_PTR(err);
492 goto error;
493 } 593 }
494 594
495 /* clone a fid to use for creation */ 595 /* clone a fid to use for creation */
@@ -497,8 +597,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
497 if (IS_ERR(ofid)) { 597 if (IS_ERR(ofid)) {
498 err = PTR_ERR(ofid); 598 err = PTR_ERR(ofid);
499 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 599 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
500 ofid = NULL; 600 return ERR_PTR(err);
501 goto error;
502 } 601 }
503 602
504 err = p9_client_fcreate(ofid, name, perm, mode, extension); 603 err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +607,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
508 } 607 }
509 608
510 /* now walk from the parent so we can get unopened fid */ 609 /* now walk from the parent so we can get unopened fid */
511 fid = p9_client_walk(dfid, 1, &name, 0); 610 fid = p9_client_walk(dfid, 1, &name, 1);
512 if (IS_ERR(fid)) { 611 if (IS_ERR(fid)) {
513 err = PTR_ERR(fid); 612 err = PTR_ERR(fid);
514 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 613 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
515 fid = NULL; 614 fid = NULL;
516 goto error; 615 goto error;
517 } else 616 }
518 dfid = NULL;
519 617
520 /* instantiate inode and assign the unopened fid to the dentry */ 618 /* instantiate inode and assign the unopened fid to the dentry */
521 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 619 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +636,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
538 return ofid; 636 return ofid;
539 637
540error: 638error:
541 if (dfid)
542 p9_client_clunk(dfid);
543
544 if (ofid) 639 if (ofid)
545 p9_client_clunk(ofid); 640 p9_client_clunk(ofid);
546 641
@@ -551,6 +646,121 @@ error:
551} 646}
552 647
553/** 648/**
649 * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
 650 * @dir: inode of the parent directory
 651 * @dentry: dentry of the file being created
652 * @mode: create permissions
653 * @nd: path information
654 *
655 */
656
657static int
658v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode,
659 struct nameidata *nd)
660{
661 int err = 0;
662 char *name = NULL;
663 gid_t gid;
664 int flags;
665 struct v9fs_session_info *v9ses;
666 struct p9_fid *fid = NULL;
667 struct p9_fid *dfid, *ofid;
668 struct file *filp;
669 struct p9_qid qid;
670 struct inode *inode;
671
672 v9ses = v9fs_inode2v9ses(dir);
673 if (nd && nd->flags & LOOKUP_OPEN)
674 flags = nd->intent.open.flags - 1;
675 else
676 flags = O_RDWR;
677
678 name = (char *) dentry->d_name.name;
679 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x "
680 "mode:0x%x\n", name, flags, mode);
681
682 dfid = v9fs_fid_lookup(dentry->d_parent);
683 if (IS_ERR(dfid)) {
684 err = PTR_ERR(dfid);
685 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
686 return err;
687 }
688
689 /* clone a fid to use for creation */
690 ofid = p9_client_walk(dfid, 0, NULL, 1);
691 if (IS_ERR(ofid)) {
692 err = PTR_ERR(ofid);
693 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
694 return err;
695 }
696
697 gid = v9fs_get_fsgid_for_create(dir);
698 err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid);
699 if (err < 0) {
700 P9_DPRINTK(P9_DEBUG_VFS,
701 "p9_client_open_dotl failed in creat %d\n",
702 err);
703 goto error;
704 }
705
706 /* No need to populate the inode if we are not opening the file AND
707 * not in cached mode.
708 */
709 if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) {
710 /* Not in cached mode. No need to populate inode with stat */
711 dentry->d_op = &v9fs_dentry_operations;
712 p9_client_clunk(ofid);
713 d_instantiate(dentry, NULL);
714 return 0;
715 }
716
717 /* Now walk from the parent so we can get an unopened fid. */
718 fid = p9_client_walk(dfid, 1, &name, 1);
719 if (IS_ERR(fid)) {
720 err = PTR_ERR(fid);
721 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
722 fid = NULL;
723 goto error;
724 }
725
726 /* instantiate inode and assign the unopened fid to dentry */
727 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
728 if (IS_ERR(inode)) {
729 err = PTR_ERR(inode);
730 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
731 goto error;
732 }
733 if (v9ses->cache)
734 dentry->d_op = &v9fs_cached_dentry_operations;
735 else
736 dentry->d_op = &v9fs_dentry_operations;
737 d_instantiate(dentry, inode);
738 err = v9fs_fid_add(dentry, fid);
739 if (err < 0)
740 goto error;
741
742 /* if we are opening a file, assign the open fid to the file */
743 if (nd && nd->flags & LOOKUP_OPEN) {
744 filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created);
745 if (IS_ERR(filp)) {
746 p9_client_clunk(ofid);
747 return PTR_ERR(filp);
748 }
749 filp->private_data = ofid;
750 } else
751 p9_client_clunk(ofid);
752
753 return 0;
754
755error:
756 if (ofid)
757 p9_client_clunk(ofid);
758 if (fid)
759 p9_client_clunk(fid);
760 return err;
761}
762
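[Editorial note on the `flags = nd->intent.open.flags - 1` conversion near the top of v9fs_vfs_create_dotl(): on kernels of this vintage the VFS stores the open intent with the access mode offset by one (O_RDONLY/O_WRONLY/O_RDWR become 1/2/3 so they can be mask-tested), and subtracting one recovers the O_* value the 9P server expects. This is an assumption about the namei encoding, sketched as a hypothetical helper:

	/* hypothetical helper; assumes the pre-2.6.37 namei flag encoding */
	static inline int v9fs_open_flags_from_intent(struct nameidata *nd)
	{
		if (nd && (nd->flags & LOOKUP_OPEN))
			return nd->intent.open.flags - 1;	/* namei -> O_* */
		return O_RDWR;			/* same default as the code above */
	}
]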
763/**
554 * v9fs_vfs_create - VFS hook to create files 764 * v9fs_vfs_create - VFS hook to create files
555 * @dir: directory inode that is being created 765 * @dir: directory inode that is being created
556 * @dentry: dentry that is being deleted 766 * @dentry: dentry that is being deleted
@@ -640,6 +850,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
640 return err; 850 return err;
641} 851}
642 852
853
854/**
855 * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
 856 * @dir: inode of the parent directory
 857 * @dentry: dentry of the directory being created
858 * @mode: mode for new directory
859 *
860 */
861
862static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry,
863 int mode)
864{
865 int err;
866 struct v9fs_session_info *v9ses;
867 struct p9_fid *fid = NULL, *dfid = NULL;
868 gid_t gid;
869 char *name;
870 struct inode *inode;
871 struct p9_qid qid;
872 struct dentry *dir_dentry;
873
874 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
875 err = 0;
876 v9ses = v9fs_inode2v9ses(dir);
877
878 mode |= S_IFDIR;
879 dir_dentry = v9fs_dentry_from_dir_inode(dir);
880 dfid = v9fs_fid_lookup(dir_dentry);
881 if (IS_ERR(dfid)) {
882 err = PTR_ERR(dfid);
883 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
884 dfid = NULL;
885 goto error;
886 }
887
888 gid = v9fs_get_fsgid_for_create(dir);
889 if (gid < 0) {
890 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
891 goto error;
892 }
893
894 name = (char *) dentry->d_name.name;
895 err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid);
896 if (err < 0)
897 goto error;
898
899 /* instantiate inode and assign the unopened fid to the dentry */
900 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
901 fid = p9_client_walk(dfid, 1, &name, 1);
902 if (IS_ERR(fid)) {
903 err = PTR_ERR(fid);
904 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
905 err);
906 fid = NULL;
907 goto error;
908 }
909
910 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
911 if (IS_ERR(inode)) {
912 err = PTR_ERR(inode);
913 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
914 err);
915 goto error;
916 }
917 dentry->d_op = &v9fs_cached_dentry_operations;
918 d_instantiate(dentry, inode);
919 err = v9fs_fid_add(dentry, fid);
920 if (err < 0)
921 goto error;
922 fid = NULL;
923 }
924error:
925 if (fid)
926 p9_client_clunk(fid);
927 return err;
928}
929
643/** 930/**
644 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode 931 * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
645 * @dir: inode that is being walked from 932 * @dir: inode that is being walked from
@@ -666,6 +953,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
666 953
667 sb = dir->i_sb; 954 sb = dir->i_sb;
668 v9ses = v9fs_inode2v9ses(dir); 955 v9ses = v9fs_inode2v9ses(dir);
956 /* We can walk d_parent because we hold the dir->i_mutex */
669 dfid = v9fs_fid_lookup(dentry->d_parent); 957 dfid = v9fs_fid_lookup(dentry->d_parent);
670 if (IS_ERR(dfid)) 958 if (IS_ERR(dfid))
671 return ERR_CAST(dfid); 959 return ERR_CAST(dfid);
@@ -675,8 +963,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
675 if (IS_ERR(fid)) { 963 if (IS_ERR(fid)) {
676 result = PTR_ERR(fid); 964 result = PTR_ERR(fid);
677 if (result == -ENOENT) { 965 if (result == -ENOENT) {
678 d_add(dentry, NULL); 966 inode = NULL;
679 return NULL; 967 goto inst_out;
680 } 968 }
681 969
682 return ERR_PTR(result); 970 return ERR_PTR(result);
@@ -693,7 +981,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
693 if (result < 0) 981 if (result < 0)
694 goto error; 982 goto error;
695 983
696 if ((fid->qid.version) && (v9ses->cache)) 984inst_out:
985 if (v9ses->cache)
697 dentry->d_op = &v9fs_cached_dentry_operations; 986 dentry->d_op = &v9fs_cached_dentry_operations;
698 else 987 else
699 dentry->d_op = &v9fs_dentry_operations; 988 dentry->d_op = &v9fs_dentry_operations;
@@ -772,20 +1061,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
772 goto clunk_olddir; 1061 goto clunk_olddir;
773 } 1062 }
774 1063
775 /* 9P can only handle file rename in the same directory */ 1064 down_write(&v9ses->rename_sem);
776 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 1065 if (v9fs_proto_dotl(v9ses)) {
1066 retval = p9_client_rename(oldfid, newdirfid,
1067 (char *) new_dentry->d_name.name);
1068 if (retval != -ENOSYS)
1069 goto clunk_newdir;
1070 }
1071 if (old_dentry->d_parent != new_dentry->d_parent) {
1072 /*
1073 * 9P .u can only handle file rename in the same directory
1074 */
1075
777 P9_DPRINTK(P9_DEBUG_ERROR, 1076 P9_DPRINTK(P9_DEBUG_ERROR,
778 "old dir and new dir are different\n"); 1077 "old dir and new dir are different\n");
779 retval = -EXDEV; 1078 retval = -EXDEV;
780 goto clunk_newdir; 1079 goto clunk_newdir;
781 } 1080 }
782
783 v9fs_blank_wstat(&wstat); 1081 v9fs_blank_wstat(&wstat);
784 wstat.muid = v9ses->uname; 1082 wstat.muid = v9ses->uname;
785 wstat.name = (char *) new_dentry->d_name.name; 1083 wstat.name = (char *) new_dentry->d_name.name;
786 retval = p9_client_wstat(oldfid, &wstat); 1084 retval = p9_client_wstat(oldfid, &wstat);
787 1085
788clunk_newdir: 1086clunk_newdir:
1087 if (!retval)
1088 /* successful rename */
1089 d_move(old_dentry, new_dentry);
1090 up_write(&v9ses->rename_sem);
789 p9_client_clunk(newdirfid); 1091 p9_client_clunk(newdirfid);
790 1092
791clunk_olddir: 1093clunk_olddir:
@@ -829,6 +1131,43 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
829 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); 1131 v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
830 generic_fillattr(dentry->d_inode, stat); 1132 generic_fillattr(dentry->d_inode, stat);
831 1133
1134 p9stat_free(st);
1135 kfree(st);
1136 return 0;
1137}
1138
1139static int
1140v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
1141 struct kstat *stat)
1142{
1143 int err;
1144 struct v9fs_session_info *v9ses;
1145 struct p9_fid *fid;
1146 struct p9_stat_dotl *st;
1147
1148 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
1149 err = -EPERM;
1150 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1151 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
1152 return simple_getattr(mnt, dentry, stat);
1153
1154 fid = v9fs_fid_lookup(dentry);
1155 if (IS_ERR(fid))
1156 return PTR_ERR(fid);
1157
1158 /* Ask for all the fields in the stat structure. The server will
1159 * return whatever it supports.
1160 */
1161
1162 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
1163 if (IS_ERR(st))
1164 return PTR_ERR(st);
1165
1166 v9fs_stat2inode_dotl(st, dentry->d_inode);
1167 generic_fillattr(dentry->d_inode, stat);
1168 /* Change block size to what the server returned */
1169 stat->blksize = st->st_blksize;
1170
832 kfree(st); 1171 kfree(st);
833 return 0; 1172 return 0;
834} 1173}
@@ -876,10 +1215,71 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
876 } 1215 }
877 1216
878 retval = p9_client_wstat(fid, &wstat); 1217 retval = p9_client_wstat(fid, &wstat);
879 if (retval >= 0) 1218 if (retval < 0)
880 retval = inode_setattr(dentry->d_inode, iattr); 1219 return retval;
1220
1221 if ((iattr->ia_valid & ATTR_SIZE) &&
1222 iattr->ia_size != i_size_read(dentry->d_inode)) {
1223 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1224 if (retval)
1225 return retval;
1226 }
881 1227
882 return retval; 1228 setattr_copy(dentry->d_inode, iattr);
1229 mark_inode_dirty(dentry->d_inode);
1230 return 0;
1231}
1232
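[Editorial note: the rewrite above tracks the 2.6.36 removal of inode_setattr(). Truncation is now the filesystem's explicit job, after which setattr_copy() transfers the validated attributes and the inode is marked dirty. The same sequence reappears in v9fs_vfs_setattr_dotl() below and in the affs hunk later in this diff; roughly, what the old single call used to bundle:

	/* what inode_setattr() used to do implicitly, now spelled out */
	if ((iattr->ia_valid & ATTR_SIZE) &&
	    iattr->ia_size != i_size_read(inode))
		vmtruncate(inode, iattr->ia_size);	/* explicit truncation */
	setattr_copy(inode, iattr);		/* copy in-core attributes */
	mark_inode_dirty(inode);
]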
1233/**
1234 * v9fs_vfs_setattr_dotl - set file metadata
1235 * @dentry: file whose metadata to set
1236 * @iattr: metadata assignment structure
1237 *
1238 */
1239
1240static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
1241{
1242 int retval;
1243 struct v9fs_session_info *v9ses;
1244 struct p9_fid *fid;
1245 struct p9_iattr_dotl p9attr;
1246
1247 P9_DPRINTK(P9_DEBUG_VFS, "\n");
1248
1249 retval = inode_change_ok(dentry->d_inode, iattr);
1250 if (retval)
1251 return retval;
1252
1253 p9attr.valid = iattr->ia_valid;
1254 p9attr.mode = iattr->ia_mode;
1255 p9attr.uid = iattr->ia_uid;
1256 p9attr.gid = iattr->ia_gid;
1257 p9attr.size = iattr->ia_size;
1258 p9attr.atime_sec = iattr->ia_atime.tv_sec;
1259 p9attr.atime_nsec = iattr->ia_atime.tv_nsec;
1260 p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
1261 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
1262
1263 retval = -EPERM;
1264 v9ses = v9fs_inode2v9ses(dentry->d_inode);
1265 fid = v9fs_fid_lookup(dentry);
1266 if (IS_ERR(fid))
1267 return PTR_ERR(fid);
1268
1269 retval = p9_client_setattr(fid, &p9attr);
1270 if (retval < 0)
1271 return retval;
1272
1273 if ((iattr->ia_valid & ATTR_SIZE) &&
1274 iattr->ia_size != i_size_read(dentry->d_inode)) {
1275 retval = vmtruncate(dentry->d_inode, iattr->ia_size);
1276 if (retval)
1277 return retval;
1278 }
1279
1280 setattr_copy(dentry->d_inode, iattr);
1281 mark_inode_dirty(dentry->d_inode);
1282 return 0;
883} 1283}
884 1284
885/** 1285/**
@@ -960,6 +1360,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
960} 1360}
961 1361
962/** 1362/**
1363 * v9fs_stat2inode_dotl - populate an inode structure with stat info
1364 * @stat: stat structure
1365 * @inode: inode to populate
1366 * Unlike v9fs_stat2inode(), no superblock argument is needed.
1367 *
1368 */
1369
1370void
1371v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
1372{
1373
1374 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
1375 inode->i_atime.tv_sec = stat->st_atime_sec;
1376 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1377 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1378 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1379 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1380 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1381 inode->i_uid = stat->st_uid;
1382 inode->i_gid = stat->st_gid;
1383 inode->i_nlink = stat->st_nlink;
1384 inode->i_mode = stat->st_mode;
1385 inode->i_rdev = new_decode_dev(stat->st_rdev);
1386
1387 if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode)))
1388 init_special_inode(inode, inode->i_mode, inode->i_rdev);
1389
1390 i_size_write(inode, stat->st_size);
1391 inode->i_blocks = stat->st_blocks;
1392 } else {
1393 if (stat->st_result_mask & P9_STATS_ATIME) {
1394 inode->i_atime.tv_sec = stat->st_atime_sec;
1395 inode->i_atime.tv_nsec = stat->st_atime_nsec;
1396 }
1397 if (stat->st_result_mask & P9_STATS_MTIME) {
1398 inode->i_mtime.tv_sec = stat->st_mtime_sec;
1399 inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
1400 }
1401 if (stat->st_result_mask & P9_STATS_CTIME) {
1402 inode->i_ctime.tv_sec = stat->st_ctime_sec;
1403 inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
1404 }
1405 if (stat->st_result_mask & P9_STATS_UID)
1406 inode->i_uid = stat->st_uid;
1407 if (stat->st_result_mask & P9_STATS_GID)
1408 inode->i_gid = stat->st_gid;
1409 if (stat->st_result_mask & P9_STATS_NLINK)
1410 inode->i_nlink = stat->st_nlink;
1411 if (stat->st_result_mask & P9_STATS_MODE) {
1412 inode->i_mode = stat->st_mode;
1413 if ((S_ISBLK(inode->i_mode)) ||
1414 (S_ISCHR(inode->i_mode)))
1415 init_special_inode(inode, inode->i_mode,
1416 inode->i_rdev);
1417 }
1418 if (stat->st_result_mask & P9_STATS_RDEV)
1419 inode->i_rdev = new_decode_dev(stat->st_rdev);
1420 if (stat->st_result_mask & P9_STATS_SIZE)
1421 i_size_write(inode, stat->st_size);
1422 if (stat->st_result_mask & P9_STATS_BLOCKS)
1423 inode->i_blocks = stat->st_blocks;
1424 }
1425 if (stat->st_result_mask & P9_STATS_GEN)
1426 inode->i_generation = stat->st_gen;
1427
1428 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
1429 * because the inode structure does not have fields for them.
1430 */
1431}
1432
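[Editorial note: because v9fs_stat2inode_dotl() honors st_result_mask field by field, a caller may ask for less than P9_STATS_ALL and still apply the reply safely. A hedged usage sketch using only names that appear in this file:

	struct p9_stat_dotl *st;

	/* request only mode and size; the server may return even less */
	st = p9_client_getattr_dotl(fid, P9_STATS_MODE | P9_STATS_SIZE);
	if (!IS_ERR(st)) {
		v9fs_stat2inode_dotl(st, inode);  /* applies only returned fields */
		kfree(st);
	}
]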
1433/**
963 * v9fs_qid2ino - convert qid into inode number 1434 * v9fs_qid2ino - convert qid into inode number
964 * @qid: qid to hash 1435 * @qid: qid to hash
965 * 1436 *
@@ -1002,7 +1473,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1002 if (IS_ERR(fid)) 1473 if (IS_ERR(fid))
1003 return PTR_ERR(fid); 1474 return PTR_ERR(fid);
1004 1475
1005 if (!v9fs_proto_dotu(v9ses)) 1476 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))
1006 return -EBADF; 1477 return -EBADF;
1007 1478
1008 st = p9_client_stat(fid); 1479 st = p9_client_stat(fid);
@@ -1022,6 +1493,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1022 1493
1023 retval = strnlen(buffer, buflen); 1494 retval = strnlen(buffer, buflen);
1024done: 1495done:
1496 p9stat_free(st);
1025 kfree(st); 1497 kfree(st);
1026 return retval; 1498 return retval;
1027} 1499}
@@ -1108,6 +1580,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1108} 1580}
1109 1581
1110/** 1582/**
1583 * v9fs_vfs_symlink_dotl - helper function to create symlinks
1584 * @dir: directory inode containing symlink
1585 * @dentry: dentry for symlink
1586 * @symname: symlink data
1587 *
1588 * See Also: 9P2000.L RFC for more information
1589 *
1590 */
1591
1592static int
1593v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
1594 const char *symname)
1595{
1596 struct v9fs_session_info *v9ses;
1597 struct p9_fid *dfid;
1598 struct p9_fid *fid = NULL;
1599 struct inode *inode;
1600 struct p9_qid qid;
1601 char *name;
1602 int err;
1603 gid_t gid;
1604
1605 name = (char *) dentry->d_name.name;
1606 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
1607 dir->i_ino, name, symname);
1608 v9ses = v9fs_inode2v9ses(dir);
1609
1610 dfid = v9fs_fid_lookup(dentry->d_parent);
1611 if (IS_ERR(dfid)) {
1612 err = PTR_ERR(dfid);
1613 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1614 return err;
1615 }
1616
1617 gid = v9fs_get_fsgid_for_create(dir);
1618
1619 if (gid < 0) {
1620 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed %d\n", gid);
1621 goto error;
1622 }
1623
1624 /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */
1625 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
1626
1627 if (err < 0) {
1628 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
1629 goto error;
1630 }
1631
1632 if (v9ses->cache) {
1633 /* Now walk from the parent so we can get an unopened fid. */
1634 fid = p9_client_walk(dfid, 1, &name, 1);
1635 if (IS_ERR(fid)) {
1636 err = PTR_ERR(fid);
1637 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1638 err);
1639 fid = NULL;
1640 goto error;
1641 }
1642
1643 /* instantiate inode and assign the unopened fid to dentry */
1644 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1645 if (IS_ERR(inode)) {
1646 err = PTR_ERR(inode);
1647 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1648 err);
1649 goto error;
1650 }
1651 dentry->d_op = &v9fs_cached_dentry_operations;
1652 d_instantiate(dentry, inode);
1653 err = v9fs_fid_add(dentry, fid);
1654 if (err < 0)
1655 goto error;
1656 fid = NULL;
1657 } else {
1658 /* Not in cached mode. No need to populate inode with stat */
1659 inode = v9fs_get_inode(dir->i_sb, S_IFLNK);
1660 if (IS_ERR(inode)) {
1661 err = PTR_ERR(inode);
1662 goto error;
1663 }
1664 dentry->d_op = &v9fs_dentry_operations;
1665 d_instantiate(dentry, inode);
1666 }
1667
1668error:
1669 if (fid)
1670 p9_client_clunk(fid);
1671
1672 return err;
1673}
1674
1675/**
1111 * v9fs_vfs_symlink - helper function to create symlinks 1676 * v9fs_vfs_symlink - helper function to create symlinks
1112 * @dir: directory inode containing symlink 1677 * @dir: directory inode containing symlink
1113 * @dentry: dentry for symlink 1678 * @dentry: dentry for symlink
@@ -1166,6 +1731,76 @@ clunk_fid:
1166} 1731}
1167 1732
1168/** 1733/**
1734 * v9fs_vfs_link_dotl - create a hardlink for dotl
1735 * @old_dentry: dentry for file to link to
1736 * @dir: inode destination for new link
1737 * @dentry: dentry for link
1738 *
1739 */
1740
1741static int
1742v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
1743 struct dentry *dentry)
1744{
1745 int err;
1746 struct p9_fid *dfid, *oldfid;
1747 char *name;
1748 struct v9fs_session_info *v9ses;
1749 struct dentry *dir_dentry;
1750
1751 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
1752 dir->i_ino, old_dentry->d_name.name,
1753 dentry->d_name.name);
1754
1755 v9ses = v9fs_inode2v9ses(dir);
1756 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1757 dfid = v9fs_fid_lookup(dir_dentry);
1758 if (IS_ERR(dfid))
1759 return PTR_ERR(dfid);
1760
1761 oldfid = v9fs_fid_lookup(old_dentry);
1762 if (IS_ERR(oldfid))
1763 return PTR_ERR(oldfid);
1764
1765 name = (char *) dentry->d_name.name;
1766
1767 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
1768
1769 if (err < 0) {
1770 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
1771 return err;
1772 }
1773
1774 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1775 /* Get the latest stat info from server. */
1776 struct p9_fid *fid;
1777 struct p9_stat_dotl *st;
1778
1779 fid = v9fs_fid_lookup(old_dentry);
1780 if (IS_ERR(fid))
1781 return PTR_ERR(fid);
1782
1783 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
1784 if (IS_ERR(st))
1785 return PTR_ERR(st);
1786
1787 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
1788
1789 kfree(st);
1790 } else {
1791 /* Caching disabled. No need to get up-to-date stat info.
1792 * This dentry will be released immediately. So, just i_count++
1793 */
1794 atomic_inc(&old_dentry->d_inode->i_count);
1795 }
1796
1797 dentry->d_op = old_dentry->d_op;
1798 d_instantiate(dentry, old_dentry->d_inode);
1799
1800 return err;
1801}
1802
1803/**
1169 * v9fs_vfs_mknod - create a special file 1804 * v9fs_vfs_mknod - create a special file
1170 * @dir: inode destination for new link 1805 * @dir: inode destination for new link
1171 * @dentry: dentry for file 1806 * @dentry: dentry for file
@@ -1197,6 +1832,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1197 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1832 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1198 else if (S_ISFIFO(mode)) 1833 else if (S_ISFIFO(mode))
1199 *name = 0; 1834 *name = 0;
1835 else if (S_ISSOCK(mode))
1836 *name = 0;
1200 else { 1837 else {
1201 __putname(name); 1838 __putname(name);
1202 return -EINVAL; 1839 return -EINVAL;
@@ -1208,7 +1845,101 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1208 return retval; 1845 return retval;
1209} 1846}
1210 1847
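[Editorial note, for contrast with the dotl mknod below: legacy 9P2000.u has no mknod operation, so v9fs_vfs_mknod() above encodes the device in the create extension string ("b major minor" for block, "c major minor" for char, empty for FIFO and, with this hunk, socket). A sketch of the encoding, using the same format the function builds:

	char ext[32];

	if (S_ISBLK(mode))
		snprintf(ext, sizeof(ext), "b %u %u", MAJOR(rdev), MINOR(rdev));
	else if (S_ISCHR(mode))
		snprintf(ext, sizeof(ext), "c %u %u", MAJOR(rdev), MINOR(rdev));
	else
		ext[0] = '\0';	/* FIFO/socket: type is carried in perm bits */
]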
1211static const struct inode_operations v9fs_dir_inode_operations_ext = { 1848/**
1849 * v9fs_vfs_mknod_dotl - create a special file
1850 * @dir: inode of the parent directory
1851 * @dentry: dentry for file
1852 * @mode: mode for creation
1853 * @rdev: device associated with special file
1854 *
1855 */
1856static int
1857v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode,
1858 dev_t rdev)
1859{
1860 int err;
1861 char *name;
1862 struct v9fs_session_info *v9ses;
1863 struct p9_fid *fid = NULL, *dfid = NULL;
1864 struct inode *inode;
1865 gid_t gid;
1866 struct p9_qid qid;
1867 struct dentry *dir_dentry;
1868
1869 P9_DPRINTK(P9_DEBUG_VFS,
1870 " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino,
1871 dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
1872
1873 if (!new_valid_dev(rdev))
1874 return -EINVAL;
1875
1876 v9ses = v9fs_inode2v9ses(dir);
1877 dir_dentry = v9fs_dentry_from_dir_inode(dir);
1878 dfid = v9fs_fid_lookup(dir_dentry);
1879 if (IS_ERR(dfid)) {
1880 err = PTR_ERR(dfid);
1881 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
1882 dfid = NULL;
1883 goto error;
1884 }
1885
1886 gid = v9fs_get_fsgid_for_create(dir);
1887 if (gid < 0) {
1888 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n");
1889 goto error;
1890 }
1891
1892 name = (char *) dentry->d_name.name;
1893
1894 err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid);
1895 if (err < 0)
1896 goto error;
1897
1898 /* instantiate inode and assign the unopened fid to the dentry */
1899 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
1900 fid = p9_client_walk(dfid, 1, &name, 1);
1901 if (IS_ERR(fid)) {
1902 err = PTR_ERR(fid);
1903 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
1904 err);
1905 fid = NULL;
1906 goto error;
1907 }
1908
1909 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
1910 if (IS_ERR(inode)) {
1911 err = PTR_ERR(inode);
1912 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
1913 err);
1914 goto error;
1915 }
1916 dentry->d_op = &v9fs_cached_dentry_operations;
1917 d_instantiate(dentry, inode);
1918 err = v9fs_fid_add(dentry, fid);
1919 if (err < 0)
1920 goto error;
1921 fid = NULL;
1922 } else {
1923 /*
1924 * Not in cached mode. No need to populate inode with stat.
1925 * the socket syscall returns an fd, so we still need to instantiate
1926 */
1927 inode = v9fs_get_inode(dir->i_sb, mode);
1928 if (IS_ERR(inode)) {
1929 err = PTR_ERR(inode);
1930 goto error;
1931 }
1932 dentry->d_op = &v9fs_dentry_operations;
1933 d_instantiate(dentry, inode);
1934 }
1935
1936error:
1937 if (fid)
1938 p9_client_clunk(fid);
1939 return err;
1940}
1941
1942static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1212 .create = v9fs_vfs_create, 1943 .create = v9fs_vfs_create,
1213 .lookup = v9fs_vfs_lookup, 1944 .lookup = v9fs_vfs_lookup,
1214 .symlink = v9fs_vfs_symlink, 1945 .symlink = v9fs_vfs_symlink,
@@ -1222,6 +1953,25 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = {
1222 .setattr = v9fs_vfs_setattr, 1953 .setattr = v9fs_vfs_setattr,
1223}; 1954};
1224 1955
1956static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1957 .create = v9fs_vfs_create_dotl,
1958 .lookup = v9fs_vfs_lookup,
1959 .link = v9fs_vfs_link_dotl,
1960 .symlink = v9fs_vfs_symlink_dotl,
1961 .unlink = v9fs_vfs_unlink,
1962 .mkdir = v9fs_vfs_mkdir_dotl,
1963 .rmdir = v9fs_vfs_rmdir,
1964 .mknod = v9fs_vfs_mknod_dotl,
1965 .rename = v9fs_vfs_rename,
1966 .getattr = v9fs_vfs_getattr_dotl,
1967 .setattr = v9fs_vfs_setattr_dotl,
1968 .setxattr = generic_setxattr,
1969 .getxattr = generic_getxattr,
1970 .removexattr = generic_removexattr,
1971 .listxattr = v9fs_listxattr,
1972
1973};
1974
1225static const struct inode_operations v9fs_dir_inode_operations = { 1975static const struct inode_operations v9fs_dir_inode_operations = {
1226 .create = v9fs_vfs_create, 1976 .create = v9fs_vfs_create,
1227 .lookup = v9fs_vfs_lookup, 1977 .lookup = v9fs_vfs_lookup,
@@ -1239,6 +1989,15 @@ static const struct inode_operations v9fs_file_inode_operations = {
1239 .setattr = v9fs_vfs_setattr, 1989 .setattr = v9fs_vfs_setattr,
1240}; 1990};
1241 1991
1992static const struct inode_operations v9fs_file_inode_operations_dotl = {
1993 .getattr = v9fs_vfs_getattr_dotl,
1994 .setattr = v9fs_vfs_setattr_dotl,
1995 .setxattr = generic_setxattr,
1996 .getxattr = generic_getxattr,
1997 .removexattr = generic_removexattr,
1998 .listxattr = v9fs_listxattr,
1999};
2000
1242static const struct inode_operations v9fs_symlink_inode_operations = { 2001static const struct inode_operations v9fs_symlink_inode_operations = {
1243 .readlink = generic_readlink, 2002 .readlink = generic_readlink,
1244 .follow_link = v9fs_vfs_follow_link, 2003 .follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +2005,15 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
1246 .getattr = v9fs_vfs_getattr, 2005 .getattr = v9fs_vfs_getattr,
1247 .setattr = v9fs_vfs_setattr, 2006 .setattr = v9fs_vfs_setattr,
1248}; 2007};
2008
2009static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
2010 .readlink = generic_readlink,
2011 .follow_link = v9fs_vfs_follow_link,
2012 .put_link = v9fs_vfs_put_link,
2013 .getattr = v9fs_vfs_getattr_dotl,
2014 .setattr = v9fs_vfs_setattr_dotl,
2015 .setxattr = generic_setxattr,
2016 .getxattr = generic_getxattr,
2017 .removexattr = generic_removexattr,
2018 .listxattr = v9fs_listxattr,
2019};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..1d12ba0ed3db 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,14 +38,16 @@
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h>
41#include <net/9p/9p.h> 42#include <net/9p/9p.h>
42#include <net/9p/client.h> 43#include <net/9p/client.h>
43 44
44#include "v9fs.h" 45#include "v9fs.h"
45#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
46#include "fid.h" 47#include "fid.h"
48#include "xattr.h"
47 49
48static const struct super_operations v9fs_super_ops; 50static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
49 51
50/** 52/**
51 * v9fs_set_super - set the superblock 53 * v9fs_set_super - set the superblock
@@ -76,7 +78,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 78 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
77 sb->s_blocksize = 1 << sb->s_blocksize_bits; 79 sb->s_blocksize = 1 << sb->s_blocksize_bits;
78 sb->s_magic = V9FS_MAGIC; 80 sb->s_magic = V9FS_MAGIC;
79 sb->s_op = &v9fs_super_ops; 81 if (v9fs_proto_dotl(v9ses)) {
82 sb->s_op = &v9fs_super_ops_dotl;
83 sb->s_xattr = v9fs_xattr_handlers;
84 } else
85 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi; 86 sb->s_bdi = &v9ses->bdi;
81 87
82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 88 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -103,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
103 struct inode *inode = NULL; 109 struct inode *inode = NULL;
104 struct dentry *root = NULL; 110 struct dentry *root = NULL;
105 struct v9fs_session_info *v9ses = NULL; 111 struct v9fs_session_info *v9ses = NULL;
106 struct p9_wstat *st = NULL;
107 int mode = S_IRWXUGO | S_ISVTX; 112 int mode = S_IRWXUGO | S_ISVTX;
108 struct p9_fid *fid; 113 struct p9_fid *fid;
109 int retval = 0; 114 int retval = 0;
@@ -117,19 +122,17 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
117 fid = v9fs_session_init(v9ses, dev_name, data); 122 fid = v9fs_session_init(v9ses, dev_name, data);
118 if (IS_ERR(fid)) { 123 if (IS_ERR(fid)) {
119 retval = PTR_ERR(fid); 124 retval = PTR_ERR(fid);
125 /*
126 * we need to call session_close to tear down some
 127 * of the data structures set up by session_init
128 */
120 goto close_session; 129 goto close_session;
121 } 130 }
122 131
123 st = p9_client_stat(fid);
124 if (IS_ERR(st)) {
125 retval = PTR_ERR(st);
126 goto clunk_fid;
127 }
128
129 sb = sget(fs_type, NULL, v9fs_set_super, v9ses); 132 sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
130 if (IS_ERR(sb)) { 133 if (IS_ERR(sb)) {
131 retval = PTR_ERR(sb); 134 retval = PTR_ERR(sb);
132 goto free_stat; 135 goto clunk_fid;
133 } 136 }
134 v9fs_fill_super(sb, v9ses, flags, data); 137 v9fs_fill_super(sb, v9ses, flags, data);
135 138
@@ -145,35 +148,53 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
145 retval = -ENOMEM; 148 retval = -ENOMEM;
146 goto release_sb; 149 goto release_sb;
147 } 150 }
148
149 sb->s_root = root; 151 sb->s_root = root;
150 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
151 152
152 v9fs_stat2inode(st, root->d_inode, sb); 153 if (v9fs_proto_dotl(v9ses)) {
154 struct p9_stat_dotl *st = NULL;
155 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
156 if (IS_ERR(st)) {
157 retval = PTR_ERR(st);
158 goto release_sb;
159 }
160
161 v9fs_stat2inode_dotl(st, root->d_inode);
162 kfree(st);
163 } else {
164 struct p9_wstat *st = NULL;
165 st = p9_client_stat(fid);
166 if (IS_ERR(st)) {
167 retval = PTR_ERR(st);
168 goto release_sb;
169 }
170
171 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
172 v9fs_stat2inode(st, root->d_inode, sb);
173
174 p9stat_free(st);
175 kfree(st);
176 }
153 177
154 v9fs_fid_add(root, fid); 178 v9fs_fid_add(root, fid);
155 p9stat_free(st);
156 kfree(st);
157 179
158P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 180 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
159 simple_set_mnt(mnt, sb); 181 simple_set_mnt(mnt, sb);
160 return 0; 182 return 0;
161 183
162free_stat:
163 p9stat_free(st);
164 kfree(st);
165
166clunk_fid: 184clunk_fid:
167 p9_client_clunk(fid); 185 p9_client_clunk(fid);
168
169close_session: 186close_session:
170 v9fs_session_close(v9ses); 187 v9fs_session_close(v9ses);
171 kfree(v9ses); 188 kfree(v9ses);
172 return retval; 189 return retval;
173
174release_sb: 190release_sb:
175 p9stat_free(st); 191 /*
176 kfree(st); 192 * we will do the session_close and root dentry release
 193 * in the call below. But we need to clunk the fid, because we haven't
 194 * attached it to a dentry, so it won't get clunked
195 * automatically.
196 */
197 p9_client_clunk(fid);
177 deactivate_locked_super(sb); 198 deactivate_locked_super(sb);
178 return retval; 199 return retval;
179} 200}
@@ -211,13 +232,60 @@ v9fs_umount_begin(struct super_block *sb)
211 v9fs_session_begin_cancel(v9ses); 232 v9fs_session_begin_cancel(v9ses);
212} 233}
213 234
235static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
236{
237 struct v9fs_session_info *v9ses;
238 struct p9_fid *fid;
239 struct p9_rstatfs rs;
240 int res;
241
242 fid = v9fs_fid_lookup(dentry);
243 if (IS_ERR(fid)) {
244 res = PTR_ERR(fid);
245 goto done;
246 }
247
248 v9ses = v9fs_inode2v9ses(dentry->d_inode);
249 if (v9fs_proto_dotl(v9ses)) {
250 res = p9_client_statfs(fid, &rs);
251 if (res == 0) {
252 buf->f_type = rs.type;
253 buf->f_bsize = rs.bsize;
254 buf->f_blocks = rs.blocks;
255 buf->f_bfree = rs.bfree;
256 buf->f_bavail = rs.bavail;
257 buf->f_files = rs.files;
258 buf->f_ffree = rs.ffree;
259 buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
260 buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
261 buf->f_namelen = rs.namelen;
262 }
263 if (res != -ENOSYS)
264 goto done;
265 }
266 res = simple_statfs(dentry, buf);
267done:
268 return res;
269}
270
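[Editorial note: v9fs_statfs() splits the server's 64-bit fsid across the two 32-bit words of f_fsid. The inverse, for reading it back, would be the usual recombination (a sketch, not code from this commit):

	/* recombine the two 32-bit halves into the server's 64-bit fsid */
	u64 fsid = ((u64)buf->f_fsid.val[1] << 32) | buf->f_fsid.val[0];
]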
214static const struct super_operations v9fs_super_ops = { 271static const struct super_operations v9fs_super_ops = {
215#ifdef CONFIG_9P_FSCACHE 272#ifdef CONFIG_9P_FSCACHE
216 .alloc_inode = v9fs_alloc_inode, 273 .alloc_inode = v9fs_alloc_inode,
217 .destroy_inode = v9fs_destroy_inode, 274 .destroy_inode = v9fs_destroy_inode,
218#endif 275#endif
219 .statfs = simple_statfs, 276 .statfs = simple_statfs,
220 .clear_inode = v9fs_clear_inode, 277 .evict_inode = v9fs_evict_inode,
278 .show_options = generic_show_options,
279 .umount_begin = v9fs_umount_begin,
280};
281
282static const struct super_operations v9fs_super_ops_dotl = {
283#ifdef CONFIG_9P_FSCACHE
284 .alloc_inode = v9fs_alloc_inode,
285 .destroy_inode = v9fs_destroy_inode,
286#endif
287 .statfs = v9fs_statfs,
288 .evict_inode = v9fs_evict_inode,
221 .show_options = generic_show_options, 289 .show_options = generic_show_options,
222 .umount_begin = v9fs_umount_begin, 290 .umount_begin = v9fs_umount_begin,
223}; 291};
@@ -227,4 +295,5 @@ struct file_system_type v9fs_fs_type = {
227 .get_sb = v9fs_get_sb, 295 .get_sb = v9fs_get_sb,
228 .kill_sb = v9fs_kill_super, 296 .kill_sb = v9fs_kill_super,
229 .owner = THIS_MODULE, 297 .owner = THIS_MODULE,
298 .fs_flags = FS_RENAME_DOES_D_MOVE,
230}; 299};
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
new file mode 100644
index 000000000000..f88e5c2dc873
--- /dev/null
+++ b/fs/9p/xattr.c
@@ -0,0 +1,160 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15#include <linux/module.h>
16#include <linux/fs.h>
17#include <linux/sched.h>
18#include <net/9p/9p.h>
19#include <net/9p/client.h>
20
21#include "fid.h"
22#include "xattr.h"
23
24/*
25 * v9fs_xattr_get()
26 *
27 * Copy an extended attribute into the buffer
28 * provided, or compute the buffer size required.
 29 * Pass a NULL buffer to query the required size.
30 *
31 * Returns a negative error number on failure, or the number of bytes
32 * used / required on success.
33 */
34ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
35 void *buffer, size_t buffer_size)
36{
37 ssize_t retval;
38 int msize, read_count;
39 u64 offset = 0, attr_size;
40 struct p9_fid *fid, *attr_fid;
41
42 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n",
43 __func__, name, buffer_size);
44
45 fid = v9fs_fid_lookup(dentry);
46 if (IS_ERR(fid))
47 return PTR_ERR(fid);
48
49 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
50 if (IS_ERR(attr_fid)) {
51 retval = PTR_ERR(attr_fid);
52 P9_DPRINTK(P9_DEBUG_VFS,
53 "p9_client_attrwalk failed %zd\n", retval);
54 attr_fid = NULL;
55 goto error;
56 }
57 if (!buffer_size) {
58 /* request to get the attr_size */
59 retval = attr_size;
60 goto error;
61 }
62 if (attr_size > buffer_size) {
63 retval = -ERANGE;
64 goto error;
65 }
66 msize = attr_fid->clnt->msize;
67 while (attr_size) {
68 if (attr_size > (msize - P9_IOHDRSZ))
69 read_count = msize - P9_IOHDRSZ;
70 else
71 read_count = attr_size;
72 read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
73 NULL, offset, read_count);
74 if (read_count < 0) {
75 /* error in xattr read */
76 retval = read_count;
77 goto error;
78 }
79 offset += read_count;
80 attr_size -= read_count;
81 }
82 /* Total read xattr bytes */
83 retval = offset;
84error:
85 if (attr_fid)
86 p9_client_clunk(attr_fid);
87 return retval;
88
89}
90
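[Editorial note: the NULL-buffer convention above enables the usual two-call pattern: query the size, allocate, then fetch. A hedged sketch ("user.mime_type" is a hypothetical attribute name); note that both this read loop and the write loop in v9fs_xattr_set() below cap each request at msize - P9_IOHDRSZ, the payload limit of one 9P message:

	/* hypothetical caller; error handling abbreviated */
	ssize_t size = v9fs_xattr_get(dentry, "user.mime_type", NULL, 0);
	if (size > 0) {
		void *buf = kmalloc(size, GFP_KERNEL);
		if (buf) {
			size = v9fs_xattr_get(dentry, "user.mime_type",
					      buf, size);
			kfree(buf);
		}
	}
]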
91/*
92 * v9fs_xattr_set()
93 *
94 * Create, replace or remove an extended attribute for this inode. Buffer
95 * is NULL to remove an existing extended attribute, and non-NULL to
96 * either replace an existing extended attribute, or create a new extended
97 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
98 * specify that an extended attribute must exist and must not exist
 99 * prior to the call, respectively.
100 *
101 * Returns 0, or a negative error number on failure.
102 */
103int v9fs_xattr_set(struct dentry *dentry, const char *name,
104 const void *value, size_t value_len, int flags)
105{
106 u64 offset = 0;
107 int retval, msize, write_count;
108 struct p9_fid *fid = NULL;
109
110 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n",
111 __func__, name, value_len, flags);
112
113 fid = v9fs_fid_clone(dentry);
114 if (IS_ERR(fid)) {
115 retval = PTR_ERR(fid);
116 fid = NULL;
117 goto error;
118 }
119 /*
120 * On success fid points to xattr
121 */
122 retval = p9_client_xattrcreate(fid, name, value_len, flags);
123 if (retval < 0) {
124 P9_DPRINTK(P9_DEBUG_VFS,
125 "p9_client_xattrcreate failed %d\n", retval);
126 goto error;
127 }
 128 msize = fid->clnt->msize;
129 while (value_len) {
130 if (value_len > (msize - P9_IOHDRSZ))
131 write_count = msize - P9_IOHDRSZ;
132 else
133 write_count = value_len;
134 write_count = p9_client_write(fid, ((char *)value)+offset,
135 NULL, offset, write_count);
136 if (write_count < 0) {
137 /* error in xattr write */
138 retval = write_count;
139 goto error;
140 }
141 offset += write_count;
142 value_len -= write_count;
143 }
 144 /* success: the xattr interface reports 0, not bytes written */
 145 retval = 0;
 146error:
 147 if (fid)
 148 p9_client_clunk(fid); /* must not clobber retval with clunk's result */
149 return retval;
150}
151
152ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
153{
154 return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
155}
156
157const struct xattr_handler *v9fs_xattr_handlers[] = {
158 &v9fs_xattr_user_handler,
159 NULL
160};
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
new file mode 100644
index 000000000000..9ddf672ae5c4
--- /dev/null
+++ b/fs/9p/xattr.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14#ifndef FS_9P_XATTR_H
15#define FS_9P_XATTR_H
16
17#include <linux/xattr.h>
18
19extern const struct xattr_handler *v9fs_xattr_handlers[];
20extern struct xattr_handler v9fs_xattr_user_handler;
21
22extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
23 void *, size_t);
24extern int v9fs_xattr_set(struct dentry *, const char *,
25 const void *, size_t, int);
26extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
27#endif /* FS_9P_XATTR_H */
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
new file mode 100644
index 000000000000..d0b701b72080
--- /dev/null
+++ b/fs/9p/xattr_user.c
@@ -0,0 +1,80 @@
1/*
2 * Copyright IBM Corporation, 2010
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15
16#include <linux/module.h>
17#include <linux/string.h>
18#include <linux/fs.h>
19#include <linux/slab.h>
20#include "xattr.h"
21
22static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
23 void *buffer, size_t size, int type)
24{
25 int retval;
26 char *full_name;
27 size_t name_len;
28 size_t prefix_len = XATTR_USER_PREFIX_LEN;
29
30 if (name == NULL)
31 return -EINVAL;
32
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35
36 name_len = strlen(name);
 37 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
38 if (!full_name)
39 return -ENOMEM;
40 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
41 memcpy(full_name+prefix_len, name, name_len);
42 full_name[prefix_len + name_len] = '\0';
43
44 retval = v9fs_xattr_get(dentry, full_name, buffer, size);
45 kfree(full_name);
46 return retval;
47}
48
49static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
50 const void *value, size_t size, int flags, int type)
51{
52 int retval;
53 char *full_name;
54 size_t name_len;
55 size_t prefix_len = XATTR_USER_PREFIX_LEN;
56
57 if (name == NULL)
58 return -EINVAL;
59
60 if (strcmp(name, "") == 0)
61 return -EINVAL;
62
63 name_len = strlen(name);
 64 full_name = kmalloc(prefix_len + name_len + 1, GFP_KERNEL);
65 if (!full_name)
66 return -ENOMEM;
67 memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
68 memcpy(full_name + prefix_len, name, name_len);
69 full_name[prefix_len + name_len] = '\0';
70
71 retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
72 kfree(full_name);
73 return retval;
74}
75
76struct xattr_handler v9fs_xattr_user_handler = {
77 .prefix = XATTR_USER_PREFIX,
78 .get = v9fs_xattr_user_get,
79 .set = v9fs_xattr_user_set,
80};
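[Editorial note: how the handler table connects: the VFS matches the "user." prefix of v9fs_xattr_user_handler, strips it, and invokes .get/.set with only the remainder; the handler then rebuilds the full name before calling v9fs_xattr_get()/v9fs_xattr_set(), so the server always sees the prefixed form. A sketch of the rebuild step (equivalent to the kmalloc/memcpy sequence above; "foo" is a hypothetical name):

	/* shorter, equivalent form of the name rebuild done in the handlers */
	char *full_name = kasprintf(GFP_KERNEL, "%s%s",
				    XATTR_USER_PREFIX, "foo");
	if (!full_name)
		return -ENOMEM;
	/* full_name == "user.foo", the name sent over the wire */
]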
diff --git a/fs/Kconfig b/fs/Kconfig
index 5f85b5947613..3d185308ec88 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -64,7 +64,7 @@ source "fs/autofs4/Kconfig"
64source "fs/fuse/Kconfig" 64source "fs/fuse/Kconfig"
65 65
66config CUSE 66config CUSE
67 tristate "Character device in Userpace support" 67 tristate "Character device in Userspace support"
68 depends on FUSE_FS 68 depends on FUSE_FS
69 help 69 help
70 This FUSE extension allows character devices to be 70 This FUSE extension allows character devices to be
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o drop_caches.o splice.o sync.o utimes.o \ 13 pnode.o drop_caches.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
197 .read = generic_read_dir, 197 .read = generic_read_dir,
198 .llseek = generic_file_llseek, 198 .llseek = generic_file_llseek,
199 .readdir = adfs_readdir, 199 .readdir = adfs_readdir,
200 .fsync = simple_fsync, 200 .fsync = generic_file_fsync,
201}; 201};
202 202
203static int 203static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
26 .read = do_sync_read, 26 .read = do_sync_read,
27 .aio_read = generic_file_aio_read, 27 .aio_read = generic_file_aio_read,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .write = do_sync_write, 30 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 31 .aio_write = generic_file_aio_write,
32 .splice_read = generic_file_splice_read, 32 .splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..65794b8fe79e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,10 +50,19 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
50 loff_t pos, unsigned len, unsigned flags, 50 loff_t pos, unsigned len, unsigned flags,
51 struct page **pagep, void **fsdata) 51 struct page **pagep, void **fsdata)
52{ 52{
53 int ret;
54
53 *pagep = NULL; 55 *pagep = NULL;
54 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 56 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
55 adfs_get_block, 57 adfs_get_block,
56 &ADFS_I(mapping->host)->mmu_private); 58 &ADFS_I(mapping->host)->mmu_private);
59 if (unlikely(ret)) {
60 loff_t isize = mapping->host->i_size;
61 if (pos + len > isize)
62 vmtruncate(mapping->host, isize);
63 }
64
65 return ret;
57} 66}
58 67
59static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) 68static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
@@ -322,11 +331,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
322 if (error) 331 if (error)
323 goto out; 332 goto out;
324 333
334 /* XXX: this is missing some actual on-disk truncation.. */
325 if (ia_valid & ATTR_SIZE) 335 if (ia_valid & ATTR_SIZE)
326 error = vmtruncate(inode, attr->ia_size); 336 truncate_setsize(inode, attr->ia_size);
327
328 if (error)
329 goto out;
330 337
331 if (ia_valid & ATTR_MTIME) { 338 if (ia_valid & ATTR_MTIME) {
332 inode->i_mtime = attr->ia_mtime; 339 inode->i_mtime = attr->ia_mtime;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..a8cbdeb34025 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -171,8 +171,7 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
171extern unsigned long affs_parent_ino(struct inode *dir); 171extern unsigned long affs_parent_ino(struct inode *dir);
172extern struct inode *affs_new_inode(struct inode *dir); 172extern struct inode *affs_new_inode(struct inode *dir);
173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); 173extern int affs_notify_change(struct dentry *dentry, struct iattr *attr);
174extern void affs_delete_inode(struct inode *inode); 174extern void affs_evict_inode(struct inode *inode);
175extern void affs_clear_inode(struct inode *inode);
176extern struct inode *affs_iget(struct super_block *sb, 175extern struct inode *affs_iget(struct super_block *sb,
177 unsigned long ino); 176 unsigned long ino);
178extern int affs_write_inode(struct inode *inode, 177extern int affs_write_inode(struct inode *inode,
@@ -183,7 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
183 182
184void affs_free_prealloc(struct inode *inode); 183void affs_free_prealloc(struct inode *inode);
185extern void affs_truncate(struct inode *); 184extern void affs_truncate(struct inode *);
186int affs_file_fsync(struct file *, struct dentry *, int); 185int affs_file_fsync(struct file *, int);
187 186
188/* dir.c */ 187/* dir.c */
189 188
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..c4a9875bd1a6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,10 +406,19 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
406 loff_t pos, unsigned len, unsigned flags, 406 loff_t pos, unsigned len, unsigned flags,
407 struct page **pagep, void **fsdata) 407 struct page **pagep, void **fsdata)
408{ 408{
409 int ret;
410
409 *pagep = NULL; 411 *pagep = NULL;
410 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 412 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
411 affs_get_block, 413 affs_get_block,
412 &AFFS_I(mapping->host)->mmu_private); 414 &AFFS_I(mapping->host)->mmu_private);
415 if (unlikely(ret)) {
416 loff_t isize = mapping->host->i_size;
417 if (pos + len > isize)
418 vmtruncate(mapping->host, isize);
419 }
420
421 return ret;
413} 422}
414 423
415static sector_t _affs_bmap(struct address_space *mapping, sector_t block) 424static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
@@ -916,9 +925,9 @@ affs_truncate(struct inode *inode)
916 affs_free_prealloc(inode); 925 affs_free_prealloc(inode);
917} 926}
918 927
919int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 928int affs_file_fsync(struct file *filp, int datasync)
920{ 929{
921 struct inode * inode = dentry->d_inode; 930 struct inode *inode = filp->f_mapping->host;
922 int ret, err; 931 int ret, err;
923 932
924 ret = write_inode_now(inode, 0); 933 ret = write_inode_now(inode, 0);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index f4b2a4ee4f91..3a0fdec175ba 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -235,31 +235,36 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
235 goto out; 235 goto out;
236 } 236 }
237 237
238 error = inode_setattr(inode, attr); 238 if ((attr->ia_valid & ATTR_SIZE) &&
239 if (!error && (attr->ia_valid & ATTR_MODE)) 239 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size);
241 if (error)
242 return error;
243 }
244
245 setattr_copy(inode, attr);
246 mark_inode_dirty(inode);
247
248 if (attr->ia_valid & ATTR_MODE)
240 mode_to_prot(inode); 249 mode_to_prot(inode);
241out: 250out:
242 return error; 251 return error;
243} 252}
244 253
245void 254void
246affs_delete_inode(struct inode *inode) 255affs_evict_inode(struct inode *inode)
247{
248 pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
249 truncate_inode_pages(&inode->i_data, 0);
250 inode->i_size = 0;
251 affs_truncate(inode);
252 clear_inode(inode);
253 affs_free_block(inode->i_sb, inode->i_ino);
254}
255
256void
257affs_clear_inode(struct inode *inode)
258{ 256{
259 unsigned long cache_page; 257 unsigned long cache_page;
258 pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
259 truncate_inode_pages(&inode->i_data, 0);
260 260
261 pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); 261 if (!inode->i_nlink) {
262 inode->i_size = 0;
263 affs_truncate(inode);
264 }
262 265
266 invalidate_inode_buffers(inode);
267 end_writeback(inode);
263 affs_free_prealloc(inode); 268 affs_free_prealloc(inode);
264 cache_page = (unsigned long)AFFS_I(inode)->i_lc; 269 cache_page = (unsigned long)AFFS_I(inode)->i_lc;
265 if (cache_page) { 270 if (cache_page) {
@@ -271,6 +276,9 @@ affs_clear_inode(struct inode *inode)
271 affs_brelse(AFFS_I(inode)->i_ext_bh); 276 affs_brelse(AFFS_I(inode)->i_ext_bh);
272 AFFS_I(inode)->i_ext_last = ~1; 277 AFFS_I(inode)->i_ext_last = ~1;
273 AFFS_I(inode)->i_ext_bh = NULL; 278 AFFS_I(inode)->i_ext_bh = NULL;
279
280 if (!inode->i_nlink)
281 affs_free_block(inode->i_sb, inode->i_ino);
274} 282}
275 283
276struct inode * 284struct inode *
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
224 affs_brelse(bh); 224 affs_brelse(bh);
225 inode = affs_iget(sb, ino); 225 inode = affs_iget(sb, ino);
226 if (IS_ERR(inode)) 226 if (IS_ERR(inode))
227 return ERR_PTR(PTR_ERR(inode)); 227 return ERR_CAST(inode);
228 } 228 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; 229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 230 d_add(dentry, inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 16a3e4765f68..33c4e7eef470 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -26,7 +26,7 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
26static int affs_remount (struct super_block *sb, int *flags, char *data); 26static int affs_remount (struct super_block *sb, int *flags, char *data);
27 27
28static void 28static void
29affs_commit_super(struct super_block *sb, int clean) 29affs_commit_super(struct super_block *sb, int wait, int clean)
30{ 30{
31 struct affs_sb_info *sbi = AFFS_SB(sb); 31 struct affs_sb_info *sbi = AFFS_SB(sb);
32 struct buffer_head *bh = sbi->s_root_bh; 32 struct buffer_head *bh = sbi->s_root_bh;
@@ -36,6 +36,8 @@ affs_commit_super(struct super_block *sb, int clean)
36 secs_to_datestamp(get_seconds(), &tail->disk_change); 36 secs_to_datestamp(get_seconds(), &tail->disk_change);
37 affs_fix_checksum(sb, bh); 37 affs_fix_checksum(sb, bh);
38 mark_buffer_dirty(bh); 38 mark_buffer_dirty(bh);
39 if (wait)
40 sync_dirty_buffer(bh);
39} 41}
40 42
41static void 43static void
@@ -46,8 +48,8 @@ affs_put_super(struct super_block *sb)
46 48
47 lock_kernel(); 49 lock_kernel();
48 50
49 if (!(sb->s_flags & MS_RDONLY)) 51 if (!(sb->s_flags & MS_RDONLY) && sb->s_dirt)
50 affs_commit_super(sb, 1); 52 affs_commit_super(sb, 1, 1);
51 53
52 kfree(sbi->s_prefix); 54 kfree(sbi->s_prefix);
53 affs_free_bitmap(sb); 55 affs_free_bitmap(sb);
@@ -61,27 +63,20 @@ affs_put_super(struct super_block *sb)
61static void 63static void
62affs_write_super(struct super_block *sb) 64affs_write_super(struct super_block *sb)
63{ 65{
64 int clean = 2;
65
66 lock_super(sb); 66 lock_super(sb);
67 if (!(sb->s_flags & MS_RDONLY)) { 67 if (!(sb->s_flags & MS_RDONLY))
68 // if (sbi->s_bitmap[i].bm_bh) { 68 affs_commit_super(sb, 1, 2);
69 // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { 69 sb->s_dirt = 0;
70 // clean = 0;
71 affs_commit_super(sb, clean);
72 sb->s_dirt = !clean; /* redo until bitmap synced */
73 } else
74 sb->s_dirt = 0;
75 unlock_super(sb); 70 unlock_super(sb);
76 71
77 pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); 72 pr_debug("AFFS: write_super() at %lu, clean=2\n", get_seconds());
78} 73}
79 74
80static int 75static int
81affs_sync_fs(struct super_block *sb, int wait) 76affs_sync_fs(struct super_block *sb, int wait)
82{ 77{
83 lock_super(sb); 78 lock_super(sb);
84 affs_commit_super(sb, 2); 79 affs_commit_super(sb, wait, 2);
85 sb->s_dirt = 0; 80 sb->s_dirt = 0;
86 unlock_super(sb); 81 unlock_super(sb);
87 return 0; 82 return 0;
@@ -140,8 +135,7 @@ static const struct super_operations affs_sops = {
140 .alloc_inode = affs_alloc_inode, 135 .alloc_inode = affs_alloc_inode,
141 .destroy_inode = affs_destroy_inode, 136 .destroy_inode = affs_destroy_inode,
142 .write_inode = affs_write_inode, 137 .write_inode = affs_write_inode,
143 .delete_inode = affs_delete_inode, 138 .evict_inode = affs_evict_inode,
144 .clear_inode = affs_clear_inode,
145 .put_super = affs_put_super, 139 .put_super = affs_put_super,
146 .write_super = affs_write_super, 140 .write_super = affs_write_super,
147 .sync_fs = affs_sync_fs, 141 .sync_fs = affs_sync_fs,
@@ -554,9 +548,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
554 return 0; 548 return 0;
555 } 549 }
556 if (*flags & MS_RDONLY) { 550 if (*flags & MS_RDONLY) {
557 sb->s_dirt = 1; 551 affs_write_super(sb);
558 while (sb->s_dirt)
559 affs_write_super(sb);
560 affs_free_bitmap(sb); 552 affs_free_bitmap(sb);
561 } else 553 } else
562 res = affs_init_bitmap(sb, flags); 554 res = affs_init_bitmap(sb, flags);
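
The new wait argument threads the ->sync_fs() wait semantics down to the root-block write. A condensed sketch of how affs_commit_super() behaves after this change (timestamping and checksumming elided):

    static void example_commit_super(struct super_block *sb, int wait)
    {
            struct buffer_head *bh = AFFS_SB(sb)->s_root_bh;

            /* ... stamp disk_change and fix the checksum, as above ... */
            mark_buffer_dirty(bh);
            if (wait)
                    sync_dirty_buffer(bh);  /* block until the write completes */
    }
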
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c772..8f975f25b486 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)" 2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select AF_RXRPC 4 select AF_RXRPC
5 select DNS_RESOLVER
5 help 6 help
6 If you say Y here, you will get an experimental Andrew File System 7 If you say Y here, you will get an experimental Andrew File System
7 driver. It currently only supports unsecured read-only AFS access. 8 driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059ed..0d5eeadf6121 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/key.h> 14#include <linux/key.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/dns_resolver.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <keys/rxrpc-type.h> 18#include <keys/rxrpc-type.h>
18#include "internal.h" 19#include "internal.h"
@@ -30,21 +31,24 @@ static struct afs_cell *afs_cell_root;
30 * allocate a cell record and fill in its name, VL server address list and 31 * allocate a cell record and fill in its name, VL server address list and
31 * allocate an anonymous key 32 * allocate an anonymous key
32 */ 33 */
33static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) 34static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
35 char *vllist)
34{ 36{
35 struct afs_cell *cell; 37 struct afs_cell *cell;
36 struct key *key; 38 struct key *key;
37 size_t namelen;
38 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; 39 char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
40 char *dvllist = NULL, *_vllist = NULL;
41 char delimiter = ':';
39 int ret; 42 int ret;
40 43
41 _enter("%s,%s", name, vllist); 44 _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
42 45
43 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ 46 BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
44 47
45 namelen = strlen(name); 48 if (namelen > AFS_MAXCELLNAME) {
46 if (namelen > AFS_MAXCELLNAME) 49 _leave(" = -ENAMETOOLONG");
47 return ERR_PTR(-ENAMETOOLONG); 50 return ERR_PTR(-ENAMETOOLONG);
51 }
48 52
49 /* allocate and initialise a cell record */ 53 /* allocate and initialise a cell record */
50 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); 54 cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +68,35 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
64 INIT_LIST_HEAD(&cell->vl_list); 68 INIT_LIST_HEAD(&cell->vl_list);
65 spin_lock_init(&cell->vl_lock); 69 spin_lock_init(&cell->vl_lock);
66 70
71 /* if the ip address is invalid, try dns query */
72 if (!vllist || strlen(vllist) < 7) {
73 ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
74 if (ret < 0) {
75 if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
76 /* translate these errors into something
77 * userspace might understand */
78 ret = -EDESTADDRREQ;
79 _leave(" = %d", ret);
80 return ERR_PTR(ret);
81 }
82 _vllist = dvllist;
83
84 /* change the delimiter for user-space reply */
85 delimiter = ',';
86
87 } else {
88 _vllist = vllist;
89 }
90
67 /* fill in the VL server list from the rest of the string */ 91 /* fill in the VL server list from the rest of the string */
68 do { 92 do {
69 unsigned a, b, c, d; 93 unsigned a, b, c, d;
70 94
71 next = strchr(vllist, ':'); 95 next = strchr(_vllist, delimiter);
72 if (next) 96 if (next)
73 *next++ = 0; 97 *next++ = 0;
74 98
75 if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) 99 if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
76 goto bad_address; 100 goto bad_address;
77 101
78 if (a > 255 || b > 255 || c > 255 || d > 255) 102 if (a > 255 || b > 255 || c > 255 || d > 255)
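
dns_query() is the upcall into the userspace DNS resolver; here it fetches the cell's AFSDB records whenever no usable VL address list was supplied. A minimal sketch of the call pattern (example_lookup_vl() is hypothetical; the result is a kmalloc'd, comma-separated address list the caller must kfree(), hence the delimiter switch above):

    #include <linux/dns_resolver.h>

    static int example_lookup_vl(const char *name, size_t namelen)
    {
            char *result = NULL;
            int ret;

            ret = dns_query("afsdb", name, namelen, "ipv4", &result, NULL);
            if (ret < 0)
                    return ret;  /* -ENODATA/-EAGAIN/-ENOKEY become -EDESTADDRREQ above */

            /* parse "a.b.c.d,a.b.c.d,..." as in the loop above, then: */
            kfree(result);
            return 0;
    }
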
@@ -81,7 +105,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
81 cell->vl_addrs[cell->vl_naddrs++].s_addr = 105 cell->vl_addrs[cell->vl_naddrs++].s_addr =
82 htonl((a << 24) | (b << 16) | (c << 8) | d); 106 htonl((a << 24) | (b << 16) | (c << 8) | d);
83 107
84 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); 108 } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
85 109
86 /* create a key to represent an anonymous user */ 110 /* create a key to represent an anonymous user */
87 memcpy(keyname, "afs@", 4); 111 memcpy(keyname, "afs@", 4);
@@ -110,32 +134,36 @@ bad_address:
110 ret = -EINVAL; 134 ret = -EINVAL;
111error: 135error:
112 key_put(cell->anonymous_key); 136 key_put(cell->anonymous_key);
137 kfree(dvllist);
113 kfree(cell); 138 kfree(cell);
114 _leave(" = %d", ret); 139 _leave(" = %d", ret);
115 return ERR_PTR(ret); 140 return ERR_PTR(ret);
116} 141}
117 142
118/* 143/*
119 * create a cell record 144 * afs_cell_create() - create a cell record
120 * - "name" is the name of the cell 145 * @name: is the name of the cell.
121 * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format 146 * @namesz: is the strlen of the cell name.
147 * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format.
148 * @retref: is T to return the cell reference when the cell exists.
122 */ 149 */
123struct afs_cell *afs_cell_create(const char *name, char *vllist) 150struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
151 char *vllist, bool retref)
124{ 152{
125 struct afs_cell *cell; 153 struct afs_cell *cell;
126 int ret; 154 int ret;
127 155
128 _enter("%s,%s", name, vllist); 156 _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
129 157
130 down_write(&afs_cells_sem); 158 down_write(&afs_cells_sem);
131 read_lock(&afs_cells_lock); 159 read_lock(&afs_cells_lock);
132 list_for_each_entry(cell, &afs_cells, link) { 160 list_for_each_entry(cell, &afs_cells, link) {
133 if (strcasecmp(cell->name, name) == 0) 161 if (strncasecmp(cell->name, name, namesz) == 0)
134 goto duplicate_name; 162 goto duplicate_name;
135 } 163 }
136 read_unlock(&afs_cells_lock); 164 read_unlock(&afs_cells_lock);
137 165
138 cell = afs_cell_alloc(name, vllist); 166 cell = afs_cell_alloc(name, namesz, vllist);
139 if (IS_ERR(cell)) { 167 if (IS_ERR(cell)) {
140 _leave(" = %ld", PTR_ERR(cell)); 168 _leave(" = %ld", PTR_ERR(cell));
141 up_write(&afs_cells_sem); 169 up_write(&afs_cells_sem);
@@ -175,8 +203,18 @@ error:
175 return ERR_PTR(ret); 203 return ERR_PTR(ret);
176 204
177duplicate_name: 205duplicate_name:
206 if (retref && !IS_ERR(cell))
207 afs_get_cell(cell);
208
178 read_unlock(&afs_cells_lock); 209 read_unlock(&afs_cells_lock);
179 up_write(&afs_cells_sem); 210 up_write(&afs_cells_sem);
211
212 if (retref) {
213 _leave(" = %p", cell);
214 return cell;
215 }
216
217 _leave(" = -EEXIST");
180 return ERR_PTR(-EEXIST); 218 return ERR_PTR(-EEXIST);
181} 219}
182 220
@@ -201,15 +239,13 @@ int afs_cell_init(char *rootcell)
201 } 239 }
202 240
203 cp = strchr(rootcell, ':'); 241 cp = strchr(rootcell, ':');
204 if (!cp) { 242 if (!cp)
205 printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); 243 _debug("kAFS: no VL server IP addresses specified");
206 _leave(" = -EINVAL"); 244 else
207 return -EINVAL; 245 *cp++ = 0;
208 }
209 246
210 /* allocate a cell record for the root cell */ 247 /* allocate a cell record for the root cell */
211 *cp++ = 0; 248 new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
212 new_root = afs_cell_create(rootcell, cp);
213 if (IS_ERR(new_root)) { 249 if (IS_ERR(new_root)) {
214 _leave(" = %ld", PTR_ERR(new_root)); 250 _leave(" = %ld", PTR_ERR(new_root));
215 return PTR_ERR(new_root); 251 return PTR_ERR(new_root);
@@ -229,11 +265,12 @@ int afs_cell_init(char *rootcell)
229/* 265/*
230 * lookup a cell record 266 * lookup a cell record
231 */ 267 */
232struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz) 268struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
269 bool dns_cell)
233{ 270{
234 struct afs_cell *cell; 271 struct afs_cell *cell;
235 272
236 _enter("\"%*.*s\",", namesz, namesz, name ? name : ""); 273 _enter("\"%*.*s\",", namesz, namesz, name ?: "");
237 274
238 down_read(&afs_cells_sem); 275 down_read(&afs_cells_sem);
239 read_lock(&afs_cells_lock); 276 read_lock(&afs_cells_lock);
@@ -247,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
247 } 284 }
248 } 285 }
249 cell = ERR_PTR(-ENOENT); 286 cell = ERR_PTR(-ENOENT);
287 if (dns_cell)
288 goto create_cell;
250 found: 289 found:
251 ; 290 ;
252 } else { 291 } else {
@@ -269,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
269 up_read(&afs_cells_sem); 308 up_read(&afs_cells_sem);
270 _leave(" = %p", cell); 309 _leave(" = %p", cell);
271 return cell; 310 return cell;
311
312create_cell:
313 read_unlock(&afs_cells_lock);
314 up_read(&afs_cells_sem);
315
316 cell = afs_cell_create(name, namesz, NULL, true);
317
318 _leave(" = %p", cell);
319 return cell;
272} 320}
273 321
274#if 0 322#if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..0d38c09bd55e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
189 struct key *key) 189 struct key *key)
190{ 190{
191 struct page *page; 191 struct page *page;
192 struct file file = {
193 .private_data = key,
194 };
195
196 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
197 193
198 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
199 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
200 kmap(page); 196 kmap(page);
201 if (!PageChecked(page)) 197 if (!PageChecked(page))
@@ -481,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
481} 477}
482 478
483/* 479/*
480 * Try to auto-mount the mountpoint with a pseudo directory if the autocell
481 * operation is set.
482 */
483static struct inode *afs_try_auto_mntpt(
484 int ret, struct dentry *dentry, struct inode *dir, struct key *key,
485 struct afs_fid *fid)
486{
487 const char *devname = dentry->d_name.name;
488 struct afs_vnode *vnode = AFS_FS_I(dir);
489 struct inode *inode;
490
491 _enter("%d, %p{%s}, {%x:%u}, %p",
492 ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
493
494 if (ret != -ENOENT ||
495 !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
496 goto out;
497
498 inode = afs_iget_autocell(dir, devname, strlen(devname), key);
499 if (IS_ERR(inode)) {
500 ret = PTR_ERR(inode);
501 goto out;
502 }
503
504 *fid = AFS_FS_I(inode)->fid;
505 _leave("= %p", inode);
506 return inode;
507
508out:
509 _leave("= %d", ret);
510 return ERR_PTR(ret);
511}
512
513/*
484 * look up an entry in a directory 514 * look up an entry in a directory
485 */ 515 */
486static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, 516static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -524,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
524 554
525 ret = afs_do_lookup(dir, dentry, &fid, key); 555 ret = afs_do_lookup(dir, dentry, &fid, key);
526 if (ret < 0) { 556 if (ret < 0) {
557 inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
558 if (!IS_ERR(inode)) {
559 key_put(key);
560 goto success;
561 }
562
563 ret = PTR_ERR(inode);
527 key_put(key); 564 key_put(key);
528 if (ret == -ENOENT) { 565 if (ret == -ENOENT) {
529 d_add(dentry, NULL); 566 d_add(dentry, NULL);
@@ -543,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
543 return ERR_CAST(inode); 580 return ERR_CAST(inode);
544 } 581 }
545 582
583success:
546 dentry->d_op = &afs_fs_dentry_operations; 584 dentry->d_op = &afs_fs_dentry_operations;
547 585
548 d_add(dentry, inode); 586 d_add(dentry, inode);
@@ -700,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
700 goto zap; 738 goto zap;
701 739
702 if (dentry->d_inode && 740 if (dentry->d_inode &&
703 test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags)) 741 (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
704 goto zap; 742 test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
743 goto zap;
705 744
706 _leave(" = 0 [keep]"); 745 _leave(" = 0 [keep]");
707 return 0; 746 return 0;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
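
Splitting the filler out of afs_readpage() lets any caller that holds a key read pages through the pagecache without faking up a struct file. Sketch of the two call shapes this enables, using the names from the hunks above:

    /* single page (directory or symlink contents) */
    page = read_cache_page(mapping, index, afs_page_filler, key);
    if (IS_ERR(page))
            return PTR_ERR(page);

    /* readahead over a list of pages */
    ret = read_cache_pages(mapping, pages, afs_page_filler, key);
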
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index d00b312e3110..0747339011c3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/mount.h>
23#include <linux/namei.h>
22#include "internal.h" 24#include "internal.h"
23 25
24struct afs_iget_data { 26struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
102} 104}
103 105
104/* 106/*
107 * iget5() comparator for inode created by autocell operations
108 *
109 * These pseudo inodes don't match anything.
110 */
111static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
112{
113 return 0;
114}
115
116/*
105 * iget5() inode initialiser 117 * iget5() inode initialiser
106 */ 118 */
107static int afs_iget5_set(struct inode *inode, void *opaque) 119static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
118} 130}
119 131
120/* 132/*
133 * inode retrieval for autocell
134 */
135struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
136 int namesz, struct key *key)
137{
138 struct afs_iget_data data;
139 struct afs_super_info *as;
140 struct afs_vnode *vnode;
141 struct super_block *sb;
142 struct inode *inode;
143 static atomic_t afs_autocell_ino;
144
145 _enter("{%x:%u},%*.*s,",
146 AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
147 namesz, namesz, dev_name ?: "");
148
149 sb = dir->i_sb;
150 as = sb->s_fs_info;
151 data.volume = as->volume;
152 data.fid.vid = as->volume->vid;
153 data.fid.unique = 0;
154 data.fid.vnode = 0;
155
156 inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
157 afs_iget5_autocell_test, afs_iget5_set,
158 &data);
159 if (!inode) {
160 _leave(" = -ENOMEM");
161 return ERR_PTR(-ENOMEM);
162 }
163
164 _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
165 inode, inode->i_ino, data.fid.vid, data.fid.vnode,
166 data.fid.unique);
167
168 vnode = AFS_FS_I(inode);
169
170 /* there shouldn't be an existing inode */
171 BUG_ON(!(inode->i_state & I_NEW));
172
173 inode->i_size = 0;
174 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
175 inode->i_op = &afs_autocell_inode_operations;
176 inode->i_nlink = 2;
177 inode->i_uid = 0;
178 inode->i_gid = 0;
179 inode->i_ctime.tv_sec = get_seconds();
180 inode->i_ctime.tv_nsec = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime;
182 inode->i_blocks = 0;
183 inode->i_version = 0;
184 inode->i_generation = 0;
185
186 set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
187 inode->i_flags |= S_NOATIME;
188 unlock_new_inode(inode);
189 _leave(" = %p", inode);
190 return inode;
191}
192
193/*
121 * inode retrieval 194 * inode retrieval
122 */ 195 */
123struct inode *afs_iget(struct super_block *sb, struct key *key, 196struct inode *afs_iget(struct super_block *sb, struct key *key,
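
The always-failing iget5_locked() comparator is deliberate: no existing inode can ever match, so every call allocates a fresh in-core inode, keyed here by a private increasing counter. The pattern, condensed from the hunk above:

    static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
    {
            return 0;       /* never match: force a new inode */
    }

    inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
                         afs_iget5_autocell_test, afs_iget5_set, &data);
    BUG_ON(!(inode->i_state & I_NEW));      /* so it cannot be pre-existing */
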
@@ -314,9 +387,22 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
314} 387}
315 388
316/* 389/*
390 * discard an AFS inode
391 */
392int afs_drop_inode(struct inode *inode)
393{
394 _enter("");
395
396 if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
397 return generic_delete_inode(inode);
398 else
399 return generic_drop_inode(inode);
400}
401
402/*
317 * clear an AFS inode 403 * clear an AFS inode
318 */ 404 */
319void afs_clear_inode(struct inode *inode) 405void afs_evict_inode(struct inode *inode)
320{ 406{
321 struct afs_permits *permits; 407 struct afs_permits *permits;
322 struct afs_vnode *vnode; 408 struct afs_vnode *vnode;
@@ -335,6 +421,9 @@ void afs_clear_inode(struct inode *inode)
335 421
336 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); 422 ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
337 423
424 truncate_inode_pages(&inode->i_data, 0);
425 end_writeback(inode);
426
338 afs_give_up_callback(vnode); 427 afs_give_up_callback(vnode);
339 428
340 if (vnode->server) { 429 if (vnode->server) {
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..cca8eef736fc 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
42struct afs_mount_params { 42struct afs_mount_params {
43 bool rwpath; /* T if the parent should be considered R/W */ 43 bool rwpath; /* T if the parent should be considered R/W */
44 bool force; /* T to force cell type */ 44 bool force; /* T to force cell type */
45 bool autocell; /* T if set auto mount operation */
45 afs_voltype_t type; /* type of volume requested */ 46 afs_voltype_t type; /* type of volume requested */
46 int volnamesz; /* size of volume name */ 47 int volnamesz; /* size of volume name */
47 const char *volname; /* name of volume to mount */ 48 const char *volname; /* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
358#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ 359#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */
359#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ 360#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */
360#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ 361#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */
362#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */
363#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */
361 364
362 long acl_order; /* ACL check count (callback break count) */ 365 long acl_order; /* ACL check count (callback break count) */
363 366
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
468 471
469#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) 472#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
470extern int afs_cell_init(char *); 473extern int afs_cell_init(char *);
471extern struct afs_cell *afs_cell_create(const char *, char *); 474extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
472extern struct afs_cell *afs_cell_lookup(const char *, unsigned); 475extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
473extern struct afs_cell *afs_grab_cell(struct afs_cell *); 476extern struct afs_cell *afs_grab_cell(struct afs_cell *);
474extern void afs_put_cell(struct afs_cell *); 477extern void afs_put_cell(struct afs_cell *);
475extern void afs_cell_purge(void); 478extern void afs_cell_purge(void);
@@ -494,6 +497,7 @@ extern const struct file_operations afs_file_operations;
494 497
495extern int afs_open(struct inode *, struct file *); 498extern int afs_open(struct inode *, struct file *);
496extern int afs_release(struct inode *, struct file *); 499extern int afs_release(struct inode *, struct file *);
500extern int afs_page_filler(void *, struct page *);
497 501
498/* 502/*
499 * flock.c 503 * flock.c
@@ -557,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
557/* 561/*
558 * inode.c 562 * inode.c
559 */ 563 */
564extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
565 struct key *);
560extern struct inode *afs_iget(struct super_block *, struct key *, 566extern struct inode *afs_iget(struct super_block *, struct key *,
561 struct afs_fid *, struct afs_file_status *, 567 struct afs_fid *, struct afs_file_status *,
562 struct afs_callback *); 568 struct afs_callback *);
@@ -564,7 +570,8 @@ extern void afs_zap_data(struct afs_vnode *);
564extern int afs_validate(struct afs_vnode *, struct key *); 570extern int afs_validate(struct afs_vnode *, struct key *);
565extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 571extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
566extern int afs_setattr(struct dentry *, struct iattr *); 572extern int afs_setattr(struct dentry *, struct iattr *);
567extern void afs_clear_inode(struct inode *); 573extern void afs_evict_inode(struct inode *);
574extern int afs_drop_inode(struct inode *);
568 575
569/* 576/*
570 * main.c 577 * main.c
@@ -580,6 +587,7 @@ extern int afs_abort_to_error(u32);
580 * mntpt.c 587 * mntpt.c
581 */ 588 */
582extern const struct inode_operations afs_mntpt_inode_operations; 589extern const struct inode_operations afs_mntpt_inode_operations;
590extern const struct inode_operations afs_autocell_inode_operations;
583extern const struct file_operations afs_mntpt_file_operations; 591extern const struct file_operations afs_mntpt_file_operations;
584 592
585extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); 593extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -739,7 +747,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
739extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 747extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
740 unsigned long, loff_t); 748 unsigned long, loff_t);
741extern int afs_writeback_all(struct afs_vnode *); 749extern int afs_writeback_all(struct afs_vnode *);
742extern int afs_fsync(struct file *, struct dentry *, int); 750extern int afs_fsync(struct file *, int);
743 751
744 752
745/*****************************************************************************/ 753/*****************************************************************************/
@@ -751,12 +759,6 @@ extern unsigned afs_debug;
751#define dbgprintk(FMT,...) \ 759#define dbgprintk(FMT,...) \
752 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) 760 printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
753 761
754/* make sure we maintain the format strings, even when debugging is disabled */
755static inline __attribute__((format(printf,1,2)))
756void _dbprintk(const char *fmt, ...)
757{
758}
759
760#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 762#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
761#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 763#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
762#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) 764#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -791,9 +793,9 @@ do { \
791} while (0) 793} while (0)
792 794
793#else 795#else
794#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) 796#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
795#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) 797#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
796#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) 798#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
797#endif 799#endif
798 800
799/* 801/*
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c55..cfd1cbe25b22 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
111 111
112 /* initialise the callback update process */ 112 /* initialise the callback update process */
113 ret = afs_callback_update_init(); 113 ret = afs_callback_update_init();
114 if (ret < 0)
115 goto error_callback_update_init;
114 116
115 /* create the RxRPC transport */ 117 /* create the RxRPC transport */
116 ret = afs_open_socket(); 118 ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
127error_fs: 129error_fs:
128 afs_close_socket(); 130 afs_close_socket();
129error_open_socket: 131error_open_socket:
132 afs_callback_update_kill();
133error_callback_update_init:
134 afs_vlocation_purge();
130error_vl_update_init: 135error_vl_update_init:
136 afs_cell_purge();
131error_cell_init: 137error_cell_init:
132#ifdef CONFIG_AFS_FSCACHE 138#ifdef CONFIG_AFS_FSCACHE
133 fscache_unregister_netfs(&afs_cache_netfs); 139 fscache_unregister_netfs(&afs_cache_netfs);
134error_cache: 140error_cache:
135#endif 141#endif
136 afs_callback_update_kill();
137 afs_vlocation_purge();
138 afs_cell_purge();
139 afs_proc_cleanup(); 142 afs_proc_cleanup();
140 rcu_barrier(); 143 rcu_barrier();
141 printk(KERN_ERR "kAFS: failed to register: %d\n", ret); 144 printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
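
The relocated labels restore the invariant that the error path unwinds exactly what has been initialised so far, in reverse order, instead of also tearing down subsystems that never came up. The general shape, sketched with hypothetical steps:

    ret = step_a();
    if (ret < 0)
            goto error_a;
    ret = step_b();         /* e.g. afs_callback_update_init() */
    if (ret < 0)
            goto error_b;
    return 0;

    error_b:
            undo_a();       /* unwind strictly in reverse of init order */
    error_a:
            return ret;
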
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..6d552686c498 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -38,6 +38,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
38 .getattr = afs_getattr, 38 .getattr = afs_getattr,
39}; 39};
40 40
41const struct inode_operations afs_autocell_inode_operations = {
42 .follow_link = afs_mntpt_follow_link,
43 .getattr = afs_getattr,
44};
45
41static LIST_HEAD(afs_vfsmounts); 46static LIST_HEAD(afs_vfsmounts);
42static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); 47static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
43 48
@@ -49,9 +54,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 54 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 55int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 56{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 57 struct page *page;
56 size_t size; 58 size_t size;
57 char *buf; 59 char *buf;
@@ -61,7 +63,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 63 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 64
63 /* read the contents of the symlink into the pagecache */ 65 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 66 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
67 afs_page_filler, key);
65 if (IS_ERR(page)) { 68 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 69 ret = PTR_ERR(page);
67 goto out; 70 goto out;
@@ -138,20 +141,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
138{ 141{
139 struct afs_super_info *super; 142 struct afs_super_info *super;
140 struct vfsmount *mnt; 143 struct vfsmount *mnt;
144 struct afs_vnode *vnode;
141 struct page *page; 145 struct page *page;
142 size_t size; 146 char *devname, *options;
143 char *buf, *devname, *options; 147 bool rwpath = false;
144 int ret; 148 int ret;
145 149
146 _enter("{%s}", mntpt->d_name.name); 150 _enter("{%s}", mntpt->d_name.name);
147 151
148 BUG_ON(!mntpt->d_inode); 152 BUG_ON(!mntpt->d_inode);
149 153
150 ret = -EINVAL;
151 size = mntpt->d_inode->i_size;
152 if (size > PAGE_SIZE - 1)
153 goto error_no_devname;
154
155 ret = -ENOMEM; 154 ret = -ENOMEM;
156 devname = (char *) get_zeroed_page(GFP_KERNEL); 155 devname = (char *) get_zeroed_page(GFP_KERNEL);
157 if (!devname) 156 if (!devname)
@@ -161,28 +160,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
161 if (!options) 160 if (!options)
162 goto error_no_options; 161 goto error_no_options;
163 162
164 /* read the contents of the AFS special symlink */ 163 vnode = AFS_FS_I(mntpt->d_inode);
165 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); 164 if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
166 if (IS_ERR(page)) { 165 /* if the directory is a pseudo directory, use the d_name */
167 ret = PTR_ERR(page); 166 static const char afs_root_cell[] = ":root.cell.";
168 goto error_no_page; 167 unsigned size = mntpt->d_name.len;
168
169 ret = -ENOENT;
170 if (size < 2 || size > AFS_MAXCELLNAME)
171 goto error_no_page;
172
173 if (mntpt->d_name.name[0] == '.') {
174 devname[0] = '#';
175 memcpy(devname + 1, mntpt->d_name.name, size - 1);
176 memcpy(devname + size, afs_root_cell,
177 sizeof(afs_root_cell));
178 rwpath = true;
179 } else {
180 devname[0] = '%';
181 memcpy(devname + 1, mntpt->d_name.name, size);
182 memcpy(devname + size + 1, afs_root_cell,
183 sizeof(afs_root_cell));
184 }
185 } else {
186 /* read the contents of the AFS special symlink */
187 loff_t size = i_size_read(mntpt->d_inode);
188 char *buf;
189
190 ret = -EINVAL;
191 if (size > PAGE_SIZE - 1)
192 goto error_no_page;
193
194 page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
195 if (IS_ERR(page)) {
196 ret = PTR_ERR(page);
197 goto error_no_page;
198 }
199
200 ret = -EIO;
201 if (PageError(page))
202 goto error;
203
204 buf = kmap_atomic(page, KM_USER0);
205 memcpy(devname, buf, size);
206 kunmap_atomic(buf, KM_USER0);
207 page_cache_release(page);
208 page = NULL;
169 } 209 }
170 210
171 ret = -EIO;
172 if (PageError(page))
173 goto error;
174
175 buf = kmap_atomic(page, KM_USER0);
176 memcpy(devname, buf, size);
177 kunmap_atomic(buf, KM_USER0);
178 page_cache_release(page);
179 page = NULL;
180
181 /* work out what options we want */ 211 /* work out what options we want */
182 super = AFS_FS_S(mntpt->d_sb); 212 super = AFS_FS_S(mntpt->d_sb);
183 memcpy(options, "cell=", 5); 213 memcpy(options, "cell=", 5);
184 strcpy(options + 5, super->volume->cell->name); 214 strcpy(options + 5, super->volume->cell->name);
185 if (super->volume->type == AFSVL_RWVOL) 215 if (super->volume->type == AFSVL_RWVOL || rwpath)
186 strcat(options, ",rwpath"); 216 strcat(options, ",rwpath");
187 217
188 /* try and do the mount */ 218 /* try and do the mount */
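
For pseudo directories the device name is now synthesised from the dentry name instead of being read from a mountpoint symlink: a name with a leading '.' produces a '#'-prefixed device name and sets rwpath, any other name a '%'-prefixed one, both suffixed with ":root.cell.". A worked example for the non-dot branch (illustrative cell name):

    /* lookup of "example.org" under an autocell root builds, per the
     * code above: devname = "%example.org:root.cell." */
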
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a9..096b23f821a1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
294 if (strcmp(kbuf, "add") == 0) { 294 if (strcmp(kbuf, "add") == 0) {
295 struct afs_cell *cell; 295 struct afs_cell *cell;
296 296
297 cell = afs_cell_create(name, args); 297 cell = afs_cell_create(name, strlen(name), args, false);
298 if (IS_ERR(cell)) { 298 if (IS_ERR(cell)) {
299 ret = PTR_ERR(cell); 299 ret = PTR_ERR(cell);
300 goto done; 300 goto done;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd6..654d8fdbf01f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); 100 ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
101 if (ret < 0) { 101 if (ret < 0) {
102 sock_release(socket); 102 sock_release(socket);
103 destroy_workqueue(afs_async_calls);
103 _leave(" = %d [bind]", ret); 104 _leave(" = %d [bind]", ret);
104 return ret; 105 return ret;
105 } 106 }
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
91 91
92 memcpy(&server->addr, addr, sizeof(struct in_addr)); 92 memcpy(&server->addr, addr, sizeof(struct in_addr));
93 server->addr.s_addr = addr->s_addr; 93 server->addr.s_addr = addr->s_addr;
94 _leave(" = %p{%d}", server, atomic_read(&server->usage));
95 } else {
96 _leave(" = NULL [nomem]");
94 } 97 }
95
96 _leave(" = %p{%d}", server, atomic_read(&server->usage));
97 return server; 98 return server;
98} 99}
99 100
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e932e5a3a0c1..77e1e5a61154 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,6 +16,7 @@
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/mount.h>
19#include <linux/init.h> 20#include <linux/init.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/smp_lock.h> 22#include <linux/smp_lock.h>
@@ -48,8 +49,9 @@ struct file_system_type afs_fs_type = {
48static const struct super_operations afs_super_ops = { 49static const struct super_operations afs_super_ops = {
49 .statfs = afs_statfs, 50 .statfs = afs_statfs,
50 .alloc_inode = afs_alloc_inode, 51 .alloc_inode = afs_alloc_inode,
52 .drop_inode = afs_drop_inode,
51 .destroy_inode = afs_destroy_inode, 53 .destroy_inode = afs_destroy_inode,
52 .clear_inode = afs_clear_inode, 54 .evict_inode = afs_evict_inode,
53 .put_super = afs_put_super, 55 .put_super = afs_put_super,
54 .show_options = generic_show_options, 56 .show_options = generic_show_options,
55}; 57};
@@ -62,12 +64,14 @@ enum {
62 afs_opt_cell, 64 afs_opt_cell,
63 afs_opt_rwpath, 65 afs_opt_rwpath,
64 afs_opt_vol, 66 afs_opt_vol,
67 afs_opt_autocell,
65}; 68};
66 69
67static const match_table_t afs_options_list = { 70static const match_table_t afs_options_list = {
68 { afs_opt_cell, "cell=%s" }, 71 { afs_opt_cell, "cell=%s" },
69 { afs_opt_rwpath, "rwpath" }, 72 { afs_opt_rwpath, "rwpath" },
70 { afs_opt_vol, "vol=%s" }, 73 { afs_opt_vol, "vol=%s" },
74 { afs_opt_autocell, "autocell" },
71 { afs_no_opt, NULL }, 75 { afs_no_opt, NULL },
72}; 76};
73 77
@@ -151,7 +155,8 @@ static int afs_parse_options(struct afs_mount_params *params,
151 switch (token) { 155 switch (token) {
152 case afs_opt_cell: 156 case afs_opt_cell:
153 cell = afs_cell_lookup(args[0].from, 157 cell = afs_cell_lookup(args[0].from,
154 args[0].to - args[0].from); 158 args[0].to - args[0].from,
159 false);
155 if (IS_ERR(cell)) 160 if (IS_ERR(cell))
156 return PTR_ERR(cell); 161 return PTR_ERR(cell);
157 afs_put_cell(params->cell); 162 afs_put_cell(params->cell);
@@ -166,6 +171,10 @@ static int afs_parse_options(struct afs_mount_params *params,
166 *devname = args[0].from; 171 *devname = args[0].from;
167 break; 172 break;
168 173
174 case afs_opt_autocell:
175 params->autocell = 1;
176 break;
177
169 default: 178 default:
170 printk(KERN_ERR "kAFS:" 179 printk(KERN_ERR "kAFS:"
171 " Unknown or invalid mount option: '%s'\n", p); 180 " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +261,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
252 261
253 /* lookup the cell record */ 262 /* lookup the cell record */
254 if (cellname || !params->cell) { 263 if (cellname || !params->cell) {
255 cell = afs_cell_lookup(cellname, cellnamesz); 264 cell = afs_cell_lookup(cellname, cellnamesz, true);
256 if (IS_ERR(cell)) { 265 if (IS_ERR(cell)) {
257 printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n", 266 printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
258 cellname ?: ""); 267 cellnamesz, cellnamesz, cellname ?: "");
259 return PTR_ERR(cell); 268 return PTR_ERR(cell);
260 } 269 }
261 afs_put_cell(params->cell); 270 afs_put_cell(params->cell);
@@ -321,6 +330,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
321 if (IS_ERR(inode)) 330 if (IS_ERR(inode))
322 goto error_inode; 331 goto error_inode;
323 332
333 if (params->autocell)
334 set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
335
324 ret = -ENOMEM; 336 ret = -ENOMEM;
325 root = d_alloc_root(inode); 337 root = d_alloc_root(inode);
326 if (!root) 338 if (!root)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
680{ 680{
681 struct address_space *mapping = vnode->vfs_inode.i_mapping; 681 struct address_space *mapping = vnode->vfs_inode.i_mapping;
682 struct writeback_control wbc = { 682 struct writeback_control wbc = {
683 .bdi = mapping->backing_dev_info,
684 .sync_mode = WB_SYNC_ALL, 683 .sync_mode = WB_SYNC_ALL,
685 .nr_to_write = LONG_MAX, 684 .nr_to_write = LONG_MAX,
686 .range_cyclic = 1, 685 .range_cyclic = 1,
@@ -701,8 +700,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 700 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 701 * whether any write errors occurred for this process.
703 */ 702 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 703int afs_fsync(struct file *file, int datasync)
705{ 704{
705 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 706 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 708 int ret;
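
The prototype change reflects this cycle's VFS-wide removal of the dentry argument from ->fsync(); implementations now derive it from the file, as the hunk above does. Minimal sketch of the new shape (hypothetical filesystem):

    static int example_fsync(struct file *file, int datasync)
    {
            struct dentry *dentry = file->f_path.dentry;
            struct inode *inode = dentry->d_inode;

            /* ... write back dirty pages and metadata for inode ... */
            return 0;
    }
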
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..250b0a73c8a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -711,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
711 */ 712 */
712 ret = retry(iocb); 713 ret = retry(iocb);
713 714
714 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) 715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
716 /*
 717 * There's no easy way to restart the syscall since other AIOs
 718 * may already be running. Just fail this IO with EINTR.
719 */
720 if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
721 ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
722 ret = -EINTR;
715 aio_complete(iocb, ret, 0); 723 aio_complete(iocb, ret, 0);
724 }
716out: 725out:
717 spin_lock_irq(&ctx->ctx_lock); 726 spin_lock_irq(&ctx->ctx_lock);
718 727
@@ -1276,7 +1285,7 @@ out:
1276/* sys_io_destroy: 1285/* sys_io_destroy:
1277 * Destroy the aio_context specified. May cancel any outstanding 1286 * Destroy the aio_context specified. May cancel any outstanding
1278 * AIOs and block on completion. Will fail with -ENOSYS if not 1287 * AIOs and block on completion. Will fail with -ENOSYS if not
1279 * implemented. May fail with -EFAULT if the context pointed to 1288 * implemented. May fail with -EINVAL if the context pointed to
1280 * is invalid. 1289 * is invalid.
1281 */ 1290 */
1282SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) 1291SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1384,13 +1393,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1393 return ret;
1385} 1394}
1386 1395
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1396static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1397{
1389 ssize_t ret; 1398 ssize_t ret;
1390 1399
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1400#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1401 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1402 ret = compat_rw_copy_check_uvector(type,
1403 (struct compat_iovec __user *)kiocb->ki_buf,
1404 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1405 &kiocb->ki_iovec);
1406 else
1407#endif
1408 ret = rw_copy_check_uvector(type,
1409 (struct iovec __user *)kiocb->ki_buf,
1410 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1411 &kiocb->ki_iovec);
1394 if (ret < 0) 1412 if (ret < 0)
1395 goto out; 1413 goto out;
1396 1414
@@ -1420,7 +1438,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1438 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1439 * setup for the kiocb at the time of io submission.
1422 */ 1440 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1441static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1442{
1425 struct file *file = kiocb->ki_filp; 1443 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1444 ssize_t ret = 0;
@@ -1469,7 +1487,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1487 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1488 if (unlikely(ret))
1471 break; 1489 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1490 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1491 if (ret)
1474 break; 1492 break;
1475 ret = -EINVAL; 1493 ret = -EINVAL;
@@ -1483,7 +1501,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1501 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1502 if (unlikely(ret))
1485 break; 1503 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1504 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1505 if (ret)
1488 break; 1506 break;
1489 ret = -EINVAL; 1507 ret = -EINVAL;
@@ -1548,7 +1566,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1566}
1549 1567
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1568static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1569 struct iocb *iocb, struct hlist_head *batch_hash,
1570 bool compat)
1552{ 1571{
1553 struct kiocb *req; 1572 struct kiocb *req;
1554 struct file *file; 1573 struct file *file;
@@ -1609,7 +1628,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1628 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1629 req->ki_opcode = iocb->aio_lio_opcode;
1611 1630
1612 ret = aio_setup_iocb(req); 1631 ret = aio_setup_iocb(req, compat);
1613 1632
1614 if (ret) 1633 if (ret)
1615 goto out_put_req; 1634 goto out_put_req;
@@ -1637,20 +1656,8 @@ out_put_req:
1637 return ret; 1656 return ret;
1638} 1657}
1639 1658
1640/* sys_io_submit: 1659long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1660 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1661{
1655 struct kioctx *ctx; 1662 struct kioctx *ctx;
1656 long ret = 0; 1663 long ret = 0;
@@ -1660,6 +1667,9 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1660 if (unlikely(nr < 0)) 1667 if (unlikely(nr < 0))
1661 return -EINVAL; 1668 return -EINVAL;
1662 1669
1670 if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
1671 nr = LONG_MAX/sizeof(*iocbpp);
1672
1663 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) 1673 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
1664 return -EFAULT; 1674 return -EFAULT;
1665 1675
@@ -1687,7 +1697,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1697 break;
1688 } 1698 }
1689 1699
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1700 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1701 if (ret)
1692 break; 1702 break;
1693 } 1703 }
@@ -1697,6 +1707,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1707 return i ? i : ret;
1698} 1708}
1699 1709
1710/* sys_io_submit:
1711 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1712 * the number of iocbs queued. May return -EINVAL if the aio_context
1713 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1714 * *iocbpp[0] is not properly initialized, if the operation specified
1715 * is invalid for the file descriptor in the iocb. May fail with
1716 * -EFAULT if any of the data structures point to invalid data. May
1717 * fail with -EBADF if the file descriptor specified in the first
1718 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1719 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1720 * fail with -ENOSYS if not implemented.
1721 */
1722SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1723 struct iocb __user * __user *, iocbpp)
1724{
1725 return do_io_submit(ctx_id, nr, iocbpp, 0);
1726}
1727
1700/* lookup_kiocb 1728/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1729 * Finds a given iocb for cancellation.
1702 */ 1730 */
@@ -1778,15 +1806,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1778 1806
1779/* io_getevents: 1807/* io_getevents:
1780 * Attempts to read at least min_nr events and up to nr events from 1808 * Attempts to read at least min_nr events and up to nr events from
1781 * the completion queue for the aio_context specified by ctx_id. May 1809 * the completion queue for the aio_context specified by ctx_id. If
1782 * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, 1810 * it succeeds, the number of read events is returned. May fail with
1783 * if nr is out of range, if when is out of range. May fail with 1811 * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
1784 * -EFAULT if any of the memory specified to is invalid. May return 1812 * out of range, if timeout is out of range. May fail with -EFAULT
1785 * 0 or < min_nr if no events are available and the timeout specified 1813 * if any of the memory specified is invalid. May return 0 or
1786 * by when has elapsed, where when == NULL specifies an infinite 1814 * < min_nr if the timeout specified by timeout has elapsed
1787 * timeout. Note that the timeout pointed to by when is relative and 1815 * before sufficient events are available, where timeout == NULL
1788 * will be updated if not NULL and the operation blocks. Will fail 1816 * specifies an infinite timeout. Note that the timeout pointed to by
1789 * with -ENOSYS if not implemented. 1817 * timeout is relative and will be updated if not NULL and the
1818 * operation blocks. Will fail with -ENOSYS if not implemented.
1790 */ 1819 */
1791SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, 1820SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1792 long, min_nr, 1821 long, min_nr,
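
The io_submit()/io_getevents() comments above describe the userspace contract; a minimal userspace sketch exercising it through the raw syscalls (error handling pared down):

    #define _GNU_SOURCE
    #include <linux/aio_abi.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            aio_context_t ctx = 0;
            struct iocb cb, *cbs[1] = { &cb };
            struct io_event ev;
            char buf[4096];
            int fd = open("/etc/hostname", O_RDONLY);       /* any readable file */

            if (fd < 0 || syscall(__NR_io_setup, 1, &ctx) < 0)
                    return 1;

            memset(&cb, 0, sizeof(cb));
            cb.aio_fildes = fd;
            cb.aio_lio_opcode = IOCB_CMD_PREAD;
            cb.aio_buf = (unsigned long)buf;
            cb.aio_nbytes = sizeof(buf);

            /* returns the number of iocbs queued: 1 here on success */
            if (syscall(__NR_io_submit, ctx, 1, cbs) != 1)
                    return 1;

            /* NULL timeout: block until at least min_nr events arrive */
            if (syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
                    printf("read %lld bytes\n", (long long)ev.res);

            syscall(__NR_io_destroy, ctx);
            return 0;
    }
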
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..7ca41811afa1 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -14,35 +14,53 @@
14#include <linux/fcntl.h> 14#include <linux/fcntl.h>
15#include <linux/security.h> 15#include <linux/security.h>
16 16
17/* Taken over from the old code... */ 17/**
18 18 * inode_change_ok - check if attribute changes to an inode are allowed
19/* POSIX UID/GID verification for setting inode attributes. */ 19 * @inode: inode to check
20 * @attr: attributes to change
21 *
22 * Check if we are allowed to change the attributes contained in @attr
23 * in the given inode. This includes the normal unix access permission
24 * checks, as well as checks for rlimits and others.
25 *
26 * Should be called as the first thing in ->setattr implementations,
27 * possibly after taking additional locks.
28 */
20int inode_change_ok(const struct inode *inode, struct iattr *attr) 29int inode_change_ok(const struct inode *inode, struct iattr *attr)
21{ 30{
22 int retval = -EPERM;
23 unsigned int ia_valid = attr->ia_valid; 31 unsigned int ia_valid = attr->ia_valid;
24 32
33 /*
 34 * First check size constraints. These can't be overridden using
35 * ATTR_FORCE.
36 */
37 if (ia_valid & ATTR_SIZE) {
38 int error = inode_newsize_ok(inode, attr->ia_size);
39 if (error)
40 return error;
41 }
42
25 /* If force is set do it anyway. */ 43 /* If force is set do it anyway. */
26 if (ia_valid & ATTR_FORCE) 44 if (ia_valid & ATTR_FORCE)
27 goto fine; 45 return 0;
28 46
29 /* Make sure a caller can chown. */ 47 /* Make sure a caller can chown. */
30 if ((ia_valid & ATTR_UID) && 48 if ((ia_valid & ATTR_UID) &&
31 (current_fsuid() != inode->i_uid || 49 (current_fsuid() != inode->i_uid ||
32 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) 50 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
33 goto error; 51 return -EPERM;
34 52
35 /* Make sure caller can chgrp. */ 53 /* Make sure caller can chgrp. */
36 if ((ia_valid & ATTR_GID) && 54 if ((ia_valid & ATTR_GID) &&
37 (current_fsuid() != inode->i_uid || 55 (current_fsuid() != inode->i_uid ||
38 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && 56 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
39 !capable(CAP_CHOWN)) 57 !capable(CAP_CHOWN))
40 goto error; 58 return -EPERM;
41 59
42 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
43 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
44 if (!is_owner_or_cap(inode)) 62 if (!is_owner_or_cap(inode))
45 goto error; 63 return -EPERM;
46 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
47 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
48 inode->i_gid) && !capable(CAP_FSETID)) 66 inode->i_gid) && !capable(CAP_FSETID))
@@ -52,12 +70,10 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
52 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
53 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
54 if (!is_owner_or_cap(inode)) 72 if (!is_owner_or_cap(inode))
55 goto error; 73 return -EPERM;
56 } 74 }
57fine: 75
58 retval = 0; 76 return 0;
59error:
60 return retval;
61} 77}
62EXPORT_SYMBOL(inode_change_ok); 78EXPORT_SYMBOL(inode_change_ok);
63 79
@@ -67,14 +83,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 83 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 84 * @Returns: 0 on success, -ve errno on failure
69 * 85 *
86 * inode_newsize_ok must be called with i_mutex held.
87 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 88 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 89 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 90 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 91 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 92 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 93 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 94 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 95int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 96{
@@ -104,17 +120,25 @@ out_big:
104} 120}
105EXPORT_SYMBOL(inode_newsize_ok); 121EXPORT_SYMBOL(inode_newsize_ok);
106 122
107int inode_setattr(struct inode * inode, struct iattr * attr) 123/**
124 * setattr_copy - copy simple metadata updates into the generic inode
125 * @inode: the inode to be updated
126 * @attr: the new attributes
127 *
128 * setattr_copy must be called with i_mutex held.
129 *
130 * setattr_copy updates the inode's metadata with that specified
 131 * in attr. Noticeably missing is inode size update, which is more complex
132 * as it requires pagecache updates.
133 *
134 * The inode is not marked as dirty after this operation. The rationale is
135 * that for "simple" filesystems, the struct inode is the inode storage.
136 * The caller is free to mark the inode dirty afterwards if needed.
137 */
138void setattr_copy(struct inode *inode, const struct iattr *attr)
108{ 139{
109 unsigned int ia_valid = attr->ia_valid; 140 unsigned int ia_valid = attr->ia_valid;
110 141
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 142 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 143 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 144 if (ia_valid & ATTR_GID)
@@ -135,11 +159,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 159 mode &= ~S_ISGID;
136 inode->i_mode = mode; 160 inode->i_mode = mode;
137 } 161 }
138 mark_inode_dirty(inode);
139
140 return 0;
141} 162}
142EXPORT_SYMBOL(inode_setattr); 163EXPORT_SYMBOL(setattr_copy);
143 164
144int notify_change(struct dentry * dentry, struct iattr * attr) 165int notify_change(struct dentry * dentry, struct iattr * attr)
145{ 166{
@@ -207,13 +228,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
207 if (ia_valid & ATTR_SIZE) 228 if (ia_valid & ATTR_SIZE)
208 down_write(&dentry->d_inode->i_alloc_sem); 229 down_write(&dentry->d_inode->i_alloc_sem);
209 230
210 if (inode->i_op && inode->i_op->setattr) { 231 if (inode->i_op->setattr)
211 error = inode->i_op->setattr(dentry, attr); 232 error = inode->i_op->setattr(dentry, attr);
212 } else { 233 else
213 error = inode_change_ok(inode, attr); 234 error = simple_setattr(dentry, attr);
214 if (!error)
215 error = inode_setattr(inode, attr);
216 }
217 235
218 if (ia_valid & ATTR_SIZE) 236 if (ia_valid & ATTR_SIZE)
219 up_write(&dentry->d_inode->i_alloc_sem); 237 up_write(&dentry->d_inode->i_alloc_sem);
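Editor's note: the net effect of the fs/attr.c rework is a three-step ->setattr recipe: check, truncate, copy. A minimal sketch of a filesystem ->setattr under the new helpers (hypothetical example, not part of this diff; it mirrors what simple_setattr does for in-core filesystems):

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* permission and limit checks, now including ATTR_SIZE */
		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if ((attr->ia_valid & ATTR_SIZE) &&
		    attr->ia_size != i_size_read(inode)) {
			error = vmtruncate(inode, attr->ia_size);
			if (error)
				return error;
		}

		/* copies uid/gid/mode/times; deliberately does not dirty the inode */
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}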
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..11b1ea786d00 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/param.h> 17#include <linux/param.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/compat.h>
19#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
20#include "autofs_i.h" 21#include "autofs_i.h"
21 22
@@ -25,12 +26,17 @@ static int autofs_root_symlink(struct inode *,struct dentry *,const char *);
25static int autofs_root_unlink(struct inode *,struct dentry *); 26static int autofs_root_unlink(struct inode *,struct dentry *);
26static int autofs_root_rmdir(struct inode *,struct dentry *); 27static int autofs_root_rmdir(struct inode *,struct dentry *);
27static int autofs_root_mkdir(struct inode *,struct dentry *,int); 28static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 29static long autofs_root_ioctl(struct file *,unsigned int,unsigned long);
30static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long);
29 31
30const struct file_operations autofs_root_operations = { 32const struct file_operations autofs_root_operations = {
33 .llseek = generic_file_llseek,
31 .read = generic_read_dir, 34 .read = generic_read_dir,
32 .readdir = autofs_root_readdir, 35 .readdir = autofs_root_readdir,
33 .ioctl = autofs_root_ioctl, 36 .unlocked_ioctl = autofs_root_ioctl,
37#ifdef CONFIG_COMPAT
38 .compat_ioctl = autofs_root_compat_ioctl,
39#endif
34}; 40};
35 41
36const struct inode_operations autofs_root_inode_operations = { 42const struct inode_operations autofs_root_inode_operations = {
@@ -491,6 +497,25 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
491} 497}
492 498
493/* Get/set timeout ioctl() operation */ 499/* Get/set timeout ioctl() operation */
500#ifdef CONFIG_COMPAT
501static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
502 unsigned int __user *p)
503{
504 unsigned long ntimeout;
505
506 if (get_user(ntimeout, p) ||
507 put_user(sbi->exp_timeout / HZ, p))
508 return -EFAULT;
509
510 if (ntimeout > UINT_MAX/HZ)
511 sbi->exp_timeout = 0;
512 else
513 sbi->exp_timeout = ntimeout * HZ;
514
515 return 0;
516}
517#endif
518
494static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, 519static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
495 unsigned long __user *p) 520 unsigned long __user *p)
496{ 521{
@@ -545,7 +570,7 @@ static inline int autofs_expire_run(struct super_block *sb,
 545 * ioctl()'s on the root directory are the chief method for the daemon to 570 * ioctl()'s on the root directory are the chief method for the daemon to
546 * generate kernel reactions 571 * generate kernel reactions
547 */ 572 */
548static int autofs_root_ioctl(struct inode *inode, struct file *filp, 573static int autofs_do_root_ioctl(struct inode *inode, struct file *filp,
549 unsigned int cmd, unsigned long arg) 574 unsigned int cmd, unsigned long arg)
550{ 575{
551 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); 576 struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
@@ -570,6 +595,10 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
570 return 0; 595 return 0;
571 case AUTOFS_IOC_PROTOVER: /* Get protocol version */ 596 case AUTOFS_IOC_PROTOVER: /* Get protocol version */
572 return autofs_get_protover(argp); 597 return autofs_get_protover(argp);
598#ifdef CONFIG_COMPAT
599 case AUTOFS_IOC_SETTIMEOUT32:
600 return autofs_compat_get_set_timeout(sbi, argp);
601#endif
573 case AUTOFS_IOC_SETTIMEOUT: 602 case AUTOFS_IOC_SETTIMEOUT:
574 return autofs_get_set_timeout(sbi, argp); 603 return autofs_get_set_timeout(sbi, argp);
575 case AUTOFS_IOC_EXPIRE: 604 case AUTOFS_IOC_EXPIRE:
@@ -578,4 +607,37 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
578 default: 607 default:
579 return -ENOSYS; 608 return -ENOSYS;
580 } 609 }
610
611}
612
613static long autofs_root_ioctl(struct file *filp,
614 unsigned int cmd, unsigned long arg)
615{
616 int ret;
617
618 lock_kernel();
619 ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode,
620 filp, cmd, arg);
621 unlock_kernel();
622
623 return ret;
624}
625
626#ifdef CONFIG_COMPAT
627static long autofs_root_compat_ioctl(struct file *filp,
628 unsigned int cmd, unsigned long arg)
629{
630 struct inode *inode = filp->f_path.dentry->d_inode;
631 int ret;
632
633 lock_kernel();
634 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
635 ret = autofs_do_root_ioctl(inode, filp, cmd, arg);
636 else
637 ret = autofs_do_root_ioctl(inode, filp, cmd,
638 (unsigned long)compat_ptr(arg));
639 unlock_kernel();
640
641 return ret;
581} 642}
643#endif
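Editor's note: the conversion above follows the standard .ioctl to .unlocked_ioctl recipe of this era: the inode argument disappears, so it is recovered from the file, and the BKL, previously taken implicitly by the VFS, is now taken explicitly around the old handler. A generic sketch of the pattern (names illustrative):

	static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
					   unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		long ret;

		lock_kernel();		/* keep the old implicit-BKL semantics */
		ret = example_do_ioctl(inode, filp, cmd, arg);
		unlock_kernel();

		return ret;
	}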
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 97{
98 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
99 99
100 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
101 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
107 if (!ads)
108 return ERR_PTR(-ENOMEM);
109
110 if (copy_from_user(ads, in, tmp.size)) {
111 kfree(ads);
112 return ERR_PTR(-EFAULT);
113 }
114
115 return ads;
116} 107}
117 108
118static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
736}; 727};
737 728
738static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
739 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
740 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
741 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
742}; 733};
743 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
744/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
745int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
746{ 740{
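Editor's note: memdup_user() collapses the open-coded kmalloc/copy_from_user/kfree sequence removed above, and it returns an ERR_PTR on failure, which matches this function's existing error convention. A minimal sketch of the idiom (uptr and size are illustrative):

	void *buf = memdup_user(uptr, size); /* ERR_PTR(-ENOMEM/-EFAULT) on failure */
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	/* ... use buf ... */
	kfree(buf);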
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..cb1bd38dc08c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,17 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/compat.h>
22#include <linux/smp_lock.h>
23
21#include "autofs_i.h" 24#include "autofs_i.h"
22 25
23static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
24static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
25static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
26static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 29static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
27static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 30static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
31static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
28static int autofs4_dir_open(struct inode *inode, struct file *file); 32static int autofs4_dir_open(struct inode *inode, struct file *file);
29static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 33static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
30static void *autofs4_follow_link(struct dentry *, struct nameidata *); 34static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +42,10 @@ const struct file_operations autofs4_root_operations = {
38 .read = generic_read_dir, 42 .read = generic_read_dir,
39 .readdir = dcache_readdir, 43 .readdir = dcache_readdir,
40 .llseek = dcache_dir_lseek, 44 .llseek = dcache_dir_lseek,
41 .ioctl = autofs4_root_ioctl, 45 .unlocked_ioctl = autofs4_root_ioctl,
46#ifdef CONFIG_COMPAT
47 .compat_ioctl = autofs4_root_compat_ioctl,
48#endif
42}; 49};
43 50
44const struct file_operations autofs4_dir_operations = { 51const struct file_operations autofs4_dir_operations = {
@@ -197,8 +204,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
197 } 204 }
198 205
199 /* Initialize expiry counter after successful mount */ 206 /* Initialize expiry counter after successful mount */
200 if (ino) 207 ino->last_used = jiffies;
201 ino->last_used = jiffies;
202 208
203 spin_lock(&sbi->fs_lock); 209 spin_lock(&sbi->fs_lock);
204 ino->flags &= ~AUTOFS_INF_PENDING; 210 ino->flags &= ~AUTOFS_INF_PENDING;
@@ -839,6 +845,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
839} 845}
840 846
841/* Get/set timeout ioctl() operation */ 847/* Get/set timeout ioctl() operation */
848#ifdef CONFIG_COMPAT
849static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
850 compat_ulong_t __user *p)
851{
852 int rv;
853 unsigned long ntimeout;
854
855 if ((rv = get_user(ntimeout, p)) ||
856 (rv = put_user(sbi->exp_timeout/HZ, p)))
857 return rv;
858
859 if (ntimeout > UINT_MAX/HZ)
860 sbi->exp_timeout = 0;
861 else
862 sbi->exp_timeout = ntimeout * HZ;
863
864 return 0;
865}
866#endif
867
842static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, 868static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
843 unsigned long __user *p) 869 unsigned long __user *p)
844{ 870{
@@ -902,8 +928,8 @@ int is_autofs4_dentry(struct dentry *dentry)
 902 * ioctl()'s on the root directory are the chief method for the daemon to 928 * ioctl()'s on the root directory are the chief method for the daemon to
903 * generate kernel reactions 929 * generate kernel reactions
904 */ 930 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 931static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 932 unsigned int cmd, unsigned long arg)
907{ 933{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 934 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 935 void __user *p = (void __user *)arg;
@@ -932,6 +958,10 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
932 return autofs4_get_protosubver(sbi, p); 958 return autofs4_get_protosubver(sbi, p);
933 case AUTOFS_IOC_SETTIMEOUT: 959 case AUTOFS_IOC_SETTIMEOUT:
934 return autofs4_get_set_timeout(sbi, p); 960 return autofs4_get_set_timeout(sbi, p);
961#ifdef CONFIG_COMPAT
962 case AUTOFS_IOC_SETTIMEOUT32:
963 return autofs4_compat_get_set_timeout(sbi, p);
964#endif
935 965
936 case AUTOFS_IOC_ASKUMOUNT: 966 case AUTOFS_IOC_ASKUMOUNT:
937 return autofs4_ask_umount(filp->f_path.mnt, p); 967 return autofs4_ask_umount(filp->f_path.mnt, p);
@@ -947,3 +977,35 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 977 return -ENOSYS;
948 } 978 }
949} 979}
980
981static long autofs4_root_ioctl(struct file *filp,
982 unsigned int cmd, unsigned long arg)
983{
984 long ret;
985 struct inode *inode = filp->f_dentry->d_inode;
986
987 lock_kernel();
988 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
989 unlock_kernel();
990
991 return ret;
992}
993
994#ifdef CONFIG_COMPAT
995static long autofs4_root_compat_ioctl(struct file *filp,
996 unsigned int cmd, unsigned long arg)
997{
998 struct inode *inode = filp->f_path.dentry->d_inode;
999 int ret;
1000
1001 lock_kernel();
1002 if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
1003 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
1004 else
1005 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
1006 (unsigned long)compat_ptr(arg));
1007 unlock_kernel();
1008
1009 return ret;
1010}
1011#endif
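Editor's note on why the compat handler above rewrites arg for most commands: a 32-bit process passes pointer arguments as 32-bit values, and compat_ptr() widens them correctly before the native handler dereferences them, while integer-carrying commands (AUTOFS_IOC_READY/AUTOFS_IOC_FAIL carry a wait-queue token) must pass through untouched. A condensed sketch of the dispatch (EXAMPLE_IOC_TOKEN is a hypothetical integer-argument command):

	#ifdef CONFIG_COMPAT
	static long example_compat_ioctl(struct file *filp, unsigned int cmd,
					 unsigned long arg)
	{
		/* pointer-taking commands need the 32->64 bit fixup */
		if (cmd != EXAMPLE_IOC_TOKEN)
			arg = (unsigned long)compat_ptr(arg);
		return example_unlocked_ioctl(filp, cmd, arg);
	}
	#endif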
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..f024d8aaddef 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -55,12 +55,6 @@ static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
55 return POLLERR; 55 return POLLERR;
56} 56}
57 57
58static int bad_file_ioctl (struct inode *inode, struct file *filp,
59 unsigned int cmd, unsigned long arg)
60{
61 return -EIO;
62}
63
64static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, 58static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
65 unsigned long arg) 59 unsigned long arg)
66{ 60{
@@ -93,8 +87,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 87 return -EIO;
94} 88}
95 89
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 90static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 91{
99 return -EIO; 92 return -EIO;
100} 93}
@@ -160,7 +153,6 @@ static const struct file_operations bad_file_ops =
160 .aio_write = bad_file_aio_write, 153 .aio_write = bad_file_aio_write,
161 .readdir = bad_file_readdir, 154 .readdir = bad_file_readdir,
162 .poll = bad_file_poll, 155 .poll = bad_file_poll,
163 .ioctl = bad_file_ioctl,
164 .unlocked_ioctl = bad_file_unlocked_ioctl, 156 .unlocked_ioctl = bad_file_unlocked_ioctl,
165 .compat_ioctl = bad_file_compat_ioctl, 157 .compat_ioctl = bad_file_compat_ioctl,
166 .mmap = bad_file_mmap, 158 .mmap = bad_file_mmap,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 34ddda888e63..dc39d2824885 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -436,7 +436,7 @@ befs_init_inodecache(void)
436 init_once); 436 init_once);
437 if (befs_inode_cachep == NULL) { 437 if (befs_inode_cachep == NULL) {
438 printk(KERN_ERR "befs_init_inodecache: " 438 printk(KERN_ERR "befs_init_inodecache: "
439 "Couldn't initalize inode slabcache\n"); 439 "Couldn't initialize inode slabcache\n");
440 return -ENOMEM; 440 return -ENOMEM;
441 } 441 }
442 442
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 7109e451abf7..f7f87e233dd9 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -17,7 +17,6 @@ struct bfs_sb_info {
17 unsigned long si_lf_eblk; 17 unsigned long si_lf_eblk;
18 unsigned long si_lasti; 18 unsigned long si_lasti;
19 unsigned long *si_imap; 19 unsigned long *si_imap;
20 struct buffer_head *si_sbh; /* buffer header w/superblock */
21 struct mutex bfs_lock; 20 struct mutex bfs_lock;
22}; 21};
23 22
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 88b9a3ff44e4..eb67edd0f8ea 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -70,7 +70,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
70 struct super_block *sb = inode->i_sb; 70 struct super_block *sb = inode->i_sb;
71 struct bfs_sb_info *info = BFS_SB(sb); 71 struct bfs_sb_info *info = BFS_SB(sb);
72 struct bfs_inode_info *bi = BFS_I(inode); 72 struct bfs_inode_info *bi = BFS_I(inode);
73 struct buffer_head *sbh = info->si_sbh;
74 73
75 phys = bi->i_sblock + block; 74 phys = bi->i_sblock + block;
76 if (!create) { 75 if (!create) {
@@ -112,7 +111,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
112 info->si_freeb -= phys - bi->i_eblock; 111 info->si_freeb -= phys - bi->i_eblock;
113 info->si_lf_eblk = bi->i_eblock = phys; 112 info->si_lf_eblk = bi->i_eblock = phys;
114 mark_inode_dirty(inode); 113 mark_inode_dirty(inode);
115 mark_buffer_dirty(sbh);
116 err = 0; 114 err = 0;
117 goto out; 115 goto out;
118 } 116 }
@@ -147,7 +145,6 @@ static int bfs_get_block(struct inode *inode, sector_t block,
147 */ 145 */
148 info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; 146 info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks;
149 mark_inode_dirty(inode); 147 mark_inode_dirty(inode);
150 mark_buffer_dirty(sbh);
151 map_bh(bh_result, sb, phys); 148 map_bh(bh_result, sb, phys);
152out: 149out:
153 mutex_unlock(&info->bfs_lock); 150 mutex_unlock(&info->bfs_lock);
@@ -168,9 +165,17 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
168 loff_t pos, unsigned len, unsigned flags, 165 loff_t pos, unsigned len, unsigned flags,
169 struct page **pagep, void **fsdata) 166 struct page **pagep, void **fsdata)
170{ 167{
171 *pagep = NULL; 168 int ret;
172 return block_write_begin(file, mapping, pos, len, flags, 169
173 pagep, fsdata, bfs_get_block); 170 ret = block_write_begin(mapping, pos, len, flags, pagep,
171 bfs_get_block);
172 if (unlikely(ret)) {
173 loff_t isize = mapping->host->i_size;
174 if (pos + len > isize)
175 vmtruncate(mapping->host, isize);
176 }
177
178 return ret;
174} 179}
175 180
176static sector_t bfs_bmap(struct address_space *mapping, sector_t block) 181static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index f22a7d3dc362..c4daf0f5fc02 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -31,7 +31,6 @@ MODULE_LICENSE("GPL");
31#define dprintf(x...) 31#define dprintf(x...)
32#endif 32#endif
33 33
34static void bfs_write_super(struct super_block *s);
35void dump_imap(const char *prefix, struct super_block *s); 34void dump_imap(const char *prefix, struct super_block *s);
36 35
37struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 36struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -99,6 +98,24 @@ error:
99 return ERR_PTR(-EIO); 98 return ERR_PTR(-EIO);
100} 99}
101 100
101static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p)
102{
103 if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) {
104 printf("Bad inode number %s:%08x\n", sb->s_id, ino);
105 return ERR_PTR(-EIO);
106 }
107
108 ino -= BFS_ROOT_INO;
109
110 *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK);
111 if (!*p) {
112 printf("Unable to read inode %s:%08x\n", sb->s_id, ino);
113 return ERR_PTR(-EIO);
114 }
115
116 return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK;
117}
118
102static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) 119static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
103{ 120{
104 struct bfs_sb_info *info = BFS_SB(inode->i_sb); 121 struct bfs_sb_info *info = BFS_SB(inode->i_sb);
@@ -106,28 +123,15 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
106 unsigned long i_sblock; 123 unsigned long i_sblock;
107 struct bfs_inode *di; 124 struct bfs_inode *di;
108 struct buffer_head *bh; 125 struct buffer_head *bh;
109 int block, off;
110 int err = 0; 126 int err = 0;
111 127
112 dprintf("ino=%08x\n", ino); 128 dprintf("ino=%08x\n", ino);
113 129
114 if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { 130 di = find_inode(inode->i_sb, ino, &bh);
115 printf("Bad inode number %s:%08x\n", inode->i_sb->s_id, ino); 131 if (IS_ERR(di))
116 return -EIO; 132 return PTR_ERR(di);
117 }
118 133
119 mutex_lock(&info->bfs_lock); 134 mutex_lock(&info->bfs_lock);
120 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
121 bh = sb_bread(inode->i_sb, block);
122 if (!bh) {
123 printf("Unable to read inode %s:%08x\n",
124 inode->i_sb->s_id, ino);
125 mutex_unlock(&info->bfs_lock);
126 return -EIO;
127 }
128
129 off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
130 di = (struct bfs_inode *)bh->b_data + off;
131 135
132 if (ino == BFS_ROOT_INO) 136 if (ino == BFS_ROOT_INO)
133 di->i_vtype = cpu_to_le32(BFS_VDIR); 137 di->i_vtype = cpu_to_le32(BFS_VDIR);
@@ -158,12 +162,11 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
158 return err; 162 return err;
159} 163}
160 164
161static void bfs_delete_inode(struct inode *inode) 165static void bfs_evict_inode(struct inode *inode)
162{ 166{
163 unsigned long ino = inode->i_ino; 167 unsigned long ino = inode->i_ino;
164 struct bfs_inode *di; 168 struct bfs_inode *di;
165 struct buffer_head *bh; 169 struct buffer_head *bh;
166 int block, off;
167 struct super_block *s = inode->i_sb; 170 struct super_block *s = inode->i_sb;
168 struct bfs_sb_info *info = BFS_SB(s); 171 struct bfs_sb_info *info = BFS_SB(s);
169 struct bfs_inode_info *bi = BFS_I(inode); 172 struct bfs_inode_info *bi = BFS_I(inode);
@@ -171,28 +174,19 @@ static void bfs_delete_inode(struct inode *inode)
171 dprintf("ino=%08lx\n", ino); 174 dprintf("ino=%08lx\n", ino);
172 175
173 truncate_inode_pages(&inode->i_data, 0); 176 truncate_inode_pages(&inode->i_data, 0);
177 invalidate_inode_buffers(inode);
178 end_writeback(inode);
174 179
175 if ((ino < BFS_ROOT_INO) || (ino > info->si_lasti)) { 180 if (inode->i_nlink)
176 printf("invalid ino=%08lx\n", ino);
177 return; 181 return;
178 }
179
180 inode->i_size = 0;
181 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
182 mutex_lock(&info->bfs_lock);
183 mark_inode_dirty(inode);
184 182
185 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; 183 di = find_inode(s, inode->i_ino, &bh);
186 bh = sb_bread(s, block); 184 if (IS_ERR(di))
187 if (!bh) {
188 printf("Unable to read inode %s:%08lx\n",
189 inode->i_sb->s_id, ino);
190 mutex_unlock(&info->bfs_lock);
191 return; 185 return;
192 } 186
193 off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; 187 mutex_lock(&info->bfs_lock);
194 di = (struct bfs_inode *)bh->b_data + off; 188 /* clear on-disk inode */
195 memset((void *)di, 0, sizeof(struct bfs_inode)); 189 memset(di, 0, sizeof(struct bfs_inode));
196 mark_buffer_dirty(bh); 190 mark_buffer_dirty(bh);
197 brelse(bh); 191 brelse(bh);
198 192
@@ -209,32 +203,9 @@ static void bfs_delete_inode(struct inode *inode)
209 * "last block of the last file" even if there is no 203 * "last block of the last file" even if there is no
210 * real file there, saves us 1 gap. 204 * real file there, saves us 1 gap.
211 */ 205 */
212 if (info->si_lf_eblk == bi->i_eblock) { 206 if (info->si_lf_eblk == bi->i_eblock)
213 info->si_lf_eblk = bi->i_sblock - 1; 207 info->si_lf_eblk = bi->i_sblock - 1;
214 mark_buffer_dirty(info->si_sbh);
215 }
216 mutex_unlock(&info->bfs_lock); 208 mutex_unlock(&info->bfs_lock);
217 clear_inode(inode);
218}
219
220static int bfs_sync_fs(struct super_block *sb, int wait)
221{
222 struct bfs_sb_info *info = BFS_SB(sb);
223
224 mutex_lock(&info->bfs_lock);
225 mark_buffer_dirty(info->si_sbh);
226 sb->s_dirt = 0;
227 mutex_unlock(&info->bfs_lock);
228
229 return 0;
230}
231
232static void bfs_write_super(struct super_block *sb)
233{
234 if (!(sb->s_flags & MS_RDONLY))
235 bfs_sync_fs(sb, 1);
236 else
237 sb->s_dirt = 0;
238} 209}
239 210
240static void bfs_put_super(struct super_block *s) 211static void bfs_put_super(struct super_block *s)
@@ -246,10 +217,6 @@ static void bfs_put_super(struct super_block *s)
246 217
247 lock_kernel(); 218 lock_kernel();
248 219
249 if (s->s_dirt)
250 bfs_write_super(s);
251
252 brelse(info->si_sbh);
253 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
254 kfree(info->si_imap); 221 kfree(info->si_imap);
255 kfree(info); 222 kfree(info);
@@ -319,10 +286,8 @@ static const struct super_operations bfs_sops = {
319 .alloc_inode = bfs_alloc_inode, 286 .alloc_inode = bfs_alloc_inode,
320 .destroy_inode = bfs_destroy_inode, 287 .destroy_inode = bfs_destroy_inode,
321 .write_inode = bfs_write_inode, 288 .write_inode = bfs_write_inode,
322 .delete_inode = bfs_delete_inode, 289 .evict_inode = bfs_evict_inode,
323 .put_super = bfs_put_super, 290 .put_super = bfs_put_super,
324 .write_super = bfs_write_super,
325 .sync_fs = bfs_sync_fs,
326 .statfs = bfs_statfs, 291 .statfs = bfs_statfs,
327}; 292};
328 293
@@ -349,7 +314,7 @@ void dump_imap(const char *prefix, struct super_block *s)
349 314
350static int bfs_fill_super(struct super_block *s, void *data, int silent) 315static int bfs_fill_super(struct super_block *s, void *data, int silent)
351{ 316{
352 struct buffer_head *bh; 317 struct buffer_head *bh, *sbh;
353 struct bfs_super_block *bfs_sb; 318 struct bfs_super_block *bfs_sb;
354 struct inode *inode; 319 struct inode *inode;
355 unsigned i, imap_len; 320 unsigned i, imap_len;
@@ -365,10 +330,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
365 330
366 sb_set_blocksize(s, BFS_BSIZE); 331 sb_set_blocksize(s, BFS_BSIZE);
367 332
368 info->si_sbh = sb_bread(s, 0); 333 sbh = sb_bread(s, 0);
369 if (!info->si_sbh) 334 if (!sbh)
370 goto out; 335 goto out;
371 bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; 336 bfs_sb = (struct bfs_super_block *)sbh->b_data;
372 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { 337 if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) {
373 if (!silent) 338 if (!silent)
374 printf("No BFS filesystem on %s (magic=%08x)\n", 339 printf("No BFS filesystem on %s (magic=%08x)\n",
@@ -472,10 +437,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
472 info->si_lf_eblk = eblock; 437 info->si_lf_eblk = eblock;
473 } 438 }
474 brelse(bh); 439 brelse(bh);
475 if (!(s->s_flags & MS_RDONLY)) { 440 brelse(sbh);
476 mark_buffer_dirty(info->si_sbh);
477 s->s_dirt = 1;
478 }
479 dump_imap("read_super", s); 441 dump_imap("read_super", s);
480 return 0; 442 return 0;
481 443
@@ -485,7 +447,7 @@ out3:
485out2: 447out2:
486 kfree(info->si_imap); 448 kfree(info->si_imap);
487out1: 449out1:
488 brelse(info->si_sbh); 450 brelse(sbh);
489out: 451out:
490 mutex_destroy(&info->bfs_lock); 452 mutex_destroy(&info->bfs_lock);
491 kfree(info); 453 kfree(info);
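Editor's note: the bfs changes above are one instance of the tree-wide delete_inode/clear_inode to evict_inode conversion in this merge. The single callback now serves both the final-unlink and plain-eviction paths, distinguishes them via i_nlink, and must call end_writeback() itself. A generic skeleton of the pattern (illustrative):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);
		invalidate_inode_buffers(inode);
		end_writeback(inode);	/* previously done for us by the VFS */

		if (inode->i_nlink)
			return;		/* plain eviction: nothing on disk to free */

		/* ... old delete_inode body: free on-disk inode and blocks ... */
	}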
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
134 if (!dump_write(file, dump_start, dump_size)) 134 if (!dump_write(file, dump_start, dump_size))
135 goto end_coredump; 135 goto end_coredump;
136 } 136 }
137/* Finally dump the task struct. Not be used by gdb, but could be useful */
138 set_fs(KERNEL_DS);
139 if (!dump_write(file, current, sizeof(*current)))
140 goto end_coredump;
141end_coredump: 137end_coredump:
142 set_fs(fs); 138 set_fs(fs);
143 return has_dumped; 139 return has_dumped;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
990 990
991 /* clear any space allocated but not loaded */ 991 /* clear any space allocated but not loaded */
992 if (phdr->p_filesz < phdr->p_memsz) { 992 if (phdr->p_filesz < phdr->p_memsz) {
993 ret = clear_user((void *) (seg->addr + phdr->p_filesz), 993 if (clear_user((void *) (seg->addr + phdr->p_filesz),
994 phdr->p_memsz - phdr->p_filesz); 994 phdr->p_memsz - phdr->p_filesz))
995 if (ret) 995 return -EFAULT;
996 return ret;
997 } 996 }
998 997
999 if (mm) { 998 if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1027 struct elf32_fdpic_loadseg *seg; 1026 struct elf32_fdpic_loadseg *seg;
1028 struct elf32_phdr *phdr; 1027 struct elf32_phdr *phdr;
1029 unsigned long load_addr, delta_vaddr; 1028 unsigned long load_addr, delta_vaddr;
1030 int loop, dvset, ret; 1029 int loop, dvset;
1031 1030
1032 load_addr = params->load_addr; 1031 load_addr = params->load_addr;
1033 delta_vaddr = 0; 1032 delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1127 * PT_LOAD */ 1126 * PT_LOAD */
1128 if (prot & PROT_WRITE && disp > 0) { 1127 if (prot & PROT_WRITE && disp > 0) {
1129 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); 1128 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
1130 ret = clear_user((void __user *) maddr, disp); 1129 if (clear_user((void __user *) maddr, disp))
1131 if (ret) 1130 return -EFAULT;
1132 return ret;
1133 maddr += disp; 1131 maddr += disp;
1134 } 1132 }
1135 1133
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1164 if (prot & PROT_WRITE && excess1 > 0) { 1162 if (prot & PROT_WRITE && excess1 > 0) {
1165 kdebug("clear[%d] ad=%lx sz=%lx", 1163 kdebug("clear[%d] ad=%lx sz=%lx",
1166 loop, maddr + phdr->p_filesz, excess1); 1164 loop, maddr + phdr->p_filesz, excess1);
1167 ret = clear_user((void __user *) maddr + phdr->p_filesz, 1165 if (clear_user((void __user *) maddr + phdr->p_filesz,
1168 excess1); 1166 excess1))
1169 if (ret) 1167 return -EFAULT;
1170 return ret;
1171 } 1168 }
1172 1169
1173#else 1170#else
1174 if (excess > 0) { 1171 if (excess > 0) {
1175 kdebug("clear[%d] ad=%lx sz=%lx", 1172 kdebug("clear[%d] ad=%lx sz=%lx",
1176 loop, maddr + phdr->p_filesz, excess); 1173 loop, maddr + phdr->p_filesz, excess);
1177 ret = clear_user((void *) maddr + phdr->p_filesz, excess); 1174 if (clear_user((void *) maddr + phdr->p_filesz, excess))
1178 if (ret) 1175 return -EFAULT;
1179 return ret;
1180 } 1176 }
1181#endif 1177#endif
1182 1178
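Editor's note: the binfmt_elf_fdpic fixes above all correct the same bug. clear_user() returns the number of bytes it failed to clear, not a -errno, so propagating its return value handed callers a positive byte count. The corrected idiom (uaddr/len illustrative):

	if (clear_user(uaddr, len))	/* non-zero means bytes left uncleared */
		return -EFAULT;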
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..811384bec8de 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,16 +56,19 @@
56#endif 56#endif
57 57
58/* 58/*
59 * User data (stack, data section and bss) needs to be aligned 59 * User data (data section and bss) needs to be aligned.
60 * for the same reasons as SLAB memory is, and to the same amount. 60 * We pick 0x20 here because it is the max value elf2flt has always
61 * Avoid duplicating architecture specific code by using the same 61 * used in producing FLAT files, and because it seems to be large
62 * macro as with SLAB allocation: 62 * enough to make all the gcc alignment related tests happy.
63 */ 63 */
64#ifdef ARCH_SLAB_MINALIGN 64#define FLAT_DATA_ALIGN (0x20)
65#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) 65
66#else 66/*
67#define FLAT_DATA_ALIGN (sizeof(void *)) 67 * User data (stack) also needs to be aligned.
68#endif 68 * Here we can be a bit looser than the data sections since this
69 * needs to only meet arch ABI requirements.
70 */
71#define FLAT_STACK_ALIGN max_t(unsigned long, sizeof(void *), ARCH_SLAB_MINALIGN)
69 72
70#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ 73#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
71#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ 74#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
@@ -129,7 +132,7 @@ static unsigned long create_flat_tables(
129 132
130 sp = (unsigned long *)p; 133 sp = (unsigned long *)p;
131 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 134 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
132 sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); 135 sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
133 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 136 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
134 envp = argv + (argc + 1); 137 envp = argv + (argc + 1);
135 138
@@ -589,7 +592,7 @@ static int load_flat_file(struct linux_binprm * bprm,
589 if (IS_ERR_VALUE(result)) { 592 if (IS_ERR_VALUE(result)) {
590 printk("Unable to read data+bss, errno %d\n", (int)-result); 593 printk("Unable to read data+bss, errno %d\n", (int)-result);
591 do_munmap(current->mm, textpos, text_len); 594 do_munmap(current->mm, textpos, text_len);
592 do_munmap(current->mm, realdatastart, data_len + extra); 595 do_munmap(current->mm, realdatastart, len);
593 ret = result; 596 ret = result;
594 goto err; 597 goto err;
595 } 598 }
@@ -876,7 +879,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
876 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ 879 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
877 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ 880 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
878 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ 881 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
879 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 882 stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
880 883
881 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 884 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
882 if (IS_ERR_VALUE(res)) 885 if (IS_ERR_VALUE(res))
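Editor's note: both rounding sites above use the usual align-down idiom, which relies on the alignment being a power of two. A worked example with an assumed alignment of 32:

	unsigned long sp = 0x1003f;
	sp &= -32UL;	/* same as sp & ~31UL -> 0x10020, aligned down to 32 bytes */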
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e83537ead7..fd0cc0bf9a40 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
108 Node *fmt; 108 Node *fmt;
109 struct file * interp_file = NULL; 109 struct file * interp_file = NULL;
110 char iname[BINPRM_BUF_SIZE]; 110 char iname[BINPRM_BUF_SIZE];
111 char *iname_addr = iname; 111 const char *iname_addr = iname;
112 int retval; 112 int retval;
113 int fd_binary = -1; 113 int fd_binary = -1;
114 114
@@ -502,8 +502,9 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
502 return inode; 502 return inode;
503} 503}
504 504
505static void bm_clear_inode(struct inode *inode) 505static void bm_evict_inode(struct inode *inode)
506{ 506{
507 end_writeback(inode);
507 kfree(inode->i_private); 508 kfree(inode->i_private);
508} 509}
509 510
@@ -685,7 +686,7 @@ static const struct file_operations bm_status_operations = {
685 686
686static const struct super_operations s_ops = { 687static const struct super_operations s_ops = {
687 .statfs = simple_statfs, 688 .statfs = simple_statfs,
688 .clear_inode = bm_clear_inode, 689 .evict_inode = bm_evict_inode,
689}; 690};
690 691
691static int bm_fill_super(struct super_block * sb, void * data, int silent) 692static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -723,7 +724,7 @@ static int __init init_misc_binfmt(void)
723{ 724{
724 int err = register_filesystem(&bm_fs_type); 725 int err = register_filesystem(&bm_fs_type);
725 if (!err) { 726 if (!err) {
726 err = register_binfmt(&misc_format); 727 err = insert_binfmt(&misc_format);
727 if (err) 728 if (err)
728 unregister_filesystem(&bm_fs_type); 729 unregister_filesystem(&bm_fs_type);
729 } 730 }
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb22..396a9884591f 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
16 16
17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) 17static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
18{ 18{
19 char *cp, *i_name, *i_arg; 19 const char *i_arg, *i_name;
20 char *cp;
20 struct file *file; 21 struct file *file;
21 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
22 int retval; 23 int retval;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 612a5c38d3c1..4d0ff5ee27b8 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio)
413 413
414 /* Allocate kernel buffer for protection data */ 414 /* Allocate kernel buffer for protection data */
415 len = sectors * blk_integrity_tuple_size(bi); 415 len = sectors * blk_integrity_tuple_size(bi);
416 buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); 416 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
417 if (unlikely(buf == NULL)) { 417 if (unlikely(buf == NULL)) {
418 printk(KERN_ERR "could not allocate integrity buffer\n"); 418 printk(KERN_ERR "could not allocate integrity buffer\n");
419 return -EIO; 419 return -ENOMEM;
420 } 420 }
421 421
422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 422 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
diff --git a/fs/bio.c b/fs/bio.c
index e7bf6ca64dcf..8abb2dfb2e7c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
843 if (!bio) 843 if (!bio)
844 goto out_bmd; 844 goto out_bmd;
845 845
846 bio->bi_rw |= (!write_to_vm << BIO_RW); 846 if (!write_to_vm)
847 bio->bi_rw |= REQ_WRITE;
847 848
848 ret = 0; 849 ret = 0;
849 850
@@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
1024 * set data direction, and check if mapped pages need bouncing 1025 * set data direction, and check if mapped pages need bouncing
1025 */ 1026 */
1026 if (!write_to_vm) 1027 if (!write_to_vm)
1027 bio->bi_rw |= (1 << BIO_RW); 1028 bio->bi_rw |= REQ_WRITE;
1028 1029
1029 bio->bi_bdev = bdev; 1030 bio->bi_bdev = bdev;
1030 bio->bi_flags |= (1 << BIO_USER_MAPPED); 1031 bio->bi_flags |= (1 << BIO_USER_MAPPED);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..50e8c8582faa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 nr_segs, blkdev_get_blocks, NULL, NULL, 0);
177} 177}
178 178
179int __sync_blockdev(struct block_device *bdev, int wait) 179int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 245 sb = get_active_super(bdev);
246 if (!sb) 246 if (!sb)
247 goto out; 247 goto out;
248 if (sb->s_flags & MS_RDONLY) { 248 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 249 if (error) {
250 up_write(&sb->s_umount); 250 deactivate_super(sb);
251 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 252 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 253 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 254 }
277 up_write(&sb->s_umount); 255 deactivate_super(sb);
278
279 out: 256 out:
280 sync_blockdev(bdev); 257 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 258 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 273
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 274 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 275 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 276 goto out;
300 277
301 error = 0; 278 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 279 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 280 goto out;
304 281
305 if (!sb) 282 if (!sb)
306 goto out_unlock; 283 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 284
330 if (sb) 285 error = thaw_super(sb);
331 deactivate_locked_super(sb); 286 if (error) {
332out_unlock: 287 bdev->bd_fsfreeze_count++;
288 mutex_unlock(&bdev->bd_fsfreeze_mutex);
289 return error;
290 }
291out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 292 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 293 return 0;
335} 294}
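Editor's note: after this change freeze_bdev()/thaw_bdev() are thin wrappers that delegate the s_frozen state machine to freeze_super()/thaw_super() instead of driving it by hand. Caller-side usage is unchanged; a minimal sketch (illustrative, e.g. a snapshot path; do_snapshot is hypothetical):

	static int example_snapshot(struct block_device *bdev)
	{
		struct super_block *sb;
		int err;

		sb = freeze_bdev(bdev);	/* quiesce writes; NULL if nothing mounted */
		if (IS_ERR(sb))
			return PTR_ERR(sb);

		err = do_snapshot(bdev);	/* hypothetical device-level copy */

		thaw_bdev(bdev, sb);	/* resumes writes; handles sb == NULL */
		return err;
	}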
@@ -349,9 +308,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
349 loff_t pos, unsigned len, unsigned flags, 308 loff_t pos, unsigned len, unsigned flags,
350 struct page **pagep, void **fsdata) 309 struct page **pagep, void **fsdata)
351{ 310{
352 *pagep = NULL; 311 return block_write_begin(mapping, pos, len, flags, pagep,
353 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 312 blkdev_get_block);
354 blkdev_get_block);
355} 313}
356 314
357static int blkdev_write_end(struct file *file, struct address_space *mapping, 315static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -399,12 +357,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
399 return retval; 357 return retval;
400} 358}
401 359
402/* 360int blkdev_fsync(struct file *filp, int datasync)
403 * Filp is never NULL; the only case when ->fsync() is called with
404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
405 */
406
407int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
408{ 361{
409 struct inode *bd_inode = filp->f_mapping->host; 362 struct inode *bd_inode = filp->f_mapping->host;
410 struct block_device *bdev = I_BDEV(bd_inode); 363 struct block_device *bdev = I_BDEV(bd_inode);
@@ -417,7 +370,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 370 */
418 mutex_unlock(&bd_inode->i_mutex); 371 mutex_unlock(&bd_inode->i_mutex);
419 372
420 error = blkdev_issue_flush(bdev, NULL); 373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 374 if (error == -EOPNOTSUPP)
422 error = 0; 375 error = 0;
423 376
@@ -473,10 +426,13 @@ static inline void __bd_forget(struct inode *inode)
473 inode->i_mapping = &inode->i_data; 426 inode->i_mapping = &inode->i_data;
474} 427}
475 428
476static void bdev_clear_inode(struct inode *inode) 429static void bdev_evict_inode(struct inode *inode)
477{ 430{
478 struct block_device *bdev = &BDEV_I(inode)->bdev; 431 struct block_device *bdev = &BDEV_I(inode)->bdev;
479 struct list_head *p; 432 struct list_head *p;
433 truncate_inode_pages(&inode->i_data, 0);
434 invalidate_inode_buffers(inode); /* is it needed here? */
435 end_writeback(inode);
480 spin_lock(&bdev_lock); 436 spin_lock(&bdev_lock);
481 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 437 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
482 __bd_forget(list_entry(p, struct inode, i_devices)); 438 __bd_forget(list_entry(p, struct inode, i_devices));
@@ -490,7 +446,7 @@ static const struct super_operations bdev_sops = {
490 .alloc_inode = bdev_alloc_inode, 446 .alloc_inode = bdev_alloc_inode,
491 .destroy_inode = bdev_destroy_inode, 447 .destroy_inode = bdev_destroy_inode,
492 .drop_inode = generic_delete_inode, 448 .drop_inode = generic_delete_inode,
493 .clear_inode = bdev_clear_inode, 449 .evict_inode = bdev_evict_inode,
494}; 450};
495 451
496static int bd_get_sb(struct file_system_type *fs_type, 452static int bd_get_sb(struct file_system_type *fs_type,
@@ -668,41 +624,233 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 624 iput(bdev->bd_inode);
669} 625}
670 626
671int bd_claim(struct block_device *bdev, void *holder) 627/**
628 * bd_may_claim - test whether a block device can be claimed
629 * @bdev: block device of interest
630 * @whole: whole block device containing @bdev, may equal @bdev
631 * @holder: holder trying to claim @bdev
632 *
 633 * Test whether @bdev can be claimed by @holder.
634 *
635 * CONTEXT:
636 * spin_lock(&bdev_lock).
637 *
638 * RETURNS:
639 * %true if @bdev can be claimed, %false otherwise.
640 */
641static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
642 void *holder)
672{ 643{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 644 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 645 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 646 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 647 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 648 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 649 return true; /* is a whole device which isn't held */
683 650
684 else if (bdev->bd_contains->bd_holder == bd_claim) 651 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 652 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 653 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 654 return false; /* is a partition of a held device */
688 else 655 else
689 res = 0; /* is a partition of an un-held device */ 656 return true; /* is a partition of an un-held device */
657}
690 658
691 /* now impose change */ 659/**
692 if (res==0) { 660 * bd_prepare_to_claim - prepare to claim a block device
693 /* note that for a whole device bd_holders 661 * @bdev: block device of interest
694 * will be incremented twice, and bd_holder will 662 * @whole: the whole device containing @bdev, may equal @bdev
695 * be set to bd_claim before being set to holder 663 * @holder: holder trying to claim @bdev
696 */ 664 *
697 bdev->bd_contains->bd_holders ++; 665 * Prepare to claim @bdev. This function fails if @bdev is already
698 bdev->bd_contains->bd_holder = bd_claim; 666 * claimed by another holder and waits if another claiming is in
699 bdev->bd_holders++; 667 * progress. This function doesn't actually claim. On successful
700 bdev->bd_holder = holder; 668 * return, the caller has ownership of bd_claiming and bd_holder[s].
669 *
670 * CONTEXT:
671 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
672 * it multiple times.
673 *
674 * RETURNS:
675 * 0 if @bdev can be claimed, -EBUSY otherwise.
676 */
677static int bd_prepare_to_claim(struct block_device *bdev,
678 struct block_device *whole, void *holder)
679{
680retry:
681 /* if someone else claimed, fail */
682 if (!bd_may_claim(bdev, whole, holder))
683 return -EBUSY;
684
685 /* if claiming is already in progress, wait for it to finish */
686 if (whole->bd_claiming) {
687 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
688 DEFINE_WAIT(wait);
689
690 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
691 spin_unlock(&bdev_lock);
692 schedule();
693 finish_wait(wq, &wait);
694 spin_lock(&bdev_lock);
695 goto retry;
696 }
697
698 /* yay, all mine */
699 return 0;
700}
701
702/**
703 * bd_start_claiming - start claiming a block device
704 * @bdev: block device of interest
705 * @holder: holder trying to claim @bdev
706 *
707 * @bdev is about to be opened exclusively. Check @bdev can be opened
708 * exclusively and mark that an exclusive open is in progress. Each
709 * successful call to this function must be matched with a call to
710 * either bd_finish_claiming() or bd_abort_claiming() (which do not
711 * fail).
712 *
713 * This function is used to gain exclusive access to the block device
714 * without actually causing other exclusive open attempts to fail. It
715 * should be used when the open sequence itself requires exclusive
716 * access but may subsequently fail.
717 *
718 * CONTEXT:
719 * Might sleep.
720 *
721 * RETURNS:
722 * Pointer to the block device containing @bdev on success, ERR_PTR()
723 * value on failure.
724 */
725static struct block_device *bd_start_claiming(struct block_device *bdev,
726 void *holder)
727{
728 struct gendisk *disk;
729 struct block_device *whole;
730 int partno, err;
731
732 might_sleep();
733
734 /*
735 * @bdev might not have been initialized properly yet, look up
736 * and grab the outer block device the hard way.
737 */
738 disk = get_gendisk(bdev->bd_dev, &partno);
739 if (!disk)
740 return ERR_PTR(-ENXIO);
741
742 whole = bdget_disk(disk, 0);
743 module_put(disk->fops->owner);
744 put_disk(disk);
745 if (!whole)
746 return ERR_PTR(-ENOMEM);
747
748 /* prepare to claim, if successful, mark claiming in progress */
749 spin_lock(&bdev_lock);
750
751 err = bd_prepare_to_claim(bdev, whole, holder);
752 if (err == 0) {
753 whole->bd_claiming = holder;
754 spin_unlock(&bdev_lock);
755 return whole;
756 } else {
757 spin_unlock(&bdev_lock);
758 bdput(whole);
759 return ERR_PTR(err);
701 } 760 }
761}
762
763/* releases bdev_lock */
764static void __bd_abort_claiming(struct block_device *whole, void *holder)
765{
766 BUG_ON(whole->bd_claiming != holder);
767 whole->bd_claiming = NULL;
768 wake_up_bit(&whole->bd_claiming, 0);
769
702 spin_unlock(&bdev_lock); 770 spin_unlock(&bdev_lock);
703 return res; 771 bdput(whole);
772}
773
774/**
775 * bd_abort_claiming - abort claiming a block device
776 * @whole: whole block device returned by bd_start_claiming()
777 * @holder: holder trying to claim @bdev
778 *
 779 * Abort a block device claiming started by bd_start_claiming(). Note that
780 * @whole is not the block device to be claimed but the whole device
781 * returned by bd_start_claiming().
782 *
783 * CONTEXT:
784 * Grabs and releases bdev_lock.
785 */
786static void bd_abort_claiming(struct block_device *whole, void *holder)
787{
788 spin_lock(&bdev_lock);
789 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
790}
791
792/* increment holders when we have a legitimate claim. requires bdev_lock */
793static void __bd_claim(struct block_device *bdev, struct block_device *whole,
794 void *holder)
795{
796 /* note that for a whole device bd_holders
797 * will be incremented twice, and bd_holder will
798 * be set to bd_claim before being set to holder
799 */
800 whole->bd_holders++;
801 whole->bd_holder = bd_claim;
802 bdev->bd_holders++;
803 bdev->bd_holder = holder;
704} 804}
705 805
806/**
807 * bd_finish_claiming - finish claiming a block device
808 * @bdev: block device of interest (passed to bd_start_claiming())
809 * @whole: whole block device returned by bd_start_claiming()
810 * @holder: holder trying to claim @bdev
811 *
 812 * Finish a block device claiming started by bd_start_claiming().
813 *
814 * CONTEXT:
815 * Grabs and releases bdev_lock.
816 */
817static void bd_finish_claiming(struct block_device *bdev,
818 struct block_device *whole, void *holder)
819{
820 spin_lock(&bdev_lock);
821 BUG_ON(!bd_may_claim(bdev, whole, holder));
822 __bd_claim(bdev, whole, holder);
823 __bd_abort_claiming(whole, holder); /* not actually an abort */
824}
825
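The kernel-doc above prescribes the pairing. For orientation, a hedged
caller-side sketch of the three-step protocol (a hypothetical helper, not
part of this patch; blkdev_open() below follows the same shape):

static int claim_and_open(struct block_device *bdev, fmode_t mode,
			  void *holder)
{
	struct block_device *whole;
	int err;

	/* step 1: may sleep; returns the whole device or ERR_PTR() */
	whole = bd_start_claiming(bdev, holder);
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	/* step 2: the open sequence that may still fail */
	err = blkdev_get(bdev, mode);

	/* step 3: exactly one of finish or abort; neither can fail */
	if (err)
		bd_abort_claiming(whole, holder);
	else
		bd_finish_claiming(bdev, whole, holder);
	return err;
}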
826/**
827 * bd_claim - claim a block device
828 * @bdev: block device to claim
829 * @holder: holder trying to claim @bdev
830 *
831 * Try to claim @bdev which must have been opened successfully.
832 *
833 * CONTEXT:
834 * Might sleep.
835 *
836 * RETURNS:
837 * 0 if successful, -EBUSY if @bdev is already claimed.
838 */
839int bd_claim(struct block_device *bdev, void *holder)
840{
841 struct block_device *whole = bdev->bd_contains;
842 int res;
843
844 might_sleep();
845
846 spin_lock(&bdev_lock);
847 res = bd_prepare_to_claim(bdev, whole, holder);
848 if (res == 0)
849 __bd_claim(bdev, whole, holder);
850 spin_unlock(&bdev_lock);
851
852 return res;
853}
706EXPORT_SYMBOL(bd_claim); 854EXPORT_SYMBOL(bd_claim);
707 855
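For contrast, the legacy one-shot interface pairs bd_claim() with
bd_release(), shown next. Again a hypothetical sketch, not from this patch:

static int claim_open_bdev(struct block_device *bdev, void *holder)
{
	int err;

	/* may sleep; returns -EBUSY if someone else holds the claim */
	err = bd_claim(bdev, holder);
	if (err)
		return err;
	/* ... exclusive use of bdev ... */
	bd_release(bdev);
	return 0;
}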
708void bd_release(struct block_device *bdev) 856void bd_release(struct block_device *bdev)
@@ -1192,19 +1340,20 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1192 /* 1340 /*
1193 * hooks: /n/, see "layering violations". 1341 * hooks: /n/, see "layering violations".
1194 */ 1342 */
1195 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1343 if (!for_part) {
1196 if (ret != 0) { 1344 ret = devcgroup_inode_permission(bdev->bd_inode, perm);
1197 bdput(bdev); 1345 if (ret != 0) {
1198 return ret; 1346 bdput(bdev);
1347 return ret;
1348 }
1199 } 1349 }
1200 1350
1201 lock_kernel();
1202 restart: 1351 restart:
1203 1352
1204 ret = -ENXIO; 1353 ret = -ENXIO;
1205 disk = get_gendisk(bdev->bd_dev, &partno); 1354 disk = get_gendisk(bdev->bd_dev, &partno);
1206 if (!disk) 1355 if (!disk)
1207 goto out_unlock_kernel; 1356 goto out;
1208 1357
1209 mutex_lock_nested(&bdev->bd_mutex, for_part); 1358 mutex_lock_nested(&bdev->bd_mutex, for_part);
1210 if (!bdev->bd_openers) { 1359 if (!bdev->bd_openers) {
@@ -1284,7 +1433,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1284 if (for_part) 1433 if (for_part)
1285 bdev->bd_part_count++; 1434 bdev->bd_part_count++;
1286 mutex_unlock(&bdev->bd_mutex); 1435 mutex_unlock(&bdev->bd_mutex);
1287 unlock_kernel();
1288 return 0; 1436 return 0;
1289 1437
1290 out_clear: 1438 out_clear:
@@ -1297,9 +1445,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1297 bdev->bd_contains = NULL; 1445 bdev->bd_contains = NULL;
1298 out_unlock_bdev: 1446 out_unlock_bdev:
1299 mutex_unlock(&bdev->bd_mutex); 1447 mutex_unlock(&bdev->bd_mutex);
1300 out_unlock_kernel: 1448 out:
1301 unlock_kernel();
1302
1303 if (disk) 1449 if (disk)
1304 module_put(disk->fops->owner); 1450 module_put(disk->fops->owner);
1305 put_disk(disk); 1451 put_disk(disk);
@@ -1316,6 +1462,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1462
1317static int blkdev_open(struct inode * inode, struct file * filp) 1463static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1464{
1465 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1466 struct block_device *bdev;
1320 int res; 1467 int res;
1321 1468
@@ -1338,22 +1485,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1485 if (bdev == NULL)
1339 return -ENOMEM; 1486 return -ENOMEM;
1340 1487
1488 if (filp->f_mode & FMODE_EXCL) {
1489 whole = bd_start_claiming(bdev, filp);
1490 if (IS_ERR(whole)) {
1491 bdput(bdev);
1492 return PTR_ERR(whole);
1493 }
1494 }
1495
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1496 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1497
1343 res = blkdev_get(bdev, filp->f_mode); 1498 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1499
1347 if (filp->f_mode & FMODE_EXCL) { 1500 if (whole) {
1348 res = bd_claim(bdev, filp); 1501 if (res == 0)
1349 if (res) 1502 bd_finish_claiming(bdev, whole, filp);
1350 goto out_blkdev_put; 1503 else
1504 bd_abort_claiming(whole, filp);
1351 } 1505 }
1352 1506
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1507 return res;
1358} 1508}
1359 1509
@@ -1364,7 +1514,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1364 struct block_device *victim = NULL; 1514 struct block_device *victim = NULL;
1365 1515
1366 mutex_lock_nested(&bdev->bd_mutex, for_part); 1516 mutex_lock_nested(&bdev->bd_mutex, for_part);
1367 lock_kernel();
1368 if (for_part) 1517 if (for_part)
1369 bdev->bd_part_count--; 1518 bdev->bd_part_count--;
1370 1519
@@ -1389,7 +1538,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1389 victim = bdev->bd_contains; 1538 victim = bdev->bd_contains;
1390 bdev->bd_contains = NULL; 1539 bdev->bd_contains = NULL;
1391 } 1540 }
1392 unlock_kernel();
1393 mutex_unlock(&bdev->bd_mutex); 1541 mutex_unlock(&bdev->bd_mutex);
1394 bdput(bdev); 1542 bdput(bdev);
1395 if (victim) 1543 if (victim)
@@ -1564,27 +1712,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1712 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1713struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1714{
1567 struct block_device *bdev; 1715 struct block_device *bdev, *whole;
1568 int error = 0; 1716 int error;
1569 1717
1570 bdev = lookup_bdev(path); 1718 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1719 if (IS_ERR(bdev))
1572 return bdev; 1720 return bdev;
1573 1721
1722 whole = bd_start_claiming(bdev, holder);
1723 if (IS_ERR(whole)) {
1724 bdput(bdev);
1725 return whole;
1726 }
1727
1574 error = blkdev_get(bdev, mode); 1728 error = blkdev_get(bdev, mode);
1575 if (error) 1729 if (error)
1576 return ERR_PTR(error); 1730 goto out_abort_claiming;
1731
1577 error = -EACCES; 1732 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1733 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1734 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1735
1736 bd_finish_claiming(bdev, whole, holder);
1584 return bdev; 1737 return bdev;
1585 1738
1586blkdev_put: 1739out_blkdev_put:
1587 blkdev_put(bdev, mode); 1740 blkdev_put(bdev, mode);
1741out_abort_claiming:
1742 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1743 return ERR_PTR(error);
1589} 1744}
1590 1745
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
61 if (size > 0) { 61 if (size > 0) {
62 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl))
64 return acl;
63 set_cached_acl(inode, type, acl); 65 set_cached_acl(inode, type, acl);
64 } 66 }
65 kfree(value); 67 kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
160 int ret; 162 int ret;
161 struct posix_acl *acl = NULL; 163 struct posix_acl *acl = NULL;
162 164
165 if (!is_owner_or_cap(dentry->d_inode))
166 return -EPERM;
167
168 if (!IS_POSIXACL(dentry->d_inode))
169 return -EOPNOTSUPP;
170
163 if (value) { 171 if (value) {
164 acl = posix_acl_from_xattr(value, size); 172 acl = posix_acl_from_xattr(value, size);
165 if (acl == NULL) { 173 if (acl == NULL) {
@@ -282,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
282 return ret; 290 return ret;
283} 291}
284 292
285struct xattr_handler btrfs_xattr_acl_default_handler = { 293const struct xattr_handler btrfs_xattr_acl_default_handler = {
286 .prefix = POSIX_ACL_XATTR_DEFAULT, 294 .prefix = POSIX_ACL_XATTR_DEFAULT,
287 .flags = ACL_TYPE_DEFAULT, 295 .flags = ACL_TYPE_DEFAULT,
288 .get = btrfs_xattr_acl_get, 296 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set, 297 .set = btrfs_xattr_acl_set,
290}; 298};
291 299
292struct xattr_handler btrfs_xattr_acl_access_handler = { 300const struct xattr_handler btrfs_xattr_acl_access_handler = {
293 .prefix = POSIX_ACL_XATTR_ACCESS, 301 .prefix = POSIX_ACL_XATTR_ACCESS,
294 .flags = ACL_TYPE_ACCESS, 302 .flags = ACL_TYPE_ACCESS,
295 .get = btrfs_xattr_acl_get, 303 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
377 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
378 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
379 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
380 goto again; 381 goto again;
381 } 382 }
382 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
281 struct btrfs_root *root, 281 struct btrfs_root *root,
282 struct extent_buffer *buf, 282 struct extent_buffer *buf,
283 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
284{ 285{
285 u64 refs; 286 u64 refs;
286 u64 owner; 287 u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
366 BUG_ON(ret); 367 BUG_ON(ret);
367 } 368 }
368 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
369 } 371 }
370 return 0; 372 return 0;
371} 373}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
392 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
393 struct extent_buffer *cow; 395 struct extent_buffer *cow;
394 int level; 396 int level;
397 int last_ref = 0;
395 int unlock_orig = 0; 398 int unlock_orig = 0;
396 u64 parent_start; 399 u64 parent_start;
397 400
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
442 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
443 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
444 447
445 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
446 452
447 if (buf == root->node) { 453 if (buf == root->node) {
448 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
457 extent_buffer_get(cow); 463 extent_buffer_get(cow);
458 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
459 465
460 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
461 parent_start, root->root_key.objectid, level); 467 last_ref);
462 free_extent_buffer(buf); 468 free_extent_buffer(buf);
463 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
464 } else { 470 } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
473 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
474 trans->transid); 480 trans->transid);
475 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
476 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
477 parent_start, root->root_key.objectid, level); 483 last_ref);
478 } 484 }
479 if (unlock_orig) 485 if (unlock_orig)
480 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
949 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
950} 956}
951 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
952/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
953 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
954 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1019 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1020 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1021 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1022 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1023 1049
1024 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1025 root->node = child; 1051 root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1034 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1035 /* once for the path */ 1061 /* once for the path */
1036 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1037 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1038 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1039 /* once for the root ptr */ 1066 /* once for the root ptr */
1040 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1041 return ret; 1068 return 0;
1042 } 1069 }
1043 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1044 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1088 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1089 ret = wret; 1116 ret = wret;
1090 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1091 u64 bytenr = right->start;
1092 u32 blocksize = right->len;
1093
1094 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1095 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1096 free_extent_buffer(right);
1097 right = NULL;
1098 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1099 1); 1121 1);
1100 if (wret) 1122 if (wret)
1101 ret = wret; 1123 ret = wret;
1102 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1103 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1104 root->root_key.objectid, 1126 free_extent_buffer(right);
1105 level); 1127 right = NULL;
1106 if (wret)
1107 ret = wret;
1108 } else { 1128 } else {
1109 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1110 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1136 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1137 } 1157 }
1138 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1139 /* we've managed to empty the middle node, drop it */
1140 u64 bytenr = mid->start;
1141 u32 blocksize = mid->len;
1142
1143 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1144 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1145 free_extent_buffer(mid);
1146 mid = NULL;
1147 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1148 if (wret) 1162 if (wret)
1149 ret = wret; 1163 ret = wret;
1150 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1151 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1152 if (wret) 1166 free_extent_buffer(mid);
1153 ret = wret; 1167 mid = NULL;
1154 } else { 1168 } else {
1155 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1156 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1590 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1591 1605
1592 ret = -EAGAIN; 1606 ret = -EAGAIN;
1593 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1594 if (tmp) { 1608 if (tmp) {
1595 /* 1609 /*
1596 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
1740 p->nodes[level + 1], 1754 p->nodes[level + 1],
1741 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1742 if (err) { 1756 if (err) {
1743 free_extent_buffer(b);
1744 ret = err; 1757 ret = err;
1745 goto done; 1758 goto done;
1746 } 1759 }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2076 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2077 return PTR_ERR(c); 2090 return PTR_ERR(c);
2078 2091
2092 root_add_used(root, root->nodesize);
2093
2079 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2080 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2081 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2134 int nritems; 2149 int nritems;
2135 2150
2136 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2137 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2138 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2139 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2202 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2203 return PTR_ERR(split); 2219 return PTR_ERR(split);
2204 2220
2221 root_add_used(root, root->nodesize);
2222
2205 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2206 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2207 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2286,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2286 return ret; 2304 return ret;
2287} 2305}
2288 2306
2307/*
 2308 * min_slot controls the lowest index we're willing to push to the
 2309 * right. We'll push items at or above min_slot, but nothing lower
2310 */
2289static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 2311static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2290 struct btrfs_root *root, 2312 struct btrfs_root *root,
2291 struct btrfs_path *path, 2313 struct btrfs_path *path,
2292 int data_size, int empty, 2314 int data_size, int empty,
2293 struct extent_buffer *right, 2315 struct extent_buffer *right,
2294 int free_space, u32 left_nritems) 2316 int free_space, u32 left_nritems,
2317 u32 min_slot)
2295{ 2318{
2296 struct extent_buffer *left = path->nodes[0]; 2319 struct extent_buffer *left = path->nodes[0];
2297 struct extent_buffer *upper = path->nodes[1]; 2320 struct extent_buffer *upper = path->nodes[1];
@@ -2309,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2309 if (empty) 2332 if (empty)
2310 nr = 0; 2333 nr = 0;
2311 else 2334 else
2312 nr = 1; 2335 nr = max_t(u32, 1, min_slot);
2313 2336
2314 if (path->slots[0] >= left_nritems) 2337 if (path->slots[0] >= left_nritems)
2315 push_space += data_size; 2338 push_space += data_size;
@@ -2415,6 +2438,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2415 2438
2416 if (left_nritems) 2439 if (left_nritems)
2417 btrfs_mark_buffer_dirty(left); 2440 btrfs_mark_buffer_dirty(left);
2441 else
2442 clean_tree_block(trans, root, left);
2443
2418 btrfs_mark_buffer_dirty(right); 2444 btrfs_mark_buffer_dirty(right);
2419 2445
2420 btrfs_item_key(right, &disk_key, 0); 2446 btrfs_item_key(right, &disk_key, 0);
@@ -2448,10 +2474,14 @@ out_unlock:
2448 * 2474 *
2449 * returns 1 if the push failed because the other node didn't have enough 2475 * returns 1 if the push failed because the other node didn't have enough
2450 * room, 0 if everything worked out and < 0 if there were major errors. 2476 * room, 0 if everything worked out and < 0 if there were major errors.
2477 *
2478 * this will push starting from min_slot to the end of the leaf. It won't
2479 * push any slot lower than min_slot
2451 */ 2480 */
2452static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2481static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2453 *root, struct btrfs_path *path, int data_size, 2482 *root, struct btrfs_path *path,
2454 int empty) 2483 int min_data_size, int data_size,
2484 int empty, u32 min_slot)
2455{ 2485{
2456 struct extent_buffer *left = path->nodes[0]; 2486 struct extent_buffer *left = path->nodes[0];
2457 struct extent_buffer *right; 2487 struct extent_buffer *right;
@@ -2493,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2493 if (left_nritems == 0) 2523 if (left_nritems == 0)
2494 goto out_unlock; 2524 goto out_unlock;
2495 2525
2496 return __push_leaf_right(trans, root, path, data_size, empty, 2526 return __push_leaf_right(trans, root, path, min_data_size, empty,
2497 right, free_space, left_nritems); 2527 right, free_space, left_nritems, min_slot);
2498out_unlock: 2528out_unlock:
2499 btrfs_tree_unlock(right); 2529 btrfs_tree_unlock(right);
2500 free_extent_buffer(right); 2530 free_extent_buffer(right);
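A hedged illustration of the new min_slot parameter; this mirrors the call
push_for_double_split() makes further down in this patch (data_size is
whatever amount of space the caller needs freed):

	/* push items at or after the current slot to the right neighbour;
	 * a min_data_size of 1 accepts any amount of freed space
	 */
	ret = push_leaf_right(trans, root, path, 1, data_size, 0,
			      path->slots[0]);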
@@ -2504,12 +2534,17 @@ out_unlock:
2504/* 2534/*
2505 * push some data in the path leaf to the left, trying to free up at 2535 * push some data in the path leaf to the left, trying to free up at
2506 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2536 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2537 *
2538 * max_slot can put a limit on how far into the leaf we'll push items. The
 2539 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2540 * items
2507 */ 2541 */
2508static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 2542static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2509 struct btrfs_root *root, 2543 struct btrfs_root *root,
2510 struct btrfs_path *path, int data_size, 2544 struct btrfs_path *path, int data_size,
2511 int empty, struct extent_buffer *left, 2545 int empty, struct extent_buffer *left,
2512 int free_space, int right_nritems) 2546 int free_space, u32 right_nritems,
2547 u32 max_slot)
2513{ 2548{
2514 struct btrfs_disk_key disk_key; 2549 struct btrfs_disk_key disk_key;
2515 struct extent_buffer *right = path->nodes[0]; 2550 struct extent_buffer *right = path->nodes[0];
@@ -2528,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2528 slot = path->slots[1]; 2563 slot = path->slots[1];
2529 2564
2530 if (empty) 2565 if (empty)
2531 nr = right_nritems; 2566 nr = min(right_nritems, max_slot);
2532 else 2567 else
2533 nr = right_nritems - 1; 2568 nr = min(right_nritems - 1, max_slot);
2534 2569
2535 for (i = 0; i < nr; i++) { 2570 for (i = 0; i < nr; i++) {
2536 item = btrfs_item_nr(right, i); 2571 item = btrfs_item_nr(right, i);
@@ -2660,6 +2695,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2660 btrfs_mark_buffer_dirty(left); 2695 btrfs_mark_buffer_dirty(left);
2661 if (right_nritems) 2696 if (right_nritems)
2662 btrfs_mark_buffer_dirty(right); 2697 btrfs_mark_buffer_dirty(right);
2698 else
2699 clean_tree_block(trans, root, right);
2663 2700
2664 btrfs_item_key(right, &disk_key, 0); 2701 btrfs_item_key(right, &disk_key, 0);
2665 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2702 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2706,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2669 /* then fixup the leaf pointer in the path */ 2706 /* then fixup the leaf pointer in the path */
2670 if (path->slots[0] < push_items) { 2707 if (path->slots[0] < push_items) {
2671 path->slots[0] += old_left_nritems; 2708 path->slots[0] += old_left_nritems;
2672 if (btrfs_header_nritems(path->nodes[0]) == 0)
2673 clean_tree_block(trans, root, path->nodes[0]);
2674 btrfs_tree_unlock(path->nodes[0]); 2709 btrfs_tree_unlock(path->nodes[0]);
2675 free_extent_buffer(path->nodes[0]); 2710 free_extent_buffer(path->nodes[0]);
2676 path->nodes[0] = left; 2711 path->nodes[0] = left;
@@ -2691,10 +2726,14 @@ out:
2691/* 2726/*
2692 * push some data in the path leaf to the left, trying to free up at 2727 * push some data in the path leaf to the left, trying to free up at
2693 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2728 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2729 *
2730 * max_slot can put a limit on how far into the leaf we'll push items. The
2731 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2732 * items
2694 */ 2733 */
2695static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2734static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2696 *root, struct btrfs_path *path, int data_size, 2735 *root, struct btrfs_path *path, int min_data_size,
2697 int empty) 2736 int data_size, int empty, u32 max_slot)
2698{ 2737{
2699 struct extent_buffer *right = path->nodes[0]; 2738 struct extent_buffer *right = path->nodes[0];
2700 struct extent_buffer *left; 2739 struct extent_buffer *left;
@@ -2740,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2740 goto out; 2779 goto out;
2741 } 2780 }
2742 2781
2743 return __push_leaf_left(trans, root, path, data_size, 2782 return __push_leaf_left(trans, root, path, min_data_size,
2744 empty, left, free_space, right_nritems); 2783 empty, left, free_space, right_nritems,
2784 max_slot);
2745out: 2785out:
2746 btrfs_tree_unlock(left); 2786 btrfs_tree_unlock(left);
2747 free_extent_buffer(left); 2787 free_extent_buffer(left);
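And the left-push counterpart, where a max_slot of (u32)-1 lifts the slot
limit entirely; this is exactly how btrfs_del_items() invokes it later in
this patch:

	/* consider every item in the leaf: empty == 1 and
	 * max_slot == (u32)-1 put no lower bound on what may move
	 */
	wret = push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1);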
@@ -2834,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2834} 2874}
2835 2875
2836/* 2876/*
2877 * double splits happen when we need to insert a big item in the middle
2878 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2879 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2880 * A B C
2881 *
2882 * We avoid this by trying to push the items on either side of our target
2883 * into the adjacent leaves. If all goes well we can avoid the double split
2884 * completely.
2885 */
2886static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2887 struct btrfs_root *root,
2888 struct btrfs_path *path,
2889 int data_size)
2890{
2891 int ret;
2892 int progress = 0;
2893 int slot;
2894 u32 nritems;
2895
2896 slot = path->slots[0];
2897
2898 /*
2899 * try to push all the items after our slot into the
2900 * right leaf
2901 */
2902 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2903 if (ret < 0)
2904 return ret;
2905
2906 if (ret == 0)
2907 progress++;
2908
2909 nritems = btrfs_header_nritems(path->nodes[0]);
2910 /*
2911 * our goal is to get our slot at the start or end of a leaf. If
 2912 * we've done so, we're done
2913 */
2914 if (path->slots[0] == 0 || path->slots[0] == nritems)
2915 return 0;
2916
2917 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2918 return 0;
2919
 2920 /* try to push all the items before our slot into the left leaf */
2921 slot = path->slots[0];
2922 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2923 if (ret < 0)
2924 return ret;
2925
2926 if (ret == 0)
2927 progress++;
2928
2929 if (progress)
2930 return 0;
2931 return 1;
2932}
2933
2934/*
2837 * split the path's leaf in two, making sure there is at least data_size 2935 * split the path's leaf in two, making sure there is at least data_size
2838 * available for the resulting leaf level of the path. 2936 * available for the resulting leaf level of the path.
2839 * 2937 *
@@ -2855,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2855 int wret; 2953 int wret;
2856 int split; 2954 int split;
2857 int num_doubles = 0; 2955 int num_doubles = 0;
2956 int tried_avoid_double = 0;
2858 2957
2859 l = path->nodes[0]; 2958 l = path->nodes[0];
2860 slot = path->slots[0]; 2959 slot = path->slots[0];
@@ -2863,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2863 return -EOVERFLOW; 2962 return -EOVERFLOW;
2864 2963
2865 /* first try to make some room by pushing left and right */ 2964 /* first try to make some room by pushing left and right */
2866 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2965 if (data_size) {
2867 wret = push_leaf_right(trans, root, path, data_size, 0); 2966 wret = push_leaf_right(trans, root, path, data_size,
2967 data_size, 0, 0);
2868 if (wret < 0) 2968 if (wret < 0)
2869 return wret; 2969 return wret;
2870 if (wret) { 2970 if (wret) {
2871 wret = push_leaf_left(trans, root, path, data_size, 0); 2971 wret = push_leaf_left(trans, root, path, data_size,
2972 data_size, 0, (u32)-1);
2872 if (wret < 0) 2973 if (wret < 0)
2873 return wret; 2974 return wret;
2874 } 2975 }
@@ -2902,6 +3003,8 @@ again:
2902 if (mid != nritems && 3003 if (mid != nritems &&
2903 leaf_space_used(l, mid, nritems - mid) + 3004 leaf_space_used(l, mid, nritems - mid) +
2904 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3005 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3006 if (data_size && !tried_avoid_double)
3007 goto push_for_double;
2905 split = 2; 3008 split = 2;
2906 } 3009 }
2907 } 3010 }
@@ -2918,6 +3021,8 @@ again:
2918 if (mid != nritems && 3021 if (mid != nritems &&
2919 leaf_space_used(l, mid, nritems - mid) + 3022 leaf_space_used(l, mid, nritems - mid) +
2920 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3023 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3024 if (data_size && !tried_avoid_double)
3025 goto push_for_double;
 2921 split = 2; 3026 split = 2;
2922 } 3027 }
2923 } 3028 }
@@ -2932,10 +3037,10 @@ again:
2932 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3037 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2933 root->root_key.objectid, 3038 root->root_key.objectid,
2934 &disk_key, 0, l->start, 0); 3039 &disk_key, 0, l->start, 0);
2935 if (IS_ERR(right)) { 3040 if (IS_ERR(right))
2936 BUG_ON(1);
2937 return PTR_ERR(right); 3041 return PTR_ERR(right);
2938 } 3042
3043 root_add_used(root, root->leafsize);
2939 3044
2940 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 3045 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2941 btrfs_set_header_bytenr(right, right->start); 3046 btrfs_set_header_bytenr(right, right->start);
@@ -2998,6 +3103,13 @@ again:
2998 } 3103 }
2999 3104
3000 return ret; 3105 return ret;
3106
3107push_for_double:
3108 push_for_double_split(trans, root, path, data_size);
3109 tried_avoid_double = 1;
3110 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3111 return 0;
3112 goto again;
3001} 3113}
3002 3114
3003static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3115static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3054,7 +3166,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3054 3166
3055 btrfs_set_path_blocking(path); 3167 btrfs_set_path_blocking(path);
3056 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3168 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3057 BUG_ON(ret); 3169 if (ret)
3170 goto err;
3058 3171
3059 path->keep_locks = 0; 3172 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3173 btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3909,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3796 */ 3909 */
3797 btrfs_unlock_up_safe(path, 0); 3910 btrfs_unlock_up_safe(path, 0);
3798 3911
3799 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3912 root_sub_used(root, leaf->len);
3800 0, root->root_key.objectid, 0); 3913
3801 return ret; 3914 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3915 return 0;
3802} 3916}
3803/* 3917/*
3804 * delete the item at the leaf level in path. If that empties 3918 * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3979,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3865 if (leaf == root->node) { 3979 if (leaf == root->node) {
3866 btrfs_set_header_level(leaf, 0); 3980 btrfs_set_header_level(leaf, 0);
3867 } else { 3981 } else {
3982 btrfs_set_path_blocking(path);
3983 clean_tree_block(trans, root, leaf);
3868 ret = btrfs_del_leaf(trans, root, path, leaf); 3984 ret = btrfs_del_leaf(trans, root, path, leaf);
3869 BUG_ON(ret); 3985 BUG_ON(ret);
3870 } 3986 }
@@ -3890,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3890 extent_buffer_get(leaf); 4006 extent_buffer_get(leaf);
3891 4007
3892 btrfs_set_path_blocking(path); 4008 btrfs_set_path_blocking(path);
3893 wret = push_leaf_left(trans, root, path, 1, 1); 4009 wret = push_leaf_left(trans, root, path, 1, 1,
4010 1, (u32)-1);
3894 if (wret < 0 && wret != -ENOSPC) 4011 if (wret < 0 && wret != -ENOSPC)
3895 ret = wret; 4012 ret = wret;
3896 4013
3897 if (path->nodes[0] == leaf && 4014 if (path->nodes[0] == leaf &&
3898 btrfs_header_nritems(leaf)) { 4015 btrfs_header_nritems(leaf)) {
3899 wret = push_leaf_right(trans, root, path, 1, 1); 4016 wret = push_leaf_right(trans, root, path, 1,
4017 1, 1, 0);
3900 if (wret < 0 && wret != -ENOSPC) 4018 if (wret < 0 && wret != -ENOSPC)
3901 ret = wret; 4019 ret = wret;
3902 } 4020 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..eaf286abad17 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34 34
35struct btrfs_trans_handle; 35struct btrfs_trans_handle;
36struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
37extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
663#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
664#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
665#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
666 668
667struct btrfs_block_group_item { 669struct btrfs_block_group_item {
668 __le64 used; 670 __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
674 u64 flags; 676 u64 flags;
675 677
676 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
677 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
 680 this doesn't take mirrors into account */
678 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
679 transaction finishes */ 682 transaction finishes */
680 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
681 current allocations */ 684 current allocations */
682 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
683 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
684 u64 bytes_root; /* the number of bytes needed to commit a
685 transaction */
686 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
687 delalloc/allocations */ 688 delalloc/allocations */
688 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
689 delayed allocation */
690 690
691 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 692 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
694 this space */ 694 this space */
695 int force_delalloc; /* make people start doing filemap_flush until
696 we're under a threshold */
697 695
698 struct list_head list; 696 struct list_head list;
699 697
700 /* for controlling how we free up space for allocations */
701 wait_queue_head_t allocate_wait;
702 wait_queue_head_t flush_wait;
703 int allocating_chunk;
704 int flushing;
705
706 /* for block groups in our same type */ 698 /* for block groups in our same type */
707 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
708 spinlock_t lock; 700 spinlock_t lock;
709 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
710 atomic_t caching_threads; 702 atomic_t caching_threads;
711}; 703};
712 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
713/* 719/*
714 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
715 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
760 spinlock_t lock; 766 spinlock_t lock;
761 u64 pinned; 767 u64 pinned;
762 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
763 u64 bytes_super; 770 u64 bytes_super;
764 u64 flags; 771 u64 flags;
765 u64 sectorsize; 772 u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
825 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
826 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
827 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
828 u64 generation; 851 u64 generation;
829 u64 last_trans_committed; 852 u64 last_trans_committed;
830 853
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
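The prototypes above outline a reservation lifecycle: allocate, fill,
consume or release, free. A hedged sketch under those signatures (the
function name is illustrative; the real users live in extent-tree.c and
transaction.c elsewhere in this patch, not shown in this excerpt):

static int reserve_then_release(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 num_bytes)
{
	struct btrfs_block_rsv *rsv;
	int retries = 0;
	int ret;

	rsv = btrfs_alloc_block_rsv(root);
	if (!rsv)
		return -ENOMEM;

	ret = btrfs_block_rsv_add(trans, root, rsv, num_bytes, &retries);
	if (!ret)
		/* reserved; a real caller would hand rsv to its consumer */
		btrfs_block_rsv_release(root, rsv, num_bytes);

	btrfs_free_block_rsv(root, rsv);
	return ret;
}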
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2326,13 +2389,13 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
2326 pgoff_t offset, pgoff_t last_index); 2389 pgoff_t offset, pgoff_t last_index);
2327int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2390int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2328int btrfs_readpage(struct file *file, struct page *page); 2391int btrfs_readpage(struct file *file, struct page *page);
2329void btrfs_delete_inode(struct inode *inode); 2392void btrfs_evict_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+int btrfs_drop_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending,
+                               u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                              u64 start, u64 num_bytes, u64 min_size,
+                              loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct extent_buffer *buf,
+                           struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+                              struct btrfs_pending_snapshot *pending,
+                              u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending);
 #endif
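The btrfs_sync_file() prototype change above tracks the mainline VFS change (circa 2.6.35) that dropped the dentry argument from ->fsync(); the inode is now reached through the struct file alone. A minimal sketch of a handler under the new signature — the demo_* helpers are illustrative, not btrfs code:

    /* sketch: ->fsync() after the dentry parameter was removed */
    static int demo_fsync(struct file *file, int datasync)
    {
        struct inode *inode = file->f_mapping->host;

        /* for a data-only sync, metadata that doesn't affect data
         * retrieval may be skipped (hypothetical helpers below) */
        if (datasync && !demo_inode_has_dirty_metadata(inode))
            return 0;
        return demo_sync_inode_metadata(inode);
    }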
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags)
-{
-    struct btrfs_delayed_ref_node *ref;
-    struct btrfs_delayed_ref_head *head;
-    struct btrfs_delayed_ref_root *delayed_refs;
-    struct btrfs_path *path;
-    struct btrfs_extent_item *ei;
-    struct extent_buffer *leaf;
-    struct btrfs_key key;
-    u32 item_size;
-    u64 num_refs;
-    u64 extent_flags;
-    int ret;
-
-    path = btrfs_alloc_path();
-    if (!path)
-        return -ENOMEM;
-
-    key.objectid = bytenr;
-    key.type = BTRFS_EXTENT_ITEM_KEY;
-    key.offset = num_bytes;
-    delayed_refs = &trans->transaction->delayed_refs;
-again:
-    ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-                            &key, path, 0, 0);
-    if (ret < 0)
-        goto out;
-
-    if (ret == 0) {
-        leaf = path->nodes[0];
-        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-        if (item_size >= sizeof(*ei)) {
-            ei = btrfs_item_ptr(leaf, path->slots[0],
-                                struct btrfs_extent_item);
-            num_refs = btrfs_extent_refs(leaf, ei);
-            extent_flags = btrfs_extent_flags(leaf, ei);
-        } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-            struct btrfs_extent_item_v0 *ei0;
-            BUG_ON(item_size != sizeof(*ei0));
-            ei0 = btrfs_item_ptr(leaf, path->slots[0],
-                                 struct btrfs_extent_item_v0);
-            num_refs = btrfs_extent_refs_v0(leaf, ei0);
-            /* FIXME: this isn't correct for data */
-            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-            BUG();
-#endif
-        }
-        BUG_ON(num_refs == 0);
-    } else {
-        num_refs = 0;
-        extent_flags = 0;
-        ret = 0;
-    }
-
-    spin_lock(&delayed_refs->lock);
-    ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-    if (ref) {
-        head = btrfs_delayed_node_to_head(ref);
-        if (!mutex_trylock(&head->mutex)) {
-            atomic_inc(&ref->refs);
-            spin_unlock(&delayed_refs->lock);
-
-            btrfs_release_path(root->fs_info->extent_root, path);
-
-            mutex_lock(&head->mutex);
-            mutex_unlock(&head->mutex);
-            btrfs_put_delayed_ref(ref);
-            goto again;
-        }
-        if (head->extent_op && head->extent_op->update_flags)
-            extent_flags |= head->extent_op->flags_to_set;
-        else
-            BUG_ON(num_refs == 0);
-
-        num_refs += ref->ref_mod;
-        mutex_unlock(&head->mutex);
-    }
-    WARN_ON(num_refs == 0);
-    if (refs)
-        *refs = num_refs;
-    if (flags)
-        *flags = extent_flags;
-out:
-    spin_unlock(&delayed_refs->lock);
-    btrfs_free_path(path);
-    return ret;
-}
-
-/*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
  * bytenr and parent
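The mutex_lock()/mutex_unlock() pair in the removed lookup helper (and in its replacement added to extent-tree.c below) is a wait, not a critical section: when mutex_trylock() fails, the caller pins the ref head, drops the spinlock, blocks until the current holder finishes, and retries the whole lookup. A minimal sketch of the pattern, assuming only a struct mutex:

    #include <linux/mutex.h>

    /* block until the current holder of 'm' is done, without keeping it */
    static void demo_wait_for_holder(struct mutex *m)
    {
        mutex_lock(m);    /* sleeps until the holder releases the mutex */
        mutex_unlock(m);  /* release at once: we only needed to wait */
    }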
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                              u64 bytenr, u64 num_bytes, u64 orig_parent,
                              u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..64f10082f048 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
     int rw;
     int mirror_num;
     unsigned long bio_flags;
+    /*
+     * bio_offset is optional, can be used if the pages in the bio
+     * can't tell us where in the file the bio should go
+     */
+    u64 bio_offset;
     struct btrfs_work work;
 };
 
@@ -475,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
     end_io_wq->work.func = end_workqueue_fn;
     end_io_wq->work.flags = 0;
 
-    if (bio->bi_rw & (1 << BIO_RW)) {
+    if (bio->bi_rw & REQ_WRITE) {
         if (end_io_wq->metadata)
             btrfs_queue_worker(&fs_info->endio_meta_write_workers,
                                &end_io_wq->work);
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
     async = container_of(work, struct async_submit_bio, work);
     fs_info = BTRFS_I(async->inode)->root->fs_info;
     async->submit_bio_start(async->inode, async->rw, async->bio,
-                            async->mirror_num, async->bio_flags);
+                            async->mirror_num, async->bio_flags,
+                            async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
     wake_up(&fs_info->async_submit_wait);
 
     async->submit_bio_done(async->inode, async->rw, async->bio,
-                           async->mirror_num, async->bio_flags);
+                           async->mirror_num, async->bio_flags,
+                           async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
                         unsigned long bio_flags,
+                        u64 bio_offset,
                         extent_submit_bio_hook_t *submit_bio_start,
                         extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -592,10 +600,11 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
     async->work.flags = 0;
     async->bio_flags = bio_flags;
+    async->bio_offset = bio_offset;
 
     atomic_inc(&fs_info->nr_async_submits);
 
-    if (rw & (1 << BIO_RW_SYNCIO))
+    if (rw & REQ_SYNC)
         btrfs_set_work_high_prio(&async->work);
 
     btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
                                     struct bio *bio, int mirror_num,
-                                    unsigned long bio_flags)
+                                    unsigned long bio_flags,
+                                    u64 bio_offset)
 {
     /*
      * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                                   int mirror_num, unsigned long bio_flags)
+                                   int mirror_num, unsigned long bio_flags,
+                                   u64 bio_offset)
 {
     /*
      * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                 int mirror_num, unsigned long bio_flags)
+                                 int mirror_num, unsigned long bio_flags,
+                                 u64 bio_offset)
 {
     int ret;
 
@@ -656,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                               bio, 1);
     BUG_ON(ret);
 
-    if (!(rw & (1 << BIO_RW))) {
+    if (!(rw & REQ_WRITE)) {
         /*
          * called for a read, do the setup so that checksum validation
          * can happen in the async kernel threads
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
      */
     return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                inode, rw, bio, mirror_num, 0,
+                               bio_offset,
                                __btree_submit_bio_start,
                                __btree_submit_bio_done);
 }
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->ref_cows = 0;
     root->track_dirty = 0;
     root->in_radix = 0;
-    root->clean_orphans = 0;
+    root->orphan_item_inserted = 0;
+    root->orphan_cleanup_state = 0;
 
     root->fs_info = fs_info;
     root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     root->name = NULL;
     root->in_sysfs = 0;
     root->inode_tree = RB_ROOT;
+    root->block_rsv = NULL;
+    root->orphan_block_rsv = NULL;
 
     INIT_LIST_HEAD(&root->dirty_list);
     INIT_LIST_HEAD(&root->orphan_list);
     INIT_LIST_HEAD(&root->root_list);
     spin_lock_init(&root->node_lock);
-    spin_lock_init(&root->list_lock);
+    spin_lock_init(&root->orphan_lock);
     spin_lock_init(&root->inode_lock);
+    spin_lock_init(&root->accounting_lock);
     mutex_init(&root->objectid_mutex);
     mutex_init(&root->log_mutex);
     init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
     return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info)
-{
-    struct extent_buffer *eb;
-    struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-    u64 start = 0;
-    u64 end = 0;
-    int ret;
-
-    if (!log_root_tree)
-        return 0;
-
-    while (1) {
-        ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-        if (ret)
-            break;
-
-        clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-                          EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-    }
-    eb = fs_info->log_root_tree->node;
-
-    WARN_ON(btrfs_header_level(eb) != 0);
-    WARN_ON(btrfs_header_nritems(eb) != 0);
-
-    ret = btrfs_free_reserved_extent(fs_info->tree_root,
-                                     eb->start, eb->len);
-    BUG_ON(ret);
-
-    free_extent_buffer(eb);
-    kfree(fs_info->log_root_tree);
-    fs_info->log_root_tree = NULL;
-    return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info)
 {
@@ -1191,19 +1172,23 @@ again:
     if (root)
         return root;
 
-    ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-    if (ret == 0)
-        ret = -ENOENT;
-    if (ret < 0)
-        return ERR_PTR(ret);
-
     root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
     if (IS_ERR(root))
         return root;
 
-    WARN_ON(btrfs_root_refs(&root->root_item) == 0);
     set_anon_super(&root->anon_super, NULL);
 
+    if (btrfs_root_refs(&root->root_item) == 0) {
+        ret = -ENOENT;
+        goto fail;
+    }
+
+    ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+    if (ret < 0)
+        goto fail;
+    if (ret == 0)
+        root->orphan_item_inserted = 1;
+
     ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
     if (ret)
         goto fail;
@@ -1212,10 +1197,9 @@ again:
     ret = radix_tree_insert(&fs_info->fs_roots_radix,
                             (unsigned long)root->root_key.objectid,
                             root);
-    if (ret == 0) {
+    if (ret == 0)
         root->in_radix = 1;
-        root->clean_orphans = 1;
-    }
+
     spin_unlock(&fs_info->fs_roots_radix_lock);
     radix_tree_preload_end();
     if (ret) {
@@ -1443,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
      * ram and up to date before trying to verify things. For
      * blocksize <= pagesize, it is basically a noop
      */
-    if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+    if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
         !bio_ready_for_csum(bio)) {
         btrfs_queue_worker(&fs_info->endio_meta_workers,
                            &end_io_wq->work);
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
     struct btrfs_root *root = arg;
 
     do {
-        smp_mb();
-        if (root->fs_info->closing)
-            break;
-
         vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
         if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
         if (freezing(current)) {
             refrigerator();
         } else {
-            smp_mb();
-            if (root->fs_info->closing)
-                break;
             set_current_state(TASK_INTERRUPTIBLE);
-            schedule();
+            if (!kthread_should_stop())
+                schedule();
             __set_current_state(TASK_RUNNING);
         }
     } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
     struct btrfs_root *root = arg;
     struct btrfs_trans_handle *trans;
     struct btrfs_transaction *cur;
+    u64 transid;
     unsigned long now;
     unsigned long delay;
     int ret;
 
     do {
-        smp_mb();
-        if (root->fs_info->closing)
-            break;
-
         delay = HZ * 30;
         vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
         mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->new_trans_lock);
         cur = root->fs_info->running_transaction;
         if (!cur) {
-            mutex_unlock(&root->fs_info->trans_mutex);
+            spin_unlock(&root->fs_info->new_trans_lock);
             goto sleep;
         }
 
         now = get_seconds();
-        if (now < cur->start_time || now - cur->start_time < 30) {
-            mutex_unlock(&root->fs_info->trans_mutex);
+        if (!cur->blocked &&
+            (now < cur->start_time || now - cur->start_time < 30)) {
+            spin_unlock(&root->fs_info->new_trans_lock);
             delay = HZ * 5;
             goto sleep;
         }
-        mutex_unlock(&root->fs_info->trans_mutex);
-        trans = btrfs_start_transaction(root, 1);
-        ret = btrfs_commit_transaction(trans, root);
+        transid = cur->transid;
+        spin_unlock(&root->fs_info->new_trans_lock);
 
+        trans = btrfs_join_transaction(root, 1);
+        if (transid == trans->transid) {
+            ret = btrfs_commit_transaction(trans, root);
+            BUG_ON(ret);
+        } else {
+            btrfs_end_transaction(trans, root);
+        }
 sleep:
         wake_up_process(root->fs_info->cleaner_kthread);
         mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
         if (freezing(current)) {
             refrigerator();
         } else {
-            if (root->fs_info->closing)
-                break;
             set_current_state(TASK_INTERRUPTIBLE);
-            schedule_timeout(delay);
+            if (!kthread_should_stop() &&
+                !btrfs_transaction_blocked(root->fs_info))
+                schedule_timeout(delay);
             __set_current_state(TASK_RUNNING);
         }
     } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
     INIT_LIST_HEAD(&fs_info->space_info);
     btrfs_mapping_init(&fs_info->mapping_tree);
+    btrfs_init_block_rsv(&fs_info->global_block_rsv);
+    btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+    btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+    btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+    btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+    INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+    mutex_init(&fs_info->durable_block_rsv_mutex);
     atomic_set(&fs_info->nr_async_submits, 0);
     atomic_set(&fs_info->async_delalloc_pages, 0);
     atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                        min_t(u64, fs_devices->num_devices,
                        fs_info->thread_pool_size),
                        &fs_info->generic_worker);
-    btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-                       fs_info->thread_pool_size,
-                       &fs_info->generic_worker);
 
     /* a higher idle thresh on the submit workers makes it much more
      * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     btrfs_start_workers(&fs_info->endio_meta_workers, 1);
     btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
     btrfs_start_workers(&fs_info->endio_write_workers, 1);
-    btrfs_start_workers(&fs_info->enospc_workers, 1);
 
     fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
     fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
     csum_root->track_dirty = 1;
 
+    fs_info->generation = generation;
+    fs_info->last_trans_committed = generation;
+    fs_info->data_alloc_profile = (u64)-1;
+    fs_info->metadata_alloc_profile = (u64)-1;
+    fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
     ret = btrfs_read_block_groups(extent_root);
     if (ret) {
         printk(KERN_ERR "Failed to read block groups: %d\n", ret);
         goto fail_block_groups;
     }
 
-    fs_info->generation = generation;
-    fs_info->last_trans_committed = generation;
-    fs_info->data_alloc_profile = (u64)-1;
-    fs_info->metadata_alloc_profile = (u64)-1;
-    fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
     fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                            "btrfs-cleaner");
     if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                 btrfs_level_size(tree_root,
                                  btrfs_super_log_root_level(disk_super));
 
-        log_tree_root = kzalloc(sizeof(struct btrfs_root),
-                                GFP_NOFS);
+        log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+        if (!log_tree_root) {
+            err = -ENOMEM;
+            goto fail_trans_kthread;
+        }
 
         __setup_root(nodesize, leafsize, sectorsize, stripesize,
                      log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     BUG_ON(ret);
 
     if (!(sb->s_flags & MS_RDONLY)) {
+        ret = btrfs_cleanup_fs_roots(fs_info);
+        BUG_ON(ret);
+
         ret = btrfs_recover_relocation(tree_root);
         if (ret < 0) {
             printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
     if (!fs_info->fs_root)
         goto fail_trans_kthread;
+    if (IS_ERR(fs_info->fs_root)) {
+        err = PTR_ERR(fs_info->fs_root);
+        goto fail_trans_kthread;
+    }
 
     if (!(sb->s_flags & MS_RDONLY)) {
         down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
-    btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
     iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
     down_write(&root->fs_info->cleanup_work_sem);
     up_write(&root->fs_info->cleanup_work_sem);
 
-    trans = btrfs_start_transaction(root, 1);
+    trans = btrfs_join_transaction(root, 1);
     ret = btrfs_commit_transaction(trans, root);
     BUG_ON(ret);
     /* run commit again to drop the original snapshot */
-    trans = btrfs_start_transaction(root, 1);
+    trans = btrfs_join_transaction(root, 1);
     btrfs_commit_transaction(trans, root);
     ret = btrfs_write_and_wait_transaction(NULL, root);
     BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
     fs_info->closing = 1;
     smp_mb();
 
-    kthread_stop(root->fs_info->transaction_kthread);
-    kthread_stop(root->fs_info->cleaner_kthread);
-
     if (!(fs_info->sb->s_flags & MS_RDONLY)) {
         ret = btrfs_commit_super(root);
         if (ret)
             printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
     }
 
+    kthread_stop(root->fs_info->transaction_kthread);
+    kthread_stop(root->fs_info->cleaner_kthread);
+
     fs_info->closing = 2;
     smp_mb();
 
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
-    btrfs_stop_workers(&fs_info->enospc_workers);
 
     btrfs_close_devices(fs_info->fs_devices);
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
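Both kthread loops above now re-check kthread_should_stop() after setting TASK_INTERRUPTIBLE and before sleeping, instead of polling fs_info->closing. The ordering matters: kthread_stop() sets the stop flag and then wakes the thread, so checking the flag only after publishing the sleeping state closes the window in which a stop request could be missed and the thread would sleep indefinitely. A condensed sketch of the idiom:

    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int demo_kthread(void *arg)
    {
        do {
            /* ... periodic work ... */
            set_current_state(TASK_INTERRUPTIBLE);
            if (!kthread_should_stop())  /* re-check before sleeping */
                schedule();
            __set_current_state(TASK_RUNNING);
        } while (!kthread_should_stop());
        return 0;
    }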
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                         int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
-                        unsigned long bio_flags,
+                        unsigned long bio_flags, u64 bio_offset,
                         extent_submit_bio_hook_t *submit_bio_start,
                         extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
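With bio_offset threaded through btrfs_wq_submit_bio(), every extent_submit_bio_hook_t implementation gains the extra u64 parameter (compare the __btree_submit_bio_* changes in disk-io.c above). A hook under the widened signature would look roughly like this; the body is illustrative only:

    static int demo_submit_bio_start(struct inode *inode, int rw,
                                     struct bio *bio, int mirror_num,
                                     unsigned long bio_flags, u64 bio_offset)
    {
        /* bio_offset supplies the file position when the bio's pages
         * cannot, per the comment added to struct async_submit_bio */
        return 0;
    }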
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, int alloc,
-                              int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-                                   u64 num_bytes, int reserve);
+                              u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                           struct btrfs_root *extent_root, u64 alloc_bytes,
                           u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
-                          struct btrfs_path *path,
-                          u64 bytenr, u64 num_bytes,
-                          int is_data, int reserved,
-                          struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
                          struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-    if (atomic_dec_and_test(&cache->count))
+    if (atomic_dec_and_test(&cache->count)) {
+        WARN_ON(cache->pinned > 0);
+        WARN_ON(cache->reserved > 0);
+        WARN_ON(cache->reserved_pinned > 0);
         kfree(cache);
+    }
 }
 
 /*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
 
     exclude_super_stripes(extent_root, block_group);
     spin_lock(&block_group->space_info->lock);
-    block_group->space_info->bytes_super += block_group->bytes_super;
+    block_group->space_info->bytes_readonly += block_group->bytes_super;
     spin_unlock(&block_group->space_info->lock);
 
     last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
     struct list_head *head = &info->space_info;
     struct btrfs_space_info *found;
 
+    flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+             BTRFS_BLOCK_GROUP_METADATA;
+
     rcu_read_lock();
     list_for_each_entry_rcu(found, head, list) {
         if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, u64 *refs, u64 *flags)
+{
+    struct btrfs_delayed_ref_head *head;
+    struct btrfs_delayed_ref_root *delayed_refs;
+    struct btrfs_path *path;
+    struct btrfs_extent_item *ei;
+    struct extent_buffer *leaf;
+    struct btrfs_key key;
+    u32 item_size;
+    u64 num_refs;
+    u64 extent_flags;
+    int ret;
+
+    path = btrfs_alloc_path();
+    if (!path)
+        return -ENOMEM;
+
+    key.objectid = bytenr;
+    key.type = BTRFS_EXTENT_ITEM_KEY;
+    key.offset = num_bytes;
+    if (!trans) {
+        path->skip_locking = 1;
+        path->search_commit_root = 1;
+    }
+again:
+    ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                            &key, path, 0, 0);
+    if (ret < 0)
+        goto out_free;
+
+    if (ret == 0) {
+        leaf = path->nodes[0];
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (item_size >= sizeof(*ei)) {
+            ei = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_extent_item);
+            num_refs = btrfs_extent_refs(leaf, ei);
+            extent_flags = btrfs_extent_flags(leaf, ei);
+        } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+            struct btrfs_extent_item_v0 *ei0;
+            BUG_ON(item_size != sizeof(*ei0));
+            ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                                 struct btrfs_extent_item_v0);
+            num_refs = btrfs_extent_refs_v0(leaf, ei0);
+            /* FIXME: this isn't correct for data */
+            extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+            BUG();
+#endif
+        }
+        BUG_ON(num_refs == 0);
+    } else {
+        num_refs = 0;
+        extent_flags = 0;
+        ret = 0;
+    }
+
+    if (!trans)
+        goto out;
+
+    delayed_refs = &trans->transaction->delayed_refs;
+    spin_lock(&delayed_refs->lock);
+    head = btrfs_find_delayed_ref_head(trans, bytenr);
+    if (head) {
+        if (!mutex_trylock(&head->mutex)) {
+            atomic_inc(&head->node.refs);
+            spin_unlock(&delayed_refs->lock);
+
+            btrfs_release_path(root->fs_info->extent_root, path);
+
+            mutex_lock(&head->mutex);
+            mutex_unlock(&head->mutex);
+            btrfs_put_delayed_ref(&head->node);
+            goto again;
+        }
+        if (head->extent_op && head->extent_op->update_flags)
+            extent_flags |= head->extent_op->flags_to_set;
+        else
+            BUG_ON(num_refs == 0);
+
+        num_refs += head->node.ref_mod;
+        mutex_unlock(&head->mutex);
+    }
+    spin_unlock(&delayed_refs->lock);
+out:
+    WARN_ON(num_refs == 0);
+    if (refs)
+        *refs = num_refs;
+    if (flags)
+        *flags = extent_flags;
+out_free:
+    btrfs_free_path(path);
+    return ret;
+}
+
+/*
  * Back reference rules. Back refs have three main goals:
  *
  * 1) differentiate between all holders of references to an extent so that
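A consequence of moving btrfs_lookup_extent_info() here and making the transaction handle optional: with a NULL handle the helper searches the commit root unlocked and skips the delayed-ref merge entirely, so read-only callers no longer need a transaction. A sketch of such a call (the demo wrapper is illustrative):

    static int demo_lookup(struct btrfs_root *root, u64 bytenr, u64 num_bytes)
    {
        u64 refs, flags;

        /* trans == NULL: unlocked lookup against the commit root */
        return btrfs_lookup_extent_info(NULL, root, bytenr, num_bytes,
                                        &refs, &flags);
    }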
@@ -1589,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
                                 u64 start, u64 len)
 {
     blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-                         DISCARD_FL_BARRIER);
+                         BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
     return ret;
 }
 
-
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
         BUG_ON(extent_op);
         head = btrfs_delayed_node_to_head(node);
         if (insert_reserved) {
-            int mark_free = 0;
-            struct extent_buffer *must_clean = NULL;
-
-            ret = pin_down_bytes(trans, root, NULL,
-                                 node->bytenr, node->num_bytes,
-                                 head->is_data, 1, &must_clean);
-            if (ret > 0)
-                mark_free = 1;
-
-            if (must_clean) {
-                clean_tree_block(NULL, root, must_clean);
-                btrfs_tree_unlock(must_clean);
-                free_extent_buffer(must_clean);
-            }
+            btrfs_pin_extent(root, node->bytenr,
+                             node->num_bytes, 1);
             if (head->is_data) {
                 ret = btrfs_del_csums(trans, root,
                                       node->bytenr,
                                       node->num_bytes);
                 BUG_ON(ret);
             }
-            if (mark_free) {
-                ret = btrfs_free_reserved_extent(root,
-                                                 node->bytenr,
-                                                 node->num_bytes);
-                BUG_ON(ret);
-            }
         }
         mutex_unlock(&head->mutex);
         return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
     ret = 0;
 out:
     btrfs_free_path(path);
+    if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+        WARN_ON(ret > 0);
     return ret;
 }
 
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                              struct btrfs_space_info **space_info)
 {
     struct btrfs_space_info *found;
+    int i;
+    int factor;
+
+    if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                 BTRFS_BLOCK_GROUP_RAID10))
+        factor = 2;
+    else
+        factor = 1;
 
     found = __find_space_info(info, flags);
     if (found) {
         spin_lock(&found->lock);
         found->total_bytes += total_bytes;
         found->bytes_used += bytes_used;
+        found->disk_used += bytes_used * factor;
         found->full = 0;
         spin_unlock(&found->lock);
         *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
     if (!found)
         return -ENOMEM;
 
-    INIT_LIST_HEAD(&found->block_groups);
+    for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+        INIT_LIST_HEAD(&found->block_groups[i]);
     init_rwsem(&found->groups_sem);
-    init_waitqueue_head(&found->flush_wait);
-    init_waitqueue_head(&found->allocate_wait);
     spin_lock_init(&found->lock);
-    found->flags = flags;
+    found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+                            BTRFS_BLOCK_GROUP_SYSTEM |
+                            BTRFS_BLOCK_GROUP_METADATA);
     found->total_bytes = total_bytes;
     found->bytes_used = bytes_used;
+    found->disk_used = bytes_used * factor;
     found->bytes_pinned = 0;
     found->bytes_reserved = 0;
     found->bytes_readonly = 0;
-    found->bytes_delalloc = 0;
+    found->bytes_may_use = 0;
     found->full = 0;
     found->force_alloc = 0;
     *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
     }
 }
 
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-    spin_lock(&cache->space_info->lock);
-    spin_lock(&cache->lock);
-    if (!cache->ro) {
-        cache->space_info->bytes_readonly += cache->key.offset -
-                                btrfs_block_group_used(&cache->item);
-        cache->ro = 1;
-    }
-    spin_unlock(&cache->lock);
-    spin_unlock(&cache->space_info->lock);
-}
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
     u64 num_devices = root->fs_info->fs_devices->rw_devices;
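The new disk_used counter scales logical usage by a redundancy factor, since DUP/RAID1/RAID10 block groups consume two raw bytes per logical byte. The factor selection mirrors the hunk above; as a standalone sketch:

    static int demo_raid_factor(u64 flags)
    {
        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID10))
            return 2;  /* two copies on disk per logical byte */
        return 1;
    }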
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
     return flags;
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    u64 alloc_profile;
-
-    if (data) {
-        alloc_profile = info->avail_data_alloc_bits &
-                        info->data_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-    } else if (root == root->fs_info->chunk_root) {
-        alloc_profile = info->avail_system_alloc_bits &
-                        info->system_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-    } else {
-        alloc_profile = info->avail_metadata_alloc_bits &
-                        info->metadata_alloc_profile;
-        data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-    }
-
-    return btrfs_reduce_alloc_profile(root, data);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-    u64 alloc_target;
-
-    alloc_target = btrfs_get_alloc_profile(root, 1);
-    BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                   alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-    u64 num_bytes;
-    int level;
-
-    level = BTRFS_MAX_LEVEL - 2;
-    /*
-     * NOTE: these calculations are absolutely the worst possible case.
-     * This assumes that _every_ item we insert will require a new leaf, and
-     * that the tree has grown to its maximum level size.
-     */
-
-    /*
-     * for every item we insert we could insert both an extent item and a
-     * extent ref item. Then for ever item we insert, we will need to cow
-     * both the original leaf, plus the leaf to the left and right of it.
-     *
-     * Unless we are talking about the extent root, then we just want the
-     * number of items * 2, since we just need the extent item plus its ref.
-     */
-    if (root == root->fs_info->extent_root)
-        num_bytes = num_items * 2;
-    else
-        num_bytes = (num_items + (2 * num_items)) * 3;
-
-    /*
-     * num_bytes is total number of leaves we could need times the leaf
-     * size, and then for every leaf we could end up cow'ing 2 nodes per
-     * level, down to the leaf level.
-     */
-    num_bytes = (num_bytes * root->leafsize) +
-        (num_bytes * (level * 2)) * root->nodesize;
-
-    return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-                                          struct inode *inode, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 alloc_target;
-    bool bug = false;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                       num_items);
-
-    spin_lock(&meta_sinfo->lock);
-    spin_lock(&BTRFS_I(inode)->accounting_lock);
-    if (BTRFS_I(inode)->reserved_extents <=
-        BTRFS_I(inode)->outstanding_extents) {
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-        spin_unlock(&meta_sinfo->lock);
-        return 0;
-    }
-    spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
-    BTRFS_I(inode)->reserved_extents -= num_items;
-    BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
-    if (meta_sinfo->bytes_delalloc < num_bytes) {
-        bug = true;
-        meta_sinfo->bytes_delalloc = 0;
-    } else {
-        meta_sinfo->bytes_delalloc -= num_bytes;
-    }
-    spin_unlock(&meta_sinfo->lock);
-
-    BUG_ON(bug);
-
-    return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-    u64 thresh;
-
-    thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use;
-
-    thresh = meta_sinfo->total_bytes - thresh;
-    thresh *= 80;
-    do_div(thresh, 100);
-    if (thresh <= meta_sinfo->bytes_delalloc)
-        meta_sinfo->force_delalloc = 1;
-    else
-        meta_sinfo->force_delalloc = 0;
+    if (flags & BTRFS_BLOCK_GROUP_DATA)
+        flags |= root->fs_info->avail_data_alloc_bits &
+                 root->fs_info->data_alloc_profile;
+    else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+        flags |= root->fs_info->avail_system_alloc_bits &
+                 root->fs_info->system_alloc_profile;
+    else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+        flags |= root->fs_info->avail_metadata_alloc_bits &
+                 root->fs_info->metadata_alloc_profile;
+    return btrfs_reduce_alloc_profile(root, flags);
 }
 
-struct async_flush {
-    struct btrfs_root *root;
-    struct btrfs_space_info *info;
-    struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
-    struct async_flush *async;
-    struct btrfs_root *root;
-    struct btrfs_space_info *info;
-
-    async = container_of(work, struct async_flush, work);
-    root = async->root;
-    info = async->info;
-
-    btrfs_start_delalloc_inodes(root, 0);
-    wake_up(&info->flush_wait);
-    btrfs_wait_ordered_extents(root, 0, 0);
-
-    spin_lock(&info->lock);
-    info->flushing = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->flush_wait);
-
-    kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-    DEFINE_WAIT(wait);
-    u64 used;
-
-    while (1) {
-        prepare_to_wait(&info->flush_wait, &wait,
-                        TASK_UNINTERRUPTIBLE);
-        spin_lock(&info->lock);
-        if (!info->flushing) {
-            spin_unlock(&info->lock);
-            break;
-        }
-
-        used = info->bytes_used + info->bytes_reserved +
-            info->bytes_pinned + info->bytes_readonly +
-            info->bytes_super + info->bytes_root +
-            info->bytes_may_use + info->bytes_delalloc;
-        if (used < info->total_bytes) {
-            spin_unlock(&info->lock);
-            break;
-        }
-        spin_unlock(&info->lock);
-        schedule();
-    }
-    finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
-                           struct btrfs_space_info *info)
-{
-    struct async_flush *async;
-    bool wait = false;
-
-    spin_lock(&info->lock);
+    u64 flags;
 
-    if (!info->flushing)
-        info->flushing = 1;
+    if (data)
+        flags = BTRFS_BLOCK_GROUP_DATA;
+    else if (root == root->fs_info->chunk_root)
+        flags = BTRFS_BLOCK_GROUP_SYSTEM;
     else
-        wait = true;
-
-    spin_unlock(&info->lock);
-
-    if (wait) {
-        wait_on_flush(info);
-        return;
-    }
-
-    async = kzalloc(sizeof(*async), GFP_NOFS);
-    if (!async)
-        goto flush;
-
-    async->root = root;
-    async->info = info;
-    async->work.func = flush_delalloc_async;
-
-    btrfs_queue_worker(&root->fs_info->enospc_workers,
-                       &async->work);
-    wait_on_flush(info);
-    return;
-
-flush:
-    btrfs_start_delalloc_inodes(root, 0);
-    btrfs_wait_ordered_extents(root, 0, 0);
-
-    spin_lock(&info->lock);
-    info->flushing = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->flush_wait);
-}
-
-static int maybe_allocate_chunk(struct btrfs_root *root,
-                                struct btrfs_space_info *info)
-{
-    struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-    struct btrfs_trans_handle *trans;
-    bool wait = false;
-    int ret = 0;
-    u64 min_metadata;
-    u64 free_space;
-
-    free_space = btrfs_super_total_bytes(disk_super);
-    /*
-     * we allow the metadata to grow to a max of either 10gb or 5% of the
-     * space in the volume.
-     */
-    min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-                       div64_u64(free_space * 5, 100));
-    if (info->total_bytes >= min_metadata) {
-        spin_unlock(&info->lock);
-        return 0;
-    }
-
-    if (info->full) {
-        spin_unlock(&info->lock);
-        return 0;
-    }
-
-    if (!info->allocating_chunk) {
-        info->force_alloc = 1;
-        info->allocating_chunk = 1;
-    } else {
-        wait = true;
-    }
-
-    spin_unlock(&info->lock);
-
-    if (wait) {
-        wait_event(info->allocate_wait,
-                   !info->allocating_chunk);
-        return 1;
-    }
-
-    trans = btrfs_start_transaction(root, 1);
-    if (!trans) {
-        ret = -ENOMEM;
-        goto out;
-    }
-
-    ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                         4096 + 2 * 1024 * 1024,
-                         info->flags, 0);
-    btrfs_end_transaction(trans, root);
-    if (ret)
-        goto out;
-out:
-    spin_lock(&info->lock);
-    info->allocating_chunk = 0;
-    spin_unlock(&info->lock);
-    wake_up(&info->allocate_wait);
-
-    if (ret)
-        return 0;
-    return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-                                        struct inode *inode, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 used;
-    u64 alloc_target;
-    int flushed = 0;
-    int force_delalloc;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                       num_items);
-again:
-    spin_lock(&meta_sinfo->lock);
-
-    force_delalloc = meta_sinfo->force_delalloc;
-
-    if (unlikely(!meta_sinfo->bytes_root))
-        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-    if (!flushed)
-        meta_sinfo->bytes_delalloc += num_bytes;
-
-    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-    if (used > meta_sinfo->total_bytes) {
-        flushed++;
-
-        if (flushed == 1) {
-            if (maybe_allocate_chunk(root, meta_sinfo))
-                goto again;
-            flushed++;
-        } else {
-            spin_unlock(&meta_sinfo->lock);
-        }
-
-        if (flushed == 2) {
-            filemap_flush(inode->i_mapping);
-            goto again;
-        } else if (flushed == 3) {
-            flush_delalloc(root, meta_sinfo);
-            goto again;
-        }
-        spin_lock(&meta_sinfo->lock);
-        meta_sinfo->bytes_delalloc -= num_bytes;
-        spin_unlock(&meta_sinfo->lock);
-        printk(KERN_ERR "enospc, has %d, reserved %d\n",
-               BTRFS_I(inode)->outstanding_extents,
-               BTRFS_I(inode)->reserved_extents);
-        dump_space_info(meta_sinfo, 0, 0);
-        return -ENOSPC;
-    }
+        flags = BTRFS_BLOCK_GROUP_METADATA;
 
-    BTRFS_I(inode)->reserved_extents += num_items;
-    check_force_delalloc(meta_sinfo);
-    spin_unlock(&meta_sinfo->lock);
-
-    if (!flushed && force_delalloc)
-        filemap_flush(inode->i_mapping);
-
-    return 0;
+    return get_alloc_profile(root, flags);
 }
 
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 alloc_target;
-    bool bug = false;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root, num_items);
-
-    spin_lock(&meta_sinfo->lock);
-    if (meta_sinfo->bytes_may_use < num_bytes) {
-        bug = true;
-        meta_sinfo->bytes_may_use = 0;
-    } else {
-        meta_sinfo->bytes_may_use -= num_bytes;
-    }
-    spin_unlock(&meta_sinfo->lock);
-
-    BUG_ON(bug);
-
-    return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 {
-    struct btrfs_fs_info *info = root->fs_info;
-    struct btrfs_space_info *meta_sinfo;
-    u64 num_bytes;
-    u64 used;
-    u64 alloc_target;
-    int retries = 0;
-
-    /* get the space info for where the metadata will live */
-    alloc_target = btrfs_get_alloc_profile(root, 0);
-    meta_sinfo = __find_space_info(info, alloc_target);
-
-    num_bytes = calculate_bytes_needed(root, num_items);
-again:
-    spin_lock(&meta_sinfo->lock);
-
-    if (unlikely(!meta_sinfo->bytes_root))
-        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
-    if (!retries)
-        meta_sinfo->bytes_may_use += num_bytes;
-
-    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
-    if (used > meta_sinfo->total_bytes) {
-        retries++;
-        if (retries == 1) {
-            if (maybe_allocate_chunk(root, meta_sinfo))
-                goto again;
-            retries++;
-        } else {
-            spin_unlock(&meta_sinfo->lock);
-        }
-
-        if (retries == 2) {
-            flush_delalloc(root, meta_sinfo);
-            goto again;
-        }
-        spin_lock(&meta_sinfo->lock);
-        meta_sinfo->bytes_may_use -= num_bytes;
-        spin_unlock(&meta_sinfo->lock);
-
-        dump_space_info(meta_sinfo, 0, 0);
-        return -ENOSPC;
-    }
-
-    check_force_delalloc(meta_sinfo);
-    spin_unlock(&meta_sinfo->lock);
-
-    return 0;
+    BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+                                                   BTRFS_BLOCK_GROUP_DATA);
 }
 
 /*
  * This will check the space that the inode allocates from to make sure we have
  * enough space for bytes.
  */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-                                u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 {
     struct btrfs_space_info *data_sinfo;
+    struct btrfs_root *root = BTRFS_I(inode)->root;
     u64 used;
-    int ret = 0, committed = 0, flushed = 0;
+    int ret = 0, committed = 0;
 
     /* make sure bytes are sectorsize aligned */
     bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
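
In other words, a new chunk is considered only once the writable space is both within 256 MiB of full and at least roughly 80% committed (div_factor(num_bytes, 8) is assumed to compute num_bytes * 8 / 10, as elsewhere in btrfs). A small user-space rendition of the test, where writable stands for total_bytes - bytes_readonly and committed for bytes_used + bytes_reserved:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t div_factor(uint64_t num, int factor)
	{
		return num * factor / 10;	/* assumed kernel semantics */
	}

	static int should_alloc_chunk(uint64_t writable, uint64_t committed,
				      uint64_t alloc_bytes)
	{
		if (committed + alloc_bytes + 256ULL * 1024 * 1024 < writable)
			return 0;	/* more than 256 MiB of headroom left */
		if (committed + alloc_bytes < div_factor(writable, 8))
			return 0;	/* still below ~80% of writable space */
		return 1;
	}

	int main(void)
	{
		uint64_t gib = 1024ULL * 1024 * 1024;

		/* 10 GiB writable, 7 GiB committed: plenty of headroom */
		printf("%d\n", should_alloc_chunk(10 * gib, 7 * gib, gib / 4));       /* 0 */
		/* 10 GiB writable, 9.8 GiB committed: both tests trip */
		printf("%d\n", should_alloc_chunk(10 * gib, 98 * gib / 10, gib / 4)); /* 1 */
		return 0;
	}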
3016
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
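
When btrfs_start_one_delalloc_inode() finds nothing to flush, the loop above sleeps with exponential backoff: the pause doubles from one jiffy and is capped at HZ/10 (100 ms worth of jiffies). A standalone trace of that schedule, with HZ assumed to be 250 (CONFIG_HZ is build-dependent):

	#include <stdio.h>

	int main(void)
	{
		int hz = 250;	/* assumption; pick your kernel's CONFIG_HZ */
		int pause = 1;

		for (int i = 0; i < 8; i++) {
			printf("%d ", pause);	/* sleeps, in jiffies: 1 2 4 8 16 25 25 25 */
			pause <<= 1;
			if (pause > hz / 10)
				pause = hz / 10;
		}
		printf("\n");
		return 0;
	}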
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
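
The priority check above is a proportionality test: a low-priority rsv is granted bytes only while (unused + reserved) * priority >= (num_bytes + reserved) * 10, i.e. after the grant the rsv would still hold at most priority/10 of the space it competes for. Worked through: with the default priority of 6 (set in btrfs_init_block_rsv below), reserved = 0 and unused = 100 MiB, a request succeeds only while num_bytes <= 60 MiB, since 100 * 6 >= num_bytes * 10 gives num_bytes <= 60. An rsv with priority >= 10 bypasses the test entirely.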
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
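
The spill behaviour above (shrinking size releases any excess reserved bytes to a destination rsv, or back to the space_info) can be checked with a small user-space model of the same logic:

	#include <stdint.h>
	#include <stdio.h>

	struct rsv { uint64_t size, reserved; int full; };

	/* user-space model of block_rsv_release_bytes(): returns the spill */
	static uint64_t release_bytes(struct rsv *r, uint64_t num_bytes)
	{
		if (num_bytes == (uint64_t)-1)
			num_bytes = r->size;
		r->size -= num_bytes;
		if (r->reserved >= r->size) {
			uint64_t excess = r->reserved - r->size;

			r->reserved = r->size;
			r->full = 1;
			return excess;	/* handed to the dest rsv, else to space_info */
		}
		return 0;
	}

	int main(void)
	{
		struct rsv r = { .size = 100, .reserved = 80, .full = 0 };

		/* shrink by 40: reserved 80 vs new size 60 -> 20 bytes spill */
		printf("spill=%llu size=%llu reserved=%llu full=%d\n",
		       (unsigned long long)release_bytes(&r, 40),
		       (unsigned long long)r.size,
		       (unsigned long long)r.reserved, r.full);
		return 0;
	}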
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after the transaction commits
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is the sum of the space used by the extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per-tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
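
Worked through with plausible numbers (data_used = 100 GiB, meta_used = 4 GiB, a 4 KiB block size and crc32c's 4-byte checksum): the checksum term is (100 GiB / 4 KiB) * 4 * 2 = 200 MiB, and the 2% term is 104 GiB / 50, about 2.08 GiB, for roughly 2.28 GiB in total. Since three times that exceeds meta_used, the result is capped to 4 GiB / 3, about 1.33 GiB, before being aligned up to leafsize << 10 (4 MiB granularity with 4 KiB leaves).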
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
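
With 4 KiB leaves and nodes and BTRFS_MAX_LEVEL = 8, this budgets (4096 + 4096 * 7) * 3 = 98304 bytes, i.e. 96 KiB per item: one full root-to-leaf path (a leaf plus up to seven nodes), tripled, presumably to leave headroom for node splits along the way.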
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
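
That is, checksum metadata is budgeted at one eighth of the data being reserved: a 1 MiB delalloc reservation adds 128 KiB on top of the per-extent item cost. (The >>= in the return is a harmless quirk; it only shifts the function's local copy of the parameter.)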
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
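
As an example of the sizing: an inode with three outstanding extents and two already-reserved extents gets nr_extents = (3 + 1) - 2 = 2, so to_reserve = calc_trans_metadata_size(root, 2) + num_bytes / 8. Using the 96 KiB per-item figure above, a 1 MiB reservation asks the delalloc rsv for 192 KiB + 128 KiB = 320 KiB.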
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
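
A hypothetical buffered-write path would pair the combined helpers like this (a sketch only; prepare_and_dirty_page() is a placeholder, and locking and page handling are omitted):

	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
	if (ret)
		return ret;	/* data or metadata side said -ENOSPC */

	ret = prepare_and_dirty_page(inode, pos);	/* placeholder step */
	if (ret) {
		/* undo both the data and the metadata halves in one call */
		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
		return ret;
	}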
3775
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517
3518 ret = btrfs_discard_extent(root, bytenr,
3519 num_bytes);
3520 WARN_ON(ret);
3521 3833
3522 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3523 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3524 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within a transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3573 3879
3574 set_extent_dirty(fs_info->pinned_extents, 3880/*
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3694 4049 &fs_info->durable_block_rsv_list, list) {
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696 struct btrfs_root *root,
3697 struct btrfs_path *path,
3698 u64 bytenr, u64 num_bytes,
3699 int is_data, int reserved,
3700 struct extent_buffer **must_clean)
3701{
3702 int err = 0;
3703 struct extent_buffer *buf;
3704 4050
3705 if (is_data) 4051 idx = trans->transid & 0x1;
3706 goto pinit; 4052 if (block_rsv->freed[idx] > 0) {
3707 4053 block_rsv_add_bytes(block_rsv,
3708 /* 4054 block_rsv->freed[idx], 0);
3709 * discard is sloooow, and so triggering discards on 4055 block_rsv->freed[idx] = 0;
3710 * individual btree blocks isn't a good plan. Just 4056 }
3711 * pin everything in discard mode. 4057 if (atomic_read(&block_rsv->usage) == 0) {
3712 */ 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4424 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4443 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4444 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4450 return ret;
4068} 4451}
4069 4452
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4454{
4087 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4502 return 0;
4135} 4503}
4136 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
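
The index orders the new per-space_info block group lists roughly from most to least redundant: 0 = RAID10, 1 = RAID1, 2 = DUP, 3 = RAID0, 4 = single (no RAID flag set). find_free_extent walks the list for one index at a time and, near the end of the search, falls through to the next list via "if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search;", so an allocation only spills into a less redundant group once the more redundant ones are exhausted.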
4520
4137enum btrfs_loop_type { 4521enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4542 int data)
4160{ 4543{
4161 int ret = 0; 4544 int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4170 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4171 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4239 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4240 goto have_block_group; 4625 goto have_block_group;
4241 } 4626 }
4242 } else if (block_group) { 4627 } else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
4245 } 4630 }
4246search: 4631search:
4247 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4249 u64 offset; 4635 u64 offset;
4250 int cached; 4636 int cached;
4251 4637
@@ -4436,23 +4822,22 @@ checks:
4436 goto loop; 4822 goto loop;
4437 } 4823 }
4438 4824
4439 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4827
4442 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4443 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4837 goto loop;
4454 } 4838 }
4455 4839
4840 /* we are all good, lets return */
4456 ins->objectid = search_start; 4841 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4458 4843
@@ -4460,18 +4845,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4846 search_start - offset);
4462 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4848 break;
4468loop: 4849loop:
4469 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4470 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4472 } 4854 }
4473 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4474 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * for them to make caching progress. Also 4861 * for them to make caching progress. Also
4477 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4875 found_uncached_bg = false;
4490 loop++; 4876 loop++;
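The hunks above convert the allocator's single block-group list into per-RAID-type buckets: the search walks space_info->block_groups[index] and, when nothing turns up, bumps index and jumps back to the search label until BTRFS_NR_RAID_TYPES buckets are exhausted. A minimal userspace sketch of that retry shape; find_free() and buckets[] are illustrative stand-ins, not kernel symbols:

    /*
     * Userspace model of the bucketed retry above; sizes are made up.
     */
    #include <stdio.h>

    #define NR_RAID_TYPES 5   /* stands in for BTRFS_NR_RAID_TYPES */

    /* one free-byte counter stands in for a whole list of block groups */
    static unsigned long long buckets[NR_RAID_TYPES] = { 0, 0, 4096, 0, 0 };

    static int find_free(unsigned long long need, int index)
    {
        for (; index < NR_RAID_TYPES; index++) {   /* ++index < BTRFS_NR_RAID_TYPES */
            if (buckets[index] >= need) {
                buckets[index] -= need;            /* "ins" filled in, search done */
                return index;
            }
        }
        return -1;   /* every bucket exhausted: caller falls back to chunk alloc */
    }

    int main(void)
    {
        printf("allocated from bucket %d\n", find_free(1024, 0));
        return 0;
    }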
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4953 int dump_block_groups)
4568{ 4954{
4569 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4570 4957
4571 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4962 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4589 4973
4590 if (!dump_block_groups) 4974 if (!dump_block_groups)
4591 return; 4975 return;
4592 4976
4593 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4605 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4606 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4607} 4994}
4608 4995
@@ -4628,9 +5015,8 @@ again:
4628 5015
4629 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5019 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5020
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4669 5055
4670 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4673 5059
4674 return ret; 5060 return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5118 btrfs_free_path(path);
4733 5119
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5121 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5178 btrfs_free_path(path);
4794 5179
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5181 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4870 } 5254 }
4871 5255
4872 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4876 return ret; 5261 return ret;
4877} 5262}
4878 5263
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5265 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5299 return buf;
4975} 5300}
4976 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
 5323 printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
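use_block_rsv(), added above, implements a two-sided policy: an unsized reserve pulls metadata bytes on demand, while a sized one hands out bytes it already holds and fails with -ENOSPC (after a warning) when it runs dry; unuse_block_rsv() returns the bytes on the error path. A hedged userspace model of that policy, with struct rsv and the helper names invented for illustration:

    #include <stdio.h>

    struct rsv { unsigned long long size, reserved; };

    static int rsv_use(struct rsv *r, unsigned long long n)
    {
        if (r->size == 0) {           /* unsized rsv: reserve on demand */
            r->reserved += n;         /* models reserve_metadata_bytes() */
            return 0;
        }
        if (r->reserved < n)
            return -1;                /* kernel path warns and returns -ENOSPC */
        r->reserved -= n;             /* models block_rsv_use_bytes() */
        return 0;
    }

    static void rsv_unuse(struct rsv *r, unsigned long long n)
    {
        r->reserved += n;             /* give the blocksize back on failure */
    }

    int main(void)
    {
        struct rsv r = { .size = 8192, .reserved = 8192 };
        if (rsv_use(&r, 4096) == 0)
            printf("reserved left: %llu\n", r.reserved);
        rsv_unuse(&r, 4096);
        return 0;
    }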
4977/* 5336/*
4978 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4979 * returns the tree buffer or NULL. 5341 * returns the tree buffer or NULL.
4980 */ 5342 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4986{ 5348{
4987 struct btrfs_key ins; 5349 struct btrfs_key ins;
4988 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
5354
4990 5355
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5356 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5362 if (ret) {
4994 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4996 } 5365 }
4997 5366
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
5000 return buf; 5397 return buf;
5001} 5398}
5002 5399
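With alloc_tree_block() removed, btrfs_alloc_free_block() above now owns the whole sequence: draw from the block reserve, attempt the extent allocation, and hand the reservation back before erroring out if the allocation fails. The shape of that unwind, modeled in userspace with stand-in functions rather than the kernel API:

    #include <stdio.h>

    static int reserve(int n)      { return n <= 4096 ? 0 : -1; }  /* stand-in */
    static void unreserve(int n)   { (void)n; }                    /* stand-in */
    static int alloc_extent(int n) { return n % 2 ? -1 : 0; }      /* stand-in */

    static int alloc_block(int blocksize)
    {
        int ret = reserve(blocksize);
        if (ret)
            return ret;                   /* nothing to unwind yet */

        ret = alloc_extent(blocksize);
        if (ret) {
            unreserve(blocksize);         /* mirrors unuse_block_rsv() */
            return ret;
        }
        return 0;                         /* buffer init + delayed ref follow */
    }

    int main(void)
    {
        printf("alloc_block(4096) = %d\n", alloc_block(4096));
        return 0;
    }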
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5718 struct btrfs_path *path,
5322 struct walk_control *wc) 5719 struct walk_control *wc)
5323{ 5720{
5324 int ret = 0; 5721 int ret;
5325 int level = wc->level; 5722 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5724 u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5797 }
5401 5798
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5800out:
5406 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5408 return ret; 5803 return 0;
5409} 5804}
5410 5805
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5879 * blocks are properly updated.
5485 */ 5880 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5883{
5488 struct btrfs_path *path; 5884 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5898 BUG_ON(!wc);
5503 5899
5504 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5505 5903
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5987 }
5590 5988
5591 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5992 &root->root_key,
5596 root_item); 5993 root_item);
5597 BUG_ON(ret); 5994 BUG_ON(ret);
5598 5995
5599 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5998 if (block_rsv)
5602 unsigned long update; 5999 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 6000 }
5609 } 6001 }
5610 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6024 kfree(root);
5633 } 6025 }
5634out: 6026out:
5635 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6028 kfree(wc);
5637 btrfs_free_path(path); 6029 btrfs_free_path(path);
5638 return err; 6030 return err;
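The btrfs_drop_snapshot() hunks above swap the hand-rolled in_commit/flushing checks for btrfs_should_end_transaction(): the long tree walk periodically records its progress in the root item, ends the (throttled) transaction, and starts a fresh one with the caller's block_rsv reattached. A userspace model of that checkpoint-and-restart loop, with an arbitrary batch size standing in for the kernel's heuristic:

    #include <stdio.h>

    #define BATCH 100   /* stands in for btrfs_should_end_transaction() */

    int main(void)
    {
        int trans_items = 0;

        for (int item = 0; item < 1000; item++) {
            /* ... drop one tree block under the current transaction ... */
            if (++trans_items >= BATCH) {
                printf("checkpoint at item %d\n", item);  /* btrfs_update_root() */
                trans_items = 0;    /* end_transaction_throttle + start fresh */
            }
        }
        return 0;
    }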
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7620 return flags;
7229} 7621}
7230 7622
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7624{
7235 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7626 u64 num_bytes;
7237 u64 calc; 7627 int ret = -ENOSPC;
7238 7628
7239 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7631
7244 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7246 7650
7247 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7653
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7259 7658
7260 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7261 } else 7660
7262 spin_unlock(&shrink_block_group->lock); 7661 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7662 BUG_ON(IS_ERR(trans));
7264}
7265 7663
7664 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7667
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7668 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7269 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7270{ 7683{
7271 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7273 return 0; 7697 return 0;
7274} 7698}
7275 7699
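set_block_group_ro() above is ultimately one accounting inequality: a group may go read-only only if everything the space_info has already committed, plus this group's unused space, still fits inside total_bytes. The same check in plain arithmetic, as a userspace model with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long total = 100, used = 40, reserved = 10, pinned = 5,
                           may_use = 5, readonly = 0;
        unsigned long long group_unused = 20;  /* key.offset - reserved - pinned - ... */

        if (used + reserved + pinned + may_use + readonly + group_unused < total) {
            readonly += group_unused;          /* sinfo->bytes_readonly += num_bytes */
            printf("group set RO, readonly=%llu\n", readonly);
        } else {
            printf("-ENOSPC: cannot set RO\n");
        }
        return 0;
    }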
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7860 */
7437 synchronize_rcu(); 7861 synchronize_rcu();
7438 7862
7863 release_global_block_rsv(info);
7864
7439 while (!list_empty(&info->space_info)) { 7865
7440 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7867 struct btrfs_space_info,
7442 list); 7868 list);
7443 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7444 list_del(&space_info->list); 7874 list_del(&space_info->list);
7445 kfree(space_info); 7875 kfree(space_info);
7446 } 7876 }
7447 return 0; 7877 return 0;
7448} 7878}
7449 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
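__link_block_group() above centralizes the insert-under-write-lock idiom: readers walk the per-index lists under the read side of groups_sem, so insertion takes the write side. A userspace analogue with pthreads; the names and the fixed-size arrays are illustrative only:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;
    static int groups[5][16];
    static int count[5];

    static void link_group(int index, int id)
    {
        pthread_rwlock_wrlock(&groups_sem);  /* down_write(&space_info->groups_sem) */
        groups[index][count[index]++] = id;  /* list_add_tail(&cache->list, ...) */
        pthread_rwlock_unlock(&groups_sem);  /* up_write(&space_info->groups_sem) */
    }

    int main(void)
    {
        link_group(3, 42);
        printf("bucket 3 has %d group(s)\n", count[3]);
        return 0;
    }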
7450int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7891{
7452 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7908
7469 while (1) { 7909 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7911 if (ret > 0)
7472 ret = 0; 7912 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7913 if (ret != 0)
7476 goto error; 7914 goto error;
7477 7915
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7919 if (!cache) {
7482 ret = -ENOMEM; 7920 ret = -ENOMEM;
7483 break; 7921 goto error;
7484 } 7922 }
7485 7923
7486 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7975 BUG_ON(ret);
7538 cache->space_info = space_info; 7976 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7542 7980
7543 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7982
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7984 BUG_ON(ret);
7549 7985
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7553 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
 7998 * avoid allocating from un-mirrored block groups if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7554 ret = 0; 8008 ret = 0;
7555error: 8009error:
7556 btrfs_free_path(path); 8010 btrfs_free_path(path);
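The post-scan pass added above flips whole buckets read-only when a mirrored profile (RAID1/RAID10/DUP) is present, so new allocations land on redundant storage; the code pins buckets 3 and 4 as the un-mirrored ones. A tiny sketch of that shape, without asserting which profiles those indices map to:

    #include <stdio.h>

    #define NR 5

    int main(void)
    {
        int ro[NR] = { 0 };
        int have_mirrored = 1;   /* RAID1/RAID10/DUP profile present */

        if (have_mirrored) {
            ro[3] = 1;           /* matches block_groups[3] above */
            ro[4] = 1;           /* matches block_groups[4] above */
        }
        printf("bucket 3 ro=%d, bucket 4 ro=%d\n", ro[3], ro[4]);
        return 0;
    }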
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8065 BUG_ON(ret);
7612 8066
7613 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7616 8070
7617 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8072
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive. This takes the tree lock. 745
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
@@ -2589,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2589 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2594 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2590 }; 2595 };
2591 struct writeback_control wbc_writepages = { 2596 struct writeback_control wbc_writepages = {
2592 .bdi = wbc->bdi,
2593 .sync_mode = wbc->sync_mode, 2597 .sync_mode = wbc->sync_mode,
2594 .older_than_this = NULL, 2598 .older_than_this = NULL,
2595 .nr_to_write = 64, 2599 .nr_to_write = 64,
@@ -2623,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2623 .sync_io = mode == WB_SYNC_ALL, 2627 .sync_io = mode == WB_SYNC_ALL,
2624 }; 2628 };
2625 struct writeback_control wbc_writepages = { 2629 struct writeback_control wbc_writepages = {
2626 .bdi = inode->i_mapping->backing_dev_info,
2627 .sync_mode = mode, 2630 .sync_mode = mode,
2628 .older_than_this = NULL, 2631 .older_than_this = NULL,
2629 .nr_to_write = nr_pages * 2, 2632 .nr_to_write = nr_pages * 2,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
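The EXTENT_CTLBITS mask introduced above is the key to the int-to-pointer change in the .c hunks: callers now pass control flags (accounting, first-delalloc) mixed in with real state flags, and the store paths mask the control flags out before touching state->state. The masking in miniature, as a self-contained userspace program:

    #include <stdio.h>

    #define EXTENT_DIRTY           (1 << 0)
    #define EXTENT_DO_ACCOUNTING   (1 << 11)
    #define EXTENT_FIRST_DELALLOC  (1 << 12)
    #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

    int main(void)
    {
        int bits = EXTENT_DIRTY | EXTENT_FIRST_DELALLOC;
        int bits_to_set = bits & ~EXTENT_CTLBITS;    /* as in set_state_bits() */

        printf("stored bits: 0x%x\n", bits_to_set);  /* only EXTENT_DIRTY lands */
        return 0;
    }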
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
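The refactor above is the classic flag-taking-helper pattern: __btrfs_lookup_bio_sums() grows a dio flag plus a logical offset, and two thin entry points pin the flag so existing callers are untouched. The same pattern in a userspace model with made-up checksum logic:

    #include <stdio.h>

    static int lookup_sums(unsigned long long off, int dio)
    {
        /* dio callers supply the logical offset; buffered ones derive it per page */
        return dio ? (int)(off & 0xff) : 0;
    }

    static int lookup_bio_sums(void)                     { return lookup_sums(0, 0); }
    static int lookup_bio_sums_dio(unsigned long long o) { return lookup_sums(o, 1); }

    int main(void)
    {
        printf("%d %d\n", lookup_bio_sums(), lookup_bio_sums_dio(0x1234));
        return 0;
    }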
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
@@ -657,6 +674,9 @@ again:
657 goto found; 674 goto found;
658 } 675 }
659 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
660 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
661 u32 item_size; 681 u32 item_size;
662 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 47 int write_bytes,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 const char __user *buf) 49 struct iov_iter *i)
50{ 50{
51 long page_fault = 0; 51 size_t copied;
52 int i; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 54
55 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
59 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
60 62
61 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
62 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
63 page_fault = __copy_from_user(page_address(page) + offset, 65
64 buf, count);
65 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
66 flush_dcache_page(page); 67 flush_dcache_page(page);
67 kunmap(page); 68 iov_iter_advance(i, copied);
68 buf += count; 69 write_bytes -= copied;
69 write_bytes -= count;
70 70
71 if (page_fault) 71 if (unlikely(copied == 0)) {
72 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
73 } 83 }
74 return page_fault ? -EFAULT : 0; 84 return 0;
75} 85}
76 86
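The btrfs_copy_from_user() rewrite above moves from a raw user-buffer walk to the iov_iter idiom: fault the source in, copy into the current page, advance the iterator by what was actually copied, and retry the same page on a short copy rather than failing. A userspace model of that loop structure (memcpy stands in for iov_iter_copy_from_user, so it never short-copies here):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 8

    int main(void)
    {
        const char *src = "hello iov_iter world";
        char pages[3][PAGE_SZ + 1] = { "" };
        size_t left = strlen(src), done = 0;
        size_t pg = 0, offset = 0;

        while (left > 0) {
            size_t count = left < PAGE_SZ - offset ? left : PAGE_SZ - offset;
            memcpy(pages[pg] + offset, src + done, count); /* iov_iter_copy_from_user */
            done += count;                                 /* iov_iter_advance() */
            left -= count;
            if (offset + count < PAGE_SZ)                  /* short copy: same page */
                offset += count;
            else {
                pg++;                                      /* page filled: move on */
                offset = 0;
            }
        }
        printf("%s%s%s\n", pages[0], pages[1], pages[2]);
        return 0;
    }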
77/* 87/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
126 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL); 138 NULL);
129 if (err) 139 BUG_ON(err);
130 return err;
131 140
132 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
133 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
142 * at this time. 151 * at this time.
143 */ 152 */
144 } 153 }
145 return err; 154 return 0;
146} 155}
147 156
148/* 157/*
@@ -823,45 +832,46 @@ again:
823 return 0; 832 return 0;
824} 833}
825 834
826static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
827 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
828{ 838{
829 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
830 loff_t start_pos; 846 loff_t start_pos;
831 ssize_t num_written = 0; 847 ssize_t num_written = 0;
832 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
833 int ret = 0; 851 int ret = 0;
834 struct inode *inode = fdentry(file)->d_inode;
835 struct btrfs_root *root = BTRFS_I(inode)->root;
836 struct page **pages = NULL;
837 int nrptrs; 852 int nrptrs;
838 struct page *pinned[2];
839 unsigned long first_index; 853 unsigned long first_index;
840 unsigned long last_index; 854 unsigned long last_index;
841 int will_write; 855 int will_write;
856 int buffered = 0;
842 857
843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
844 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
845 860
846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
847 PAGE_CACHE_SIZE / (sizeof(struct page *)));
848 pinned[0] = NULL; 861 pinned[0] = NULL;
849 pinned[1] = NULL; 862 pinned[1] = NULL;
850 863
851 pos = *ppos;
852 start_pos = pos; 864 start_pos = pos;
853 865
854 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
855 867
856 /* do the reserve before the mutex lock in case we have to do some
857 * flushing. We wouldn't deadlock, but this is more polite.
858 */
859 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
860 if (err)
861 goto out_nolock;
862
863 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
864 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
865 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
866 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
867 if (err) 877 if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
875 goto out; 885 goto out;
876 886
877 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
878 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
879 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
880 929
881 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
882 start_pos = pos; 931 start_pos = pos;
883 932
884 BTRFS_I(inode)->sequence++;
885 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
886 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
887 935
888 /* 936 /*
889 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
900 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
901 } 949 }
902 } 950 }
903 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
904 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
905 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
906 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
911 } 959 }
912 } 960 }
913 961
914 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
915 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
916 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
917 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
918 offset); 966 offset);
919 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
920 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
923 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
924 972
925 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
926 if (ret) 974 if (ret)
927 goto out; 975 goto out;
928 976
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
930 pos, first_index, last_index, 978 pos, first_index, last_index,
931 write_bytes); 979 write_bytes);
932 if (ret) { 980 if (ret) {
933 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
934 write_bytes);
935 goto out; 982 goto out;
936 } 983 }
937 984
938 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
939 write_bytes, pages, buf); 986 write_bytes, pages, &i);
940 if (ret) { 987 if (ret == 0) {
941 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
942 write_bytes); 989 num_pages, pos, write_bytes);
943 btrfs_drop_pages(pages, num_pages);
944 goto out;
945 } 990 }
946 991
947 ret = dirty_and_release_pages(NULL, root, file, pages,
948 num_pages, pos, write_bytes);
949 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
950 if (ret) { 993 if (ret) {
951 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
952 write_bytes);
953 goto out; 995 goto out;
954 } 996 }
955 997
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
965 btrfs_throttle(root); 1007 btrfs_throttle(root);
966 } 1008 }
967 1009
968 buf += write_bytes;
969 count -= write_bytes;
970 pos += write_bytes; 1010 pos += write_bytes;
971 num_written += write_bytes; 1011 num_written += write_bytes;
972 1012
@@ -976,9 +1016,7 @@ out:
976 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
977 if (ret) 1017 if (ret)
978 err = ret; 1018 err = ret;
979 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
980 1019
981out_nolock:
982 kfree(pages); 1020 kfree(pages);
983 if (pinned[0]) 1021 if (pinned[0])
984 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
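The O_DIRECT hunks above establish a try-direct-then-fall-back flow: attempt the direct write, and if it covered only part of the range, finish with buffered writes and remember to invalidate the page cache for that span afterwards. The control flow modeled in userspace, with stand-in write functions:

    #include <stdio.h>

    static long direct_write(long count)   { return count / 2; }  /* partial stand-in */
    static long buffered_write(long count) { return count; }      /* stand-in */

    int main(void)
    {
        long count = 4096, written, buffered = 0;

        written = direct_write(count);
        if (written < count) {       /* num_written != count */
            buffered = 1;            /* invalidate cached pages afterwards */
            written += buffered_write(count - written);
        }
        printf("wrote %ld bytes, buffered fallback: %ld\n", written, buffered);
        return 0;
    }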
@@ -1008,7 +1046,7 @@ out_nolock:
1008 num_written = err; 1046 num_written = err;
1009 1047
1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1011 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1012 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1013 file->f_dentry); 1051 file->f_dentry);
1014 if (ret == 0) { 1052 if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
1023 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1024 } 1062 }
1025 } 1063 }
1026 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1027 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1028 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1029 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1063 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1064 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1065 */ 1103 */
1066int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1067{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1068 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1069 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1070 int ret = 0; 1109 int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1101 /* 1140 /*
1102	 * ok we haven't committed the transaction yet, let's do a commit	1141
1103 */ 1142 */
1104 if (file && file->private_data) 1143 if (file->private_data)
1105 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1106 1145
1107 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1108 if (!trans) { 1147 if (IS_ERR(trans)) {
1109 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1110 goto out; 1149 goto out;
1111 } 1150 }
1112 1151
@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1151 1190
1152static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1191static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1153{ 1192{
1154 vma->vm_ops = &btrfs_file_vm_ops; 1193 struct address_space *mapping = filp->f_mapping;
1194
1195 if (!mapping->a_ops->readpage)
1196 return -ENOEXEC;
1197
1155 file_accessed(filp); 1198 file_accessed(filp);
1199 vma->vm_ops = &btrfs_file_vm_ops;
1200 vma->vm_flags |= VM_CAN_NONLINEAR;
1201
1156 return 0; 1202 return 0;
1157} 1203}
1158 1204
1159const struct file_operations btrfs_file_operations = { 1205const struct file_operations btrfs_file_operations = {
1160 .llseek = generic_file_llseek, 1206 .llseek = generic_file_llseek,
1161 .read = do_sync_read, 1207 .read = do_sync_read,
1208 .write = do_sync_write,
1162 .aio_read = generic_file_aio_read, 1209 .aio_read = generic_file_aio_read,
1163 .splice_read = generic_file_splice_read, 1210 .splice_read = generic_file_splice_read,
1164 .write = btrfs_file_write, 1211 .aio_write = btrfs_file_aio_write,
1165 .mmap = btrfs_file_mmap, 1212 .mmap = btrfs_file_mmap,
1166 .open = generic_file_open, 1213 .open = generic_file_open,
1167 .release = btrfs_release_file, 1214 .release = btrfs_release_file,
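Note on the file.c hunks above: the write path replaces the split data/metadata reservation calls with the combined delalloc helpers. A minimal sketch of the reserve/release pairing these hunks establish, assuming only the helper names visible in the diff (copy_and_dirty() is a hypothetical stand-in for the copy and page-dirtying steps):

/*
 * Sketch, not kernel code: every successful
 * btrfs_delalloc_reserve_space() must be matched either by a
 * completed write or by btrfs_delalloc_release_space() on the
 * error paths, exactly as in the loop above.
 */
static int write_one_chunk(struct inode *inode, loff_t pos,
                           size_t write_bytes)
{
        int ret;

        ret = btrfs_delalloc_reserve_space(inode, write_bytes);
        if (ret)
                return ret;     /* nothing reserved yet, nothing to undo */

        ret = copy_and_dirty(inode, pos, write_bytes);  /* hypothetical */
        if (ret)
                btrfs_delalloc_release_space(inode, write_bytes);
        return ret;
}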
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
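The new btrfs_lookup_inode_ref() above has three outcomes: ERR_PTR() when the tree search itself fails, NULL when no backref with that name exists, and a pointer into the leaf otherwise. A hedged caller sketch (the surrounding variables are assumed):

struct btrfs_inode_ref *ref;

/* mod == 0: read-only lookup, no COW of the tree blocks */
ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
                             inode_objectid, dir_objectid, 0);
if (IS_ERR(ref))
        return PTR_ERR(ref);            /* the search itself failed */
if (!ref)
        return -ENOENT;                 /* no backref with that name */
index = btrfs_inode_ref_index(path->nodes[0], ref);
btrfs_release_path(root, path);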
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..c03864406af3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
252 inline_len, compressed_size, 252 inline_len, compressed_size,
253 compressed_pages); 253 compressed_pages);
254 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
255 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
256 return 0; 257 return 0;
257} 258}
@@ -414,6 +415,7 @@ again:
414 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
415 BUG_ON(!trans); 416 BUG_ON(!trans);
416 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
417 419
418	 /* let's try to make an inline extent */	420	 /* let's try to make an inline extent */
419 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
439 start, end, NULL, 441 start, end, NULL,
440 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
441 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
442 EXTENT_CLEAR_ACCOUNTING |
443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
444 445
445 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
697 return 0; 698 return 0;
698} 699}
699 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
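get_extent_allocation_hint() factors the extent-map scan out of cow_file_range(); note it takes the em_tree read lock itself, so callers reduce to two lines, as the later hunk shows:

/* the helper handles the em_tree locking internally */
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);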
700/* 733/*
701 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
702	 * the callbacks end up in this code. The basic idea is to	735
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
734 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
735 BUG_ON(!trans); 768 BUG_ON(!trans);
736 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
737 771
738 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
739 773
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
753 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
754 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
755 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
756 EXTENT_CLEAR_ACCOUNTING |
757 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
758 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
759 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
769 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
770 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
771 804
772 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
773 read_lock(&BTRFS_I(inode)->extent_tree.lock);
774 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
775 start, num_bytes);
776 if (em) {
777 /*
778 * if block start isn't an actual block number then find the
779 * first block in this inode and use that as a hint. If that
780 * block is also bogus then just don't worry about it.
781 */
782 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
783 free_extent_map(em);
784 em = search_extent_mapping(em_tree, 0, 0);
785 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
786 alloc_hint = em->block_start;
787 if (em)
788 free_extent_map(em);
789 } else {
790 alloc_hint = em->block_start;
791 free_extent_map(em);
792 }
793 }
794 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
795 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
796 807
797 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
1174 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1175 BUG_ON(ret); 1186 BUG_ON(ret);
1176 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1177 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1178 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1179 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1226} 1244}
1227 1245
1228static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1229 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1230{ 1248{
1249 /* not delalloc, ignore it */
1231 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1232 return 0; 1251 return 0;
1233 1252
1234 spin_lock(&BTRFS_I(inode)->accounting_lock); 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1235 BTRFS_I(inode)->outstanding_extents++;
1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1237
1238 return 0; 1254 return 0;
1239} 1255}
1240 1256
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1252 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1253 return 0; 1269 return 0;
1254 1270
1255 spin_lock(&BTRFS_I(inode)->accounting_lock); 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1256 BTRFS_I(inode)->outstanding_extents--;
1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1258
1259 return 0; 1272 return 0;
1260} 1273}
1261 1274
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1265 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1266 */ 1279 */
1267static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1268 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1269{ 1282{
1270 1283
1271 /* 1284 /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1273	 * but in this case, we are only testing for the DELALLOC	1286
1274 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1275 */ 1288 */
1276 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1277 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1278 1292
1279 spin_lock(&BTRFS_I(inode)->accounting_lock); 1293 if (*bits & EXTENT_FIRST_DELALLOC)
1280 BTRFS_I(inode)->outstanding_extents++; 1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1281 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1295 else
1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1283 1297
1284 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1286 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1287 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1288 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1289 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1297 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1298 */ 1312 */
1299static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1300 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1301{ 1315{
1302 /* 1316 /*
1303 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1304	 * but in this case, we are only testing for the DELALLOC	1318
1305 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1306 */ 1320 */
1307 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1308 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1309 1324
1310 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1311 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents); 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1313 BTRFS_I(inode)->outstanding_extents--; 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1314 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1329
1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1316 } 1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1317 1335
1318 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1319 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1320 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1321 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1322 "%llu %llu\n",
1323 (unsigned long long)
1324 state->end - state->start + 1,
1325 (unsigned long long)
1326 root->fs_info->delalloc_bytes);
1327 btrfs_delalloc_free_space(root, inode, (u64)-1);
1328 root->fs_info->delalloc_bytes = 0;
1329 BTRFS_I(inode)->delalloc_bytes = 0;
1330 } else {
1331 btrfs_delalloc_free_space(root, inode,
1332 state->end -
1333 state->start + 1);
1334 root->fs_info->delalloc_bytes -= state->end -
1335 state->start + 1;
1336 BTRFS_I(inode)->delalloc_bytes -= state->end -
1337 state->start + 1;
1338 }
1339 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1340 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1341 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
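Across the split/merge/set/clear hooks, outstanding_extents becomes an atomic_t and the new EXTENT_FIRST_DELALLOC bit stops the first extent of a reservation from being counted twice. A sketch of the counting rule the four hooks implement, using only names from the hunks:

/* illustration only: the balance kept by the hooks above */
static void account_extent(struct inode *inode, int *bits, int set)
{
        if (*bits & EXTENT_FIRST_DELALLOC) {
                /* the reservation itself already counted this extent */
                *bits &= ~EXTENT_FIRST_DELALLOC;
        } else if (set) {
                atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
                /* with DO_ACCOUNTING, clear_bit_hook releases the
                 * metadata reservation instead of decrementing */
                atomic_dec(&BTRFS_I(inode)->outstanding_extents);
        }
}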
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1384 */ 1385 */
1385static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1386 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1387 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1388{ 1390{
1389 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1390 int ret = 0; 1392 int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1403 * are inserted into the btree 1405 * are inserted into the btree
1404 */ 1406 */
1405static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1406 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1407{ 1410{
1408 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1409 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1414 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1415 */ 1418 */
1416static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1417 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1418{ 1422{
1419 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1420 int ret = 0; 1424 int ret = 0;
@@ -1425,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1425 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1426 BUG_ON(ret); 1430 BUG_ON(ret);
1427 1431
1428 if (!(rw & (1 << BIO_RW))) { 1432 if (!(rw & REQ_WRITE)) {
1429 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1433 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1430 return btrfs_submit_compressed_read(inode, bio, 1434 return btrfs_submit_compressed_read(inode, bio,
1431 mirror_num, bio_flags); 1435 mirror_num, bio_flags);
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1439 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1440 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1441 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1442 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1443 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1444 } 1449 }
1445 1450
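All three submit hooks gain a u64 bio_offset parameter, which btrfs_wq_submit_bio() now threads through to the async checksum workers. The resulting hook shape, as implied by the hunks (the typedef name is an assumption):

/* assumed shape of the submit hook after this change */
typedef int (*extent_submit_bio_hook_t)(struct inode *inode, int rw,
                                        struct bio *bio, int mirror_num,
                                        unsigned long bio_flags,
                                        u64 bio_offset);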
@@ -1520,6 +1525,7 @@ again:
1520 goto again; 1525 goto again;
1521 } 1526 }
1522 1527
1528 BUG();
1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1524 ClearPageChecked(page); 1530 ClearPageChecked(page);
1525out: 1531out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1650static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1651{ 1657{
1652 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1653 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1654 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1669 if (!ret) { 1675 if (!ret) {
1670 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1671 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1672 BUG_ON(ret); 1680 BUG_ON(ret);
1673 btrfs_end_transaction(trans, root);
1674 } 1681 }
1675 goto out; 1682 goto out;
1676 } 1683 }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1680 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1681 1688
1682 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1683 1692
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1694 compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1712 &ordered_extent->list); 1721 &ordered_extent->list);
1713 1722
1714 /* this also removes the ordered extent from the tree */
1715 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1716 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1717 BUG_ON(ret); 1725 BUG_ON(ret);
1718 btrfs_end_transaction(trans, root);
1719out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1720 /* once for us */ 1730 /* once for us */
1721 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1722 /* once for the tree */ 1732 /* once for the tree */
@@ -1831,14 +1841,14 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1831 bio->bi_size = 0; 1841 bio->bi_size = 0;
1832 1842
1833 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1843 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1834 if (failed_bio->bi_rw & (1 << BIO_RW)) 1844 if (failed_bio->bi_rw & REQ_WRITE)
1835 rw = WRITE; 1845 rw = WRITE;
1836 else 1846 else
1837 rw = READ; 1847 rw = READ;
1838 1848
1839 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1840 failrec->last_mirror, 1850 failrec->last_mirror,
1841 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1842 return 0; 1852 return 0;
1843} 1853}
1844 1854
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1993} 2003}
1994 2004
1995/* 2005/*
	2006	 * calculate the extra metadata reservation needed when snapshotting a
	2007	 * subvolume that contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
	2094	 * This is called at transaction commit time. If there are no orphan
	2095	 * files in the subvolume, it removes the orphan item and frees the
	2096	 * block_rsv structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
1996 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
1997 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
	2126	 * NOTE: the caller of this function should reserve 5 units of metadata
	2127	 * before calling it.
1998 */ 2128 */
1999int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2000{ 2130{
2001 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2002 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2136
2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2003 2141
2004 spin_lock(&root->list_lock); 2142 spin_lock(&root->orphan_lock);
2143 if (!root->orphan_block_rsv) {
2144 root->orphan_block_rsv = block_rsv;
2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2148 }
2005 2149
2006 /* already on the orphan list, we're good */ 2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2007 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2008 spin_unlock(&root->list_lock); 2152#if 0
2009 return 0; 2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2010 } 2166 }
2011 2167
2012 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2013 2173
2014 spin_unlock(&root->list_lock); 2174 if (block_rsv)
2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2015 2176
2016 /* 2177 /* grab metadata reservation from transaction handle */
2017 * insert an orphan item to track this unlinked/truncated file 2178 if (reserve) {
2018 */ 2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2019 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2180 BUG_ON(ret);
2181 }
2020 2182
2021 return ret; 2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
	2189	 /* insert an orphan item to track that the subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2022} 2196}
2023 2197
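btrfs_orphan_add() above allocates the orphan block_rsv outside root->orphan_lock and installs it under the lock, discarding it if another task raced in first. The pattern in isolation (sketch; error handling elided):

struct btrfs_block_rsv *block_rsv = NULL;

if (!root->orphan_block_rsv)
        block_rsv = btrfs_alloc_block_rsv(root);        /* may sleep */

spin_lock(&root->orphan_lock);
if (!root->orphan_block_rsv) {
        root->orphan_block_rsv = block_rsv;             /* we installed it */
        block_rsv = NULL;
}
spin_unlock(&root->orphan_lock);

if (block_rsv)                                          /* lost the race */
        btrfs_free_block_rsv(root, block_rsv);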
2024/* 2198/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2028int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2029{ 2203{
2030 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2031 int ret = 0; 2207 int ret = 0;
2032 2208
2033 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2034 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2035 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2036 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2037 return 0;
2038 } 2213 }
2039 2214
2040 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2041 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2042 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2043 return 0;
2044 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2045 2220
2046 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2047 2225
2048 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2049 2228
2050 return ret; 2229 return 0;
2051} 2230}
2052 2231
2053/* 2232/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2064 struct inode *inode; 2243 struct inode *inode;
2065 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2066 2245
2067 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2068 return; 2247 return;
2069 2248
2070 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2117 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2118 found_key.offset = 0; 2297 found_key.offset = 0;
2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2120 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2121 break;
2122 2300
2123 /* 2301 /*
2124 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2125 * the proper thing when we hit it 2303 * the proper thing when we hit it
2126 */ 2304 */
2127 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2128 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2129 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2130 2308
2131 /* 2309 /*
	2132	 * if this is a bad inode, it means we actually succeeded in	2310
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2135 * do a destroy_inode 2313 * do a destroy_inode
2136 */ 2314 */
2137 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2138 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2139 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2140 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2141 iput(inode); 2319 iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2154 iput(inode); 2332 iput(inode);
2155 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2156 2346
2157 if (nr_unlink) 2347 if (nr_unlink)
2158 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2159 if (nr_truncate) 2349 if (nr_truncate)
2160 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2161
2162 btrfs_free_path(path);
2163} 2351}
2164 2352
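The cmpxchg() above replaces the old xchg(&root->clean_orphans, 0) test and guarantees the cleanup runs at most once per mount. The state machine in brief (sketch):

/* only the first caller sees 0 and proceeds */
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
        return;
/* ... scan the orphan items, iput() each inode ... */
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;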
2165/* 2353/*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
	2494	 */	2720	 /* check if someone else holds a reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2498 2735
2499 trans = btrfs_start_transaction(root, 1); 2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2503 } 2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521} 2882}
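__unlink_start_trans() first tries an ordinary ten-item reservation; only when that returns -ENOSPC does it fall back to proving, via commit-root lookups and check_path_shared(), that the unlink cannot COW shared blocks, then borrows the global reservation, serialized by fs_info->enospc_unlink. The caller pattern (sketch):

trans = __unlink_start_trans(dir, dentry);
if (IS_ERR(trans))
        return PTR_ERR(trans);  /* -ENOSPC if a shared path was found */

/* ... btrfs_unlink_inode(), btrfs_orphan_add() ... */

__unlink_end_trans(trans, root);        /* also clears enospc_unlink */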
@@ -2577,7 +2938,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2577 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2578 ret = btrfs_update_inode(trans, root, dir); 2939 ret = btrfs_update_inode(trans, root, dir);
2579 BUG_ON(ret); 2940 BUG_ON(ret);
2580 dir->i_sb->s_dirt = 1;
2581 2941
2582 btrfs_free_path(path); 2942 btrfs_free_path(path);
2583 return 0; 2943 return 0;
@@ -2587,7 +2947,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587{ 2947{
2588 struct inode *inode = dentry->d_inode; 2948 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2949 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2950 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2951 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2952 unsigned long nr = 0;
@@ -2596,15 +2955,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2955 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2956 return -ENOTEMPTY;
2598 2957
2599 ret = btrfs_reserve_metadata_space(root, 5); 2958 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2959 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2960 return PTR_ERR(trans);
2607 }
2608 2961
2609 btrfs_set_trans_block_group(trans, dir); 2962 btrfs_set_trans_block_group(trans, dir);
2610 2963
@@ -2627,12 +2980,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2980 btrfs_i_size_write(inode, 0);
2628out: 2981out:
2629 nr = trans->blocks_used; 2982 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2983 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2984 btrfs_btree_balance_dirty(root, nr);
2633 2985
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2986 return err;
2637} 2987}
2638 2988
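These hunks repeat one conversion: btrfs_reserve_metadata_space(root, N) followed by btrfs_start_transaction(root, 1) collapses into btrfs_start_transaction(root, N), which reserves space for N items itself and reports failure as ERR_PTR() rather than NULL. The resulting idiom:

trans = btrfs_start_transaction(root, 5);       /* reserve 5 items */
if (IS_ERR(trans))
        return PTR_ERR(trans);                  /* never NULL anymore */
/* ... modify the tree ... */
btrfs_end_transaction(trans, root);             /* drops the reservation */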
@@ -3029,6 +3379,7 @@ out:
3029 if (pending_del_nr) { 3379 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3380 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3381 pending_del_nr);
3382 BUG_ON(ret);
3032 } 3383 }
3033 btrfs_free_path(path); 3384 btrfs_free_path(path);
3034 return err; 3385 return err;
@@ -3056,11 +3407,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3407
3057 if ((offset & (blocksize - 1)) == 0) 3408 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3409 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3410 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3411 if (ret)
3065 goto out; 3412 goto out;
3066 3413
@@ -3068,8 +3415,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068again: 3415again:
3069 page = grab_cache_page(mapping, index); 3416 page = grab_cache_page(mapping, index);
3070 if (!page) { 3417 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3418 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3419 goto out;
3074 } 3420 }
3075 3421
@@ -3132,8 +3478,7 @@ again:
3132 3478
3133out_unlock: 3479out_unlock:
3134 if (ret) 3480 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3481 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3482 unlock_page(page);
3138 page_cache_release(page); 3483 page_cache_release(page);
3139out: 3484out:
@@ -3145,7 +3490,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3490 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3491 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3492 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3493 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3494 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3495 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3496 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3528,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3528 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3529 hole_size = last_byte - cur_offset;
3185 3530
3186 err = btrfs_reserve_metadata_space(root, 2); 3531 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3532 if (IS_ERR(trans)) {
3533 err = PTR_ERR(trans);
3188 break; 3534 break;
3189 3535 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3536 btrfs_set_trans_block_group(trans, inode);
3192 3537
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3538 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3550,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3550 last_byte - 1, 0);
3206 3551
3207 btrfs_end_transaction(trans, root); 3552 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3553 }
3210 free_extent_map(em); 3554 free_extent_map(em);
3555 em = NULL;
3211 cur_offset = last_byte; 3556 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3557 if (cur_offset >= block_end)
3213 break; 3558 break;
3214 } 3559 }
3215 3560
3561 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3562 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3563 GFP_NOFS);
3218 return err; 3564 return err;
@@ -3239,11 +3585,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3585 }
3240 } 3586 }
3241 3587
3242 ret = btrfs_reserve_metadata_space(root, 1); 3588 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3589 if (IS_ERR(trans))
3244 return ret; 3590 return PTR_ERR(trans);
3245 3591
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3592 btrfs_set_trans_block_group(trans, inode);
3248 3593
3249 ret = btrfs_orphan_add(trans, inode); 3594 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3596,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3596
3252 nr = trans->blocks_used; 3597 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3598 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3599 btrfs_btree_balance_dirty(root, nr);
3256 3600
3257 if (attr->ia_size > inode->i_size) { 3601 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3608,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3608 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3609 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3610
3267 trans = btrfs_start_transaction(root, 1); 3611 trans = btrfs_start_transaction(root, 0);
3612 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3613 btrfs_set_trans_block_group(trans, inode);
3614 trans->block_rsv = root->orphan_block_rsv;
3615 BUG_ON(!trans->block_rsv);
3269 3616
3270 ret = btrfs_update_inode(trans, root, inode); 3617 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3618 BUG_ON(ret);
@@ -3308,17 +3655,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3308 if (err) 3655 if (err)
3309 return err; 3656 return err;
3310 } 3657 }
3311 attr->ia_valid &= ~ATTR_SIZE;
3312 3658
3313 if (attr->ia_valid) 3659 if (attr->ia_valid) {
3314 err = inode_setattr(inode, attr); 3660 setattr_copy(inode, attr);
3661 mark_inode_dirty(inode);
3662
3663 if (attr->ia_valid & ATTR_MODE)
3664 err = btrfs_acl_chmod(inode);
3665 }
3315 3666
3316 if (!err && ((attr->ia_valid & ATTR_MODE)))
3317 err = btrfs_acl_chmod(inode);
3318 return err; 3667 return err;
3319} 3668}
3320 3669
3321void btrfs_delete_inode(struct inode *inode) 3670void btrfs_evict_inode(struct inode *inode)
3322{ 3671{
3323 struct btrfs_trans_handle *trans; 3672 struct btrfs_trans_handle *trans;
3324 struct btrfs_root *root = BTRFS_I(inode)->root; 3673 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3326,10 +3675,14 @@ void btrfs_delete_inode(struct inode *inode)
3326 int ret; 3675 int ret;
3327 3676
3328 truncate_inode_pages(&inode->i_data, 0); 3677 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
3679 goto no_delete;
3680
3329 if (is_bad_inode(inode)) { 3681 if (is_bad_inode(inode)) {
3330 btrfs_orphan_del(NULL, inode); 3682 btrfs_orphan_del(NULL, inode);
3331 goto no_delete; 3683 goto no_delete;
3332 } 3684 }
3685 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3333 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3686 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3334 3687
3335 if (root->fs_info->log_root_recovering) { 3688 if (root->fs_info->log_root_recovering) {
@@ -3345,10 +3698,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3698 btrfs_i_size_write(inode, 0);
3346 3699
3347 while (1) { 3700 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3701 trans = btrfs_start_transaction(root, 0);
3702 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3703 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3704 trans->block_rsv = root->orphan_block_rsv;
3705
3706 ret = btrfs_block_rsv_check(trans, root,
3707 root->orphan_block_rsv, 0, 5);
3708 if (ret) {
3709 BUG_ON(ret != -EAGAIN);
3710 ret = btrfs_commit_transaction(trans, root);
3711 BUG_ON(ret);
3712 continue;
3713 }
3351 3714
3715 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3716 if (ret != -EAGAIN)
3353 break; 3717 break;
3354 3718
@@ -3356,6 +3720,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3720 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3721 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3722 btrfs_btree_balance_dirty(root, nr);
3723
3359 } 3724 }
3360 3725
3361 if (ret == 0) { 3726 if (ret == 0) {
@@ -3367,7 +3732,7 @@ void btrfs_delete_inode(struct inode *inode)
3367 btrfs_end_transaction(trans, root); 3732 btrfs_end_transaction(trans, root);
3368 btrfs_btree_balance_dirty(root, nr); 3733 btrfs_btree_balance_dirty(root, nr);
3369no_delete: 3734no_delete:
3370 clear_inode(inode); 3735 end_writeback(inode);
3371 return; 3736 return;
3372} 3737}
3373 3738
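The rename from btrfs_delete_inode() to btrfs_evict_inode() follows the 2.6.36 VFS change that folded ->delete_inode and ->clear_inode into a single ->evict_inode, terminated by end_writeback(). Minimal shape of the new method (sketch of the hunk above):

void btrfs_evict_inode(struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;

        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0)
                goto no_delete;
        /* ... truncate the items, drop the orphan entry ... */
no_delete:
        end_writeback(inode);   /* replaces clear_inode() */
}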
@@ -3498,7 +3863,7 @@ again:
3498 p = &parent->rb_right; 3863 p = &parent->rb_right;
3499 else { 3864 else {
3500 WARN_ON(!(entry->vfs_inode.i_state & 3865 WARN_ON(!(entry->vfs_inode.i_state &
3501 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3866 (I_WILL_FREE | I_FREEING)));
3502 rb_erase(parent, &root->inode_tree); 3867 rb_erase(parent, &root->inode_tree);
3503 RB_CLEAR_NODE(parent); 3868 RB_CLEAR_NODE(parent);
3504 spin_unlock(&root->inode_lock); 3869 spin_unlock(&root->inode_lock);
@@ -3577,7 +3942,7 @@ again:
3577 if (atomic_read(&inode->i_count) > 1) 3942 if (atomic_read(&inode->i_count) > 1)
3578 d_prune_aliases(inode); 3943 d_prune_aliases(inode);
3579 /* 3944 /*
3580 * btrfs_drop_inode will remove it from 3945 * btrfs_drop_inode will have it removed from
3581 * the inode cache when its usage count 3946 * the inode cache when its usage count
3582 * hits zero. 3947 * hits zero.
3583 */ 3948 */
@@ -3596,40 +3961,10 @@ again:
3596 return 0; 3961 return 0;
3597} 3962}
3598 3963
3599static noinline void init_btrfs_i(struct inode *inode)
3600{
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626}
3627
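With init_btrfs_i() removed from both the iget and new-inode paths, the per-inode fields it set are presumably initialized once where the inode is allocated. A hedged sketch of that consolidation point (btrfs_alloc_inode() in fs/btrfs/super.c is an assumption, as is the cache name):

/* hypothetical consolidation point for the removed init_btrfs_i() */
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
        struct btrfs_inode *ei;

        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        ei->generation = 0;
        ei->delalloc_bytes = 0;
        ei->index_cnt = (u64)-1;
        atomic_set(&ei->outstanding_extents, 0);
        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        RB_CLEAR_NODE(&ei->rb_node);
        /* ... remaining fields from the removed helper ... */
        return &ei->vfs_inode;
}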
3628static int btrfs_init_locked_inode(struct inode *inode, void *p) 3964static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629{ 3965{
3630 struct btrfs_iget_args *args = p; 3966 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3967 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3968 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3969 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3970 return 0;
@@ -3692,8 +4027,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4027 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4028 return ERR_PTR(-ENOMEM);
3694 4029
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4030 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4031 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4032 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4283,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4283 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4284 int ret = 0;
3952 4285
3953 if (root->fs_info->btree_inode == inode) 4286 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4287 return 0;
3955 4288
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4289 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4304,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971{ 4304{
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4305 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4306 struct btrfs_trans_handle *trans;
4307 int ret;
4308
4309 if (BTRFS_I(inode)->dummy_inode)
4310 return;
3974 4311
3975 trans = btrfs_join_transaction(root, 1); 4312 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4313 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4314
4315 ret = btrfs_update_inode(trans, root, inode);
	4316	 if (ret == -ENOSPC) {
	4317	 /* whoops, let's try again with the full transaction */
4318 btrfs_end_transaction(trans, root);
4319 trans = btrfs_start_transaction(root, 1);
4320 if (IS_ERR(trans)) {
4321 if (printk_ratelimit()) {
	4322	 printk(KERN_ERR "btrfs: failed to "
4323 "dirty inode %lu error %ld\n",
4324 inode->i_ino, PTR_ERR(trans));
4325 }
4326 return;
4327 }
4328 btrfs_set_trans_block_group(trans, inode);
4329
4330 ret = btrfs_update_inode(trans, root, inode);
4331 if (ret) {
4332 if (printk_ratelimit()) {
	4333	 printk(KERN_ERR "btrfs: failed to "
4334 "dirty inode %lu error %d\n",
4335 inode->i_ino, ret);
4336 }
4337 }
4338 }
3978 btrfs_end_transaction(trans, root); 4339 btrfs_end_transaction(trans, root);
3979} 4340}
3980 4341
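The hunk above teaches btrfs_dirty_inode() to piggy-back on the running transaction first and to retry with a fully reserved transaction only when the cheap path returns -ENOSPC. A minimal sketch of that pattern, assuming only the kernel APIs already visible in this hunk (the helper name is hypothetical):

	/* sketch_update_inode: hypothetical helper showing the join-then-reserve retry */
	static void sketch_update_inode(struct btrfs_root *root, struct inode *inode)
	{
		struct btrfs_trans_handle *trans;
		int ret;

		trans = btrfs_join_transaction(root, 1);	/* no reservation cost */
		ret = btrfs_update_inode(trans, root, inode);
		if (ret == -ENOSPC) {
			btrfs_end_transaction(trans, root);
			trans = btrfs_start_transaction(root, 1);	/* reserve 1 item */
			if (IS_ERR(trans))
				return;
			btrfs_update_inode(trans, root, inode);
		}
		btrfs_end_transaction(trans, root);
	}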
@@ -4092,7 +4453,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
-	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
@@ -4121,16 +4481,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4256,26 +4607,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4304,13 +4650,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
+	btrfs_btree_balance_dirty(root, nr);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
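Throughout this patch the worst-case item count from the comment now goes straight into btrfs_start_transaction(), which reserves the metadata and opens the handle in one step, replacing the separate btrfs_reserve_metadata_space()/btrfs_unreserve_metadata_space() pair. A sketch of the idiom (the wrapper name is hypothetical):

	/* 2 (inode item + ref) + 2 (dir items) + 1 (selinux xattr) = 5 items */
	static struct btrfs_trans_handle *start_create_trans(struct btrfs_root *root)
	{
		return btrfs_start_transaction(root, 5);	/* ERR_PTR() on failure */
	}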
@@ -4320,32 +4664,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int err;
 	int drop_inode = 0;
+	int err;
 	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
@@ -4377,8 +4715,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4405,21 +4741,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
 
-	/*
-	 * 1 item for inode ref
-	 * 2 items for dir items
-	 */
-	err = btrfs_reserve_metadata_space(root, 3);
-	if (err)
-		return err;
-
 	btrfs_inc_nlink(inode);
 
 	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 item for inode ref
+	 * 2 items for dir items
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto fail;
+	}
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
@@ -4438,7 +4774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
-	btrfs_unreserve_metadata_space(root, 3);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4458,28 +4793,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
+
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		goto out_unlock;
-	}
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_fail;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -4519,9 +4846,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4779,6 +5103,7 @@ again:
 		}
 		flush_dcache_page(page);
 	} else if (create && PageUptodate(page)) {
+		WARN_ON(1);
 		if (!trans) {
 			kunmap(page);
 			free_extent_map(em);
@@ -4875,11 +5200,651 @@ out:
 	return em;
 }
 
5203static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5204 u64 start, u64 len)
5205{
5206 struct btrfs_root *root = BTRFS_I(inode)->root;
5207 struct btrfs_trans_handle *trans;
5208 struct extent_map *em;
5209 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5210 struct btrfs_key ins;
5211 u64 alloc_hint;
5212 int ret;
5213
5214 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5215
5216 trans = btrfs_join_transaction(root, 0);
5217 if (!trans)
5218 return ERR_PTR(-ENOMEM);
5219
5220 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5221
5222 alloc_hint = get_extent_allocation_hint(inode, start, len);
5223 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5224 alloc_hint, (u64)-1, &ins, 1);
5225 if (ret) {
5226 em = ERR_PTR(ret);
5227 goto out;
5228 }
5229
5230 em = alloc_extent_map(GFP_NOFS);
5231 if (!em) {
5232 em = ERR_PTR(-ENOMEM);
5233 goto out;
5234 }
5235
5236 em->start = start;
5237 em->orig_start = em->start;
5238 em->len = ins.offset;
5239
5240 em->block_start = ins.objectid;
5241 em->block_len = ins.offset;
5242 em->bdev = root->fs_info->fs_devices->latest_bdev;
5243 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5244
5245 while (1) {
5246 write_lock(&em_tree->lock);
5247 ret = add_extent_mapping(em_tree, em);
5248 write_unlock(&em_tree->lock);
5249 if (ret != -EEXIST)
5250 break;
5251 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5252 }
5253
5254 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5255 ins.offset, ins.offset, 0);
5256 if (ret) {
5257 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5258 em = ERR_PTR(ret);
5259 }
5260out:
5261 btrfs_end_transaction(trans, root);
5262 return em;
5263}
5264
5265/*
5266 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5267 * block must be cow'd
5268 */
5269static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5270 struct inode *inode, u64 offset, u64 len)
5271{
5272 struct btrfs_path *path;
5273 int ret;
5274 struct extent_buffer *leaf;
5275 struct btrfs_root *root = BTRFS_I(inode)->root;
5276 struct btrfs_file_extent_item *fi;
5277 struct btrfs_key key;
5278 u64 disk_bytenr;
5279 u64 backref_offset;
5280 u64 extent_end;
5281 u64 num_bytes;
5282 int slot;
5283 int found_type;
5284
5285 path = btrfs_alloc_path();
5286 if (!path)
5287 return -ENOMEM;
5288
5289 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5290 offset, 0);
5291 if (ret < 0)
5292 goto out;
5293
5294 slot = path->slots[0];
5295 if (ret == 1) {
5296 if (slot == 0) {
5297 /* can't find the item, must cow */
5298 ret = 0;
5299 goto out;
5300 }
5301 slot--;
5302 }
5303 ret = 0;
5304 leaf = path->nodes[0];
5305 btrfs_item_key_to_cpu(leaf, &key, slot);
5306 if (key.objectid != inode->i_ino ||
5307 key.type != BTRFS_EXTENT_DATA_KEY) {
5308 /* not our file or wrong item type, must cow */
5309 goto out;
5310 }
5311
5312 if (key.offset > offset) {
5313 /* Wrong offset, must cow */
5314 goto out;
5315 }
5316
5317 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5318 found_type = btrfs_file_extent_type(leaf, fi);
5319 if (found_type != BTRFS_FILE_EXTENT_REG &&
5320 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5321 /* not a regular extent, must cow */
5322 goto out;
5323 }
5324 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5325 backref_offset = btrfs_file_extent_offset(leaf, fi);
5326
5327 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5328 if (extent_end < offset + len) {
5329 /* extent doesn't include our full range, must cow */
5330 goto out;
5331 }
5332
5333 if (btrfs_extent_readonly(root, disk_bytenr))
5334 goto out;
5335
5336 /*
5337 * look for other files referencing this extent, if we
5338 * find any we must cow
5339 */
5340 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5341 key.offset - backref_offset, disk_bytenr))
5342 goto out;
5343
5344 /*
5345 * adjust disk_bytenr and num_bytes to cover just the bytes
5346 * in this extent we are about to write. If there
5347 * are any csums in that range we have to cow in order
5348 * to keep the csums correct
5349 */
5350 disk_bytenr += backref_offset;
5351 disk_bytenr += offset - key.offset;
5352 num_bytes = min(offset + len, extent_end) - offset;
5353 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5354 goto out;
5355 /*
5356 * all of the above have passed, it is safe to overwrite this extent
5357 * without cow
5358 */
5359 ret = 1;
5360out:
5361 btrfs_free_path(path);
5362 return ret;
5363}
5364
5365static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5366 struct buffer_head *bh_result, int create)
5367{
5368 struct extent_map *em;
5369 struct btrfs_root *root = BTRFS_I(inode)->root;
5370 u64 start = iblock << inode->i_blkbits;
5371 u64 len = bh_result->b_size;
5372 struct btrfs_trans_handle *trans;
5373
5374 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5375 if (IS_ERR(em))
5376 return PTR_ERR(em);
5377
5378 /*
5379 * OK, for INLINE and COMPRESSED extents we need to fall back on buffered
5380 * I/O. INLINE is special, and we could probably kludge it in here, but
5381 * it's still buffered so for safety let's just fall back to the generic
5382 * buffered path.
5383 *
5384 * For COMPRESSED we _have_ to read the entire extent in so we can
5385 * decompress it, so there will be buffering required no matter what we
5386 * do, so go ahead and fall back to buffered.
5387 *
5388 * We return -ENOTBLK because that's what makes DIO go ahead and go back
5389 * to buffered I/O. Don't blame me, this is the price we pay for using
5390 * the generic code.
5391 */
5392 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5393 em->block_start == EXTENT_MAP_INLINE) {
5394 free_extent_map(em);
5395 return -ENOTBLK;
5396 }
5397
5398 /* Just a good old fashioned hole, return */
5399 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5400 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5401 free_extent_map(em);
5402 /* DIO will do one hole at a time, so just unlock a sector */
5403 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5404 start + root->sectorsize - 1, GFP_NOFS);
5405 return 0;
5406 }
5407
5408 /*
5409 * We don't allocate a new extent in the following cases
5410 *
5411 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5412 * existing extent.
5413 * 2) The extent is marked as PREALLOC. We're good to go here and can
5414 * just use the extent.
5415 *
5416 */
5417 if (!create) {
5418 len = em->len - (start - em->start);
5419 goto map;
5420 }
5421
5422 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5423 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5424 em->block_start != EXTENT_MAP_HOLE)) {
5425 int type;
5426 int ret;
5427 u64 block_start;
5428
5429 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5430 type = BTRFS_ORDERED_PREALLOC;
5431 else
5432 type = BTRFS_ORDERED_NOCOW;
5433 len = min(len, em->len - (start - em->start));
5434 block_start = em->block_start + (start - em->start);
5435
5436 /*
5437 * we're not going to log anything, but we do need
5438 * to make sure the current transaction stays open
5439 * while we look for nocow cross refs
5440 */
5441 trans = btrfs_join_transaction(root, 0);
5442 if (!trans)
5443 goto must_cow;
5444
5445 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5446 ret = btrfs_add_ordered_extent_dio(inode, start,
5447 block_start, len, len, type);
5448 btrfs_end_transaction(trans, root);
5449 if (ret) {
5450 free_extent_map(em);
5451 return ret;
5452 }
5453 goto unlock;
5454 }
5455 btrfs_end_transaction(trans, root);
5456 }
5457must_cow:
5458 /*
5459 * this will cow the extent, reset the len in case we changed
5460 * it above
5461 */
5462 len = bh_result->b_size;
5463 free_extent_map(em);
5464 em = btrfs_new_extent_direct(inode, start, len);
5465 if (IS_ERR(em))
5466 return PTR_ERR(em);
5467 len = min(len, em->len - (start - em->start));
5468unlock:
5469 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5470 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5471 0, NULL, GFP_NOFS);
5472map:
5473 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5474 inode->i_blkbits;
5475 bh_result->b_size = len;
5476 bh_result->b_bdev = em->bdev;
5477 set_buffer_mapped(bh_result);
5478 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5479 set_buffer_new(bh_result);
5480
5481 free_extent_map(em);
5482
5483 return 0;
5484}
5485
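btrfs_get_blocks_direct() above picks one of three outcomes for every mapping request (ignoring the cross-ref and csum checks in can_nocow_odirect() that can still force a cow). A compact restatement of that decision, as a hypothetical helper for reading convenience only:

	/* 0 = report hole, 1 = map/reuse the existing extent, 2 = cow a new one */
	static int dio_strategy(int create, int prealloc, int nodatacow, int hole)
	{
		if (!create)
			return hole ? 0 : 1;	/* reads never allocate */
		if (prealloc || (nodatacow && !hole))
			return 1;	/* nocow write straight into the extent */
		return 2;	/* must_cow: btrfs_new_extent_direct() */
	}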
5486struct btrfs_dio_private {
5487 struct inode *inode;
5488 u64 logical_offset;
5489 u64 disk_bytenr;
5490 u64 bytes;
5491 u32 *csums;
5492 void *private;
5493};
5494
5495static void btrfs_endio_direct_read(struct bio *bio, int err)
5496{
5497 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5498 struct bio_vec *bvec = bio->bi_io_vec;
5499 struct btrfs_dio_private *dip = bio->bi_private;
5500 struct inode *inode = dip->inode;
5501 struct btrfs_root *root = BTRFS_I(inode)->root;
5502 u64 start;
5503 u32 *private = dip->csums;
5504
5505 start = dip->logical_offset;
5506 do {
5507 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5508 struct page *page = bvec->bv_page;
5509 char *kaddr;
5510 u32 csum = ~(u32)0;
5511 unsigned long flags;
5512
5513 local_irq_save(flags);
5514 kaddr = kmap_atomic(page, KM_IRQ0);
5515 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5516 csum, bvec->bv_len);
5517 btrfs_csum_final(csum, (char *)&csum);
5518 kunmap_atomic(kaddr, KM_IRQ0);
5519 local_irq_restore(flags);
5520
5521 flush_dcache_page(bvec->bv_page);
5522 if (csum != *private) {
5523 printk(KERN_ERR "btrfs csum failed ino %lu off"
5524 " %llu csum %u private %u\n",
5525 inode->i_ino, (unsigned long long)start,
5526 csum, *private);
5527 err = -EIO;
5528 }
5529 }
5530
5531 start += bvec->bv_len;
5532 private++;
5533 bvec++;
5534 } while (bvec <= bvec_end);
5535
5536 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5537 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5538 bio->bi_private = dip->private;
5539
5540 kfree(dip->csums);
5541 kfree(dip);
5542 dio_end_io(bio, err);
5543}
5544
5545static void btrfs_endio_direct_write(struct bio *bio, int err)
5546{
5547 struct btrfs_dio_private *dip = bio->bi_private;
5548 struct inode *inode = dip->inode;
5549 struct btrfs_root *root = BTRFS_I(inode)->root;
5550 struct btrfs_trans_handle *trans;
5551 struct btrfs_ordered_extent *ordered = NULL;
5552 struct extent_state *cached_state = NULL;
5553 int ret;
5554
5555 if (err)
5556 goto out_done;
5557
5558 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5559 dip->logical_offset, dip->bytes);
5560 if (!ret)
5561 goto out_done;
5562
5563 BUG_ON(!ordered);
5564
5565 trans = btrfs_join_transaction(root, 1);
5566 if (!trans) {
5567 err = -ENOMEM;
5568 goto out;
5569 }
5570 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5571
5572 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5573 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5574 if (!ret)
5575 ret = btrfs_update_inode(trans, root, inode);
5576 err = ret;
5577 goto out;
5578 }
5579
5580 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5581 ordered->file_offset + ordered->len - 1, 0,
5582 &cached_state, GFP_NOFS);
5583
5584 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5585 ret = btrfs_mark_extent_written(trans, inode,
5586 ordered->file_offset,
5587 ordered->file_offset +
5588 ordered->len);
5589 if (ret) {
5590 err = ret;
5591 goto out_unlock;
5592 }
5593 } else {
5594 ret = insert_reserved_file_extent(trans, inode,
5595 ordered->file_offset,
5596 ordered->start,
5597 ordered->disk_len,
5598 ordered->len,
5599 ordered->len,
5600 0, 0, 0,
5601 BTRFS_FILE_EXTENT_REG);
5602 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5603 ordered->file_offset, ordered->len);
5604 if (ret) {
5605 err = ret;
5606 WARN_ON(1);
5607 goto out_unlock;
5608 }
5609 }
5610
5611 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5612 btrfs_ordered_update_i_size(inode, 0, ordered);
5613 btrfs_update_inode(trans, root, inode);
5614out_unlock:
5615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5616 ordered->file_offset + ordered->len - 1,
5617 &cached_state, GFP_NOFS);
5618out:
5619 btrfs_delalloc_release_metadata(inode, ordered->len);
5620 btrfs_end_transaction(trans, root);
5621 btrfs_put_ordered_extent(ordered);
5622 btrfs_put_ordered_extent(ordered);
5623out_done:
5624 bio->bi_private = dip->private;
5625
5626 kfree(dip->csums);
5627 kfree(dip);
5628 dio_end_io(bio, err);
5629}
5630
5631static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5632 struct bio *bio, int mirror_num,
5633 unsigned long bio_flags, u64 offset)
5634{
5635 int ret;
5636 struct btrfs_root *root = BTRFS_I(inode)->root;
5637 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5638 BUG_ON(ret);
5639 return 0;
5640}
5641
5642static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5643 loff_t file_offset)
5644{
5645 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum;
5650 int write = rw & REQ_WRITE;
5651 int ret = 0;
5652
5653 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5654
5655 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5656 if (!dip) {
5657 ret = -ENOMEM;
5658 goto free_ordered;
5659 }
5660 dip->csums = NULL;
5661
5662 if (!skip_sum) {
5663 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5664 if (!dip->csums) {
5665 ret = -ENOMEM;
5666 goto free_ordered;
5667 }
5668 }
5669
5670 dip->private = bio->bi_private;
5671 dip->inode = inode;
5672 dip->logical_offset = file_offset;
5673
5674 start = dip->logical_offset;
5675 dip->bytes = 0;
5676 do {
5677 dip->bytes += bvec->bv_len;
5678 bvec++;
5679 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5680
5681 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5682 bio->bi_private = dip;
5683
5684 if (write)
5685 bio->bi_end_io = btrfs_endio_direct_write;
5686 else
5687 bio->bi_end_io = btrfs_endio_direct_read;
5688
5689 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5690 if (ret)
5691 goto out_err;
5692
5693 if (write && !skip_sum) {
5694 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5695 inode, rw, bio, 0, 0,
5696 dip->logical_offset,
5697 __btrfs_submit_bio_start_direct_io,
5698 __btrfs_submit_bio_done);
5699 if (ret)
5700 goto out_err;
5701 return;
5702 } else if (!skip_sum)
5703 btrfs_lookup_bio_sums_dio(root, inode, bio,
5704 dip->logical_offset, dip->csums);
5705
5706 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5707 if (ret)
5708 goto out_err;
5709 return;
5710out_err:
5711 kfree(dip->csums);
5712 kfree(dip);
5713free_ordered:
5714 /*
5715 * If this is a write, we need to clean up the reserved space and kill
5716 * the ordered extent.
5717 */
5718 if (write) {
5719 struct btrfs_ordered_extent *ordered;
5720 ordered = btrfs_lookup_ordered_extent(inode,
5721 dip->logical_offset);
5722 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5723 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5724 btrfs_free_reserved_extent(root, ordered->start,
5725 ordered->disk_len);
5726 btrfs_put_ordered_extent(ordered);
5727 btrfs_put_ordered_extent(ordered);
5728 }
5729 bio_endio(bio, ret);
5730}
5731
5732static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5733 const struct iovec *iov, loff_t offset,
5734 unsigned long nr_segs)
5735{
5736 int seg;
5737 size_t size;
5738 unsigned long addr;
5739 unsigned blocksize_mask = root->sectorsize - 1;
5740 ssize_t retval = -EINVAL;
5741 loff_t end = offset;
5742
5743 if (offset & blocksize_mask)
5744 goto out;
5745
5746 /* Check the memory alignment. Blocks cannot straddle pages */
5747 for (seg = 0; seg < nr_segs; seg++) {
5748 addr = (unsigned long)iov[seg].iov_base;
5749 size = iov[seg].iov_len;
5750 end += size;
5751 if ((addr & blocksize_mask) || (size & blocksize_mask))
5752 goto out;
5753 }
5754 retval = 0;
5755out:
5756 return retval;
5757}
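check_direct_IO() above rejects any request whose file offset or memory segments are not sector aligned, in which case btrfs_direct_IO() returns 0 and the VFS falls back to buffered I/O. A userspace illustration of meeting those constraints (an assumption for illustration, not part of the patch; 4096 stands in for root->sectorsize):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	static ssize_t dio_read_one_block(const char *path)
	{
		const size_t block = 4096;	/* assumed sector size */
		void *buf;
		ssize_t n;
		int fd = open(path, O_RDONLY | O_DIRECT);

		if (fd < 0)
			return -1;
		if (posix_memalign(&buf, block, block)) {	/* aligned buffer */
			close(fd);
			return -1;
		}
		n = pread(fd, buf, block, 0);	/* offset and length both aligned */
		free(buf);
		close(fd);
		return n;
	}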
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
5763 struct inode *inode = file->f_mapping->host;
5764 struct btrfs_ordered_extent *ordered;
5765 struct extent_state *cached_state = NULL;
5766 u64 lockstart, lockend;
5767 ssize_t ret;
5768 int writing = rw & WRITE;
5769 int write_bits = 0;
5770 size_t count = iov_length(iov, nr_segs);
5771
5772 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5773 offset, nr_segs)) {
5774 return 0;
5775 }
5776
5777 lockstart = offset;
5778 lockend = offset + count - 1;
5779
5780 if (writing) {
5781 ret = btrfs_delalloc_reserve_space(inode, count);
5782 if (ret)
5783 goto out;
5784 }
5785
5786 while (1) {
5787 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5788 0, &cached_state, GFP_NOFS);
5789 /*
5790 * We're concerned with the entire range that we're going to be
5791 * doing DIO to, so we need to make sure there are no ordered
5792 * extents in this range.
5793 */
5794 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5795 lockend - lockstart + 1);
5796 if (!ordered)
5797 break;
5798 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5799 &cached_state, GFP_NOFS);
5800 btrfs_start_ordered_extent(inode, ordered, 1);
5801 btrfs_put_ordered_extent(ordered);
5802 cond_resched();
5803 }
5804
5805 /*
5806 * we don't use btrfs_set_extent_delalloc because we don't want
5807 * the dirty or uptodate bits
5808 */
5809 if (writing) {
5810 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5811 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5812 EXTENT_DELALLOC, 0, NULL, &cached_state,
5813 GFP_NOFS);
5814 if (ret) {
5815 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5816 lockend, EXTENT_LOCKED | write_bits,
5817 1, 0, &cached_state, GFP_NOFS);
5818 goto out;
5819 }
5820 }
5821
5822 free_extent_state(cached_state);
5823 cached_state = NULL;
5824
5825 ret = __blockdev_direct_IO(rw, iocb, inode,
5826 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5827 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5828 btrfs_submit_direct, 0);
5829
5830 if (ret < 0 && ret != -EIOCBQUEUED) {
5831 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5832 offset + iov_length(iov, nr_segs) - 1,
5833 EXTENT_LOCKED | write_bits, 1, 0,
5834 &cached_state, GFP_NOFS);
5835 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5836 /*
5837 * We're falling back to buffered, unlock the section we didn't
5838 * do IO on.
5839 */
5840 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5841 offset + iov_length(iov, nr_segs) - 1,
5842 EXTENT_LOCKED | write_bits, 1, 0,
5843 &cached_state, GFP_NOFS);
5844 }
5845out:
5846 free_extent_state(cached_state);
5847 return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5043,7 +6008,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
@@ -5052,13 +6017,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (ret) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-		ret = VM_FAULT_SIGBUS;
-		goto out;
-	}
-
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -5068,7 +6026,6 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -5109,7 +6066,6 @@ again:
 		unlock_extent_cached(io_tree, page_start, page_end,
 				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
-		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
@@ -5136,10 +6092,10 @@ again:
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
 	return ret;
 }
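The fault path above now pairs one btrfs_delalloc_reserve_space() call, which as used here covers both the data space and the delalloc metadata for the page, with one btrfs_delalloc_release_space() on the paths that do not keep the reservation. A minimal sketch of the pairing; try_to_dirty_page() is a hypothetical stand-in for the real work:

	static int sketch_reserve_page(struct inode *inode, struct page *page)
	{
		int ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);

		if (ret)
			return ret;	/* e.g. -ENOSPC: nothing to undo */
		ret = try_to_dirty_page(page);	/* hypothetical */
		if (ret)	/* failed: give the reservation back */
			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
		return ret;
	}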
@@ -5164,8 +6120,10 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = root->orphan_block_rsv;
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5188,6 +6146,23 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		if (!trans) {
+			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = root->orphan_block_rsv;
+		}
+
+		ret = btrfs_block_rsv_check(trans, root,
+					    root->orphan_block_rsv, 0, 5);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			trans = NULL;
+			continue;
+		}
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -5199,10 +6174,8 @@ static void btrfs_truncate(struct inode *inode)
 
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
+		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
@@ -5263,21 +6236,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
+	struct inode *inode;
 
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
+	ei->root = NULL;
+	ei->space_info = NULL;
+	ei->generation = 0;
+	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
-	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-	ei->root = NULL;
+	ei->delalloc_bytes = 0;
+	ei->reserved_bytes = 0;
+	ei->disk_i_size = 0;
+	ei->flags = 0;
+	ei->index_cnt = (u64)-1;
+	ei->last_unlink_trans = 0;
+
 	spin_lock_init(&ei->accounting_lock);
+	atomic_set(&ei->outstanding_extents, 0);
+	ei->reserved_extents = 0;
+
+	ei->ordered_data_close = 0;
+	ei->orphan_meta_reserved = 0;
+	ei->dummy_inode = 0;
+	ei->force_compress = 0;
+
+	inode = &ei->vfs_inode;
+	extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+	mutex_init(&ei->log_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
+	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
-	return &ei->vfs_inode;
+	RB_CLEAR_NODE(&ei->rb_node);
+
+	return inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
@@ -5287,6 +6286,8 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+	WARN_ON(BTRFS_I(inode)->reserved_extents);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -5307,13 +6308,13 @@ void btrfs_destroy_inode(struct inode *inode)
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 	}
 
-	spin_lock(&root->list_lock);
+	spin_lock(&root->orphan_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
 		       inode->i_ino);
 		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
-	spin_unlock(&root->list_lock);
+	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5335,13 +6336,14 @@ free:
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-void btrfs_drop_inode(struct inode *inode)
+int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
-		generic_delete_inode(inode);
+
+	if (btrfs_root_refs(&root->root_item) == 0)
+		return 1;
 	else
-		generic_drop_inode(inode);
+		return generic_drop_inode(inode);
 }
 
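btrfs_drop_inode() now returns an int so iput() can choose between caching the inode and evicting it immediately; returning 1 for a root whose refs have dropped to zero forces the eviction. A simplified model of how the VFS consumes that return value (illustrative only, not the actual fs/inode.c text):

	static int should_evict(struct inode *inode)	/* models iput_final() */
	{
		const struct super_operations *op = inode->i_sb->s_op;

		if (op && op->drop_inode)
			return op->drop_inode(inode);	/* btrfs_drop_inode() here */
		return generic_drop_inode(inode);
	}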
 static void init_once(void *foo)
@@ -5434,19 +6436,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
-
-	/*
-	 * We want to reserve the absolute worst case amount of items. So if
-	 * both inodes are subvols and we need to unlink them then that would
-	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
-	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-	 * should cover the worst case number of items we'll modify.
-	 */
-	ret = btrfs_reserve_metadata_space(root, 11);
-	if (ret)
-		return ret;
-
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large. Start IO on it now so
@@ -5459,8 +6448,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* close the racy window with snapshot create/destroy ioctl */
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		down_read(&root->fs_info->subvol_sem);
+	/*
+	 * We want to reserve the absolute worst case amount of items. So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume they're normal
+	 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 20);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, new_dir);
 
 	if (dest != root)
@@ -5559,7 +6558,6 @@ out_fail:
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
 
-	btrfs_unreserve_metadata_space(root, 11);
 	return ret;
 }
 
@@ -5611,6 +6609,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return 0;
 }
 
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6613{
6614 struct btrfs_inode *binode;
6615 struct inode *inode = NULL;
6616
6617 spin_lock(&root->fs_info->delalloc_lock);
6618 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6619 binode = list_entry(root->fs_info->delalloc_inodes.next,
6620 struct btrfs_inode, delalloc_inodes);
6621 inode = igrab(&binode->vfs_inode);
6622 if (inode) {
6623 list_move_tail(&binode->delalloc_inodes,
6624 &root->fs_info->delalloc_inodes);
6625 break;
6626 }
6627
6628 list_del_init(&binode->delalloc_inodes);
6629 cond_resched_lock(&root->fs_info->delalloc_lock);
6630 }
6631 spin_unlock(&root->fs_info->delalloc_lock);
6632
6633 if (inode) {
6634 write_inode_now(inode, 0);
6635 if (delay_iput)
6636 btrfs_add_delayed_iput(inode);
6637 else
6638 iput(inode);
6639 return 1;
6640 }
6641 return 0;
6642}
6643
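btrfs_start_one_delalloc_inode() above writes back a single delalloc inode and returns 1, or 0 once the list is empty, so callers can flush incrementally instead of all at once. A hypothetical caller:

	static void flush_delalloc_one_by_one(struct btrfs_root *root)
	{
		while (btrfs_start_one_delalloc_inode(root, 0))
			cond_resched();	/* flushed one inode; yield and repeat */
	}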
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
@@ -5634,26 +6664,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+	err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+	if (err)
+		return err;
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
-	err = btrfs_reserve_metadata_space(root, 5);
-	if (err)
-		return err;
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans)
-		goto out_fail;
+	trans = btrfs_start_transaction(root, 5);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_set_trans_block_group(trans, dir);
 
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
@@ -5725,8 +6749,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
-out_fail:
-	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5735,33 +6757,28 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-			       u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
-	u64 num_bytes = end - start;
 	int ret = 0;
-	u64 i_size;
 
 	while (num_bytes > 0) {
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = btrfs_reserve_extent(trans, root, num_bytes,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		if (ret) {
-			WARN_ON(1);
-			goto stop_trans;
-		}
-
-		ret = btrfs_reserve_metadata_space(root, 3);
-		if (ret) {
-			btrfs_free_reserved_extent(root, ins.objectid,
-						   ins.offset);
-			goto stop_trans;
-		}
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+
+		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+					   0, *alloc_hint, (u64)-1, &ins, 1);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			break;
+		}
 
 		ret = insert_reserved_file_extent(trans, inode,
@@ -5775,34 +6792,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
-		alloc_hint = ins.objectid + ins.offset;
+		*alloc_hint = ins.objectid + ins.offset;
 
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    (actual_len > inode->i_size) &&
 		    (cur_offset > inode->i_size)) {
-
 			if (cur_offset > actual_len)
-				i_size = actual_len;
+				i_size_write(inode, actual_len);
 			else
-				i_size = cur_offset;
-			i_size_write(inode, i_size);
-			btrfs_ordered_update_i_size(inode, i_size, NULL);
+				i_size_write(inode, cur_offset);
+			btrfs_ordered_update_i_size(inode, cur_offset, NULL);
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
 
 		btrfs_end_transaction(trans, root);
-		btrfs_unreserve_metadata_space(root, 3);
 	}
 	return ret;
-
-stop_trans:
-	btrfs_end_transaction(trans, root);
-	return ret;
-
 }
 
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5835,8 +6845,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		goto out;
 	}
 
-	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-					  alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		goto out;
 
@@ -5881,16 +6890,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = prealloc_file_range(inode,
-						  cur_offset, last_byte,
-						  alloc_hint, mode, offset+len);
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
 			}
 		}
-		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-			alloc_hint = em->block_start;
 		free_extent_map(em);
 
 		cur_offset = last_byte;
@@ -5902,8 +6911,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 
-	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-				       alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
 
+	ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		return ret;
 	/*
 	 * 1 - inode item
 	 * 2 - refs
 	 * 1 - root item
 	 * 2 - dir items
 	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		return ret;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
+	trans = btrfs_start_transaction(root, 6);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
-
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
-	 */
-	ret = btrfs_reserve_metadata_space(root, 6);
-	if (ret)
-		goto fail;
-
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		btrfs_unreserve_metadata_space(root, 6);
-		goto fail;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
 	pending_snapshot->dentry = dentry;
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
 	pending_snapshot->root = root;
+
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto fail;
+	}
+
+	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+	BUG_ON(ret);
+
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
 	BUG_ON(ret);
-	btrfs_unreserve_metadata_space(root, 6);
+
+	ret = pending_snapshot->error;
+	if (ret)
+		goto fail;
+
+	btrfs_orphan_cleanup(pending_snapshot->snap);
 
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
 	if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	kfree(pending_snapshot);
 	return ret;
 }
 
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen);
+		error = create_snapshot(snap_src, dentry);
 	} else {
 		error = create_subvol(BTRFS_I(dir)->root, dentry,
 				      name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = 1;
 
-		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-		if (ret) {
-			ret = -ENOSPC;
-			break;
-		}
-
-		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
-			ret = -ENOSPC;
-			break;
-		}
+		ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto err_unlock;
 again:
 		if (inode->i_size == 0 ||
 		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
 		}
 
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
+			ret = -ENOMEM;
 			goto err_reservations;
+		}
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto err_reservations;
 			}
 		}
@@ -644,8 +623,7 @@ again:
 		wait_on_page_writeback(page);
 
 		if (PageDirty(page)) {
-			btrfs_free_reserved_data_space(root, inode,
-						       PAGE_CACHE_SIZE);
+			btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 			goto loop_unlock;
 		}
 
@@ -683,7 +661,6 @@ loop_unlock:
 		page_cache_release(page);
 		mutex_unlock(&inode->i_mutex);
 
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 		i++;
 	}
@@ -713,9 +690,9 @@ loop_unlock:
 	return 0;
 
 err_reservations:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
 	mutex_unlock(&inode->i_mutex);
-	btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	return ret;
 }
 
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 		       device->name, (unsigned long long)new_size);
 
 	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_up_write;
+	}
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				  dest->root_key.objectid,
 				  dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
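The new !xchg(&dest->orphan_item_inserted, 1) guard makes the orphan-item insertion a once-only operation: xchg atomically stores 1 and returns the previous value, so only the caller that observes 0 performs the insert. A small C11 sketch of the same pattern, using atomic_exchange in place of the kernel's xchg (the flag and function names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

/* models the per-root orphan_item_inserted latch in the diff */
static atomic_int orphan_item_inserted;

static void insert_orphan_item(void)
{
	puts("orphan item inserted");
}

static void snap_destroy(void)
{
	/*
	 * atomic_exchange() stores 1 and returns the old value, so
	 * only the first caller to flip 0 -> 1 does the insertion --
	 * the same effect as the kernel's !xchg(&flag, 1).
	 */
	if (!atomic_exchange(&orphan_item_inserted, 1))
		insert_orphan_item();
}

int main(void)
{
	snap_destroy();		/* inserts */
	snap_destroy();		/* no-op on the second call */
	return 0;
}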
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
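The hunk above makes two behavioral fixes: the return values of btrfs_defrag_root()/btrfs_defrag_file() are now propagated to the caller, and an explicit default case rejects inode types that are neither directories nor regular files. A compact sketch of that dispatch shape (the defrag calls are stubbed out; only the switch structure mirrors the diff):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

static int defrag_dispatch(mode_t mode)
{
	int ret;

	switch (mode & S_IFMT) {
	case S_IFDIR:
		ret = 0;		/* stand-in for btrfs_defrag_root() */
		break;
	case S_IFREG:
		ret = 0;		/* stand-in for btrfs_defrag_file() */
		break;
	default:
		ret = -EINVAL;		/* anything else is rejected */
	}
	return ret;
}

int main(void)
{
	printf("%d\n", defrag_dispatch(S_IFREG));	/* 0 */
	printf("%d\n", defrag_dispatch(S_IFLNK));	/* -EINVAL */
	return 0;
}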
@@ -1469,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1469 */ 1458 */
1470 1459
1471 /* the destination must be opened for writing */ 1460 /* the destination must be opened for writing */
1472 if (!(file->f_mode & FMODE_WRITE)) 1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1473 return -EINVAL; 1462 return -EINVAL;
1474 1463
1475 ret = mnt_want_write(file->f_path.mnt); 1464 ret = mnt_want_write(file->f_path.mnt);
@@ -1522,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1522 1511
1523 /* determine range to clone */ 1512 /* determine range to clone */
1524 ret = -EINVAL; 1513 ret = -EINVAL;
1525 if (off >= src->i_size || off + len > src->i_size) 1514 if (off + len > src->i_size || off + len < off)
1526 goto out_unlock; 1515 goto out_unlock;
1527 if (len == 0) 1516 if (len == 0)
1528 olen = len = src->i_size - off; 1517 olen = len = src->i_size - off;
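The rewritten bounds check is worth a second look: "off + len > src->i_size" alone is defeated when off + len wraps around the 64-bit space, so the extra "off + len < off" clause catches exactly the overflow case, since unsigned addition wraps. A self-contained sketch:

#include <stdint.h>
#include <stdio.h>

/*
 * 0 if the range runs past i_size or the unsigned addition wraps;
 * "off + len < off" can only be true after a 64-bit overflow.
 */
static int clone_range_ok(uint64_t off, uint64_t len, uint64_t i_size)
{
	if (off + len > i_size || off + len < off)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", clone_range_ok(0, 4096, 8192));			/* 1 */
	printf("%d\n", clone_range_ok(4096, 8192, 8192));		/* 0 */
	printf("%d\n", clone_range_ok(UINT64_MAX - 10, 100, 8192));	/* 0: wraps */
	return 0;
}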
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1595,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1595 u64 disko = 0, diskl = 0; 1578 u64 disko = 0, diskl = 0;
1596 u64 datao = 0, datal = 0; 1579 u64 datao = 0, datal = 0;
1597 u8 comp; 1580 u8 comp;
1581 u64 endoff;
1598 1582
1599 size = btrfs_item_size_nr(leaf, slot); 1583 size = btrfs_item_size_nr(leaf, slot);
1600 read_extent_buffer(leaf, buf, 1584 read_extent_buffer(leaf, buf,
@@ -1629,12 +1613,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1613 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1614 new_key.offset = key.offset + destoff - off;
1631 1615
1616 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) {
1618 ret = PTR_ERR(trans);
1619 goto out;
1620 }
1621
1632 if (type == BTRFS_FILE_EXTENT_REG || 1622 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1623 type == BTRFS_FILE_EXTENT_PREALLOC) {
1624 if (off > key.offset) {
1625 datao += off - key.offset;
1626 datal -= off - key.offset;
1627 }
1628
1629 if (key.offset + datal > off + len)
1630 datal = off + len - key.offset;
1631
1632 ret = btrfs_drop_extents(trans, inode,
1633 new_key.offset,
1634 new_key.offset + datal,
1635 &hint_byte, 1);
1636 BUG_ON(ret);
1637
1634 ret = btrfs_insert_empty_item(trans, root, path, 1638 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1639 &new_key, size);
1636 if (ret) 1640 BUG_ON(ret);
1637 goto out;
1638 1641
1639 leaf = path->nodes[0]; 1642 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1643 slot = path->slots[0];
@@ -1645,14 +1648,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1648 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1649 struct btrfs_file_extent_item);
1647 1650
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1651 /* disko == 0 means it's a hole */
1657 if (!disko) 1652 if (!disko)
1658 datao = 0; 1653 datao = 0;
@@ -1683,14 +1678,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1678
1684 if (comp && (skip || trim)) { 1679 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1680 ret = -EINVAL;
1681 btrfs_end_transaction(trans, root);
1686 goto out; 1682 goto out;
1687 } 1683 }
1688 size -= skip + trim; 1684 size -= skip + trim;
1689 datal -= skip + trim; 1685 datal -= skip + trim;
1686
1687 ret = btrfs_drop_extents(trans, inode,
1688 new_key.offset,
1689 new_key.offset + datal,
1690 &hint_byte, 1);
1691 BUG_ON(ret);
1692
1690 ret = btrfs_insert_empty_item(trans, root, path, 1693 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1694 &new_key, size);
1692 if (ret) 1695 BUG_ON(ret);
1693 goto out;
1694 1696
1695 if (skip) { 1697 if (skip) {
1696 u32 start = 1698 u32 start =
@@ -1708,8 +1710,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1710 }
1709 1711
1710 btrfs_mark_buffer_dirty(leaf); 1712 btrfs_mark_buffer_dirty(leaf);
1711 } 1713 btrfs_release_path(root, path);
1714
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1712 1716
1717 /*
1718 * we round up to the block size at eof when
1719 * determining which extents to clone above,
1720 * but shouldn't round up the file size
1721 */
1722 endoff = new_key.offset + datal;
1723 if (endoff > off+olen)
1724 endoff = off+olen;
1725 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff);
1727
1728 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1729 ret = btrfs_update_inode(trans, root, inode);
1730 BUG_ON(ret);
1731 btrfs_end_transaction(trans, root);
1732 }
1713next: 1733next:
1714 btrfs_release_path(root, path); 1734 btrfs_release_path(root, path);
1715 key.offset++; 1735 key.offset++;
@@ -1717,17 +1737,7 @@ next:
1717 ret = 0; 1737 ret = 0;
1718out: 1738out:
1719 btrfs_release_path(root, path); 1739 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1741out_unlock:
1732 mutex_unlock(&src->i_mutex); 1742 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1743 mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1855,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1856 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1); 1857 dir_id, "default", 7, 1);
1848 if (!di) { 1858 if (IS_ERR_OR_NULL(di)) {
1849 btrfs_free_path(path); 1859 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root); 1860 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, " 1861 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
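range_overlaps() above is the standard half-open interval test: [a, a+alen) and [b, b+blen) are disjoint precisely when one ends at or before the other begins, and overlap otherwise. A small standalone version with the two boundary cases exercised:

#include <stdint.h>
#include <stdio.h>

/* half-open [start, start + len) overlap, mirroring range_overlaps() */
static int ranges_overlap(uint64_t a, uint64_t alen,
			  uint64_t b, uint64_t blen)
{
	if (a + alen <= b || b + blen <= a)
		return 0;	/* one range ends before the other begins */
	return 1;
}

int main(void)
{
	printf("%d\n", ranges_overlap(0, 10, 10, 5));	/* 0: touching only */
	printf("%d\n", ranges_overlap(0, 11, 10, 5));	/* 1: one unit shared */
	return 0;
}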
127/* 136/*
128 * find the first ordered struct that has this offset, otherwise 137 * find the first ordered struct that has this offset, otherwise
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
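The refactoring above funnels both entry points through one internal __btrfs_add_ordered_extent() that takes a dio flag, so buffered and direct-IO callers share the setup code and cannot disagree on the flag. A skeletal model of that wrapper pattern (names and printf bodies are placeholders):

#include <stdio.h>

/*
 * one internal helper carries the full parameter list; the two
 * public wrappers pin the dio flag so callers cannot get it wrong.
 */
static int __add_ordered_extent(unsigned long long file_offset,
				unsigned long long len, int dio)
{
	printf("extent at %llu, len %llu%s\n",
	       file_offset, len, dio ? " (direct IO)" : "");
	return 0;
}

static int add_ordered_extent(unsigned long long file_offset,
			      unsigned long long len)
{
	return __add_ordered_extent(file_offset, len, 0);
}

static int add_ordered_extent_dio(unsigned long long file_offset,
				  unsigned long long len)
{
	return __add_ordered_extent(file_offset, len, 1);
}

int main(void)
{
	add_ordered_extent(0, 4096);
	add_ordered_extent_dio(4096, 4096);
	return 0;
}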
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area, we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
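btrfs_lookup_ordered_range() above searches from the first candidate node and walks forward until it either finds an overlapping extent or passes the end of the query range. A simplified model of that walk over a sorted array instead of an rb-tree, with no locking or reference counting:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t off, len; };

/* sorted-by-offset stand-in for the rb-tree */
static struct extent tree[] = { {0, 64}, {128, 64}, {512, 64} };

static int overlaps(const struct extent *e, uint64_t off, uint64_t len)
{
	return !(off + len <= e->off || e->off + e->len <= off);
}

static struct extent *lookup_range(uint64_t off, uint64_t len)
{
	size_t i;

	for (i = 0; i < sizeof(tree) / sizeof(tree[0]); i++) {
		if (overlaps(&tree[i], off, len))
			return &tree[i];
		if (tree[i].off >= off + len)
			break;	/* every later entry is disjoint too */
	}
	return NULL;
}

int main(void)
{
	struct extent *e = lookup_range(100, 100);	/* hits {128, 64} */

	if (e)
		printf("found extent at %llu\n", (unsigned long long)e->off);
	return 0;
}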
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner; may not be up to date */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* is the block in a non-reference-counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref nodes */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
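alloc_backref_node()/free_backref_node() and their edge counterparts pair every allocation with a per-cache counter, which is what lets backref_cache_cleanup() assert BUG_ON(cache->nr_nodes) to catch leaks at teardown. A minimal model of the counted-allocator pattern:

#include <stdio.h>
#include <stdlib.h>

struct node { int payload; };

struct cache { int nr_nodes; };	/* outstanding allocations */

static struct node *alloc_node(struct cache *c)
{
	struct node *n = calloc(1, sizeof(*n));	/* zeroed, like kzalloc */

	if (n)
		c->nr_nodes++;
	return n;
}

static void free_node(struct cache *c, struct node *n)
{
	if (n) {
		c->nr_nodes--;
		free(n);
	}
}

int main(void)
{
	struct cache c = { 0 };
	struct node *n = alloc_node(&c);

	free_node(&c, n);
	/* nonzero here would mean a leak; the diff asserts this
	 * with BUG_ON(cache->nr_nodes) in backref_cache_cleanup() */
	printf("outstanding nodes: %d\n", c.nr_nodes);
	return 0;
}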
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookups. a transaction commit changes the extent tree,
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors while processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
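update_backref_cache() treats the transaction id as a generation tag: last_trans == 0 means the cache hasn't been bound to a transaction yet, a matching transid means the cache is still valid, and a mismatch means a commit happened and stale state must be dropped. A stripped-down sketch of that three-way check:

#include <stdint.h>
#include <stdio.h>

struct cache { uint64_t last_trans; };

/* returns 1 when cached state is stale and must be dropped */
static int cache_is_stale(struct cache *c, uint64_t transid)
{
	if (c->last_trans == 0) {	/* first use in this transaction */
		c->last_trans = transid;
		return 0;
	}
	if (c->last_trans == transid)	/* same transaction: still valid */
		return 0;

	c->last_trans = 0;		/* commit happened: re-arm the tag */
	return 1;
}

int main(void)
{
	struct cache c = { 0 };

	printf("%d\n", cache_is_stale(&c, 7));	/* 0: bound to trans 7 */
	printf("%d\n", cache_is_stale(&c, 7));	/* 0: unchanged */
	printf("%d\n", cache_is_stale(&c, 8));	/* 1: invalidated by commit */
	return 0;
}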
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is a reloc tree and it was created in a previous
527 * transaction, backref lookup can find the reloc tree,
528 * so the backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,21 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
596 if (root) 788 root = find_tree_root(rc, eb, ref0);
597 cur->root = root; 789 if (root && !should_ignore_root(root))
598 else 790 cur->root = root;
599 cur->old_root = 1; 791 else
600 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
601 } 798 }
602#else 799#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
614 break; 811 break;
615 } 812 }
616 813
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
618 if (!edge) { 815 if (!edge) {
619 err = -ENOMEM; 816 err = -ENOMEM;
620 goto out; 817 goto out;
621 } 818 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 820 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
625 if (!upper) { 822 if (!upper) {
626 kfree(edge); 823 free_backref_edge(cache, edge);
627 err = -ENOMEM; 824 err = -ENOMEM;
628 goto out; 825 goto out;
629 } 826 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
634 /* 829 /*
635 * backrefs for the upper level block aren't 830 * backrefs for the upper level block aren't
@@ -639,11 +834,12 @@ again:
639 } else { 834 } else {
640 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 839 }
644 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
647 843
648 goto next; 844 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
657 goto out; 853 goto out;
658 } 854 }
659 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
660 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 860 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 862 cur->bytenr);
664 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
665 break; 867 break;
666 } 868 }
667 869
@@ -692,11 +894,14 @@ again:
692 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 896 lower->bytenr);
695 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
696 break; 901 break;
697 } 902 }
698 903
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
700 if (!edge) { 905 if (!edge) {
701 err = -ENOMEM; 906 err = -ENOMEM;
702 goto out; 907 goto out;
@@ -705,16 +910,17 @@ again:
705 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 912 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
709 if (!upper) { 914 if (!upper) {
710 kfree(edge); 915 free_backref_edge(cache, edge);
711 err = -ENOMEM; 916 err = -ENOMEM;
712 goto out; 917 goto out;
713 } 918 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
718 924
719 /* 925 /*
720 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
744 rb_node); 950 rb_node);
745 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
747 } 955 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
751 959
752 if (rb_node) 960 if (rb_node)
753 break; 961 break;
@@ -785,8 +993,13 @@ next:
785 * into the cache. 993 * into the cache.
786 */ 994 */
787 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
789 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
790 1003
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
798 1019
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
807 } 1028 }
808 1029
809 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1032 if (!cowonly) {
812 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
813 1037
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1039
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
818 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookups.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
819out: 1079out:
820 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
822 if (err) { 1082 if (err) {
823 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
824 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
825 while (upper) { 1090 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1093 free_backref_node(cache, upper);
829 } 1094 }
830 1095
831 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
833 1098
834 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
837 kfree(edge); 1103 free_backref_edge(cache, edge);
838 } 1104 }
839 return ERR_PTR(err); 1105 return ERR_PTR(err);
840 } 1106 }
1107 BUG_ON(node && node->detached);
841 return node; 1108 return node;
842} 1109}
843 1110
844/* 1111/*
1112 * helper to add a backref node for the newly created snapshot.
1113 * the backref node is created by cloning the backref node that
1114 * corresponds to the root of the source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
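clone_backref_node() above builds the new node's edge list incrementally and, if any allocation fails, walks back over the partially built list freeing what was created before returning -ENOMEM. A standalone sketch of that build-then-unwind shape:

#include <stdio.h>
#include <stdlib.h>

struct edge { int payload; };

/*
 * allocate n edges; on failure free only what was already built,
 * then report failure -- the same shape as the fail: path above.
 */
static int clone_edges(struct edge **edges, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		edges[i] = calloc(1, sizeof(struct edge));
		if (!edges[i])
			goto fail;
	}
	return 0;

fail:
	while (i-- > 0) {	/* unwind the partial list */
		free(edges[i]);
		edges[i] = NULL;
	}
	return -1;
}

int main(void)
{
	struct edge *edges[4] = { 0 };
	int i;

	if (clone_edges(edges, 4) == 0)
		puts("clone complete");
	for (i = 0; i < 4; i++)
		free(edges[i]);
	return 0;
}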
1198/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1200 */
847static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1255 return 0;
902} 1256}
903 1257
904/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1260{
911 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
915 int ret; 1265 int ret;
916 1266
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
930 1269
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
934 1273
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree; all tree blocks
1286 * modified after it was created have the RELOC flag
1287 * set in their headers, so it's OK not to update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
938 1294
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
946 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
947 1306
948 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1316 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. reloc tree is just a
1324 * snapshot of the fs tree with special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
960 1350
961 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
982 1372
983 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
985 del = 1; 1376 del = 1;
986 } 1377 }
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1493 goto out;
1103 } 1494 }
1104 1495
1105 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1497 ret = 0;
1108out: 1498out:
1109 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1115 * the new locations. 1505 * the new locations.
1116 */ 1506 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1118 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1509 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1121 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1122{ 1512{
1123 struct btrfs_key key; 1513 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1516 u64 parent;
1128 u64 bytenr; 1517 u64 bytenr;
1129 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1130 u64 num_bytes; 1519 u64 num_bytes;
1131 u64 end; 1520 u64 end;
1132 u32 nritems; 1521 u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1167 */ 1556 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1558 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1560 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1564 }
1185 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1566 end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1584
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1586 bytenr, num_bytes);
1207 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1208 continue; 1589 continue;
1590 }
1209 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1210 1592
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1607 }
1226 if (dirty) 1608 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1228 return 0; 1612 return 0;
1229} 1613}
1230 1614
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1250 */ 1634 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1256{ 1640{
1257 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
 	u64 new_ptr_gen;
 	u64 last_snapshot;
 	u32 blocksize;
+	int cow = 0;
 	int level;
 	int ret;
 	int slot;
 
 	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
-	BUG_ON(lowest_level > 1 && leaf);
 
 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
 	slot = path->slots[lowest_level];
 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
 
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
-	BUG_ON(ret);
+	if (cow) {
+		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+		BUG_ON(ret);
+	}
 	btrfs_set_lock_blocking(eb);
 
 	if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
 
 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
 		    memcmp_node_keys(parent, slot, path, level)) {
-			if (level <= lowest_level && !leaf) {
+			if (level <= lowest_level) {
 				ret = 0;
 				break;
 			}
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(dest, old_bytenr, blocksize,
 					     old_ptr_gen);
 			btrfs_tree_lock(eb);
-			ret = btrfs_cow_block(trans, dest, eb, parent,
-					      slot, &eb);
-			BUG_ON(ret);
-			btrfs_set_lock_blocking(eb);
-
-			if (level <= lowest_level) {
-				*leaf = eb;
-				ret = 0;
-				break;
+			if (cow) {
+				ret = btrfs_cow_block(trans, dest, eb, parent,
+						      slot, &eb);
+				BUG_ON(ret);
 			}
+			btrfs_set_lock_blocking(eb);
 
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		if (!cow) {
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			cow = 1;
+			goto again;
+		}
+
 		btrfs_node_key_to_cpu(path->nodes[level], &key,
 				      path->slots[level]);
 		btrfs_release_path(src, path);
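
The cow flag added above turns replace_path() into a two-pass walk: a cheap read-only pass first, restarted with COW enabled only once a pointer actually needs replacing. The control-flow shape in isolation (illustrative stand-ins, not the btrfs functions):

#include <stdbool.h>
#include <stdio.h>

static bool needs_swap(int slot) { return slot == 3; }	/* stand-in */
static void cow_and_swap(int slot) { printf("swapped slot %d\n", slot); }

/* First pass is read-only and cheap; the COW pass only runs when the
 * read-only pass proves there is real work to do. */
static int replace_with_retry(int nr_slots)
{
	bool cow = false;
	int replaced = 0;

again:
	for (int slot = 0; slot < nr_slots; slot++) {
		if (!needs_swap(slot))
			continue;
		if (!cow) {
			cow = true;	/* drop read-only mode, redo with COW */
			goto again;
		}
		cow_and_swap(slot);
		replaced++;
	}
	return replaced;
}

int main(void)
{
	printf("replaced %d\n", replace_with_retry(8));
	return 0;
}
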
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static void put_inodes(struct list_head *list)
-{
-	struct inodevec *ivec;
-	while (!list_empty(list)) {
-		ivec = list_entry(list->next, struct inodevec, list);
-		list_del(&ivec->list);
-		while (ivec->nr > 0) {
-			ivec->nr--;
-			iput(ivec->inode[ivec->nr]);
-		}
-		kfree(ivec);
-	}
-}
-
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key)
 
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
-	struct extent_buffer *leaf = NULL;
+	struct extent_buffer *leaf;
 	unsigned long nr;
 	int level;
 	int max_level;
 	int replaced = 0;
 	int ret;
 	int err = 0;
+	u32 min_reserved;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		btrfs_unlock_up_safe(path, 0);
 	}
 
-	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-		trans = btrfs_start_transaction(root, 1);
-
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, 0);
-		btrfs_release_path(reloc_root, path);
-
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-		if (ret < 0) {
-			err = ret;
-			goto out;
-		}
-
-		leaf = path->nodes[0];
-		btrfs_unlock_up_safe(path, 1);
-		ret = replace_file_extents(trans, rc, root, leaf,
-					   &inode_list);
-		if (ret < 0)
-			err = ret;
-		goto out;
-	}
-
-	memset(&next_key, 0, sizeof(next_key));
+	min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
-		leaf = NULL;
+		trans = btrfs_start_transaction(root, 0);
+		trans->block_rsv = rc->block_rsv;
+
+		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+					    min_reserved, 0);
+		if (ret) {
+			BUG_ON(ret != -EAGAIN);
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+			continue;
+		}
+
 		replaced = 0;
-		trans = btrfs_start_transaction(root, 1);
 		max_level = level;
 
 		ret = walk_down_reloc_tree(reloc_root, path, &level);
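
Note the new loop head above: -EAGAIN from btrfs_block_rsv_check() means "commit to refill the reservation, then retry", not failure. Reduced to a sketch with a mock reservation counter (numbers are illustrative):

#include <errno.h>
#include <stdio.h>

static long reserved;	/* mock metadata reservation, in bytes */

static int rsv_check(long min_reserved)
{
	return reserved >= min_reserved ? 0 : -EAGAIN;
}

static void commit_transaction(void)
{
	reserved += 65536;	/* committing frees space to refill from */
	printf("committed, reserved now %ld\n", reserved);
}

int main(void)
{
	/* mirrors nodesize * (BTRFS_MAX_LEVEL - 1) * 2 with 4 KiB nodes */
	const long min_reserved = 4096 * 7 * 2;

	for (int pass = 0; pass < 3; pass++) {
		while (rsv_check(min_reserved) == -EAGAIN)
			commit_transaction();
		reserved -= 8192;	/* one merge step consumes some */
		printf("merge step %d done\n", pass);
	}
	return 0;
}
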
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		if (!find_next_key(path, level, &key) &&
 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
 			ret = 0;
-		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, &leaf,
-					   level, max_level);
 		} else {
-			ret = replace_path(trans, root, reloc_root,
-					   path, &next_key, NULL,
-					   level, max_level);
+			ret = replace_path(trans, root, reloc_root, path,
+					   &next_key, level, max_level);
 		}
 		if (ret < 0) {
 			err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 			btrfs_node_key_to_cpu(path->nodes[level], &key,
 					      path->slots[level]);
 			replaced = 1;
-		} else if (leaf) {
-			/*
-			 * no block got replaced, try replacing file extents
-			 */
-			btrfs_item_key_to_cpu(leaf, &key, 0);
-			ret = replace_file_extents(trans, rc, root, leaf,
-						   &inode_list);
-			btrfs_tree_unlock(leaf);
-			free_extent_buffer(leaf);
-			BUG_ON(ret < 0);
 		}
 
 		ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		root_item->drop_level = level;
 
 		nr = trans->blocks_used;
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction_throttle(trans, root);
 
 		btrfs_btree_balance_dirty(root, nr);
 
-		/*
-		 * put inodes outside transaction, otherwise we may deadlock.
-		 */
-		put_inodes(&inode_list);
-
 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
 			invalidate_extent_cache(root, &key, &next_key);
 	}
@@ -1765,87 +2110,125 @@ out:
 		       sizeof(root_item->drop_progress));
 		root_item->drop_level = 0;
 		btrfs_set_root_refs(root_item, 0);
+		btrfs_update_reloc_root(trans, root);
 	}
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 
 	btrfs_btree_balance_dirty(root, nr);
 
-	put_inodes(&inode_list);
-
 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
 		invalidate_extent_cache(root, &key, &next_key);
 
 	return err;
 }
 
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
 {
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
+	struct btrfs_root *root = rc->extent_root;
 	struct btrfs_root *reloc_root;
-	struct async_merge *async;
+	struct btrfs_trans_handle *trans;
+	LIST_HEAD(reloc_roots);
+	u64 num_bytes = 0;
+	int ret;
+	int retries = 0;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+	rc->merging_rsv_size += rc->nodes_relocated * 2;
+	mutex_unlock(&root->fs_info->trans_mutex);
+again:
+	if (!err) {
+		num_bytes = rc->merging_rsv_size;
+		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+					  num_bytes, &retries);
+		if (ret)
+			err = ret;
+	}
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+
+	if (!err) {
+		if (num_bytes != rc->merging_rsv_size) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+			retries = 0;
+			goto again;
+		}
+	}
 
-	async = container_of(work, struct async_merge, work);
-	reloc_root = async->root;
+	rc->merge_reloc_tree = 1;
+
+	while (!list_empty(&rc->reloc_roots)) {
+		reloc_root = list_entry(rc->reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del_init(&reloc_root->root_list);
 
-	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
 		root = read_fs_root(reloc_root->fs_info,
 				    reloc_root->root_key.offset);
 		BUG_ON(IS_ERR(root));
 		BUG_ON(root->reloc_root != reloc_root);
 
-		merge_reloc_root(async->rc, root);
-
-		trans = btrfs_start_transaction(root, 1);
+		/*
+		 * set reference count to 1, so btrfs_recover_relocation
+		 * knows it should resume merging
+		 */
+		if (!err)
+			btrfs_set_root_refs(&reloc_root->root_item, 1);
 		btrfs_update_reloc_root(trans, root);
-		btrfs_end_transaction(trans, root);
-	}
 
-	btrfs_drop_snapshot(reloc_root, 0);
+		list_add(&reloc_root->root_list, &reloc_roots);
+	}
 
-	if (atomic_dec_and_test(async->num_pending))
-		complete(async->done);
+	list_splice(&reloc_roots, &rc->reloc_roots);
 
-	kfree(async);
+	if (!err)
+		btrfs_commit_transaction(trans, rc->extent_root);
+	else
+		btrfs_end_transaction(trans, rc->extent_root);
+	return err;
 }
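
prepare_to_merge() above reserves an estimate of merging_rsv_size, joins the transaction, and re-checks the estimate; if it grew in the meantime the reservation is released and retried. The same optimistic reserve-then-revalidate shape as a tiny standalone program (simulating one concurrent growth):

#include <stdbool.h>
#include <stdio.h>

static long merging_rsv_size = 100;

static void join_transaction(void)
{
	static bool grown;

	if (!grown) {		/* simulate one concurrent size increase */
		grown = true;
		merging_rsv_size = 140;
	}
}

int main(void)
{
	long num_bytes;

again:
	num_bytes = merging_rsv_size;		/* reserve this much */
	printf("reserved %ld\n", num_bytes);

	join_transaction();			/* size may change here */

	if (num_bytes != merging_rsv_size) {	/* stale: release and retry */
		printf("stale (need %ld), retrying\n", merging_rsv_size);
		goto again;
	}
	printf("reservation matches, proceeding\n");
	return 0;
}
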
 
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
 {
-	struct async_merge *async;
 	struct btrfs_root *root;
-	struct completion done;
-	atomic_t num_pending;
+	struct btrfs_root *reloc_root;
+	LIST_HEAD(reloc_roots);
+	int found = 0;
+	int ret;
+again:
+	root = rc->extent_root;
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&rc->reloc_roots, &reloc_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
 
-	init_completion(&done);
-	atomic_set(&num_pending, 1);
+	while (!list_empty(&reloc_roots)) {
+		found = 1;
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
 
-	while (!list_empty(&rc->reloc_roots)) {
-		root = list_entry(rc->reloc_roots.next,
-				  struct btrfs_root, root_list);
-		list_del_init(&root->root_list);
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			root = read_fs_root(reloc_root->fs_info,
+					    reloc_root->root_key.offset);
+			BUG_ON(IS_ERR(root));
+			BUG_ON(root->reloc_root != reloc_root);
 
-		async = kmalloc(sizeof(*async), GFP_NOFS);
-		BUG_ON(!async);
-		async->work.func = merge_func;
-		async->work.flags = 0;
-		async->rc = rc;
-		async->root = root;
-		async->done = &done;
-		async->num_pending = &num_pending;
-		atomic_inc(&num_pending);
-		btrfs_queue_worker(&rc->workers, &async->work);
+			ret = merge_reloc_root(rc, root);
+			BUG_ON(ret);
+		} else {
+			list_del_init(&reloc_root->root_list);
+		}
+		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
 	}
 
-	if (!atomic_dec_and_test(&num_pending))
-		wait_for_completion(&done);
-
+	if (found) {
+		found = 0;
+		goto again;
+	}
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
 	return 0;
 }
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
 	return btrfs_record_root_in_trans(trans, root);
 }
 
-/*
- * select one tree from trees that references the block.
- * for blocks in reference counted trees, we prefer reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
-					    struct backref_node *node,
-					    struct backref_edge *edges[],
-					    int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct reloc_control *rc,
+				     struct backref_node *node,
+				     struct backref_edge *edges[], int *nr)
 {
 	struct backref_node *next;
 	struct btrfs_root *root;
-	int index;
-	int loop = 0;
-again:
-	index = 0;
+	int index = 0;
+
 	next = node;
 	while (1) {
 		cond_resched();
 		next = walk_up_backref(next, edges, &index);
 		root = next->root;
-		if (!root) {
-			BUG_ON(!node->old_root);
-			goto skip;
-		}
-
-		/* no other choice for non-reference counted tree */
-		if (!root->ref_cows) {
-			BUG_ON(reloc_only);
-			break;
-		}
+		BUG_ON(!root);
+		BUG_ON(!root->ref_cows);
 
 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
 			record_reloc_root_in_trans(trans, root);
 			break;
 		}
 
-		if (loop) {
-			btrfs_record_root_in_trans(trans, root);
+		btrfs_record_root_in_trans(trans, root);
+		root = root->reloc_root;
+
+		if (next->new_bytenr != root->node->start) {
+			BUG_ON(next->new_bytenr);
+			BUG_ON(!list_empty(&next->list));
+			next->new_bytenr = root->node->start;
+			next->root = root;
+			list_add_tail(&next->list,
+				      &rc->backref_cache.changed);
+			__mark_block_processed(rc, next);
 			break;
 		}
 
-		if (reloc_only || next != node) {
-			if (!root->reloc_root)
-				btrfs_record_root_in_trans(trans, root);
-			root = root->reloc_root;
-			/*
-			 * if the reloc tree was created in current
-			 * transaction, there is no node in backref tree
-			 * corresponds to the root of the reloc tree.
-			 */
-			if (btrfs_root_last_snapshot(&root->root_item) ==
-			    trans->transid - 1)
-				break;
-		}
-skip:
+		WARN_ON(1);
 		root = NULL;
 		next = walk_down_backref(edges, &index);
 		if (!next || next->level <= node->level)
 			break;
 	}
+	if (!root)
+		return NULL;
 
-	if (!root && !loop && !reloc_only) {
-		loop = 1;
-		goto again;
+	*nr = index;
+	next = node;
+	/* setup backref node path for btrfs_reloc_cow_block */
+	while (1) {
+		rc->backref_cache.path[next->level] = next;
+		if (--index < 0)
+			break;
+		next = edges[index]->node[UPPER];
 	}
-
-	if (root)
-		*nr = index;
-	else
-		*nr = 0;
-
 	return root;
 }
 
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
 static noinline_for_stack
 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
 				   struct backref_node *node)
 {
+	struct backref_node *next;
+	struct btrfs_root *root;
+	struct btrfs_root *fs_root = NULL;
 	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
-	int nr;
-	return __select_one_root(trans, node, edges, &nr, 0);
+	int index = 0;
+
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		BUG_ON(!root);
+
+		/* no other choice for non-reference counted tree */
+		if (!root->ref_cows)
+			return root;
+
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+			fs_root = root;
+
+		if (next != node)
+			return NULL;
+
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!fs_root)
+		return ERR_PTR(-ENOENT);
+	return fs_root;
 }
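
Both select_reloc_root() and the new select_one_root() above rely on walk_up_backref()/walk_down_backref() to enumerate paths from a block up to its tree roots with an explicit edge stack. A toy version of that traversal over a two-root DAG (illustrative structures only, not the btrfs ones):

#include <stdio.h>

#define MAX_PARENTS 2
#define MAX_DEPTH   8

struct node {
	const char *name;
	int nr_parents;
	struct node *parent[MAX_PARENTS];
};

/* Visit every root reachable via "upper" (parent) edges, using an
 * explicit per-level edge index instead of recursion. */
static void visit_roots(struct node *start)
{
	struct node *path[MAX_DEPTH];
	int slot[MAX_DEPTH];		/* next parent edge to try per level */
	int depth = 0;

	path[0] = start;
	slot[0] = 0;

	while (depth >= 0) {
		struct node *cur = path[depth];

		if (cur->nr_parents == 0)
			printf("root: %s\n", cur->name);

		if (slot[depth] < cur->nr_parents) {
			/* walk up the next unexplored edge */
			path[depth + 1] = cur->parent[slot[depth]++];
			slot[depth + 1] = 0;
			depth++;
		} else {
			depth--;	/* all edges done: walk back down */
		}
	}
}

int main(void)
{
	struct node root_a = { "fs tree",    0, { 0 } };
	struct node root_b = { "reloc tree", 0, { 0 } };
	struct node mid    = { "mid",  2, { &root_a, &root_b } };
	struct node leaf   = { "leaf", 1, { &mid } };

	visit_roots(&leaf);
	return 0;
}
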
 
 static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
-				     struct backref_node *node,
-				     struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+			struct backref_node *node, int reserve)
 {
-	return __select_one_root(trans, node, edges, nr, 1);
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	u64 num_bytes = 0;
+	int index = 0;
+
+	BUG_ON(reserve && node->processed);
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed && (reserve || next != node))
+				break;
+
+			num_bytes += btrfs_level_size(rc->extent_root,
+						      next->level);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+	return num_bytes;
 }
 
-static void grab_path_buffers(struct btrfs_path *path,
-			      struct backref_node *node,
-			      struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+				  struct reloc_control *rc,
+				  struct backref_node *node)
 {
-	int i = 0;
-	while (1) {
-		drop_node_buffer(node);
-		node->eb = path->nodes[node->level];
-		BUG_ON(!node->eb);
-		if (path->locks[node->level])
-			node->locked = 1;
-		path->nodes[node->level] = NULL;
-		path->locks[node->level] = 0;
-
-		if (i >= nr)
-			break;
+	struct btrfs_root *root = rc->extent_root;
+	u64 num_bytes;
+	int ret;
+
+	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
-		edges[i]->blockptr = node->eb->start;
-		node = edges[i]->node[UPPER];
-		i++;
+	trans->block_rsv = rc->block_rsv;
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+				  &rc->block_rsv_retries);
+	if (ret) {
+		if (ret == -EAGAIN)
+			rc->commit_transaction = 1;
+		return ret;
 	}
+
+	rc->block_rsv_retries = 0;
+	return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+				   struct backref_node *node)
+{
+	u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+	btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
 }
 
 /*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
  * in that case this function just updates pointers.
  */
 static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_key *key,
 			 struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 	BUG_ON(lowest && node->eb);
 
 	path->lowest_level = node->level + 1;
+	rc->backref_cache.path[node->level] = node;
 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
 		cond_resched();
-		if (node->eb && node->eb->start == edge->blockptr)
-			continue;
 
 		upper = edge->node[UPPER];
-		root = select_reloc_root(trans, upper, edges, &nr);
-		if (!root)
-			continue;
-
-		if (upper->eb && !upper->locked)
+		root = select_reloc_root(trans, rc, upper, edges, &nr);
+		BUG_ON(!root);
+
+		if (upper->eb && !upper->locked) {
+			if (!lowest) {
+				ret = btrfs_bin_search(upper->eb, key,
+						       upper->level, &slot);
+				BUG_ON(ret);
+				bytenr = btrfs_node_blockptr(upper->eb, slot);
+				if (node->eb->start == bytenr)
+					goto next;
+			}
 			drop_node_buffer(upper);
+		}
 
 		if (!upper->eb) {
 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			}
 			BUG_ON(ret > 0);
 
-			slot = path->slots[upper->level];
+			if (!upper->eb) {
+				upper->eb = path->nodes[upper->level];
+				path->nodes[upper->level] = NULL;
+			} else {
+				BUG_ON(upper->eb != path->nodes[upper->level]);
+			}
 
-			btrfs_unlock_up_safe(path, upper->level + 1);
-			grab_path_buffers(path, upper, edges, nr);
+			upper->locked = 1;
+			path->locks[upper->level] = 0;
 
+			slot = path->slots[upper->level];
 			btrfs_release_path(NULL, path);
 		} else {
 			ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		}
 
 		bytenr = btrfs_node_blockptr(upper->eb, slot);
-		if (!lowest) {
-			if (node->eb->start == bytenr) {
-				btrfs_tree_unlock(upper->eb);
-				upper->locked = 0;
-				continue;
-			}
+		if (lowest) {
+			BUG_ON(bytenr != node->bytenr);
 		} else {
-			BUG_ON(node->bytenr != bytenr);
+			if (node->eb->start == bytenr)
+				goto next;
 		}
 
 		blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		if (!node->eb) {
 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
 					      slot, &eb);
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
 			if (ret < 0) {
 				err = ret;
-				break;
+				goto next;
 			}
-			btrfs_set_lock_blocking(eb);
-			node->eb = eb;
-			node->locked = 1;
+			BUG_ON(node->eb != eb);
 		} else {
 			btrfs_set_node_blockptr(upper->eb, slot,
 						node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
 			BUG_ON(ret);
 		}
-		if (!lowest) {
-			btrfs_tree_unlock(upper->eb);
-			upper->locked = 0;
-		}
+next:
+		if (!upper->pending)
+			drop_node_buffer(upper);
+		else
+			unlock_node_buffer(upper);
+		if (err)
+			break;
 	}
+
+	if (!err && node->pending) {
+		drop_node_buffer(node);
+		list_move_tail(&node->list, &rc->backref_cache.changed);
+		node->pending = 0;
+	}
+
 	path->lowest_level = 0;
+	BUG_ON(err == -ENOSPC);
 	return err;
 }
 
 static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc,
 			 struct backref_node *node,
 			 struct btrfs_path *path)
 {
 	struct btrfs_key key;
-	if (!node->eb || list_empty(&node->upper))
-		return 0;
 
 	btrfs_node_key_to_cpu(node->eb, &key, 0);
-	return do_relocation(trans, node, &key, path, 0);
+	return do_relocation(trans, rc, node, &key, path, 0);
 }
 
 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
-				struct backref_cache *cache,
-				struct btrfs_path *path)
+				struct reloc_control *rc,
+				struct btrfs_path *path, int err)
 {
+	LIST_HEAD(list);
+	struct backref_cache *cache = &rc->backref_cache;
 	struct backref_node *node;
 	int level;
 	int ret;
-	int err = 0;
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 		while (!list_empty(&cache->pending[level])) {
 			node = list_entry(cache->pending[level].next,
-					  struct backref_node, lower);
-			BUG_ON(node->level != level);
+					  struct backref_node, list);
+			list_move_tail(&node->list, &list);
+			BUG_ON(!node->pending);
 
-			ret = link_to_upper(trans, node, path);
-			if (ret < 0)
-				err = ret;
-			/*
-			 * this removes the node from the pending list and
-			 * may add some other nodes to the level + 1
-			 * pending list
-			 */
-			remove_backref_node(cache, node);
+			if (!err) {
+				ret = link_to_upper(trans, rc, node, path);
+				if (ret < 0)
+					err = ret;
+			}
 		}
+		list_splice_init(&list, &cache->pending[level]);
 	}
-	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 	return err;
 }
 
 static void mark_block_processed(struct reloc_control *rc,
-				 struct backref_node *node)
+				 u64 bytenr, u32 blocksize)
+{
+	set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+			EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+				   struct backref_node *node)
 {
 	u32 blocksize;
 	if (node->level == 0 ||
 	    in_block_group(node->bytenr, rc->block_group)) {
 		blocksize = btrfs_level_size(rc->extent_root, node->level);
-		set_extent_bits(&rc->processed_blocks, node->bytenr,
-				node->bytenr + blocksize - 1, EXTENT_DIRTY,
-				GFP_NOFS);
+		mark_block_processed(rc, node->bytenr, blocksize);
 	}
 	node->processed = 1;
 }
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
 		if (next->processed)
 			break;
 
-		mark_block_processed(rc, next);
+		__mark_block_processed(rc, next);
 
 		if (list_empty(&next->upper))
 			break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
 	return 0;
 }
 
-/*
- * check if there are any file extent pointers in the leaf point to
- * data that requires processing
- */
-static int check_file_extents(struct reloc_control *rc,
-			      u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
-	struct btrfs_key found_key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
-	u32 nritems;
-	int i;
-	int ret = 0;
-
-	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
-	nritems = btrfs_header_nritems(leaf);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		btrfs_item_key_to_cpu(leaf, &found_key, i);
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(leaf, fi) ==
-		    BTRFS_FILE_EXTENT_INLINE)
-			continue;
-		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-		if (bytenr == 0)
-			continue;
-		if (in_block_group(bytenr, rc->block_group)) {
-			ret = 1;
-			break;
-		}
-	}
-	free_extent_buffer(leaf);
-	return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks that require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
-			    struct reloc_control *rc,
-			    struct backref_node *node,
-			    struct rb_root *blocks)
-{
-	struct tree_block *block;
-	struct rb_node *rb_node;
-	u64 bytenr;
-	u64 ptr_gen;
-	u32 blocksize;
-	u32 nritems;
-	int i;
-	int err = 0;
-
-	nritems = btrfs_header_nritems(node->eb);
-	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-
-		readahead_tree_block(rc->extent_root,
-				     bytenr, blocksize, ptr_gen);
-	}
-
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		bytenr = btrfs_node_blockptr(node->eb, i);
-		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-		if (ptr_gen == trans->transid)
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-			continue;
-		if (tree_block_processed(bytenr, blocksize, rc))
-			continue;
-		if (!in_block_group(bytenr, rc->block_group) &&
-		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
-			continue;
-
-		block = kmalloc(sizeof(*block), GFP_NOFS);
-		if (!block) {
-			err = -ENOMEM;
-			break;
-		}
-		block->bytenr = bytenr;
-		btrfs_node_key_to_cpu(node->eb, &block->key, i);
-		block->level = node->level - 1;
-		block->key_ready = 1;
-		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
-		BUG_ON(rb_node);
-	}
-	if (err)
-		free_block_list(blocks);
-	return err;
-}
-
-/*
- * find adjacent blocks that require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
-			struct reloc_control *rc,
-			struct backref_cache *cache,
-			struct rb_root *blocks, int level,
-			struct backref_node **upper)
-{
-	struct backref_node *node;
-	int ret = 0;
-
-	WARN_ON(!list_empty(&cache->pending[level]));
-
-	if (list_empty(&cache->pending[level + 1]))
-		return 1;
-
-	node = list_entry(cache->pending[level + 1].next,
-			  struct backref_node, lower);
-	if (node->eb)
-		ret = add_child_blocks(trans, rc, node, blocks);
-
-	*upper = node;
-	return ret;
-}
-
 static int get_tree_block_key(struct reloc_control *rc,
 			      struct tree_block *block)
 {
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path)
 {
 	struct btrfs_root *root;
-	int ret;
+	int release = 0;
+	int ret = 0;
 
+	if (!node)
+		return 0;
+
+	BUG_ON(node->processed);
 	root = select_one_root(trans, node);
-	if (unlikely(!root)) {
-		rc->found_old_snapshot = 1;
+	if (root == ERR_PTR(-ENOENT)) {
 		update_processed_blocks(rc, node);
-		return 0;
+		goto out;
 	}
 
-	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = do_relocation(trans, node, key, path, 1);
-		if (ret < 0)
-			goto out;
-		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-			ret = replace_file_extents(trans, rc, root,
-						   node->eb, NULL);
-			if (ret < 0)
-				goto out;
-		}
-		drop_node_buffer(node);
-	} else if (!root->ref_cows) {
-		path->lowest_level = node->level;
-		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-		btrfs_release_path(root, path);
-		if (ret < 0)
+	if (!root || root->ref_cows) {
+		ret = reserve_metadata_space(trans, rc, node);
+		if (ret)
 			goto out;
-	} else if (root != node->root) {
-		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+		release = 1;
 	}
 
-	update_processed_blocks(rc, node);
-	ret = 0;
+	if (root) {
+		if (root->ref_cows) {
+			BUG_ON(node->new_bytenr);
+			BUG_ON(!list_empty(&node->list));
+			btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			node->new_bytenr = root->node->start;
+			node->root = root;
+			list_add_tail(&node->list, &rc->backref_cache.changed);
+		} else {
+			path->lowest_level = node->level;
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			btrfs_release_path(root, path);
+			if (ret > 0)
+				ret = 0;
+		}
+		if (!ret)
+			update_processed_blocks(rc, node);
+	} else {
+		ret = do_relocation(trans, rc, node, key, path, 1);
+	}
 out:
-	drop_node_buffer(node);
+	if (ret || node->level == 0 || node->cowonly) {
+		if (release)
+			release_metadata_space(rc, node);
+		remove_backref_node(&rc->backref_cache, node);
+	}
 	return ret;
 }
@@ -2415,12 +2753,10 @@ static noinline_for_stack
 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 			 struct reloc_control *rc, struct rb_root *blocks)
 {
-	struct backref_cache *cache;
 	struct backref_node *node;
 	struct btrfs_path *path;
 	struct tree_block *block;
 	struct rb_node *rb_node;
-	int level = -1;
 	int ret;
 	int err = 0;
 
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	cache = kmalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
-	backref_cache_init(cache);
-
 	rb_node = rb_first(blocks);
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
-		if (level == -1)
-			level = block->level;
-		else
-			BUG_ON(level != block->level);
 		if (!block->key_ready)
 			reada_tree_block(rc, block);
 		rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
 
-		node = build_backref_tree(rc, cache, &block->key,
+		node = build_backref_tree(rc, &block->key,
 					  block->level, block->bytenr);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 		ret = relocate_tree_block(trans, rc, node, &block->key,
 					  path);
 		if (ret < 0) {
-			err = ret;
+			if (ret != -EAGAIN || rb_node == rb_first(blocks))
+				err = ret;
 			goto out;
 		}
-		remove_backref_node(cache, node);
 		rb_node = rb_next(rb_node);
 	}
-
-	if (level > 0)
-		goto out;
-
+out:
 	free_block_list(blocks);
+	err = finish_pending_nodes(trans, rc, path, err);
 
-	/*
-	 * now backrefs of some upper level tree blocks have been cached,
-	 * try relocating blocks referenced by these upper level blocks.
-	 */
-	while (1) {
-		struct backref_node *upper = NULL;
-		if (trans->transaction->in_commit ||
-		    trans->transaction->delayed_refs.flushing)
-			break;
-
-		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
-					  &upper);
-		if (ret < 0)
-			err = ret;
-		if (ret != 0)
-			break;
-
-		rb_node = rb_first(blocks);
-		while (rb_node) {
-			block = rb_entry(rb_node, struct tree_block, rb_node);
-			if (trans->transaction->in_commit ||
-			    trans->transaction->delayed_refs.flushing)
-				goto out;
-			BUG_ON(!block->key_ready);
-			node = build_backref_tree(rc, cache, &block->key,
-						  level, block->bytenr);
-			if (IS_ERR(node)) {
-				err = PTR_ERR(node);
-				goto out;
-			}
-
-			ret = relocate_tree_block(trans, rc, node,
-						  &block->key, path);
-			if (ret < 0) {
-				err = ret;
-				goto out;
-			}
-			remove_backref_node(cache, node);
-			rb_node = rb_next(rb_node);
-		}
-		free_block_list(blocks);
-
-		if (upper) {
-			ret = link_to_upper(trans, upper, path);
-			if (ret < 0) {
-				err = ret;
-				break;
-			}
-			remove_backref_node(cache, upper);
-		}
-	}
-out:
-	free_block_list(blocks);
-
-	ret = finish_pending_nodes(trans, cache, path);
-	if (ret < 0)
-		err = ret;
-
-	kfree(cache);
-	btrfs_free_path(path);
-	return err;
+	btrfs_free_path(path);
+	return err;
+}
+
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+				 struct file_extent_cluster *cluster)
+{
+	u64 alloc_hint = 0;
+	u64 start;
+	u64 end;
+	u64 offset = BTRFS_I(inode)->index_cnt;
+	u64 num_bytes;
+	int nr = 0;
+	int ret = 0;
+
+	BUG_ON(cluster->start != cluster->boundary[0]);
+	mutex_lock(&inode->i_mutex);
+
+	ret = btrfs_check_data_free_space(inode, cluster->end +
+					  1 - cluster->start);
+	if (ret)
+		goto out;
+
+	while (nr < cluster->nr) {
+		start = cluster->boundary[nr] - offset;
+		if (nr + 1 < cluster->nr)
+			end = cluster->boundary[nr + 1] - 1 - offset;
+		else
+			end = cluster->end - offset;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		num_bytes = end + 1 - start;
+		ret = btrfs_prealloc_file_range(inode, 0, start,
+						num_bytes, num_bytes,
+						end + 1, &alloc_hint);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		if (ret)
+			break;
+		nr++;
+	}
+	btrfs_free_reserved_data_space(inode, cluster->end +
+				       1 - cluster->start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 static noinline_for_stack
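
prealloc_file_extent_cluster() above slices the cluster at each boundary[]: a range runs from one boundary to just before the next, the last one to cluster->end. The index arithmetic on its own, with sample numbers:

#include <stdio.h>

int main(void)
{
	/* sample boundaries, already offset-adjusted; the inclusive-end
	 * math matches the end + 1 - start byte counts above */
	unsigned long long boundary[] = { 0, 4096, 12288 };
	unsigned long long cluster_end = 20479;
	int nr = sizeof(boundary) / sizeof(boundary[0]);

	for (int i = 0; i < nr; i++) {
		unsigned long long start = boundary[i];
		unsigned long long end = (i + 1 < nr)
			? boundary[i + 1] - 1 : cluster_end;

		printf("prealloc [%llu, %llu], %llu bytes\n",
		       start, end, end + 1 - start);
	}
	return 0;
}
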
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	u64 offset = BTRFS_I(inode)->index_cnt;
 	unsigned long index;
 	unsigned long last_index;
-	unsigned int dirty_page = 0;
 	struct page *page;
 	struct file_ra_state *ra;
 	int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	if (!ra)
 		return -ENOMEM;
 
-	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
-	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+	ret = prealloc_file_extent_cluster(inode, cluster);
+	if (ret)
+		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	file_ra_state_init(ra, inode->i_mapping);
 
-	i_size_write(inode, cluster->end + 1 - offset);
 	ret = setup_extent_mapping(inode, cluster->start - offset,
 				   cluster->end - offset, cluster->start);
 	if (ret)
-		goto out_unlock;
-
-	file_ra_state_init(ra, inode->i_mapping);
+		goto out;
 
-	WARN_ON(cluster->start != cluster->boundary[0]);
+	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
+		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		if (ret)
+			goto out;
+
 		page = find_lock_page(inode->i_mapping, index);
 		if (!page) {
 			page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 						  last_index + 1 - index);
 			page = grab_cache_page(inode->i_mapping, index);
 			if (!page) {
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -ENOMEM;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				btrfs_delalloc_release_metadata(inode,
+							PAGE_CACHE_SIZE);
 				ret = -EIO;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
 					EXTENT_BOUNDARY, GFP_NOFS);
 			nr++;
 		}
-		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 		set_page_dirty(page);
-		dirty_page++;
 
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
 		page_cache_release(page);
 
 		index++;
-		if (nr < cluster->nr &&
-		    page_end + 1 + offset == cluster->boundary[nr]) {
-			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-							   dirty_page);
-			dirty_page = 0;
-		}
-	}
-	if (dirty_page) {
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_page);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_throttle(BTRFS_I(inode)->root);
 	}
 	WARN_ON(nr != cluster->nr);
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
+out:
 	kfree(ra);
 	return ret;
 }
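
The reworked page loop holds one metadata reservation per page: taken before the page is grabbed, handed off to the dirty page on success, and explicitly released on the -ENOMEM/-EIO exits. The invariant as a sketch (mock reserve/release, hypothetical names):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static long reserved;

static int  reserve_page(void) { reserved++; return 0; }
static void release_page(void) { reserved--; }
static bool get_page(int index) { return index != 5; }	/* page 5 "fails" */

static int process_pages(int nr_pages)
{
	for (int index = 0; index < nr_pages; index++) {
		if (reserve_page())
			return -ENOSPC;
		if (!get_page(index)) {
			release_page();	/* error path must give it back */
			return -ENOMEM;
		}
		/* ...dirty the page; the reservation is now "owned" by
		 * the dirty page and released at writeback time... */
	}
	return 0;
}

int main(void)
{
	int ret = process_pages(10);

	printf("ret=%d, outstanding reservations=%ld\n", ret, reserved);
	return 0;
}
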
@@ -2870,9 +3173,6 @@ out:
 static int block_use_full_backref(struct reloc_control *rc,
 				  struct extent_buffer *eb)
 {
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
 	u64 flags;
 	int ret;
 
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
 	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
 		return 1;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
-	key.objectid = eb->start;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = eb->len;
-
-	path->search_commit_root = 1;
-	path->skip_locking = 1;
-	ret = btrfs_search_slot(NULL, rc->extent_root,
-				&key, path, 0, 0);
+	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+				       eb->start, eb->len, NULL, &flags);
 	BUG_ON(ret);
 
-	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			    struct btrfs_extent_item);
-	flags = btrfs_extent_flags(path->nodes[0], ei);
-	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 		ret = 1;
 	else
 		ret = 0;
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
 	struct btrfs_extent_inline_ref *iref;
 	unsigned long ptr;
 	unsigned long end;
-	u32 blocksize;
+	u32 blocksize = btrfs_level_size(rc->extent_root, 0);
 	int ret;
 	int err = 0;
 
-	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
-			       extent_key->offset);
-	BUG_ON(ret < 0);
-	if (ret > 0) {
-		/* the relocated data is fragmented */
-		rc->extents_skipped++;
-		btrfs_release_path(rc->extent_root, path);
-		return 0;
-	}
-
-	blocksize = btrfs_level_size(rc->extent_root, 0);
-
 	eb = path->nodes[0];
 	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
 	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
  */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
-		     struct reloc_control *rc, struct btrfs_path *path)
+		     struct reloc_control *rc, struct btrfs_path *path,
+		     struct btrfs_key *extent_key)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
 			rc->search_start = end + 1;
 		} else {
 			rc->search_start = key.objectid + key.offset;
+			memcpy(extent_key, &key, sizeof(key));
 			return 0;
 		}
 	}
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
 	return 0;
 }
 
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	if (!rc->block_rsv)
+		return -ENOMEM;
+
+	/*
+	 * reserve some space for creating reloc trees.
+	 * btrfs_init_reloc_root will use them when there
+	 * is no reservation in transaction handle.
+	 */
+	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+				  rc->extent_root->nodesize * 256,
+				  &rc->block_rsv_retries);
+	if (ret)
+		return ret;
+
+	rc->block_rsv->refill_used = 1;
+	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+	memset(&rc->cluster, 0, sizeof(rc->cluster));
+	rc->search_start = rc->block_group->key.objectid;
+	rc->extents_found = 0;
+	rc->nodes_relocated = 0;
+	rc->merging_rsv_size = 0;
+	rc->block_rsv_retries = 0;
+
+	rc->create_reloc_tree = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_join_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+	return 0;
+}
 
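
For a sense of scale of the two reservations above (assuming the 4 KiB nodesize that was the mkfs default at the time; nodesize is per-filesystem, and BTRFS_MAX_LEVEL is 8):

#include <stdio.h>

int main(void)
{
	const unsigned long nodesize = 4096;	/* assumed; per-fs in reality */
	const unsigned long max_level = 8;	/* BTRFS_MAX_LEVEL */

	/* merge_reloc_root(): one full tree path, COWed twice */
	printf("min_reserved = %lu bytes\n",
	       nodesize * (max_level - 1) * 2);		/* 57344 */

	/* prepare_to_relocate(): up-front pool for creating reloc roots */
	printf("initial reserve = %lu bytes\n",
	       nodesize * 256);				/* 1 MiB */
	return 0;
}
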
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
 	struct btrfs_key key;
-	struct file_extent_cluster *cluster;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	int ret;
 	int err = 0;
 
-	cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
-	if (!cluster)
-		return -ENOMEM;
-
 	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(cluster);
+	if (!path)
 		return -ENOMEM;
-	}
-
-	rc->extents_found = 0;
-	rc->extents_skipped = 0;
-
-	rc->search_start = rc->block_group->key.objectid;
-	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-			  GFP_NOFS);
-
-	rc->create_reloc_root = 1;
-	set_reloc_control(rc);
 
-	trans = btrfs_start_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
+	ret = prepare_to_relocate(rc);
+	if (ret) {
+		err = ret;
+		goto out_free;
+	}
 
 	while (1) {
-		trans = btrfs_start_transaction(rc->extent_root, 1);
+		trans = btrfs_start_transaction(rc->extent_root, 0);
+
+		if (update_backref_cache(trans, &rc->backref_cache)) {
+			btrfs_end_transaction(trans, rc->extent_root);
+			continue;
+		}
 
-		ret = find_next_extent(trans, rc, path);
+		ret = find_next_extent(trans, rc, path, &key);
 		if (ret < 0)
 			err = ret;
 		if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_extent_item);
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		item_size = btrfs_item_size_nr(path->nodes[0],
-					       path->slots[0]);
+		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 		if (item_size >= sizeof(*ei)) {
 			flags = btrfs_extent_flags(path->nodes[0], ei);
 			ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3664 } else {
3362 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3666 ret = 0;
3364 } 3667 }
3365 if (ret < 0) { 3668 if (ret < 0) {
3366 err = 0; 3669 err = ret;
3367 break; 3670 break;
3368 } 3671 }
3369 3672
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3373 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3374 break; 3691 break;
3375 } 3692 }
3693 rc->commit_transaction = 1;
3376 } 3694 }
3377 3695
3378 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3380 trans = NULL; 3705 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3706
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3711 &key, &rc->cluster);
3388 if (ret < 0) { 3712 if (ret < 0) {
3389 err = ret; 3713 err = ret;
3390 break; 3714 break;
3391 } 3715 }
3392 } 3716 }
3393 } 3717 }
3394 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3395 3722
3396 if (trans) { 3723 if (trans) {
3397 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3727 }
3401 3728
3402 if (!err) { 3729 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3404 if (ret < 0) 3732 if (ret < 0)
3405 err = ret; 3733 err = ret;
3406 } 3734 }
3407 3735
3408 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3409 3738
3410 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3741
3413 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3743
3418 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3419 3745
3746 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3749
3422 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3425 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3426 return err; 3756 return err;
3427} 3757}
3428 3758
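
The reworked loop above no longer ends every pass the same way: it calls btrfs_block_rsv_check() each iteration, treats -EAGAIN as "low on metadata space", and then either does a full btrfs_commit_transaction() (which frees pinned space) or the cheap btrfs_end_transaction_throttle() path. A minimal userspace model of that control flow — reserve_check(), commit_tx(), and end_tx() are hypothetical stand-ins, not btrfs APIs:

    #include <errno.h>
    #include <stdio.h>

    /* pretend reservation check: 0 = ok, -EAGAIN = pressure, other < 0 = hard error */
    static int reserve_check(int pass) { return pass == 2 ? -EAGAIN : 0; }
    static void commit_tx(void) { puts("commit"); }
    static void end_tx(void)    { puts("end");    }

    int main(void)
    {
        int commit_pending = 0;

        for (int pass = 0; pass < 4; pass++) {
            int ret = reserve_check(pass);
            if (ret < 0 && ret != -EAGAIN)
                return 1;            /* hard error: bail out of the loop */
            if (ret == -EAGAIN)
                commit_pending = 1;  /* low on space: schedule a commit */

            if (commit_pending) {
                commit_pending = 0;
                commit_tx();         /* full commit frees pinned space */
            } else {
                end_tx();            /* cheap throttled end of transaction */
            }
        }
        return 0;
    }

The one-shot flag mirrors rc->commit_transaction: the expensive commit happens only on the pass after pressure was detected, not on every iteration.
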
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3454out: 3785out:
@@ -3460,8 +3791,9 @@ out:
3460 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3462 */ 3793 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3465{ 3797{
3466 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3476 return ERR_CAST(root); 3808 return ERR_CAST(root);
3477 3809
3478 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3480 3813
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3815 if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3829out:
3497 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3833 if (err) {
3502 if (inode) 3834 if (inode)
@@ -3506,6 +3838,21 @@ out:
3506 return inode; 3838 return inode;
3507} 3839}
3508 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3509/* 3856/*
3510 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3511 */ 3858 */
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3862 struct reloc_control *rc;
3516 int ret; 3863 int ret;
3864 int rw = 0;
3517 int err = 0; 3865 int err = 0;
3518 3866
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3520 if (!rc) 3868 if (!rc)
3521 return -ENOMEM; 3869 return -ENOMEM;
3522 3870
3523 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3872
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3529 3875
3530 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3878 if (ret) {
3533 rc->extent_root = extent_root; 3879 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3535 3884
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3898
3550 while (1) { 3899 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3555 3901
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3906 if (ret < 0) {
3561 err = ret; 3907 err = ret;
3562 break; 3908 goto out;
3563 } 3909 }
3564 3910
3565 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3920 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3922 }
3589 } 3923 }
3590 3924
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3936 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3938 kfree(rc);
3604 return err; 3939 return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3610 int ret; 3945 int ret;
3611 3946
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3948
3614 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3703 goto out; 4038 goto out;
3704 4039
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3706 if (!rc) { 4041 if (!rc) {
3707 err = -ENOMEM; 4042 err = -ENOMEM;
3708 goto out; 4043 goto out;
3709 } 4044 }
3710 4045
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3716 4047
3717 set_reloc_control(rc); 4048 set_reloc_control(rc);
3718 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3719 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3736 } 4071 }
3737 4072
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3740 4074
3741 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3742 4076
3743 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3744 4078
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4081out:
3748 if (rc) { 4082 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3815 return 0; 4146 return 0;
3816} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * required for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * to reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
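
The new btrfs_reloc_pre_snapshot()/btrfs_reloc_post_snapshot() pair above forms a reserve-then-migrate handshake: before the snapshot is created the caller's reservation is inflated by the worst-case COW cost (rc->nodes_relocated); after creation, btrfs_block_rsv_migrate() moves exactly that amount into the relocation reservation. A toy model of the invariant — struct rsv and rsv_migrate() here are illustrative, not kernel types:

    #include <assert.h>
    #include <stdint.h>

    struct rsv { uint64_t reserved; };

    /* move 'bytes' from one reservation to another, all-or-nothing */
    static int rsv_migrate(struct rsv *src, struct rsv *dst, uint64_t bytes)
    {
        if (src->reserved < bytes)
            return -1;
        src->reserved -= bytes;
        dst->reserved += bytes;
        return 0;
    }

    int main(void)
    {
        uint64_t nodes_relocated = 4096;
        struct rsv pending = { 0 }, reloc = { 0 };

        pending.reserved += nodes_relocated;    /* pre-snapshot: over-reserve */
        assert(rsv_migrate(&pending, &reloc, nodes_relocated) == 0);
        assert(pending.reserved == 0 && reloc.reserved == nodes_relocated);
        return 0;
    }

Because the migration never creates or destroys reservation, the snapshot path cannot leak space it did not set aside up front.
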
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
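
The btrfs_find_orphan_roots() change above keys off the IS_ERR()/PTR_ERR() idiom: a successful btrfs_read_fs_root_no_name() lookup means the root is still alive and the orphan item is skipped; only -ENOENT falls through to btrfs_find_dead_roots(), and any other error aborts the scan. A compact model of that three-way dispatch — lookup() and its error encoding are illustrative:

    #include <errno.h>
    #include <stdio.h>

    /* model of a pointer-or-errno return, like ERR_PTR()/PTR_ERR() */
    static void *lookup(int key)
    {
        static int alive = 42;
        if (key == 1) return &alive;                 /* found: still alive */
        if (key == 2) return (void *)(long)-ENOENT;  /* definitely gone    */
        return (void *)(long)-EIO;                   /* real failure       */
    }

    static int is_err(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-4095L;
    }

    int main(void)
    {
        for (int key = 1; key <= 3; key++) {
            void *root = lookup(key);
            if (!is_err(root)) { printf("key %d: alive, skip\n", key); continue; }
            long err = (long)root;
            if (err != -ENOENT) { printf("key %d: abort (%ld)\n", key, err); return 1; }
            printf("key %d: dead, reclaim\n", key);
        }
        return 0;
    }
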
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..1776dbd8dc98 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
363 if (!di) { 365 if (!di) {
364 /* 366 /*
365 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
390 location.offset = 0; 392 location.offset = 0;
391 393
392 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode) 395 if (IS_ERR(inode))
394 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
395 397
396 /* 398 /*
397 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
500 502
501 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
503 return ret; 505 return ret;
504} 506}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 697 return -EINVAL;
696 698
697 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 700 WARN_ON(ret);
700 701
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 704 WARN_ON(ret);
703 705
704 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
716 u64 total_used = 0; 718 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 721
721 rcu_read_lock(); 722 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 725 rcu_read_unlock();
740 726
741 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
747 733
@@ -811,7 +797,7 @@ static int btrfs_unfreeze(struct super_block *sb)
811 797
812static const struct super_operations btrfs_super_ops = { 798static const struct super_operations btrfs_super_ops = {
813 .drop_inode = btrfs_drop_inode, 799 .drop_inode = btrfs_drop_inode,
814 .delete_inode = btrfs_delete_inode, 800 .evict_inode = btrfs_evict_inode,
815 .put_super = btrfs_put_super, 801 .put_super = btrfs_put_super,
816 .sync_fs = btrfs_sync_fs, 802 .sync_fs = btrfs_sync_fs,
817 .show_options = btrfs_show_options, 803 .show_options = btrfs_show_options,
@@ -832,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
832}; 818};
833 819
834static struct miscdevice btrfs_misc = { 820static struct miscdevice btrfs_misc = {
835 .minor = MISC_DYNAMIC_MINOR, 821 .minor = BTRFS_MINOR,
836 .name = "btrfs-control", 822 .name = "btrfs-control",
837 .fops = &btrfs_ctl_fops 823 .fops = &btrfs_ctl_fops
838}; 824};
839 825
826MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
827MODULE_ALIAS("devname:btrfs-control");
828
840static int btrfs_interface_init(void) 829static int btrfs_interface_init(void)
841{ 830{
842 return misc_register(&btrfs_misc); 831 return misc_register(&btrfs_misc);
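
After the btrfs_statfs() simplification above, free space comes from a single accumulation of per-space-info disk_used, and f_bavail is reported equal to f_bfree instead of being derived from a separate data-usage estimate. A worked example of the block arithmetic — the sizes are made up, and bits is the block-size shift as in the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t total_bytes = 8ULL << 30;                    /* 8 GiB fs       */
        uint64_t disk_used[] = { 1ULL << 30, 512ULL << 20 };  /* per space_info */
        int bits = 12;                                        /* 4 KiB blocks   */

        uint64_t total_used = 0;
        for (unsigned i = 0; i < sizeof(disk_used) / sizeof(disk_used[0]); i++)
            total_used += disk_used[i];

        uint64_t f_blocks = total_bytes >> bits;
        uint64_t f_bfree  = f_blocks - (total_used >> bits);
        uint64_t f_bavail = f_bfree;   /* no separate data estimate anymore */

        printf("blocks=%llu free=%llu avail=%llu\n",
               (unsigned long long)f_blocks,
               (unsigned long long)f_bfree,
               (unsigned long long)f_bavail);
        return 0;
    }
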
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
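
The rewritten start_transaction() above allocates first, joins, and then may commit the running transaction and jump back to 'again:' when it is blocked or when btrfs_trans_reserve_metadata() returns -EAGAIN; callers now get an ERR_PTR() instead of a bare pointer or BUG. A stripped-down model of that retry shape — reserve() and commit_running() are invented for illustration:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;
    static int reserve(void) { return ++attempts < 3 ? -EAGAIN : 0; }
    static void commit_running(void) { puts("commit + retry"); }

    /* returns 0 on success or a negative errno, as ERR_PTR(ret) would carry */
    static int start_tx(void)
    {
        int ret;
    again:
        /* ... allocate handle, join the running transaction ... */
        ret = reserve();
        if (ret == -EAGAIN) {
            commit_running();   /* make space, then start over from scratch */
            goto again;
        }
        return ret;             /* 0, or a hard error handed back to caller */
    }

    int main(void)
    {
        printf("start_tx() = %d after %d attempts\n", start_tx(), attempts);
        return 0;
    }

Committing inside the retry (rather than sleeping) is what lets a blocked starter make forward progress: the commit itself releases the space the reservation was waiting for.
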
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
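
The btrfs_defrag_root() hunk above also replaces a racy smp_mb()-plus-flag check with xchg(): whoever swaps defrag_running from 0 to 1 first owns the run, and every other caller backs off. The same guard in portable C11 atomics, as a sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int defrag_running;

    static int try_defrag(const char *who)
    {
        if (atomic_exchange(&defrag_running, 1)) {
            printf("%s: already running, skip\n", who);
            return 0;
        }
        printf("%s: owns the defrag run\n", who);
        return 1;
    }

    int main(void)
    {
        try_defrag("A");                  /* swaps 0 -> 1, wins      */
        try_defrag("B");                  /* sees 1, backs off       */
        atomic_store(&defrag_running, 0); /* A done: release the flag */
        return 0;
    }

The exchange is a single atomic read-modify-write, so two concurrent callers can never both see the flag clear the way they could with a plain load followed by a store.
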
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
 63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
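
The header change above switches btrfs_start_transaction() from "blocks" to "items": callers state how many tree items they expect to touch (create_reloc_inode() above asks for 6) and the transaction code sizes the metadata reservation from that. A worked example of the idea — the per-item cost used here (nodesize times tree height) is a plausible stand-in, not the kernel's exact sizing formula:

    #include <stdint.h>
    #include <stdio.h>

    /* worst case: one full COW path per item, for every tree level */
    static uint64_t reservation_bytes(uint64_t num_items,
                                      uint64_t nodesize, int max_level)
    {
        return num_items * nodesize * (uint64_t)max_level;
    }

    int main(void)
    {
        printf("%llu bytes for 6 items\n",
               (unsigned long long)reservation_bytes(6, 4096, 8));
        return 0;
    }
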
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
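
With the tree-defrag.c change above, btrfs_defrag_leaves() signals "more work left" by saving its position in root->defrag_progress and returning -EAGAIN, and btrfs_defrag_root() keeps restarting it in fresh transactions until it stops doing so. A minimal model of that resumable-iteration contract — cursor and step() are illustrative:

    #include <errno.h>
    #include <stdio.h>

    static int step(int *cursor, int end)
    {
        /* process a small batch, then yield back to the caller */
        *cursor += 2;
        if (*cursor >= end)
            return 0;        /* finished            */
        return -EAGAIN;      /* call me again later */
    }

    int main(void)
    {
        int cursor = 0, ret;
        do {
            ret = step(&cursor, 7);   /* one short transaction's worth */
            printf("cursor=%d ret=%d\n", cursor, ret);
        } while (ret == -EAGAIN);
        return 0;
    }

Keeping each step inside its own short transaction is the point: the defragger never holds a transaction open across the whole tree, only across one batch.
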
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
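
The tree-log.c hunks above all follow one pattern: each BUG_ON(ret) after a fallible call becomes a check that records the first failure in a local err and jumps to shared cleanup, and btrfs_log_inode_parent() finally maps any error (asserted there to be -ENOSPC) into forcing a full transaction commit. A minimal standalone sketch of that propagation idiom, with hypothetical step_one()/step_two() helpers standing in for the btrfs calls:

#include <stdio.h>

/* Hypothetical stand-ins for the fallible btrfs calls. */
static int step_one(void) { return 0; }
static int step_two(void) { return -28; } /* pretend -ENOSPC */

static int do_work(void)
{
	int err = 0;
	int ret;

	ret = step_one();
	if (ret) {		/* was: BUG_ON(ret); */
		err = ret;
		goto out;
	}
	ret = step_two();
	if (ret) {
		err = ret;
		goto out;
	}
out:
	/* shared cleanup (release paths, drop locks) would go here */
	return err;
}

int main(void)
{
	printf("do_work() = %d\n", do_work());
	return 0;
}

The goto-based exit keeps a single cleanup point, which is why the hunks above can add out_unlock/end_trans labels instead of duplicating the release calls at every failure site.
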
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..dd318ff280b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -258,7 +258,7 @@ loop_lock:
258 258
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 259 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 260
261 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++; 262 num_sync_run++;
263 263
264 submit_bio(cur->bi_rw, cur); 264 submit_bio(cur->bi_rw, cur);
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
@@ -2654,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2654 int max_errors = 0; 2651 int max_errors = 0;
2655 struct btrfs_multi_bio *multi = NULL; 2652 struct btrfs_multi_bio *multi = NULL;
2656 2653
2657 if (multi_ret && !(rw & (1 << BIO_RW))) 2654 if (multi_ret && !(rw & REQ_WRITE))
2658 stripes_allocated = 1; 2655 stripes_allocated = 1;
2659again: 2656again:
2660 if (multi_ret) { 2657 if (multi_ret) {
@@ -2690,7 +2687,7 @@ again:
2690 mirror_num = 0; 2687 mirror_num = 0;
2691 2688
2692 /* if our multi bio struct is too small, back off and try again */ 2689 /* if our multi bio struct is too small, back off and try again */
2693 if (rw & (1 << BIO_RW)) { 2690 if (rw & REQ_WRITE) {
2694 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2691 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2695 BTRFS_BLOCK_GROUP_DUP)) { 2692 BTRFS_BLOCK_GROUP_DUP)) {
2696 stripes_required = map->num_stripes; 2693 stripes_required = map->num_stripes;
@@ -2700,7 +2697,7 @@ again:
2700 max_errors = 1; 2697 max_errors = 1;
2701 } 2698 }
2702 } 2699 }
2703 if (multi_ret && (rw & (1 << BIO_RW)) && 2700 if (multi_ret && (rw & REQ_WRITE) &&
2704 stripes_allocated < stripes_required) { 2701 stripes_allocated < stripes_required) {
2705 stripes_allocated = map->num_stripes; 2702 stripes_allocated = map->num_stripes;
2706 free_extent_map(em); 2703 free_extent_map(em);
@@ -2736,7 +2733,7 @@ again:
2736 num_stripes = 1; 2733 num_stripes = 1;
2737 stripe_index = 0; 2734 stripe_index = 0;
2738 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2739 if (unplug_page || (rw & (1 << BIO_RW))) 2736 if (unplug_page || (rw & REQ_WRITE))
2740 num_stripes = map->num_stripes; 2737 num_stripes = map->num_stripes;
2741 else if (mirror_num) 2738 else if (mirror_num)
2742 stripe_index = mirror_num - 1; 2739 stripe_index = mirror_num - 1;
@@ -2747,7 +2744,7 @@ again:
2747 } 2744 }
2748 2745
2749 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2750 if (rw & (1 << BIO_RW)) 2747 if (rw & REQ_WRITE)
2751 num_stripes = map->num_stripes; 2748 num_stripes = map->num_stripes;
2752 else if (mirror_num) 2749 else if (mirror_num)
2753 stripe_index = mirror_num - 1; 2750 stripe_index = mirror_num - 1;
@@ -2758,7 +2755,7 @@ again:
2758 stripe_index = do_div(stripe_nr, factor); 2755 stripe_index = do_div(stripe_nr, factor);
2759 stripe_index *= map->sub_stripes; 2756 stripe_index *= map->sub_stripes;
2760 2757
2761 if (unplug_page || (rw & (1 << BIO_RW))) 2758 if (unplug_page || (rw & REQ_WRITE))
2762 num_stripes = map->sub_stripes; 2759 num_stripes = map->sub_stripes;
2763 else if (mirror_num) 2760 else if (mirror_num)
2764 stripe_index += mirror_num - 1; 2761 stripe_index += mirror_num - 1;
@@ -2948,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2948 struct btrfs_pending_bios *pending_bios; 2945 struct btrfs_pending_bios *pending_bios;
2949 2946
2950 /* don't bother with additional async steps for reads, right now */ 2947 /* don't bother with additional async steps for reads, right now */
2951 if (!(rw & (1 << BIO_RW))) { 2948 if (!(rw & REQ_WRITE)) {
2952 bio_get(bio); 2949 bio_get(bio);
2953 submit_bio(rw, bio); 2950 submit_bio(rw, bio);
2954 bio_put(bio); 2951 bio_put(bio);
@@ -2967,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2967 bio->bi_rw |= rw; 2964 bio->bi_rw |= rw;
2968 2965
2969 spin_lock(&device->io_lock); 2966 spin_lock(&device->io_lock);
2970 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 2967 if (bio->bi_rw & REQ_SYNC)
2971 pending_bios = &device->pending_sync_bios; 2968 pending_bios = &device->pending_sync_bios;
2972 else 2969 else
2973 pending_bios = &device->pending_bios; 2970 pending_bios = &device->pending_bios;
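
The volumes.c hunks replace the old request tests, bio_rw_flagged(bio, BIO_RW_SYNCIO) and rw & (1 << BIO_RW), with direct tests of the unified REQ_SYNC and REQ_WRITE bits in bi_rw. A standalone sketch of the flag-test shape; the bit positions below are illustrative, not the kernel's actual values:

#include <stdio.h>

/* Illustrative bit positions only, not the kernel's REQ_* values. */
#define REQ_WRITE (1UL << 0)
#define REQ_SYNC  (1UL << 1)

static void classify(unsigned long bi_rw)
{
	if (bi_rw & REQ_WRITE)		/* was: rw & (1 << BIO_RW) */
		printf("write%s\n",
		       (bi_rw & REQ_SYNC) ? " (sync)" : "");
	else
		printf("read\n");
}

int main(void)
{
	classify(0);
	classify(REQ_WRITE);
	classify(REQ_WRITE | REQ_SYNC);	/* was: bio_rw_flagged(..., BIO_RW_SYNCIO) */
	return 0;
}
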
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
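
__btrfs_setxattr() above now reserves its metadata through btrfs_start_transaction(root, 2) and detects failure with IS_ERR()/PTR_ERR(), instead of a separate reserve/unreserve pair wrapped around a NULL-checked start. A minimal standalone sketch of the ERR_PTR convention that makes the single check possible, re-implementing the kernel's pointer-encoding helpers purely for illustration:

#include <stdio.h>
#include <stdint.h>

/* Userspace re-implementation of the kernel's pointer-error encoding. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* Hypothetical allocator in the shape of btrfs_start_transaction(). */
static void *start_transaction(int fail)
{
	static int dummy_trans;
	return fail ? ERR_PTR(-28) /* pretend -ENOSPC */ : (void *)&dummy_trans;
}

int main(void)
{
	void *trans = start_transaction(1);

	if (IS_ERR(trans)) {	/* one check replaces reserve + NULL test */
		printf("start failed: %ld\n", PTR_ERR(trans));
		return 1;
	}
	printf("transaction started\n");
	return 0;
}
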
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..3e7dca279d1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
@@ -778,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
778 spin_unlock(lock); 770 spin_unlock(lock);
779 /* 771 /*
780 * Ensure any pending I/O completes so that 772 * Ensure any pending I/O completes so that
781 * ll_rw_block() actually writes the current 773 * write_dirty_buffer() actually writes the
782 * contents - it is a noop if I/O is still in 774 * current contents - it is a noop if I/O is
783 * flight on potentially older contents. 775 * still in flight on potentially older
776 * contents.
784 */ 777 */
785 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); 778 write_dirty_buffer(bh, WRITE_SYNC_PLUG);
786 779
787 /* 780 /*
788 * Kick off IO for the previous mapping. Note 781 * Kick off IO for the previous mapping. Note
@@ -1841,9 +1834,10 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1841} 1834}
1842EXPORT_SYMBOL(page_zero_new_buffers); 1835EXPORT_SYMBOL(page_zero_new_buffers);
1843 1836
1844static int __block_prepare_write(struct inode *inode, struct page *page, 1837int block_prepare_write(struct page *page, unsigned from, unsigned to,
1845 unsigned from, unsigned to, get_block_t *get_block) 1838 get_block_t *get_block)
1846{ 1839{
1840 struct inode *inode = page->mapping->host;
1847 unsigned block_start, block_end; 1841 unsigned block_start, block_end;
1848 sector_t block; 1842 sector_t block;
1849 int err = 0; 1843 int err = 0;
@@ -1916,10 +1910,13 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1916 if (!buffer_uptodate(*wait_bh)) 1910 if (!buffer_uptodate(*wait_bh))
1917 err = -EIO; 1911 err = -EIO;
1918 } 1912 }
1919 if (unlikely(err)) 1913 if (unlikely(err)) {
1920 page_zero_new_buffers(page, from, to); 1914 page_zero_new_buffers(page, from, to);
1915 ClearPageUptodate(page);
1916 }
1921 return err; 1917 return err;
1922} 1918}
1919EXPORT_SYMBOL(block_prepare_write);
1923 1920
1924static int __block_commit_write(struct inode *inode, struct page *page, 1921static int __block_commit_write(struct inode *inode, struct page *page,
1925 unsigned from, unsigned to) 1922 unsigned from, unsigned to)
@@ -1956,62 +1953,40 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1956 return 0; 1953 return 0;
1957} 1954}
1958 1955
1956int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1957 get_block_t *get_block)
1958{
1959 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1960
1961 return block_prepare_write(page, start, start + len, get_block);
1962}
1963EXPORT_SYMBOL(__block_write_begin);
1964
1959/* 1965/*
1960 * block_write_begin takes care of the basic task of block allocation and 1966 * block_write_begin takes care of the basic task of block allocation and
1961 * bringing partial write blocks uptodate first. 1967 * bringing partial write blocks uptodate first.
1962 * 1968 *
1963 * If *pagep is not NULL, then block_write_begin uses the locked page 1969 * The filesystem needs to handle block truncation upon failure.
1964 * at *pagep rather than allocating its own. In this case, the page will
1965 * not be unlocked or deallocated on failure.
1966 */ 1970 */
1967int block_write_begin(struct file *file, struct address_space *mapping, 1971int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1968 loff_t pos, unsigned len, unsigned flags, 1972 unsigned flags, struct page **pagep, get_block_t *get_block)
1969 struct page **pagep, void **fsdata,
1970 get_block_t *get_block)
1971{ 1973{
1972 struct inode *inode = mapping->host; 1974 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1973 int status = 0;
1974 struct page *page; 1975 struct page *page;
1975 pgoff_t index; 1976 int status;
1976 unsigned start, end;
1977 int ownpage = 0;
1978 1977
1979 index = pos >> PAGE_CACHE_SHIFT; 1978 page = grab_cache_page_write_begin(mapping, index, flags);
1980 start = pos & (PAGE_CACHE_SIZE - 1); 1979 if (!page)
1981 end = start + len; 1980 return -ENOMEM;
1982
1983 page = *pagep;
1984 if (page == NULL) {
1985 ownpage = 1;
1986 page = grab_cache_page_write_begin(mapping, index, flags);
1987 if (!page) {
1988 status = -ENOMEM;
1989 goto out;
1990 }
1991 *pagep = page;
1992 } else
1993 BUG_ON(!PageLocked(page));
1994 1981
1995 status = __block_prepare_write(inode, page, start, end, get_block); 1982 status = __block_write_begin(page, pos, len, get_block);
1996 if (unlikely(status)) { 1983 if (unlikely(status)) {
1997 ClearPageUptodate(page); 1984 unlock_page(page);
1998 1985 page_cache_release(page);
1999 if (ownpage) { 1986 page = NULL;
2000 unlock_page(page);
2001 page_cache_release(page);
2002 *pagep = NULL;
2003
2004 /*
2005 * prepare_write() may have instantiated a few blocks
2006 * outside i_size. Trim these off again. Don't need
2007 * i_size_read because we hold i_mutex.
2008 */
2009 if (pos + len > inode->i_size)
2010 vmtruncate(inode, inode->i_size);
2011 }
2012 } 1987 }
2013 1988
2014out: 1989 *pagep = page;
2015 return status; 1990 return status;
2016} 1991}
2017EXPORT_SYMBOL(block_write_begin); 1992EXPORT_SYMBOL(block_write_begin);
@@ -2344,7 +2319,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2344 2319
2345 err = cont_expand_zero(file, mapping, pos, bytes); 2320 err = cont_expand_zero(file, mapping, pos, bytes);
2346 if (err) 2321 if (err)
2347 goto out; 2322 return err;
2348 2323
2349 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2324 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2350 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2325 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
@@ -2352,25 +2327,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2352 (*bytes)++; 2327 (*bytes)++;
2353 } 2328 }
2354 2329
2355 *pagep = NULL; 2330 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2356 err = block_write_begin(file, mapping, pos, len,
2357 flags, pagep, fsdata, get_block);
2358out:
2359 return err;
2360} 2331}
2361EXPORT_SYMBOL(cont_write_begin); 2332EXPORT_SYMBOL(cont_write_begin);
2362 2333
2363int block_prepare_write(struct page *page, unsigned from, unsigned to,
2364 get_block_t *get_block)
2365{
2366 struct inode *inode = page->mapping->host;
2367 int err = __block_prepare_write(inode, page, from, to, get_block);
2368 if (err)
2369 ClearPageUptodate(page);
2370 return err;
2371}
2372EXPORT_SYMBOL(block_prepare_write);
2373
2374int block_commit_write(struct page *page, unsigned from, unsigned to) 2334int block_commit_write(struct page *page, unsigned from, unsigned to)
2375{ 2335{
2376 struct inode *inode = page->mapping->host; 2336 struct inode *inode = page->mapping->host;
@@ -2389,7 +2349,7 @@ EXPORT_SYMBOL(block_commit_write);
2389 * 2349 *
2390 * We are not allowed to take the i_mutex here so we have to play games to 2350 * We are not allowed to take the i_mutex here so we have to play games to
2391 * protect against truncate races as the page could now be beyond EOF. Because 2351 * protect against truncate races as the page could now be beyond EOF. Because
2392 * vmtruncate() writes the inode size before removing pages, once we have the 2352 * truncate writes the inode size before removing pages, once we have the
2393 * page lock we can determine safely if the page is beyond EOF. If it is not 2353 * page lock we can determine safely if the page is beyond EOF. If it is not
2394 * beyond EOF, then the page is guaranteed safe against truncation until we 2354 * beyond EOF, then the page is guaranteed safe against truncation until we
2395 * unlock the page. 2355 * unlock the page.
@@ -2474,8 +2434,9 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2474/* 2434/*
2475 * On entry, the page is fully not uptodate. 2435 * On entry, the page is fully not uptodate.
2476 * On exit the page is fully uptodate in the areas outside (from,to) 2436 * On exit the page is fully uptodate in the areas outside (from,to)
2437 * The filesystem needs to handle block truncation upon failure.
2477 */ 2438 */
2478int nobh_write_begin(struct file *file, struct address_space *mapping, 2439int nobh_write_begin(struct address_space *mapping,
2479 loff_t pos, unsigned len, unsigned flags, 2440 loff_t pos, unsigned len, unsigned flags,
2480 struct page **pagep, void **fsdata, 2441 struct page **pagep, void **fsdata,
2481 get_block_t *get_block) 2442 get_block_t *get_block)
@@ -2508,8 +2469,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2508 unlock_page(page); 2469 unlock_page(page);
2509 page_cache_release(page); 2470 page_cache_release(page);
2510 *pagep = NULL; 2471 *pagep = NULL;
2511 return block_write_begin(file, mapping, pos, len, flags, pagep, 2472 return block_write_begin(mapping, pos, len, flags, pagep,
2512 fsdata, get_block); 2473 get_block);
2513 } 2474 }
2514 2475
2515 if (PageMappedToDisk(page)) 2476 if (PageMappedToDisk(page))
@@ -2613,9 +2574,6 @@ out_release:
2613 page_cache_release(page); 2574 page_cache_release(page);
2614 *pagep = NULL; 2575 *pagep = NULL;
2615 2576
2616 if (pos + len > inode->i_size)
2617 vmtruncate(inode, inode->i_size);
2618
2619 return ret; 2577 return ret;
2620} 2578}
2621EXPORT_SYMBOL(nobh_write_begin); 2579EXPORT_SYMBOL(nobh_write_begin);
@@ -2955,13 +2913,6 @@ int submit_bh(int rw, struct buffer_head * bh)
2955 BUG_ON(buffer_unwritten(bh)); 2913 BUG_ON(buffer_unwritten(bh));
2956 2914
2957 /* 2915 /*
2958 * Mask in barrier bit for a write (could be either a WRITE or a
2959 * WRITE_SYNC
2960 */
2961 if (buffer_ordered(bh) && (rw & WRITE))
2962 rw |= WRITE_BARRIER;
2963
2964 /*
2965 * Only clear out a write error when rewriting 2916 * Only clear out a write error when rewriting
2966 */ 2917 */
2967 if (test_set_buffer_req(bh) && (rw & WRITE)) 2918 if (test_set_buffer_req(bh) && (rw & WRITE))
@@ -2999,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
2999 2950
3000/** 2951/**
3001 * ll_rw_block: low-level access to block devices (DEPRECATED) 2952 * ll_rw_block: low-level access to block devices (DEPRECATED)
3002 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2953 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3003 * @nr: number of &struct buffer_heads in the array 2954 * @nr: number of &struct buffer_heads in the array
3004 * @bhs: array of pointers to &struct buffer_head 2955 * @bhs: array of pointers to &struct buffer_head
3005 * 2956 *
3006 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2957 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3007 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2958 * requests an I/O operation on them, either a %READ or a %WRITE. The third
3008 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2959 * %READA option is described in the documentation for generic_make_request()
3009 * are sent to disk. The fourth %READA option is described in the documentation 2960 * which ll_rw_block() calls.
3010 * for generic_make_request() which ll_rw_block() calls.
3011 * 2961 *
3012 * This function drops any buffer that it cannot get a lock on (with the 2962 * This function drops any buffer that it cannot get a lock on (with the
3013 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2963 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3014 * clean when doing a write request, and any buffer that appears to be 2964 * request, and any buffer that appears to be up-to-date when doing read
3015 * up-to-date when doing read request. Further it marks as clean buffers that 2965 * request. Further it marks as clean buffers that are processed for
3016 * are processed for writing (the buffer cache won't assume that they are 2966 * writing (the buffer cache won't assume that they are actually clean
3017 * actually clean until the buffer gets unlocked). 2967 * until the buffer gets unlocked).
3018 * 2968 *
3019 * ll_rw_block sets b_end_io to simple completion handler that marks 2969 * ll_rw_block sets b_end_io to simple completion handler that marks
3020 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 2970 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
@@ -3030,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3030 for (i = 0; i < nr; i++) { 2980 for (i = 0; i < nr; i++) {
3031 struct buffer_head *bh = bhs[i]; 2981 struct buffer_head *bh = bhs[i];
3032 2982
3033 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) 2983 if (!trylock_buffer(bh))
3034 lock_buffer(bh);
3035 else if (!trylock_buffer(bh))
3036 continue; 2984 continue;
3037 2985 if (rw == WRITE) {
3038 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3039 rw == SWRITE_SYNC_PLUG) {
3040 if (test_clear_buffer_dirty(bh)) { 2986 if (test_clear_buffer_dirty(bh)) {
3041 bh->b_end_io = end_buffer_write_sync; 2987 bh->b_end_io = end_buffer_write_sync;
3042 get_bh(bh); 2988 get_bh(bh);
3043 if (rw == SWRITE_SYNC) 2989 submit_bh(WRITE, bh);
3044 submit_bh(WRITE_SYNC, bh);
3045 else
3046 submit_bh(WRITE, bh);
3047 continue; 2990 continue;
3048 } 2991 }
3049 } else { 2992 } else {
@@ -3059,12 +3002,25 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3059} 3002}
3060EXPORT_SYMBOL(ll_rw_block); 3003EXPORT_SYMBOL(ll_rw_block);
3061 3004
3005void write_dirty_buffer(struct buffer_head *bh, int rw)
3006{
3007 lock_buffer(bh);
3008 if (!test_clear_buffer_dirty(bh)) {
3009 unlock_buffer(bh);
3010 return;
3011 }
3012 bh->b_end_io = end_buffer_write_sync;
3013 get_bh(bh);
3014 submit_bh(rw, bh);
3015}
3016EXPORT_SYMBOL(write_dirty_buffer);
3017
3062/* 3018/*
3063 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3019 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3064 * and then start new I/O and then wait upon it. The caller must have a ref on 3020 * and then start new I/O and then wait upon it. The caller must have a ref on
3065 * the buffer_head. 3021 * the buffer_head.
3066 */ 3022 */
3067int sync_dirty_buffer(struct buffer_head *bh) 3023int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3068{ 3024{
3069 int ret = 0; 3025 int ret = 0;
3070 3026
@@ -3073,7 +3029,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3073 if (test_clear_buffer_dirty(bh)) { 3029 if (test_clear_buffer_dirty(bh)) {
3074 get_bh(bh); 3030 get_bh(bh);
3075 bh->b_end_io = end_buffer_write_sync; 3031 bh->b_end_io = end_buffer_write_sync;
3076 ret = submit_bh(WRITE_SYNC, bh); 3032 ret = submit_bh(rw, bh);
3077 wait_on_buffer(bh); 3033 wait_on_buffer(bh);
3078 if (buffer_eopnotsupp(bh)) { 3034 if (buffer_eopnotsupp(bh)) {
3079 clear_buffer_eopnotsupp(bh); 3035 clear_buffer_eopnotsupp(bh);
@@ -3086,6 +3042,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
3086 } 3042 }
3087 return ret; 3043 return ret;
3088} 3044}
3045EXPORT_SYMBOL(__sync_dirty_buffer);
3046
3047int sync_dirty_buffer(struct buffer_head *bh)
3048{
3049 return __sync_dirty_buffer(bh, WRITE_SYNC);
3050}
3089EXPORT_SYMBOL(sync_dirty_buffer); 3051EXPORT_SYMBOL(sync_dirty_buffer);
3090 3052
3091/* 3053/*
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 2906077ac798..a2603e7c0bb5 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -146,7 +146,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
146 goto error_unsupported; 146 goto error_unsupported;
147 147
148 /* get the cache size and blocksize */ 148 /* get the cache size and blocksize */
149 ret = vfs_statfs(root, &stats); 149 ret = vfs_statfs(&path, &stats);
150 if (ret < 0) 150 if (ret < 0)
151 goto error_unsupported; 151 goto error_unsupported;
152 152
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index c2413561ea75..727caedcdd92 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -552,8 +552,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
552 */ 552 */
553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) 553static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
554{ 554{
555 struct fs_struct *fs; 555 struct path path;
556 struct dentry *dir;
557 const struct cred *saved_cred; 556 const struct cred *saved_cred;
558 int ret; 557 int ret;
559 558
@@ -573,24 +572,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
573 } 572 }
574 573
575 /* extract the directory dentry from the cwd */ 574 /* extract the directory dentry from the cwd */
576 fs = current->fs; 575 get_fs_pwd(current->fs, &path);
577 read_lock(&fs->lock);
578 dir = dget(fs->pwd.dentry);
579 read_unlock(&fs->lock);
580 576
581 if (!S_ISDIR(dir->d_inode->i_mode)) 577 if (!S_ISDIR(path.dentry->d_inode->i_mode))
582 goto notdir; 578 goto notdir;
583 579
584 cachefiles_begin_secure(cache, &saved_cred); 580 cachefiles_begin_secure(cache, &saved_cred);
585 ret = cachefiles_cull(cache, dir, args); 581 ret = cachefiles_cull(cache, path.dentry, args);
586 cachefiles_end_secure(cache, saved_cred); 582 cachefiles_end_secure(cache, saved_cred);
587 583
588 dput(dir); 584 path_put(&path);
589 _leave(" = %d", ret); 585 _leave(" = %d", ret);
590 return ret; 586 return ret;
591 587
592notdir: 588notdir:
593 dput(dir); 589 path_put(&path);
594 kerror("cull command requires dirfd to be a directory"); 590 kerror("cull command requires dirfd to be a directory");
595 return -ENOTDIR; 591 return -ENOTDIR;
596 592
@@ -628,8 +624,7 @@ inval:
628 */ 624 */
629static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) 625static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
630{ 626{
631 struct fs_struct *fs; 627 struct path path;
632 struct dentry *dir;
633 const struct cred *saved_cred; 628 const struct cred *saved_cred;
634 int ret; 629 int ret;
635 630
@@ -649,24 +644,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
649 } 644 }
650 645
651 /* extract the directory dentry from the cwd */ 646 /* extract the directory dentry from the cwd */
652 fs = current->fs; 647 get_fs_pwd(current->fs, &path);
653 read_lock(&fs->lock);
654 dir = dget(fs->pwd.dentry);
655 read_unlock(&fs->lock);
656 648
657 if (!S_ISDIR(dir->d_inode->i_mode)) 649 if (!S_ISDIR(path.dentry->d_inode->i_mode))
658 goto notdir; 650 goto notdir;
659 651
660 cachefiles_begin_secure(cache, &saved_cred); 652 cachefiles_begin_secure(cache, &saved_cred);
661 ret = cachefiles_check_in_use(cache, dir, args); 653 ret = cachefiles_check_in_use(cache, path.dentry, args);
662 cachefiles_end_secure(cache, saved_cred); 654 cachefiles_end_secure(cache, saved_cred);
663 655
664 dput(dir); 656 path_put(&path);
665 //_leave(" = %d", ret); 657 //_leave(" = %d", ret);
666 return ret; 658 return ret;
667 659
668notdir: 660notdir:
669 dput(dir); 661 path_put(&path);
670 kerror("inuse command requires dirfd to be a directory"); 662 kerror("inuse command requires dirfd to be a directory");
671 return -ENOTDIR; 663 return -ENOTDIR;
672 664
@@ -683,6 +675,10 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
683 unsigned fnr, unsigned bnr) 675 unsigned fnr, unsigned bnr)
684{ 676{
685 struct kstatfs stats; 677 struct kstatfs stats;
678 struct path path = {
679 .mnt = cache->mnt,
680 .dentry = cache->mnt->mnt_root,
681 };
686 int ret; 682 int ret;
687 683
688 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u", 684 //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
@@ -697,7 +693,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache,
697 /* find out how many pages of blockdev are available */ 693 /* find out how many pages of blockdev are available */
698 memset(&stats, 0, sizeof(stats)); 694 memset(&stats, 0, sizeof(stats));
699 695
700 ret = vfs_statfs(cache->mnt->mnt_root, &stats); 696 ret = vfs_statfs(&path, &stats);
701 if (ret < 0) { 697 if (ret < 0) {
702 if (ret == -EIO) 698 if (ret == -EIO)
703 cachefiles_io_error(cache, "statfs failed"); 699 cachefiles_io_error(cache, "statfs failed");
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226da..bd6bc1bde2d7 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do { \
267#define dbgprintk(FMT, ...) \ 267#define dbgprintk(FMT, ...) \
268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) 268 printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
269 269
270/* make sure we maintain the format strings, even when debugging is disabled */
271static inline void _dbprintk(const char *fmt, ...)
272 __attribute__((format(printf, 1, 2)));
273static inline void _dbprintk(const char *fmt, ...)
274{
275}
276
277#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 270#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
278#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 271#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
279#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) 272#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do { \
304} while (0) 297} while (0)
305 298
306#else 299#else
307#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) 300#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
308#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) 301#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
309#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__) 302#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
310#endif 303#endif
311 304
312#if 1 /* defined(__KDEBUGALL) */ 305#if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42c..42c7fafc8bfe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
37 37
38 printk(KERN_ERR "%sobject: OBJ%x\n", 38 printk(KERN_ERR "%sobject: OBJ%x\n",
39 prefix, object->fscache.debug_id); 39 prefix, object->fscache.debug_id);
40 printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, fscache_object_states[object->fscache.state],
42 object->fscache.flags, object->fscache.work.flags, 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, 43 object->fscache.events,
44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); 44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
212 212
213 /* if the object we're waiting for is queued for processing, 213 /* if the object we're waiting for is queued for processing,
214 * then just put ourselves on the queue behind it */ 214 * then just put ourselves on the queue behind it */
215 if (slow_work_is_queued(&xobject->fscache.work)) { 215 if (work_pending(&xobject->fscache.work)) {
216 _debug("queue OBJ%x behind OBJ%x immediately", 216 _debug("queue OBJ%x behind OBJ%x immediately",
217 object->fscache.debug_id, 217 object->fscache.debug_id,
218 xobject->fscache.debug_id); 218 xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
220 } 220 }
221 221
222 /* otherwise we sleep until either the object we're waiting for 222 /* otherwise we sleep until either the object we're waiting for
223 * is done, or the slow-work facility wants the thread back to 223 * is done, or the fscache_object is congested */
224 * do other work */
225 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); 224 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
226 init_wait(&wait); 225 init_wait(&wait);
227 requeue = false; 226 requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
229 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); 228 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
230 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) 229 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
231 break; 230 break;
232 requeue = slow_work_sleep_till_thread_needed( 231
233 &object->fscache.work, &timeout); 232 requeue = fscache_object_sleep_till_congested(&timeout);
234 } while (timeout > 0 && !requeue); 233 } while (timeout > 0 && !requeue);
235 finish_wait(wq, &wait); 234 finish_wait(wq, &wait);
236 235
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03f..0e3c0924cc3a 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
423 423
424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
425 op->op.flags |= FSCACHE_OP_FAST; 425 op->op.flags |= FSCACHE_OP_ASYNC;
426 op->op.processor = cachefiles_read_copier; 426 op->op.processor = cachefiles_read_copier;
427 427
428 pagevec_init(&pagevec, 0); 428 pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
729 pagevec_init(&pagevec, 0); 729 pagevec_init(&pagevec, 0);
730 730
731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS; 731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
732 op->op.flags |= FSCACHE_OP_FAST; 732 op->op.flags |= FSCACHE_OP_ASYNC;
733 op->op.processor = cachefiles_read_copier; 733 op->op.processor = cachefiles_read_copier;
734 734
735 INIT_LIST_HEAD(&backpages); 735 INIT_LIST_HEAD(&backpages);
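
The cachefiles hunks swap the hand-rolled read_lock()/dget() sequence for get_fs_pwd(), which hands back a pinned struct path, and pass a struct path (mount plus dentry) to the reworked vfs_statfs(). An illustrative fragment, not a standalone module, combining the two patterns as they appear above; sketch_statfs_cwd is a hypothetical name:

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/fs_struct.h>
#include <linux/path.h>
#include <linux/statfs.h>

static int sketch_statfs_cwd(struct kstatfs *stats)
{
	struct path path;
	int ret;

	get_fs_pwd(current->fs, &path);	/* pins dentry and vfsmount */
	ret = vfs_statfs(&path, stats);	/* new struct-path calling style */
	path_put(&path);		/* drops both references */
	return ret;
}

A single path_put() replaces the bare dput() the old code used, since the path now also holds a vfsmount reference.
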
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 04b8280582a9..0fcd2640c23f 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,7 +2,8 @@ config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C 4 select LIBCRC32C
5 select CONFIG_CRYPTO_AES 5 select CRYPTO_AES
6 select CRYPTO
6 help 7 help
7 Choose Y or M here to include support for mounting the 8 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely 9 experimental Ceph distributed file system. Ceph is an extremely
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be8..278e1172600d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),)
6 6
7obj-$(CONFIG_CEPH_FS) += ceph.o 7obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \ 12 mds_client.o mdsmap.o \
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..efbc604001c8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
87 87
88 /* dirty the head */ 88 /* dirty the head */
89 spin_lock(&inode->i_lock); 89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0) 90 if (ci->i_head_snapc == NULL)
91 ci->i_head_snapc = ceph_get_snap_context(snapc); 91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head; 92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0) 93 if (ci->i_wrbuffer_ref == 0)
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
105 spin_lock_irq(&mapping->tree_lock); 105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */ 106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page)); 107 WARN_ON_ONCE(!PageUptodate(page));
108 108 account_page_dirtied(page, page->mapping);
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree, 109 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 110 page_index(page), PAGECACHE_TAG_DIRTY);
117 111
@@ -274,7 +268,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 269 int rc = 0;
276 struct page **pages; 270 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 271 loff_t offset;
279 u64 len; 272 u64 len;
280 273
@@ -297,8 +290,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 290 if (rc < 0)
298 goto out; 291 goto out;
299 292
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 293 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 294 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 295 struct page *page =
@@ -312,7 +303,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 303 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 304 }
314 305
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 306 if (add_to_page_cache_lru(page, mapping, page->index,
307 GFP_NOFS)) {
316 page_cache_release(page); 308 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 309 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 310 inode, page);
@@ -323,10 +315,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 315 flush_dcache_page(page);
324 SetPageUptodate(page); 316 SetPageUptodate(page);
325 unlock_page(page); 317 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 318 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 319 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 320 rc = 0;
331 321
332out: 322out:
@@ -356,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
356 break; 346 break;
357 } 347 }
358 } 348 }
359 if (!snapc && ci->i_head_snapc) { 349 if (!snapc && ci->i_wrbuffer_ref_head) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc); 350 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n", 351 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head); 352 snapc, ci->i_wrbuffer_ref_head);
@@ -421,8 +411,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
421 if (i_size < page_off + len) 411 if (i_size < page_off + len)
422 len = i_size - page_off; 412 len = i_size - page_off;
423 413
424 dout("writepage %p page %p index %lu on %llu~%u\n", 414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
425 inode, page, page->index, page_off, len); 415 inode, page, page->index, page_off, len, snapc);
426 416
427 writeback_stat = atomic_long_inc_return(&client->writeback_count); 417 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat > 418 if (writeback_stat >
@@ -557,7 +547,7 @@ static void writepages_finish(struct ceph_osd_request *req,
557 * page truncation thread, possibly losing some data that 547 * page truncation thread, possibly losing some data that
558 * raced its way in 548 * raced its way in
559 */ 549 */
560 if ((issued & CEPH_CAP_FILE_CACHE) == 0) 550 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
561 generic_error_remove_page(inode->i_mapping, page); 551 generic_error_remove_page(inode->i_mapping, page);
562 552
563 unlock_page(page); 553 unlock_page(page);
@@ -568,7 +558,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 558 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 559 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 560 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 561 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 562 else
573 kfree(req->r_pages); 563 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 564 ceph_osdc_put_request(req);
@@ -776,7 +766,8 @@ get_more_pages:
776 /* ok */ 766 /* ok */
777 if (locked_pages == 0) { 767 if (locked_pages == 0) {
778 /* prepare async write request */ 768 /* prepare async write request */
779 offset = page->index << PAGE_CACHE_SHIFT; 769 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT;
780 len = wsize; 771 len = wsize;
781 req = ceph_osdc_new_request(&client->osdc, 772 req = ceph_osdc_new_request(&client->osdc,
782 &ci->i_layout, 773 &ci->i_layout,
@@ -802,9 +793,12 @@ get_more_pages:
802 dout("%p will write page %p idx %lu\n", 793 dout("%p will write page %p idx %lu\n",
803 inode, page, page->index); 794 inode, page, page->index);
804 795
805 writeback_stat = atomic_long_inc_return(&client->writeback_count); 796 writeback_stat =
806 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { 797 atomic_long_inc_return(&client->writeback_count);
807 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 798 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info,
801 BLK_RW_ASYNC);
808 } 802 }
809 803
810 set_page_writeback(page); 804 set_page_writeback(page);
@@ -1041,7 +1035,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1041 *pagep = page; 1035 *pagep = page;
1042 1036
1043 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1037 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1044 inode, page, (int)pos, (int)len); 1038 inode, page, (int)pos, (int)len);
1045 1039
1046 r = ceph_update_writeable_page(file, pos, len, page); 1040 r = ceph_update_writeable_page(file, pos, len, page);
1047 } while (r == -EAGAIN); 1041 } while (r == -EAGAIN);
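
ceph_readpages() above now uses add_to_page_cache_lru(), which inserts the page and places it on the LRU in one call, so the pagevec batching (pagevec_init/pagevec_add/pagevec_lru_add_file) drops out. An illustrative per-page helper, a fragment rather than a standalone module, in the shape of the new loop; sketch_insert_page is a hypothetical name and the data is assumed to be in the page already:

#include <linux/pagemap.h>
#include <linux/highmem.h>

static int sketch_insert_page(struct address_space *mapping,
			      struct page *page)
{
	int err;

	/* Inserts into the page cache *and* onto the LRU in one step. */
	err = add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS);
	if (err) {
		page_cache_release(page);	/* insertion failed, drop ref */
		return err;
	}
	flush_dcache_page(page);
	SetPageUptodate(page);
	unlock_page(page);
	page_cache_release(page);	/* cache and LRU keep their own refs */
	return err;
}
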
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
index 67b2c030924b..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/fs/ceph/armor.c
@@ -1,11 +1,15 @@
1 1
2#include <linux/errno.h> 2#include <linux/errno.h>
3 3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
4/* 7/*
5 * base64 encode/decode. 8 * base64 encode/decode.
6 */ 9 */
7 10
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9 13
10static int encode_bits(int c) 14static int encode_bits(int c)
11{ 15{
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..6d2e30600627 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -21,7 +20,7 @@ static u32 supported_protocols[] = {
21 CEPH_AUTH_CEPHX 20 CEPH_AUTH_CEPHX
22}; 21};
23 22
24int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) 23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
25{ 24{
26 switch (protocol) { 25 switch (protocol) {
27 case CEPH_AUTH_NONE: 26 case CEPH_AUTH_NONE:
@@ -134,8 +133,8 @@ bad:
134 return -ERANGE; 133 return -ERANGE;
135} 134}
136 135
137int ceph_build_auth_request(struct ceph_auth_client *ac, 136static int ceph_build_auth_request(struct ceph_auth_client *ac,
138 void *msg_buf, size_t msg_len) 137 void *msg_buf, size_t msg_len)
139{ 138{
140 struct ceph_mon_request_header *monhdr = msg_buf; 139 struct ceph_mon_request_header *monhdr = msg_buf;
141 void *p = monhdr + 1; 140 void *p = monhdr + 1;
@@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 149
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 151 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret; 154 return ret;
155 } 155 }
156 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 231 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 233 return ret;
234 } 234 }
235 return 0; 235 return 0;
@@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
246 if (!ac->protocol) 246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0; 251 return 0;
252} 252}
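
The behavioral change in ceph_build_auth() above is the switch from `!is_authenticated` to `should_authenticate`: a client can be fully authenticated yet still want to rebuild an auth request, e.g. to renew aging tickets. A self-contained sketch of that split, with simplified stand-ins for the kernel structs (the ticket_ttl field and the 30-second threshold are invented):

#include <stdio.h>

struct auth_client;

struct auth_ops {
	const char *name;
	int (*is_authenticated)(struct auth_client *ac);
	int (*should_authenticate)(struct auth_client *ac);
};

struct auth_client {
	const struct auth_ops *ops;
	int ticket_ttl;		/* seconds of ticket validity left */
};

static int x_is_authenticated(struct auth_client *ac)
{
	return ac->ticket_ttl > 0;
}

/* Renew early: still authenticated, but the ticket is getting old. */
static int x_should_authenticate(struct auth_client *ac)
{
	return ac->ticket_ttl < 30;
}

static const struct auth_ops x_ops = {
	.name = "x",
	.is_authenticated = x_is_authenticated,
	.should_authenticate = x_should_authenticate,
};

int main(void)
{
	struct auth_client ac = { .ops = &x_ops, .ticket_ttl = 10 };

	/* Authenticated, yet a new auth request would still be built. */
	printf("authenticated=%d should=%d\n",
	       ac.ops->is_authenticated(&ac),
	       ac.ops->should_authenticate(&ac));
	return 0;
}
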
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
36 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 101}
95 102
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
97 .reset = reset, 105 .reset = reset,
98 .destroy = destroy, 106 .destroy = destroy,
99 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
100 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -76,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret,
76/* 87/*
77 * get existing (or insert new) ticket handler 88 * get existing (or insert new) ticket handler
78 */ 89 */
79struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, 90static struct ceph_x_ticket_handler *
80 int service) 91get_ticket_handler(struct ceph_auth_client *ac, int service)
81{ 92{
82 struct ceph_x_ticket_handler *th; 93 struct ceph_x_ticket_handler *th;
83 struct ceph_x_info *xi = ac->private; 94 struct ceph_x_info *xi = ac->private;
@@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 138 int ret;
128 char *dbuf; 139 char *dbuf;
129 char *ticket_buf; 140 char *ticket_buf;
130 u8 struct_v; 141 u8 reply_struct_v;
131 142
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 144 if (!dbuf)
@@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 150 goto out_dbuf;
140 151
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 153 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 154 if (reply_struct_v != 1)
144 goto bad; 155 goto bad;
145 num = ceph_decode_32(&p); 156 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 157 dout("%d tickets\n", num);
147 while (num--) { 158 while (num--) {
148 int type; 159 int type;
149 u8 struct_v; 160 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 161 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 162 void *dp, *dend;
152 int dlen; 163 int dlen;
@@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 176 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 178
168 struct_v = ceph_decode_8(&p); 179 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 180 if (tkt_struct_v != 1)
170 goto bad; 181 goto bad;
171 182
172 th = get_ticket_handler(ac, type); 183 th = get_ticket_handler(ac, type);
@@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 197 dend = dbuf + dlen;
187 dp = dbuf; 198 dp = dbuf;
188 199
189 struct_v = ceph_decode_8(&dp); 200 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 201 if (tkt_struct_v != 1)
191 goto bad; 202 goto bad;
192 203
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 204 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 235 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 236 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 238 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 239 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 241 if (ret)
@@ -365,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
365 376
366 th = get_ticket_handler(ac, service); 377 th = get_ticket_handler(ac, service);
367 378
368 if (!th) { 379 if (IS_ERR(th)) {
369 *pneed |= service; 380 *pneed |= service;
370 continue; 381 continue;
371 } 382 }
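
The `!th` to `IS_ERR(th)` conversions in these hunks follow from get_ticket_handler() now reporting failure as an ERR_PTR-encoded errno rather than NULL. A freestanding sketch of that idiom, reimplemented for userspace (the kernel's versions live in <linux/err.h>; get_handler() and its -ENOMEM failure are invented for the demo):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static void *ERR_PTR(long error)
{
	return (void *)error;
}

static long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct ticket_handler { int service; };

static struct ticket_handler handlers[4];

/* Invented lookup: reports failure as -ENOMEM instead of NULL. */
static struct ticket_handler *get_handler(int service)
{
	if (service < 0 || service >= 4)
		return ERR_PTR(-ENOMEM);
	handlers[service].service = service;
	return &handlers[service];
}

int main(void)
{
	struct ticket_handler *th = get_handler(-1);

	/* one pointer carries either a valid object or an errno */
	if (IS_ERR(th))
		printf("error %ld\n", PTR_ERR(th));	/* prints error -12 */
	return 0;
}
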
@@ -388,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
388 struct ceph_x_ticket_handler *th = 399 struct ceph_x_ticket_handler *th =
389 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 400 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
390 401
402 if (IS_ERR(th))
403 return PTR_ERR(th);
404
391 ceph_x_validate_tickets(ac, &need); 405 ceph_x_validate_tickets(ac, &need);
392 406
393 dout("build_request want %x have %x need %x\n", 407 dout("build_request want %x have %x need %x\n",
@@ -418,7 +432,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
418 auth->struct_v = 1; 432 auth->struct_v = 1;
419 auth->key = 0; 433 auth->key = 0;
420 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 434 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
421 auth->key ^= *u; 435 auth->key ^= *(__le64 *)u;
422 dout(" server_challenge %llx client_challenge %llx key %llx\n", 436 dout(" server_challenge %llx client_challenge %llx key %llx\n",
423 xi->server_challenge, le64_to_cpu(auth->client_challenge), 437 xi->server_challenge, le64_to_cpu(auth->client_challenge),
424 le64_to_cpu(auth->key)); 438 le64_to_cpu(auth->key));
@@ -439,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
439 return -ERANGE; 453 return -ERANGE;
440 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); 454 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
441 455
442 BUG_ON(!th);
443 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); 456 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
444 if (ret) 457 if (ret)
445 return ret; 458 return ret;
@@ -482,7 +495,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
482 return -EAGAIN; 495 return -EAGAIN;
483 } 496 }
484 497
485 op = le32_to_cpu(head->op); 498 op = le16_to_cpu(head->op);
486 result = le32_to_cpu(head->result); 499 result = le32_to_cpu(head->result);
487 dout("handle_reply op %d result %d\n", op, result); 500 dout("handle_reply op %d result %d\n", op, result);
488 switch (op) { 501 switch (op) {
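
The `le16_to_cpu(head->op)` fix above matters on big-endian hosts: head->op is a 16-bit little-endian field (it is written with cpu_to_le16() elsewhere in this diff), so decoding it with le32_to_cpu() pulls in two extra bytes and corrupts the opcode. A portable sketch of the difference; unlike the kernel macros, these invented helpers take a pointer into the wire buffer:

#include <stdint.h>
#include <stdio.h>

/* Read a 16-bit little-endian field from a wire buffer. */
static uint16_t get_le16(const void *p)
{
	const uint8_t *b = p;
	return (uint16_t)(b[0] | (b[1] << 8));
}

/* Read a 32-bit little-endian field from a wire buffer. */
static uint32_t get_le32(const void *p)
{
	const uint8_t *b = p;
	return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
	       ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(void)
{
	/* a 16-bit op (0x0102) followed by unrelated header bytes */
	const uint8_t wire[4] = { 0x02, 0x01, 0xaa, 0xbb };

	printf("16-bit read: 0x%x\n", get_le16(wire));	/* 0x102 */
	printf("32-bit read: 0x%x\n", get_le32(wire));	/* 0xbbaa0102: wrong */
	return 0;
}
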
@@ -494,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
494 507
495 case CEPHX_GET_PRINCIPAL_SESSION_KEY: 508 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
496 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 509 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
497 BUG_ON(!th); 510 if (IS_ERR(th))
511 return PTR_ERR(th);
498 ret = ceph_x_proc_ticket_reply(ac, &th->session_key, 512 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
499 buf + sizeof(*head), end); 513 buf + sizeof(*head), end);
500 break; 514 break;
@@ -552,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
552 void *end = p + sizeof(au->reply_buf); 566 void *end = p + sizeof(au->reply_buf);
553 567
554 th = get_ticket_handler(ac, au->service); 568 th = get_ticket_handler(ac, au->service);
555 if (!th) 569 if (IS_ERR(th))
556 return -EIO; /* hrm! */ 570 return PTR_ERR(th);
557 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); 571 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
558 if (ret < 0) 572 if (ret < 0)
559 return ret; 573 return ret;
@@ -602,6 +616,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
602 remove_ticket_handler(ac, th); 616 remove_ticket_handler(ac, th);
603 } 617 }
604 618
619 if (xi->auth_authorizer.buf)
620 ceph_buffer_put(xi->auth_authorizer.buf);
621
605 kfree(ac->private); 622 kfree(ac->private);
606 ac->private = NULL; 623 ac->private = NULL;
607} 624}
@@ -612,13 +629,15 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
612 struct ceph_x_ticket_handler *th; 629 struct ceph_x_ticket_handler *th;
613 630
614 th = get_ticket_handler(ac, peer_type); 631 th = get_ticket_handler(ac, peer_type);
615 if (th && !IS_ERR(th)) 632 if (!IS_ERR(th))
616 remove_ticket_handler(ac, th); 633 remove_ticket_handler(ac, th);
617} 634}
618 635
619 636
620static const struct ceph_auth_client_ops ceph_x_ops = { 637static const struct ceph_auth_client_ops ceph_x_ops = {
638 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 639 .is_authenticated = ceph_x_is_authenticated,
640 .should_authenticate = ceph_x_should_authenticate,
622 .build_request = ceph_x_build_request, 641 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 642 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer, 643 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index c67535d70aa6..cd39f17021de 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref)
47 kfree(b); 47 kfree(b);
48} 48}
49 49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{ 51{
68 size_t len; 52 size_t len;
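
The ceph_buffer_alloc() removed above (it moved elsewhere in the tree) illustrates a common kernel pattern: quietly try the cheap physically-contiguous allocator first, fall back to the virtually-contiguous one for large buffers, and remember which path succeeded so the matching free can be used. A rough userspace analogue with malloc() and mmap() standing in for kmalloc() and __vmalloc(); the struct and function names are invented:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

struct buffer {
	void *base;
	size_t len;
	int is_mmap;	/* stands in for ceph_buffer's is_vmalloc flag */
};

static int buffer_alloc(struct buffer *b, size_t len)
{
	b->base = malloc(len);	/* cheap contiguous path first */
	b->is_mmap = 0;
	if (!b->base) {
		b->base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (b->base == MAP_FAILED) {
			b->base = NULL;
			return -1;	/* -ENOMEM in the original */
		}
		b->is_mmap = 1;
	}
	b->len = len;
	return 0;
}

static void buffer_free(struct buffer *b)
{
	if (b->is_mmap)		/* release with the allocator that succeeded */
		munmap(b->base, b->len);
	else
		free(b->base);
}

int main(void)
{
	struct buffer b;

	if (buffer_alloc(&b, 4096) == 0) {
		memset(b.base, 0, b.len);
		printf("allocated %zu bytes (mmap=%d)\n", b.len, b.is_mmap);
		buffer_free(&b);
	}
	return 0;
}
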
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..5e9da996a151 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps)
113 return cap_str[i]; 113 return cap_str[i];
114} 114}
115 115
116/* 116void ceph_caps_init(struct ceph_mds_client *mdsc)
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{ 117{
137 INIT_LIST_HEAD(&caps_list); 118 INIT_LIST_HEAD(&mdsc->caps_list);
138 spin_lock_init(&caps_list_lock); 119 spin_lock_init(&mdsc->caps_list_lock);
139} 120}
140 121
141void ceph_caps_finalize(void) 122void ceph_caps_finalize(struct ceph_mds_client *mdsc)
142{ 123{
143 struct ceph_cap *cap; 124 struct ceph_cap *cap;
144 125
145 spin_lock(&caps_list_lock); 126 spin_lock(&mdsc->caps_list_lock);
146 while (!list_empty(&caps_list)) { 127 while (!list_empty(&mdsc->caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 128 cap = list_first_entry(&mdsc->caps_list,
129 struct ceph_cap, caps_item);
148 list_del(&cap->caps_item); 130 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap); 131 kmem_cache_free(ceph_cap_cachep, cap);
150 } 132 }
151 caps_total_count = 0; 133 mdsc->caps_total_count = 0;
152 caps_avail_count = 0; 134 mdsc->caps_avail_count = 0;
153 caps_use_count = 0; 135 mdsc->caps_use_count = 0;
154 caps_reserve_count = 0; 136 mdsc->caps_reserve_count = 0;
155 caps_min_count = 0; 137 mdsc->caps_min_count = 0;
156 spin_unlock(&caps_list_lock); 138 spin_unlock(&mdsc->caps_list_lock);
157} 139}
158 140
159void ceph_adjust_min_caps(int delta) 141void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
160{ 142{
161 spin_lock(&caps_list_lock); 143 spin_lock(&mdsc->caps_list_lock);
162 caps_min_count += delta; 144 mdsc->caps_min_count += delta;
163 BUG_ON(caps_min_count < 0); 145 BUG_ON(mdsc->caps_min_count < 0);
164 spin_unlock(&caps_list_lock); 146 spin_unlock(&mdsc->caps_list_lock);
165} 147}
166 148
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) 149int ceph_reserve_caps(struct ceph_mds_client *mdsc,
150 struct ceph_cap_reservation *ctx, int need)
168{ 151{
169 int i; 152 int i;
170 struct ceph_cap *cap; 153 struct ceph_cap *cap;
@@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
176 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177 160
178 /* first reserve any caps that are already allocated */ 161 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock); 162 spin_lock(&mdsc->caps_list_lock);
180 if (caps_avail_count >= need) 163 if (mdsc->caps_avail_count >= need)
181 have = need; 164 have = need;
182 else 165 else
183 have = caps_avail_count; 166 have = mdsc->caps_avail_count;
184 caps_avail_count -= have; 167 mdsc->caps_avail_count -= have;
185 caps_reserve_count += have; 168 mdsc->caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 169 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
187 caps_avail_count); 170 mdsc->caps_reserve_count +
188 spin_unlock(&caps_list_lock); 171 mdsc->caps_avail_count);
172 spin_unlock(&mdsc->caps_list_lock);
189 173
190 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
198 } 182 }
199 BUG_ON(have + alloc != need); 183 BUG_ON(have + alloc != need);
200 184
201 spin_lock(&caps_list_lock); 185 spin_lock(&mdsc->caps_list_lock);
202 caps_total_count += alloc; 186 mdsc->caps_total_count += alloc;
203 caps_reserve_count += alloc; 187 mdsc->caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list); 188 list_splice(&newcaps, &mdsc->caps_list);
205 189
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 190 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
207 caps_avail_count); 191 mdsc->caps_reserve_count +
208 spin_unlock(&caps_list_lock); 192 mdsc->caps_avail_count);
193 spin_unlock(&mdsc->caps_list_lock);
209 194
210 ctx->count = need; 195 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 196 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count, 197 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
213 caps_avail_count); 198 mdsc->caps_reserve_count, mdsc->caps_avail_count);
214 return 0; 199 return 0;
215 200
216out_alloc_count: 201out_alloc_count:
@@ -220,92 +205,104 @@ out_alloc_count:
220 return ret; 205 return ret;
221} 206}
222 207
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) 208int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
209 struct ceph_cap_reservation *ctx)
224{ 210{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 211 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) { 212 if (ctx->count) {
227 spin_lock(&caps_list_lock); 213 spin_lock(&mdsc->caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count); 214 BUG_ON(mdsc->caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count; 215 mdsc->caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count; 216 mdsc->caps_avail_count += ctx->count;
231 ctx->count = 0; 217 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 218 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count, 219 mdsc->caps_total_count, mdsc->caps_use_count,
234 caps_avail_count); 220 mdsc->caps_reserve_count, mdsc->caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 221 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
236 caps_avail_count); 222 mdsc->caps_reserve_count +
237 spin_unlock(&caps_list_lock); 223 mdsc->caps_avail_count);
224 spin_unlock(&mdsc->caps_list_lock);
238 } 225 }
239 return 0; 226 return 0;
240} 227}
241 228
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) 229static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
230 struct ceph_cap_reservation *ctx)
243{ 231{
244 struct ceph_cap *cap = NULL; 232 struct ceph_cap *cap = NULL;
245 233
246 /* temporary, until we do something about cap import/export */ 234 /* temporary, until we do something about cap import/export */
247 if (!ctx) 235 if (!ctx) {
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 236 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
237 if (cap) {
238 mdsc->caps_use_count++;
239 mdsc->caps_total_count++;
240 }
241 return cap;
242 }
249 243
250 spin_lock(&caps_list_lock); 244 spin_lock(&mdsc->caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 245 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count, 246 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
253 caps_reserve_count, caps_avail_count); 247 mdsc->caps_reserve_count, mdsc->caps_avail_count);
254 BUG_ON(!ctx->count); 248 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count); 249 BUG_ON(ctx->count > mdsc->caps_reserve_count);
256 BUG_ON(list_empty(&caps_list)); 250 BUG_ON(list_empty(&mdsc->caps_list));
257 251
258 ctx->count--; 252 ctx->count--;
259 caps_reserve_count--; 253 mdsc->caps_reserve_count--;
260 caps_use_count++; 254 mdsc->caps_use_count++;
261 255
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 256 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item); 257 list_del(&cap->caps_item);
264 258
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 259 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
266 caps_avail_count); 260 mdsc->caps_reserve_count + mdsc->caps_avail_count);
267 spin_unlock(&caps_list_lock); 261 spin_unlock(&mdsc->caps_list_lock);
268 return cap; 262 return cap;
269} 263}
270 264
271void ceph_put_cap(struct ceph_cap *cap) 265void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
272{ 266{
273 spin_lock(&caps_list_lock); 267 spin_lock(&mdsc->caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 268 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count, 269 cap, mdsc->caps_total_count, mdsc->caps_use_count,
276 caps_reserve_count, caps_avail_count); 270 mdsc->caps_reserve_count, mdsc->caps_avail_count);
277 caps_use_count--; 271 mdsc->caps_use_count--;
278 /* 272 /*
279 * Keep some preallocated caps around (ceph_min_count), to 273 * Keep some preallocated caps around (ceph_min_count), to
280 * avoid lots of free/alloc churn. 274 * avoid lots of free/alloc churn.
281 */ 275 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) { 276 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
283 caps_total_count--; 277 mdsc->caps_min_count) {
278 mdsc->caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap); 279 kmem_cache_free(ceph_cap_cachep, cap);
285 } else { 280 } else {
286 caps_avail_count++; 281 mdsc->caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list); 282 list_add(&cap->caps_item, &mdsc->caps_list);
288 } 283 }
289 284
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 285 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
291 caps_avail_count); 286 mdsc->caps_reserve_count + mdsc->caps_avail_count);
292 spin_unlock(&caps_list_lock); 287 spin_unlock(&mdsc->caps_list_lock);
293} 288}
294 289
295void ceph_reservation_status(struct ceph_client *client, 290void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved, 291 int *total, int *avail, int *used, int *reserved,
297 int *min) 292 int *min)
298{ 293{
294 struct ceph_mds_client *mdsc = &client->mdsc;
295
299 if (total) 296 if (total)
300 *total = caps_total_count; 297 *total = mdsc->caps_total_count;
301 if (avail) 298 if (avail)
302 *avail = caps_avail_count; 299 *avail = mdsc->caps_avail_count;
303 if (used) 300 if (used)
304 *used = caps_use_count; 301 *used = mdsc->caps_use_count;
305 if (reserved) 302 if (reserved)
306 *reserved = caps_reserve_count; 303 *reserved = mdsc->caps_reserve_count;
307 if (min) 304 if (min)
308 *min = caps_min_count; 305 *min = mdsc->caps_min_count;
309} 306}
310 307
311/* 308/*
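
The caps.c hunks above all serve one refactor: the preallocated-cap pool and its lock stop being file-scope globals and become fields of the owning ceph_mds_client, so independent mounts no longer share (or contend on) a single pool. A compact pthread sketch of the shape of that change; the field names loosely mirror the diff, the types are simplified:

#include <pthread.h>
#include <stdio.h>

struct cap_pool {
	pthread_mutex_t lock;		/* was the global caps_list_lock */
	int total, in_use, reserved, avail, min;
};

/* Every former global access now takes the owning pool explicitly. */
static void pool_init(struct cap_pool *p)
{
	pthread_mutex_init(&p->lock, NULL);
	p->total = p->in_use = p->reserved = p->avail = p->min = 0;
}

static void pool_adjust_min(struct cap_pool *p, int delta)
{
	pthread_mutex_lock(&p->lock);
	p->min += delta;		/* mirrors ceph_adjust_min_caps() */
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct cap_pool a, b;		/* two clients, two independent pools */

	pool_init(&a);
	pool_init(&b);
	pool_adjust_min(&a, 100);
	printf("a.min=%d b.min=%d\n", a.min, b.min);	/* 100 0 */
	return 0;
}
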
@@ -330,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
330 return NULL; 327 return NULL;
331} 328}
332 329
330struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
331{
332 struct ceph_cap *cap;
333
334 spin_lock(&ci->vfs_inode.i_lock);
335 cap = __get_cap_for_mds(ci, mds);
336 spin_unlock(&ci->vfs_inode.i_lock);
337 return cap;
338}
339
333/* 340/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else 341 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
335 * -1.
336 */ 342 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) 343static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
338{ 344{
339 struct ceph_cap *cap; 345 struct ceph_cap *cap;
340 int mds = -1; 346 int mds = -1;
341 struct rb_node *p; 347 struct rb_node *p;
342 348
343 /* prefer mds with WR|WRBUFFER|EXCL caps */ 349 /* prefer mds with WR|BUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node); 351 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds; 352 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR | 353 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER | 354 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL)) 355 CEPH_CAP_FILE_EXCL))
@@ -358,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode)
358{ 362{
359 int mds; 363 int mds;
360 spin_lock(&inode->i_lock); 364 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); 365 mds = __ceph_get_cap_mds(ceph_inode(inode));
362 spin_unlock(&inode->i_lock); 366 spin_unlock(&inode->i_lock);
363 return mds; 367 return mds;
364} 368}
@@ -477,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
477 * Each time we receive FILE_CACHE anew, we increment 481 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen. 482 * i_rdcache_gen.
479 */ 483 */
480 if ((issued & CEPH_CAP_FILE_CACHE) && 484 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0) 485 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
482 ci->i_rdcache_gen++; 486 ci->i_rdcache_gen++;
483 487
484 /* 488 /*
@@ -537,7 +541,7 @@ retry:
537 new_cap = NULL; 541 new_cap = NULL;
538 } else { 542 } else {
539 spin_unlock(&inode->i_lock); 543 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
541 if (new_cap == NULL) 545 if (new_cap == NULL)
542 return -ENOMEM; 546 return -ENOMEM;
543 goto retry; 547 goto retry;
@@ -582,6 +586,7 @@ retry:
582 } else { 586 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 587 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino); 588 realmino);
589 WARN_ON(!realm);
585 } 590 }
586 } 591 }
587 592
@@ -621,7 +626,7 @@ retry:
621 if (fmode >= 0) 626 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode); 627 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock); 628 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq); 629 wake_up_all(&ci->i_cap_wq);
625 return 0; 630 return 0;
626} 631}
627 632
@@ -809,7 +814,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
809 used |= CEPH_CAP_PIN; 814 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref) 815 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD; 816 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen) 817 if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
813 used |= CEPH_CAP_FILE_CACHE; 818 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref) 819 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR; 820 used |= CEPH_CAP_FILE_WR;
@@ -825,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{ 830{
826 int want = 0; 831 int want = 0;
827 int mode; 832 int mode;
828 for (mode = 0; mode < 4; mode++) 833 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
829 if (ci->i_nr_by_mode[mode]) 834 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode); 835 want |= ceph_caps_for_mode(mode);
831 return want; 836 return want;
@@ -867,7 +872,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 872{
868 struct ceph_mds_session *session = cap->session; 873 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 874 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 875 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 877 int removed = 0;
872 878
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -894,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
894 ci->i_auth_cap = NULL; 900 ci->i_auth_cap = NULL;
895 901
896 if (removed) 902 if (removed)
897 ceph_put_cap(cap); 903 ceph_put_cap(mdsc, cap);
898 904
899 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 905 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
900 struct ceph_snap_realm *realm = ci->i_snap_realm; 906 struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -937,9 +943,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 943 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 944 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 945
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 946 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 947 if (!msg)
942 return PTR_ERR(msg); 948 return -ENOMEM;
943 949
944 msg->hdr.tid = cpu_to_le64(flush_tid); 950 msg->hdr.tid = cpu_to_le64(flush_tid);
945 951
@@ -980,6 +986,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
980 return 0; 986 return 0;
981} 987}
982 988
989static void __queue_cap_release(struct ceph_mds_session *session,
990 u64 ino, u64 cap_id, u32 migrate_seq,
991 u32 issue_seq)
992{
993 struct ceph_msg *msg;
994 struct ceph_mds_cap_release *head;
995 struct ceph_mds_cap_item *item;
996
997 spin_lock(&session->s_cap_lock);
998 BUG_ON(!session->s_num_cap_releases);
999 msg = list_first_entry(&session->s_cap_releases,
1000 struct ceph_msg, list_head);
1001
1002 dout(" adding %llx release to mds%d msg %p (%d left)\n",
1003 ino, session->s_mds, msg, session->s_num_cap_releases);
1004
1005 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1006 head = msg->front.iov_base;
1007 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1008 item = msg->front.iov_base + msg->front.iov_len;
1009 item->ino = cpu_to_le64(ino);
1010 item->cap_id = cpu_to_le64(cap_id);
1011 item->migrate_seq = cpu_to_le32(migrate_seq);
1012 item->seq = cpu_to_le32(issue_seq);
1013
1014 session->s_num_cap_releases--;
1015
1016 msg->front.iov_len += sizeof(*item);
1017 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1018 dout(" release msg %p full\n", msg);
1019 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1020 } else {
1021 dout(" release msg %p at %d/%d (%d)\n", msg,
1022 (int)le32_to_cpu(head->num),
1023 (int)CEPH_CAPS_PER_RELEASE,
1024 (int)msg->front.iov_len);
1025 }
1026 spin_unlock(&session->s_cap_lock);
1027}
1028
983/* 1029/*
984 * Queue cap releases when an inode is dropped from our cache. Since 1030 * Queue cap releases when an inode is dropped from our cache. Since
985 * inode is about to be destroyed, there is no need for i_lock. 1031 * inode is about to be destroyed, there is no need for i_lock.
@@ -993,41 +1039,9 @@ void ceph_queue_caps_release(struct inode *inode)
993 while (p) { 1039 while (p) {
994 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1040 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
995 struct ceph_mds_session *session = cap->session; 1041 struct ceph_mds_session *session = cap->session;
996 struct ceph_msg *msg;
997 struct ceph_mds_cap_release *head;
998 struct ceph_mds_cap_item *item;
999 1042
1000 spin_lock(&session->s_cap_lock); 1043 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1001 BUG_ON(!session->s_num_cap_releases); 1044 cap->mseq, cap->issue_seq);
1002 msg = list_first_entry(&session->s_cap_releases,
1003 struct ceph_msg, list_head);
1004
1005 dout(" adding %p release to mds%d msg %p (%d left)\n",
1006 inode, session->s_mds, msg, session->s_num_cap_releases);
1007
1008 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1009 head = msg->front.iov_base;
1010 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1011 item = msg->front.iov_base + msg->front.iov_len;
1012 item->ino = cpu_to_le64(ceph_ino(inode));
1013 item->cap_id = cpu_to_le64(cap->cap_id);
1014 item->migrate_seq = cpu_to_le32(cap->mseq);
1015 item->seq = cpu_to_le32(cap->issue_seq);
1016
1017 session->s_num_cap_releases--;
1018
1019 msg->front.iov_len += sizeof(*item);
1020 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1021 dout(" release msg %p full\n", msg);
1022 list_move_tail(&msg->list_head,
1023 &session->s_cap_releases_done);
1024 } else {
1025 dout(" release msg %p at %d/%d (%d)\n", msg,
1026 (int)le32_to_cpu(head->num),
1027 (int)CEPH_CAPS_PER_RELEASE,
1028 (int)msg->front.iov_len);
1029 }
1030 spin_unlock(&session->s_cap_lock);
1031 p = rb_next(p); 1045 p = rb_next(p);
1032 __ceph_remove_cap(cap); 1046 __ceph_remove_cap(cap);
1033 } 1047 }
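
Taken together, the two hunks above factor the message-fill logic out of ceph_queue_caps_release() into __queue_cap_release(), which appends one release record to the session's current message and rotates the message onto the done list once it holds CEPH_CAPS_PER_RELEASE entries. A minimal sketch of that fill-then-rotate batching, with a plain array standing in for the msg front buffer and ITEMS_PER_MSG as an invented stand-in constant:

#include <stdio.h>

#define ITEMS_PER_MSG	4	/* stand-in for CEPH_CAPS_PER_RELEASE */

struct msg {
	int num;
	unsigned long items[ITEMS_PER_MSG];
};

/* Returns 1 when the message filled up and should rotate to "done". */
static int queue_release(struct msg *m, unsigned long ino)
{
	m->items[m->num++] = ino;
	return m->num == ITEMS_PER_MSG;
}

int main(void)
{
	struct msg m = { 0 };

	for (unsigned long ino = 1; ino <= 5; ino++) {
		if (queue_release(&m, ino)) {
			printf("msg full at %d items, moving to done list\n",
			       m.num);
			m.num = 0;	/* start filling a fresh message */
		}
	}
	printf("%d item(s) still pending\n", m.num);	/* 1 */
	return 0;
}
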
@@ -1068,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1068 gid_t gid; 1082 gid_t gid;
1069 struct ceph_mds_session *session; 1083 struct ceph_mds_session *session;
1070 u64 xattr_version = 0; 1084 u64 xattr_version = 0;
1085 struct ceph_buffer *xattr_blob = NULL;
1071 int delayed = 0; 1086 int delayed = 0;
1072 u64 flush_tid = 0; 1087 u64 flush_tid = 0;
1073 int i; 1088 int i;
@@ -1128,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1128 for (i = 0; i < CEPH_CAP_BITS; i++) 1143 for (i = 0; i < CEPH_CAP_BITS; i++)
1129 if (flushing & (1 << i)) 1144 if (flushing & (1 << i))
1130 ci->i_cap_flush_tid[i] = flush_tid; 1145 ci->i_cap_flush_tid[i] = flush_tid;
1146
1147 follows = ci->i_head_snapc->seq;
1148 } else {
1149 follows = 0;
1131 } 1150 }
1132 1151
1133 keep = cap->implemented; 1152 keep = cap->implemented;
@@ -1141,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1141 mtime = inode->i_mtime; 1160 mtime = inode->i_mtime;
1142 atime = inode->i_atime; 1161 atime = inode->i_atime;
1143 time_warp_seq = ci->i_time_warp_seq; 1162 time_warp_seq = ci->i_time_warp_seq;
1144 follows = ci->i_snap_realm->cached_context->seq;
1145 uid = inode->i_uid; 1163 uid = inode->i_uid;
1146 gid = inode->i_gid; 1164 gid = inode->i_gid;
1147 mode = inode->i_mode; 1165 mode = inode->i_mode;
1148 1166
1149 if (dropping & CEPH_CAP_XATTR_EXCL) { 1167 if (flushing & CEPH_CAP_XATTR_EXCL) {
1150 __ceph_build_xattrs_blob(ci); 1168 __ceph_build_xattrs_blob(ci);
1151 xattr_version = ci->i_xattrs.version + 1; 1169 xattr_blob = ci->i_xattrs.blob;
1170 xattr_version = ci->i_xattrs.version;
1152 } 1171 }
1153 1172
1154 spin_unlock(&inode->i_lock); 1173 spin_unlock(&inode->i_lock);
@@ -1156,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1156 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1175 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1157 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1176 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1158 size, max_size, &mtime, &atime, time_warp_seq, 1177 size, max_size, &mtime, &atime, time_warp_seq,
1159 uid, gid, mode, 1178 uid, gid, mode, xattr_version, xattr_blob,
1160 xattr_version,
1161 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1162 follows); 1179 follows);
1163 if (ret < 0) { 1180 if (ret < 0) {
1164 dout("error sending cap msg, must requeue %p\n", inode); 1181 dout("error sending cap msg, must requeue %p\n", inode);
@@ -1166,7 +1183,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1166 } 1183 }
1167 1184
1168 if (wake) 1185 if (wake)
1169 wake_up(&ci->i_cap_wq); 1186 wake_up_all(&ci->i_cap_wq);
1170 1187
1171 return delayed; 1188 return delayed;
1172} 1189}
@@ -1178,10 +1195,16 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1178 * asynchronously back to the MDS once sync writes complete and dirty 1195 * asynchronously back to the MDS once sync writes complete and dirty
1179 * data is written out. 1196 * data is written out.
1180 * 1197 *
1198 * Unless @again is true, skip cap_snaps that were already sent to
1199 * the MDS (i.e., during this session).
1200 *
1181 * Called under i_lock. Takes s_mutex as needed. 1201 * Called under i_lock. Takes s_mutex as needed.
1182 */ 1202 */
1183void __ceph_flush_snaps(struct ceph_inode_info *ci, 1203void __ceph_flush_snaps(struct ceph_inode_info *ci,
1184 struct ceph_mds_session **psession) 1204 struct ceph_mds_session **psession,
1205 int again)
1206 __releases(ci->vfs_inode->i_lock)
1207 __acquires(ci->vfs_inode->i_lock)
1185{ 1208{
1186 struct inode *inode = &ci->vfs_inode; 1209 struct inode *inode = &ci->vfs_inode;
1187 int mds; 1210 int mds;
@@ -1208,7 +1231,7 @@ retry:
1208 * pages to be written out. 1231 * pages to be written out.
1209 */ 1232 */
1210 if (capsnap->dirty_pages || capsnap->writing) 1233 if (capsnap->dirty_pages || capsnap->writing)
1211 continue; 1234 break;
1212 1235
1213 /* 1236 /*
1214 * if cap writeback already occurred, we should have dropped 1237 * if cap writeback already occurred, we should have dropped
@@ -1217,7 +1240,20 @@ retry:
1217 BUG_ON(capsnap->dirty == 0); 1240 BUG_ON(capsnap->dirty == 0);
1218 1241
1219 /* pick mds, take s_mutex */ 1242 /* pick mds, take s_mutex */
1220 mds = __ceph_get_cap_mds(ci, &mseq); 1243 if (ci->i_auth_cap == NULL) {
1244 dout("no auth cap (migrating?), doing nothing\n");
1245 goto out;
1246 }
1247
1248 /* only flush each capsnap once */
1249 if (!again && !list_empty(&capsnap->flushing_item)) {
1250 dout("already flushed %p, skipping\n", capsnap);
1251 continue;
1252 }
1253
1254 mds = ci->i_auth_cap->session->s_mds;
1255 mseq = ci->i_auth_cap->mseq;
1256
1221 if (session && session->s_mds != mds) { 1257 if (session && session->s_mds != mds) {
1222 dout("oops, wrong session %p mutex\n", session); 1258 dout("oops, wrong session %p mutex\n", session);
1223 mutex_unlock(&session->s_mutex); 1259 mutex_unlock(&session->s_mutex);
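
The "only flush each capsnap once" test above relies on a standard kernel list idiom: a node initialized with INIT_LIST_HEAD() points at itself, so !list_empty(&capsnap->flushing_item) doubles as "already queued on some list". A freestanding sketch with a minimal doubly-linked list (the real helpers are in <linux/list.h>):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h;
	h->prev = h;
}

static void list_add(struct list_head *n, struct list_head *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

int main(void)
{
	struct list_head queue, item;

	INIT_LIST_HEAD(&queue);
	INIT_LIST_HEAD(&item);
	printf("queued=%d\n", !list_empty(&item));	/* 0: not yet */
	list_add(&item, &queue);
	printf("queued=%d\n", !list_empty(&item));	/* 1: on a list */
	return 0;
}
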
@@ -1236,8 +1272,8 @@ retry:
1236 } 1272 }
1237 /* 1273 /*
1238 * if session == NULL, we raced against a cap 1274 * if session == NULL, we raced against a cap
1239 * deletion. retry, and we'll get a better 1275 * deletion or migration. retry, and we'll
1240 * @mds value next time. 1276 * get a better @mds value next time.
1241 */ 1277 */
1242 spin_lock(&inode->i_lock); 1278 spin_lock(&inode->i_lock);
1243 goto retry; 1279 goto retry;
@@ -1251,8 +1287,8 @@ retry:
1251 &session->s_cap_snaps_flushing); 1287 &session->s_cap_snaps_flushing);
1252 spin_unlock(&inode->i_lock); 1288 spin_unlock(&inode->i_lock);
1253 1289
1254 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", 1290 dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
1255 inode, capsnap, next_follows, capsnap->size); 1291 inode, capsnap, capsnap->follows, capsnap->flush_tid);
1256 send_cap_msg(session, ceph_vino(inode).ino, 0, 1292 send_cap_msg(session, ceph_vino(inode).ino, 0,
1257 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, 1293 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1258 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, 1294 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
@@ -1260,7 +1296,7 @@ retry:
1260 &capsnap->mtime, &capsnap->atime, 1296 &capsnap->mtime, &capsnap->atime,
1261 capsnap->time_warp_seq, 1297 capsnap->time_warp_seq,
1262 capsnap->uid, capsnap->gid, capsnap->mode, 1298 capsnap->uid, capsnap->gid, capsnap->mode,
1263 0, NULL, 1299 capsnap->xattr_version, capsnap->xattr_blob,
1264 capsnap->follows); 1300 capsnap->follows);
1265 1301
1266 next_follows = capsnap->follows + 1; 1302 next_follows = capsnap->follows + 1;
@@ -1275,6 +1311,7 @@ retry:
1275 list_del_init(&ci->i_snap_flush_item); 1311 list_del_init(&ci->i_snap_flush_item);
1276 spin_unlock(&mdsc->snap_flush_lock); 1312 spin_unlock(&mdsc->snap_flush_lock);
1277 1313
1314out:
1278 if (psession) 1315 if (psession)
1279 *psession = session; 1316 *psession = session;
1280 else if (session) { 1317 else if (session) {
@@ -1288,7 +1325,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1288 struct inode *inode = &ci->vfs_inode; 1325 struct inode *inode = &ci->vfs_inode;
1289 1326
1290 spin_lock(&inode->i_lock); 1327 spin_lock(&inode->i_lock);
1291 __ceph_flush_snaps(ci, NULL); 1328 __ceph_flush_snaps(ci, NULL, 0);
1292 spin_unlock(&inode->i_lock); 1329 spin_unlock(&inode->i_lock);
1293} 1330}
1294 1331
@@ -1298,7 +1335,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1335 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1337{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1338 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1340 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1341 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1342 int dirty = 0;
@@ -1308,7 +1346,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1308 ceph_cap_string(was | mask)); 1346 ceph_cap_string(was | mask));
1309 ci->i_dirty_caps |= mask; 1347 ci->i_dirty_caps |= mask;
1310 if (was == 0) { 1348 if (was == 0) {
1311 dout(" inode %p now dirty\n", &ci->vfs_inode); 1349 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
1353 ci->i_head_snapc);
1312 BUG_ON(!list_empty(&ci->i_dirty_item)); 1354 BUG_ON(!list_empty(&ci->i_dirty_item));
1313 spin_lock(&mdsc->cap_dirty_lock); 1355 spin_lock(&mdsc->cap_dirty_lock);
1314 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1336,7 +1378,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1378static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1379 struct ceph_mds_session *session)
1338{ 1380{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1382 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1383 int flushing;
1342 1384
@@ -1419,7 +1461,6 @@ static int try_nonblocking_invalidate(struct inode *inode)
1419 */ 1461 */
1420void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1462void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1421 struct ceph_mds_session *session) 1463 struct ceph_mds_session *session)
1422 __releases(session->s_mutex)
1423{ 1464{
1424 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1425 struct ceph_mds_client *mdsc = &client->mdsc; 1466 struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1447,7 +1488,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1447 1488
1448 /* flush snaps first time around only */ 1489 /* flush snaps first time around only */
1449 if (!list_empty(&ci->i_cap_snaps)) 1490 if (!list_empty(&ci->i_cap_snaps))
1450 __ceph_flush_snaps(ci, &session); 1491 __ceph_flush_snaps(ci, &session, 0);
1451 goto retry_locked; 1492 goto retry_locked;
1452retry: 1493retry:
1453 spin_lock(&inode->i_lock); 1494 spin_lock(&inode->i_lock);
@@ -1494,11 +1535,13 @@ retry_locked:
1494 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1495 ci->i_rdcache_gen && /* may have cached pages */ 1536 ci->i_rdcache_gen && /* may have cached pages */
1496 (file_wanted == 0 || /* no open files */ 1537 (file_wanted == 0 || /* no open files */
1497 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ 1538 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1498 !tried_invalidate) { 1540 !tried_invalidate) {
1499 dout("check_caps trying to invalidate on %p\n", inode); 1541 dout("check_caps trying to invalidate on %p\n", inode);
1500 if (try_nonblocking_invalidate(inode) < 0) { 1542 if (try_nonblocking_invalidate(inode) < 0) {
1501 if (revoking & CEPH_CAP_FILE_CACHE) { 1543 if (revoking & (CEPH_CAP_FILE_CACHE|
1544 CEPH_CAP_FILE_LAZYIO)) {
1502 dout("check_caps queuing invalidate\n"); 1545 dout("check_caps queuing invalidate\n");
1503 queue_invalidate = 1; 1546 queue_invalidate = 1;
1504 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1547 ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -1663,7 +1706,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1707 unsigned *flush_tid)
1665{ 1708{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1710 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1711 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1712 int flushing = 0;
@@ -1716,10 +1759,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1759static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1760{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1761 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1762 int i, ret = 1;
1720 1763
1721 spin_lock(&inode->i_lock); 1764 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1765 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1766 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1767 ci->i_cap_flush_tid[i] <= tid) {
@@ -1775,9 +1817,9 @@ out:
1775 spin_unlock(&ci->i_unsafe_lock); 1817 spin_unlock(&ci->i_unsafe_lock);
1776} 1818}
1777 1819
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1820int ceph_fsync(struct file *file, int datasync)
1779{ 1821{
1780 struct inode *inode = dentry->d_inode; 1822 struct inode *inode = file->f_mapping->host;
1781 struct ceph_inode_info *ci = ceph_inode(inode); 1823 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid; 1824 unsigned flush_tid;
1783 int ret; 1825 int ret;
@@ -1829,7 +1871,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1871 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1872 caps_are_flushed(inode, flush_tid));
1831 } else { 1873 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1874 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1876
1834 spin_lock(&inode->i_lock); 1877 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1878 if (__ceph_caps_dirty(ci))
@@ -1862,7 +1905,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1862 if (cap && cap->session == session) { 1905 if (cap && cap->session == session) {
1863 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, 1906 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1864 cap, capsnap); 1907 cap, capsnap);
1865 __ceph_flush_snaps(ci, &session); 1908 __ceph_flush_snaps(ci, &session, 1);
1866 } else { 1909 } else {
1867 pr_err("%p auth cap %p not mds%d ???\n", inode, 1910 pr_err("%p auth cap %p not mds%d ???\n", inode,
1868 cap, session->s_mds); 1911 cap, session->s_mds);
@@ -2137,7 +2180,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2137 else if (flushsnaps) 2180 else if (flushsnaps)
2138 ceph_flush_snaps(ci); 2181 ceph_flush_snaps(ci);
2139 if (wake) 2182 if (wake)
2140 wake_up(&ci->i_cap_wq); 2183 wake_up_all(&ci->i_cap_wq);
2141 if (put) 2184 if (put)
2142 iput(inode); 2185 iput(inode);
2143} 2186}
@@ -2165,7 +2208,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2165 2208
2166 if (ci->i_head_snapc == snapc) { 2209 if (ci->i_head_snapc == snapc) {
2167 ci->i_wrbuffer_ref_head -= nr; 2210 ci->i_wrbuffer_ref_head -= nr;
2168 if (!ci->i_wrbuffer_ref_head) { 2211 if (ci->i_wrbuffer_ref_head == 0 &&
2212 ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
2213 BUG_ON(!ci->i_head_snapc);
2169 ceph_put_snap_context(ci->i_head_snapc); 2214 ceph_put_snap_context(ci->i_head_snapc);
2170 ci->i_head_snapc = NULL; 2215 ci->i_head_snapc = NULL;
2171 } 2216 }
@@ -2213,7 +2258,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2213 iput(inode); 2258 iput(inode);
2214 } else if (complete_capsnap) { 2259 } else if (complete_capsnap) {
2215 ceph_flush_snaps(ci); 2260 ceph_flush_snaps(ci);
2216 wake_up(&ci->i_cap_wq); 2261 wake_up_all(&ci->i_cap_wq);
2217 } 2262 }
2218 if (drop_capsnap) 2263 if (drop_capsnap)
2219 iput(inode); 2264 iput(inode);
@@ -2234,12 +2279,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2234 struct ceph_mds_session *session, 2279 struct ceph_mds_session *session,
2235 struct ceph_cap *cap, 2280 struct ceph_cap *cap,
2236 struct ceph_buffer *xattr_buf) 2281 struct ceph_buffer *xattr_buf)
2237 __releases(inode->i_lock) 2282 __releases(inode->i_lock)
2238 __releases(session->s_mutex)
2239{ 2283{
2240 struct ceph_inode_info *ci = ceph_inode(inode); 2284 struct ceph_inode_info *ci = ceph_inode(inode);
2241 int mds = session->s_mds; 2285 int mds = session->s_mds;
2242 int seq = le32_to_cpu(grant->seq); 2286 unsigned seq = le32_to_cpu(grant->seq);
2287 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2243 int newcaps = le32_to_cpu(grant->caps); 2288 int newcaps = le32_to_cpu(grant->caps);
2244 int issued, implemented, used, wanted, dirty; 2289 int issued, implemented, used, wanted, dirty;
2245 u64 size = le64_to_cpu(grant->size); 2290 u64 size = le64_to_cpu(grant->size);
@@ -2251,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2251 int revoked_rdcache = 0; 2296 int revoked_rdcache = 0;
2252 int queue_invalidate = 0; 2297 int queue_invalidate = 0;
2253 2298
2254 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2299 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
2255 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2300 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
2256 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2301 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2257 inode->i_size); 2302 inode->i_size);
2258 2303
@@ -2262,6 +2307,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2262 * will invalidate _after_ writeback.) 2307 * will invalidate _after_ writeback.)
2263 */ 2308 */
2264 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2309 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2310 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2265 !ci->i_wrbuffer_ref) { 2311 !ci->i_wrbuffer_ref) {
2266 if (try_nonblocking_invalidate(inode) == 0) { 2312 if (try_nonblocking_invalidate(inode) == 0) {
2267 revoked_rdcache = 1; 2313 revoked_rdcache = 1;
@@ -2347,21 +2393,29 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2347 } 2393 }
2348 2394
2349 cap->seq = seq; 2395 cap->seq = seq;
2396 cap->issue_seq = issue_seq;
2350 2397
2351 /* file layout may have changed */ 2398 /* file layout may have changed */
2352 ci->i_layout = grant->layout; 2399 ci->i_layout = grant->layout;
2353 2400
2354 /* revocation, grant, or no-op? */ 2401 /* revocation, grant, or no-op? */
2355 if (cap->issued & ~newcaps) { 2402 if (cap->issued & ~newcaps) {
2356 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), 2403 int revoking = cap->issued & ~newcaps;
2357 ceph_cap_string(newcaps)); 2404
2358 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) 2405 dout("revocation: %s -> %s (revoking %s)\n",
2359 writeback = 1; /* will delay ack */ 2406 ceph_cap_string(cap->issued),
2360 else if (dirty & ~newcaps) 2407 ceph_cap_string(newcaps),
2361 check_caps = 1; /* initiate writeback in check_caps */ 2408 ceph_cap_string(revoking));
2362 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || 2409 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2363 revoked_rdcache) 2410 writeback = 1; /* initiate writeback; will delay ack */
2364 check_caps = 2; /* send revoke ack in check_caps */ 2411 else if (revoking == CEPH_CAP_FILE_CACHE &&
2412 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2413 queue_invalidate)
2414 ; /* do nothing yet, invalidation will be queued */
2415 else if (cap == ci->i_auth_cap)
2416 check_caps = 1; /* check auth cap only */
2417 else
2418 check_caps = 2; /* check all caps */
2365 cap->issued = newcaps; 2419 cap->issued = newcaps;
2366 cap->implemented |= newcaps; 2420 cap->implemented |= newcaps;
2367 } else if (cap->issued == newcaps) { 2421 } else if (cap->issued == newcaps) {
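
The rewritten revocation branch above resolves to one of four reactions: start writeback when dirty buffers are held under a revoked cap, do nothing when an invalidate is already queued, or acknowledge via a cap check (auth cap only, or all caps). A compressed sketch of that decision as a pure function; the CAP_* constants and the enum are invented stand-ins for the CEPH_CAP_* bits:

#include <stdio.h>

#define CAP_CACHE	0x1
#define CAP_BUFFER	0x2
#define CAP_LAZYIO	0x4

enum action { WRITEBACK, WAIT_INVALIDATE, CHECK_AUTH, CHECK_ALL };

static enum action on_revoke(int revoking, int used, int newcaps,
			     int queued_invalidate, int is_auth_cap)
{
	if (revoking & used & CAP_BUFFER)
		return WRITEBACK;	/* flush dirty buffers, delay the ack */
	if (revoking == CAP_CACHE && !(newcaps & CAP_LAZYIO) &&
	    queued_invalidate)
		return WAIT_INVALIDATE;	/* invalidation already queued */
	return is_auth_cap ? CHECK_AUTH : CHECK_ALL;
}

int main(void)
{
	static const char * const names[] = {
		"writeback", "wait-invalidate", "check-auth", "check-all"
	};

	/* dirty buffers under a revoked BUFFER cap -> writeback first */
	printf("%s\n", names[on_revoke(CAP_BUFFER, CAP_BUFFER, 0, 0, 1)]);
	return 0;
}
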
@@ -2389,7 +2443,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2389 if (queue_invalidate) 2443 if (queue_invalidate)
2390 ceph_queue_invalidate(inode); 2444 ceph_queue_invalidate(inode);
2391 if (wake) 2445 if (wake)
2392 wake_up(&ci->i_cap_wq); 2446 wake_up_all(&ci->i_cap_wq);
2393 2447
2394 if (check_caps == 1) 2448 if (check_caps == 1)
2395 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, 2449 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
@@ -2411,7 +2465,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2465 __releases(inode->i_lock)
2412{ 2466{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2467 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2469 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2470 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2471 int cleaned = 0;
@@ -2444,19 +2498,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2444 struct ceph_inode_info, 2498 struct ceph_inode_info,
2445 i_flushing_item)->vfs_inode); 2499 i_flushing_item)->vfs_inode);
2446 mdsc->num_cap_flushing--; 2500 mdsc->num_cap_flushing--;
2447 wake_up(&mdsc->cap_flushing_wq); 2501 wake_up_all(&mdsc->cap_flushing_wq);
2448 dout(" inode %p now !flushing\n", inode); 2502 dout(" inode %p now !flushing\n", inode);
2449 2503
2450 if (ci->i_dirty_caps == 0) { 2504 if (ci->i_dirty_caps == 0) {
2451 dout(" inode %p now clean\n", inode); 2505 dout(" inode %p now clean\n", inode);
2452 BUG_ON(!list_empty(&ci->i_dirty_item)); 2506 BUG_ON(!list_empty(&ci->i_dirty_item));
2453 drop = 1; 2507 drop = 1;
2508 if (ci->i_wrbuffer_ref_head == 0) {
2509 BUG_ON(!ci->i_head_snapc);
2510 ceph_put_snap_context(ci->i_head_snapc);
2511 ci->i_head_snapc = NULL;
2512 }
2454 } else { 2513 } else {
2455 BUG_ON(list_empty(&ci->i_dirty_item)); 2514 BUG_ON(list_empty(&ci->i_dirty_item));
2456 } 2515 }
2457 } 2516 }
2458 spin_unlock(&mdsc->cap_dirty_lock); 2517 spin_unlock(&mdsc->cap_dirty_lock);
2459 wake_up(&ci->i_cap_wq); 2518 wake_up_all(&ci->i_cap_wq);
2460 2519
2461out: 2520out:
2462 spin_unlock(&inode->i_lock); 2521 spin_unlock(&inode->i_lock);
@@ -2552,7 +2611,8 @@ static void handle_cap_trunc(struct inode *inode,
2552 * caller holds s_mutex 2611 * caller holds s_mutex
2553 */ 2612 */
2554static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2613static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2555 struct ceph_mds_session *session) 2614 struct ceph_mds_session *session,
2615 int *open_target_sessions)
2556{ 2616{
2557 struct ceph_inode_info *ci = ceph_inode(inode); 2617 struct ceph_inode_info *ci = ceph_inode(inode);
2558 int mds = session->s_mds; 2618 int mds = session->s_mds;
@@ -2584,6 +2644,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2584 ci->i_cap_exporting_mds = mds; 2644 ci->i_cap_exporting_mds = mds;
2585 ci->i_cap_exporting_mseq = mseq; 2645 ci->i_cap_exporting_mseq = mseq;
2586 ci->i_cap_exporting_issued = cap->issued; 2646 ci->i_cap_exporting_issued = cap->issued;
2647
2648 /*
2649 * make sure we have open sessions with all possible
2650 * export targets, so that we get the matching IMPORT
2651 */
2652 *open_target_sessions = 1;
2587 } 2653 }
2588 __ceph_remove_cap(cap); 2654 __ceph_remove_cap(cap);
2589 } 2655 }
@@ -2653,12 +2719,16 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2653 struct ceph_mds_caps *h; 2719 struct ceph_mds_caps *h;
2654 int mds = session->s_mds; 2720 int mds = session->s_mds;
2655 int op; 2721 int op;
2656 u32 seq; 2722 u32 seq, mseq;
2657 struct ceph_vino vino; 2723 struct ceph_vino vino;
2658 u64 cap_id; 2724 u64 cap_id;
2659 u64 size, max_size; 2725 u64 size, max_size;
2660 u64 tid; 2726 u64 tid;
2661 void *snaptrace; 2727 void *snaptrace;
2728 size_t snaptrace_len;
2729 void *flock;
2730 u32 flock_len;
2731 int open_target_sessions = 0;
2662 2732
2663 dout("handle_caps from mds%d\n", mds); 2733 dout("handle_caps from mds%d\n", mds);
2664 2734
@@ -2667,15 +2737,30 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2667 if (msg->front.iov_len < sizeof(*h)) 2737 if (msg->front.iov_len < sizeof(*h))
2668 goto bad; 2738 goto bad;
2669 h = msg->front.iov_base; 2739 h = msg->front.iov_base;
2670 snaptrace = h + 1;
2671 op = le32_to_cpu(h->op); 2740 op = le32_to_cpu(h->op);
2672 vino.ino = le64_to_cpu(h->ino); 2741 vino.ino = le64_to_cpu(h->ino);
2673 vino.snap = CEPH_NOSNAP; 2742 vino.snap = CEPH_NOSNAP;
2674 cap_id = le64_to_cpu(h->cap_id); 2743 cap_id = le64_to_cpu(h->cap_id);
2675 seq = le32_to_cpu(h->seq); 2744 seq = le32_to_cpu(h->seq);
2745 mseq = le32_to_cpu(h->migrate_seq);
2676 size = le64_to_cpu(h->size); 2746 size = le64_to_cpu(h->size);
2677 max_size = le64_to_cpu(h->max_size); 2747 max_size = le64_to_cpu(h->max_size);
2678 2748
2749 snaptrace = h + 1;
2750 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2751
2752 if (le16_to_cpu(msg->hdr.version) >= 2) {
2753 void *p, *end;
2754
2755 p = snaptrace + snaptrace_len;
2756 end = msg->front.iov_base + msg->front.iov_len;
2757 ceph_decode_32_safe(&p, end, flock_len, bad);
2758 flock = p;
2759 } else {
2760 flock = NULL;
2761 flock_len = 0;
2762 }
2763
2679 mutex_lock(&session->s_mutex); 2764 mutex_lock(&session->s_mutex);
2680 session->s_seq++; 2765 session->s_seq++;
2681 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2766 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
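Note on the decode hunk above: version 2 of the cap message appends an optional flock blob after the snap trace, so the handler must bounds-check the new length field before trusting it (ceph_decode_32_safe bails to the "bad" label when the message is short). A minimal user-space sketch of that decode pattern, with hypothetical helper names and a little-endian host assumed:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Refuse to read a 32-bit length unless the buffer still has room
     * for it, then bound the payload that length describes. */
    static int decode_32_safe(const uint8_t **p, const uint8_t *end,
                              uint32_t *v)
    {
            if (end - *p < 4)
                    return -1;              /* would run off the message */
            memcpy(v, *p, sizeof(*v));      /* models le32_to_cpu on LE hosts */
            *p += sizeof(*v);
            return 0;
    }

    int main(void)
    {
            uint8_t msg[16] = { 0 };
            const uint8_t *p = msg, *end = msg + sizeof(msg);
            uint32_t flock_len;

            p += 8;                         /* skip a pretend snap trace */
            if (decode_32_safe(&p, end, &flock_len) < 0 ||
                (size_t)(end - p) < flock_len) {
                    fprintf(stderr, "bad cap message\n");
                    return 1;
            }
            printf("flock blob: %u bytes\n", flock_len);
            return 0;
    }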
@@ -2687,7 +2772,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2687 vino.snap, inode); 2772 vino.snap, inode);
2688 if (!inode) { 2773 if (!inode) {
2689 dout(" i don't have ino %llx\n", vino.ino); 2774 dout(" i don't have ino %llx\n", vino.ino);
2690 goto done; 2775
2776 if (op == CEPH_CAP_OP_IMPORT)
2777 __queue_cap_release(session, vino.ino, cap_id,
2778 mseq, seq);
2779 goto flush_cap_releases;
2691 } 2780 }
2692 2781
2693 /* these will work even if we don't have a cap yet */ 2782 /* these will work even if we don't have a cap yet */
@@ -2697,12 +2786,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2697 goto done; 2786 goto done;
2698 2787
2699 case CEPH_CAP_OP_EXPORT: 2788 case CEPH_CAP_OP_EXPORT:
2700 handle_cap_export(inode, h, session); 2789 handle_cap_export(inode, h, session, &open_target_sessions);
2701 goto done; 2790 goto done;
2702 2791
2703 case CEPH_CAP_OP_IMPORT: 2792 case CEPH_CAP_OP_IMPORT:
2704 handle_cap_import(mdsc, inode, h, session, 2793 handle_cap_import(mdsc, inode, h, session,
2705 snaptrace, le32_to_cpu(h->snap_trace_len)); 2794 snaptrace, snaptrace_len);
2706 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2795 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2707 session); 2796 session);
2708 goto done_unlocked; 2797 goto done_unlocked;
@@ -2712,10 +2801,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2712 spin_lock(&inode->i_lock); 2801 spin_lock(&inode->i_lock);
2713 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2802 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2714 if (!cap) { 2803 if (!cap) {
2715 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", 2804 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2716 inode, ceph_ino(inode), ceph_snap(inode), mds); 2805 inode, ceph_ino(inode), ceph_snap(inode), mds);
2717 spin_unlock(&inode->i_lock); 2806 spin_unlock(&inode->i_lock);
2718 goto done; 2807 goto flush_cap_releases;
2719 } 2808 }
2720 2809
2721 /* note that each of these drops i_lock for us */ 2810 /* note that each of these drops i_lock for us */
@@ -2739,11 +2828,24 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2739 ceph_cap_op_name(op)); 2828 ceph_cap_op_name(op));
2740 } 2829 }
2741 2830
2831 goto done;
2832
2833flush_cap_releases:
2834 /*
2835 * send any full release message to try to move things
2836 * along for the mds (who clearly thinks we still have this
2837 * cap).
2838 */
2839 ceph_add_cap_releases(mdsc, session);
2840 ceph_send_cap_releases(mdsc, session);
2841
2742done: 2842done:
2743 mutex_unlock(&session->s_mutex); 2843 mutex_unlock(&session->s_mutex);
2744done_unlocked: 2844done_unlocked:
2745 if (inode) 2845 if (inode)
2746 iput(inode); 2846 iput(inode);
2847 if (open_target_sessions)
2848 ceph_mdsc_open_export_target_sessions(mdsc, session);
2747 return; 2849 return;
2748 2850
2749bad: 2851bad:
@@ -2863,18 +2965,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
2863 struct ceph_inode_info *ci = ceph_inode(inode); 2965 struct ceph_inode_info *ci = ceph_inode(inode);
2864 struct ceph_cap *cap; 2966 struct ceph_cap *cap;
2865 struct ceph_mds_request_release *rel = *p; 2967 struct ceph_mds_request_release *rel = *p;
2968 int used, dirty;
2866 int ret = 0; 2969 int ret = 0;
2867 int used = 0;
2868 2970
2869 spin_lock(&inode->i_lock); 2971 spin_lock(&inode->i_lock);
2870 used = __ceph_caps_used(ci); 2972 used = __ceph_caps_used(ci);
2973 dirty = __ceph_caps_dirty(ci);
2871 2974
2872 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, 2975 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
2873 mds, ceph_cap_string(used), ceph_cap_string(drop), 2976 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
2874 ceph_cap_string(unless)); 2977 ceph_cap_string(unless));
2875 2978
2876 /* only drop unused caps */ 2979 /* only drop unused, clean caps */
2877 drop &= ~used; 2980 drop &= ~(used | dirty);
2878 2981
2879 cap = __get_cap_for_mds(ci, mds); 2982 cap = __get_cap_for_mds(ci, mds);
2880 if (cap && __cap_is_valid(cap)) { 2983 if (cap && __cap_is_valid(cap)) {
@@ -2954,6 +3057,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2954 memcpy(*p, dentry->d_name.name, dentry->d_name.len); 3057 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2955 *p += dentry->d_name.len; 3058 *p += dentry->d_name.len;
2956 rel->dname_seq = cpu_to_le32(di->lease_seq); 3059 rel->dname_seq = cpu_to_le32(di->lease_seq);
3060 __ceph_mdsc_drop_dentry_lease(dentry);
2957 } 3061 }
2958 spin_unlock(&dentry->d_lock); 3062 spin_unlock(&dentry->d_lock);
2959 return ret; 3063 return ret;
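Note on the ceph_encode_inode_release() hunk above: a cap may now be dropped only if it is neither in active use nor dirty, so the drop mask is filtered by both sets instead of just "used". A tiny illustrative calculation (the cap bit values are invented for the example):

    #include <stdio.h>

    int main(void)
    {
            unsigned drop  = 0x30;  /* caps the MDS asked us to drop */
            unsigned used  = 0x10;  /* caps backing open files */
            unsigned dirty = 0x20;  /* caps with unflushed metadata */

            unsigned before = drop & ~used;            /* 0x20: drops a dirty cap */
            unsigned after  = drop & ~(used | dirty);  /* 0x00: keeps it */

            printf("old mask 0x%x, new mask 0x%x\n", before, after);
            return 0;
    }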
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
index 793f50cb7c22..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/fs/ceph/ceph_frag.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_FRAG_H 1#ifndef FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H 2#define FS_CEPH_FRAG_H
3 3
4/* 4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space, 5 * "Frags" are a way to describe a subset of a 32-bit number space,
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 79d76bc4303f..3ac6cc7c1156 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
29 29
30int ceph_flags_to_mode(int flags) 30int ceph_flags_to_mode(int flags)
31{ 31{
32 int mode;
33
32#ifdef O_DIRECTORY /* fixme */ 34#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY) 35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN; 36 return CEPH_FILE_MODE_PIN;
35#endif 37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
36#ifdef O_LAZY 48#ifdef O_LAZY
37 if (flags & O_LAZY) 49 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY; 50 mode |= CEPH_FILE_MODE_LAZY;
39#endif 51#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42 52
43 flags &= O_ACCMODE; 53 return mode;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49} 54}
50 55
51int ceph_caps_for_mode(int mode) 56int ceph_caps_for_mode(int mode)
52{ 57{
53 switch (mode) { 58 int caps = CEPH_CAP_PIN;
54 case CEPH_FILE_MODE_PIN: 59
55 return CEPH_CAP_PIN; 60 if (mode & CEPH_FILE_MODE_RD)
56 case CEPH_FILE_MODE_RD: 61 caps |= CEPH_CAP_FILE_SHARED |
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; 62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR: 63 if (mode & CEPH_FILE_MODE_WR)
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | 64 caps |= CEPH_CAP_FILE_EXCL |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | 65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | 66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; 67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 } 68 if (mode & CEPH_FILE_MODE_LAZY)
73 return 0; 69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
74} 72}
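The rewrite of ceph_flags_to_mode()/ceph_caps_for_mode() above turns the file modes into composable bit flags: RDWR is literally RD|WR, and LAZY stacks on top instead of being its own switch case, which is what lets ceph_caps_for_mode() OR in capability groups per bit. A user-space sketch of the open-flag mapping; the WR and LAZY values are assumptions, since only PIN=0 and RD=1 are visible in this patch:

    #include <fcntl.h>
    #include <stdio.h>

    #define MODE_PIN  0
    #define MODE_RD   1
    #define MODE_WR   2     /* assumed value */
    #define MODE_LAZY 4     /* assumed value */

    static int flags_to_mode(int flags)
    {
            int mode;

            if ((flags & O_APPEND) == O_APPEND)
                    flags |= O_WRONLY;      /* append implies write intent */

            if ((flags & O_ACCMODE) == O_RDWR)
                    mode = MODE_RD | MODE_WR;
            else if ((flags & O_ACCMODE) == O_WRONLY)
                    mode = MODE_WR;
            else
                    mode = MODE_RD;

            return mode;
    }

    int main(void)
    {
            printf("O_RDWR   -> mode %d\n", flags_to_mode(O_RDWR));   /* 3 */
            printf("O_APPEND -> mode %d\n", flags_to_mode(O_APPEND)); /* 2 */
            return 0;
    }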
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..d5619ac86711 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -9,34 +9,20 @@
9 * LGPL2 9 * LGPL2
10 */ 10 */
11 11
12#ifndef _FS_CEPH_CEPH_FS_H 12#ifndef CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H 13#define CEPH_FS_H
14 14
15#include "msgr.h" 15#include "msgr.h"
16#include "rados.h" 16#include "rados.h"
17 17
18/* 18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level 19 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev 20 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
36 * client-facing protocol. 22 * client-facing protocol.
37 */ 23 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 24#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 25#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 26#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 27#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 28#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +39,10 @@
53/* 39/*
54 * feature bits 40 * feature bits
55 */ 41 */
56#define CEPH_FEATURE_SUPPORTED 0 42#define CEPH_FEATURE_UID (1<<0)
57#define CEPH_FEATURE_REQUIRED 0 43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3)
58 46
59 47
60/* 48/*
@@ -86,11 +74,15 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
86#define CEPH_CRYPTO_NONE 0x0 74#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1 75#define CEPH_CRYPTO_AES 0x1
88 76
77#define CEPH_AES_IV "cephsageyudagreg"
78
89/* security/authentication protocols */ 79/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0 80#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1 81#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 82#define CEPH_AUTH_CEPHX 0x2
93 83
84#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
85
94 86
95/********************************************* 87/*********************************************
96 * message layer 88 * message layer
@@ -128,11 +120,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 120#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 121#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 122
123/* pool ops */
124#define CEPH_MSG_POOLOP_REPLY 48
125#define CEPH_MSG_POOLOP 49
126
127
131/* osd */ 128/* osd */
132#define CEPH_MSG_OSD_MAP 41 129#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 130#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 131#define CEPH_MSG_OSD_OPREPLY 43
135 132
133/* pool operations */
134enum {
135 POOL_OP_CREATE = 0x01,
136 POOL_OP_DELETE = 0x02,
137 POOL_OP_AUID_CHANGE = 0x03,
138 POOL_OP_CREATE_SNAP = 0x11,
139 POOL_OP_DELETE_SNAP = 0x12,
140 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
141 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
142};
143
136struct ceph_mon_request_header { 144struct ceph_mon_request_header {
137 __le64 have_version; 145 __le64 have_version;
138 __le16 session_mon; 146 __le16 session_mon;
@@ -155,6 +163,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 163 struct ceph_statfs st;
156} __attribute__ ((packed)); 164} __attribute__ ((packed));
157 165
166const char *ceph_pool_op_name(int op);
167
168struct ceph_mon_poolop {
169 struct ceph_mon_request_header monhdr;
170 struct ceph_fsid fsid;
171 __le32 pool;
172 __le32 op;
173 __le64 auid;
174 __le64 snapid;
175 __le32 name_len;
176} __attribute__ ((packed));
177
178struct ceph_mon_poolop_reply {
179 struct ceph_mon_request_header monhdr;
180 struct ceph_fsid fsid;
181 __le32 reply_code;
182 __le32 epoch;
183 char has_data;
184 char data[0];
185} __attribute__ ((packed));
186
187struct ceph_mon_unmanaged_snap {
188 __le64 snapid;
189} __attribute__ ((packed));
190
158struct ceph_osd_getmap { 191struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 192 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 193 struct ceph_fsid fsid;
@@ -212,16 +245,18 @@ extern const char *ceph_mds_state_name(int s);
212 * - they also define the lock ordering by the MDS 245 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds 246 * - a few of these are internal to the mds
214 */ 247 */
215#define CEPH_LOCK_DN 1 248#define CEPH_LOCK_DVERSION 1
216#define CEPH_LOCK_ISNAP 2 249#define CEPH_LOCK_DN 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */ 250#define CEPH_LOCK_ISNAP 16
218#define CEPH_LOCK_IFILE 8 /* mds internal */ 251#define CEPH_LOCK_IVERSION 32 /* mds internal */
219#define CEPH_LOCK_IAUTH 32 252#define CEPH_LOCK_IFILE 64
220#define CEPH_LOCK_ILINK 64 253#define CEPH_LOCK_IAUTH 128
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 254#define CEPH_LOCK_ILINK 256
222#define CEPH_LOCK_INEST 256 /* mds internal */ 255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
223#define CEPH_LOCK_IXATTR 512 256#define CEPH_LOCK_INEST 1024 /* mds internal */
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
225 260
226/* client_session ops */ 261/* client_session ops */
227enum { 262enum {
@@ -262,6 +297,8 @@ enum {
262 CEPH_MDS_OP_RMXATTR = 0x01106, 297 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107, 298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
265 302
266 CEPH_MDS_OP_MKNOD = 0x01201, 303 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202, 304 CEPH_MDS_OP_LINK = 0x01202,
@@ -308,6 +345,7 @@ union ceph_mds_request_args {
308 struct { 345 struct {
309 __le32 frag; /* which dir fragment */ 346 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 347 __le32 max_entries; /* how many dentries to grab */
348 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 349 } __attribute__ ((packed)) readdir;
312 struct { 350 struct {
313 __le32 mode; 351 __le32 mode;
@@ -331,6 +369,15 @@ union ceph_mds_request_args {
331 struct { 369 struct {
332 struct ceph_file_layout layout; 370 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout; 371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
 374 __u8 type; /* shared, exclusive, remove */
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
334} __attribute__ ((packed)); 381} __attribute__ ((packed));
335 382
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ 383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
@@ -425,6 +472,23 @@ struct ceph_mds_reply_dirfrag {
425 __le32 dist[]; 472 __le32 dist[];
426} __attribute__ ((packed)); 473} __attribute__ ((packed));
427 474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
 483 __le64 start; /* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
428/* file access modes */ 492/* file access modes */
429#define CEPH_FILE_MODE_PIN 0 493#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1 494#define CEPH_FILE_MODE_RD 1
@@ -453,9 +517,10 @@ int ceph_flags_to_mode(int flags);
453#define CEPH_CAP_SAUTH 2 517#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4 518#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6 519#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ 520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
457 522
458#define CEPH_CAP_BITS 16 523#define CEPH_CAP_BITS 22
459 524
460/* composed values */ 525/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
@@ -473,6 +538,9 @@ int ceph_flags_to_mode(int flags);
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) 538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) 539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) 540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
476 544
477/* cap masks (for getattr) */ 545/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN 546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
@@ -508,7 +576,8 @@ int ceph_flags_to_mode(int flags);
508 CEPH_CAP_FILE_EXCL) 576 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) 577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ 578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) 579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
512 581
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ 582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR) 583 CEPH_LOCK_IXATTR)
@@ -598,12 +667,21 @@ struct ceph_mds_cap_reconnect {
598 __le64 cap_id; 667 __le64 cap_id;
599 __le32 wanted; 668 __le32 wanted;
600 __le32 issued; 669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
601 __le64 size; 680 __le64 size;
602 struct ceph_timespec mtime, atime; 681 struct ceph_timespec mtime, atime;
603 __le64 snaprealm; 682 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */ 683 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed)); 684} __attribute__ ((packed));
606/* followed by encoded string */
607 685
608struct ceph_mds_snaprealm_reconnect { 686struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */ 687 __le64 ino; /* snap realm base */
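The ceph_fs.h changes above renumber the MDS lock and cap spaces to make room for advisory file locks: CEPH_CAP_SFLOCK sits at bit 20, so the two flock generic bits push the cap field from 16 to 22 bits. The arithmetic, with the generic shared/excl bit values taken as assumptions (they are not shown in this hunk):

    #include <stdio.h>

    #define CEPH_CAP_GSHARED 1      /* assumed */
    #define CEPH_CAP_GEXCL   2      /* assumed */
    #define CEPH_CAP_SFLOCK  20     /* from the patch */

    int main(void)
    {
            unsigned flock_shared = CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK;
            unsigned flock_excl   = CEPH_CAP_GEXCL   << CEPH_CAP_SFLOCK;

            /* 0x100000 and 0x200000: bit 21 is now the highest cap bit,
             * hence CEPH_CAP_BITS grows from 16 to 22. */
            printf("FLOCK_SHARED=0x%x FLOCK_EXCL=0x%x\n",
                   flock_shared, flock_excl);
            return 0;
    }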
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
index 5ac470c433c9..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/fs/ceph/ceph_hash.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_HASH_H 1#ifndef FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H 2#define FS_CEPH_HASH_H
3 3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ 4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ 5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..c6179d3a26a2 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -29,6 +28,7 @@ const char *ceph_osd_op_name(int op)
29 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero"; 29 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull"; 30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
32 32
33 case CEPH_OSD_OP_APPEND: return "append"; 33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync"; 34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
@@ -45,6 +45,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 49
49 case CEPH_OSD_OP_PULL: return "pull"; 50 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 51 case CEPH_OSD_OP_PUSH: return "push";
@@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op)
129 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 130 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 131 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 132 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
133 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
134 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
132 } 135 }
133 return "???"; 136 return "???";
134} 137}
@@ -174,3 +177,17 @@ const char *ceph_snap_op_name(int o)
174 } 177 }
175 return "???"; 178 return "???";
176} 179}
180
181const char *ceph_pool_op_name(int op)
182{
183 switch (op) {
184 case POOL_OP_CREATE: return "create";
185 case POOL_OP_DELETE: return "delete";
186 case POOL_OP_AUID_CHANGE: return "auid change";
187 case POOL_OP_CREATE_SNAP: return "create snap";
188 case POOL_OP_DELETE_SNAP: return "delete snap";
189 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
190 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
191 }
192 return "???";
193}
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index dcd7e7523700..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_CRUSH_H 1#ifndef CEPH_CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H 2#define CEPH_CRUSH_CRUSH_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index ff48e110e4bb..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_HASH_H 1#ifndef CEPH_CRUSH_HASH_H
2#define _CRUSH_HASH_H 2#define CEPH_CRUSH_HASH_H
3 3
4#define CRUSH_HASH_RJENKINS1 0 4#define CRUSH_HASH_RJENKINS1 0
5 5
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
238 238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{ 240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r); 241 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) { 242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM: 243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
264 */ 264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{ 266{
267 if (weight[item] >= 0x1000) 267 if (weight[item] >= 0x10000)
268 return 0; 268 return 0;
269 if (weight[item] == 0) 269 if (weight[item] == 0)
270 return 1; 270 return 1;
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
305 int itemtype; 305 int itemtype;
306 int collide, reject; 306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */ 307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); 308
309 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
310 bucket->id, x, outpos, numrep);
309 311
310 for (rep = outpos; rep < numrep; rep++) { 312 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */ 313 /* keep trying until we get a non-out, non-colliding item */
@@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map,
366 BUG_ON(item >= 0 || 368 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets); 369 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item]; 370 in = map->buckets[-1-item];
371 retry_bucket = 1;
369 continue; 372 continue;
370 } 373 }
371 374
@@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map,
377 } 380 }
378 } 381 }
379 382
380 if (recurse_to_leaf && 383 reject = 0;
381 item < 0 && 384 if (recurse_to_leaf) {
382 crush_choose(map, map->buckets[-1-item], 385 if (item < 0) {
383 weight, 386 if (crush_choose(map,
384 x, outpos+1, 0, 387 map->buckets[-1-item],
385 out2, outpos, 388 weight,
386 firstn, 0, NULL) <= outpos) { 389 x, outpos+1, 0,
387 reject = 1; 390 out2, outpos,
388 } else { 391 firstn, 0,
392 NULL) <= outpos)
393 /* didn't get leaf */
394 reject = 1;
395 } else {
396 /* we already have a leaf! */
397 out2[outpos] = item;
398 }
399 }
400
401 if (!reject) {
389 /* out? */ 402 /* out? */
390 if (itemtype == 0) 403 if (itemtype == 0)
391 reject = is_out(map, weight, 404 reject = is_out(map, weight,
@@ -424,12 +437,12 @@ reject:
424 continue; 437 continue;
425 } 438 }
426 439
427 dprintk("choose got %d\n", item); 440 dprintk("CHOOSE got %d\n", item);
428 out[outpos] = item; 441 out[outpos] = item;
429 outpos++; 442 outpos++;
430 } 443 }
431 444
432 dprintk("choose returns %d\n", outpos); 445 dprintk("CHOOSE returns %d\n", outpos);
433 return outpos; 446 return outpos;
434} 447}
435 448
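The is_out() fix above matters because CRUSH weights are 16.16 fixed-point: full weight is 0x10000, so the old 0x1000 threshold treated any item above 1/16 weight as always-in and skipped the probabilistic rejection entirely. A user-space model of the corrected check, where the hash is an arbitrary stand-in for crush_hash32_2():

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t hash32_2(uint32_t a, uint32_t b)
    {
            uint32_t h = a * 2654435761u ^ b * 2246822519u;
            return h ^ (h >> 15);   /* arbitrary mixer, not the real hash */
    }

    static int is_out(const uint32_t *weight, int item, int x)
    {
            if (weight[item] >= 0x10000)
                    return 0;               /* full weight: never out */
            if (weight[item] == 0)
                    return 1;               /* zero weight: always out */
            /* keep the item with probability weight/0x10000 */
            if ((hash32_2(x, item) & 0xffff) < weight[item])
                    return 0;
            return 1;
    }

    int main(void)
    {
            uint32_t w[] = { 0x10000, 0x8000, 0 };  /* 1.0, 0.5, 0.0 */
            for (int i = 0; i < 3; i++)
                    printf("item %d: out=%d\n", i, is_out(w, i, 42));
            return 0;
    }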
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
index 98e90046fd9f..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/fs/ceph/crush/mapper.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_MAPPER_H 1#ifndef CEPH_CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H 2#define CEPH_CRUSH_MAPPER_H
3 3
4/* 4/*
5 * CRUSH functions for finding rules and then mapping an input to an 5 * CRUSH functions for finding rules and then mapping an input to an
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index f704b3b62424..a3e627f63293 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76} 76}
77 77
78const u8 *aes_iv = "cephsageyudagreg"; 78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79 79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, 80static int ceph_aes_encrypt(const void *key, int key_len,
81 const void *src, size_t src_len) 81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
82{ 83{
83 struct scatterlist sg_in[2], sg_out[1]; 84 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
126 return 0; 127 return 0;
127} 128}
128 129
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, 130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
130 const void *src1, size_t src1_len, 131 size_t *dst_len,
131 const void *src2, size_t src2_len) 132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
132{ 134{
133 struct scatterlist sg_in[3], sg_out[1]; 135 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
179 return 0; 181 return 0;
180} 182}
181 183
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, 184static int ceph_aes_decrypt(const void *key, int key_len,
183 const void *src, size_t src_len) 185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
184{ 187{
185 struct scatterlist sg_in[1], sg_out[2]; 188 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
238 return 0; 241 return 0;
239} 242}
240 243
241int ceph_aes_decrypt2(const void *key, int key_len, 244static int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len, 245 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len, 246 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len) 247 const void *src, size_t src_len)
245{ 248{
246 struct scatterlist sg_in[1], sg_out[3]; 249 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index 40b502e6bd89..bdf38607323c 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret,
42 const void *src2, size_t src2_len); 42 const void *src2, size_t src2_len);
43 43
44/* armor.c */ 44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end); 45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end); 46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47 47
48#endif 48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
@@ -166,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
166 } else if (req->r_dentry) { 171 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 172 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0); 173 &pathbase, 0);
174 if (IS_ERR(path))
175 path = NULL;
169 spin_lock(&req->r_dentry->d_lock); 176 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)", 177 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode), 178 ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -182,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
182 if (req->r_old_dentry) { 189 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, 190 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0); 191 &pathbase, 0);
192 if (IS_ERR(path))
193 path = NULL;
185 spin_lock(&req->r_old_dentry->d_lock); 194 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)", 195 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode), 196 ceph_ino(req->r_old_dentry->d_parent->d_inode),
@@ -256,7 +265,7 @@ static int osdc_show(struct seq_file *s, void *pp)
256 265
257static int caps_show(struct seq_file *s, void *p) 266static int caps_show(struct seq_file *s, void *p)
258{ 267{
259 struct ceph_client *client = p; 268 struct ceph_client *client = s->private;
260 int total, avail, used, reserved, min; 269 int total, avail, used, reserved, min;
261 270
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
@@ -286,7 +295,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
286 return 0; 295 return 0;
287} 296}
288 297
289#define DEFINE_SHOW_FUNC(name) \ 298#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \ 299static int name##_open(struct inode *inode, struct file *file) \
291{ \ 300{ \
292 struct seq_file *sf; \ 301 struct seq_file *sf; \
@@ -356,8 +365,8 @@ int ceph_debugfs_client_init(struct ceph_client *client)
356 int ret = 0; 365 int ret = 0;
357 char name[80]; 366 char name[80];
358 367
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", 368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
360 PR_FSID(&client->fsid), client->monc.auth->global_id); 369 client->monc.auth->global_id);
361 370
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir) 372 if (!client->debugfs_dir)
@@ -427,11 +436,12 @@ int ceph_debugfs_client_init(struct ceph_client *client)
427 if (!client->debugfs_caps) 436 if (!client->debugfs_caps)
428 goto out; 437 goto out;
429 438
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 439 client->debugfs_congestion_kb =
431 0600, 440 debugfs_create_file("writeback_congestion_kb",
432 client->debugfs_dir, 441 0600,
433 client, 442 client->debugfs_dir,
434 &congestion_kb_fops); 443 client,
444 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb) 445 if (!client->debugfs_congestion_kb)
436 goto out; 446 goto out;
437 447
@@ -461,7 +471,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
461 debugfs_remove(client->debugfs_dir); 471 debugfs_remove(client->debugfs_dir);
462} 472}
463 473
464#else // CONFIG_DEBUG_FS 474#else /* CONFIG_DEBUG_FS */
465 475
466int __init ceph_debugfs_init(void) 476int __init ceph_debugfs_init(void)
467{ 477{
@@ -481,4 +491,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{ 491{
482} 492}
483 493
484#endif // CONFIG_DEBUG_FS 494#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 65b3e022eaf5..3d25415afe63 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
99 */ 99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a) 100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{ 101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family); 102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
103} 104}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a) 105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{ 106{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family); 107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); 109 WARN_ON(a->in_addr.ss_family == 512);
108} 110}
109 111
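The decode.h change above keeps the socket address family big-endian on the wire while ss_family itself stays a host-order integer; routing the conversion through a typed temporary avoids assigning a big-endian value straight into a host-order field and quiets endianness checkers such as sparse. A portable user-space equivalent of the round trip:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void encode_family(uint16_t *ss_family)
    {
            uint16_t be = htons(*ss_family);        /* to wire order */
            memcpy(ss_family, &be, sizeof(be));
    }

    static void decode_family(uint16_t *ss_family)
    {
            uint16_t be;
            memcpy(&be, ss_family, sizeof(be));
            *ss_family = ntohs(be);                 /* back to host order */
    }

    int main(void)
    {
            uint16_t fam = 2;                       /* AF_INET on Linux */
            encode_family(&fam);
            decode_family(&fam);
            printf("round-trip family = %u\n", fam); /* 2 */
            return 0;
    }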
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..a1986eb52045 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -27,7 +27,7 @@
27 27
28const struct inode_operations ceph_dir_iops; 28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops; 29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops; 30const struct dentry_operations ceph_dentry_ops;
31 31
32/* 32/*
33 * Initialize ceph dentry state. 33 * Initialize ceph dentry state.
@@ -46,13 +46,16 @@ int ceph_init_dentry(struct dentry *dentry)
46 else 46 else
47 dentry->d_op = &ceph_snap_dentry_ops; 47 dentry->d_op = &ceph_snap_dentry_ops;
48 48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); 49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
50 if (!di) 50 if (!di)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -91,6 +94,8 @@ static unsigned fpos_off(loff_t p)
91 */ 94 */
92static int __dcache_readdir(struct file *filp, 95static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir) 96 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
94{ 99{
95 struct inode *inode = filp->f_dentry->d_inode; 100 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data; 101 struct ceph_file_info *fi = filp->private_data;
@@ -125,7 +130,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 130 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 131 di = ceph_dentry(dentry);
127 while (1) { 132 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 133 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
134 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 135 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 136 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 137 fi->at_end = 1;
@@ -229,6 +235,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 235 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 236 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 237 const int max_entries = client->mount_args->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes;
232 239
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 241 if (fi->at_end)
@@ -261,6 +268,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 spin_lock(&inode->i_lock); 268 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) && 269 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) && 270 !ceph_test_opt(client, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 272 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir); 274 err = __dcache_readdir(filp, dirent, filldir);
@@ -312,6 +320,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 320 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 321 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 322 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
323 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 324 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 325 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 326 if (err < 0) {
@@ -335,7 +344,7 @@ more:
335 if (req->r_reply_info.dir_end) { 344 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 345 kfree(fi->last_name);
337 fi->last_name = NULL; 346 fi->last_name = NULL;
338 fi->next_offset = 0; 347 fi->next_offset = 2;
339 } else { 348 } else {
340 rinfo = &req->r_reply_info; 349 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 350 err = note_last_dentry(fi,
@@ -478,7 +487,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 488 struct dentry *dentry, int err)
480{ 489{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 491 struct inode *parent = dentry->d_parent->d_inode;
483 492
484 /* .snap dir? */ 493 /* .snap dir? */
@@ -568,7 +577,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 577 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 578 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 579 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 580 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 581 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 582 d_add(dentry, NULL);
@@ -582,7 +590,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 590 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 591 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req)) 592 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req)); 593 return ERR_CAST(req);
586 req->r_dentry = dget(dentry); 594 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2; 595 req->r_num_caps = 2;
588 /* we only need inode linkage */ 596 /* we only need inode linkage */
@@ -888,13 +896,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 896
889 /* ensure target dentry is invalidated, despite 897 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 898 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 899 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 900 }
894 ceph_mdsc_put_request(req); 901 ceph_mdsc_put_request(req);
895 return err; 902 return err;
896} 903}
897 904
905/*
906 * Ensure a dentry lease will no longer revalidate.
907 */
908void ceph_invalidate_dentry_lease(struct dentry *dentry)
909{
910 spin_lock(&dentry->d_lock);
911 dentry->d_time = jiffies;
912 ceph_dentry(dentry)->lease_shared_gen = 0;
913 spin_unlock(&dentry->d_lock);
914}
898 915
899/* 916/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 917 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +989,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 989{
973 struct inode *dir = dentry->d_parent->d_inode; 990 struct inode *dir = dentry->d_parent->d_inode;
974 991
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 992 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 993 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
994 ceph_dentry(dentry)->offset);
977 995
978 /* always trust cached snapped dentries, snapdir dentry */ 996 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 997 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -998,18 +1016,26 @@ out_touch:
998 1016
999/* 1017/*
1000 * When a dentry is released, clear the dir I_COMPLETE if it was part 1018 * When a dentry is released, clear the dir I_COMPLETE if it was part
1001 * of the current dir gen. 1019 * of the current dir gen or if this is in the snapshot namespace.
1002 */ 1020 */
1003static void ceph_dentry_release(struct dentry *dentry) 1021static void ceph_dentry_release(struct dentry *dentry)
1004{ 1022{
1005 struct ceph_dentry_info *di = ceph_dentry(dentry); 1023 struct ceph_dentry_info *di = ceph_dentry(dentry);
1006 struct inode *parent_inode = dentry->d_parent->d_inode; 1024 struct inode *parent_inode = NULL;
1025 u64 snapid = CEPH_NOSNAP;
1007 1026
1008 if (parent_inode) { 1027 if (!IS_ROOT(dentry)) {
1028 parent_inode = dentry->d_parent->d_inode;
1029 if (parent_inode)
1030 snapid = ceph_snap(parent_inode);
1031 }
1032 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1033 if (parent_inode && snapid != CEPH_SNAPDIR) {
1009 struct ceph_inode_info *ci = ceph_inode(parent_inode); 1034 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1010 1035
1011 spin_lock(&parent_inode->i_lock); 1036 spin_lock(&parent_inode->i_lock);
1012 if (ci->i_shared_gen == di->lease_shared_gen) { 1037 if (ci->i_shared_gen == di->lease_shared_gen ||
1038 snapid <= CEPH_MAXSNAP) {
1013 dout(" clearing %p complete (d_release)\n", 1039 dout(" clearing %p complete (d_release)\n",
1014 parent_inode); 1040 parent_inode);
1015 ci->i_ceph_flags &= ~CEPH_I_COMPLETE; 1041 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
@@ -1050,7 +1076,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1076 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1077 int left;
1052 1078
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1079 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1080 return -EISDIR;
1055 1081
1056 if (!cf->dir_info) { 1082 if (!cf->dir_info) {
@@ -1092,10 +1118,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1092 * an fsync() on a dir will wait for any uncommitted directory 1118 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit. 1119 * operations to commit.
1094 */ 1120 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1121static int ceph_dir_fsync(struct file *file, int datasync)
1096 int datasync)
1097{ 1122{
1098 struct inode *inode = dentry->d_inode; 1123 struct inode *inode = file->f_path.dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode); 1124 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops; 1125 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req; 1126 struct ceph_mds_request *req;
@@ -1152,7 +1177,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1177 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1178 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1179 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1180 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1181 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1182 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1183 mdsc->num_dentry++;
@@ -1165,10 +1190,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1190 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1191 struct ceph_mds_client *mdsc;
1167 1192
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1193 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1194 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1195 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1196 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1197 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1198 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1199 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1208,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1208 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1209 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1210 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1211 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1212 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1213 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1214 mdsc->num_dentry--;
@@ -1220,14 +1245,16 @@ const struct inode_operations ceph_dir_iops = {
1220 .create = ceph_create, 1245 .create = ceph_create,
1221}; 1246};
1222 1247
1223struct dentry_operations ceph_dentry_ops = { 1248const struct dentry_operations ceph_dentry_ops = {
1224 .d_revalidate = ceph_d_revalidate, 1249 .d_revalidate = ceph_d_revalidate,
1225 .d_release = ceph_dentry_release, 1250 .d_release = ceph_dentry_release,
1226}; 1251};
1227 1252
1228struct dentry_operations ceph_snapdir_dentry_ops = { 1253const struct dentry_operations ceph_snapdir_dentry_ops = {
1229 .d_revalidate = ceph_snapdir_d_revalidate, 1254 .d_revalidate = ceph_snapdir_d_revalidate,
1255 .d_release = ceph_dentry_release,
1230}; 1256};
1231 1257
1232struct dentry_operations ceph_snap_dentry_ops = { 1258const struct dentry_operations ceph_snap_dentry_ops = {
1259 .d_release = ceph_dentry_release,
1233}; 1260};
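The ceph_init_dentry() hunk above fixes a leak in the lost-race path: the info struct is allocated before d_lock is taken, so if another thread attached its own d_fsdata first, ours must be freed rather than dropped on the floor. A sketch of that allocate-then-recheck pattern, with user-space types standing in for the kernel ones:

    #include <pthread.h>
    #include <stdlib.h>

    struct dentry {
            pthread_mutex_t d_lock;
            void *d_fsdata;
    };

    static int init_dentry(struct dentry *d)
    {
            void *di;

            if (d->d_fsdata)
                    return 0;       /* fast path: already initialized */

            di = calloc(1, 64);     /* stands in for kmem_cache_alloc */
            if (!di)
                    return -1;

            pthread_mutex_lock(&d->d_lock);
            if (d->d_fsdata)
                    free(di);       /* lost the race: do not leak */
            else
                    d->d_fsdata = di;
            pthread_mutex_unlock(&d->d_lock);
            return 0;
    }

    int main(void)
    {
            struct dentry d = { PTHREAD_MUTEX_INITIALIZER, NULL };
            init_dentry(&d);
            free(d.d_fsdata);
            return 0;
    }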
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..e38423e82f2e 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -42,32 +42,37 @@ struct ceph_nfs_confh {
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, 42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable) 43 int connectable)
44{ 44{
45 int type;
45 struct ceph_nfs_fh *fh = (void *)rawfh; 46 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh; 47 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent; 48 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 int type; 50 int connected_handle_length = sizeof(*cfh)/4;
51 int handle_length = sizeof(*fh)/4;
50 52
51 /* don't re-export snaps */ 53 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP) 54 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL; 55 return -EINVAL;
54 56
55 if (*max_len >= sizeof(*cfh)) { 57 if (*max_len >= connected_handle_length) {
56 dout("encode_fh %p connectable\n", dentry); 58 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode); 59 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode); 60 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash; 61 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh); 62 *max_len = connected_handle_length;
61 type = 2; 63 type = 2;
62 } else if (*max_len > sizeof(*fh)) { 64 } else if (*max_len >= handle_length) {
63 if (connectable) 65 if (connectable) {
64 return -ENOSPC; 66 *max_len = connected_handle_length;
67 return 255;
68 }
65 dout("encode_fh %p\n", dentry); 69 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode); 70 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh); 71 *max_len = handle_length;
68 type = 1; 72 type = 1;
69 } else { 73 } else {
70 return -ENOSPC; 74 *max_len = handle_length;
75 return 255;
71 } 76 }
72 return type; 77 return type;
73} 78}
@@ -93,11 +98,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 98 return ERR_PTR(-ESTALE);
94 99
95 dentry = d_obtain_alias(inode); 100 dentry = d_obtain_alias(inode);
96 if (!dentry) { 101 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 102 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 103 fh->ino, inode);
99 iput(inode); 104 iput(inode);
100 return ERR_PTR(-ENOMEM); 105 return dentry;
101 } 106 }
102 err = ceph_init_dentry(dentry); 107 err = ceph_init_dentry(dentry);
103 108
@@ -115,7 +120,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 120static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 121 struct ceph_nfs_confh *cfh)
117{ 122{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 123 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 124 struct inode *inode;
120 struct dentry *dentry; 125 struct dentry *dentry;
121 struct ceph_vino vino; 126 struct ceph_vino vino;
@@ -133,7 +138,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 138 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 139 USE_ANY_MDS);
135 if (IS_ERR(req)) 140 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 141 return ERR_CAST(req);
137 142
138 req->r_ino1 = vino; 143 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 144 req->r_ino2.ino = cfh->parent_ino;
@@ -149,11 +154,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 154 }
150 155
151 dentry = d_obtain_alias(inode); 156 dentry = d_obtain_alias(inode);
152 if (!dentry) { 157 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 158 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 159 cfh->ino, inode);
155 iput(inode); 160 iput(inode);
156 return ERR_PTR(-ENOMEM); 161 return dentry;
157 } 162 }
158 err = ceph_init_dentry(dentry); 163 err = ceph_init_dentry(dentry);
159 if (err < 0) { 164 if (err < 0) {
@@ -202,11 +207,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 207 return ERR_PTR(-ESTALE);
203 208
204 dentry = d_obtain_alias(inode); 209 dentry = d_obtain_alias(inode);
205 if (!dentry) { 210 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 211 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 212 cfh->ino, inode);
208 iput(inode); 213 iput(inode);
209 return ERR_PTR(-ENOMEM); 214 return dentry;
210 } 215 }
211 err = ceph_init_dentry(dentry); 216 err = ceph_init_dentry(dentry);
212 if (err < 0) { 217 if (err < 0) {
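
The three hunks above fix the same misconception: d_obtain_alias() reports failure as an encoded errno pointer (ERR_PTR(-ENOMEM) and friends), never NULL, so the old !dentry tests could not fire and an error pointer would later have been used as a dentry. The companion ERR_PTR(PTR_ERR(req)) -> ERR_CAST(req) cleanups in this series avoid a pointless decode/re-encode round trip. For reference, a user-space model of the encoding (the kernel's real definitions live in include/linux/err.h):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline long IS_ERR(const void *ptr)
{
        /* errnos occupy the top 4095 addresses, which are never mapped */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* re-type an error pointer without decoding and re-encoding it */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

int main(void)
{
        void *dentry = ERR_PTR(-ENOMEM);

        if (IS_ERR(dentry))
                printf("err = %ld\n", PTR_ERR(dentry));    /* -12 */
        return 0;
}
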
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..66e4da6dba22 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file)
265 kmem_cache_free(ceph_file_cachep, cf); 265 kmem_cache_free(ceph_file_cachep, cf);
266 266
267 /* wake up anyone waiting for caps on this inode */ 267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq); 268 wake_up_all(&ci->i_cap_wq);
269 return 0; 269 return 0;
270} 270}
271 271
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector of new pages 318 * allocate a vector of new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -665,10 +665,10 @@ more:
665 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
666 * may block. 666 * may block.
667 */ 667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -697,7 +697,7 @@ more:
697 * start_request so that a tid has been assigned. 697 * start_request so that a tid has been assigned.
698 */ 698 */
699 spin_lock(&ci->i_unsafe_lock); 699 spin_lock(&ci->i_unsafe_lock);
700 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); 700 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
701 spin_unlock(&ci->i_unsafe_lock); 701 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 703 }
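
The one-line fix above transposes the arguments to list_add(): the signature is list_add(new, head), and because both parameters are struct list_head * the compiler cannot flag a swap — the old call spliced the per-inode list head onto the request instead of queueing the request. (The list_move() call introduced in ceph_set_dentry_offset() in fs/ceph/inode.c below repairs the same class of mistake.) The core of the kernel's insertion, as in include/linux/list.h, plus a tiny check:

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

static void __list_add(struct list_head *new, struct list_head *prev,
                       struct list_head *next)
{
        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}

/* insert 'new' right after 'head'; swapping the arguments compiles
 * cleanly but splices the head into the entry's list instead */
static void list_add(struct list_head *new, struct list_head *head)
{
        __list_add(new, head, head->next);
}

int main(void)
{
        struct list_head head = { &head, &head };    /* empty list */
        struct list_head item;

        list_add(&item, &head);
        printf("%s\n", head.next == &item ? "queued" : "lost");
        return 0;
}
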
@@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos) 740 unsigned long nr_segs, loff_t pos)
741{ 741{
742 struct file *filp = iocb->ki_filp; 742 struct file *filp = iocb->ki_filp;
743 struct ceph_file_info *fi = filp->private_data;
743 loff_t *ppos = &iocb->ki_pos; 744 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len; 745 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode; 746 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode); 747 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base; 748 void __user *base = iov->iov_base;
748 ssize_t ret; 749 ssize_t ret;
749 int got = 0; 750 int want, got = 0;
750 int checkeof = 0, read = 0; 751 int checkeof = 0, read = 0;
751 752
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 753 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 754 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again: 755again:
755 __ceph_do_pending_vmtruncate(inode); 756 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, 757 if (fi->fmode & CEPH_FILE_MODE_LAZY)
757 &got, -1); 758 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
759 else
760 want = CEPH_CAP_FILE_CACHE;
761 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
758 if (ret < 0) 762 if (ret < 0)
759 goto out; 763 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 764 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len, 765 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got)); 766 ceph_cap_string(got));
763 767
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 || 768 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 769 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) 770 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */ 771 /* hmm, this isn't really async... */
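
The pattern introduced here, and mirrored in ceph_aio_write() below, widens both the wanted-caps request and the cache-bypass test by one bit once a file has been switched to LAZY mode, so a LAZYIO grant from the MDS is as good as a CACHE grant. A toy restatement of the mask logic (flag values are made up for the sketch):

#include <stdio.h>

#define CAP_FILE_CACHE   0x01    /* hypothetical values for this sketch */
#define CAP_FILE_LAZYIO  0x02
#define FILE_MODE_LAZY   0x04

static int wanted_caps(int fmode)
{
        if (fmode & FILE_MODE_LAZY)
                return CAP_FILE_CACHE | CAP_FILE_LAZYIO;
        return CAP_FILE_CACHE;
}

int main(void)
{
        int got = CAP_FILE_LAZYIO;        /* MDS granted only LAZYIO */

        /* old test: (got & CAP_FILE_CACHE) == 0 would force the sync path */
        printf("sync path? %s\n",
               (got & (CAP_FILE_CACHE | CAP_FILE_LAZYIO)) == 0 ? "yes" : "no");
        printf("want = %#x\n", wanted_caps(FILE_MODE_LAZY));
        return 0;
}
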
@@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos) 811 unsigned long nr_segs, loff_t pos)
808{ 812{
809 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
814 struct ceph_file_info *fi = file->private_data;
810 struct inode *inode = file->f_dentry->d_inode; 815 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 818 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 819 int want, got = 0;
815 int ret, err; 820 int ret, err;
816 821
817 if (ceph_snap(inode) != CEPH_NOSNAP) 822 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +829,11 @@ retry_snap:
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 829 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 830 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size); 831 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 832 if (fi->fmode & CEPH_FILE_MODE_LAZY)
828 &got, endoff); 833 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
834 else
835 want = CEPH_CAP_FILE_BUFFER;
836 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
829 if (ret < 0) 837 if (ret < 0)
830 goto out; 838 goto out;
831 839
@@ -833,7 +841,7 @@ retry_snap:
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 841 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got)); 842 ceph_cap_string(got));
835 843
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 || 844 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) || 845 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { 846 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 847 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -844,8 +852,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 852 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 853 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 854 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 855 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 856 if (err < 0)
850 ret = err; 857 ret = err;
851 } 858 }
@@ -931,6 +938,8 @@ const struct file_operations ceph_file_fops = {
931 .aio_write = ceph_aio_write, 938 .aio_write = ceph_aio_write,
932 .mmap = ceph_mmap, 939 .mmap = ceph_mmap,
933 .fsync = ceph_fsync, 940 .fsync = ceph_fsync,
941 .lock = ceph_lock,
942 .flock = ceph_flock,
934 .splice_read = generic_file_splice_read, 943 .splice_read = generic_file_splice_read,
935 .splice_write = generic_file_splice_write, 944 .splice_write = generic_file_splice_write,
936 .unlocked_ioctl = ceph_ioctl, 945 .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..62377ec37edf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
442 * the file is either opened or mmaped 442 * the file is either opened or mmaped
443 */ 443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) || 446 CEPH_CAP_FILE_EXCL|
447 CEPH_CAP_FILE_LAZYIO)) ||
447 mapping_mapped(inode->i_mapping) || 448 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) { 449 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++; 450 ci->i_truncate_pending++;
@@ -619,11 +620,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 620 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 621 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 622 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
623 xattr_blob = NULL;
622 } 624 }
623 625
624 inode->i_mapping->a_ops = &ceph_aops; 626 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 627 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 628 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 629
628 switch (inode->i_mode & S_IFMT) { 630 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 631 case S_IFIFO:
@@ -674,14 +676,16 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 676 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 677 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 678 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 679 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
680 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
681 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 682 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 683 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 684 ci->i_max_offset = 2;
681 } 685 }
682 686
683 /* it may be better to set st_size in getattr instead? */ 687 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 689 inode->i_size = ci->i_rbytes;
686 break; 690 break;
687 default: 691 default:
@@ -802,6 +806,37 @@ out_unlock:
802} 806}
803 807
804/* 808/*
809 * Set dentry's directory position based on the current dir's max, and
810 * order it in d_subdirs, so that dcache_readdir behaves.
811 */
812static void ceph_set_dentry_offset(struct dentry *dn)
813{
814 struct dentry *dir = dn->d_parent;
815 struct inode *inode = dn->d_parent->d_inode;
816 struct ceph_dentry_info *di;
817
818 BUG_ON(!inode);
819
820 di = ceph_dentry(dn);
821
822 spin_lock(&inode->i_lock);
823 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
824 spin_unlock(&inode->i_lock);
825 return;
826 }
827 di->offset = ceph_inode(inode)->i_max_offset++;
828 spin_unlock(&inode->i_lock);
829
830 spin_lock(&dcache_lock);
831 spin_lock(&dn->d_lock);
832 list_move(&dn->d_u.d_child, &dir->d_subdirs);
833 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
834 dn->d_u.d_child.prev, dn->d_u.d_child.next);
835 spin_unlock(&dn->d_lock);
836 spin_unlock(&dcache_lock);
837}
838
839/*
805 * splice a dentry to an inode. 840 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 841 * caller must hold directory i_mutex for this to be safe.
807 * 842 *
@@ -810,17 +845,19 @@ out_unlock:
810 * the caller) if we fail. 845 * the caller) if we fail.
811 */ 846 */
812static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 847static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
813 bool *prehash) 848 bool *prehash, bool set_offset)
814{ 849{
815 struct dentry *realdn; 850 struct dentry *realdn;
816 851
852 BUG_ON(dn->d_inode);
853
817 /* dn must be unhashed */ 854 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 855 if (!d_unhashed(dn))
819 d_drop(dn); 856 d_drop(dn);
820 realdn = d_materialise_unique(dn, in); 857 realdn = d_materialise_unique(dn, in);
821 if (IS_ERR(realdn)) { 858 if (IS_ERR(realdn)) {
822 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", 859 pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
823 dn, in, ceph_vinop(in)); 860 PTR_ERR(realdn), dn, in, ceph_vinop(in));
824 if (prehash) 861 if (prehash)
825 *prehash = false; /* don't rehash on error */ 862 *prehash = false; /* don't rehash on error */
826 dn = realdn; /* note realdn contains the error */ 863 dn = realdn; /* note realdn contains the error */
@@ -835,44 +872,18 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 872 dn = realdn;
836 } else { 873 } else {
837 BUG_ON(!ceph_dentry(dn)); 874 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 875 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 876 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 877 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 878 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 879 d_rehash(dn);
880 if (set_offset)
881 ceph_set_dentry_offset(dn);
844out: 882out:
845 return dn; 883 return dn;
846} 884}
847 885
848/* 886/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 887 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 888 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 889 * after a lookup).
@@ -933,14 +944,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 944
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 945 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 946 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 947 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 948 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 949 return 0;
945 } 950 }
946 951
@@ -1011,13 +1016,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1016 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1017 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1018 dn, dn->d_name.len, dn->d_name.name);
1019
1014 /* ensure target dentry is invalidated, despite 1020 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1021 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1022 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1023
1018 /* take overwritten dentry's readdir offset */ 1024 /* take overwritten dentry's readdir offset */
1025 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1026 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1027 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1028 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1029 ceph_dentry(dn)->offset;
1030
1021 dn = req->r_old_dentry; /* use old_dentry */ 1031 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1032 in = dn->d_inode;
1023 } 1033 }
@@ -1053,13 +1063,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1053 d_delete(dn); 1063 d_delete(dn);
1054 goto done; 1064 goto done;
1055 } 1065 }
1056 dn = splice_dentry(dn, in, &have_lease); 1066 dn = splice_dentry(dn, in, &have_lease, true);
1057 if (IS_ERR(dn)) { 1067 if (IS_ERR(dn)) {
1058 err = PTR_ERR(dn); 1068 err = PTR_ERR(dn);
1059 goto done; 1069 goto done;
1060 } 1070 }
1061 req->r_dentry = dn; /* may have spliced */ 1071 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1072 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1073 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1074 ceph_snap(in) == vino.snap) {
@@ -1097,12 +1106,11 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1097 goto done; 1106 goto done;
1098 } 1107 }
1099 dout(" linking snapped dir %p to dn %p\n", in, dn); 1108 dout(" linking snapped dir %p to dn %p\n", in, dn);
1100 dn = splice_dentry(dn, in, NULL); 1109 dn = splice_dentry(dn, in, NULL, true);
1101 if (IS_ERR(dn)) { 1110 if (IS_ERR(dn)) {
1102 err = PTR_ERR(dn); 1111 err = PTR_ERR(dn);
1103 goto done; 1112 goto done;
1104 } 1113 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1114 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1115 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1116 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1194,8 +1202,10 @@ retry_lookup:
1194 goto out; 1202 goto out;
1195 } 1203 }
1196 err = ceph_init_dentry(dn); 1204 err = ceph_init_dentry(dn);
1197 if (err < 0) 1205 if (err < 0) {
1206 dput(dn);
1198 goto out; 1207 goto out;
1208 }
1199 } else if (dn->d_inode && 1209 } else if (dn->d_inode &&
1200 (ceph_ino(dn->d_inode) != vino.ino || 1210 (ceph_ino(dn->d_inode) != vino.ino ||
1201 ceph_snap(dn->d_inode) != vino.snap)) { 1211 ceph_snap(dn->d_inode) != vino.snap)) {
@@ -1221,26 +1231,31 @@ retry_lookup:
1221 in = dn->d_inode; 1231 in = dn->d_inode;
1222 } else { 1232 } else {
1223 in = ceph_get_inode(parent->d_sb, vino); 1233 in = ceph_get_inode(parent->d_sb, vino);
1224 if (in == NULL) { 1234 if (IS_ERR(in)) {
1225 dout("new_inode badness\n"); 1235 dout("new_inode badness\n");
1226 d_delete(dn); 1236 d_delete(dn);
1227 dput(dn); 1237 dput(dn);
1228 err = -ENOMEM; 1238 err = PTR_ERR(in);
1229 goto out; 1239 goto out;
1230 } 1240 }
1231 dn = splice_dentry(dn, in, NULL); 1241 dn = splice_dentry(dn, in, NULL, false);
1242 if (IS_ERR(dn))
1243 dn = NULL;
1232 } 1244 }
1233 1245
1234 if (fill_inode(in, &rinfo->dir_in[i], NULL, session, 1246 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1235 req->r_request_started, -1, 1247 req->r_request_started, -1,
1236 &req->r_caps_reservation) < 0) { 1248 &req->r_caps_reservation) < 0) {
1237 pr_err("fill_inode badness on %p\n", in); 1249 pr_err("fill_inode badness on %p\n", in);
1238 dput(dn); 1250 goto next_item;
1239 continue;
1240 } 1251 }
1241 update_dentry_lease(dn, rinfo->dir_dlease[i], 1252 if (dn)
1242 req->r_session, req->r_request_started); 1253 update_dentry_lease(dn, rinfo->dir_dlease[i],
1243 dput(dn); 1254 req->r_session,
1255 req->r_request_started);
1256next_item:
1257 if (dn)
1258 dput(dn);
1244 } 1259 }
1245 req->r_did_prepopulate = true; 1260 req->r_did_prepopulate = true;
1246 1261
@@ -1429,7 +1444,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1444{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1445 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1446
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1447 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1448 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1449 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1450 igrab(inode);
@@ -1489,7 +1504,7 @@ retry:
1489 if (wrbuffer_refs == 0) 1504 if (wrbuffer_refs == 0)
1490 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1505 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1491 if (wake) 1506 if (wake)
1492 wake_up(&ci->i_cap_wq); 1507 wake_up_all(&ci->i_cap_wq);
1493} 1508}
1494 1509
1495 1510
@@ -1518,7 +1533,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1533 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1534 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1535 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1537 int issued;
1523 int release = 0, dirtied = 0; 1538 int release = 0, dirtied = 0;
1524 int mask = 0; 1539 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..76e307d2aba1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
@@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
143 return 0; 143 return 0;
144} 144}
145 145
146static long ceph_ioctl_lazyio(struct file *file)
147{
148 struct ceph_file_info *fi = file->private_data;
149 struct inode *inode = file->f_dentry->d_inode;
150 struct ceph_inode_info *ci = ceph_inode(inode);
151
152 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
153 spin_lock(&inode->i_lock);
154 ci->i_nr_by_mode[fi->fmode]--;
155 fi->fmode |= CEPH_FILE_MODE_LAZY;
156 ci->i_nr_by_mode[fi->fmode]++;
157 spin_unlock(&inode->i_lock);
158 dout("ioctl_layzio: file %p marked lazy\n", file);
159
160 ceph_check_caps(ci, 0, NULL);
161 } else {
162 dout("ioctl_layzio: file %p already lazy\n", file);
163 }
164 return 0;
165}
166
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 167long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{ 168{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 169 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
155 176
156 case CEPH_IOC_GET_DATALOC: 177 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 178 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179
180 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file);
158 } 182 }
159 return -ENOTTY; 183 return -ENOTTY;
160} 184}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d059..88451a3b6857 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc {
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc) 38 struct ceph_ioctl_dataloc)
39 39
40#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
41
40#endif 42#endif
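
CEPH_IOC_LAZYIO is built with _IO(), the ioctl-number macro for commands that carry no argument payload: a direction of "none" and a size of zero are packed into the request word together with the magic byte and command number. A small user-space probe of the encoding — the magic value here is an assumption standing in for the CEPH_IOCTL_MAGIC defined at the top of this header:

#include <stdio.h>
#include <sys/ioctl.h>            /* glibc mirrors the kernel _IO* macros */

#define EXAMPLE_IOCTL_MAGIC 0x97  /* assumed; see CEPH_IOCTL_MAGIC above */
#define EXAMPLE_IOC_LAZYIO _IO(EXAMPLE_IOCTL_MAGIC, 4)

int main(void)
{
        /* dir=none, size=0, type=0x97, nr=4 packed into one request word */
        printf("EXAMPLE_IOC_LAZYIO = %#lx\n",
               (unsigned long)EXAMPLE_IOC_LAZYIO);
        return 0;
}

An application opts in by issuing ioctl(fd, CEPH_IOC_LAZYIO) on an open ceph file, trading strict cache coherence for better shared-writer performance.
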
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000000..ff4e753aae92
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,260 @@
1#include "ceph_debug.h"
2
3#include <linux/file.h>
4#include <linux/namei.h>
5
6#include "super.h"
7#include "mds_client.h"
8#include "pagelist.h"
9
10/**
11 * Implement fcntl and flock locking functions.
12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns,
15 int cmd, u64 start, u64 length, u8 wait)
16{
17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req;
21 int err;
22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req))
25 return PTR_ERR(req);
26 req->r_inode = igrab(inode);
27
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd);
31
32 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid);
35 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers */
37 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns);
39 req->r_args.filelock_change.start = cpu_to_le64(start);
40 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait;
42
43 err = ceph_mdsc_do_request(mdsc, inode, req);
44 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err);
48 return err;
49}
50
51/**
52 * Attempt to set an fcntl lock.
53 * For now, this just goes away to the server. Later it may be more awesome.
54 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{
57 u64 length;
58 u8 lock_cmd;
59 int err;
60 u8 wait = 0;
61 u16 op = CEPH_MDS_OP_SETFILELOCK;
62
63 fl->fl_nspid = get_pid(task_tgid(current));
64 dout("ceph_lock, fl_pid:%d", fl->fl_pid);
65
66 /* set wait bit as appropriate, then make command as Ceph expects it */
67 if (F_SETLKW == cmd)
68 wait = 1;
69 if (F_GETLK == cmd)
70 op = CEPH_MDS_OP_GETFILELOCK;
71
72 if (F_RDLCK == fl->fl_type)
73 lock_cmd = CEPH_LOCK_SHARED;
74 else if (F_WRLCK == fl->fl_type)
75 lock_cmd = CEPH_LOCK_EXCL;
76 else
77 lock_cmd = CEPH_LOCK_UNLOCK;
78
79 if (LLONG_MAX == fl->fl_end)
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid,
86 (u64)(unsigned long)fl->fl_nspid,
87 lock_cmd, fl->fl_start,
88 length, wait);
89 if (!err) {
90 dout("mds locked, locking locally");
91 err = posix_lock_file(file, fl, NULL);
92 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
93 /* undo! This should only happen if the kernel detects
94 * local deadlock. */
95 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
96 (u64)fl->fl_pid,
97 (u64)(unsigned long)fl->fl_nspid,
98 CEPH_LOCK_UNLOCK, fl->fl_start,
99 length, 0);
100 dout("got %d on posix_lock_file, undid lock", err);
101 }
102 } else {
103 dout("mds returned error code %d", err);
104 }
105 return err;
106}
107
108int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
109{
110 u64 length;
111 u8 lock_cmd;
112 int err;
113 u8 wait = 1;
114
115 fl->fl_nspid = get_pid(task_tgid(current));
116 dout("ceph_flock, fl_pid:%d", fl->fl_pid);
117
118 /* set wait bit, then clear it out of cmd */
119 if (cmd & LOCK_NB)
120 wait = 0;
121 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
122 /* set command sequence that Ceph wants to see:
123 shared lock, exclusive lock, or unlock */
124 if (LOCK_SH == cmd)
125 lock_cmd = CEPH_LOCK_SHARED;
126 else if (LOCK_EX == cmd)
127 lock_cmd = CEPH_LOCK_EXCL;
128 else
129 lock_cmd = CEPH_LOCK_UNLOCK;
130 /* mds requires start and length rather than start and end */
131 if (LLONG_MAX == fl->fl_end)
132 length = 0;
133 else
134 length = fl->fl_end - fl->fl_start + 1;
135
136 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
137 file, (u64)fl->fl_pid,
138 (u64)(unsigned long)fl->fl_nspid,
139 lock_cmd, fl->fl_start,
140 length, wait);
141 if (!err) {
142 err = flock_lock_file_wait(file, fl);
143 if (err) {
144 ceph_lock_message(CEPH_LOCK_FLOCK,
145 CEPH_MDS_OP_SETFILELOCK,
146 file, (u64)fl->fl_pid,
147 (u64)(unsigned long)fl->fl_nspid,
148 CEPH_LOCK_UNLOCK, fl->fl_start,
149 length, 0);
150 dout("got %d on flock_lock_file_wait, undid lock", err);
151 }
152 } else {
153 dout("mds error code %d", err);
154 }
155 return err;
156}
157
158/**
159 * Must be called with BKL already held. Fills in the passed
160 * counter variables, so you can prepare pagelist metadata before calling
161 * ceph_encode_locks.
162 */
163void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
164{
165 struct file_lock *lock;
166
167 *fcntl_count = 0;
168 *flock_count = 0;
169
170 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
171 if (lock->fl_flags & FL_POSIX)
172 ++(*fcntl_count);
173 else if (lock->fl_flags & FL_FLOCK)
174 ++(*flock_count);
175 }
176 dout("counted %d flock locks and %d fcntl locks",
177 *flock_count, *fcntl_count);
178}
179
180/**
181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks.
184 * Must be called with BKL already held, and the lock numbers should have
185 * been gathered under the same lock holding window.
186 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks)
189{
190 struct file_lock *lock;
191 struct ceph_filelock cephlock;
192 int err = 0;
193
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks);
196 err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
197 if (err)
198 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) {
201 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err)
203 goto fail;
204 err = ceph_pagelist_append(pagelist, &cephlock,
205 sizeof(struct ceph_filelock));
206 }
207 if (err)
208 goto fail;
209 }
210
211 err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
212 if (err)
213 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) {
216 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err)
218 goto fail;
219 err = ceph_pagelist_append(pagelist, &cephlock,
220 sizeof(struct ceph_filelock));
221 }
222 if (err)
223 goto fail;
224 }
225fail:
226 return err;
227}
228
229/*
230 * Given a pointer to a lock, convert it to a ceph filelock
231 */
232int lock_to_ceph_filelock(struct file_lock *lock,
233 struct ceph_filelock *cephlock)
234{
235 int err = 0;
236
237 cephlock->start = cpu_to_le64(lock->fl_start);
238 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
239 cephlock->client = cpu_to_le64(0);
240 cephlock->pid = cpu_to_le64(lock->fl_pid);
241 cephlock->pid_namespace =
242 cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
243
244 switch (lock->fl_type) {
245 case F_RDLCK:
246 cephlock->type = CEPH_LOCK_SHARED;
247 break;
248 case F_WRLCK:
249 cephlock->type = CEPH_LOCK_EXCL;
250 break;
251 case F_UNLCK:
252 cephlock->type = CEPH_LOCK_UNLOCK;
253 break;
254 default:
255 dout("Have unknown lock type %d", lock->fl_type);
256 err = -EINVAL;
257 }
258
259 return err;
260}
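
ceph_lock() and ceph_flock() share one subtle translation: the VFS describes a lock as an inclusive byte range [fl_start, fl_end], with fl_end == LLONG_MAX meaning "through end of file", while the MDS wire format carries start plus length and reserves length == 0 for the to-EOF case. Restated as a standalone helper:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* convert an inclusive VFS byte range to the MDS start+length form */
static uint64_t range_to_length(long long fl_start, long long fl_end)
{
        if (fl_end == LLONG_MAX)
                return 0;                         /* to-EOF lock */
        return (uint64_t)(fl_end - fl_start + 1); /* inclusive range */
}

int main(void)
{
        printf("%llu\n", (unsigned long long)range_to_length(0, 4095));
        printf("%llu\n", (unsigned long long)range_to_length(10, LLONG_MAX));
        return 0;
}
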
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..fad95f8f2608 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/smp_lock.h>
6 7
7#include "mds_client.h" 8#include "mds_client.h"
8#include "mon_client.h" 9#include "mon_client.h"
@@ -37,10 +38,15 @@
37 * are no longer valid. 38 * are no longer valid.
38 */ 39 */
39 40
41struct ceph_reconnect_state {
42 struct ceph_pagelist *pagelist;
43 bool flock;
44};
45
40static void __wake_requests(struct ceph_mds_client *mdsc, 46static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 47 struct list_head *head);
42 48
43const static struct ceph_connection_operations mds_con_ops; 49static const struct ceph_connection_operations mds_con_ops;
44 50
45 51
46/* 52/*
@@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref)
449 kfree(req->r_path1); 455 kfree(req->r_path1);
450 kfree(req->r_path2); 456 kfree(req->r_path2);
451 put_request_session(req); 457 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation); 458 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
453 kfree(req); 459 kfree(req);
454} 460}
455 461
@@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
512{ 518{
513 req->r_tid = ++mdsc->last_tid; 519 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps) 520 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); 521 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
522 req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid); 523 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req); 524 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req); 525 __insert_request(mdsc, req);
@@ -553,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
553 * 560 *
554 * Called under mdsc->mutex. 561 * Called under mdsc->mutex.
555 */ 562 */
563struct dentry *get_nonsnap_parent(struct dentry *dentry)
564{
565 while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
566 dentry = dentry->d_parent;
567 return dentry;
568}
569
556static int __choose_mds(struct ceph_mds_client *mdsc, 570static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req) 571 struct ceph_mds_request *req)
558{ 572{
@@ -583,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
583 if (req->r_inode) { 597 if (req->r_inode) {
584 inode = req->r_inode; 598 inode = req->r_inode;
585 } else if (req->r_dentry) { 599 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) { 600 struct inode *dir = req->r_dentry->d_parent->d_inode;
601
602 if (dir->i_sb != mdsc->client->sb) {
603 /* not this fs! */
604 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
606 /* direct snapped/virtual snapdir requests
607 * based on parent dir inode */
608 struct dentry *dn =
609 get_nonsnap_parent(req->r_dentry->d_parent);
610 inode = dn->d_inode;
611 dout("__choose_mds using nonsnap parent %p\n", inode);
612 } else if (req->r_dentry->d_inode) {
613 /* dentry target */
587 inode = req->r_dentry->d_inode; 614 inode = req->r_dentry->d_inode;
588 } else { 615 } else {
589 inode = req->r_dentry->d_parent->d_inode; 616 /* dir + name */
617 inode = dir;
590 hash = req->r_dentry->d_name.hash; 618 hash = req->r_dentry->d_name.hash;
591 is_hash = true; 619 is_hash = true;
592 } 620 }
593 } 621 }
622
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 623 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode); 624 (int)hash, mode);
596 if (!inode) 625 if (!inode)
@@ -665,10 +694,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 694 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 695 struct ceph_mds_session_head *h;
667 696
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 697 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 698 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 699 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 700 return NULL;
672 } 701 }
673 h = msg->front.iov_base; 702 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 703 h->op = cpu_to_le32(op);
@@ -687,7 +716,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 716 struct ceph_msg *msg;
688 int mstate; 717 int mstate;
689 int mds = session->s_mds; 718 int mds = session->s_mds;
690 int err = 0;
691 719
692 /* wait for mds to go active? */ 720 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 721 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,17 +726,58 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 726
699 /* send connect message */ 727 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 728 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 729 if (!msg)
702 err = PTR_ERR(msg); 730 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 731 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 732 return 0;
709} 733}
710 734
711/* 735/*
736 * open sessions for any export targets for the given mds
737 *
738 * called under mdsc->mutex
739 */
740static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
741 struct ceph_mds_session *session)
742{
743 struct ceph_mds_info *mi;
744 struct ceph_mds_session *ts;
745 int i, mds = session->s_mds;
746 int target;
747
748 if (mds >= mdsc->mdsmap->m_max_mds)
749 return;
750 mi = &mdsc->mdsmap->m_info[mds];
751 dout("open_export_target_sessions for mds%d (%d targets)\n",
752 session->s_mds, mi->num_export_targets);
753
754 for (i = 0; i < mi->num_export_targets; i++) {
755 target = mi->export_targets[i];
756 ts = __ceph_lookup_mds_session(mdsc, target);
757 if (!ts) {
758 ts = register_session(mdsc, target);
759 if (IS_ERR(ts))
760 return;
761 }
762 if (session->s_state == CEPH_MDS_SESSION_NEW ||
763 session->s_state == CEPH_MDS_SESSION_CLOSING)
764 __open_session(mdsc, session);
765 else
766 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
767 i, ts, session_state_name(ts->s_state));
768 ceph_put_mds_session(ts);
769 }
770}
771
772void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
773 struct ceph_mds_session *session)
774{
775 mutex_lock(&mdsc->mutex);
776 __open_export_target_sessions(mdsc, session);
777 mutex_unlock(&mdsc->mutex);
778}
779
780/*
712 * session caps 781 * session caps
713 */ 782 */
714 783
@@ -769,7 +838,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
769 last_inode = NULL; 838 last_inode = NULL;
770 } 839 }
771 if (old_cap) { 840 if (old_cap) {
772 ceph_put_cap(old_cap); 841 ceph_put_cap(session->s_mdsc, old_cap);
773 old_cap = NULL; 842 old_cap = NULL;
774 } 843 }
775 844
@@ -798,18 +867,55 @@ out:
798 if (last_inode) 867 if (last_inode)
799 iput(last_inode); 868 iput(last_inode);
800 if (old_cap) 869 if (old_cap)
801 ceph_put_cap(old_cap); 870 ceph_put_cap(session->s_mdsc, old_cap);
802 871
803 return ret; 872 return ret;
804} 873}
805 874
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 875static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 876 void *arg)
808{ 877{
809 struct ceph_inode_info *ci = ceph_inode(inode); 878 struct ceph_inode_info *ci = ceph_inode(inode);
879 int drop = 0;
880
810 dout("removing cap %p, ci is %p, inode is %p\n", 881 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 882 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 883 spin_lock(&inode->i_lock);
884 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc;
888
889 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) {
891 pr_info(" dropping dirty %s state for %p %lld\n",
892 ceph_cap_string(ci->i_dirty_caps),
893 inode, ceph_ino(inode));
894 ci->i_dirty_caps = 0;
895 list_del_init(&ci->i_dirty_item);
896 drop = 1;
897 }
898 if (!list_empty(&ci->i_flushing_item)) {
899 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
900 ceph_cap_string(ci->i_flushing_caps),
901 inode, ceph_ino(inode));
902 ci->i_flushing_caps = 0;
903 list_del_init(&ci->i_flushing_item);
904 mdsc->num_cap_flushing--;
905 drop = 1;
906 }
907 if (drop && ci->i_wrbuffer_ref) {
908 pr_info(" dropping dirty data for %p %lld\n",
909 inode, ceph_ino(inode));
910 ci->i_wrbuffer_ref = 0;
911 ci->i_wrbuffer_ref_head = 0;
912 drop++;
913 }
914 spin_unlock(&mdsc->cap_dirty_lock);
915 }
916 spin_unlock(&inode->i_lock);
917 while (drop--)
918 iput(inode);
813 return 0; 919 return 0;
814} 920}
815 921
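
The rewritten remove_session_caps_cb() leans on a common teardown idiom: each piece of per-inode state it clears (the dirty item, the flushing item, the dirty-page accounting) pinned one inode reference, so the callback tallies the debts in drop while holding the spinlocks and pays them with iput() only after unlocking, because iput() may sleep. The shape of it, reduced to a sketch:

#include <stdio.h>

struct obj { int refs; };

static void put_ref(struct obj *o) { o->refs--; }

int main(void)
{
        struct obj inode = { .refs = 3 };  /* caller + two list memberships */
        int drop = 0;

        /* under the lock: unlink from two lists, remember what is owed */
        drop++;
        drop++;

        /* after unlock: release where sleeping is allowed */
        while (drop--)
                put_ref(&inode);

        printf("refs left = %d\n", inode.refs);    /* 1 */
        return 0;
}
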
@@ -821,6 +927,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 927 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 928 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 929 BUG_ON(session->s_nr_caps > 0);
930 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 931 cleanup_cap_releases(session);
825} 932}
826 933
@@ -835,7 +942,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
835{ 942{
836 struct ceph_inode_info *ci = ceph_inode(inode); 943 struct ceph_inode_info *ci = ceph_inode(inode);
837 944
838 wake_up(&ci->i_cap_wq); 945 wake_up_all(&ci->i_cap_wq);
839 if (arg) { 946 if (arg) {
840 spin_lock(&inode->i_lock); 947 spin_lock(&inode->i_lock);
841 ci->i_wanted_max_size = 0; 948 ci->i_wanted_max_size = 0;
@@ -883,8 +990,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 990 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 991 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 992 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 993 if (!msg)
887 return PTR_ERR(msg); 994 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 995 ceph_con_send(&session->s_con, msg);
889 return 0; 996 return 0;
890} 997}
@@ -931,17 +1038,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 1038 struct ceph_mds_session *session)
932{ 1039{
933 struct ceph_msg *msg; 1040 struct ceph_msg *msg;
934 int err = 0;
935 1041
936 dout("request_close_session mds%d state %s seq %lld\n", 1042 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 1043 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 1044 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 1045 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 1046 if (!msg)
941 err = PTR_ERR(msg); 1047 return -ENOMEM;
942 else 1048 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 1049 return 0;
944 return err;
945} 1050}
946 1051
947/* 1052/*
@@ -1035,16 +1140,17 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1035 * 1140 *
1036 * Called under s_mutex. 1141 * Called under s_mutex.
1037 */ 1142 */
1038static int add_cap_releases(struct ceph_mds_client *mdsc, 1143int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session, 1144 struct ceph_mds_session *session)
1040 int extra)
1041{ 1145{
1042 struct ceph_msg *msg; 1146 struct ceph_msg *msg, *partial = NULL;
1043 struct ceph_mds_cap_release *head; 1147 struct ceph_mds_cap_release *head;
1044 int err = -ENOMEM; 1148 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety;
1150 int num;
1045 1151
1046 if (extra < 0) 1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1047 extra = mdsc->client->mount_args->cap_release_safety; 1153 extra);
1048 1154
1049 spin_lock(&session->s_cap_lock); 1155 spin_lock(&session->s_cap_lock);
1050 1156
@@ -1053,13 +1159,18 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1053 struct ceph_msg, 1159 struct ceph_msg,
1054 list_head); 1160 list_head);
1055 head = msg->front.iov_base; 1161 head = msg->front.iov_base;
1056 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); 1162 num = le32_to_cpu(head->num);
1163 if (num) {
1164 dout(" partial %p with (%d/%d)\n", msg, num,
1165 (int)CEPH_CAPS_PER_RELEASE);
1166 extra += CEPH_CAPS_PER_RELEASE - num;
1167 partial = msg;
1168 }
1057 } 1169 }
1058
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1170 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1171 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1172 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1173 GFP_NOFS);
1063 if (!msg) 1174 if (!msg)
1064 goto out_unlocked; 1175 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1176 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1072,19 +1183,14 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1072 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; 1183 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1073 } 1184 }
1074 1185
1075 if (!list_empty(&session->s_cap_releases)) { 1186 if (partial) {
1076 msg = list_first_entry(&session->s_cap_releases, 1187 head = partial->front.iov_base;
1077 struct ceph_msg, 1188 num = le32_to_cpu(head->num);
1078 list_head); 1189 dout(" queueing partial %p with %d/%d\n", partial, num,
1079 head = msg->front.iov_base; 1190 (int)CEPH_CAPS_PER_RELEASE);
1080 if (head->num) { 1191 list_move_tail(&partial->list_head,
1081 dout(" queueing non-full %p (%d)\n", msg, 1192 &session->s_cap_releases_done);
1082 le32_to_cpu(head->num)); 1193 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1083 list_move_tail(&msg->list_head,
1084 &session->s_cap_releases_done);
1085 session->s_num_cap_releases -=
1086 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1087 }
1088 } 1194 }
1089 err = 0; 1195 err = 0;
1090 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
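
The reworked ceph_add_cap_releases() drops its extra parameter and instead keeps an invariant: at least s_nr_caps plus the cap_release_safety mount option worth of release slots stay queued, topped up in whole messages of CEPH_CAPS_PER_RELEASE entries, and a partially filled message is moved to the done list only after its remaining slack is counted. The refill arithmetic, restated with an illustrative batch size:

#include <stdio.h>

#define CAPS_PER_RELEASE 512    /* illustrative; the real constant is
                                 * derived from the message page size */

static int messages_needed(int num_slots, int nr_caps, int extra)
{
        int msgs = 0;

        while (num_slots < nr_caps + extra) {
                num_slots += CAPS_PER_RELEASE;    /* one more message */
                msgs++;
        }
        return msgs;
}

int main(void)
{
        printf("%d\n", messages_needed(100, 900, 64));    /* 2 */
        return 0;
}
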
@@ -1145,16 +1251,14 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1145/* 1251/*
1146 * called under s_mutex 1252 * called under s_mutex
1147 */ 1253 */
1148static void send_cap_releases(struct ceph_mds_client *mdsc, 1254void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1149 struct ceph_mds_session *session) 1255 struct ceph_mds_session *session)
1150{ 1256{
1151 struct ceph_msg *msg; 1257 struct ceph_msg *msg;
1152 1258
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1259 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1260 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1261 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1262 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1263 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1264 list_del_init(&msg->list_head);
@@ -1162,7 +1266,46 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1266 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1267 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1268 ceph_con_send(&session->s_con, msg);
1269 spin_lock(&session->s_cap_lock);
1270 }
1271 spin_unlock(&session->s_cap_lock);
1272}
1273
1274static void discard_cap_releases(struct ceph_mds_client *mdsc,
1275 struct ceph_mds_session *session)
1276{
1277 struct ceph_msg *msg;
1278 struct ceph_mds_cap_release *head;
1279 unsigned num;
1280
1281 dout("discard_cap_releases mds%d\n", session->s_mds);
1282 spin_lock(&session->s_cap_lock);
1283
1284 /* zero out the in-progress message */
1285 msg = list_first_entry(&session->s_cap_releases,
1286 struct ceph_msg, list_head);
1287 head = msg->front.iov_base;
1288 num = le32_to_cpu(head->num);
1289 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1290 head->num = cpu_to_le32(0);
1291 session->s_num_cap_releases += num;
1292
1293 /* requeue completed messages */
1294 while (!list_empty(&session->s_cap_releases_done)) {
1295 msg = list_first_entry(&session->s_cap_releases_done,
1296 struct ceph_msg, list_head);
1297 list_del_init(&msg->list_head);
1298
1299 head = msg->front.iov_base;
1300 num = le32_to_cpu(head->num);
1301 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1302 num);
1303 session->s_num_cap_releases += num;
1304 head->num = cpu_to_le32(0);
1305 msg->front.iov_len = sizeof(*head);
1306 list_add(&msg->list_head, &session->s_cap_releases);
1165 } 1307 }
1308
1166 spin_unlock(&session->s_cap_lock); 1309 spin_unlock(&session->s_cap_lock);
1167} 1310}
1168 1311
@@ -1181,6 +1324,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1324 if (!req)
1182 return ERR_PTR(-ENOMEM); 1325 return ERR_PTR(-ENOMEM);
1183 1326
1327 mutex_init(&req->r_fill_mutex);
1328 req->r_mdsc = mdsc;
1184 req->r_started = jiffies; 1329 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1330 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1331 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1396,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1396 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1397 temp = temp->d_parent;
1253 if (temp == NULL) { 1398 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1399 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1400 return ERR_PTR(-EINVAL);
1256 } 1401 }
1257 } 1402 }
@@ -1267,7 +1412,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1412 struct inode *inode = temp->d_inode;
1268 1413
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1414 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1415 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1416 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1417 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1418 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1423,18 @@ retry:
1278 break; 1423 break;
1279 strncpy(path + pos, temp->d_name.name, 1424 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1425 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1426 }
1284 if (pos) 1427 if (pos)
1285 path[--pos] = '/'; 1428 path[--pos] = '/';
1286 temp = temp->d_parent; 1429 temp = temp->d_parent;
1287 if (temp == NULL) { 1430 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1431 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1432 kfree(path);
1290 return ERR_PTR(-EINVAL); 1433 return ERR_PTR(-EINVAL);
1291 } 1434 }
1292 } 1435 }
1293 if (pos != 0) { 1436 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1437 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1438 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1439 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1440 rename of one of the parent directories (we can not
@@ -1303,7 +1446,7 @@ retry:
1303 1446
1304 *base = ceph_ino(temp->d_inode); 1447 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1448 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1449 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1450 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1451 return path;
1309} 1452}
@@ -1426,9 +1569,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1569 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1570 len += req->r_old_dentry->d_name.len;
1428 1571
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1572 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1573 if (!msg) {
1574 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1575 goto out_free2;
1576 }
1432 1577
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1578 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1579
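Note the changed allocation convention here: ceph_msg_new() now takes a gfp mask and returns NULL on failure instead of an ERR_PTR, so callers test with !msg and supply their own errno; GFP_NOFS keeps the allocator from reentering the filesystem while fs locks are held. The resulting pattern, as used above:

        msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
        if (!msg)
                return ERR_PTR(-ENOMEM);        /* allocation failed */
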
@@ -1445,6 +1590,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1445 ceph_encode_filepath(&p, end, ino1, path1); 1590 ceph_encode_filepath(&p, end, ino1, path1);
1446 ceph_encode_filepath(&p, end, ino2, path2); 1591 ceph_encode_filepath(&p, end, ino2, path2);
1447 1592
1593 /* make note of release offset, in case we need to replay */
1594 req->r_request_release_offset = p - msg->front.iov_base;
1595
1448 /* cap releases */ 1596 /* cap releases */
1449 releases = 0; 1597 releases = 0;
1450 if (req->r_inode_drop) 1598 if (req->r_inode_drop)
@@ -1492,7 +1640,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
1492 if (req->r_callback) 1640 if (req->r_callback)
1493 req->r_callback(mdsc, req); 1641 req->r_callback(mdsc, req);
1494 else 1642 else
1495 complete(&req->r_completion); 1643 complete_all(&req->r_completion);
1496} 1644}
1497 1645
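complete() releases at most one waiter on a struct completion; complete_all() releases every waiter, current and future. Since more than one thread can be blocked on r_completion (and on r_safe_completion further down), the switch to complete_all() keeps later waiters from being stranded. The distinction, per <linux/completion.h>:

        complete(&done);        /* exactly one wait_for_completion() proceeds */
        complete_all(&done);    /* all waiters, present and future, proceed */
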
1498/* 1646/*
@@ -1508,18 +1656,53 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1508 1656
1509 req->r_mds = mds; 1657 req->r_mds = mds;
1510 req->r_attempts++; 1658 req->r_attempts++;
1659 if (req->r_inode) {
1660 struct ceph_cap *cap =
1661 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1662
1663 if (cap)
1664 req->r_sent_on_mseq = cap->mseq;
1665 else
1666 req->r_sent_on_mseq = -1;
1667 }
1511 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1668 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1512 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1669 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1513 1670
1671 if (req->r_got_unsafe) {
1672 /*
1673 * Replay. Do not regenerate message (and rebuild
1674 * paths, etc.); just use the original message.
1675 * Rebuilding paths will break for renames because
1676 * d_move mangles the src name.
1677 */
1678 msg = req->r_request;
1679 rhead = msg->front.iov_base;
1680
1681 flags = le32_to_cpu(rhead->flags);
1682 flags |= CEPH_MDS_FLAG_REPLAY;
1683 rhead->flags = cpu_to_le32(flags);
1684
1685 if (req->r_target_inode)
1686 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1687
1688 rhead->num_retry = req->r_attempts - 1;
1689
1690 /* remove cap/dentry releases from message */
1691 rhead->num_releases = 0;
1692 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
1693 msg->front.iov_len = req->r_request_release_offset;
1694 return 0;
1695 }
1696
1514 if (req->r_request) { 1697 if (req->r_request) {
1515 ceph_msg_put(req->r_request); 1698 ceph_msg_put(req->r_request);
1516 req->r_request = NULL; 1699 req->r_request = NULL;
1517 } 1700 }
1518 msg = create_request_message(mdsc, req, mds); 1701 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1702 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1703 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1704 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1705 return PTR_ERR(msg);
1523 } 1706 }
1524 req->r_request = msg; 1707 req->r_request = msg;
1525 1708
@@ -1532,13 +1715,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1532 rhead->flags = cpu_to_le32(flags); 1715 rhead->flags = cpu_to_le32(flags);
1533 rhead->num_fwd = req->r_num_fwd; 1716 rhead->num_fwd = req->r_num_fwd;
1534 rhead->num_retry = req->r_attempts - 1; 1717 rhead->num_retry = req->r_attempts - 1;
1718 rhead->ino = 0;
1535 1719
1536 dout(" r_locked_dir = %p\n", req->r_locked_dir); 1720 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1537
1538 if (req->r_target_inode && req->r_got_unsafe)
1539 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1540 else
1541 rhead->ino = 0;
1542 return 0; 1721 return 0;
1543} 1722}
1544 1723
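This is where the r_request_release_offset saved earlier in create_request_message() pays off: everything past the encoded filepaths is cap/dentry releases, so a replayed request can simply be truncated back to that point instead of being re-encoded (which, per the comment above, would break for renames):

        msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
        msg->front.iov_len = req->r_request_release_offset;
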
@@ -1552,7 +1731,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1731 int mds = -1;
1553 int err = -EAGAIN; 1732 int err = -EAGAIN;
1554 1733
1555 if (req->r_reply) 1734 if (req->r_err || req->r_got_result)
1556 goto out; 1735 goto out;
1557 1736
1558 if (req->r_timeout && 1737 if (req->r_timeout &&
@@ -1609,7 +1788,7 @@ out:
1609 return err; 1788 return err;
1610 1789
1611finish: 1790finish:
1612 req->r_reply = ERR_PTR(err); 1791 req->r_err = err;
1613 complete_request(mdsc, req); 1792 complete_request(mdsc, req);
1614 goto out; 1793 goto out;
1615} 1794}
@@ -1630,10 +1809,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1809
1631/* 1810/*
1632 * Wake up threads with requests pending for @mds, so that they can 1811 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1812 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1813 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1814static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1815{
1638 struct ceph_mds_request *req; 1816 struct ceph_mds_request *req;
1639 struct rb_node *p; 1817 struct rb_node *p;
@@ -1689,64 +1867,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1867 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1868 __do_request(mdsc, req);
1691 1869
1692 /* wait */ 1870 if (req->r_err) {
1693 if (!req->r_reply) { 1871 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1872 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1873 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1874 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1875 }
1710 1876
1711 if (IS_ERR(req->r_reply)) { 1877 /* wait */
1712 err = PTR_ERR(req->r_reply); 1878 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1879 dout("do_request waiting\n");
1880 if (req->r_timeout) {
1881 err = (long)wait_for_completion_killable_timeout(
1882 &req->r_completion, req->r_timeout);
1883 if (err == 0)
1884 err = -EIO;
1885 } else {
1886 err = wait_for_completion_killable(&req->r_completion);
1887 }
1888 dout("do_request waited, got %d\n", err);
1889 mutex_lock(&mdsc->mutex);
1714 1890
1715 if (err == -ERESTARTSYS) { 1891 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1892 if (req->r_got_result) {
1717 req->r_aborted = true; 1893 err = le32_to_cpu(req->r_reply_info.head->result);
1894 } else if (err < 0) {
1895 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1896
1719 if (req->r_locked_dir && 1897 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1898 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1899 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1900 * rely on locks (dir mutex) held by our caller.
1901 */
1902 mutex_lock(&req->r_fill_mutex);
1903 req->r_err = err;
1904 req->r_aborted = true;
1905 mutex_unlock(&req->r_fill_mutex);
1723 1906
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1907 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1908 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1909 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1910 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1911 err = req->r_err;
1742 } 1912 }
1743 mutex_unlock(&mdsc->mutex);
1744 1913
1914out:
1915 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1916 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1917 return err;
1747} 1918}
1748 1919
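Two details in the reworked wait path deserve a note. First, the _killable wait variants wake only on fatal signals, so an ordinary caught signal no longer aborts an in-flight MDS request. Second, aborting must be serialized against the reply path: setting r_aborted under r_fill_mutex guarantees that ceph_fill_trace()/ceph_readdir_prepopulate() either observe the abort or run to completion under the caller's dir locks, never half of each:

        mutex_lock(&req->r_fill_mutex);
        req->r_err = err;
        req->r_aborted = true;  /* the fill path runs under this same mutex */
        mutex_unlock(&req->r_fill_mutex);
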
1749/* 1920/*
1921 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1922 * namespace request.
1923 */
1924void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1925{
1926 struct inode *inode = req->r_locked_dir;
1927 struct ceph_inode_info *ci = ceph_inode(inode);
1928
1929 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1930 spin_lock(&inode->i_lock);
1931 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1932 ci->i_release_count++;
1933 spin_unlock(&inode->i_lock);
1934
1935 if (req->r_dentry)
1936 ceph_invalidate_dentry_lease(req->r_dentry);
1937 if (req->r_old_dentry)
1938 ceph_invalidate_dentry_lease(req->r_old_dentry);
1939}
1940
1941/*
1750 * Handle mds reply. 1942 * Handle mds reply.
1751 * 1943 *
1752 * We take the session mutex and parse and process the reply immediately. 1944 * We take the session mutex and parse and process the reply immediately.
@@ -1797,29 +1989,54 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1989 mutex_unlock(&mdsc->mutex);
1798 goto out; 1990 goto out;
1799 } 1991 }
1992 if (req->r_got_safe && !head->safe) {
1993 pr_warning("got unsafe after safe on %llu from mds%d\n",
1994 tid, mds);
1995 mutex_unlock(&mdsc->mutex);
1996 goto out;
1997 }
1800 1998
1801 result = le32_to_cpu(head->result); 1999 result = le32_to_cpu(head->result);
1802 2000
1803 /* 2001 /*
1804 * Tolerate 2 consecutive ESTALEs from the same mds. 2002 * Handle an ESTALE
1805 * FIXME: we should be looking at the cap migrate_seq. 2003 * if we're not talking to the authority, send to them
2004 * if the authority has changed while we weren't looking,
2005 * send to new authority
2006 * Otherwise we just have to return an ESTALE
1806 */ 2007 */
1807 if (result == -ESTALE) { 2008 if (result == -ESTALE) {
1808 req->r_direct_mode = USE_AUTH_MDS; 2009 dout("got ESTALE on request %llu", req->r_tid);
 1808 req->r_direct_mode = USE_AUTH_MDS; 2009 dout("got ESTALE on request %llu\n", req->r_tid);
1810 if (req->r_num_stale <= 2) { 2011 /* do nothing; not an authority problem */
2012 } else if (req->r_direct_mode != USE_AUTH_MDS) {
 2013 dout("not using auth, setting for that now\n");
2014 req->r_direct_mode = USE_AUTH_MDS;
1811 __do_request(mdsc, req); 2015 __do_request(mdsc, req);
1812 mutex_unlock(&mdsc->mutex); 2016 mutex_unlock(&mdsc->mutex);
1813 goto out; 2017 goto out;
2018 } else {
2019 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2020 struct ceph_cap *cap =
 2021 ceph_get_cap_for_mds(ci, req->r_mds);
2022
 2023 dout("already using auth\n");
2024 if ((!cap || cap != ci->i_auth_cap) ||
2025 (cap->mseq != req->r_sent_on_mseq)) {
 2026 dout("but cap changed, so resending\n");
2027 __do_request(mdsc, req);
2028 mutex_unlock(&mdsc->mutex);
2029 goto out;
2030 }
1814 } 2031 }
 1815 } else { 2032 dout("have to return ESTALE on request %llu\n", req->r_tid);
1816 req->r_num_stale = 0;
1817 } 2033 }
1818 2034
2035
1819 if (head->safe) { 2036 if (head->safe) {
1820 req->r_got_safe = true; 2037 req->r_got_safe = true;
1821 __unregister_request(mdsc, req); 2038 __unregister_request(mdsc, req);
1822 complete(&req->r_safe_completion); 2039 complete_all(&req->r_safe_completion);
1823 2040
1824 if (req->r_got_unsafe) { 2041 if (req->r_got_unsafe) {
1825 /* 2042 /*
@@ -1834,15 +2051,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1834 2051
1835 /* last unsafe request during umount? */ 2052 /* last unsafe request during umount? */
1836 if (mdsc->stopping && !__get_oldest_req(mdsc)) 2053 if (mdsc->stopping && !__get_oldest_req(mdsc))
1837 complete(&mdsc->safe_umount_waiters); 2054 complete_all(&mdsc->safe_umount_waiters);
1838 mutex_unlock(&mdsc->mutex); 2055 mutex_unlock(&mdsc->mutex);
1839 goto out; 2056 goto out;
1840 } 2057 }
1841 } 2058 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 2059 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 2060 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 2061 }
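The r_sent_on_mseq recorded in __prepare_send_request() drives the last ESTALE branch above: if the cap's migration sequence advanced after the request went out, the cap may have migrated to another MDS, so one more resend is worth trying before returning ESTALE to the caller. In outline:

        if (!cap || cap != ci->i_auth_cap ||
            cap->mseq != req->r_sent_on_mseq)
                __do_request(mdsc, req);        /* authority may have moved */
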
@@ -1871,23 +2084,32 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 2084 }
1872 2085
1873 /* insert trace into our cache */ 2086 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 2089 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 2090 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 2091 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 2092 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
1879 } 2093 }
2094 mutex_unlock(&req->r_fill_mutex);
1880 2095
1881 up_read(&mdsc->snap_rwsem); 2096 up_read(&mdsc->snap_rwsem);
1882out_err: 2097out_err:
1883 if (err) { 2098 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 2099 if (!req->r_aborted) {
2100 if (err) {
2101 req->r_err = err;
2102 } else {
2103 req->r_reply = msg;
2104 ceph_msg_get(msg);
2105 req->r_got_result = true;
2106 }
1885 } else { 2107 } else {
1886 req->r_reply = msg; 2108 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 2109 }
2110 mutex_unlock(&mdsc->mutex);
1889 2111
1890 add_cap_releases(mdsc, req->r_session, -1); 2112 ceph_add_cap_releases(mdsc, req->r_session);
1891 mutex_unlock(&session->s_mutex); 2113 mutex_unlock(&session->s_mutex);
1892 2114
1893 /* kick calling process */ 2115 /* kick calling process */
@@ -1921,16 +2143,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1921 mutex_lock(&mdsc->mutex); 2143 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid); 2144 req = __lookup_request(mdsc, tid);
1923 if (!req) { 2145 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2146 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */ 2147 goto out; /* dup reply? */
1926 } 2148 }
1927 2149
1928 if (fwd_seq <= req->r_num_fwd) { 2150 if (req->r_aborted) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n", 2151 dout("forward tid %llu aborted, unregistering\n", tid);
2152 __unregister_request(mdsc, req);
2153 } else if (fwd_seq <= req->r_num_fwd) {
2154 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq); 2155 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else { 2156 } else {
1932 /* resend. forward race not possible; mds would drop */ 2157 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2158 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2159 BUG_ON(req->r_err);
2160 BUG_ON(req->r_got_result);
1934 req->r_num_fwd = fwd_seq; 2161 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds; 2162 req->r_resend_mds = next_mds;
1936 put_request_session(req); 2163 put_request_session(req);
@@ -1984,6 +2211,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2211
1985 switch (op) { 2212 switch (op) {
1986 case CEPH_SESSION_OPEN: 2213 case CEPH_SESSION_OPEN:
2214 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2215 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2216 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2217 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2218 wake = 1;
@@ -1997,10 +2226,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2226 break;
1998 2227
1999 case CEPH_SESSION_CLOSE: 2228 case CEPH_SESSION_CLOSE:
2229 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2230 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2231 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2232 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2233 wake_up_all(&mdsc->session_close_wq);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2234 kick_requests(mdsc, mds);
2004 break; 2235 break;
2005 2236
2006 case CEPH_SESSION_STALE: 2237 case CEPH_SESSION_STALE:
@@ -2066,9 +2297,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2066static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2297static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2067 void *arg) 2298 void *arg)
2068{ 2299{
2069 struct ceph_mds_cap_reconnect rec; 2300 union {
2301 struct ceph_mds_cap_reconnect v2;
2302 struct ceph_mds_cap_reconnect_v1 v1;
2303 } rec;
2304 size_t reclen;
2070 struct ceph_inode_info *ci; 2305 struct ceph_inode_info *ci;
2071 struct ceph_pagelist *pagelist = arg; 2306 struct ceph_reconnect_state *recon_state = arg;
2307 struct ceph_pagelist *pagelist = recon_state->pagelist;
2072 char *path; 2308 char *path;
2073 int pathlen, err; 2309 int pathlen, err;
2074 u64 pathbase; 2310 u64 pathbase;
@@ -2088,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2088 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); 2324 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2089 if (IS_ERR(path)) { 2325 if (IS_ERR(path)) {
2090 err = PTR_ERR(path); 2326 err = PTR_ERR(path);
2091 BUG_ON(err); 2327 goto out_dput;
2092 } 2328 }
2093 } else { 2329 } else {
2094 path = NULL; 2330 path = NULL;
@@ -2096,25 +2332,55 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2096 } 2332 }
2097 err = ceph_pagelist_encode_string(pagelist, path, pathlen); 2333 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2098 if (err) 2334 if (err)
2099 goto out; 2335 goto out_free;
2100 2336
2101 spin_lock(&inode->i_lock); 2337 spin_lock(&inode->i_lock);
2102 cap->seq = 0; /* reset cap seq */ 2338 cap->seq = 0; /* reset cap seq */
2103 cap->issue_seq = 0; /* and issue_seq */ 2339 cap->issue_seq = 0; /* and issue_seq */
2104 rec.cap_id = cpu_to_le64(cap->cap_id); 2340
2105 rec.pathbase = cpu_to_le64(pathbase); 2341 if (recon_state->flock) {
2106 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2342 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2107 rec.issued = cpu_to_le32(cap->issued); 2343 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2108 rec.size = cpu_to_le64(inode->i_size); 2344 rec.v2.issued = cpu_to_le32(cap->issued);
2109 ceph_encode_timespec(&rec.mtime, &inode->i_mtime); 2345 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2110 ceph_encode_timespec(&rec.atime, &inode->i_atime); 2346 rec.v2.pathbase = cpu_to_le64(pathbase);
2111 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2347 rec.v2.flock_len = 0;
2348 reclen = sizeof(rec.v2);
2349 } else {
2350 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2351 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2352 rec.v1.issued = cpu_to_le32(cap->issued);
2353 rec.v1.size = cpu_to_le64(inode->i_size);
2354 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2355 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2356 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2357 rec.v1.pathbase = cpu_to_le64(pathbase);
2358 reclen = sizeof(rec.v1);
2359 }
2112 spin_unlock(&inode->i_lock); 2360 spin_unlock(&inode->i_lock);
2113 2361
2114 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); 2362 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks;
2364
2365 lock_kernel();
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2367 rec.v2.flock_len = (2*sizeof(u32) +
2368 (num_fcntl_locks+num_flock_locks) *
2369 sizeof(struct ceph_filelock));
2370
2371 err = ceph_pagelist_append(pagelist, &rec, reclen);
2372 if (!err)
2373 err = ceph_encode_locks(inode, pagelist,
2374 num_fcntl_locks,
2375 num_flock_locks);
2376 unlock_kernel();
2377 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 }
2115 2380
2116out: 2381out_free:
2117 kfree(path); 2382 kfree(path);
2383out_dput:
2118 dput(dentry); 2384 dput(dentry);
2119 return err; 2385 return err;
2120} 2386}
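The reconnect record is now feature-gated: peers that advertise CEPH_FEATURE_FLOCK get the v2 layout with trailing file-lock state, older peers keep the v1 layout. lock_kernel() brackets both the counting and the encoding so the inode's lock list cannot change between computing flock_len and writing the records it sizes:

        /* two u32 lock counts, then the lock records themselves */
        rec.v2.flock_len = 2 * sizeof(u32) +
                (num_fcntl_locks + num_flock_locks) *
                sizeof(struct ceph_filelock);
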
@@ -2132,59 +2398,53 @@ out:
2132 * 2398 *
2133 * called with mdsc->mutex held. 2399 * called with mdsc->mutex held.
2134 */ 2400 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2401static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2402 struct ceph_mds_session *session)
2136{ 2403{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2404 struct ceph_msg *reply;
2139 struct rb_node *p; 2405 struct rb_node *p;
2406 int mds = session->s_mds;
2140 int err = -ENOMEM; 2407 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2408 struct ceph_pagelist *pagelist;
2409 struct ceph_reconnect_state recon_state;
2142 2410
2143 pr_info("reconnect to recovering mds%d\n", mds); 2411 pr_info("mds%d reconnect start\n", mds);
2144 2412
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2413 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2414 if (!pagelist)
2147 goto fail_nopagelist; 2415 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2416 ceph_pagelist_init(pagelist);
2149 2417
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2418 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2419 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2420 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2421
2160 if (session) { 2422 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2423 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2424 session->s_seq = 0;
2162 2425
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2426 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2427 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2428
2166 ceph_con_open(&session->s_con, 2429 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2430 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2431
2176 down_read(&mdsc->snap_rwsem); 2432 down_read(&mdsc->snap_rwsem);
2177 2433
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2434 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2435 session_state_name(session->s_state));
2182 2436
2437 /* drop old cap expires; we're about to reestablish that state */
2438 discard_cap_releases(mdsc, session);
2439
2183 /* traverse this session's caps */ 2440 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2441 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2442 if (err)
2186 goto fail; 2443 goto fail;
2187 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2444
2445 recon_state.pagelist = pagelist;
2446 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2447 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2188 if (err < 0) 2448 if (err < 0)
2189 goto fail; 2449 goto fail;
2190 2450
@@ -2208,36 +2468,31 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2468 goto fail;
2209 } 2469 }
2210 2470
2211send:
2212 reply->pagelist = pagelist; 2471 reply->pagelist = pagelist;
2472 if (recon_state.flock)
2473 reply->hdr.version = cpu_to_le16(2);
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2474 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2475 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2476 ceph_con_send(&session->s_con, reply);
2216 2477
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2478 mutex_unlock(&session->s_mutex);
2219 2479
2220 mutex_lock(&mdsc->mutex); 2480 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2481 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2482 mutex_unlock(&mdsc->mutex);
2223 2483
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2484 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2485 return;
2229 2486
2230fail: 2487fail:
2231 ceph_msg_put(reply); 2488 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2489 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2490 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2491fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2492 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2493 kfree(pagelist);
2238fail_nopagelist: 2494fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2495 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2496 return;
2242} 2497}
2243 2498
@@ -2266,9 +2521,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2266 oldstate = ceph_mdsmap_get_state(oldmap, i); 2521 oldstate = ceph_mdsmap_get_state(oldmap, i);
2267 newstate = ceph_mdsmap_get_state(newmap, i); 2522 newstate = ceph_mdsmap_get_state(newmap, i);
2268 2523
2269 dout("check_new_map mds%d state %s -> %s (session %s)\n", 2524 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2270 i, ceph_mds_state_name(oldstate), 2525 i, ceph_mds_state_name(oldstate),
2526 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2271 ceph_mds_state_name(newstate), 2527 ceph_mds_state_name(newstate),
2528 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2272 session_state_name(s->s_state)); 2529 session_state_name(s->s_state));
2273 2530
2274 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2531 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2290,7 +2547,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2547 }
2291 2548
2292 /* kick any requests waiting on the recovering mds */ 2549 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2550 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2551 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2552 continue; /* nothing new with this mds */
2296 } 2553 }
@@ -2299,26 +2556,40 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2556 * send reconnect?
2300 */ 2557 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2558 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2559 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2560 mutex_unlock(&mdsc->mutex);
2561 send_mds_reconnect(mdsc, s);
2562 mutex_lock(&mdsc->mutex);
2563 }
2304 2564
2305 /* 2565 /*
 2306 * kick requests on any mds that has gone active. 2566 * kick requests on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2567 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2568 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2569 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2570 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2571 oldstate != CEPH_MDS_STATE_STARTING)
2572 pr_info("mds%d recovery completed\n", s->s_mds);
2573 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2574 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2575 wake_up_session_caps(s, 1);
2320 } 2576 }
2321 } 2577 }
2578
2579 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2580 s = mdsc->sessions[i];
2581 if (!s)
2582 continue;
2583 if (!ceph_mdsmap_is_laggy(newmap, i))
2584 continue;
2585 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2586 s->s_state == CEPH_MDS_SESSION_HUNG ||
2587 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2588 dout(" connecting to export targets of laggy mds%d\n",
2589 i);
2590 __open_export_target_sessions(mdsc, s);
2591 }
2592 }
2322} 2593}
2323 2594
2324 2595
@@ -2349,6 +2620,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2349 struct ceph_dentry_info *di; 2620 struct ceph_dentry_info *di;
2350 int mds = session->s_mds; 2621 int mds = session->s_mds;
2351 struct ceph_mds_lease *h = msg->front.iov_base; 2622 struct ceph_mds_lease *h = msg->front.iov_base;
2623 u32 seq;
2352 struct ceph_vino vino; 2624 struct ceph_vino vino;
2353 int mask; 2625 int mask;
2354 struct qstr dname; 2626 struct qstr dname;
@@ -2362,6 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2362 vino.ino = le64_to_cpu(h->ino); 2634 vino.ino = le64_to_cpu(h->ino);
2363 vino.snap = CEPH_NOSNAP; 2635 vino.snap = CEPH_NOSNAP;
2364 mask = le16_to_cpu(h->mask); 2636 mask = le16_to_cpu(h->mask);
2637 seq = le32_to_cpu(h->seq);
2365 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2638 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2366 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2639 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2367 if (dname.len != get_unaligned_le32(h+1)) 2640 if (dname.len != get_unaligned_le32(h+1))
@@ -2372,8 +2645,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2372 2645
2373 /* lookup inode */ 2646 /* lookup inode */
2374 inode = ceph_find_inode(sb, vino); 2647 inode = ceph_find_inode(sb, vino);
2375 dout("handle_lease '%s', mask %d, ino %llx %p\n", 2648 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
2376 ceph_lease_op_name(h->action), mask, vino.ino, inode); 2649 ceph_lease_op_name(h->action), mask, vino.ino, inode,
2650 dname.len, dname.name);
2377 if (inode == NULL) { 2651 if (inode == NULL) {
2378 dout("handle_lease no inode %llx\n", vino.ino); 2652 dout("handle_lease no inode %llx\n", vino.ino);
2379 goto release; 2653 goto release;
@@ -2398,7 +2672,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2398 switch (h->action) { 2672 switch (h->action) {
2399 case CEPH_MDS_LEASE_REVOKE: 2673 case CEPH_MDS_LEASE_REVOKE:
2400 if (di && di->lease_session == session) { 2674 if (di && di->lease_session == session) {
2401 h->seq = cpu_to_le32(di->lease_seq); 2675 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2676 h->seq = cpu_to_le32(di->lease_seq);
2402 __ceph_mdsc_drop_dentry_lease(dentry); 2677 __ceph_mdsc_drop_dentry_lease(dentry);
2403 } 2678 }
2404 release = 1; 2679 release = 1;
@@ -2412,7 +2687,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2412 unsigned long duration = 2687 unsigned long duration =
2413 le32_to_cpu(h->duration_ms) * HZ / 1000; 2688 le32_to_cpu(h->duration_ms) * HZ / 1000;
2414 2689
2415 di->lease_seq = le32_to_cpu(h->seq); 2690 di->lease_seq = seq;
2416 dentry->d_time = di->lease_renew_from + duration; 2691 dentry->d_time = di->lease_renew_from + duration;
2417 di->lease_renew_after = di->lease_renew_from + 2692 di->lease_renew_after = di->lease_renew_from +
2418 (duration >> 1); 2693 (duration >> 1);
@@ -2457,12 +2732,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2732 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2733 len += dnamelen;
2459 2734
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2735 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2736 if (!msg)
2462 return; 2737 return;
2463 lease = msg->front.iov_base; 2738 lease = msg->front.iov_base;
2464 lease->action = action; 2739 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2740 lease->mask = cpu_to_le16(1);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2741 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2742 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq); 2743 lease->seq = cpu_to_le32(seq);
@@ -2492,7 +2767,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2492 2767
2493 BUG_ON(inode == NULL); 2768 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL); 2769 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN); 2770 BUG_ON(mask == 0);
2496 2771
2497 /* is dentry lease valid? */ 2772 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock); 2773 spin_lock(&dentry->d_lock);
@@ -2602,8 +2877,10 @@ static void delayed_work(struct work_struct *work)
2602 send_renew_caps(mdsc, s); 2877 send_renew_caps(mdsc, s);
2603 else 2878 else
2604 ceph_con_keepalive(&s->s_con); 2879 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2880 ceph_add_cap_releases(mdsc, s);
2606 send_cap_releases(mdsc, s); 2881 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2882 s->s_state == CEPH_MDS_SESSION_HUNG)
2883 ceph_send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2884 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2885 ceph_put_mds_session(s);
2609 2886
@@ -2620,8 +2897,11 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2897 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2898 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL)
2901 return -ENOMEM;
2902
2623 init_completion(&mdsc->safe_umount_waiters); 2903 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2904 init_waitqueue_head(&mdsc->session_close_wq);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2905 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2626 mdsc->sessions = NULL; 2906 mdsc->sessions = NULL;
2627 mdsc->max_sessions = 0; 2907 mdsc->max_sessions = 0;
@@ -2645,6 +2925,10 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2925 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2926 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2927 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928
2929 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps);
2931
2648 return 0; 2932 return 0;
2649} 2933}
2650 2934
@@ -2689,6 +2973,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2689 drop_leases(mdsc); 2973 drop_leases(mdsc);
2690 ceph_flush_dirty_caps(mdsc); 2974 ceph_flush_dirty_caps(mdsc);
2691 wait_requests(mdsc); 2975 wait_requests(mdsc);
2976
2977 /*
2978 * wait for reply handlers to drop their request refs and
2979 * their inode/dcache refs
2980 */
2981 ceph_msgr_flush();
2692} 2982}
2693 2983
2694/* 2984/*
@@ -2740,6 +3030,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 3030{
2741 u64 want_tid, want_flush; 3031 u64 want_tid, want_flush;
2742 3032
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return;
3035
2743 dout("sync\n"); 3036 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 3037 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 3038 want_tid = mdsc->last_tid;
@@ -2753,6 +3046,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2753 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3046 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2754} 3047}
2755 3048
3049/*
 3050 * true if all sessions are closed, or we are force-unmounting
3051 */
3052bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{
3054 int i, n = 0;
3055
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true;
3058
3059 mutex_lock(&mdsc->mutex);
3060 for (i = 0; i < mdsc->max_sessions; i++)
3061 if (mdsc->sessions[i])
3062 n++;
3063 mutex_unlock(&mdsc->mutex);
3064 return n == 0;
3065}
2756 3066
2757/* 3067/*
2758 * called after sb is ro. 3068 * called after sb is ro.
@@ -2761,45 +3071,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2761{ 3071{
2762 struct ceph_mds_session *session; 3072 struct ceph_mds_session *session;
2763 int i; 3073 int i;
2764 int n;
2765 struct ceph_client *client = mdsc->client; 3074 struct ceph_client *client = mdsc->client;
2766 unsigned long started, timeout = client->mount_args->mount_timeout * HZ; 3075 unsigned long timeout = client->mount_args->mount_timeout * HZ;
2767 3076
2768 dout("close_sessions\n"); 3077 dout("close_sessions\n");
2769 3078
2770 mutex_lock(&mdsc->mutex);
2771
2772 /* close sessions */ 3079 /* close sessions */
2773 started = jiffies; 3080 mutex_lock(&mdsc->mutex);
2774 while (time_before(jiffies, started + timeout)) { 3081 for (i = 0; i < mdsc->max_sessions; i++) {
2775 dout("closing sessions\n"); 3082 session = __ceph_lookup_mds_session(mdsc, i);
2776 n = 0; 3083 if (!session)
2777 for (i = 0; i < mdsc->max_sessions; i++) { 3084 continue;
2778 session = __ceph_lookup_mds_session(mdsc, i);
2779 if (!session)
2780 continue;
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 __close_session(mdsc, session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 n++;
2788 }
2789 if (n == 0)
2790 break;
2791
2792 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2793 break;
2794
2795 dout("waiting for sessions to close\n");
2796 mutex_unlock(&mdsc->mutex); 3085 mutex_unlock(&mdsc->mutex);
2797 wait_for_completion_timeout(&mdsc->session_close_waiters, 3086 mutex_lock(&session->s_mutex);
2798 timeout); 3087 __close_session(mdsc, session);
3088 mutex_unlock(&session->s_mutex);
3089 ceph_put_mds_session(session);
2799 mutex_lock(&mdsc->mutex); 3090 mutex_lock(&mdsc->mutex);
2800 } 3091 }
3092 mutex_unlock(&mdsc->mutex);
3093
3094 dout("waiting for sessions to close\n");
3095 wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
3096 timeout);
2801 3097
2802 /* tear down remaining sessions */ 3098 /* tear down remaining sessions */
3099 mutex_lock(&mdsc->mutex);
2803 for (i = 0; i < mdsc->max_sessions; i++) { 3100 for (i = 0; i < mdsc->max_sessions; i++) {
2804 if (mdsc->sessions[i]) { 3101 if (mdsc->sessions[i]) {
2805 session = get_session(mdsc->sessions[i]); 3102 session = get_session(mdsc->sessions[i]);
@@ -2812,9 +3109,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2812 mutex_lock(&mdsc->mutex); 3109 mutex_lock(&mdsc->mutex);
2813 } 3110 }
2814 } 3111 }
2815
2816 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 3112 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2817
2818 mutex_unlock(&mdsc->mutex); 3113 mutex_unlock(&mdsc->mutex);
2819 3114
2820 ceph_cleanup_empty_realms(mdsc); 3115 ceph_cleanup_empty_realms(mdsc);
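Swapping the completion for a wait queue matches the semantics better: a completion counts discrete events, which is why the old code had to loop, re-close, and re-wait, whereas wait_event_timeout() re-evaluates an arbitrary predicate on every wakeup. The resulting pair:

        /* waker, on CEPH_SESSION_CLOSE: */
        wake_up_all(&mdsc->session_close_wq);

        /* waiter: */
        wait_event_timeout(mdsc->session_close_wq,
                           done_closing_sessions(mdsc), timeout);
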
@@ -2831,6 +3126,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2831 if (mdsc->mdsmap) 3126 if (mdsc->mdsmap)
2832 ceph_mdsmap_destroy(mdsc->mdsmap); 3127 ceph_mdsmap_destroy(mdsc->mdsmap);
2833 kfree(mdsc->sessions); 3128 kfree(mdsc->sessions);
3129 ceph_caps_finalize(mdsc);
2834} 3130}
2835 3131
2836 3132
@@ -2922,9 +3218,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3218static void peer_reset(struct ceph_connection *con)
2923{ 3219{
2924 struct ceph_mds_session *s = con->private; 3220 struct ceph_mds_session *s = con->private;
3221 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3222
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3223 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3224 send_mds_reconnect(mdsc, s);
2928} 3225}
2929 3226
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3227static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3328,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3328 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3329}
3033 3330
3034const static struct ceph_connection_operations mds_con_ops = { 3331static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3332 .get = con_get,
3036 .put = con_put, 3333 .put = con_put,
3037 .dispatch = dispatch, 3334 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
151struct ceph_mds_request { 151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */ 152 u64 r_tid; /* transaction id */
153 struct rb_node r_node; 153 struct rb_node r_node;
154 struct ceph_mds_client *r_mdsc;
154 155
155 int r_op; /* mds op code */ 156 int r_op; /* mds op code */
156 int r_mds; 157 int r_mds;
@@ -165,6 +166,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 166 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 167 struct inode *r_target_inode; /* resulting inode */
167 168
169 struct mutex r_fill_mutex;
170
168 union ceph_mds_request_args r_args; 171 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 172 int r_fmode; /* file mode, if expecting cap */
170 173
@@ -186,6 +189,7 @@ struct ceph_mds_request {
186 int r_old_inode_drop, r_old_inode_unless; 189 int r_old_inode_drop, r_old_inode_unless;
187 190
188 struct ceph_msg *r_request; /* original request */ 191 struct ceph_msg *r_request; /* original request */
192 int r_request_release_offset;
189 struct ceph_msg *r_reply; 193 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info; 194 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err; 195 int r_err;
@@ -204,8 +208,8 @@ struct ceph_mds_request {
204 208
205 int r_attempts; /* resend attempts */ 209 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */ 210 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/ 211 int r_resend_mds; /* mds to resend to next, if any*/
212 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
209 213
210 struct kref r_kref; 214 struct kref r_kref;
211 struct list_head r_wait; 215 struct list_head r_wait;
@@ -213,7 +217,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 217 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 218 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 219 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 220 bool r_got_unsafe, r_got_safe, r_got_result;
217 221
218 bool r_did_prepopulate; 222 bool r_did_prepopulate;
219 u32 r_readdir_offset; 223 u32 r_readdir_offset;
@@ -230,7 +234,8 @@ struct ceph_mds_client {
230 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
231 235
232 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters; 237 struct completion safe_umount_waiters;
238 wait_queue_head_t session_close_wq;
234 struct list_head waiting_for_map; 239 struct list_head waiting_for_map;
235 240
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 241 struct ceph_mds_session **sessions; /* NULL for mds if no session */
@@ -264,6 +269,27 @@ struct ceph_mds_client {
264 spinlock_t cap_dirty_lock; /* protects above items */ 269 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq; 270 wait_queue_head_t cap_flushing_wq;
266 271
272 /*
273 * Cap reservations
274 *
275 * Maintain a global pool of preallocated struct ceph_caps, referenced
276 * by struct ceph_caps_reservations. This ensures that we preallocate
277 * memory needed to successfully process an MDS response. (If an MDS
278 * sends us cap information and we fail to process it, we will have
279 * problems due to the client and MDS being out of sync.)
280 *
281 * Reservations are 'owned' by a ceph_cap_reservation context.
282 */
283 spinlock_t caps_list_lock;
284 struct list_head caps_list; /* unused (reserved or
285 unreserved) */
286 int caps_total_count; /* total caps allocated */
287 int caps_use_count; /* in use */
288 int caps_reserve_count; /* unused, reserved */
289 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many
291 (unreserved) */
292
267#ifdef CONFIG_DEBUG_FS 293#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file; 294 struct dentry *debugfs_file;
269#endif 295#endif
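The counters above back the preallocation pool described in the comment. A typical use reserves caps before processing an MDS reply and returns the unused ones afterwards; ceph_unreserve_caps() appears in the handle_reply hunk earlier, and a matching reserve would look roughly like this (a sketch assuming the caps.c helpers take the mdsc plus a reservation context):

        struct ceph_cap_reservation rsv = { 0 };

        ceph_reserve_caps(mdsc, &rsv, need);    /* prealloc 'need' caps up front */
        /* ... process the reply; new caps are drawn from &rsv ... */
        ceph_unreserve_caps(mdsc, &rsv);        /* hand leftovers back to the pool */
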
@@ -301,6 +327,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 327 struct inode *inode,
302 struct dentry *dn, int mask); 328 struct dentry *dn, int mask);
303 329
330extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
331
304extern struct ceph_mds_request * 332extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 333ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 334extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -318,6 +346,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
318 kref_put(&req->r_kref, ceph_mdsc_release_request); 346 kref_put(&req->r_kref, ceph_mdsc_release_request);
319} 347}
320 348
349extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
350 struct ceph_mds_session *session);
351extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
352 struct ceph_mds_session *session);
353
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 354extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322 355
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 356extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
@@ -332,4 +365,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 365extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg); 366 struct ceph_msg *msg);
334 367
368extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
369 struct ceph_mds_session *session);
370
335#endif 371#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfef..040be6d1150b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
85 struct ceph_entity_addr addr; 85 struct ceph_entity_addr addr;
86 u32 num_export_targets; 86 u32 num_export_targets;
87 void *pexport_targets = NULL; 87 void *pexport_targets = NULL;
88 struct ceph_timespec laggy_since;
88 89
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 90 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p); 91 global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
103 state_seq = ceph_decode_64(p); 104 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr)); 105 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr); 106 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec); 107 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
107 *p += sizeof(u32); 108 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad); 109 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen; 110 *p += namelen;
@@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
122 m->m_info[mds].global_id = global_id; 123 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state; 124 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr; 125 m->m_info[mds].addr = addr;
126 m->m_info[mds].laggy =
127 (laggy_since.tv_sec != 0 ||
128 laggy_since.tv_nsec != 0);
125 m->m_info[mds].num_export_targets = num_export_targets; 129 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) { 130 if (num_export_targets) {
127 m->m_info[mds].export_targets = 131 m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index eacc131aa5cb..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -13,6 +13,7 @@ struct ceph_mds_info {
13 struct ceph_entity_addr addr; 13 struct ceph_entity_addr addr;
14 s32 state; 14 s32 state;
15 int num_export_targets; 15 int num_export_targets;
16 bool laggy;
16 u32 *export_targets; 17 u32 *export_targets;
17}; 18};
18 19
@@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
47 return m->m_info[w].state; 48 return m->m_info[w].state;
48} 49}
49 50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); 58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); 59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); 60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..2502d76fcec1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,23 +39,12 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
57#define MAX_ADDR_STR 20 45#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][40]; 46#define MAX_ADDR_STR_LEN 60
47static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
59static DEFINE_SPINLOCK(addr_str_lock); 48static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str; 49static int last_addr_str;
61 50
@@ -64,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
64 int i; 53 int i;
65 char *s; 54 char *s;
66 struct sockaddr_in *in4 = (void *)ss; 55 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss; 56 struct sockaddr_in6 *in6 = (void *)ss;
69 57
70 spin_lock(&addr_str_lock); 58 spin_lock(&addr_str_lock);
@@ -76,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
76 64
77 switch (ss->ss_family) { 65 switch (ss->ss_family) {
78 case AF_INET: 66 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u", 67 snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
80 (unsigned int)quad[0], 68 (unsigned int)ntohs(in4->sin_port));
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break; 69 break;
86 70
87 case AF_INET6: 71 case AF_INET6:
88 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", 72 snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
89 in6->sin6_addr.s6_addr16[0], 73 (unsigned int)ntohs(in6->sin6_port));
90 in6->sin6_addr.s6_addr16[1],
91 in6->sin6_addr.s6_addr16[2],
92 in6->sin6_addr.s6_addr16[3],
93 in6->sin6_addr.s6_addr16[4],
94 in6->sin6_addr.s6_addr16[5],
95 in6->sin6_addr.s6_addr16[6],
96 in6->sin6_addr.s6_addr16[7],
97 (unsigned int)ntohs(in6->sin6_port));
98 break; 74 break;
99 75
100 default: 76 default:
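%pI4 and %pI6c are printk's IP-address format extensions (%pI6c emits the compressed, colon-separated form), replacing the hand-rolled quad/hextet formatting; moving from sprintf() to snprintf() additionally bounds writes into the shared rotating buffer. For example:

        snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u",
                 &in4->sin_addr, (unsigned int)ntohs(in4->sin_port));
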
@@ -132,6 +108,12 @@ void ceph_msgr_exit(void)
132 destroy_workqueue(ceph_msgr_wq); 108 destroy_workqueue(ceph_msgr_wq);
133} 109}
134 110
111void ceph_msgr_flush(void)
112{
113 flush_workqueue(ceph_msgr_wq);
114}
115
116
135/* 117/*
136 * socket callback functions 118 * socket callback functions
137 */ 119 */
@@ -221,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
221 */ 203 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con) 204static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{ 205{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; 206 struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
225 struct socket *sock; 207 struct socket *sock;
226 int ret; 208 int ret;
227 209
228 BUG_ON(con->sock); 210 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 211 ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
212 IPPROTO_TCP, &sock);
230 if (ret) 213 if (ret)
231 return ERR_PTR(ret); 214 return ERR_PTR(ret);
232 con->sock = sock; 215 con->sock = sock;
@@ -240,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
240 223
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242 225
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); 226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK);
244 if (ret == -EINPROGRESS) { 228 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n", 229 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr), 230 pr_addr(&con->peer_addr.in_addr),
@@ -340,6 +324,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 324 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 325 con->out_msg = NULL;
342 } 326 }
327 con->out_keepalive_pending = false;
343 con->in_seq = 0; 328 con->in_seq = 0;
344 con->in_seq_acked = 0; 329 con->in_seq_acked = 0;
345} 330}
@@ -357,6 +342,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 342 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 343 mutex_lock(&con->mutex);
359 reset_connection(con); 344 reset_connection(con);
345 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 346 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 347 mutex_unlock(&con->mutex);
362 queue_con(con); 348 queue_con(con);
@@ -661,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 648 con->connect_seq, global_seq, proto);
663 649
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 653 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1013,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end,
1013 struct sockaddr_in *in4 = (void *)ss; 999 struct sockaddr_in *in4 = (void *)ss;
1014 struct sockaddr_in6 *in6 = (void *)ss; 1000 struct sockaddr_in6 *in6 = (void *)ss;
1015 int port; 1001 int port;
1002 char delim = ',';
1003
1004 if (*p == '[') {
1005 delim = ']';
1006 p++;
1007 }
1016 1008
1017 memset(ss, 0, sizeof(*ss)); 1009 memset(ss, 0, sizeof(*ss));
1018 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, 1010 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1019 ',', &ipend)) { 1011 delim, &ipend))
1020 ss->ss_family = AF_INET; 1012 ss->ss_family = AF_INET;
1021 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, 1013 else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1022 ',', &ipend)) { 1014 delim, &ipend))
1023 ss->ss_family = AF_INET6; 1015 ss->ss_family = AF_INET6;
1024 } else { 1016 else
1025 goto bad; 1017 goto bad;
1026 }
1027 p = ipend; 1018 p = ipend;
1028 1019
1020 if (delim == ']') {
1021 if (*p != ']') {
1022 dout("missing matching ']'\n");
1023 goto bad;
1024 }
1025 p++;
1026 }
1027
1029 /* port? */ 1028 /* port? */
1030 if (p < end && *p == ':') { 1029 if (p < end && *p == ':') {
1031 port = 0; 1030 port = 0;
@@ -1059,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end,
1059 return 0; 1058 return 0;
1060 1059
1061bad: 1060bad:
1062 pr_err("parse_ips bad ip '%s'\n", c); 1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1063 return -EINVAL; 1062 return -EINVAL;
1064} 1063}
1065 1064
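
The hunk above lets ceph_parse_ips accept the bracketed IPv6 form in monitor lists, e.g. "1.2.3.4:6789,[::1]:6790": a leading '[' switches the in4_pton/in6_pton delimiter from ',' to ']', and the matching ']' must then appear before the optional ':port'. A minimal user-space sketch of the same bracket handling, using POSIX inet_pton rather than the kernel helpers (the function name here is illustrative, not the kernel's):

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>

    /* Strip an optional [..] wrapper, then parse as IPv6. */
    static int parse_bracketed_ip6(const char *s, struct in6_addr *out)
    {
        char buf[INET6_ADDRSTRLEN];
        size_t len = strlen(s);

        if (len >= 2 && s[0] == '[' && s[len - 1] == ']') {
            if (len - 2 >= sizeof(buf))
                return -1;          /* too long to be an address */
            memcpy(buf, s + 1, len - 2);
            buf[len - 2] = '\0';
            s = buf;
        }
        return inet_pton(AF_INET6, s, out) == 1 ? 0 : -1;
    }

    int main(void)
    {
        struct in6_addr a;
        printf("%d %d\n", parse_bracketed_ip6("[::1]", &a),
               parse_bracketed_ip6("::1", &a));    /* both print 0 */
        return 0;
    }
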
@@ -1082,11 +1081,11 @@ static int process_banner(struct ceph_connection *con)
1082 sizeof(con->peer_addr)) != 0 && 1081 sizeof(con->peer_addr)) != 0 &&
1083 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1084 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1085 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", 1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1086 pr_addr(&con->peer_addr.in_addr), 1085 pr_addr(&con->peer_addr.in_addr),
1087 le64_to_cpu(con->peer_addr.nonce), 1086 (int)le32_to_cpu(con->peer_addr.nonce),
1088 pr_addr(&con->actual_peer_addr.in_addr), 1087 pr_addr(&con->actual_peer_addr.in_addr),
1089 le64_to_cpu(con->actual_peer_addr.nonce)); 1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1090 con->error_msg = "wrong peer at address"; 1089 con->error_msg = "wrong peer at address";
1091 return -1; 1090 return -1;
1092 } 1091 }
@@ -1233,6 +1232,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1232 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1233 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1234 con->connect_seq++;
1235 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1236 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1237 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1238 le32_to_cpu(con->in_reply.connect_seq),
@@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con)
1302 1302
1303 1303
1304static int read_partial_message_section(struct ceph_connection *con, 1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len, 1305 struct kvec *section,
1306 u32 *crc) 1306 unsigned int sec_len, u32 *crc)
1307{ 1307{
1308 int left; 1308 int left;
1309 int ret; 1309 int ret;
@@ -1399,22 +1399,22 @@ static int read_partial_message(struct ceph_connection *con)
1399 if (!con->in_msg) { 1399 if (!con->in_msg) {
1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1401 con->in_hdr.front_len, con->in_hdr.data_len); 1401 con->in_hdr.front_len, con->in_hdr.data_len);
1402 skip = 0;
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1403 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1404 if (skip) {
1404 /* skip this message */ 1405 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1406 dout("alloc_msg said skip message\n");
1407 BUG_ON(con->in_msg);
1406 con->in_base_pos = -front_len - middle_len - data_len - 1408 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1409 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1410 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1411 con->in_seq++;
1410 return 0; 1412 return 0;
1411 } 1413 }
1412 if (IS_ERR(con->in_msg)) { 1414 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1415 con->error_msg =
1416 "error allocating memory for incoming message"; 1416 "error allocating memory for incoming message";
1417 return ret; 1417 return -ENOMEM;
1418 } 1418 }
1419 m = con->in_msg; 1419 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1420 m->front.iov_len = 0; /* haven't read it yet */
@@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con)
1434 1434
1435 /* middle */ 1435 /* middle */
1436 if (m->middle) { 1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len, 1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1438 &con->in_middle_crc); 1439 &con->in_middle_crc);
1439 if (ret <= 0) 1440 if (ret <= 0)
1440 return ret; 1441 return ret;
@@ -1514,14 +1515,14 @@ static void process_message(struct ceph_connection *con)
1514 1515
1515 /* if first message, set peer_name */ 1516 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1517 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1518 con->peer_name = msg->hdr.src;
1518 1519
1519 con->in_seq++; 1520 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1521 mutex_unlock(&con->mutex);
1521 1522
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1523 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1524 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1525 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1526 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1527 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1528 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1547,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1547 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1548 atomic_read(&con->nref));
1548 1549
1549 mutex_lock(&con->mutex);
1550more: 1550more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1552
@@ -1639,7 +1639,6 @@ do_next:
1639done: 1639done:
1640 ret = 0; 1640 ret = 0;
1641out: 1641out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1642 dout("try_write done on %p\n", con);
1644 return ret; 1643 return ret;
1645} 1644}
@@ -1651,7 +1650,6 @@ out:
1651 */ 1650 */
1652static int try_read(struct ceph_connection *con) 1651static int try_read(struct ceph_connection *con)
1653{ 1652{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1653 int ret = -1;
1656 1654
1657 if (!con->sock) 1655 if (!con->sock)
@@ -1661,9 +1659,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1659 return 0;
1662 1660
1663 dout("try_read start on %p\n", con); 1661 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1662
1668more: 1663more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1664 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1753,6 @@ more:
1758done: 1753done:
1759 ret = 0; 1754 ret = 0;
1760out: 1755out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1756 dout("try_read done on %p\n", con);
1763 return ret; 1757 return ret;
1764 1758
@@ -1830,6 +1824,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1824 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1825 clear_bit(QUEUED, &con->state);
1832 1826
1827 mutex_lock(&con->mutex);
1828
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1829 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1830 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1831 con_close_socket(con);
@@ -1844,11 +1840,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1840 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1841 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1842 try_write(con) < 0) {
1843 mutex_unlock(&con->mutex);
1847 backoff = 1; 1844 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1845 ceph_fault(con); /* error/fault path */
1846 goto done_unlocked;
1849 } 1847 }
1850 1848
1851done: 1849done:
1850 mutex_unlock(&con->mutex);
1851
1852done_unlocked:
1852 clear_bit(BUSY, &con->state); 1853 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1854 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1855 if (test_bit(QUEUED, &con->state)) {
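
Taken together, the try_write/try_read/con_work hunks hoist con->mutex out of the two helpers and into the worker: con_work now takes the lock once, and the fault path unlocks before calling ceph_fault(), exiting through the new done_unlocked label. A pthread-based sketch of the same pattern (struct and function names are stand-ins, not the kernel code):

    #include <pthread.h>

    struct conn {
        pthread_mutex_t mutex;
    };

    /* Both helpers now assume conn->mutex is already held by the caller. */
    static int do_read(struct conn *con)  { (void)con; return 0; }
    static int do_write(struct conn *con) { (void)con; return 0; }
    static void handle_fault(struct conn *con) { (void)con; }

    static void worker(struct conn *con)
    {
        pthread_mutex_lock(&con->mutex);
        if (do_read(con) < 0 || do_write(con) < 0) {
            /* error path: drop the lock before the fault handler,
             * mirroring the jump to done_unlocked above */
            pthread_mutex_unlock(&con->mutex);
            handle_fault(con);
            return;
        }
        pthread_mutex_unlock(&con->mutex);
    }

    int main(void)
    {
        struct conn c = { .mutex = PTHREAD_MUTEX_INITIALIZER };
        worker(&c);
        return 0;
    }
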
@@ -1920,7 +1921,7 @@ out:
1920 /* 1921 /*
1921 * in case we faulted due to authentication, invalidate our 1922 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones. 1923 * current tickets so that we can get new ones.
1923 */ 1924 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) { 1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n"); 1926 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con); 1927 con->ops->invalidate_authorizer(con);
@@ -1947,7 +1948,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1948
1948 /* the zero page is needed if a request is "canceled" while the message 1949 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1950 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1951 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1952 if (!msgr->zero_page) {
1952 kfree(msgr); 1953 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1954 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1988,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1988 }
1988 1989
1989 /* set src+dst */ 1990 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1991 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1992
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1993 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1994
@@ -2020,20 +2019,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2020{ 2019{
2021 mutex_lock(&con->mutex); 2020 mutex_lock(&con->mutex);
2022 if (!list_empty(&msg->list_head)) { 2021 if (!list_empty(&msg->list_head)) {
2023 dout("con_revoke %p msg %p\n", con, msg); 2022 dout("con_revoke %p msg %p - was on queue\n", con, msg);
2024 list_del_init(&msg->list_head); 2023 list_del_init(&msg->list_head);
2025 ceph_msg_put(msg); 2024 ceph_msg_put(msg);
2026 msg->hdr.seq = 0; 2025 msg->hdr.seq = 0;
2027 if (con->out_msg == msg) { 2026 }
2028 ceph_msg_put(con->out_msg); 2027 if (con->out_msg == msg) {
2029 con->out_msg = NULL; 2028 dout("con_revoke %p msg %p - was sending\n", con, msg);
2030 } 2029 con->out_msg = NULL;
2031 if (con->out_kvec_is_msg) { 2030 if (con->out_kvec_is_msg) {
2032 con->out_skip = con->out_kvec_bytes; 2031 con->out_skip = con->out_kvec_bytes;
2033 con->out_kvec_is_msg = false; 2032 con->out_kvec_is_msg = false;
2034 } 2033 }
2035 } else { 2034 ceph_msg_put(msg);
2036 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); 2035 msg->hdr.seq = 0;
2037 } 2036 }
2038 mutex_unlock(&con->mutex); 2037 mutex_unlock(&con->mutex);
2039} 2038}
@@ -2083,12 +2082,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2082 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2083 * the new msg has a ref count of 1.
2085 */ 2084 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2085struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2086{
2089 struct ceph_msg *m; 2087 struct ceph_msg *m;
2090 2088
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2089 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2090 if (m == NULL)
2093 goto out; 2091 goto out;
2094 kref_init(&m->kref); 2092 kref_init(&m->kref);
@@ -2100,8 +2098,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2098 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2099 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2100 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2101 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2102 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2103 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2104 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2105 m->footer.middle_crc = 0;
@@ -2115,11 +2113,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2113 /* front */
2116 if (front_len) { 2114 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2115 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2116 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2117 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2118 m->front_is_vmalloc = true;
2121 } else { 2119 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2120 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2121 }
2124 if (m->front.iov_base == NULL) { 2122 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2123 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2133,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2133 m->middle = NULL;
2136 2134
2137 /* data */ 2135 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2136 m->nr_pages = 0;
2139 m->pages = pages; 2137 m->pages = NULL;
2140 m->pagelist = NULL; 2138 m->pagelist = NULL;
2141 2139
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2140 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2141 return m;
2145 2142
2146out2: 2143out2:
2147 ceph_msg_put(m); 2144 ceph_msg_put(m);
2148out: 2145out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2146 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2147 return NULL;
2151} 2148}
2152 2149
2153/* 2150/*
@@ -2190,29 +2187,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2187 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2188 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2189 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2190 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2191 return NULL;
2198 } 2192 }
2199 if (!msg) { 2193 if (!msg) {
2200 *skip = 0; 2194 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2195 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2196 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2197 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2198 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2199 return NULL;
2206 } 2200 }
2207 } 2201 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2202 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2203
2210 if (middle_len) { 2204 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2205 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2206 if (ret < 0) {
2214 ceph_msg_put(msg); 2207 ceph_msg_put(msg);
2215 return msg; 2208 return NULL;
2216 } 2209 }
2217 } 2210 }
2218 2211
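
A recurring theme in this file (and in osd_client.c below) is that ceph_msg_new() and ceph_alloc_msg() stop returning ERR_PTR(-ENOMEM) and simply return NULL, since out-of-memory is the only failure they can report. A small user-space illustration of the two conventions, with the ERR_PTR macros re-created only for the demo:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO    4095
    #define ERR_PTR(err) ((void *)(long)(err))
    #define PTR_ERR(ptr) ((long)(ptr))
    #define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    static void *alloc_old_style(int fail)    /* pre-patch convention */
    {
        return fail ? ERR_PTR(-ENOMEM) : malloc(16);
    }

    static void *alloc_new_style(int fail)    /* post-patch convention */
    {
        return fail ? NULL : malloc(16);      /* NULL now implies -ENOMEM */
    }

    int main(void)
    {
        void *p = alloc_old_style(1);

        if (IS_ERR(p))
            printf("old: error %ld\n", PTR_ERR(p));   /* -ENOMEM, i.e. -12 */
        if (!alloc_new_style(1))
            printf("new: NULL means -ENOMEM\n");
        return 0;
    }
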
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
215 213
216extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
218 217
219extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
236 235
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
241 238
242 239
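
The new ceph_msgr_flush() export is a thin wrapper around flush_workqueue() on the messenger workqueue, letting callers drain in-flight socket work before tearing connections down. A toy pthread analogue of what such a flush waits for (illustrative only, not the kernel implementation):

    #include <pthread.h>

    struct workqueue {
        int pending;                 /* queued-but-unfinished work items */
        pthread_mutex_t lock;
        pthread_cond_t idle;
    };

    static void work_done(struct workqueue *wq)
    {
        pthread_mutex_lock(&wq->lock);
        if (--wq->pending == 0)
            pthread_cond_broadcast(&wq->idle);
        pthread_mutex_unlock(&wq->lock);
    }

    /* flush: block until everything queued so far has run */
    static void flush(struct workqueue *wq)
    {
        pthread_mutex_lock(&wq->lock);
        while (wq->pending > 0)
            pthread_cond_wait(&wq->idle, &wq->lock);
        pthread_mutex_unlock(&wq->lock);
    }

    int main(void)
    {
        struct workqueue wq = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .idle = PTHREAD_COND_INITIALIZER,
        };

        wq.pending = 1;
        work_done(&wq);    /* the worker finishes... */
        flush(&wq);        /* ...so flush returns at once */
        return 0;
    }
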
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..b2a5a3e4a671 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -347,20 +345,20 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
347 345
348out: 346out:
349 mutex_unlock(&monc->mutex); 347 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq); 348 wake_up_all(&client->auth_wq);
351} 349}
352 350
353/* 351/*
354 * statfs 352 * generic requests (e.g., statfs, poolop)
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,290 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
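
The request lifetime above is reference-counted: both the submitter and the reply path hold a ref, and release_generic_request() drops the embedded request/reply messages when the last ref goes. A non-atomic user-space sketch of that kref discipline (the kernel's kref in <linux/kref.h> is atomic and uses container_of()):

    #include <stdio.h>
    #include <stdlib.h>

    struct kref {
        int refcount;
    };

    static void kref_init(struct kref *k) { k->refcount = 1; }
    static void kref_get(struct kref *k)  { k->refcount++; }
    static void kref_put(struct kref *k, void (*release)(struct kref *))
    {
        if (--k->refcount == 0)
            release(k);
    }

    struct generic_request {
        struct kref kref;   /* first member, so the cast below works */
        /* request/reply messages would hang off here */
    };

    static void release_request(struct kref *k)
    {
        printf("last ref dropped, freeing\n");
        free((struct generic_request *)k);
    }

    int main(void)
    {
        struct generic_request *req = calloc(1, sizeof(*req));

        kref_init(&req->kref);
        kref_get(&req->kref);                    /* reply handler's ref */
        kref_put(&req->kref, release_request);   /* still alive */
        kref_put(&req->kref, release_request);   /* frees here */
        return 0;
    }
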
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
394} 443}
395 444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
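
do_generic_request() is the synchronous core of the refactor: assign a tid under the mutex, register the request in the tid-keyed rbtree, send, then sleep on the completion until the dispatch path fills in the result. A threaded user-space model of that handshake (all names here are stand-ins for the kernel's):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    struct generic_req {
        unsigned long tid;
        int result;
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cond;
    };

    static unsigned long last_tid;

    /* reply path: fill in the result and wake the waiter */
    static void *reply_thread(void *arg)
    {
        struct generic_req *req = arg;

        sleep(1);                       /* pretend the monitor replied */
        pthread_mutex_lock(&req->lock);
        req->result = 0;
        req->done = 1;
        pthread_cond_broadcast(&req->cond);
        pthread_mutex_unlock(&req->lock);
        return NULL;
    }

    static int do_request(struct generic_req *req)
    {
        req->tid = ++last_tid;  /* the kernel does this under monc->mutex,
                                 * then inserts req into the rbtree */
        pthread_mutex_lock(&req->lock);
        while (!req->done)
            pthread_cond_wait(&req->cond, &req->lock);
        pthread_mutex_unlock(&req->lock);
        return req->result;     /* rbtree erase would happen here */
    }

    int main(void)
    {
        struct generic_req req = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, reply_thread, &req);
        printf("result %d\n", do_request(&req));
        pthread_join(t, NULL);
        return 0;
    }
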
471/*
472 * statfs
473 */
396static void handle_statfs_reply(struct ceph_mon_client *monc, 474static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 475 struct ceph_msg *msg)
398{ 476{
399 struct ceph_mon_statfs_request *req; 477 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 478 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 479 u64 tid = le64_to_cpu(msg->hdr.tid);
402 480
403 if (msg->front.iov_len != sizeof(*reply)) 481 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 482 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 483 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 484
408 mutex_lock(&monc->mutex); 485 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 486 req = __lookup_generic_req(monc, tid);
410 if (req) { 487 if (req) {
411 *req->buf = reply->st; 488 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 489 req->result = 0;
490 get_generic_request(req);
413 } 491 }
414 mutex_unlock(&monc->mutex); 492 mutex_unlock(&monc->mutex);
415 if (req) 493 if (req) {
416 complete(&req->completion); 494 complete_all(&req->completion);
495 put_generic_request(req);
496 }
417 return; 497 return;
418 498
419bad: 499bad:
420 pr_err("corrupt statfs reply, no tid\n"); 500 pr_err("corrupt generic reply, tid %llu\n", tid);
421 ceph_msg_dump(msg); 501 ceph_msg_dump(msg);
422} 502}
423 503
424/* 504/*
425 * (re)send a statfs request 505 * Do a synchronous statfs().
426 */ 506 */
427static int send_statfs(struct ceph_mon_client *monc, 507int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 508{
430 struct ceph_msg *msg; 509 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 510 struct ceph_mon_statfs *h;
511 int err;
512
513 req = kzalloc(sizeof(*req), GFP_NOFS);
514 if (!req)
515 return -ENOMEM;
516
517 kref_init(&req->kref);
518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
520 init_completion(&req->completion);
521
522 err = -ENOMEM;
523 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
524 if (!req->request)
525 goto out;
526 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
527 if (!req->reply)
528 goto out;
432 529
433 dout("send_statfs tid %llu\n", req->tid); 530 /* fill out request */
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 531 h = req->request->front.iov_base;
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0; 532 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 533 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 534 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 535 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg); 536
445 return 0; 537 err = do_generic_request(monc, req);
538
539out:
540 kref_put(&req->kref, release_generic_request);
541 return err;
446} 542}
447 543
448/* 544/*
449 * Do a synchronous statfs(). 545 * pool ops
450 */ 546 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
452{ 549{
453 struct ceph_mon_statfs_request req; 550 u32 buf_len;
454 int err;
455 551
456 req.buf = buf; 552 if (src_len != sizeof(u32) + dst_len)
457 init_completion(&req.completion); 553 return -EINVAL;
458 554
459 /* allocate memory for reply */ 555 buf_len = le32_to_cpu(*(u32 *)src);
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); 556 if (buf_len != dst_len)
461 if (err) 557 return -EINVAL;
462 return err;
463 558
464 /* register request */ 559 memcpy(dst, src + sizeof(u32), dst_len);
465 mutex_lock(&monc->mutex); 560 return 0;
466 req.tid = ++monc->last_tid; 561}
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472 562
473 /* send request and wait */ 563static void handle_poolop_reply(struct ceph_mon_client *monc,
474 err = send_statfs(monc, &req); 564 struct ceph_msg *msg)
475 if (!err) 565{
476 err = wait_for_completion_interruptible(&req.completion); 566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
477 573
478 mutex_lock(&monc->mutex); 574 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 575 req = __lookup_generic_req(monc, tid);
480 monc->num_statfs_requests--; 576 if (req) {
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); 577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
482 mutex_unlock(&monc->mutex); 587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
483 593
484 if (!err) 594bad:
485 err = req.result; 595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
641
642out:
643 kref_put(&req->kref, release_generic_request);
486 return err; 644 return err;
487} 645}
488 646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
 658        return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
489/* 663/*
490 * Resend pending statfs requests. 664 * Resend pending generic requests.
491 */ 665 */
492static void __resend_statfs(struct ceph_mon_client *monc) 666static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 667{
494 struct ceph_mon_statfs_request *req; 668 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 669 struct rb_node *p;
496 670
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 671 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 672 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 673 ceph_con_revoke(monc->con, req->request);
674 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 675 }
501} 676}
502 677
@@ -586,26 +761,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 761 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 762 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 763
589 /* msg pools */ 764 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 765 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 766 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 767 sizeof(struct ceph_mon_subscribe_ack),
768 GFP_NOFS);
769 if (!monc->m_subscribe_ack)
593 goto out_monmap; 770 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 771
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 772 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 773 if (!monc->m_subscribe)
597 goto out_pool1; 774 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 775
599 if (err < 0) 776 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 777 if (!monc->m_auth_reply)
601 778 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 779
780 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 781 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 782 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 783 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 784
610 monc->cur_mon = -1; 785 monc->cur_mon = -1;
611 monc->hunting = true; 786 monc->hunting = true;
@@ -613,8 +788,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 788 monc->sub_sent = 0;
614 789
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 790 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 791 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 792 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 793 monc->last_tid = 0;
619 794
620 monc->have_mdsmap = 0; 795 monc->have_mdsmap = 0;
@@ -622,12 +797,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 797 monc->want_next_osdmap = 1;
623 return 0; 798 return 0;
624 799
625out_pool3: 800out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 801 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 802out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 803 ceph_msg_put(monc->m_subscribe);
629out_pool1: 804out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 805 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 806out_monmap:
632 kfree(monc->monmap); 807 kfree(monc->monmap);
633out: 808out:
@@ -651,9 +826,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 826 ceph_auth_destroy(monc->auth);
652 827
653 ceph_msg_put(monc->m_auth); 828 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 829 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 830 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 831 ceph_msg_put(monc->m_subscribe_ack);
657 832
658 kfree(monc->monmap); 833 kfree(monc->monmap);
659} 834}
@@ -662,8 +837,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg) 837 struct ceph_msg *msg)
663{ 838{
664 int ret; 839 int ret;
840 int was_auth = 0;
665 841
666 mutex_lock(&monc->mutex); 842 mutex_lock(&monc->mutex);
843 if (monc->auth->ops)
844 was_auth = monc->auth->ops->is_authenticated(monc->auth);
667 monc->pending_auth = 0; 845 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 846 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len, 847 msg->front.iov_len,
@@ -671,17 +849,18 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
671 monc->m_auth->front_max); 849 monc->m_auth->front_max);
672 if (ret < 0) { 850 if (ret < 0) {
673 monc->client->auth_err = ret; 851 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq); 852 wake_up_all(&monc->client->auth_wq);
675 } else if (ret > 0) { 853 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret); 854 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 855 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n"); 856 dout("authenticated, starting session\n");
679 857
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 858 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 859 monc->client->msgr->inst.name.num =
860 cpu_to_le64(monc->auth->global_id);
682 861
683 __send_subscribe(monc); 862 __send_subscribe(monc);
684 __resend_statfs(monc); 863 __resend_generic_request(monc);
685 } 864 }
686 mutex_unlock(&monc->mutex); 865 mutex_unlock(&monc->mutex);
687} 866}
@@ -735,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
735 handle_statfs_reply(monc, msg); 914 handle_statfs_reply(monc, msg);
736 break; 915 break;
737 916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
738 case CEPH_MSG_MON_MAP: 921 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg); 922 ceph_monc_handle_map(monc, msg);
740 break; 923 break;
@@ -770,18 +953,18 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 953
771 switch (type) { 954 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 956 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 957 break;
958 case CEPH_MSG_POOLOP_REPLY:
775 case CEPH_MSG_STATFS_REPLY: 959 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 960 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 961 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 962 m = ceph_msg_get(monc->m_auth_reply);
780 break; 963 break;
781 case CEPH_MSG_MON_MAP: 964 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 965 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 966 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 967 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 968 break;
786 } 969 }
787 970
@@ -826,7 +1009,7 @@ out:
826 mutex_unlock(&monc->mutex); 1009 mutex_unlock(&monc->mutex);
827} 1010}
828 1011
829const static struct ceph_connection_operations mon_con_ops = { 1012static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 1013 .get = ceph_con_get,
831 .put = ceph_con_put, 1014 .put = ceph_con_put,
832 .dispatch = dispatch, 1015 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..8e396f2c0963 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,20 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
 44 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
53 int buf_len;
51 struct completion completion; 54 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 55 struct ceph_msg *request; /* original request */
56 struct ceph_msg *reply; /* and reply */
54}; 57};
55 58
56struct ceph_mon_client { 59struct ceph_mon_client {
@@ -61,7 +64,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 64 struct delayed_work delayed_work;
62 65
63 struct ceph_auth_client *auth; 66 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 67 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 68 int pending_auth;
66 69
67 bool hunting; 70 bool hunting;
@@ -70,14 +73,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 73 struct ceph_connection *con;
71 bool have_fsid; 74 bool have_fsid;
72 75
73 /* msg pools */ 76 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 77 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 78 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 79 u64 last_tid;
82 80
83 /* mds/osd map */ 81 /* mds/osd map */
@@ -114,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114 112
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116 114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
117 117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
118 120
119#endif 121#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
13 * conditions at unexpected times. We take use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
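
The rewritten msgpool delegates all reserve management to a mempool_t: alloc_fn/free_fn just wrap ceph_msg_new()/ceph_msg_put(), and mempool_alloc() dips into the preallocated reserve only when a fresh allocation fails. A compact user-space model of that fallback behaviour (simplified; the kernel's mempool also refills its reserve and can sleep):

    #include <stdio.h>
    #include <stdlib.h>

    struct mempool {
        void **reserve;
        int nr, curr;
        size_t size;
    };

    static struct mempool *pool_create(int nr, size_t size)
    {
        struct mempool *p = malloc(sizeof(*p));

        if (!p)
            return NULL;
        p->reserve = calloc(nr, sizeof(void *));
        p->nr = p->curr = nr;
        p->size = size;
        for (int i = 0; i < nr; i++)
            p->reserve[i] = malloc(size);   /* the guaranteed minimum */
        return p;
    }

    static void *pool_alloc(struct mempool *p)
    {
        void *e = malloc(p->size);      /* first try a fresh allocation */

        if (e)
            return e;
        return p->curr ? p->reserve[--p->curr] : NULL;
    }

    static void pool_free(struct mempool *p, void *e)
    {
        if (p->curr < p->nr)
            p->reserve[p->curr++] = e;  /* top up the reserve first */
        else
            free(e);
    }

    int main(void)
    {
        struct mempool *p = pool_create(2, 64);
        void *m = pool_alloc(p);

        pool_free(p, m);
        printf("reserve %d/%d\n", p->curr, p->nr);
        return 0;
    }
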
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -1,5 +1,5 @@
1#ifndef __MSGR_H 1#ifndef CEPH_MSGR_H
2#define __MSGR_H 2#define CEPH_MSGR_H
3 3
4/* 4/*
5 * Data types for message passing layer used by Ceph. 5 * Data types for message passing layer used by Ceph.
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
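
The renamed ceph_msg_header_old keeps the previous wire layout while the new ceph_msg_header shrinks src from a full entity_inst to a bare ceph_entity_name. A standalone check of the new packed size, assuming ceph_entity_name is a packed { __u8 type; __le64 num; } as elsewhere in these headers:

    #include <stdint.h>
    #include <stdio.h>

    struct entity_name {
        uint8_t type;
        uint64_t num;
    } __attribute__ ((packed));         /* 9 bytes on the wire */

    struct msg_header {
        uint64_t seq, tid;
        uint16_t type, priority, version;
        uint32_t front_len, middle_len, data_len;
        uint16_t data_off;
        struct entity_name src;
        uint32_t reserved, crc;
    } __attribute__ ((packed));

    int main(void)
    {
        /* 8+8 + 2+2+2 + 4+4+4 + 2 + 9 + 4+4 = 53 bytes */
        printf("%zu\n", sizeof(struct msg_header));
        return 0;
    }
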
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..3b5571b8ce22 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
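The allocation paths above move from ERR_PTR()-style returns to plain NULL: the only failure mode left is out-of-memory, so callers (see the readpages/writepages hunks below) just test !req and pick -ENOMEM themselves. A userspace sketch contrasting the two conventions, with the kernel's ERR_PTR machinery re-implemented for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* minimal imitation of the kernel's ERR_PTR()/IS_ERR() helpers: an
 * errno is smuggled into the top 4095 values of the address space */
#define MAX_ERRNO 4095
static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct request { int id; };

/* new-style: NULL is the only error, so no IS_ERR() dance is needed */
static struct request *new_request(void)
{
	return calloc(1, sizeof(struct request));
}

int main(void)
{
	struct request *req = new_request();

	if (!req)
		return -ENOMEM;  /* caller chooses the errno, as in the hunk */
	printf("allocated request %p\n", (void *)req);
	free(req);
	return 0;
}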
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
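put_osd() now tears down the authorizer before freeing the osd: whichever caller drops the reference count to zero owns the cleanup of everything the object still points at. A userspace sketch of the same drop-to-zero idiom using C11 atomics (the types and the destroy callback are stand-ins for the ceph structures):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct osd {
	atomic_int o_ref;
	void *o_authorizer;   /* dependent resource, freed before the osd */
};

static void destroy_authorizer(void *a)
{
	free(a);
}

static void put_osd(struct osd *osd)
{
	/* fetch_sub returns the old value: 1 means we took it to zero */
	if (atomic_fetch_sub(&osd->o_ref, 1) == 1) {
		if (osd->o_authorizer)
			destroy_authorizer(osd->o_authorizer);
		free(osd);
	}
}

int main(void)
{
	struct osd *osd = calloc(1, sizeof(*osd));

	if (!osd)
		return 1;
	atomic_init(&osd->o_ref, 1);
	osd->o_authorizer = malloc(16);
	put_osd(osd);   /* last ref: authorizer freed, then the osd */
	printf("done\n");
	return 0;
}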
368/* 373/*
@@ -544,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
544 */ 549 */
545static void __cancel_request(struct ceph_osd_request *req) 550static void __cancel_request(struct ceph_osd_request *req)
546{ 551{
547 if (req->r_sent) { 552 if (req->r_sent && req->r_osd) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request); 553 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0; 554 req->r_sent = 0;
550 } 555 }
@@ -656,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
656 reqhead->reassert_version = req->r_reassert_version; 661 reqhead->reassert_version = req->r_reassert_version;
657 662
658 req->r_stamp = jiffies; 663 req->r_stamp = jiffies;
659 list_move_tail(&osdc->req_lru, &req->r_req_lru_item); 664 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
660 665
661 ceph_msg_get(req->r_request); /* send consumes a ref */ 666 ceph_msg_get(req->r_request); /* send consumes a ref */
662 ceph_con_send(&req->r_osd->o_con, req->r_request); 667 ceph_con_send(&req->r_osd->o_con, req->r_request);
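The one-line fix above swaps two arguments that were in the wrong order: list_move_tail() takes the entry being moved first and the destination list head second, so the old call spliced the LRU head onto the request instead of putting the request at the LRU tail. A freestanding sketch of the primitives involved (the real ones live in <linux/list.h>):

struct list_head {
	struct list_head *next, *prev;
};

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

/* entry first, destination head second: the order the fix restores */
static void list_move_tail(struct list_head *entry, struct list_head *head)
{
	list_del(entry);
	list_add_tail(entry, head);
}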
@@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 721 * it from an updated osd map.
717 */ 722 */
718 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 725 r_req_lru_item);
721 726
@@ -857,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
857 if (req->r_callback) 862 if (req->r_callback)
858 req->r_callback(req, msg); 863 req->r_callback(req, msg);
859 else 864 else
860 complete(&req->r_completion); 865 complete_all(&req->r_completion);
861 866
862 if (flags & CEPH_OSD_FLAG_ONDISK) { 867 if (flags & CEPH_OSD_FLAG_ONDISK) {
863 if (req->r_safe_callback) 868 if (req->r_safe_callback)
864 req->r_safe_callback(req, msg); 869 req->r_safe_callback(req, msg);
865 complete(&req->r_safe_completion); /* fsync waiter */ 870 complete_all(&req->r_safe_completion); /* fsync waiter */
866 } 871 }
867 872
868done: 873done:
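Switching complete() to complete_all() matters because a struct completion woken with complete() releases exactly one waiter, while several threads may be parked on r_completion or r_safe_completion for the same request. A userspace analogue using a condition variable, where the distinction is signal versus broadcast:

#include <pthread.h>
#include <stdbool.h>

/* rough analogue of struct completion; complete() corresponds to
 * pthread_cond_signal() (one waiter), complete_all() to
 * pthread_cond_broadcast() (every waiter) */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void complete_all(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_broadcast(&c->cond);  /* wake *all* fsync waiters */
	pthread_mutex_unlock(&c->lock);
}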
@@ -1078,6 +1083,7 @@ done:
1078 if (newmap) 1083 if (newmap)
1079 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up_all(&osdc->client->auth_wq);
1081 return; 1087 return;
1082 1088
1083bad: 1089bad:
@@ -1087,45 +1093,6 @@ bad:
1087 return; 1093 return;
1088} 1094}
1089 1095
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1096/*
1130 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1131 */ 1098 */
@@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1253 goto out; 1220 goto out;
1254 1221
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1256 if (err < 0) 1224 if (err < 0)
1257 goto out_mempool; 1225 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1260 if (err < 0) 1229 if (err < 0)
1261 goto out_msgpool; 1230 goto out_msgpool;
1262 return 0; 1231 return 0;
@@ -1302,13 +1271,11 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1273 false, 1);
1305 if (IS_ERR(req)) 1274 if (!req)
1306 return PTR_ERR(req); 1275 return -ENOMEM;
1307 1276
1308 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1278 req->r_pages = pages;
1310 num_pages = calc_pages_for(off, *plen);
1311 req->r_num_pages = num_pages;
1312 1279
1313 dout("readpages final extent is %llu~%llu (%d pages)\n", 1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1314 off, *plen, req->r_num_pages); 1281 off, *plen, req->r_num_pages);
@@ -1345,12 +1312,11 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1312 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1313 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1314 nofail, 1);
1348 if (IS_ERR(req)) 1315 if (!req)
1349 return PTR_ERR(req); 1316 return -ENOMEM;
1350 1317
1351 /* it may be a short write due to an object boundary */ 1318 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1319 req->r_pages = pages;
1353 req->r_num_pages = calc_pages_for(off, len);
1354 dout("writepages %llu~%llu (%d pages)\n", off, len, 1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1355 req->r_num_pages); 1321 req->r_num_pages);
1356 1322
@@ -1375,7 +1341,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1375 int type = le16_to_cpu(msg->hdr.type); 1341 int type = le16_to_cpu(msg->hdr.type);
1376 1342
1377 if (!osd) 1343 if (!osd)
1378 return; 1344 goto out;
1379 osdc = osd->o_osdc; 1345 osdc = osd->o_osdc;
1380 1346
1381 switch (type) { 1347 switch (type) {
@@ -1390,11 +1356,13 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1390 pr_err("received unknown message type %d %s\n", type, 1356 pr_err("received unknown message type %d %s\n", type,
1391 ceph_msg_type_name(type)); 1357 ceph_msg_type_name(type));
1392 } 1358 }
1359out:
1393 ceph_msg_put(msg); 1360 ceph_msg_put(msg);
1394} 1361}
1395 1362
1396/* 1363/*
1397 * lookup and return message for incoming reply 1364 * lookup and return message for incoming reply. set up reply message
1365 * pages.
1398 */ 1366 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1367static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1368 struct ceph_msg_header *hdr,
@@ -1407,7 +1375,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1375 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1376 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1377 u64 tid;
1410 int err;
1411 1378
1412 tid = le64_to_cpu(hdr->tid); 1379 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1380 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1392,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1392 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1393 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1394 ceph_con_put(req->r_con_filling_msg);
1395 req->r_con_filling_msg = NULL;
1428 } 1396 }
1429 1397
1430 if (front > req->r_reply->front.iov_len) { 1398 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1399 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1400 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1401 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1402 if (!m)
1435 goto out; 1403 goto out;
1436 ceph_msg_put(req->r_reply); 1404 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1405 req->r_reply = m;
@@ -1439,12 +1407,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1407 m = ceph_msg_get(req->r_reply);
1440 1408
1441 if (data_len > 0) { 1409 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1410 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1411 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1412
1413 if (unlikely(req->r_num_pages < want)) {
1414 pr_warning("tid %lld reply %d > expected %d pages\n",
1415 tid, want, m->nr_pages);
1444 *skip = 1; 1416 *skip = 1;
1445 ceph_msg_put(m); 1417 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1418 m = NULL;
1419 goto out;
1447 } 1420 }
1421 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages;
1448 } 1423 }
1449 *skip = 0; 1424 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1425 req->r_con_filling_msg = ceph_con_get(con);
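The inlined check above compares the page count the reply will need, calc_pages_for(data_off & ~PAGE_MASK, data_len), against the pages preallocated on the request. The helper counts how many pages a byte range touches; a userspace re-derivation of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* pages spanned by the byte range [off, off + len) */
static unsigned long calc_pages_for(unsigned long off, unsigned long len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}

int main(void)
{
	/* 100 bytes starting 50 bytes before a page boundary: 2 pages */
	printf("%lu\n", calc_pages_for(PAGE_SIZE - 50, 100));
	/* exactly one page, page-aligned: 1 page */
	printf("%lu\n", calc_pages_for(0, PAGE_SIZE));
	return 0;
}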
@@ -1466,7 +1441,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1441
1467 switch (type) { 1442 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1443 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1444 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1445 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1446 return get_reply(con, hdr, skip);
1472 default: 1447 default:
@@ -1498,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con)
1498 * authentication 1473 * authentication
1499 */ 1474 */
1500static int get_authorizer(struct ceph_connection *con, 1475static int get_authorizer(struct ceph_connection *con,
1501 void **buf, int *len, int *proto, 1476 void **buf, int *len, int *proto,
1502 void **reply_buf, int *reply_len, int force_new) 1477 void **reply_buf, int *reply_len, int force_new)
1503{ 1478{
1504 struct ceph_osd *o = con->private; 1479 struct ceph_osd *o = con->private;
1505 struct ceph_osd_client *osdc = o->o_osdc; 1480 struct ceph_osd_client *osdc = o->o_osdc;
@@ -1519,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con,
1519 &o->o_authorizer_reply_buf, 1494 &o->o_authorizer_reply_buf,
1520 &o->o_authorizer_reply_buf_len); 1495 &o->o_authorizer_reply_buf_len);
1521 if (ret) 1496 if (ret)
1522 return ret; 1497 return ret;
1523 } 1498 }
1524 1499
1525 *proto = ac->protocol; 1500 *proto = ac->protocol;
@@ -1552,7 +1527,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1527 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1528}
1554 1529
1555const static struct ceph_connection_operations osd_con_ops = { 1530static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1531 .get = get_osd_con,
1557 .put = put_osd_con, 1532 .put = put_osd_con,
1558 .dispatch = dispatch, 1533 .dispatch = dispatch,
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..e31f118f1392 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
424 kfree(pi); 424 kfree(pi);
425} 425}
426 426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{ 428{
429 unsigned n, m;
430
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi); 432 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); 433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
433} 451}
434 452
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
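__decode_pool() now takes the buffer end and walks the variable-length snapshot records instead of assuming a fixed stride, with every step guarded by ceph_decode_need(), which fails out with -EINVAL rather than reading past the buffer. The idiom in userspace form (the cursor type and helpers are stand-ins for the kernel's decode.h macros):

#include <stdint.h>
#include <string.h>

struct cursor {
	const uint8_t *p;    /* current position */
	const uint8_t *end;  /* one past the last valid byte */
};

/* ceph_decode_need(): refuse to touch bytes beyond `end` */
static int decode_need(struct cursor *c, size_t n)
{
	return (size_t)(c->end - c->p) >= n ? 0 : -1;
}

static int decode_u32(struct cursor *c, uint32_t *v)
{
	if (decode_need(c, sizeof(*v)))
		return -1;
	memcpy(v, c->p, sizeof(*v));  /* kernel also does le32_to_cpu() */
	c->p += sizeof(*v);
	return 0;
}

/* skip a variable-length blob whose length was just decoded */
static int decode_skip(struct cursor *c, size_t n)
{
	if (decode_need(c, n))
		return -1;
	c->p += n;
	return 0;
}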
@@ -568,9 +586,12 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
568 if (ev > CEPH_PG_POOL_VERSION) { 586 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n", 587 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION); 588 ev, CEPH_PG_POOL_VERSION);
589 kfree(pi);
571 goto bad; 590 goto bad;
572 } 591 }
573 __decode_pool(p, pi); 592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
574 __insert_pg_pool(&map->pg_pools, pi); 595 __insert_pg_pool(&map->pg_pools, pi);
575 } 596 }
576 597
@@ -706,7 +727,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 727 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 728 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 729 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 730 return ERR_CAST(newcrush);
731 *p += len;
710 } 732 }
711 733
712 /* new flags? */ 734 /* new flags? */
@@ -758,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
758 pi->id = pool; 780 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi); 781 __insert_pg_pool(&map->pg_pools, pi);
760 } 782 }
761 __decode_pool(p, pi); 783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
762 } 786 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad; 788 goto bad;
@@ -829,12 +853,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
829 /* remove any? */ 853 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, 854 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) { 855 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp; 856 struct ceph_pg_mapping *cur =
857 rb_entry(rbp, struct ceph_pg_mapping, node);
858
833 rbp = rb_next(rbp); 859 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n", 860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, 861 rb_erase(&cur->node, &map->pg_temp);
836 node)->pgid); 862 kfree(cur);
837 rb_erase(cur, &map->pg_temp);
838 } 863 }
839 864
840 if (pglen) { 865 if (pglen) {
@@ -850,19 +875,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
850 for (j = 0; j < pglen; j++) 875 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p); 876 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp); 877 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err) 878 if (err) {
879 kfree(pg);
854 goto bad; 880 goto bad;
881 }
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, 882 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen); 883 pglen);
857 } 884 }
858 } 885 }
859 while (rbp) { 886 while (rbp) {
860 struct rb_node *cur = rbp; 887 struct ceph_pg_mapping *cur =
888 rb_entry(rbp, struct ceph_pg_mapping, node);
889
861 rbp = rb_next(rbp); 890 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n", 891 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, 892 rb_erase(&cur->node, &map->pg_temp);
864 node)->pgid); 893 kfree(cur);
865 rb_erase(cur, &map->pg_temp);
866 } 894 }
867 895
868 /* ignore the rest */ 896 /* ignore the rest */
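Both loops above are rewritten so `cur` is the containing ceph_pg_mapping rather than the bare rb_node, and they add the kfree() the old loops were missing. rb_entry() is just container_of(); a sketch of the pattern, including the advance-before-erase step that keeps the iterator valid:

#include <stddef.h>
#include <stdlib.h>

struct rb_node { struct rb_node *dummy; };  /* stand-in */

struct pg_mapping {
	unsigned long long pgid;
	struct rb_node node;   /* embedded tree linkage */
};

/* rb_entry(): recover the enclosing struct from its embedded node */
#define rb_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* illustrative erase loop: advance the iterator *before* freeing */
static void erase_all(struct rb_node *rbp,
		      struct rb_node *(*next)(struct rb_node *),
		      void (*erase)(struct rb_node *))
{
	while (rbp) {
		struct pg_mapping *cur =
			rb_entry(rbp, struct pg_mapping, node);

		rbp = next(rbp);        /* step off cur first */
		erase(&cur->node);      /* unlink from the tree */
		free(cur);              /* now the whole mapping can go */
	}
}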
@@ -1020,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size); 1049 pool->v.type, pool->v.size);
1022 if (ruleno < 0) { 1050 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n", 1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size); 1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1025 return NULL; 1054 return NULL;
1026 } 1055 }
1027 1056
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..46a368b6dce5 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -5,10 +5,18 @@
5 5
6#include "pagelist.h" 6#include "pagelist.h"
7 7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
8int ceph_pagelist_release(struct ceph_pagelist *pl) 15int ceph_pagelist_release(struct ceph_pagelist *pl)
9{ 16{
10 if (pl->mapped_tail) 17 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail); 18 ceph_pagelist_unmap_tail(pl);
19
12 while (!list_empty(&pl->head)) { 20 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page, 21 struct page *page = list_first_entry(&pl->head, struct page,
14 lru); 22 lru);
@@ -20,13 +28,13 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 28
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 30{
23 struct page *page = alloc_page(GFP_NOFS); 31 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 32 if (!page)
25 return -ENOMEM; 33 return -ENOMEM;
26 pl->room += PAGE_SIZE; 34 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head); 35 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail) 36 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail); 37 ceph_pagelist_unmap_tail(pl);
30 pl->mapped_tail = kmap(page); 38 pl->mapped_tail = kmap(page);
31 return 0; 39 return 0;
32} 40}
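The pagelist change funnels both call sites through one helper that kunmap()s the current tail page, since only the tail is ever kept mapped. A userspace sketch of the same bookkeeping (heap buffers stand in for pages, a cached pointer stands in for the kmap):

#include <stdlib.h>

#define PAGE_SZ 4096

struct pagelist {
	char *pages[64];
	int npages;
	char *mapped_tail;   /* only the tail page stays "mapped" */
	size_t room;
};

static void pagelist_unmap_tail(struct pagelist *pl)
{
	pl->mapped_tail = NULL;   /* kunmap(tail page) in the kernel */
}

static int pagelist_addpage(struct pagelist *pl)
{
	char *page;

	if (pl->npages >= 64)
		return -1;
	page = calloc(1, PAGE_SZ);
	if (!page)
		return -1;                    /* -ENOMEM */
	pl->room += PAGE_SZ;
	if (pl->mapped_tail)
		pagelist_unmap_tail(pl);      /* old tail no longer written */
	pl->pages[pl->npages++] = page;
	pl->mapped_tail = page;               /* kmap(new page) in the kernel */
	return 0;
}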
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -1,5 +1,5 @@
1#ifndef __RADOS_H 1#ifndef CEPH_RADOS_H
2#define __RADOS_H 2#define CEPH_RADOS_H
3 3
4/* 4/*
5 * Data types for the Ceph distributed object storage layer RADOS 5 * Data types for the Ceph distributed object storage layer RADOS
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -203,11 +203,13 @@ enum {
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204 204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
206 207
207 /** attrs **/ 208 /** attrs **/
208 /* read */ 209 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 213
212 /* write */ 214 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -271,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op)
271 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
272} 274}
273 275
276/*
277 * note that the following tmap stuff is also defined in the ceph librados.h
278 * any modification here needs to be updated there
279 */
274#define CEPH_OSD_TMAP_HDR 'h' 280#define CEPH_OSD_TMAP_HDR 'h'
275#define CEPH_OSD_TMAP_SET 's' 281#define CEPH_OSD_TMAP_SET 's'
276#define CEPH_OSD_TMAP_RM 'r' 282#define CEPH_OSD_TMAP_RM 'r'
@@ -296,6 +302,7 @@ enum {
296 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
297 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
298 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
299}; 306};
300 307
301enum { 308enum {
@@ -305,6 +312,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 313#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 314
315/* xattr comparison */
316enum {
317 CEPH_OSD_CMPXATTR_OP_NOP = 0,
318 CEPH_OSD_CMPXATTR_OP_EQ = 1,
319 CEPH_OSD_CMPXATTR_OP_NE = 2,
320 CEPH_OSD_CMPXATTR_OP_GT = 3,
321 CEPH_OSD_CMPXATTR_OP_GTE = 4,
322 CEPH_OSD_CMPXATTR_OP_LT = 5,
323 CEPH_OSD_CMPXATTR_OP_LTE = 6
324};
325
326enum {
327 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
328 CEPH_OSD_CMPXATTR_MODE_U64 = 2
329};
330
308/* 331/*
309 * an individual object operation. each may be accompanied by some data 332 * an individual object operation. each may be accompanied by some data
310 * payload 333 * payload
@@ -321,6 +344,8 @@ struct ceph_osd_op {
321 struct { 344 struct {
322 __le32 name_len; 345 __le32 name_len;
323 __le32 value_len; 346 __le32 value_len;
347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 349 } __attribute__ ((packed)) xattr;
325 struct { 350 struct {
326 __u8 class_len; 351 __u8 class_len;
@@ -331,6 +356,9 @@ struct ceph_osd_op {
331 struct { 356 struct {
332 __le64 cookie, count; 357 __le64 cookie, count;
333 } __attribute__ ((packed)) pgls; 358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
334 }; 362 };
335 __le32 payload_len; 363 __le32 payload_len;
336} __attribute__ ((packed)); 364} __attribute__ ((packed));
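The new CMPXATTR op extends the xattr member of ceph_osd_op with a comparison operator and a mode (string or u64). How an OSD-side check could apply the operator is sketched below; the evaluation function is hypothetical, only the enum values mirror the ones added above:

/* mirrors CEPH_OSD_CMPXATTR_OP_* above */
enum cmp_op {
	CMP_NOP = 0,
	CMP_EQ  = 1,
	CMP_NE  = 2,
	CMP_GT  = 3,
	CMP_GTE = 4,
	CMP_LT  = 5,
	CMP_LTE = 6,
};

/* hypothetical evaluator: `c` is a strcmp()-style result comparing the
 * stored xattr against the operand sent with the op */
static int cmp_op_holds(enum cmp_op op, int c)
{
	switch (op) {
	case CMP_EQ:  return c == 0;
	case CMP_NE:  return c != 0;
	case CMP_GT:  return c > 0;
	case CMP_GTE: return c >= 0;
	case CMP_LT:  return c < 0;
	case CMP_LTE: return c <= 0;
	case CMP_NOP:
	default:      return 1;
	}
}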
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..190b6c4a6f2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -119,6 +119,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
119 INIT_LIST_HEAD(&realm->children); 119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item); 120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item); 121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->dirty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps); 123 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock); 124 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm); 125 __insert_snap_realm(&mdsc->snap_realms, realm);
@@ -435,7 +436,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{ 436{
436 struct inode *inode = &ci->vfs_inode; 437 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap; 438 struct ceph_cap_snap *capsnap;
438 int used; 439 int used, dirty;
439 440
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); 441 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) { 442 if (!capsnap) {
@@ -445,6 +446,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
445 446
446 spin_lock(&inode->i_lock); 447 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci); 448 used = __ceph_caps_used(ci);
449 dirty = __ceph_caps_dirty(ci);
448 if (__ceph_have_pending_cap_snap(ci)) { 450 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps, 451 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any 452 as no new writes are allowed to start when pending, so any
@@ -452,27 +454,37 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
452 cap_snap. lucky us. */ 454 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode); 455 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap); 456 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
458 (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
459 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc; 460 struct ceph_snap_context *snapc = ci->i_head_snapc;
457 461
462 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
463 capsnap, snapc);
458 igrab(inode); 464 igrab(inode);
459 465
460 atomic_set(&capsnap->nref, 1); 466 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci; 467 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item); 468 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item); 469 INIT_LIST_HEAD(&capsnap->flushing_item);
464 470
465 capsnap->follows = snapc->seq - 1; 471 capsnap->follows = snapc->seq;
466 capsnap->issued = __ceph_caps_issued(ci, NULL); 472 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci); 473 capsnap->dirty = dirty;
468 474
469 capsnap->mode = inode->i_mode; 475 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid; 476 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid; 477 capsnap->gid = inode->i_gid;
472 478
473 /* fixme? */ 479 if (dirty & CEPH_CAP_XATTR_EXCL) {
474 capsnap->xattr_blob = NULL; 480 __ceph_build_xattrs_blob(ci);
475 capsnap->xattr_len = 0; 481 capsnap->xattr_blob =
482 ceph_buffer_get(ci->i_xattrs.blob);
483 capsnap->xattr_version = ci->i_xattrs.version;
484 } else {
485 capsnap->xattr_blob = NULL;
486 capsnap->xattr_version = 0;
487 }
476 488
477 /* dirty page count moved from _head to this cap_snap; 489 /* dirty page count moved from _head to this cap_snap;
478 all subsequent writes page dirties occur _after_ this 490 all subsequent writes page dirties occur _after_ this
@@ -480,7 +492,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 492 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0; 493 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc; 494 capsnap->context = snapc;
483 ci->i_head_snapc = NULL; 495 ci->i_head_snapc =
496 ceph_get_snap_context(ci->i_snap_realm->cached_context);
497 dout(" new snapc is %p\n", ci->i_head_snapc);
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 498 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485 499
486 if (used & CEPH_CAP_FILE_WR) { 500 if (used & CEPH_CAP_FILE_WR) {
@@ -512,7 +526,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 526 struct ceph_cap_snap *capsnap)
513{ 527{
514 struct inode *inode = &ci->vfs_inode; 528 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 530
517 BUG_ON(capsnap->writing); 531 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 532 capsnap->size = inode->i_size;
@@ -539,6 +553,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
539 return 1; /* caller may want to ceph_flush_snaps */ 553 return 1; /* caller may want to ceph_flush_snaps */
540} 554}
541 555
556/*
557 * Queue cap_snaps for snap writeback for this realm and its children.
558 * Called under snap_rwsem, so realm topology won't change.
559 */
560static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
561{
562 struct ceph_inode_info *ci;
563 struct inode *lastinode = NULL;
564 struct ceph_snap_realm *child;
565
566 dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
567
568 spin_lock(&realm->inodes_with_caps_lock);
569 list_for_each_entry(ci, &realm->inodes_with_caps,
570 i_snap_realm_item) {
571 struct inode *inode = igrab(&ci->vfs_inode);
572 if (!inode)
573 continue;
574 spin_unlock(&realm->inodes_with_caps_lock);
575 if (lastinode)
576 iput(lastinode);
577 lastinode = inode;
578 ceph_queue_cap_snap(ci);
579 spin_lock(&realm->inodes_with_caps_lock);
580 }
581 spin_unlock(&realm->inodes_with_caps_lock);
582 if (lastinode)
583 iput(lastinode);
584
585 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
586 list_for_each_entry(child, &realm->children, child_item)
587 queue_realm_cap_snaps(child);
588
589 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
590}
542 591
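queue_realm_cap_snaps() iterates a list protected by a spinlock while each visit may block, so it pins the current inode (igrab), drops the lock for the blocking work, and releases the previous pin only once it is off the lock. A userspace approximation of that shape, with a mutex and bare refcounts standing in for the spinlock and properly synchronized inode references:

#include <pthread.h>

struct node {
	struct node *next;
	int refs;              /* stands in for the inode refcount */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void visit_all(struct node *head, void (*work)(struct node *))
{
	struct node *last = NULL;

	pthread_mutex_lock(&list_lock);
	for (struct node *n = head; n; n = n->next) {
		n->refs++;                         /* igrab(): pin n */
		pthread_mutex_unlock(&list_lock);
		if (last)
			last->refs--;              /* iput() outside the lock */
		last = n;
		work(n);                           /* may block */
		pthread_mutex_lock(&list_lock);    /* relock before stepping */
	}
	pthread_mutex_unlock(&list_lock);
	if (last)
		last->refs--;                      /* drop the final pin */
}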
543/* 592/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies 593 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
@@ -556,6 +605,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
556 struct ceph_snap_realm *realm; 605 struct ceph_snap_realm *realm;
557 int invalidate = 0; 606 int invalidate = 0;
558 int err = -ENOMEM; 607 int err = -ENOMEM;
608 LIST_HEAD(dirty_realms);
559 609
560 dout("update_snap_trace deletion=%d\n", deletion); 610 dout("update_snap_trace deletion=%d\n", deletion);
561more: 611more:
@@ -578,45 +628,6 @@ more:
578 } 628 }
579 } 629 }
580 630
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */ 631 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 632 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0) 633 if (err < 0)
@@ -624,6 +635,8 @@ more:
624 invalidate += err; 635 invalidate += err;
625 636
626 if (le64_to_cpu(ri->seq) > realm->seq) { 637 if (le64_to_cpu(ri->seq) > realm->seq) {
638 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
639 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
627 /* update realm parameters, snap lists */ 640 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq); 641 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created); 642 realm->created = le64_to_cpu(ri->created);
@@ -641,9 +654,17 @@ more:
641 if (err < 0) 654 if (err < 0)
642 goto fail; 655 goto fail;
643 656
657 /* queue realm for cap_snap creation */
658 list_add(&realm->dirty_item, &dirty_realms);
659
644 invalidate = 1; 660 invalidate = 1;
645 } else if (!realm->cached_context) { 661 } else if (!realm->cached_context) {
662 dout("update_snap_trace %llx %p seq %lld new\n",
663 realm->ino, realm, realm->seq);
646 invalidate = 1; 664 invalidate = 1;
665 } else {
666 dout("update_snap_trace %llx %p seq %lld unchanged\n",
667 realm->ino, realm, realm->seq);
647 } 668 }
648 669
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 670 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
@@ -656,6 +677,14 @@ more:
656 if (invalidate) 677 if (invalidate)
657 rebuild_snap_realms(realm); 678 rebuild_snap_realms(realm);
658 679
680 /*
681 * queue cap snaps _after_ we've built the new snap contexts,
682 * so that i_head_snapc can be set appropriately.
683 */
684 list_for_each_entry(realm, &dirty_realms, dirty_item) {
685 queue_realm_cap_snaps(realm);
686 }
687
659 __cleanup_empty_realms(mdsc); 688 __cleanup_empty_realms(mdsc);
660 return 0; 689 return 0;
661 690
@@ -688,7 +717,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
688 igrab(inode); 717 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock); 718 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock); 719 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session); 720 __ceph_flush_snaps(ci, &session, 0);
692 spin_unlock(&inode->i_lock); 721 spin_unlock(&inode->i_lock);
693 iput(inode); 722 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock); 723 spin_lock(&mdsc->snap_flush_lock);
@@ -789,6 +818,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
789 }; 818 };
790 struct inode *inode = ceph_find_inode(sb, vino); 819 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci; 820 struct ceph_inode_info *ci;
821 struct ceph_snap_realm *oldrealm;
792 822
793 if (!inode) 823 if (!inode)
794 continue; 824 continue;
@@ -814,18 +844,19 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
814 dout(" will move %p to split realm %llx %p\n", 844 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm); 845 inode, realm->ino, realm);
816 /* 846 /*
817 * Remove the inode from the realm's inode 847 * Move the inode to the new realm
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */ 848 */
823 spin_lock(&realm->inodes_with_caps_lock); 849 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item); 850 list_del_init(&ci->i_snap_realm_item);
851 list_add(&ci->i_snap_realm_item,
852 &realm->inodes_with_caps);
853 oldrealm = ci->i_snap_realm;
854 ci->i_snap_realm = realm;
825 spin_unlock(&realm->inodes_with_caps_lock); 855 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock); 856 spin_unlock(&inode->i_lock);
827 857
828 ceph_queue_cap_snap(ci); 858 ceph_get_snap_realm(mdsc, realm);
859 ceph_put_snap_realm(mdsc, oldrealm);
829 860
830 iput(inode); 861 iput(inode);
831 continue; 862 continue;
@@ -853,43 +884,9 @@ skip_inode:
853 ceph_update_snap_trace(mdsc, p, e, 884 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY); 885 op == CEPH_SNAP_OP_DESTROY);
855 886
856 if (op == CEPH_SNAP_OP_SPLIT) { 887 if (op == CEPH_SNAP_OP_SPLIT)
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (list_empty(&ci->i_snap_realm_item)) {
873 struct ceph_snap_realm *oldrealm =
874 ci->i_snap_realm;
875
876 dout(" moving %p to split realm %llx %p\n",
877 inode, realm->ino, realm);
878 spin_lock(&realm->inodes_with_caps_lock);
879 list_add(&ci->i_snap_realm_item,
880 &realm->inodes_with_caps);
881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
886 spin_unlock(&inode->i_lock);
887 iput(inode);
888 }
889
890 /* we took a reference when we created the realm, above */ 888 /* we took a reference when we created the realm, above */
891 ceph_put_snap_realm(mdsc, realm); 889 ceph_put_snap_realm(mdsc, realm);
892 }
893 890
894 __cleanup_empty_realms(mdsc); 891 __cleanup_empty_realms(mdsc);
895 892
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..9922628532b2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -2,20 +2,18 @@
2#include "ceph_debug.h" 2#include "ceph_debug.h"
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/inet.h> 7#include <linux/inet.h>
7#include <linux/in6.h> 8#include <linux/in6.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/mount.h> 10#include <linux/mount.h>
10#include <linux/parser.h> 11#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 17
20#include "decode.h" 18#include "decode.h"
21#include "super.h" 19#include "super.h"
@@ -92,7 +90,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
92 90
93 buf->f_files = le64_to_cpu(st.num_objects); 91 buf->f_files = le64_to_cpu(st.num_objects);
94 buf->f_ffree = -1; 92 buf->f_ffree = -1;
95 buf->f_namelen = PATH_MAX; 93 buf->f_namelen = NAME_MAX;
96 buf->f_frsize = PAGE_CACHE_SIZE; 94 buf->f_frsize = PAGE_CACHE_SIZE;
97 95
98 /* leave fsid little-endian, regardless of host endianness */ 96 /* leave fsid little-endian, regardless of host endianness */
@@ -104,15 +102,52 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104} 102}
105 103
106 104
107static int ceph_syncfs(struct super_block *sb, int wait) 105static int ceph_sync_fs(struct super_block *sb, int wait)
108{ 106{
109 dout("sync_fs %d\n", wait); 107 struct ceph_client *client = ceph_sb_to_client(sb);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 108
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 109 if (!wait) {
112 dout("sync_fs %d done\n", wait); 110 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc);
112 dout("sync_fs (non-blocking) done\n");
113 return 0;
114 }
115
116 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
119 dout("sync_fs (blocking) done\n");
113 return 0; 120 return 0;
114} 121}
115 122
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
116 151
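The helper moves up so that ceph_show_options(), just below, can compare the active value against the default. The table in its comment follows directly from the formula: with 4 KiB pages the shift by PAGE_SHIFT-10 is a multiply by 4, so the value is roughly 64*sqrt(pages). A userspace check of the table (link with -lm; int_sqrt() truncates, hence the off-by-one at 128MB):

#include <math.h>
#include <stdio.h>

int main(void)
{
	long mb[] = { 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 };

	for (unsigned i = 0; i < sizeof(mb) / sizeof(mb[0]); i++) {
		long pages = mb[i] * 1024 / 4;           /* totalram_pages, 4K pages */
		long kb = (16 * (long)sqrt(pages)) << 2; /* << (PAGE_SHIFT - 10) */

		if (kb > 256 * 1024)                     /* clamp to 256M */
			kb = 256 * 1024;
		printf("%6ldMB: %ldk\n", mb[i], kb);
	}
	return 0;
}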
117/** 152/**
118 * ceph_show_options - Show mount options in /proc/mounts 153 * ceph_show_options - Show mount options in /proc/mounts
@@ -125,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
125 struct ceph_mount_args *args = client->mount_args; 160 struct ceph_mount_args *args = client->mount_args;
126 161
127 if (args->flags & CEPH_OPT_FSID) 162 if (args->flags & CEPH_OPT_FSID)
128 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", 163 seq_printf(m, ",fsid=%pU", &args->fsid);
129 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
130 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
131 if (args->flags & CEPH_OPT_NOSHARE) 164 if (args->flags & CEPH_OPT_NOSHARE)
132 seq_puts(m, ",noshare"); 165 seq_puts(m, ",noshare");
133 if (args->flags & CEPH_OPT_DIRSTAT) 166 if (args->flags & CEPH_OPT_DIRSTAT)
@@ -138,6 +171,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 171 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 205 if (args->name)
@@ -161,35 +223,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 223 inode_init_once(&ci->vfs_inode);
162} 224}
163 225
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 226static int __init init_caches(void)
194{ 227{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -254,7 +287,7 @@ static const struct super_operations ceph_super_ops = {
254 .alloc_inode = ceph_alloc_inode, 287 .alloc_inode = ceph_alloc_inode,
255 .destroy_inode = ceph_destroy_inode, 288 .destroy_inode = ceph_destroy_inode,
256 .write_inode = ceph_write_inode, 289 .write_inode = ceph_write_inode,
257 .sync_fs = ceph_syncfs, 290 .sync_fs = ceph_sync_fs,
258 .put_super = ceph_put_super, 291 .put_super = ceph_put_super,
259 .show_options = ceph_show_options, 292 .show_options = ceph_show_options,
260 .statfs = ceph_statfs, 293 .statfs = ceph_statfs,
@@ -297,9 +330,6 @@ const char *ceph_msg_type_name(int type)
297 * mount options 330 * mount options
298 */ 331 */
299enum { 332enum {
300 Opt_fsidmajor,
301 Opt_fsidminor,
302 Opt_monport,
303 Opt_wsize, 333 Opt_wsize,
304 Opt_rsize, 334 Opt_rsize,
305 Opt_osdtimeout, 335 Opt_osdtimeout,
@@ -308,10 +338,13 @@ enum {
308 Opt_osd_idle_ttl, 338 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 339 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 340 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 342 Opt_readdir_max_entries,
343 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 344 Opt_congestion_kb,
313 Opt_last_int, 345 Opt_last_int,
314 /* int args above */ 346 /* int args above */
347 Opt_fsid,
315 Opt_snapdirname, 348 Opt_snapdirname,
316 Opt_name, 349 Opt_name,
317 Opt_secret, 350 Opt_secret,
@@ -328,9 +361,6 @@ enum {
328}; 361};
329 362
330static match_table_t arg_tokens = { 363static match_table_t arg_tokens = {
331 {Opt_fsidmajor, "fsidmajor=%ld"},
332 {Opt_fsidminor, "fsidminor=%ld"},
333 {Opt_monport, "monport=%d"},
334 {Opt_wsize, "wsize=%d"}, 364 {Opt_wsize, "wsize=%d"},
335 {Opt_rsize, "rsize=%d"}, 365 {Opt_rsize, "rsize=%d"},
336 {Opt_osdtimeout, "osdtimeout=%d"}, 366 {Opt_osdtimeout, "osdtimeout=%d"},
@@ -339,9 +369,12 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 373 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 375 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 376 /* int args above */
377 {Opt_fsid, "fsid=%s"},
345 {Opt_snapdirname, "snapdirname=%s"}, 378 {Opt_snapdirname, "snapdirname=%s"},
346 {Opt_name, "name=%s"}, 379 {Opt_name, "name=%s"},
347 {Opt_secret, "secret=%s"}, 380 {Opt_secret, "secret=%s"},
@@ -357,6 +390,36 @@ static match_table_t arg_tokens = {
357 {-1, NULL} 390 {-1, NULL}
358}; 391};
359 392
393static int parse_fsid(const char *str, struct ceph_fsid *fsid)
394{
395 int i = 0;
396 char tmp[3];
397 int err = -EINVAL;
398 int d;
399
400 dout("parse_fsid '%s'\n", str);
401 tmp[2] = 0;
402 while (*str && i < 16) {
403 if (ispunct(*str)) {
404 str++;
405 continue;
406 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1]))
408 break;
409 tmp[0] = str[0];
410 tmp[1] = str[1];
411 if (sscanf(tmp, "%x", &d) < 1)
412 break;
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 }
417
418 if (i == 16)
419 err = 0;
420 dout("parse_fsid ret %d got fsid %pU", err, fsid);
421 return err;
422}
360 423
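parse_fsid() accepts the conventional UUID spelling by skipping punctuation and consuming hex-digit pairs until all 16 bytes are filled, failing with -EINVAL otherwise. The same logic exercised in userspace (the fsid string below is an arbitrary example; the kernel prints the parsed result with the %pU format):

#include <ctype.h>
#include <stdio.h>

struct fsid { unsigned char b[16]; };

static int hexval(int c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	return (c | 0x20) - 'a' + 10;   /* 'a'-'f' or 'A'-'F' */
}

static int parse_fsid(const char *str, struct fsid *f)
{
	int i = 0;

	while (*str && i < 16) {
		if (ispunct((unsigned char)*str)) {
			str++;          /* skip the dashes in a UUID */
			continue;
		}
		if (!isxdigit((unsigned char)str[0]) ||
		    !isxdigit((unsigned char)str[1]))
			break;
		f->b[i++] = hexval(str[0]) << 4 | hexval(str[1]);
		str += 2;
	}
	return i == 16 ? 0 : -1;        /* -EINVAL in the kernel */
}

int main(void)
{
	struct fsid f;
	const char *s = "3adf4bc5-6e07-4e4f-8bbd-7b0f1e3a9c11";  /* example */

	if (parse_fsid(s, &f) == 0)
		printf("parsed, first byte 0x%02x\n", f.b[0]);    /* 0x3a */
	return 0;
}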
361static struct ceph_mount_args *parse_mount_args(int flags, char *options, 424static struct ceph_mount_args *parse_mount_args(int flags, char *options,
362 const char *dev_name, 425 const char *dev_name,
@@ -388,8 +451,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 457 args->congestion_kb = default_congestion_kb();
394 458
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -439,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
439 dout("got token %d\n", token); 503 dout("got token %d\n", token);
440 } 504 }
441 switch (token) { 505 switch (token) {
442 case Opt_fsidmajor:
443 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
444 break;
445 case Opt_fsidminor:
446 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
447 break;
448 case Opt_ip: 506 case Opt_ip:
449 err = ceph_parse_ips(argstr[0].from, 507 err = ceph_parse_ips(argstr[0].from,
450 argstr[0].to, 508 argstr[0].to,
@@ -455,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
455 args->flags |= CEPH_OPT_MYIP; 513 args->flags |= CEPH_OPT_MYIP;
456 break; 514 break;
457 515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
458 case Opt_snapdirname: 521 case Opt_snapdirname:
459 kfree(args->snapdir_name); 522 kfree(args->snapdir_name);
460 args->snapdir_name = kstrndup(argstr[0].from, 523 args->snapdir_name = kstrndup(argstr[0].from,
@@ -485,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
485 case Opt_osdkeepalivetimeout: 548 case Opt_osdkeepalivetimeout:
486 args->osd_keepalive_timeout = intval; 549 args->osd_keepalive_timeout = intval;
487 break; 550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
488 case Opt_mount_timeout: 554 case Opt_mount_timeout:
489 args->mount_timeout = intval; 555 args->mount_timeout = intval;
490 break; 556 break;
@@ -497,6 +563,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 563 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 564 args->max_readdir = intval;
499 break; 565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
500 case Opt_congestion_kb: 569 case Opt_congestion_kb:
501 args->congestion_kb = intval; 570 args->congestion_kb = intval;
502 break; 571 break;
@@ -597,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
597 666
598 /* caps */ 667 /* caps */
599 client->min_caps = args->max_readdir; 668 client->min_caps = args->max_readdir;
600 ceph_adjust_min_caps(client->min_caps);
601 669
602 /* subsystems */ 670 /* subsystems */
603 err = ceph_monc_init(&client->monc, client); 671 err = ceph_monc_init(&client->monc, client);
@@ -636,10 +704,16 @@ static void ceph_destroy_client(struct ceph_client *client)
636 704
637 /* unmount */ 705 /* unmount */
638 ceph_mdsc_stop(&client->mdsc); 706 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc); 707 ceph_osdc_stop(&client->osdc);
641 708
642 ceph_adjust_min_caps(-client->min_caps); 709 /*
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
643 717
644 ceph_debugfs_client_cleanup(client); 718 ceph_debugfs_client_cleanup(client);
645 destroy_workqueue(client->wb_wq); 719 destroy_workqueue(client->wb_wq);
@@ -665,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
665{ 739{
666 if (client->have_fsid) { 740 if (client->have_fsid) {
667 if (ceph_fsid_compare(&client->fsid, fsid)) { 741 if (ceph_fsid_compare(&client->fsid, fsid)) {
668 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, 742 pr_err("bad fsid, had %pU got %pU",
669 PR_FSID(&client->fsid), PR_FSID(fsid)); 743 &client->fsid, fsid);
670 return -1; 744 return -1;
671 } 745 }
672 } else { 746 } else {
673 pr_info("client%lld fsid " FSID_FORMAT "\n", 747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
674 client->monc.auth->global_id, PR_FSID(fsid)); 748 fsid);
675 memcpy(&client->fsid, fsid, sizeof(*fsid)); 749 memcpy(&client->fsid, fsid, sizeof(*fsid));
676 ceph_debugfs_client_init(client); 750 ceph_debugfs_client_init(client);
677 client->have_fsid = true; 751 client->have_fsid = true;
@@ -682,9 +756,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 756/*
683 * true if we have the mon map (and have thus joined the cluster) 757 * true if we have the mon map (and have thus joined the cluster)
684 */ 758 */
685static int have_mon_map(struct ceph_client *client) 759static int have_mon_and_osd_map(struct ceph_client *client)
686{ 760{
687 return client->monc.monmap && client->monc.monmap->epoch; 761 return client->monc.monmap && client->monc.monmap->epoch &&
762 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 763}
689 764
690/* 765/*
@@ -704,7 +779,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
704 dout("open_root_inode opening '%s'\n", path); 779 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 780 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req)) 781 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req)); 782 return ERR_CAST(req);
708 req->r_path1 = kstrdup(path, GFP_NOFS); 783 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT; 784 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP; 785 req->r_ino1.snap = CEPH_NOSNAP;
@@ -762,7 +837,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 837 if (err < 0)
763 goto out; 838 goto out;
764 839
765 while (!have_mon_map(client)) { 840 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 841 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 842 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 843 goto out;
@@ -770,8 +845,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 845 /* wait */
771 dout("mount waiting for mon_map\n"); 846 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 847 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 848 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 849 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 850 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 851 goto out;
777 if (client->auth_err < 0) { 852 if (client->auth_err < 0) {
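wait_event_interruptible_timeout() returns the remaining jiffies (at least 1) when the condition became true, 0 on timeout with the condition still false, and -ERESTARTSYS when a signal arrived first; the mount loop above relies on exactly that split. A minimal sketch of the same pattern, where ready() is a hypothetical predicate:

    long left = wait_event_interruptible_timeout(wq,
                    ready(client) || client->auth_err < 0, timeout);
    if (left == 0)
            return -EIO;        /* timed out, condition still false */
    if (left < 0)
            return left;        /* -ERESTARTSYS: interrupted by a signal */
    /* otherwise the condition is true; re-check which half woke us */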
@@ -884,6 +959,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 959/*
885 * construct our own bdi so we can control readahead, etc. 960 * construct our own bdi so we can control readahead, etc.
886 */ 961 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 965{
889 int err; 966 int err;
@@ -893,7 +970,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 970 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 972 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq));
897 if (!err) 975 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 976 sb->s_bdi = &client->backing_dev_info;
899 return err; 977 return err;
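bdi_register_dev() named the bdi after a device number, which an anonymous (nodev) filesystem does not have; bdi_register() with a printf-style template plus a global sequence counter gives each client a unique name instead. A sketch of the counter idiom, assuming a file-scope counter:

    static atomic_long_t seq = ATOMIC_LONG_INIT(0);

    /* atomic_long_inc_return() increments and returns the new value,
     * so concurrent registrations can never observe the same number */
    err = bdi_register(&bdi, NULL, "ceph-%ld", atomic_long_inc_return(&seq));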
@@ -932,9 +1010,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 1010 goto out;
933 } 1011 }
934 1012
935 if (ceph_client(sb) != client) { 1013 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 1014 ceph_destroy_client(client);
937 client = ceph_client(sb); 1015 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 1016 dout("get_sb got existing client %p\n", client);
939 } else { 1017 } else {
940 dout("get_sb using new client %p\n", client); 1018 dout("get_sb using new client %p\n", client);
@@ -952,8 +1030,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 1030
953out_splat: 1031out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 1032 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 1033 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 1034 goto out_final;
958 1035
959out: 1036out:
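deactivate_locked_super() expects the superblock still locked and both drops the active reference and releases s_umount, so the error path no longer needs the separate pair of calls. A sketch of the before/after, assuming sb is held write-locked:

    up_write(&sb->s_umount);        /* old: unlock first ...          */
    deactivate_super(sb);           /* ... then drop the active ref   */

    deactivate_locked_super(sb);    /* new: drop the ref and unlock   */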
@@ -999,8 +1076,6 @@ static int __init init_ceph(void)
999 if (ret) 1076 if (ret)
1000 goto out_msgr; 1077 goto out_msgr;
1001 1078
1002 ceph_caps_init();
1003
1004 ret = register_filesystem(&ceph_fs_type); 1079 ret = register_filesystem(&ceph_fs_type);
1005 if (ret) 1080 if (ret)
1006 goto out_icache; 1081 goto out_icache;
@@ -1025,7 +1100,6 @@ static void __exit exit_ceph(void)
1025{ 1100{
1026 dout("exit_ceph\n"); 1101 dout("exit_ceph\n");
1027 unregister_filesystem(&ceph_fs_type); 1102 unregister_filesystem(&ceph_fs_type);
1028 ceph_caps_finalize();
1029 destroy_caches(); 1103 destroy_caches();
1030 ceph_msgr_exit(); 1104 ceph_msgr_exit();
1031 ceph_debugfs_cleanup(); 1105 ceph_debugfs_cleanup();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..b87638e84c4b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -32,6 +31,12 @@
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33 32
34/* 33/*
34 * Supported features
35 */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38
39/*
35 * mount options 40 * mount options
36 */ 41 */
37#define CEPH_OPT_FSID (1<<0) 42#define CEPH_OPT_FSID (1<<0)
@@ -52,24 +57,25 @@
52 57
53struct ceph_mount_args { 58struct ceph_mount_args {
54 int sb_flags; 59 int sb_flags;
60 int flags;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
55 int num_mon; 63 int num_mon;
56 struct ceph_entity_addr *mon_addr; 64 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 65 int mount_timeout;
59 int osd_idle_ttl; 66 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 67 int osd_timeout;
68 int osd_keepalive_timeout; 68 int osd_keepalive_timeout;
69 int wsize;
70 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */
72 int caps_wanted_delay_min, caps_wanted_delay_max;
73 int cap_release_safety;
 74 int max_readdir; /* max readdir result (entries) */

75 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 76 char *snapdir_name; /* default ".snap" */
70 char *name; 77 char *name;
71 char *secret; 78 char *secret;
72 int cap_release_safety;
73}; 79};
74 80
75/* 81/*
@@ -80,13 +86,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 91
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 94
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 96#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 97/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 98 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 99 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +103,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 107
100/* mount state */ 108/* mount state */
101enum { 109enum {
@@ -160,12 +168,6 @@ struct ceph_client {
160#endif 168#endif
161}; 169};
162 170
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 171/*
170 * File i/o capability. This tracks shared state with the metadata 172 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 173 * server that allows us to cache or writeback attributes or to read
@@ -214,8 +216,7 @@ struct ceph_cap_snap {
214 uid_t uid; 216 uid_t uid;
215 gid_t gid; 217 gid_t gid;
216 218
217 void *xattr_blob; 219 struct ceph_buffer *xattr_blob;
218 int xattr_len;
219 u64 xattr_version; 220 u64 xattr_version;
220 221
221 u64 size; 222 u64 size;
@@ -227,8 +228,11 @@ struct ceph_cap_snap {
227 228
228static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) 229static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
229{ 230{
230 if (atomic_dec_and_test(&capsnap->nref)) 231 if (atomic_dec_and_test(&capsnap->nref)) {
232 if (capsnap->xattr_blob)
233 ceph_buffer_put(capsnap->xattr_blob);
231 kfree(capsnap); 234 kfree(capsnap);
235 }
232} 236}
233 237
234/* 238/*
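The snapshot's xattr blob is now a reference-counted ceph_buffer rather than a raw allocation, so the final put must release it before freeing the capsnap. This is the usual atomic_dec_and_test teardown: only the thread that drops the last reference sees true, and it alone frees the object and anything it still owns. A generic sketch, where put_buf() is a hypothetical helper standing in for ceph_buffer_put():

    if (atomic_dec_and_test(&obj->nref)) {  /* true only for the last put */
            if (obj->blob)
                    put_buf(obj->blob);     /* release owned references first */
            kfree(obj);
    }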
@@ -340,7 +344,8 @@ struct ceph_inode_info {
340 unsigned i_cap_exporting_issued; 344 unsigned i_cap_exporting_issued;
341 struct ceph_cap_reservation i_cap_migration_resv; 345 struct ceph_cap_reservation i_cap_migration_resv;
342 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 346 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
343 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ 347 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
348 dirty|flushing caps */
344 unsigned i_snap_caps; /* cap bits for snapped files */ 349 unsigned i_snap_caps; /* cap bits for snapped files */
345 350
346 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 351 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
@@ -564,11 +569,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
564/* what the mds thinks we want */ 569/* what the mds thinks we want */
565extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); 570extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
566 571
567extern void ceph_caps_init(void); 572extern void ceph_caps_init(struct ceph_mds_client *mdsc);
568extern void ceph_caps_finalize(void); 573extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
569extern void ceph_adjust_min_caps(int delta); 574extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
570extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); 575extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
571extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); 576 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx);
572extern void ceph_reservation_status(struct ceph_client *client, 579extern void ceph_reservation_status(struct ceph_client *client,
573 int *total, int *avail, int *used, 580 int *total, int *avail, int *used,
574 int *reserved, int *min); 581 int *reserved, int *min);
@@ -683,6 +690,8 @@ struct ceph_snap_realm {
683 690
684 struct list_head empty_item; /* if i have ref==0 */ 691 struct list_head empty_item; /* if i have ref==0 */
685 692
693 struct list_head dirty_item; /* if realm needs new context */
694
686 /* the current set of snaps for this realm */ 695 /* the current set of snaps for this realm */
687 struct ceph_snap_context *cached_context; 696 struct ceph_snap_context *cached_context;
688 697
@@ -742,13 +751,6 @@ extern struct kmem_cache *ceph_file_cachep;
742extern const char *ceph_msg_type_name(int type); 751extern const char *ceph_msg_type_name(int type);
743extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
744 753
745#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
746 "%02x%02x%02x%02x%02x%02x"
747#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
748 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
749 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
750 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
751
752/* inode.c */ 754/* inode.c */
753extern const struct inode_operations ceph_file_iops; 755extern const struct inode_operations ceph_file_iops;
754 756
@@ -810,20 +812,24 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
810 __ceph_remove_cap(cap); 812 __ceph_remove_cap(cap);
811 spin_unlock(&inode->i_lock); 813 spin_unlock(&inode->i_lock);
812} 814}
813extern void ceph_put_cap(struct ceph_cap *cap); 815extern void ceph_put_cap(struct ceph_mds_client *mdsc,
816 struct ceph_cap *cap);
814 817
815extern void ceph_queue_caps_release(struct inode *inode); 818extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 819extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 820extern int ceph_fsync(struct file *file, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 821extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session); 822 struct ceph_mds_session *session);
823extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
824 int mds);
820extern int ceph_get_cap_mds(struct inode *inode); 825extern int ceph_get_cap_mds(struct inode *inode);
821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 826extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 827extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
823extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 828extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
824 struct ceph_snap_context *snapc); 829 struct ceph_snap_context *snapc);
825extern void __ceph_flush_snaps(struct ceph_inode_info *ci, 830extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
826 struct ceph_mds_session **psession); 831 struct ceph_mds_session **psession,
832 int again);
827extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 833extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
828 struct ceph_mds_session *session); 834 struct ceph_mds_session *session);
829extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); 835extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -861,7 +867,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
861/* dir.c */ 867/* dir.c */
862extern const struct file_operations ceph_dir_fops; 868extern const struct file_operations ceph_dir_fops;
863extern const struct inode_operations ceph_dir_iops; 869extern const struct inode_operations ceph_dir_iops;
864extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 870extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
865 ceph_snapdir_dentry_ops; 871 ceph_snapdir_dentry_ops;
866 872
867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 873extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -871,6 +877,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 877extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 878extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 879extern void ceph_dentry_lru_del(struct dentry *dn);
880extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 881
875/* 882/*
876 * our d_ops vary depending on whether the inode is live, 883 * our d_ops vary depending on whether the inode is live,
@@ -891,6 +898,14 @@ extern void ceph_debugfs_cleanup(void);
891extern int ceph_debugfs_client_init(struct ceph_client *client); 898extern int ceph_debugfs_client_init(struct ceph_client *client);
892extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
893 900
901/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
904extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
905extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
906 int p_locks, int f_locks);
907extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
908
894static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) 909static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
895{ 910{
896 if (dentry && dentry->d_parent) 911 if (dentry && dentry->d_parent)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
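With the rename, the ceph virtual xattrs live under a filesystem-specific "ceph." prefix instead of squatting in the "user." namespace, and ceph_is_valid_xattr() explicitly accepts that prefix. From userspace they read like any other attribute; a small sketch, with the mount point and directory purely illustrative:

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char buf[64];
            ssize_t n = getxattr("/mnt/ceph/dir", "ceph.dir.entries",
                                 buf, sizeof(buf) - 1);
            if (n >= 0) {
                    buf[n] = '\0';
                    printf("entries: %s\n", buf);
            }
            return 0;
    }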
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -342,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
342} 337}
343 338
344static int __build_xattrs(struct inode *inode) 339static int __build_xattrs(struct inode *inode)
340 __releases(inode->i_lock)
341 __acquires(inode->i_lock)
345{ 342{
346 u32 namelen; 343 u32 namelen;
347 u32 numattr = 0; 344 u32 numattr = 0;
@@ -488,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; 485 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL; 486 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false; 487 ci->i_xattrs.dirty = false;
488 ci->i_xattrs.version++;
491 } 489 }
492} 490}
493 491
@@ -574,7 +572,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 572 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 573
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 574 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 575 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 576 goto list_xattr;
579 } else { 577 } else {
580 spin_unlock(&inode->i_lock); 578 spin_unlock(&inode->i_lock);
@@ -622,7 +620,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 620static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 621 const char *value, size_t size, int flags)
624{ 622{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 624 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 625 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 626 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +639,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 639 return -ENOMEM;
642 err = -ENOMEM; 640 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 641 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 642 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 643 if (!pages[i]) {
646 nr_pages = i; 644 nr_pages = i;
647 goto out; 645 goto out;
@@ -779,7 +777,7 @@ out:
779 777
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 778static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 779{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 781 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 782 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 783 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..143d393881cb 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/mutex.h> 21#include <linux/mutex.h>
22#include <linux/backing-dev.h> 22#include <linux/backing-dev.h>
23#include <linux/tty.h>
23 24
24#include "internal.h" 25#include "internal.h"
25 26
@@ -39,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = {
39#endif 40#endif
40 /* permit direct mmap, for read, write or exec */ 41 /* permit direct mmap, for read, write or exec */
41 BDI_CAP_MAP_DIRECT | 42 BDI_CAP_MAP_DIRECT |
42 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), 43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
43}; 46};
44 47
45static struct kobj_map *cdev_map; 48static struct kobj_map *cdev_map;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 80f352596807..917b7d449bb2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,6 @@ config CIFS
2 tristate "CIFS support (advanced network filesystem, SMBFS successor)" 2 tristate "CIFS support (advanced network filesystem, SMBFS successor)"
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select SLOW_WORK
6 help 5 help
7 This is the client VFS module for the Common Internet File System 6 This is the client VFS module for the Common Internet File System
8 (CIFS) protocol which is the successor to the Server Message Block 7 (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH
71 If unsure, say N. 70 If unsure, say N.
72 71
73config CIFS_UPCALL 72config CIFS_UPCALL
74 bool "Kerberos/SPNEGO advanced session setup" 73 bool "Kerberos/SPNEGO advanced session setup"
75 depends on CIFS && KEYS 74 depends on CIFS && KEYS
76 help 75 select DNS_RESOLVER
77 Enables an upcall mechanism for CIFS which accesses 76 help
78 userspace helper utilities to provide SPNEGO packaged (RFC 4178) 77 Enables an upcall mechanism for CIFS which accesses userspace helper
79 Kerberos tickets which are needed to mount to certain secure servers 78 utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
80 (for which more secure Kerberos authentication is required). If 79 which are needed to mount to certain secure servers (for which more
81 unsure, say N. 80 secure Kerberos authentication is required). If unsure, say N.
82 81
83config CIFS_XATTR 82config CIFS_XATTR
84 bool "CIFS extended attributes" 83 bool "CIFS extended attributes"
@@ -122,6 +121,7 @@ config CIFS_DEBUG2
122config CIFS_DFS_UPCALL 121config CIFS_DFS_UPCALL
123 bool "DFS feature support" 122 bool "DFS feature support"
124 depends on CIFS && KEYS 123 depends on CIFS && KEYS
124 select DNS_RESOLVER
125 help 125 help
126 Distributed File System (DFS) support is used to access shares 126 Distributed File System (DFS) support is used to access shares
127 transparently in an enterprise name space, even if the share 127 transparently in an enterprise name space, even if the share
@@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL
131 IP addresses) which is needed for implicit mounts of DFS junction 131 IP addresses) which is needed for implicit mounts of DFS junction
132 points. If unsure, say N. 132 points. If unsure, say N.
133 133
134config CIFS_FSCACHE
135 bool "Provide CIFS client caching support (EXPERIMENTAL)"
136 depends on EXPERIMENTAL
137 depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
138 help
139 Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
140 to be cached locally on disk through the general filesystem cache
141 manager. If unsure, say N.
142
134config CIFS_EXPERIMENTAL 143config CIFS_EXPERIMENTAL
135 bool "CIFS Experimental Features (EXPERIMENTAL)" 144 bool "CIFS Experimental Features (EXPERIMENTAL)"
136 depends on CIFS && EXPERIMENTAL 145 depends on CIFS && EXPERIMENTAL
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 9948c0030e86..adefa60a9bdc 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
12 12
13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o 13cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o
14
15cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075f..7099a526f775 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
301 gid Set the default gid for inodes (similar to above). 301 gid Set the default gid for inodes (similar to above).
302 file_mode If CIFS Unix extensions are not supported by the server 302 file_mode If CIFS Unix extensions are not supported by the server
303 this overrides the default mode for file inodes. 303 this overrides the default mode for file inodes.
304 fsc Enable local disk caching using FS-Cache (off by default). This
305 option can improve performance on a slow link, a heavily
306 loaded server, or a congested network, where reading from
307 local disk is faster than reading from the server. It can
308 also improve scalability, since the number of calls to the
309 server is reduced. However, local caching is not suitable
310 for all workloads, e.g. read-once workloads, so consider
311 your workload carefully before using this option. Currently,
312 local disk caching is functional only for CIFS files opened
313 read-only.
304 dir_mode If CIFS Unix extensions are not supported by the server 314 dir_mode If CIFS Unix extensions are not supported by the server
305 this overrides the default mode for directory inodes. 315 this overrides the default mode for directory inodes.
306 port attempt to contact the server on this tcp port, before 316 port attempt to contact the server on this tcp port, before
@@ -568,8 +578,9 @@ module can be displayed via modinfo.
568Misc /proc/fs/cifs Flags and Debug Info 578Misc /proc/fs/cifs Flags and Debug Info
569======================================= 579=======================================
570Informational pseudo-files: 580Informational pseudo-files:
571DebugData Displays information about active CIFS sessions 581DebugData Displays information about active CIFS sessions and
572 and shares, as well as the cifs.ko version. 582 shares, features enabled as well as the cifs.ko
583 version.
573Stats Lists summary resource usage information as well as per 584Stats Lists summary resource usage information as well as per
574 share statistics, if CONFIG_CIFS_STATS is enabled 585
575 in the kernel configuration. 586 in the kernel configuration.
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index a20bea598933..cfd1ce34e0bc 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
492 492
493int 493int
494decode_negTokenInit(unsigned char *security_blob, int length, 494decode_negTokenInit(unsigned char *security_blob, int length,
495 enum securityEnum *secType) 495 struct TCP_Server_Info *server)
496{ 496{
497 struct asn1_ctx ctx; 497 struct asn1_ctx ctx;
498 unsigned char *end; 498 unsigned char *end;
499 unsigned char *sequence_end; 499 unsigned char *sequence_end;
500 unsigned long *oid = NULL; 500 unsigned long *oid = NULL;
501 unsigned int cls, con, tag, oidlen, rc; 501 unsigned int cls, con, tag, oidlen, rc;
502 bool use_ntlmssp = false;
503 bool use_kerberos = false;
504 bool use_kerberosu2u = false;
505 bool use_mskerberos = false;
506 502
507 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ 503 /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
508 504
@@ -510,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
510 506
511 /* GSSAPI header */ 507 /* GSSAPI header */
512 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 508 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
513 cFYI(1, ("Error decoding negTokenInit header")); 509 cFYI(1, "Error decoding negTokenInit header");
514 return 0; 510 return 0;
515 } else if ((cls != ASN1_APL) || (con != ASN1_CON) 511 } else if ((cls != ASN1_APL) || (con != ASN1_CON)
516 || (tag != ASN1_EOC)) { 512 || (tag != ASN1_EOC)) {
517 cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); 513 cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
518 return 0; 514 return 0;
519 } 515 }
520 516
@@ -535,56 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
535 531
536 /* SPNEGO OID not present or garbled -- bail out */ 532 /* SPNEGO OID not present or garbled -- bail out */
537 if (!rc) { 533 if (!rc) {
538 cFYI(1, ("Error decoding negTokenInit header")); 534 cFYI(1, "Error decoding negTokenInit header");
539 return 0; 535 return 0;
540 } 536 }
541 537
542 /* SPNEGO */ 538 /* SPNEGO */
543 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 539 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
544 cFYI(1, ("Error decoding negTokenInit")); 540 cFYI(1, "Error decoding negTokenInit");
545 return 0; 541 return 0;
546 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 542 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
547 || (tag != ASN1_EOC)) { 543 || (tag != ASN1_EOC)) {
548 cFYI(1, 544 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
549 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 545 cls, con, tag, end, *end);
550 cls, con, tag, end, *end));
551 return 0; 546 return 0;
552 } 547 }
553 548
554 /* negTokenInit */ 549 /* negTokenInit */
555 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 550 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
556 cFYI(1, ("Error decoding negTokenInit")); 551 cFYI(1, "Error decoding negTokenInit");
557 return 0; 552 return 0;
558 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 553 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
559 || (tag != ASN1_SEQ)) { 554 || (tag != ASN1_SEQ)) {
560 cFYI(1, 555 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
561 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 556 cls, con, tag, end, *end);
562 cls, con, tag, end, *end));
563 return 0; 557 return 0;
564 } 558 }
565 559
566 /* sequence */ 560 /* sequence */
567 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 561 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
568 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 562 cFYI(1, "Error decoding 2nd part of negTokenInit");
569 return 0; 563 return 0;
570 } else if ((cls != ASN1_CTX) || (con != ASN1_CON) 564 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
571 || (tag != ASN1_EOC)) { 565 || (tag != ASN1_EOC)) {
572 cFYI(1, 566 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
573 ("cls = %d con = %d tag = %d end = %p (%d) exit 0", 567 cls, con, tag, end, *end);
574 cls, con, tag, end, *end));
575 return 0; 568 return 0;
576 } 569 }
577 570
578 /* sequence of */ 571 /* sequence of */
579 if (asn1_header_decode 572 if (asn1_header_decode
580 (&ctx, &sequence_end, &cls, &con, &tag) == 0) { 573 (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
581 cFYI(1, ("Error decoding 2nd part of negTokenInit")); 574 cFYI(1, "Error decoding 2nd part of negTokenInit");
582 return 0; 575 return 0;
583 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 576 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
584 || (tag != ASN1_SEQ)) { 577 || (tag != ASN1_SEQ)) {
585 cFYI(1, 578 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
586 ("cls = %d con = %d tag = %d end = %p (%d) exit 1", 579 cls, con, tag, end, *end);
587 cls, con, tag, end, *end));
588 return 0; 580 return 0;
589 } 581 }
590 582
@@ -592,37 +584,33 @@ decode_negTokenInit(unsigned char *security_blob, int length,
592 while (!asn1_eoc_decode(&ctx, sequence_end)) { 584 while (!asn1_eoc_decode(&ctx, sequence_end)) {
593 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); 585 rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
594 if (!rc) { 586 if (!rc) {
595 cFYI(1, 587 cFYI(1, "Error decoding negTokenInit hdr exit2");
596 ("Error decoding negTokenInit hdr exit2"));
597 return 0; 588 return 0;
598 } 589 }
599 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { 590 if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
600 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { 591 if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
601 592
602 cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " 593 cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
603 "0x%lx 0x%lx", oidlen, *oid, 594 "0x%lx 0x%lx", oidlen, *oid,
604 *(oid + 1), *(oid + 2), *(oid + 3))); 595 *(oid + 1), *(oid + 2), *(oid + 3));
605 596
606 if (compare_oid(oid, oidlen, MSKRB5_OID, 597 if (compare_oid(oid, oidlen, MSKRB5_OID,
607 MSKRB5_OID_LEN) && 598 MSKRB5_OID_LEN))
608 !use_mskerberos) 599 server->sec_mskerberos = true;
609 use_mskerberos = true;
610 else if (compare_oid(oid, oidlen, KRB5U2U_OID, 600 else if (compare_oid(oid, oidlen, KRB5U2U_OID,
611 KRB5U2U_OID_LEN) && 601 KRB5U2U_OID_LEN))
612 !use_kerberosu2u) 602 server->sec_kerberosu2u = true;
613 use_kerberosu2u = true;
614 else if (compare_oid(oid, oidlen, KRB5_OID, 603 else if (compare_oid(oid, oidlen, KRB5_OID,
615 KRB5_OID_LEN) && 604 KRB5_OID_LEN))
616 !use_kerberos) 605 server->sec_kerberos = true;
617 use_kerberos = true;
618 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 606 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
619 NTLMSSP_OID_LEN)) 607 NTLMSSP_OID_LEN))
620 use_ntlmssp = true; 608 server->sec_ntlmssp = true;
621 609
622 kfree(oid); 610 kfree(oid);
623 } 611 }
624 } else { 612 } else {
625 cFYI(1, ("Should be an oid what is going on?")); 613 cFYI(1, "Should be an oid what is going on?");
626 } 614 }
627 } 615 }
628 616
@@ -632,54 +620,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
632 no mechListMic (e.g. NTLMSSP instead of KRB5) */ 620 no mechListMic (e.g. NTLMSSP instead of KRB5) */
633 if (ctx.error == ASN1_ERR_DEC_EMPTY) 621 if (ctx.error == ASN1_ERR_DEC_EMPTY)
634 goto decode_negtoken_exit; 622 goto decode_negtoken_exit;
635 cFYI(1, ("Error decoding last part negTokenInit exit3")); 623 cFYI(1, "Error decoding last part negTokenInit exit3");
636 return 0; 624 return 0;
637 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 625 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
638 /* tag = 3 indicating mechListMIC */ 626 /* tag = 3 indicating mechListMIC */
639 cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", 627 cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
640 cls, con, tag, end, *end)); 628 cls, con, tag, end, *end);
641 return 0; 629 return 0;
642 } 630 }
643 631
644 /* sequence */ 632 /* sequence */
645 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 633 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
646 cFYI(1, ("Error decoding last part negTokenInit exit5")); 634 cFYI(1, "Error decoding last part negTokenInit exit5");
647 return 0; 635 return 0;
648 } else if ((cls != ASN1_UNI) || (con != ASN1_CON) 636 } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
649 || (tag != ASN1_SEQ)) { 637 || (tag != ASN1_SEQ)) {
650 cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", 638 cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
651 cls, con, tag, end, *end)); 639 cls, con, tag, end, *end);
652 } 640 }
653 641
654 /* sequence of */ 642 /* sequence of */
655 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 643 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
656 cFYI(1, ("Error decoding last part negTokenInit exit 7")); 644 cFYI(1, "Error decoding last part negTokenInit exit 7");
657 return 0; 645 return 0;
658 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { 646 } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
659 cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", 647 cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
660 cls, con, tag, end, *end)); 648 cls, con, tag, end, *end);
661 return 0; 649 return 0;
662 } 650 }
663 651
664 /* general string */ 652 /* general string */
665 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { 653 if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
666 cFYI(1, ("Error decoding last part negTokenInit exit9")); 654 cFYI(1, "Error decoding last part negTokenInit exit9");
667 return 0; 655 return 0;
668 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) 656 } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
669 || (tag != ASN1_GENSTR)) { 657 || (tag != ASN1_GENSTR)) {
670 cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", 658 cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
671 cls, con, tag, end, *end)); 659 cls, con, tag, end, *end);
672 return 0; 660 return 0;
673 } 661 }
674 cFYI(1, ("Need to call asn1_octets_decode() function for %s", 662 cFYI(1, "Need to call asn1_octets_decode() function for %s",
675 ctx.pointer)); /* is this UTF-8 or ASCII? */ 663 ctx.pointer); /* is this UTF-8 or ASCII? */
676decode_negtoken_exit: 664decode_negtoken_exit:
677 if (use_kerberos)
678 *secType = Kerberos;
679 else if (use_mskerberos)
680 *secType = MSKerberos;
681 else if (use_ntlmssp)
682 *secType = RawNTLMSSP;
683
684 return 1; 665 return 1;
685} 666}
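Instead of reducing the advertised mechanisms to a single secType, the decoder now records every OID it sees in per-server flags, leaving the choice of authentication mechanism to the caller. A hedged sketch of what such a later selection step could look like (the exact policy lives elsewhere in the CIFS session-setup code):

    if (server->sec_kerberos || server->sec_mskerberos)
            secType = Kerberos;         /* prefer krb5 when offered */
    else if (server->sec_ntlmssp)
            secType = RawNTLMSSP;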
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
new file mode 100644
index 000000000000..224d7bbd1fcc
--- /dev/null
+++ b/fs/cifs/cache.c
@@ -0,0 +1,331 @@
1/*
2 * fs/cifs/cache.c - CIFS filesystem cache index structure definitions
3 *
4 * Copyright (c) 2010 Novell, Inc.
 5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include "fscache.h"
22#include "cifs_debug.h"
23
24/*
25 * CIFS filesystem definition for FS-Cache
26 */
27struct fscache_netfs cifs_fscache_netfs = {
28 .name = "cifs",
29 .version = 0,
30};
31
32/*
33 * Register CIFS for caching with FS-Cache
34 */
35int cifs_fscache_register(void)
36{
37 return fscache_register_netfs(&cifs_fscache_netfs);
38}
39
40/*
41 * Unregister CIFS for caching
42 */
43void cifs_fscache_unregister(void)
44{
45 fscache_unregister_netfs(&cifs_fscache_netfs);
46}
47
48/*
49 * Key layout of CIFS server cache index object
50 */
51struct cifs_server_key {
52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */
54 union {
55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr;
57 } addr[0];
58};
59
60/*
61 * Server object keyed by {IPaddress,port,family} tuple
62 */
63static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
64 void *buffer, uint16_t maxbuf)
65{
66 const struct TCP_Server_Info *server = cookie_netfs_data;
67 const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr;
68 struct cifs_server_key *key = buffer;
69 uint16_t key_len = sizeof(struct cifs_server_key);
70
71 memset(key, 0, key_len);
72
73 /*
74 * Should not be a problem as sin_family/sin6_family overlays
75 * sa_family field
76 */
77 switch (sa->sa_family) {
78 case AF_INET:
79 key->family = server->addr.sockAddr.sin_family;
80 key->port = server->addr.sockAddr.sin_port;
81 key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr;
82 key_len += sizeof(key->addr[0].ipv4_addr);
83 break;
84
85 case AF_INET6:
86 key->family = server->addr.sockAddr6.sin6_family;
87 key->port = server->addr.sockAddr6.sin6_port;
88 key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr;
89 key_len += sizeof(key->addr[0].ipv6_addr);
90 break;
91
92 default:
93 cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family);
94 key_len = 0;
95 break;
96 }
97
98 return key_len;
99}
100
101/*
102 * Server object for FS-Cache
103 */
104const struct fscache_cookie_def cifs_fscache_server_index_def = {
105 .name = "CIFS.server",
106 .type = FSCACHE_COOKIE_TYPE_INDEX,
107 .get_key = cifs_server_get_key,
108};
109
110/*
111 * Auxiliary data attached to CIFS superblock within the cache
112 */
113struct cifs_fscache_super_auxdata {
114 u64 resource_id; /* unique server resource id */
115};
116
117static char *extract_sharename(const char *treename)
118{
119 const char *src;
120 char *delim, *dst;
121 int len;
122
 123 /* skip the leading double backslash */
124 src = treename + 2;
125
126 /* share name is always preceded by '\\' now */
127 delim = strchr(src, '\\');
128 if (!delim)
129 return ERR_PTR(-EINVAL);
130 delim++;
131 len = strlen(delim);
132
133 /* caller has to free the memory */
134 dst = kstrndup(delim, len, GFP_KERNEL);
135 if (!dst)
136 return ERR_PTR(-ENOMEM);
137
138 return dst;
139}
140
141/*
142 * Superblock object currently keyed by share name
143 */
144static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
145 uint16_t maxbuf)
146{
147 const struct cifsTconInfo *tcon = cookie_netfs_data;
148 char *sharename;
149 uint16_t len;
150
151 sharename = extract_sharename(tcon->treeName);
152 if (IS_ERR(sharename)) {
 153 cFYI(1, "CIFS: couldn't extract sharename");
154 sharename = NULL;
155 return 0;
156 }
157
158 len = strlen(sharename);
159 if (len > maxbuf)
160 return 0;
161
162 memcpy(buffer, sharename, len);
163
164 kfree(sharename);
165
166 return len;
167}
168
169static uint16_t
170cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer,
171 uint16_t maxbuf)
172{
173 struct cifs_fscache_super_auxdata auxdata;
174 const struct cifsTconInfo *tcon = cookie_netfs_data;
175
176 memset(&auxdata, 0, sizeof(auxdata));
177 auxdata.resource_id = tcon->resource_id;
178
179 if (maxbuf > sizeof(auxdata))
180 maxbuf = sizeof(auxdata);
181
182 memcpy(buffer, &auxdata, maxbuf);
183
184 return maxbuf;
185}
186
187static enum
188fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data,
189 const void *data,
190 uint16_t datalen)
191{
192 struct cifs_fscache_super_auxdata auxdata;
193 const struct cifsTconInfo *tcon = cookie_netfs_data;
194
195 if (datalen != sizeof(auxdata))
196 return FSCACHE_CHECKAUX_OBSOLETE;
197
198 memset(&auxdata, 0, sizeof(auxdata));
199 auxdata.resource_id = tcon->resource_id;
200
201 if (memcmp(data, &auxdata, datalen) != 0)
202 return FSCACHE_CHECKAUX_OBSOLETE;
203
204 return FSCACHE_CHECKAUX_OKAY;
205}
206
207/*
208 * Superblock object for FS-Cache
209 */
210const struct fscache_cookie_def cifs_fscache_super_index_def = {
211 .name = "CIFS.super",
212 .type = FSCACHE_COOKIE_TYPE_INDEX,
213 .get_key = cifs_super_get_key,
214 .get_aux = cifs_fscache_super_get_aux,
215 .check_aux = cifs_fscache_super_check_aux,
216};
217
218/*
219 * Auxiliary data attached to CIFS inode within the cache
220 */
221struct cifs_fscache_inode_auxdata {
222 struct timespec last_write_time;
223 struct timespec last_change_time;
224 u64 eof;
225};
226
227static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
228 void *buffer, uint16_t maxbuf)
229{
230 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
231 uint16_t keylen;
232
233 /* use the UniqueId as the key */
234 keylen = sizeof(cifsi->uniqueid);
235 if (keylen > maxbuf)
236 keylen = 0;
237 else
238 memcpy(buffer, &cifsi->uniqueid, keylen);
239
240 return keylen;
241}
242
243static void
244cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size)
245{
246 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
247
248 *size = cifsi->vfs_inode.i_size;
249}
250
251static uint16_t
252cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer,
253 uint16_t maxbuf)
254{
255 struct cifs_fscache_inode_auxdata auxdata;
256 const struct cifsInodeInfo *cifsi = cookie_netfs_data;
257
258 memset(&auxdata, 0, sizeof(auxdata));
259 auxdata.eof = cifsi->server_eof;
260 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
261 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
262
263 if (maxbuf > sizeof(auxdata))
264 maxbuf = sizeof(auxdata);
265
266 memcpy(buffer, &auxdata, maxbuf);
267
268 return maxbuf;
269}
270
271static enum
272fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
273 const void *data,
274 uint16_t datalen)
275{
276 struct cifs_fscache_inode_auxdata auxdata;
277 struct cifsInodeInfo *cifsi = cookie_netfs_data;
278
279 if (datalen != sizeof(auxdata))
280 return FSCACHE_CHECKAUX_OBSOLETE;
281
282 memset(&auxdata, 0, sizeof(auxdata));
283 auxdata.eof = cifsi->server_eof;
284 auxdata.last_write_time = cifsi->vfs_inode.i_mtime;
285 auxdata.last_change_time = cifsi->vfs_inode.i_ctime;
286
287 if (memcmp(data, &auxdata, datalen) != 0)
288 return FSCACHE_CHECKAUX_OBSOLETE;
289
290 return FSCACHE_CHECKAUX_OKAY;
291}
292
293static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
294{
295 struct cifsInodeInfo *cifsi = cookie_netfs_data;
296 struct pagevec pvec;
297 pgoff_t first;
298 int loop, nr_pages;
299
300 pagevec_init(&pvec, 0);
301 first = 0;
302
303 cFYI(1, "cifs inode 0x%p now uncached", cifsi);
304
305 for (;;) {
306 nr_pages = pagevec_lookup(&pvec,
307 cifsi->vfs_inode.i_mapping, first,
308 PAGEVEC_SIZE - pagevec_count(&pvec));
309 if (!nr_pages)
310 break;
311
312 for (loop = 0; loop < nr_pages; loop++)
313 ClearPageFsCache(pvec.pages[loop]);
314
315 first = pvec.pages[nr_pages - 1]->index + 1;
316
317 pvec.nr = nr_pages;
318 pagevec_release(&pvec);
319 cond_resched();
320 }
321}
322
323const struct fscache_cookie_def cifs_fscache_inode_object_def = {
324 .name = "CIFS.uniqueid",
325 .type = FSCACHE_COOKIE_TYPE_DATAFILE,
326 .get_key = cifs_fscache_inode_get_key,
327 .get_attr = cifs_fscache_inode_get_attr,
328 .get_aux = cifs_fscache_inode_get_aux,
329 .check_aux = cifs_fscache_inode_check_aux,
330 .now_uncached = cifs_fscache_inode_now_uncached,
331};
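The three cookie definitions form the FS-Cache index hierarchy for CIFS: server (keyed by address/port/family), superblock (keyed by share name, validated against resource_id), and inode (keyed by UniqueId, validated against mtime/ctime/eof). Cookies are acquired top-down along that hierarchy; a sketch of the assumed call sites elsewhere in the fscache glue:

    server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
                                             &cifs_fscache_server_index_def,
                                             server);
    tcon->fscache = fscache_acquire_cookie(server->fscache,
                                           &cifs_fscache_super_index_def,
                                           tcon);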
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0cf..eb1ba493489f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
60#ifdef CONFIG_CIFS_DEBUG2 60#ifdef CONFIG_CIFS_DEBUG2
61void cifs_dump_detail(struct smb_hdr *smb) 61void cifs_dump_detail(struct smb_hdr *smb)
62{ 62{
63 cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", 63 cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
64 smb->Command, smb->Status.CifsError, 64 smb->Command, smb->Status.CifsError,
65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); 65 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
66 cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb))); 66 cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
67} 67}
68 68
69 69
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
75 if (server == NULL) 75 if (server == NULL)
76 return; 76 return;
77 77
78 cERROR(1, ("Dump pending requests:")); 78 cERROR(1, "Dump pending requests:");
79 spin_lock(&GlobalMid_Lock); 79 spin_lock(&GlobalMid_Lock);
80 list_for_each(tmp, &server->pending_mid_q) { 80 list_for_each(tmp, &server->pending_mid_q) {
81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 81 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
82 cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", 82 cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
83 mid_entry->midState, 83 mid_entry->midState,
84 (int)mid_entry->command, 84 (int)mid_entry->command,
85 mid_entry->pid, 85 mid_entry->pid,
86 mid_entry->tsk, 86 mid_entry->tsk,
87 mid_entry->mid)); 87 mid_entry->mid);
88#ifdef CONFIG_CIFS_STATS2 88#ifdef CONFIG_CIFS_STATS2
89 cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", 89 cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
90 mid_entry->largeBuf, 90 mid_entry->largeBuf,
91 mid_entry->resp_buf, 91 mid_entry->resp_buf,
92 mid_entry->when_received, 92 mid_entry->when_received,
93 jiffies)); 93 jiffies);
94#endif /* STATS2 */ 94#endif /* STATS2 */
95 cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, 95 cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
96 mid_entry->multiEnd)); 96 mid_entry->multiEnd);
97 if (mid_entry->resp_buf) { 97 if (mid_entry->resp_buf) {
98 cifs_dump_detail(mid_entry->resp_buf); 98 cifs_dump_detail(mid_entry->resp_buf);
99 cifs_dump_mem("existing buf: ", 99 cifs_dump_mem("existing buf: ",
@@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
119 "Display Internal CIFS Data Structures for Debugging\n" 119 "Display Internal CIFS Data Structures for Debugging\n"
120 "---------------------------------------------------\n"); 120 "---------------------------------------------------\n");
121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); 121 seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
122 seq_printf(m, "Features: ");
123#ifdef CONFIG_CIFS_DFS_UPCALL
124 seq_printf(m, "dfs");
125 seq_putc(m, ' ');
126#endif
127#ifdef CONFIG_CIFS_FSCACHE
128 seq_printf(m, "fscache");
129 seq_putc(m, ' ');
130#endif
131#ifdef CONFIG_CIFS_WEAK_PW_HASH
132 seq_printf(m, "lanman");
133 seq_putc(m, ' ');
134#endif
135#ifdef CONFIG_CIFS_POSIX
136 seq_printf(m, "posix");
137 seq_putc(m, ' ');
138#endif
139#ifdef CONFIG_CIFS_UPCALL
140 seq_printf(m, "spnego");
141 seq_putc(m, ' ');
142#endif
143#ifdef CONFIG_CIFS_XATTR
144 seq_printf(m, "xattr");
145#endif
146 seq_putc(m, '\n');
122 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); 147 seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
123 seq_printf(m, "Servers:"); 148 seq_printf(m, "Servers:");
124 149
@@ -716,7 +741,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
716 741
717static int cifs_security_flags_proc_show(struct seq_file *m, void *v) 742static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
718{ 743{
719 seq_printf(m, "0x%x\n", extended_security); 744 seq_printf(m, "0x%x\n", global_secflags);
720 return 0; 745 return 0;
721} 746}
722 747
@@ -744,13 +769,13 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
744 /* single char or single char followed by null */ 769 /* single char or single char followed by null */
745 c = flags_string[0]; 770 c = flags_string[0];
746 if (c == '0' || c == 'n' || c == 'N') { 771 if (c == '0' || c == 'n' || c == 'N') {
747 extended_security = CIFSSEC_DEF; /* default */ 772 global_secflags = CIFSSEC_DEF; /* default */
748 return count; 773 return count;
749 } else if (c == '1' || c == 'y' || c == 'Y') { 774 } else if (c == '1' || c == 'y' || c == 'Y') {
750 extended_security = CIFSSEC_MAX; 775 global_secflags = CIFSSEC_MAX;
751 return count; 776 return count;
752 } else if (!isdigit(c)) { 777 } else if (!isdigit(c)) {
753 cERROR(1, ("invalid flag %c", c)); 778 cERROR(1, "invalid flag %c", c);
754 return -EINVAL; 779 return -EINVAL;
755 } 780 }
756 } 781 }
@@ -758,26 +783,26 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
758 783
759 flags = simple_strtoul(flags_string, NULL, 0); 784 flags = simple_strtoul(flags_string, NULL, 0);
760 785
761 cFYI(1, ("sec flags 0x%x", flags)); 786 cFYI(1, "sec flags 0x%x", flags);
762 787
763 if (flags <= 0) { 788 if (flags <= 0) {
764 cERROR(1, ("invalid security flags %s", flags_string)); 789 cERROR(1, "invalid security flags %s", flags_string);
765 return -EINVAL; 790 return -EINVAL;
766 } 791 }
767 792
768 if (flags & ~CIFSSEC_MASK) { 793 if (flags & ~CIFSSEC_MASK) {
769 cERROR(1, ("attempt to set unsupported security flags 0x%x", 794 cERROR(1, "attempt to set unsupported security flags 0x%x",
770 flags & ~CIFSSEC_MASK)); 795 flags & ~CIFSSEC_MASK);
771 return -EINVAL; 796 return -EINVAL;
772 } 797 }
773 /* flags look ok - update the global security flags for cifs module */ 798 /* flags look ok - update the global security flags for cifs module */
774 extended_security = flags; 799 global_secflags = flags;
775 if (extended_security & CIFSSEC_MUST_SIGN) { 800 if (global_secflags & CIFSSEC_MUST_SIGN) {
776 /* requiring signing implies signing is allowed */ 801 /* requiring signing implies signing is allowed */
777 extended_security |= CIFSSEC_MAY_SIGN; 802 global_secflags |= CIFSSEC_MAY_SIGN;
778 cFYI(1, ("packet signing now required")); 803 cFYI(1, "packet signing now required");
779 } else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) { 804 } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
780 cFYI(1, ("packet signing disabled")); 805 cFYI(1, "packet signing disabled");
781 } 806 }
782 /* BB should we turn on MAY flags for other MUST options? */ 807 /* BB should we turn on MAY flags for other MUST options? */
783 return count; 808 return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa7..aa316891ac0c 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
43 */ 43 */
44#ifdef CIFS_DEBUG 44#ifdef CIFS_DEBUG
45 45
46
47/* information message: e.g., configuration, major event */ 46/* information message: e.g., configuration, major event */
48extern int cifsFYI; 47extern int cifsFYI;
49#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg) 48#define cifsfyi(fmt, arg...) \
49do { \
50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
52} while (0)
50 53
51#define cFYI(button,prspec) if (button) cifsfyi prspec 54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0)
52 59
53#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg) 60#define cifswarn(fmt, arg...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
54 62
55/* debug event message: */ 63/* debug event message: */
56extern int cifsERROR; 64extern int cifsERROR;
57 65
58#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg) 66#define cEVENT(fmt, arg...) \
67do { \
68 if (cifsERROR) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
70} while (0)
59 71
60/* error event message: e.g., i/o error */ 72/* error event message: e.g., i/o error */
61#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg) 73#define cifserror(fmt, arg...) \
74do { \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
62 78
63#define cERROR(button, prspec) if (button) cifserror prspec 79#define cERROR(set, fmt, arg...) \
80do { \
81 if (set) \
82 cifserror(fmt, ##arg); \
83} while (0)
64 84
65/* 85/*
66 * debug OFF 86 * debug OFF
67 * --------- 87 * ---------
68 */ 88 */
69#else /* _CIFS_DEBUG */ 89#else /* _CIFS_DEBUG */
70#define cERROR(button, prspec) 90#define cERROR(set, fmt, arg...)
71#define cEVENT(format, arg...) 91#define cEVENT(fmt, arg...)
72#define cFYI(button, prspec) 92#define cFYI(set, fmt, arg...)
73#define cifserror(format, arg...) 93#define cifserror(fmt, arg...)
74#endif /* _CIFS_DEBUG */ 94#endif /* _CIFS_DEBUG */
75 95
76#endif /* _H_CIFS_DEBUG */ 96#endif /* _H_CIFS_DEBUG */
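
This header rewrite is what forces every cFYI/cERROR call site in the rest of the diff to drop its inner parentheses. The old macros received one pre-parenthesized argument pack (prspec) and pasted it after the printing helper, which is fragile: it cannot sit safely in an unbraced if/else and gives the compiler nothing to type-check. The new macros are GNU-style variadic macros (named arg... plus ##) wrapped in do { } while (0). A compilable toy (gcc) contrasting the two shapes:

#include <stdio.h>

/* old shape: the caller supplies its own parentheses */
#define oldfyi(prspec)  printf prspec

/* new shape: a real variadic macro, statement-safe via do/while(0) */
#define newfyi(fmt, arg...)                                     \
do {                                                            \
        printf("%s: " fmt "\n", __FILE__, ##arg);               \
} while (0)

int main(void)
{
        oldfyi(("old style: %d\n", 1));  /* note the double parentheses */

        if (1)
                newfyi("new style: %d", 2); /* expands to one statement, */
        else                                /* so this else still binds  */
                newfyi("unreachable");      /* where the author expects  */
        return 0;
}

The do/while(0) wrapper is what lets each macro stand anywhere a single statement can, including immediately before an else.
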
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 78e4d2a3a68b..d6ced7aa23cf 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -85,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
85 /* find server name end */ 85 /* find server name end */
86 pSep = memchr(UNC+2, '\\', len-2); 86 pSep = memchr(UNC+2, '\\', len-2);
87 if (!pSep) { 87 if (!pSep) {
88 cERROR(1, ("%s: no server name end in node name: %s", 88 cERROR(1, "%s: no server name end in node name: %s",
89 __func__, node_name)); 89 __func__, node_name);
90 kfree(UNC); 90 kfree(UNC);
91 return ERR_PTR(-EINVAL); 91 return ERR_PTR(-EINVAL);
92 } 92 }
@@ -141,17 +141,16 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 } 141 }
142 142
143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 143 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
144 if (rc != 0) { 144 if (rc < 0) {
145 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 145 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
146 __func__, *devname, rc)); 146 __func__, *devname, rc);
147 goto compose_mount_options_err; 147 goto compose_mount_options_err;
148 } 148 }
149 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
150 * assuming that we have 'unc=' and 'ip=' in 150 * assuming that we have 'unc=' and 'ip=' in
151 * the original sb_mountdata 151 * the original sb_mountdata
152 */ 152 */
153 md_len = strlen(sb_mountdata) + strlen(srvIP) + 153 md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
154 strlen(ref->node_name) + 12;
155 mountdata = kzalloc(md_len+1, GFP_KERNEL); 154 mountdata = kzalloc(md_len+1, GFP_KERNEL);
156 if (mountdata == NULL) { 155 if (mountdata == NULL) {
157 rc = -ENOMEM; 156 rc = -ENOMEM;
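
The two changes in this hunk go together: as the hunk implies, dns_resolve_server_name_to_ip() now returns the length of the resolved address string on success and a negative error code on failure, so the caller switches from "rc != 0" to "rc < 0" and reuses rc instead of calling strlen(srvIP) a second time when sizing the buffer. A user-space sketch of that return convention, with a hypothetical resolver standing in for the real upcall:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* convention: return strlen(result) on success, -errno on failure */
static int resolve_server_name(const char *name, char **ip_out)
{
        const char *fake_result = "192.168.0.1"; /* stand-in for a lookup */

        if (name == NULL || *name == '\0')
                return -EINVAL;
        *ip_out = strdup(fake_result);
        if (*ip_out == NULL)
                return -ENOMEM;
        return strlen(*ip_out);    /* caller can size buffers from this */
}

int main(void)
{
        char *srvIP = NULL;
        int rc = resolve_server_name("server.example.com", &srvIP);

        if (rc < 0) {              /* mirrors the new rc < 0 check above */
                fprintf(stderr, "resolve failed: %d\n", rc);
                return 1;
        }
        printf("resolved to %s, length %d\n", srvIP, rc);
        free(srvIP);
        return 0;
}
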
@@ -217,8 +216,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
217 strcat(mountdata, fullpath + ref->path_consumed); 216 strcat(mountdata, fullpath + ref->path_consumed);
218 } 217 }
219 218
220 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 219 /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
221 /*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/ 220 /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
222 221
223compose_mount_options_out: 222compose_mount_options_out:
224 kfree(srvIP); 223 kfree(srvIP);
@@ -230,28 +229,22 @@ compose_mount_options_err:
230 goto compose_mount_options_out; 229 goto compose_mount_options_out;
231} 230}
232 231
233 232/**
234static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, 233 * cifs_dfs_do_refmount - mounts the specified path using the provided referral
235 struct dentry *dentry, const struct dfs_info3_param *ref) 234 * @cifs_sb: parent/root superblock
235 * @fullpath: full path in UNC format
236 * @ref: server's referral
237 */
238static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
239 const char *fullpath, const struct dfs_info3_param *ref)
236{ 240{
237 struct cifs_sb_info *cifs_sb;
238 struct vfsmount *mnt; 241 struct vfsmount *mnt;
239 char *mountdata; 242 char *mountdata;
240 char *devname = NULL; 243 char *devname = NULL;
241 char *fullpath;
242
243 cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
244 /*
245 * this function gives us a path with a double backslash prefix. We
246 * require a single backslash for DFS.
247 */
248 fullpath = build_path_from_dentry(dentry);
249 if (!fullpath)
250 return ERR_PTR(-ENOMEM);
251 244
245 /* strip first '\' from fullpath */
252 mountdata = cifs_compose_mount_options(cifs_sb->mountdata, 246 mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
253 fullpath + 1, ref, &devname); 247 fullpath + 1, ref, &devname);
254 kfree(fullpath);
255 248
256 if (IS_ERR(mountdata)) 249 if (IS_ERR(mountdata))
257 return (struct vfsmount *)mountdata; 250 return (struct vfsmount *)mountdata;
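
The "return (struct vfsmount *)mountdata" above works because kernel error returns are encoded in the pointer value itself: ERR_PTR() stores a small negative errno at the very top of the address space, IS_ERR() recognizes it, and the encoding survives a cast to any other pointer type. A user-space re-implementation of the idiom, assuming the usual 4095-errno reservation:

#include <errno.h>
#include <stdio.h>

/* user-space stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static char *compose_mount_options(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM); /* errno travels inside the pointer */
        return "unc=\\\\server\\share";
}

int main(void)
{
        char *mountdata = compose_mount_options(1);

        if (IS_ERR(mountdata)) {
                /* the cast-and-return in the hunk above is exactly this */
                printf("propagating error %ld\n", PTR_ERR(mountdata));
                return 1;
        }
        printf("mountdata: %s\n", mountdata);
        return 0;
}
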
@@ -294,11 +287,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
294 287
295static void dump_referral(const struct dfs_info3_param *ref) 288static void dump_referral(const struct dfs_info3_param *ref)
296{ 289{
297 cFYI(1, ("DFS: ref path: %s", ref->path_name)); 290 cFYI(1, "DFS: ref path: %s", ref->path_name);
298 cFYI(1, ("DFS: node path: %s", ref->node_name)); 291 cFYI(1, "DFS: node path: %s", ref->node_name);
299 cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type)); 292 cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
300 cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, 293 cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
301 ref->path_consumed)); 294 ref->path_consumed);
302} 295}
303 296
304 297
@@ -314,7 +307,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
314 int rc = 0; 307 int rc = 0;
315 struct vfsmount *mnt = ERR_PTR(-ENOENT); 308 struct vfsmount *mnt = ERR_PTR(-ENOENT);
316 309
317 cFYI(1, ("in %s", __func__)); 310 cFYI(1, "in %s", __func__);
318 BUG_ON(IS_ROOT(dentry)); 311 BUG_ON(IS_ROOT(dentry));
319 312
320 xid = GetXid(); 313 xid = GetXid();
@@ -352,15 +345,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
352 /* connect to a node */ 345 /* connect to a node */
353 len = strlen(referrals[i].node_name); 346 len = strlen(referrals[i].node_name);
354 if (len < 2) { 347 if (len < 2) {
355 cERROR(1, ("%s: Net Address path too short: %s", 348 cERROR(1, "%s: Net Address path too short: %s",
356 __func__, referrals[i].node_name)); 349 __func__, referrals[i].node_name);
357 rc = -EINVAL; 350 rc = -EINVAL;
358 goto out_err; 351 goto out_err;
359 } 352 }
360 mnt = cifs_dfs_do_refmount(nd->path.mnt, 353 mnt = cifs_dfs_do_refmount(cifs_sb,
361 nd->path.dentry, referrals + i); 354 full_path, referrals + i);
362 cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, 355 cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
363 referrals[i].node_name, mnt)); 356 referrals[i].node_name, mnt);
364 357
365 /* complete mount procedure if we acquired submount */ 358 /* complete mount procedure if we acquired submount */
366 if (!IS_ERR(mnt)) 359 if (!IS_ERR(mnt))
@@ -378,7 +371,7 @@ out:
378 FreeXid(xid); 371 FreeXid(xid);
379 free_dfs_info_array(referrals, num_referrals); 372 free_dfs_info_array(referrals, num_referrals);
380 kfree(full_path); 373 kfree(full_path);
381 cFYI(1, ("leaving %s" , __func__)); 374 cFYI(1, "leaving %s" , __func__);
382 return ERR_PTR(rc); 375 return ERR_PTR(rc);
383out_err: 376out_err:
384 path_put(&nd->path); 377 path_put(&nd->path);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 246a167cb913..9e771450c3b8 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -35,6 +35,7 @@
35#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ 35#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ 36#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ 37#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/
38#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */
38 39
39struct cifs_sb_info { 40struct cifs_sb_info {
40 struct cifsTconInfo *tcon; /* primary mount */ 41 struct cifsTconInfo *tcon; /* primary mount */
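
The fscache support added throughout this series claims the next free bit in mnt_cifs_flags, with the values shown in the hunk above. Each mount option is a single bit, so setting and testing stay one mask operation each; a trivial sketch:

#include <stdio.h>

#define CIFS_MOUNT_NOSSYNC 0x4000   /* existing neighbor, for context */
#define CIFS_MOUNT_FSCACHE 0x8000   /* the newly claimed bit */

int main(void)
{
        unsigned int mnt_cifs_flags = 0;

        mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;    /* set at option-parse time */
        if (mnt_cifs_flags & CIFS_MOUNT_FSCACHE) /* tested at use time */
                printf("local caching enabled\n");
        return 0;
}
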
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 310d12f69a92..87044906cd1f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
84/* strlen of ";uid=0x" */ 84/* strlen of ";uid=0x" */
85#define UID_KEY_LEN 7 85#define UID_KEY_LEN 7
86 86
87/* strlen of ";creduid=0x" */
88#define CREDUID_KEY_LEN 11
89
87/* strlen of ";user=" */ 90/* strlen of ";user=" */
88#define USER_KEY_LEN 6 91#define USER_KEY_LEN 6
89 92
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
107 IP_KEY_LEN + INET6_ADDRSTRLEN + 110 IP_KEY_LEN + INET6_ADDRSTRLEN +
108 MAX_MECH_STR_LEN + 111 MAX_MECH_STR_LEN +
109 UID_KEY_LEN + (sizeof(uid_t) * 2) + 112 UID_KEY_LEN + (sizeof(uid_t) * 2) +
113 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
110 USER_KEY_LEN + strlen(sesInfo->userName) + 114 USER_KEY_LEN + strlen(sesInfo->userName) +
111 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; 115 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
112 116
@@ -133,9 +137,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
133 dp = description + strlen(description); 137 dp = description + strlen(description);
134 138
135 /* for now, only sec=krb5 and sec=mskrb5 are valid */ 139 /* for now, only sec=krb5 and sec=mskrb5 are valid */
136 if (server->secType == Kerberos) 140 if (server->sec_kerberos)
137 sprintf(dp, ";sec=krb5"); 141 sprintf(dp, ";sec=krb5");
138 else if (server->secType == MSKerberos) 142 else if (server->sec_mskerberos)
139 sprintf(dp, ";sec=mskrb5"); 143 sprintf(dp, ";sec=mskrb5");
140 else 144 else
141 goto out; 145 goto out;
@@ -144,12 +148,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
144 sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); 148 sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
145 149
146 dp = description + strlen(description); 150 dp = description + strlen(description);
151 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
152
153 dp = description + strlen(description);
147 sprintf(dp, ";user=%s", sesInfo->userName); 154 sprintf(dp, ";user=%s", sesInfo->userName);
148 155
149 dp = description + strlen(description); 156 dp = description + strlen(description);
150 sprintf(dp, ";pid=0x%x", current->pid); 157 sprintf(dp, ";pid=0x%x", current->pid);
151 158
152 cFYI(1, ("key description = %s", description)); 159 cFYI(1, "key description = %s", description);
153 spnego_key = request_key(&cifs_spnego_key_type, description, ""); 160 spnego_key = request_key(&cifs_spnego_key_type, description, "");
154 161
155#ifdef CONFIG_CIFS_DEBUG2 162#ifdef CONFIG_CIFS_DEBUG2
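
The key description above is assembled by repeated sprintf() at the running end of the buffer, so the up-front desc_len arithmetic must budget every segment exactly; that is why adding the ";creduid=0x..." segment also adds CREDUID_KEY_LEN plus two hex digits per byte of uid_t. A standalone sketch of one segment's budget:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CREDUID_KEY_LEN 11          /* strlen of ";creduid=0x" */

int main(void)
{
        unsigned int cred_uid = 1000;
        /* fixed prefix + 2 hex chars per byte of the value + trailing NUL */
        size_t desc_len = CREDUID_KEY_LEN + sizeof(cred_uid) * 2 + 1;
        char *description = malloc(desc_len);
        char *dp;

        if (description == NULL)
                return 1;
        dp = description;
        dp += sprintf(dp, ";creduid=0x%x", cred_uid);
        printf("%s (used %d of %zu bytes)\n",
               description, (int)(dp - description), desc_len);
        free(description);
        return 0;
}
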
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d07676bd76d2..430f510a1720 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -200,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
200 /* works for 2.4.0 kernel or later */ 200 /* works for 2.4.0 kernel or later */
201 charlen = codepage->char2uni(from, len, &wchar_to[i]); 201 charlen = codepage->char2uni(from, len, &wchar_to[i]);
202 if (charlen < 1) { 202 if (charlen < 1) {
203 cERROR(1, 203 cERROR(1, "strtoUCS: char2uni of %d returned %d",
204 ("strtoUCS: char2uni of %d returned %d", 204 (int)*from, charlen);
205 (int)*from, charlen));
206 /* A question mark */ 205 /* A question mark */
207 to[i] = cpu_to_le16(0x003f); 206 to[i] = cpu_to_le16(0x003f);
208 charlen = 1; 207 charlen = 1;
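
The conversion loop above leans on the NLS contract that char2uni() returns the number of bytes consumed, or less than 1 when the input is invalid for the codepage; on failure it substitutes U+003F ('?') and advances one byte so a single bad character cannot stall or abort the whole string. A sketch with a hypothetical ASCII-only converter:

#include <stdio.h>

/* hypothetical converter: bytes consumed on success, -1 on bad input */
static int char2uni(const char *from, unsigned short *uni)
{
        if ((unsigned char)*from < 0x80) {
                *uni = (unsigned char)*from;
                return 1;
        }
        return -1;                  /* not valid in this toy codepage */
}

int main(void)
{
        const char *from = "a\xffz"; /* 0xff will fail to convert */
        unsigned short to[8];
        int i = 0, charlen;

        for (const char *p = from; *p != '\0'; p += charlen, i++) {
                charlen = char2uni(p, &to[i]);
                if (charlen < 1) {
                        to[i] = 0x003f; /* substitute a question mark */
                        charlen = 1;    /* and skip the offending byte */
                }
        }
        for (int j = 0; j < i; j++)
                printf("U+%04X\n", to[j]);
        return 0;
}
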
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
30 * This is a compressed table of upper and lower case conversion. 30 * This is a compressed table of upper and lower case conversion.
31 * 31 *
32 */ 32 */
33#ifndef _CIFS_UNICODE_H
34#define _CIFS_UNICODE_H
33 35
34#include <asm/byteorder.h> 36#include <asm/byteorder.h>
35#include <linux/types.h> 37#include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
67#endif /* UNIUPR_NOUPPER */ 69#endif /* UNIUPR_NOUPPER */
68 70
69#ifndef UNIUPR_NOLOWER 71#ifndef UNIUPR_NOLOWER
70extern signed char UniLowerTable[512]; 72extern signed char CifsUniLowerTable[512];
71extern struct UniCaseRange UniLowerRange[]; 73extern const struct UniCaseRange CifsUniLowerRange[];
72#endif /* UNIUPR_NOLOWER */ 74#endif /* UNIUPR_NOLOWER */
73 75
74#ifdef __KERNEL__ 76#ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
337 * UniTolower: Convert a unicode character to lower case 339 * UniTolower: Convert a unicode character to lower case
338 */ 340 */
339static inline wchar_t 341static inline wchar_t
340UniTolower(wchar_t uc) 342UniTolower(register wchar_t uc)
341{ 343{
342 register struct UniCaseRange *rp; 344 register const struct UniCaseRange *rp;
343 345
344 if (uc < sizeof(UniLowerTable)) { 346 if (uc < sizeof(CifsUniLowerTable)) {
345 /* Latin characters */ 347 /* Latin characters */
346 return uc + UniLowerTable[uc]; /* Use base tables */ 348 return uc + CifsUniLowerTable[uc]; /* Use base tables */
347 } else { 349 } else {
348 rp = UniLowerRange; /* Use range tables */ 350 rp = CifsUniLowerRange; /* Use range tables */
349 while (rp->start) { 351 while (rp->start) {
350 if (uc < rp->start) /* Before start of range */ 352 if (uc < rp->start) /* Before start of range */
351 return uc; /* Uppercase = input */ 353 return uc; /* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
374} 376}
375 377
376#endif 378#endif
379
380#endif /* _CIFS_UNICODE_H */
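
The only structural change to this header is the include guard (the same guard is added to cifsglob.h later in this diff). The pattern, for reference; a second #include of the file becomes a no-op because the macro is already defined on the second pass:

/* example.h -- minimal include-guard sketch */
#ifndef _EXAMPLE_H
#define _EXAMPLE_H

struct example {
        int value;
};

#endif /* _EXAMPLE_H */
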
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
140/* 140/*
141 * Latin lower case 141 * Latin lower case
142 */ 142 */
143static signed char CifsUniLowerTable[512] = { 143signed char CifsUniLowerTable[512] = {
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ 146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
242/* 242/*
243 * Lower Case Range 243 * Lower Case Range
244 */ 244 */
245static const struct UniCaseRange CifsUniLowerRange[] = { 245const struct UniCaseRange CifsUniLowerRange[] = {
246 0x0380, 0x03ab, UniCaseRangeL0380, 246 {0x0380, 0x03ab, UniCaseRangeL0380},
247 0x0400, 0x042f, UniCaseRangeL0400, 247 {0x0400, 0x042f, UniCaseRangeL0400},
248 0x0490, 0x04cb, UniCaseRangeL0490, 248 {0x0490, 0x04cb, UniCaseRangeL0490},
249 0x1e00, 0x1ff7, UniCaseRangeL1e00, 249 {0x1e00, 0x1ff7, UniCaseRangeL1e00},
250 0xff20, 0xff3a, UniCaseRangeLff20, 250 {0xff20, 0xff3a, UniCaseRangeLff20},
251 0, 0, 0 251 {0}
252}; 252};
253#endif 253#endif
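
Two fixes travel together in this table: each array element gets its own brace group (the flattened "0x0380, 0x03ab, UniCaseRangeL0380," form relied on lenient brace elision), and the terminator shrinks to {0}, a fully zeroed element whose start == 0 ends the while (rp->start) walk in UniTolower() above. A compilable miniature of the same table shape:

#include <stdio.h>

struct UniCaseRange {
        unsigned short start;
        unsigned short end;
        const signed char *table;
};

static const signed char dummy_table[1] = { 0 };

static const struct UniCaseRange lower_ranges[] = {
        {0x0380, 0x03ab, dummy_table},
        {0x0400, 0x042f, dummy_table},
        {0}                     /* zeroed sentinel terminates the walk */
};

int main(void)
{
        for (const struct UniCaseRange *rp = lower_ranges; rp->start; rp++)
                printf("range 0x%04x-0x%04x\n", rp->start, rp->end);
        return 0;
}
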
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 9b716d044bbd..85d7cf7ff2c8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
87 continue; /* all sub_auth values do not match */ 87 continue; /* all sub_auth values do not match */
88 } 88 }
89 89
90 cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); 90 cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
91 return 0; /* sids compare/match */ 91 return 0; /* sids compare/match */
92 } 92 }
93 93
94 cFYI(1, ("No matching sid")); 94 cFYI(1, "No matching sid");
95 return -1; 95 return -1;
96} 96}
97 97
@@ -208,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
208 *pbits_to_set &= ~S_IXUGO; 208 *pbits_to_set &= ~S_IXUGO;
209 return; 209 return;
210 } else if (type != ACCESS_ALLOWED) { 210 } else if (type != ACCESS_ALLOWED) {
211 cERROR(1, ("unknown access control type %d", type)); 211 cERROR(1, "unknown access control type %d", type);
212 return; 212 return;
213 } 213 }
214 /* else ACCESS_ALLOWED type */ 214 /* else ACCESS_ALLOWED type */
215 215
216 if (flags & GENERIC_ALL) { 216 if (flags & GENERIC_ALL) {
217 *pmode |= (S_IRWXUGO & (*pbits_to_set)); 217 *pmode |= (S_IRWXUGO & (*pbits_to_set));
218 cFYI(DBG2, ("all perms")); 218 cFYI(DBG2, "all perms");
219 return; 219 return;
220 } 220 }
221 if ((flags & GENERIC_WRITE) || 221 if ((flags & GENERIC_WRITE) ||
@@ -228,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) 228 ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
229 *pmode |= (S_IXUGO & (*pbits_to_set)); 229 *pmode |= (S_IXUGO & (*pbits_to_set));
230 230
231 cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode)); 231 cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
232 return; 232 return;
233} 233}
234 234
@@ -257,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
257 if (mode & S_IXUGO) 257 if (mode & S_IXUGO)
258 *pace_flags |= SET_FILE_EXEC_RIGHTS; 258 *pace_flags |= SET_FILE_EXEC_RIGHTS;
259 259
260 cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags)); 260 cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
261 return; 261 return;
262} 262}
263 263
@@ -297,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
297 /* validate that we do not go past end of acl */ 297 /* validate that we do not go past end of acl */
298 298
299 if (le16_to_cpu(pace->size) < 16) { 299 if (le16_to_cpu(pace->size) < 16) {
300 cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size))); 300 cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
301 return; 301 return;
302 } 302 }
303 303
304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { 304 if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
305 cERROR(1, ("ACL too small to parse ACE")); 305 cERROR(1, "ACL too small to parse ACE");
306 return; 306 return;
307 } 307 }
308 308
309 num_subauth = pace->sid.num_subauth; 309 num_subauth = pace->sid.num_subauth;
310 if (num_subauth) { 310 if (num_subauth) {
311 int i; 311 int i;
312 cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d", 312 cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
313 pace->sid.revision, pace->sid.num_subauth, pace->type, 313 pace->sid.revision, pace->sid.num_subauth, pace->type,
314 pace->flags, le16_to_cpu(pace->size))); 314 pace->flags, le16_to_cpu(pace->size));
315 for (i = 0; i < num_subauth; ++i) { 315 for (i = 0; i < num_subauth; ++i) {
316 cFYI(1, ("ACE sub_auth[%d]: 0x%x", i, 316 cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
317 le32_to_cpu(pace->sid.sub_auth[i]))); 317 le32_to_cpu(pace->sid.sub_auth[i]));
318 } 318 }
319 319
320 /* BB add length check to make sure that we do not have huge 320 /* BB add length check to make sure that we do not have huge
@@ -347,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
347 347
348 /* validate that we do not go past end of acl */ 348 /* validate that we do not go past end of acl */
349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { 349 if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
350 cERROR(1, ("ACL too small to parse DACL")); 350 cERROR(1, "ACL too small to parse DACL");
351 return; 351 return;
352 } 352 }
353 353
354 cFYI(DBG2, ("DACL revision %d size %d num aces %d", 354 cFYI(DBG2, "DACL revision %d size %d num aces %d",
355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), 355 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
356 le32_to_cpu(pdacl->num_aces))); 356 le32_to_cpu(pdacl->num_aces));
357 357
358 /* reset rwx permissions for user/group/other. 358 /* reset rwx permissions for user/group/other.
359 Also, if num_aces is 0 i.e. DACL has no ACEs, 359 Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -437,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
437 /* validate that we do not go past end of ACL - sid must be at least 8 437 /* validate that we do not go past end of ACL - sid must be at least 8
438 bytes long (assuming no sub-auths - e.g. the null SID */ 438 bytes long (assuming no sub-auths - e.g. the null SID */
439 if (end_of_acl < (char *)psid + 8) { 439 if (end_of_acl < (char *)psid + 8) {
440 cERROR(1, ("ACL too small to parse SID %p", psid)); 440 cERROR(1, "ACL too small to parse SID %p", psid);
441 return -EINVAL; 441 return -EINVAL;
442 } 442 }
443 443
444 if (psid->num_subauth) { 444 if (psid->num_subauth) {
445#ifdef CONFIG_CIFS_DEBUG2 445#ifdef CONFIG_CIFS_DEBUG2
446 int i; 446 int i;
447 cFYI(1, ("SID revision %d num_auth %d", 447 cFYI(1, "SID revision %d num_auth %d",
448 psid->revision, psid->num_subauth)); 448 psid->revision, psid->num_subauth);
449 449
450 for (i = 0; i < psid->num_subauth; i++) { 450 for (i = 0; i < psid->num_subauth; i++) {
451 cFYI(1, ("SID sub_auth[%d]: 0x%x ", i, 451 cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
452 le32_to_cpu(psid->sub_auth[i]))); 452 le32_to_cpu(psid->sub_auth[i]));
453 } 453 }
454 454
455 /* BB add length check to make sure that we do not have huge 455 /* BB add length check to make sure that we do not have huge
456 num auths and therefore go off the end */ 456 num auths and therefore go off the end */
457 cFYI(1, ("RID 0x%x", 457 cFYI(1, "RID 0x%x",
458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]))); 458 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
459#endif 459#endif
460 } 460 }
461 461
@@ -482,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
482 le32_to_cpu(pntsd->gsidoffset)); 482 le32_to_cpu(pntsd->gsidoffset));
483 dacloffset = le32_to_cpu(pntsd->dacloffset); 483 dacloffset = le32_to_cpu(pntsd->dacloffset);
484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); 484 dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
485 cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x " 485 cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
486 "sacloffset 0x%x dacloffset 0x%x", 486 "sacloffset 0x%x dacloffset 0x%x",
487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), 487 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
488 le32_to_cpu(pntsd->gsidoffset), 488 le32_to_cpu(pntsd->gsidoffset),
489 le32_to_cpu(pntsd->sacloffset), dacloffset)); 489 le32_to_cpu(pntsd->sacloffset), dacloffset);
490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ 490/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
491 rc = parse_sid(owner_sid_ptr, end_of_acl); 491 rc = parse_sid(owner_sid_ptr, end_of_acl);
492 if (rc) 492 if (rc)
@@ -500,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, 500 parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
501 group_sid_ptr, fattr); 501 group_sid_ptr, fattr);
502 else 502 else
503 cFYI(1, ("no ACL")); /* BB grant all or default perms? */ 503 cFYI(1, "no ACL"); /* BB grant all or default perms? */
504 504
505/* cifscred->uid = owner_sid_ptr->rid; 505/* cifscred->uid = owner_sid_ptr->rid;
506 cifscred->gid = group_sid_ptr->rid; 506 cifscred->gid = group_sid_ptr->rid;
@@ -563,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
563 FreeXid(xid); 563 FreeXid(xid);
564 564
565 565
566 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 566 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
567 return pntsd; 567 return pntsd;
568} 568}
569 569
@@ -581,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
581 &fid, &oplock, NULL, cifs_sb->local_nls, 581 &fid, &oplock, NULL, cifs_sb->local_nls,
582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 582 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
583 if (rc) { 583 if (rc) {
584 cERROR(1, ("Unable to open file to get ACL")); 584 cERROR(1, "Unable to open file to get ACL");
585 goto out; 585 goto out;
586 } 586 }
587 587
588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); 588 rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
589 cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); 589 cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
590 590
591 CIFSSMBClose(xid, cifs_sb->tcon, fid); 591 CIFSSMBClose(xid, cifs_sb->tcon, fid);
592 out: 592 out:
@@ -621,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 621 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
622 FreeXid(xid); 622 FreeXid(xid);
623 623
624 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 624 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
625 return rc; 625 return rc;
626} 626}
627 627
@@ -638,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
638 &fid, &oplock, NULL, cifs_sb->local_nls, 638 &fid, &oplock, NULL, cifs_sb->local_nls,
639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 639 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
640 if (rc) { 640 if (rc) {
641 cERROR(1, ("Unable to open file to set ACL")); 641 cERROR(1, "Unable to open file to set ACL");
642 goto out; 642 goto out;
643 } 643 }
644 644
645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); 645 rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
646 cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); 646 cFYI(DBG2, "SetCIFSACL rc = %d", rc);
647 647
648 CIFSSMBClose(xid, cifs_sb->tcon, fid); 648 CIFSSMBClose(xid, cifs_sb->tcon, fid);
649 out: 649 out:
@@ -659,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
659 struct cifsFileInfo *open_file; 659 struct cifsFileInfo *open_file;
660 int rc; 660 int rc;
661 661
662 cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); 662 cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
663 663
664 open_file = find_readable_file(CIFS_I(inode)); 664 open_file = find_readable_file(CIFS_I(inode));
665 if (!open_file) 665 if (!open_file)
@@ -679,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
679 u32 acllen = 0; 679 u32 acllen = 0;
680 int rc = 0; 680 int rc = 0;
681 681
682 cFYI(DBG2, ("converting ACL to mode for %s", path)); 682 cFYI(DBG2, "converting ACL to mode for %s", path);
683 683
684 if (pfid) 684 if (pfid)
685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); 685 pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -690,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
690 if (pntsd) 690 if (pntsd)
691 rc = parse_sec_desc(pntsd, acllen, fattr); 691 rc = parse_sec_desc(pntsd, acllen, fattr);
692 if (rc) 692 if (rc)
693 cFYI(1, ("parse sec desc failed rc = %d", rc)); 693 cFYI(1, "parse sec desc failed rc = %d", rc);
694 694
695 kfree(pntsd); 695 kfree(pntsd);
696 return; 696 return;
@@ -704,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ 704 struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ 705 struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
706 706
707 cFYI(DBG2, ("set ACL from mode for %s", path)); 707 cFYI(DBG2, "set ACL from mode for %s", path);
708 708
709 /* Get the security descriptor */ 709 /* Get the security descriptor */
710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 710 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -721,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
721 DEFSECDESCLEN : secdesclen; 721 DEFSECDESCLEN : secdesclen;
722 pnntsd = kmalloc(secdesclen, GFP_KERNEL); 722 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
723 if (!pnntsd) { 723 if (!pnntsd) {
724 cERROR(1, ("Unable to allocate security descriptor")); 724 cERROR(1, "Unable to allocate security descriptor");
725 kfree(pntsd); 725 kfree(pntsd);
726 return -ENOMEM; 726 return -ENOMEM;
727 } 727 }
728 728
729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode); 729 rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
730 730
731 cFYI(DBG2, ("build_sec_desc rc: %d", rc)); 731 cFYI(DBG2, "build_sec_desc rc: %d", rc);
732 732
733 if (!rc) { 733 if (!rc) {
734 /* Set the security descriptor */ 734 /* Set the security descriptor */
735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path); 735 rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
736 cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); 736 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
737 } 737 }
738 738
739 kfree(pnntsd); 739 kfree(pnntsd);
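
A recurring shape in cifsacl.c: every structure in the security descriptor carries a self-declared size, and since the descriptor arrives off the wire, each parser checks that size against end_of_acl before dereferencing further. A user-space distillation of the check (the real code also byte-swaps with le16_to_cpu(), omitted here):

#include <stdint.h>
#include <stdio.h>

struct ace {                    /* simplified wire format */
        uint16_t size;          /* self-declared, attacker-controlled */
        unsigned char body[];
};

static int parse_ace(const struct ace *pace, const char *end_of_acl)
{
        if (pace->size < sizeof(*pace))
                return -1;      /* too small to be a valid ACE */
        if (end_of_acl < (const char *)pace + pace->size)
                return -1;      /* declared size runs past the buffer */
        return 0;               /* body is safe to read */
}

int main(void)
{
        uint16_t buf[8] = { 0 };
        struct ace *pace = (struct ace *)buf;

        pace->size = 64;        /* lies: the buffer is only 16 bytes */
        printf("parse_ace: %d\n",
               parse_ace(pace, (const char *)buf + sizeof(buf)));
        return 0;
}
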
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index fbe986430d0c..35042d8f7338 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -103,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
103 if (iov[i].iov_len == 0) 103 if (iov[i].iov_len == 0)
104 continue; 104 continue;
105 if (iov[i].iov_base == NULL) { 105 if (iov[i].iov_base == NULL) {
106 cERROR(1, ("null iovec entry")); 106 cERROR(1, "null iovec entry");
107 return -EIO; 107 return -EIO;
108 } 108 }
109 /* The first entry includes a length field (which does not get 109 /* The first entry includes a length field (which does not get
@@ -181,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
181 181
182 /* Do not need to verify session setups with signature "BSRSPYL " */ 182 /* Do not need to verify session setups with signature "BSRSPYL " */
183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) 183 if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
184 cFYI(1, ("dummy signature received for smb command 0x%x", 184 cFYI(1, "dummy signature received for smb command 0x%x",
185 cifs_pdu->Command)); 185 cifs_pdu->Command);
186 186
187 /* save off the original signature so we can modify the smb and check 187 /* save off the original signature so we can modify the smb and check
188 its signature against what the server sent */ 188 its signature against what the server sent */
@@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
223 return 0; 223 return 0;
224} 224}
225 225
226int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
227 const struct nls_table *nls_info)
228{
229 char temp_hash[16];
230 struct HMACMD5Context ctx;
231 char *ucase_buf;
232 __le16 *unicode_buf;
233 unsigned int i, user_name_len, dom_name_len;
234
235 if (ses == NULL)
236 return -EINVAL;
237
238 E_md4hash(ses->password, temp_hash);
239
240 hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
241 user_name_len = strlen(ses->userName);
242 if (user_name_len > MAX_USERNAME_SIZE)
243 return -EINVAL;
244 if (ses->domainName == NULL)
245 return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
246 dom_name_len = strlen(ses->domainName);
247 if (dom_name_len > MAX_USERNAME_SIZE)
248 return -EINVAL;
249
250 ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
251 if (ucase_buf == NULL)
252 return -ENOMEM;
253 unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
254 if (unicode_buf == NULL) {
255 kfree(ucase_buf);
256 return -ENOMEM;
257 }
258
259 for (i = 0; i < user_name_len; i++)
260 ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
261 ucase_buf[i] = 0;
262 user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
263 MAX_USERNAME_SIZE*2, nls_info);
264 unicode_buf[user_name_len] = 0;
265 user_name_len++;
266
267 for (i = 0; i < dom_name_len; i++)
268 ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
269 ucase_buf[i] = 0;
270 dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
271 MAX_USERNAME_SIZE*2, nls_info);
272
273 unicode_buf[user_name_len + dom_name_len] = 0;
274 hmac_md5_update((const unsigned char *) unicode_buf,
275 (user_name_len+dom_name_len)*2, &ctx);
276
277 hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
278 kfree(ucase_buf);
279 kfree(unicode_buf);
280 return 0;
281}
282
283#ifdef CONFIG_CIFS_WEAK_PW_HASH 226#ifdef CONFIG_CIFS_WEAK_PW_HASH
284void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, 227void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
285 char *lnm_session_key) 228 char *lnm_session_key)
@@ -291,7 +234,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
291 if (password) 234 if (password)
292 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 235 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
293 236
294 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) { 237 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
295 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 238 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
296 memcpy(lnm_session_key, password_with_pad, 239 memcpy(lnm_session_key, password_with_pad,
297 CIFS_ENCPWD_SIZE); 240 CIFS_ENCPWD_SIZE);
@@ -398,7 +341,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
398 /* calculate buf->ntlmv2_hash */ 341 /* calculate buf->ntlmv2_hash */
399 rc = calc_ntlmv2_hash(ses, nls_cp); 342 rc = calc_ntlmv2_hash(ses, nls_cp);
400 if (rc) 343 if (rc)
401 cERROR(1, ("could not get v2 hash rc %d", rc)); 344 cERROR(1, "could not get v2 hash rc %d", rc);
402 CalcNTLMv2_response(ses, resp_buf); 345 CalcNTLMv2_response(ses, resp_buf);
403 346
404 /* now calculate the MAC key for NTLMv2 */ 347 /* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ad235d604a0b..b7431afdd76d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -45,14 +45,10 @@
45#include "cifs_fs_sb.h" 45#include "cifs_fs_sb.h"
46#include <linux/mm.h> 46#include <linux/mm.h>
47#include <linux/key-type.h> 47#include <linux/key-type.h>
48#include "dns_resolve.h"
49#include "cifs_spnego.h" 48#include "cifs_spnego.h"
49#include "fscache.h"
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA
53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */
55
56int cifsFYI = 0; 52int cifsFYI = 0;
57int cifsERROR = 1; 53int cifsERROR = 1;
58int traceSMB = 0; 54int traceSMB = 0;
@@ -61,7 +57,7 @@ unsigned int experimEnabled = 0;
61unsigned int linuxExtEnabled = 1; 57unsigned int linuxExtEnabled = 1;
62unsigned int lookupCacheEnabled = 1; 58unsigned int lookupCacheEnabled = 1;
63unsigned int multiuser_mount = 0; 59unsigned int multiuser_mount = 0;
64unsigned int extended_security = CIFSSEC_DEF; 60unsigned int global_secflags = CIFSSEC_DEF;
65/* unsigned int ntlmv2_support = 0; */ 61/* unsigned int ntlmv2_support = 0; */
66unsigned int sign_CIFS_PDUs = 1; 62unsigned int sign_CIFS_PDUs = 1;
67static const struct super_operations cifs_super_ops; 63static const struct super_operations cifs_super_ops;
@@ -86,8 +82,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 82extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 83extern mempool_t *cifs_mid_poolp;
88 84
89extern struct kmem_cache *cifs_oplock_cachep;
90
91static int 85static int
92cifs_read_super(struct super_block *sb, void *data, 86cifs_read_super(struct super_block *sb, void *data,
93 const char *devname, int silent) 87 const char *devname, int silent)
@@ -135,8 +129,7 @@ cifs_read_super(struct super_block *sb, void *data,
135 129
136 if (rc) { 130 if (rc) {
137 if (!silent) 131 if (!silent)
138 cERROR(1, 132 cERROR(1, "cifs_mount failed w/return code = %d", rc);
139 ("cifs_mount failed w/return code = %d", rc));
140 goto out_mount_failed; 133 goto out_mount_failed;
141 } 134 }
142 135
@@ -146,9 +139,6 @@ cifs_read_super(struct super_block *sb, void *data,
146/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 139/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
147 sb->s_blocksize = 140 sb->s_blocksize =
148 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 141 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
149#ifdef CONFIG_CIFS_QUOTA
150 sb->s_qcop = &cifs_quotactl_ops;
151#endif
152 sb->s_blocksize = CIFS_MAX_MSGSIZE; 142 sb->s_blocksize = CIFS_MAX_MSGSIZE;
153 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ 143 sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
154 inode = cifs_root_iget(sb, ROOT_I); 144 inode = cifs_root_iget(sb, ROOT_I);
@@ -168,7 +158,7 @@ cifs_read_super(struct super_block *sb, void *data,
168 158
169#ifdef CONFIG_CIFS_EXPERIMENTAL 159#ifdef CONFIG_CIFS_EXPERIMENTAL
170 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 160 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
171 cFYI(1, ("export ops supported")); 161 cFYI(1, "export ops supported");
172 sb->s_export_op = &cifs_export_ops; 162 sb->s_export_op = &cifs_export_ops;
173 } 163 }
174#endif /* EXPERIMENTAL */ 164#endif /* EXPERIMENTAL */
@@ -176,7 +166,7 @@ cifs_read_super(struct super_block *sb, void *data,
176 return 0; 166 return 0;
177 167
178out_no_root: 168out_no_root:
179 cERROR(1, ("cifs_read_super: get root inode failed")); 169 cERROR(1, "cifs_read_super: get root inode failed");
180 if (inode) 170 if (inode)
181 iput(inode); 171 iput(inode);
182 172
@@ -203,10 +193,10 @@ cifs_put_super(struct super_block *sb)
203 int rc = 0; 193 int rc = 0;
204 struct cifs_sb_info *cifs_sb; 194 struct cifs_sb_info *cifs_sb;
205 195
206 cFYI(1, ("In cifs_put_super")); 196 cFYI(1, "In cifs_put_super");
207 cifs_sb = CIFS_SB(sb); 197 cifs_sb = CIFS_SB(sb);
208 if (cifs_sb == NULL) { 198 if (cifs_sb == NULL) {
209 cFYI(1, ("Empty cifs superblock info passed to unmount")); 199 cFYI(1, "Empty cifs superblock info passed to unmount");
210 return; 200 return;
211 } 201 }
212 202
@@ -214,7 +204,7 @@ cifs_put_super(struct super_block *sb)
214 204
215 rc = cifs_umount(sb, cifs_sb); 205 rc = cifs_umount(sb, cifs_sb);
216 if (rc) 206 if (rc)
217 cERROR(1, ("cifs_umount failed with return code %d", rc)); 207 cERROR(1, "cifs_umount failed with return code %d", rc);
218#ifdef CONFIG_CIFS_DFS_UPCALL 208#ifdef CONFIG_CIFS_DFS_UPCALL
219 if (cifs_sb->mountdata) { 209 if (cifs_sb->mountdata) {
220 kfree(cifs_sb->mountdata); 210 kfree(cifs_sb->mountdata);
@@ -300,7 +290,6 @@ static int cifs_permission(struct inode *inode, int mask)
300static struct kmem_cache *cifs_inode_cachep; 290static struct kmem_cache *cifs_inode_cachep;
301static struct kmem_cache *cifs_req_cachep; 291static struct kmem_cache *cifs_req_cachep;
302static struct kmem_cache *cifs_mid_cachep; 292static struct kmem_cache *cifs_mid_cachep;
303struct kmem_cache *cifs_oplock_cachep;
304static struct kmem_cache *cifs_sm_req_cachep; 293static struct kmem_cache *cifs_sm_req_cachep;
305mempool_t *cifs_sm_req_poolp; 294mempool_t *cifs_sm_req_poolp;
306mempool_t *cifs_req_poolp; 295mempool_t *cifs_req_poolp;
@@ -340,6 +329,14 @@ cifs_destroy_inode(struct inode *inode)
340} 329}
341 330
342static void 331static void
332cifs_evict_inode(struct inode *inode)
333{
334 truncate_inode_pages(&inode->i_data, 0);
335 end_writeback(inode);
336 cifs_fscache_release_inode_cookie(inode);
337}
338
339static void
343cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) 340cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
344{ 341{
345 seq_printf(s, ",addr="); 342 seq_printf(s, ",addr=");
@@ -432,106 +429,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
432 return 0; 429 return 0;
433} 430}
434 431
435#ifdef CONFIG_CIFS_QUOTA
436int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
437 struct fs_disk_quota *pdquota)
438{
439 int xid;
440 int rc = 0;
441 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
442 struct cifsTconInfo *pTcon;
443
444 if (cifs_sb)
445 pTcon = cifs_sb->tcon;
446 else
447 return -EIO;
448
449
450 xid = GetXid();
451 if (pTcon) {
452 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
453 } else
454 rc = -EIO;
455
456 FreeXid(xid);
457 return rc;
458}
459
460int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
461 struct fs_disk_quota *pdquota)
462{
463 int xid;
464 int rc = 0;
465 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
466 struct cifsTconInfo *pTcon;
467
468 if (cifs_sb)
469 pTcon = cifs_sb->tcon;
470 else
471 return -EIO;
472
473 xid = GetXid();
474 if (pTcon) {
475 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
476 } else
477 rc = -EIO;
478
479 FreeXid(xid);
480 return rc;
481}
482
483int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
484{
485 int xid;
486 int rc = 0;
487 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
488 struct cifsTconInfo *pTcon;
489
490 if (cifs_sb)
491 pTcon = cifs_sb->tcon;
492 else
493 return -EIO;
494
495 xid = GetXid();
496 if (pTcon) {
497 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
498 } else
499 rc = -EIO;
500
501 FreeXid(xid);
502 return rc;
503}
504
505int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
506{
507 int xid;
508 int rc = 0;
509 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
510 struct cifsTconInfo *pTcon;
511
512 if (cifs_sb)
513 pTcon = cifs_sb->tcon;
514 else
515 return -EIO;
516
517 xid = GetXid();
518 if (pTcon) {
519 cFYI(1, ("pqstats %p", qstats));
520 } else
521 rc = -EIO;
522
523 FreeXid(xid);
524 return rc;
525}
526
527static const struct quotactl_ops cifs_quotactl_ops = {
528 .set_xquota = cifs_xquota_set,
529 .get_xquota = cifs_xquota_get,
530 .set_xstate = cifs_xstate_set,
531 .get_xstate = cifs_xstate_get,
532};
533#endif
534
535static void cifs_umount_begin(struct super_block *sb) 432static void cifs_umount_begin(struct super_block *sb)
536{ 433{
537 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 434 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -558,7 +455,7 @@ static void cifs_umount_begin(struct super_block *sb)
558 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ 455 /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
559 /* cancel_notify_requests(tcon); */ 456 /* cancel_notify_requests(tcon); */
560 if (tcon->ses && tcon->ses->server) { 457 if (tcon->ses && tcon->ses->server) {
561 cFYI(1, ("wake up tasks now - umount begin not complete")); 458 cFYI(1, "wake up tasks now - umount begin not complete");
562 wake_up_all(&tcon->ses->server->request_q); 459 wake_up_all(&tcon->ses->server->request_q);
563 wake_up_all(&tcon->ses->server->response_q); 460 wake_up_all(&tcon->ses->server->response_q);
564 msleep(1); /* yield */ 461 msleep(1); /* yield */
@@ -584,14 +481,24 @@ static int cifs_remount(struct super_block *sb, int *flags, char *data)
584 return 0; 481 return 0;
585} 482}
586 483
484static int cifs_drop_inode(struct inode *inode)
485{
486 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
487
488 /* no serverino => unconditional eviction */
489 return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
490 generic_drop_inode(inode);
491}
492
587static const struct super_operations cifs_super_ops = { 493static const struct super_operations cifs_super_ops = {
588 .put_super = cifs_put_super, 494 .put_super = cifs_put_super,
589 .statfs = cifs_statfs, 495 .statfs = cifs_statfs,
590 .alloc_inode = cifs_alloc_inode, 496 .alloc_inode = cifs_alloc_inode,
591 .destroy_inode = cifs_destroy_inode, 497 .destroy_inode = cifs_destroy_inode,
592/* .drop_inode = generic_delete_inode, 498 .drop_inode = cifs_drop_inode,
593 .delete_inode = cifs_delete_inode, */ /* Do not need above two 499 .evict_inode = cifs_evict_inode,
594 functions unless later we add lazy close of inodes or unless the 500/* .delete_inode = cifs_delete_inode, */ /* Do not need above
501 function unless later we add lazy close of inodes or unless the
595 kernel forgets to call us with the same number of releases (closes) 502 kernel forgets to call us with the same number of releases (closes)
596 as opens */ 503 as opens */
597 .show_options = cifs_show_options, 504 .show_options = cifs_show_options,
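
Two new superblock hooks replace the long-commented-out pair: .evict_inode is the teardown path (truncate cached pages, end writeback, release the fscache cookie), while .drop_inode decides, once the last reference goes away, whether the inode is worth keeping cached at all. Without server-assigned inode numbers a cached inode cannot be trusted to match the server object later, so cifs_drop_inode evicts unconditionally in that case. A sketch of the decision, with an illustrative flag value:

#include <stdio.h>

#define CIFS_MOUNT_SERVER_INUM 0x1  /* illustrative bit, not the real one */

/* nonzero: evict now; zero: generic policy may keep the inode cached */
static int cifs_drop_inode_sketch(unsigned int mnt_cifs_flags,
                                  int generic_says_drop)
{
        if (!(mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
                return 1;             /* no stable inode numbers: drop */
        return generic_says_drop;     /* otherwise defer to the generic rule */
}

int main(void)
{
        printf("no serverino: %d\n", cifs_drop_inode_sketch(0, 0));
        printf("serverino:    %d\n",
               cifs_drop_inode_sketch(CIFS_MOUNT_SERVER_INUM, 0));
        return 0;
}
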
@@ -609,7 +516,7 @@ cifs_get_sb(struct file_system_type *fs_type,
609 int rc; 516 int rc;
610 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); 517 struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
611 518
612 cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); 519 cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
613 520
614 if (IS_ERR(sb)) 521 if (IS_ERR(sb))
615 return PTR_ERR(sb); 522 return PTR_ERR(sb);
@@ -656,7 +563,6 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
656 return generic_file_llseek_unlocked(file, offset, origin); 563 return generic_file_llseek_unlocked(file, offset, origin);
657} 564}
658 565
659#ifdef CONFIG_CIFS_EXPERIMENTAL
660static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 566static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
661{ 567{
662 /* note that this is called by vfs setlease with the BKL held 568 /* note that this is called by vfs setlease with the BKL held
@@ -685,7 +591,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
685 else 591 else
686 return -EAGAIN; 592 return -EAGAIN;
687} 593}
688#endif
689 594
690struct file_system_type cifs_fs_type = { 595struct file_system_type cifs_fs_type = {
691 .owner = THIS_MODULE, 596 .owner = THIS_MODULE,
@@ -762,10 +667,7 @@ const struct file_operations cifs_file_ops = {
762#ifdef CONFIG_CIFS_POSIX 667#ifdef CONFIG_CIFS_POSIX
763 .unlocked_ioctl = cifs_ioctl, 668 .unlocked_ioctl = cifs_ioctl,
764#endif /* CONFIG_CIFS_POSIX */ 669#endif /* CONFIG_CIFS_POSIX */
765
766#ifdef CONFIG_CIFS_EXPERIMENTAL
767 .setlease = cifs_setlease, 670 .setlease = cifs_setlease,
768#endif /* CONFIG_CIFS_EXPERIMENTAL */
769}; 671};
770 672
771const struct file_operations cifs_file_direct_ops = { 673const struct file_operations cifs_file_direct_ops = {
@@ -784,9 +686,7 @@ const struct file_operations cifs_file_direct_ops = {
784 .unlocked_ioctl = cifs_ioctl, 686 .unlocked_ioctl = cifs_ioctl,
785#endif /* CONFIG_CIFS_POSIX */ 687#endif /* CONFIG_CIFS_POSIX */
786 .llseek = cifs_llseek, 688 .llseek = cifs_llseek,
787#ifdef CONFIG_CIFS_EXPERIMENTAL
788 .setlease = cifs_setlease, 689 .setlease = cifs_setlease,
789#endif /* CONFIG_CIFS_EXPERIMENTAL */
790}; 690};
791const struct file_operations cifs_file_nobrl_ops = { 691const struct file_operations cifs_file_nobrl_ops = {
792 .read = do_sync_read, 692 .read = do_sync_read,
@@ -803,10 +703,7 @@ const struct file_operations cifs_file_nobrl_ops = {
803#ifdef CONFIG_CIFS_POSIX 703#ifdef CONFIG_CIFS_POSIX
804 .unlocked_ioctl = cifs_ioctl, 704 .unlocked_ioctl = cifs_ioctl,
805#endif /* CONFIG_CIFS_POSIX */ 705#endif /* CONFIG_CIFS_POSIX */
806
807#ifdef CONFIG_CIFS_EXPERIMENTAL
808 .setlease = cifs_setlease, 706 .setlease = cifs_setlease,
809#endif /* CONFIG_CIFS_EXPERIMENTAL */
810}; 707};
811 708
812const struct file_operations cifs_file_direct_nobrl_ops = { 709const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -824,9 +721,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
824 .unlocked_ioctl = cifs_ioctl, 721 .unlocked_ioctl = cifs_ioctl,
825#endif /* CONFIG_CIFS_POSIX */ 722#endif /* CONFIG_CIFS_POSIX */
826 .llseek = cifs_llseek, 723 .llseek = cifs_llseek,
827#ifdef CONFIG_CIFS_EXPERIMENTAL
828 .setlease = cifs_setlease, 724 .setlease = cifs_setlease,
829#endif /* CONFIG_CIFS_EXPERIMENTAL */
830}; 725};
831 726
832const struct file_operations cifs_dir_ops = { 727const struct file_operations cifs_dir_ops = {
@@ -878,7 +773,7 @@ cifs_init_request_bufs(void)
878 } else { 773 } else {
879 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ 774 CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
880 } 775 }
881/* cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */ 776/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
882 cifs_req_cachep = kmem_cache_create("cifs_request", 777 cifs_req_cachep = kmem_cache_create("cifs_request",
883 CIFSMaxBufSize + 778 CIFSMaxBufSize +
884 MAX_CIFS_HDR_SIZE, 0, 779 MAX_CIFS_HDR_SIZE, 0,
@@ -890,7 +785,7 @@ cifs_init_request_bufs(void)
890 cifs_min_rcv = 1; 785 cifs_min_rcv = 1;
891 else if (cifs_min_rcv > 64) { 786 else if (cifs_min_rcv > 64) {
892 cifs_min_rcv = 64; 787 cifs_min_rcv = 64;
893 cERROR(1, ("cifs_min_rcv set to maximum (64)")); 788 cERROR(1, "cifs_min_rcv set to maximum (64)");
894 } 789 }
895 790
896 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, 791 cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -921,7 +816,7 @@ cifs_init_request_bufs(void)
921 cifs_min_small = 2; 816 cifs_min_small = 2;
922 else if (cifs_min_small > 256) { 817 else if (cifs_min_small > 256) {
923 cifs_min_small = 256; 818 cifs_min_small = 256;
924 cFYI(1, ("cifs_min_small set to maximum (256)")); 819 cFYI(1, "cifs_min_small set to maximum (256)");
925 } 820 }
926 821
927 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, 822 cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -962,15 +857,6 @@ cifs_init_mids(void)
962 return -ENOMEM; 857 return -ENOMEM;
963 } 858 }
964 859
965 cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
966 sizeof(struct oplock_q_entry), 0,
967 SLAB_HWCACHE_ALIGN, NULL);
968 if (cifs_oplock_cachep == NULL) {
969 mempool_destroy(cifs_mid_poolp);
970 kmem_cache_destroy(cifs_mid_cachep);
971 return -ENOMEM;
972 }
973
974 return 0; 860 return 0;
975} 861}
976 862
@@ -979,7 +865,6 @@ cifs_destroy_mids(void)
979{ 865{
980 mempool_destroy(cifs_mid_poolp); 866 mempool_destroy(cifs_mid_poolp);
981 kmem_cache_destroy(cifs_mid_cachep); 867 kmem_cache_destroy(cifs_mid_cachep);
982 kmem_cache_destroy(cifs_oplock_cachep);
983} 868}
984 869
985static int __init 870static int __init
@@ -1019,12 +904,16 @@ init_cifs(void)
1019 904
1020 if (cifs_max_pending < 2) { 905 if (cifs_max_pending < 2) {
1021 cifs_max_pending = 2; 906 cifs_max_pending = 2;
1022 cFYI(1, ("cifs_max_pending set to min of 2")); 907 cFYI(1, "cifs_max_pending set to min of 2");
1023 } else if (cifs_max_pending > 256) { 908 } else if (cifs_max_pending > 256) {
1024 cifs_max_pending = 256; 909 cifs_max_pending = 256;
1025 cFYI(1, ("cifs_max_pending set to max of 256")); 910 cFYI(1, "cifs_max_pending set to max of 256");
1026 } 911 }
1027 912
913 rc = cifs_fscache_register();
914 if (rc)
915 goto out;
916
1028 rc = cifs_init_inodecache(); 917 rc = cifs_init_inodecache();
1029 if (rc) 918 if (rc)
1030 goto out_clean_proc; 919 goto out_clean_proc;
@@ -1045,27 +934,13 @@ init_cifs(void)
1045 if (rc) 934 if (rc)
1046 goto out_unregister_filesystem; 935 goto out_unregister_filesystem;
1047#endif 936#endif
1048#ifdef CONFIG_CIFS_DFS_UPCALL
1049 rc = register_key_type(&key_type_dns_resolver);
1050 if (rc)
1051 goto out_unregister_key_type;
1052#endif
1053 rc = slow_work_register_user(THIS_MODULE);
1054 if (rc)
1055 goto out_unregister_resolver_key;
1056 937
1057 return 0; 938 return 0;
1058 939
1059 out_unregister_resolver_key:
1060#ifdef CONFIG_CIFS_DFS_UPCALL
1061 unregister_key_type(&key_type_dns_resolver);
1062 out_unregister_key_type:
1063#endif
1064#ifdef CONFIG_CIFS_UPCALL 940#ifdef CONFIG_CIFS_UPCALL
1065 unregister_key_type(&cifs_spnego_key_type);
1066 out_unregister_filesystem: 941 out_unregister_filesystem:
1067#endif
1068 unregister_filesystem(&cifs_fs_type); 942 unregister_filesystem(&cifs_fs_type);
943#endif
1069 out_destroy_request_bufs: 944 out_destroy_request_bufs:
1070 cifs_destroy_request_bufs(); 945 cifs_destroy_request_bufs();
1071 out_destroy_mids: 946 out_destroy_mids:
@@ -1074,17 +949,19 @@ init_cifs(void)
1074 cifs_destroy_inodecache(); 949 cifs_destroy_inodecache();
1075 out_clean_proc: 950 out_clean_proc:
1076 cifs_proc_clean(); 951 cifs_proc_clean();
952 cifs_fscache_unregister();
953 out:
1077 return rc; 954 return rc;
1078} 955}
1079 956
1080static void __exit 957static void __exit
1081exit_cifs(void) 958exit_cifs(void)
1082{ 959{
1083 cFYI(DBG2, ("exit_cifs")); 960 cFYI(DBG2, "exit_cifs");
1084 cifs_proc_clean(); 961 cifs_proc_clean();
962 cifs_fscache_unregister();
1085#ifdef CONFIG_CIFS_DFS_UPCALL 963#ifdef CONFIG_CIFS_DFS_UPCALL
1086 cifs_dfs_release_automount_timer(); 964 cifs_dfs_release_automount_timer();
1087 unregister_key_type(&key_type_dns_resolver);
1088#endif 965#endif
1089#ifdef CONFIG_CIFS_UPCALL 966#ifdef CONFIG_CIFS_UPCALL
1090 unregister_key_type(&cifs_spnego_key_type); 967 unregister_key_type(&cifs_spnego_key_type);
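
init_cifs() unwinds with the usual goto ladder: labels sit in reverse order of initialization, so a failure at step N jumps to the label that tears down steps N-1 and earlier. Deleting the dns-resolver and slow_work registrations therefore also means deleting their labels, and the new fscache registration adds a plain "out:" at the bottom. The idiom in miniature:

#include <stdio.h>

static int  init_cache(void)  { return 0; }
static void exit_cache(void)  { printf("cache torn down\n"); }
static int  init_proto(void)  { return -1; }  /* pretend this step fails */

static int init_module_sketch(void)
{
        int rc;

        rc = init_cache();
        if (rc)
                goto out;               /* nothing set up yet */
        rc = init_proto();
        if (rc)
                goto out_destroy_cache; /* undo exactly what succeeded */
        return 0;

out_destroy_cache:
        exit_cache();
out:
        return rc;
}

int main(void)
{
        printf("init rc = %d\n", init_module_sketch());
        return 0;
}
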
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7aa57ecdc437..d82f5fb4761e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
114extern const struct export_operations cifs_export_ops; 114extern const struct export_operations cifs_export_ops;
115#endif /* EXPERIMENTAL */ 115#endif /* EXPERIMENTAL */
116 116
117#define CIFS_VERSION "1.62" 117#define CIFS_VERSION "1.65"
118#endif /* _CIFSFS_H */ 118#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0c2fd17439c8..0cdfb8c32ac6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,10 +16,13 @@
  *   the GNU Lesser General Public License for more details.
  *
  */
+#ifndef _CIFS_GLOB_H
+#define _CIFS_GLOB_H
+
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
-#include <linux/slow-work.h>
+#include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
 /*
@@ -34,7 +37,7 @@
 #define MAX_SHARE_SIZE 64	/* used to be 20, this should still be enough */
 #define MAX_USERNAME_SIZE 32	/* 32 is to allow for 15 char names + null
				   termination then *2 for unicode versions */
-#define MAX_PASSWORD_SIZE 16
+#define MAX_PASSWORD_SIZE 512	/* max for windows seems to be 256 wide chars */
 
 #define CIFS_MIN_RCV_POOL 4
 
@@ -80,14 +83,12 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	PLAINTXT = 0,	/* Legacy with Plaintext passwords */
-	LANMAN,		/* Legacy LANMAN auth */
+	LANMAN = 0,	/* Legacy LANMAN auth */
 	NTLM,		/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,		/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,	/* NTLMSSP without SPNEGO, NTLMv2 hash */
 /* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,	/* Kerberos via SPNEGO */
-	MSKerberos,	/* MS Kerberos via SPNEGO */
 };
 
 enum protocolEnum {
@@ -143,7 +144,6 @@ struct TCP_Server_Info {
 	struct list_head pending_mid_q;
 	void *Server_NlsInfo;	/* BB - placeholder for future NLS info */
 	unsigned short server_codepage;	/* codepage for the server */
-	unsigned long ip_address;	/* IP addr for the server if known */
 	enum protocolEnum protocolType;
 	char versionMajor;
 	char versionMinor;
@@ -185,19 +185,15 @@ struct TCP_Server_Info {
 	struct mac_key mac_signing_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
-};
-
-/*
- * The following is our shortcut to user information.  We surface the uid,
- * and name. We always get the password on the fly in case it
- * has changed. We also hang a list of sessions owned by this user off here.
- */
-struct cifsUidInfo {
-	struct list_head userList;
-	struct list_head sessionList; /* SMB sessions for this user */
-	uid_t linux_uid;
-	char user[MAX_USERNAME_SIZE + 1];	/* ascii name of user */
-	/* BB may need ptr or callback for PAM or WinBind info */
+	u16 dialect; /* dialect index that server chose */
+	/* extended security flavors that server supports */
+	bool sec_kerberos;	/* supports plain Kerberos */
+	bool sec_mskerberos;	/* supports legacy MS Kerberos */
+	bool sec_kerberosu2u;	/* supports U2U Kerberos */
+	bool sec_ntlmssp;	/* supports NTLMSSP */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache; /* client index cache cookie */
+#endif
 };
 
 /*
@@ -207,9 +203,6 @@ struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
 	struct mutex session_mutex;
-#if 0
-	struct cifsUidInfo *uidInfo;	/* pointer to user info */
-#endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
 	int ses_count;		/* reference counter */
 	enum statusEnum status;
@@ -221,7 +214,8 @@ struct cifsSesInfo {
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
 	int Suid;		/* remote smb uid */
-	uid_t linux_uid;	/* local Linux uid */
+	uid_t linux_uid;	/* overriding owner of files on the mount */
+	uid_t cred_uid;		/* owner of credentials */
 	int capabilities;
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
				TCP names - will ipv6 and sctp addresses fit? */
@@ -306,6 +300,10 @@ struct cifsTconInfo {
 	bool local_lease:1; /* check leases (only) on local system not remote */
 	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
+#ifdef CONFIG_CIFS_FSCACHE
+	u64 resource_id;		/* server resource id */
+	struct fscache_cookie *fscache;	/* cookie for share */
+#endif
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
@@ -358,7 +356,7 @@ struct cifsFileInfo {
 	atomic_t count;		/* reference count */
 	struct mutex fh_mutex;	/* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
-	struct slow_work oplock_break; /* slow_work job for oplock breaks */
+	struct work_struct oplock_break; /* work for oplock breaks */
 };
 
 /* Take a reference on the file private data */
@@ -393,6 +391,9 @@ struct cifsInodeInfo {
 	bool invalid_mapping:1;	/* pagecache is invalid */
 	u64 server_eof;		/* current file size on server */
 	u64 uniqueid;		/* server inode number */
+#ifdef CONFIG_CIFS_FSCACHE
+	struct fscache_cookie *fscache;
+#endif
 	struct inode vfs_inode;
 };
 
@@ -718,7 +719,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 GLOBAL_EXTERN unsigned int oplockEnabled;
 GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
+GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;	/* enable smb packet signing */
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
@@ -727,4 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;	/* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending;	/* MAX requests at once to server*/
 
+void cifs_oplock_break(struct work_struct *work);
+void cifs_oplock_break_get(struct cifsFileInfo *cfile);
+void cifs_oplock_break_put(struct cifsFileInfo *cfile);
+
 extern const struct slow_work_ops cifs_oplock_break_ops;
+
+#endif /* _CIFS_GLOB_H */
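
The cifsglob.h hunks above make two structural moves: the oplock-break job in cifsFileInfo switches from the removed slow-work API to a plain work_struct, and TCP_Server_Info now records the negotiated dialect plus the security flavors the server advertises. A minimal sketch of the slow-work-to-workqueue conversion pattern follows; it is illustrative only, and the struct and function names are made up, not from this tree:

#include <linux/workqueue.h>

/* sketch: an object whose oplock break runs from the shared workqueue */
struct example_file {
    struct work_struct oplock_break;    /* was: struct slow_work */
};

static void example_oplock_break(struct work_struct *work)
{
    struct example_file *file =
        container_of(work, struct example_file, oplock_break);
    /* handle the break for "file" here, then drop the reference taken
       when the work was queued (cf. cifs_oplock_break_put() above) */
}

static void example_setup(struct example_file *file)
{
    /* was: slow_work_init(&file->oplock_break, &example_ops); */
    INIT_WORK(&file->oplock_break, example_oplock_break);
}

static void example_queue_break(struct example_file *file)
{
    /* was: slow_work_enqueue(&file->oplock_break); */
    schedule_work(&file->oplock_break);
}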
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 39e47f46dea5..1d60c655e3e0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
 			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
-#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
-#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
+#define GetXid()						\
+({								\
+	int __xid = (int)_GetXid();				\
+	cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d",	\
+	     __func__, __xid, current_fsuid());			\
+	__xid;							\
+})
+
+#define FreeXid(curr_xid)					\
+do {								\
+	_FreeXid(curr_xid);					\
+	cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d",	\
+	     __func__, curr_xid, (int)rc);			\
+} while (0)
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
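
The GetXid()/FreeXid() rework above replaces an unhygienic multi-statement macro with a GNU statement expression that actually yields the xid value, and wraps the teardown in do { } while (0) so it behaves as a single statement. (FreeXid() still reads rc from the caller's scope, which is why it remains a macro at all.) A small userspace sketch of the two idioms, using stand-in functions rather than the kernel's _GetXid()/_FreeXid(); compiles with gcc, since statement expressions are a GNU extension:

#include <stdio.h>

static unsigned int _get_xid(void) { static unsigned int x; return ++x; }
static void _free_xid(unsigned int xid) { (void)xid; }

/* statement expression: the whole ({ ... }) evaluates to __xid, so the
   macro can sit on the right-hand side of an assignment */
#define get_xid()						\
({								\
    int __xid = (int)_get_xid();				\
    printf("enter %s as xid %d\n", __func__, __xid);		\
    __xid;							\
})

/* do/while(0): expands to exactly one statement, so it is safe in an
   unbraced if/else */
#define free_xid(curr_xid)					\
do {								\
    _free_xid(curr_xid);					\
    printf("leave %s (xid = %d)\n", __func__, curr_xid);	\
} while (0)

int main(void)
{
    int xid = get_xid();    /* the old macro could not be assigned */
    if (xid)
        free_xid(xid);      /* safe even without braces */
    return 0;
}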
@@ -73,8 +85,11 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
-			enum securityEnum *secType);
-extern int cifs_convert_address(char *src, void *dst);
+			struct TCP_Server_Info *server);
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
+				const unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
 			const struct cifsTconInfo *, int /* length of
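
cifs_convert_address() above now takes the destination sockaddr first along with an explicit source length, and gains cifs_set_port() and cifs_fill_sockaddr() (convert plus set-port in one call). A rough userspace analog of that convert-then-set-port split is sketched below, using inet_pton() where the kernel code has its own parsers; fill_sockaddr here is a hypothetical stand-in, not the kernel helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>

/* dst must point at storage large enough for sockaddr_in6,
   e.g. a struct sockaddr_storage */
static int fill_sockaddr(struct sockaddr *dst, const char *src,
                         unsigned short port)
{
    struct sockaddr_in *s4 = (struct sockaddr_in *)dst;
    struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)dst;

    memset(dst, 0, sizeof(struct sockaddr_in6));
    /* convert step: try IPv4 first, then IPv6 */
    if (inet_pton(AF_INET, src, &s4->sin_addr) == 1) {
        s4->sin_family = AF_INET;
        s4->sin_port = htons(port);     /* set-port step */
        return 1;
    }
    if (inet_pton(AF_INET6, src, &s6->sin6_addr) == 1) {
        s6->sin6_family = AF_INET6;
        s6->sin6_port = htons(port);
        return 1;
    }
    return 0;                           /* unrecognized address string */
}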
@@ -83,7 +98,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			const int stage,
 			const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -95,8 +109,10 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 			__u16 fileHandle, struct file *file,
 			struct vfsmount *mnt, unsigned int oflags);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
-			struct vfsmount *mnt, int mode, int oflags,
-			__u32 *poplock, __u16 *pnetfid, int xid);
+			struct super_block *sb,
+			int mode, int oflags,
+			__u32 *poplock, __u16 *pnetfid, int xid);
+void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 			FILE_UNIX_BASIC_INFO *info,
 			struct cifs_sb_info *cifs_sb);
@@ -125,7 +141,9 @@ extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
+extern int cifs_negotiate_protocol(unsigned int xid,
+			struct cifsSesInfo *ses);
+extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 			struct nls_table *nls_info);
 extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
 
@@ -348,8 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *,
 			__u32 expected_sequence_number);
 extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
 			const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
-			const struct nls_table *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
 extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 			const struct nls_table *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5d3f29fef532..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines Corp., 2002,2009
+ *   Copyright (C) International Business Machines Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -130,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		if (smb_command != SMB_COM_WRITE_ANDX &&
 		    smb_command != SMB_COM_OPEN_ANDX &&
 		    smb_command != SMB_COM_TREE_DISCONNECT) {
-			cFYI(1, ("can not send cmd %d while umounting",
-				smb_command));
+			cFYI(1, "can not send cmd %d while umounting",
+				smb_command);
 			return -ENODEV;
 		}
 	}
@@ -157,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * back on-line
 	 */
 	if (!tcon->retry || ses->status == CifsExiting) {
-		cFYI(1, ("gave up waiting on reconnect in smb_init"));
+		cFYI(1, "gave up waiting on reconnect in smb_init");
 		return -EHOSTDOWN;
 	}
 }
@@ -172,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * reconnect the same SMB session
 	 */
 	mutex_lock(&ses->session_mutex);
-	if (ses->need_reconnect)
+	rc = cifs_negotiate_protocol(0, ses);
+	if (rc == 0 && ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
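
The hunk above changes session recovery from "set up the session if flagged" to "always renegotiate the protocol first, then set up the session only if still needed"; together with the tree-reconnect code in the next hunk this gives a fixed three-step order under ses->session_mutex. A stub-function model of that control flow, with placeholder names rather than the kernel functions:

#include <stdbool.h>

static bool need_session = true, need_tcon = true;

static int negotiate(void)     { return 0; } /* cifs_negotiate_protocol() */
static int setup_session(void) { need_session = false; return 0; }
static int tree_connect(void)  { need_tcon = false; return 0; }

static int reconnect(void)
{
    int rc;

    /* the real code holds ses->session_mutex across all three steps */
    rc = negotiate();                  /* 1: renegotiate the dialect    */
    if (rc == 0 && need_session)
        rc = setup_session();          /* 2: session setup if it died   */
    if (rc == 0 && need_tcon)
        rc = tree_connect();           /* 3: tree connect if tid is bad */
    return rc;
}

int main(void) { return reconnect(); }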
@@ -184,7 +185,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
 	mutex_unlock(&ses->session_mutex);
-	cFYI(1, ("reconnect tcon rc = %d", rc));
+	cFYI(1, "reconnect tcon rc = %d", rc);
 
 	if (rc)
 		goto out;
@@ -231,7 +232,7 @@ static int
 small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		void **request_buf)
 {
-	int rc = 0;
+	int rc;
 
 	rc = cifs_reconnect_tcon(tcon, smb_command);
 	if (rc)
@@ -249,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
 }
 
 int
@@ -280,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
-	 void **request_buf /* returned */ ,
-	 void **response_buf /* returned */ )
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	   void **request_buf, void **response_buf)
 {
-	int rc = 0;
-
-	rc = cifs_reconnect_tcon(tcon, smb_command);
-	if (rc)
-		return rc;
-
 	*request_buf = cifs_buf_get();
 	if (*request_buf == NULL) {
 		/* BB should we add a retry in here if not a writepage? */
@@ -308,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	if (tcon != NULL)
 		cifs_stats_inc(&tcon->num_smbs_sent);
 
-	return rc;
+	return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+	 void **request_buf, void **response_buf)
+{
+	int rc;
+
+	rc = cifs_reconnect_tcon(tcon, smb_command);
+	if (rc)
+		return rc;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+			void **request_buf, void **response_buf)
+{
+	if (tcon->ses->need_reconnect || tcon->need_reconnect)
+		return -EHOSTDOWN;
+
+	return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
 }
 
 static int validate_t2(struct smb_t2_rsp *pSMB)
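
The refactor above leaves buffer allocation in __smb_init() and gives cifssmb.c two entry points: smb_init(), which first tries to reconnect a dead session/tcon, and smb_init_no_reconnect(), which fails fast with -EHOSTDOWN instead. Roughly, a caller in this file that must not block on reconnection would look like the sketch below; example_query() and its error handling are hypothetical, not code from this tree:

static int example_query(struct cifsTconInfo *tcon)
{
    void *request_buf, *response_buf;
    int rc;

    rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
                               &request_buf, &response_buf);
    if (rc)
        return rc;  /* -EHOSTDOWN if the session or tcon is down:
                       report immediately instead of retrying */

    /* fill in and send the request as usual from here */
    return rc;
}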
@@ -355,7 +373,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server;
 	u16 count;
 	unsigned int secFlags;
-	u16 dialect;
 
 	if (ses->server)
 		server = ses->server;
@@ -372,9 +389,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
 		secFlags = ses->overrideSecFlg;	/* BB FIXME fix sign flags? */
 	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = extended_security | ses->overrideSecFlg;
+		secFlags = global_secflags | ses->overrideSecFlg;
 
-	cFYI(1, ("secFlags 0x%x", secFlags));
+	cFYI(1, "secFlags 0x%x", secFlags);
 
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -382,14 +399,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cFYI(1, ("Kerberos only mechanism, enable extended security"));
+		cFYI(1, "Kerberos only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
+		cFYI(1, "NTLMSSP only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #endif
@@ -408,10 +425,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, ("Dialect: %d", dialect));
+	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1, "Dialect: %d", server->dialect);
 	/* Check wct = 1 error case */
-	if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
+	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -419,8 +436,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((dialect == LANMAN_PROT)
-			   || (dialect == LANMAN2_PROT))) {
+			&& ((server->dialect == LANMAN_PROT)
+			   || (server->dialect == LANMAN2_PROT))) {
 		__s16 tmp;
 		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
 
@@ -428,8 +445,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		    (secFlags & CIFSSEC_MAY_PLNTXT))
 			server->secType = LANMAN;
 		else {
-			cERROR(1, ("mount failed weak security disabled"
-				   " in /proc/fs/cifs/SecurityFlags"));
+			cERROR(1, "mount failed weak security disabled"
+				   " in /proc/fs/cifs/SecurityFlags");
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
 		}
@@ -462,9 +479,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		utc = CURRENT_TIME;
 		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
 				    rsp->SrvTime.Time, 0);
-		cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
+		cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
 			(int)ts.tv_sec, (int)utc.tv_sec,
-			(int)(utc.tv_sec - ts.tv_sec)));
+			(int)(utc.tv_sec - ts.tv_sec));
 		val = (int)(utc.tv_sec - ts.tv_sec);
 		seconds = abs(val);
 		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -478,7 +495,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->timeAdj = (int)tmp;
 			server->timeAdj *= 60; /* also in seconds */
 		}
-		cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj));
+		cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
 
 
 		/* BB get server time for time conversions and add
@@ -493,14 +510,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			goto neg_err_exit;
 		}
 
-		cFYI(1, ("LANMAN negotiated"));
+		cFYI(1, "LANMAN negotiated");
 		/* we will not end up setting signing flags - as no signing
 		was in LANMAN and server did not return the flags on */
 		goto signing_check;
 #else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cERROR(1, ("mount failed, cifs module not built "
-			  "with CIFS_WEAK_PW_HASH support"));
+		cERROR(1, "mount failed, cifs module not built "
+			  "with CIFS_WEAK_PW_HASH support");
 		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
@@ -512,14 +529,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* else wct == 17 NTLM */
 	server->secMode = pSMBr->SecurityMode;
 	if ((server->secMode & SECMODE_USER) == 0)
-		cFYI(1, ("share mode security"));
+		cFYI(1, "share mode security");
 
 	if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
-			cERROR(1, ("Server requests plain text password"
-				  " but client support disabled"));
+			cERROR(1, "Server requests plain text password"
+				  " but client support disabled");
 
 	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
@@ -539,7 +556,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 #endif */
 	else {
 		rc = -EOPNOTSUPP;
-		cERROR(1, ("Invalid security type"));
+		cERROR(1, "Invalid security type");
 		goto neg_err_exit;
 	}
 	/* else ... any others ...? */
@@ -551,7 +568,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
-	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
+	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -582,7 +599,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
-				cFYI(1, ("server UID changed"));
+				cFYI(1, "server UID changed");
 				memcpy(server->server_GUID,
 				       pSMBr->u.extended_response.GUID,
 				       16);
@@ -597,13 +614,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->secType = RawNTLMSSP;
 		} else {
 			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob,
-						 count - 16,
-						 &server->secType);
+						 SecurityBlob, count - 16,
+						 server);
 			if (rc == 1)
 				rc = 0;
 			else
 				rc = -EINVAL;
+
+			if (server->sec_kerberos || server->sec_mskerberos)
+				server->secType = Kerberos;
+			else if (server->sec_ntlmssp)
+				server->secType = RawNTLMSSP;
+			else
+				rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
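
With the change above, decode_negTokenInit() no longer picks the secType itself; it records which SPNEGO mechanisms the server's negTokenInit advertised in the new TCP_Server_Info booleans, and CIFSSMBNegotiate() chooses afterwards, preferring Kerberos (either OID) over raw NTLMSSP. That selection, lifted into a standalone sketch whose field names mirror the cifsglob.h hunk earlier in this diff (the enum and struct here are simplified stand-ins):

#include <stdbool.h>
#include <errno.h>

enum sectype { KERBEROS, RAW_NTLMSSP };

struct flavors {
    bool sec_kerberos;      /* plain Kerberos OID seen       */
    bool sec_mskerberos;    /* legacy MS Kerberos OID seen   */
    bool sec_ntlmssp;       /* NTLMSSP OID seen              */
};

static int pick_sectype(const struct flavors *f, enum sectype *out)
{
    if (f->sec_kerberos || f->sec_mskerberos) {
        *out = KERBEROS;        /* either OID selects Kerberos */
        return 0;
    }
    if (f->sec_ntlmssp) {
        *out = RAW_NTLMSSP;
        return 0;
    }
    return -EOPNOTSUPP;         /* no mechanism we can use */
}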
@@ -614,22 +637,21 @@ signing_check:
 	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
 		/* MUST_SIGN already includes the MAY_SIGN FLAG
 		   so if this is zero it means that signing is disabled */
-		cFYI(1, ("Signing disabled"));
+		cFYI(1, "Signing disabled");
 		if (server->secMode & SECMODE_SIGN_REQUIRED) {
-			cERROR(1, ("Server requires "
+			cERROR(1, "Server requires "
 				   "packet signing to be enabled in "
-				   "/proc/fs/cifs/SecurityFlags."));
+				   "/proc/fs/cifs/SecurityFlags.");
 			rc = -EOPNOTSUPP;
 		}
 		server->secMode &=
 			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
 	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
 		/* signing required */
-		cFYI(1, ("Must sign - secFlags 0x%x", secFlags));
+		cFYI(1, "Must sign - secFlags 0x%x", secFlags);
 		if ((server->secMode &
 			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cERROR(1,
-				("signing required but server lacks support"));
+			cERROR(1, "signing required but server lacks support");
 			rc = -EOPNOTSUPP;
 		} else
 			server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -643,7 +665,7 @@ signing_check:
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
-	cFYI(1, ("negprot rc %d", rc));
+	cFYI(1, "negprot rc %d", rc);
 	return rc;
 }
 
@@ -653,7 +675,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
 
-	cFYI(1, ("In tree disconnect"));
+	cFYI(1, "In tree disconnect");
 
 	/* BB: do we need to check this? These should never be NULL. */
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -675,7 +697,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
-		cFYI(1, ("Tree disconnect failed %d", rc));
+		cFYI(1, "Tree disconnect failed %d", rc);
 
 	/* No need to return error on this operation if tid invalidated and
 	   closed on server already e.g. due to tcp session crashing */
@@ -691,7 +713,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
 
-	cFYI(1, ("In SMBLogoff for session disconnect"));
+	cFYI(1, "In SMBLogoff for session disconnect");
 
 	/*
	 * BB: do we need to check validity of ses and server? They should
@@ -744,7 +766,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In POSIX delete"));
+	cFYI(1, "In POSIX delete");
 PsxDelete:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -796,7 +818,7 @@ PsxDelete:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Posix delete returned %d", rc));
+		cFYI(1, "Posix delete returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	cifs_stats_inc(&tcon->num_deletes);
@@ -843,7 +865,7 @@ DelFileRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_deletes);
 	if (rc)
-		cFYI(1, ("Error in RMFile = %d", rc));
+		cFYI(1, "Error in RMFile = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -862,7 +884,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBRmDir"));
+	cFYI(1, "In CIFSSMBRmDir");
 RmDirRetry:
 	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -887,7 +909,7 @@ RmDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_rmdirs);
 	if (rc)
-		cFYI(1, ("Error in RMDir = %d", rc));
+		cFYI(1, "Error in RMDir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -905,7 +927,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBMkDir"));
+	cFYI(1, "In CIFSSMBMkDir");
MkDirRetry:
 	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -930,7 +952,7 @@ MkDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_mkdirs);
 	if (rc)
-		cFYI(1, ("Error in Mkdir = %d", rc));
+		cFYI(1, "Error in Mkdir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -953,7 +975,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
 	OPEN_PSX_REQ *pdata;
 	OPEN_PSX_RSP *psx_rsp;
 
-	cFYI(1, ("In POSIX Create"));
+	cFYI(1, "In POSIX Create");
PsxCreat:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1007,11 +1029,11 @@ PsxCreat:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Posix create returned %d", rc));
+		cFYI(1, "Posix create returned %d", rc);
 		goto psx_create_err;
 	}
 
-	cFYI(1, ("copying inode info"));
+	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1033,11 +1055,11 @@ PsxCreat:
 	/* check to make sure response data is there */
 	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
-		cFYI(DBG2, ("unknown type"));
+		cFYI(DBG2, "unknown type");
 	} else {
 		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, ("Open response data too small"));
+			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
 			goto psx_create_err;
 		}
@@ -1084,7 +1106,7 @@ static __u16 convert_disposition(int disposition)
 		ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
 		break;
 	default:
-		cFYI(1, ("unknown disposition %d", disposition));
+		cFYI(1, "unknown disposition %d", disposition);
 		ofun = SMBOPEN_OAPPEND; /* regular open */
 	}
 	return ofun;
@@ -1175,7 +1197,7 @@ OldOpenRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 	/* BB verify if wct == 15 */
 
@@ -1288,7 +1310,7 @@ openRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
 		*netfid = pSMBr->Fid; /* cifs fid stays in le */
@@ -1326,7 +1348,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	int resp_buf_type = 0;
 	struct kvec iov[1];
 
-	cFYI(1, ("Reading %d bytes on fid %d", count, netfid));
+	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
@@ -1371,7 +1393,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
-		cERROR(1, ("Send error in read = %d", rc));
+		cERROR(1, "Send error in read = %d", rc);
 	} else {
 		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
 		data_length = data_length << 16;
@@ -1381,15 +1403,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 		/*check that DataLength would not go beyond end of SMB */
 		if ((data_length > CIFSMaxBufSize)
 				|| (data_length > count)) {
-			cFYI(1, ("bad length %d for count %d",
-				 data_length, count));
+			cFYI(1, "bad length %d for count %d",
+				 data_length, count);
 			rc = -EIO;
 			*nbytes = 0;
 		} else {
 			pReadData = (char *) (&pSMBr->hdr.Protocol) +
 					le16_to_cpu(pSMBr->DataOffset);
 /*			if (rc = copy_to_user(buf, pReadData, data_length)) {
-				cERROR(1,("Faulting on read rc = %d",rc));
+				cERROR(1, "Faulting on read rc = %d",rc);
 				rc = -EFAULT;
 			}*/ /* can not use copy_to_user when using page cache*/
 			if (*buf)
@@ -1433,7 +1455,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
+	/* cFYI(1, "write at %lld %d bytes", offset, count);*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
@@ -1514,7 +1536,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error in write = %d", rc));
+		cFYI(1, "Send error in write = %d", rc);
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
@@ -1551,7 +1573,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
+	cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
 		wct = 14;
@@ -1606,7 +1628,7 @@
 			  long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error Write2 = %d", rc));
+		cFYI(1, "Send error Write2 = %d", rc);
 	} else if (resp_buf_type == 0) {
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
@@ -1651,7 +1673,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	int timeout = 0;
 	__u16 count;
 
-	cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock));
+	cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1699,7 +1721,7 @@
 	}
 	cifs_stats_inc(&tcon->num_locks);
 	if (rc)
-		cFYI(1, ("Send error in Lock = %d", rc));
+		cFYI(1, "Send error in Lock = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -1722,7 +1744,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
 
-	cFYI(1, ("Posix Lock"));
+	cFYI(1, "Posix Lock");
 
 	if (pLockData == NULL)
 		return -EINVAL;
@@ -1792,7 +1814,7 @@
 	}
 
 	if (rc) {
-		cFYI(1, ("Send error in Posix Lock = %d", rc));
+		cFYI(1, "Send error in Posix Lock = %d", rc);
 	} else if (get_flag) {
 		/* lock structure can be returned on get */
 		__u16 data_offset;
@@ -1849,7 +1871,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBClose"));
+	cFYI(1, "In CIFSSMBClose");
 
 /* do not retry on dead session on close */
 	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1866,7 +1888,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	if (rc) {
 		if (rc != -EINTR) {
 			/* EINTR is expected when user ctl-c to kill app */
-			cERROR(1, ("Send error in Close = %d", rc));
+			cERROR(1, "Send error in Close = %d", rc);
 		}
 	}
 
@@ -1882,7 +1904,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBFlush"));
+	cFYI(1, "In CIFSSMBFlush");
 
 	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
 	if (rc)
@@ -1893,7 +1915,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	cifs_stats_inc(&tcon->num_flushes);
 	if (rc)
-		cERROR(1, ("Send error in Flush = %d", rc));
+		cERROR(1, "Send error in Flush = %d", rc);
 
 	return rc;
 }
@@ -1910,7 +1932,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBRename"));
+	cFYI(1, "In CIFSSMBRename");
renameRetry:
 	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1956,7 +1978,7 @@ renameRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_renames);
 	if (rc)
-		cFYI(1, ("Send error in rename = %d", rc));
+		cFYI(1, "Send error in rename = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -1980,7 +2002,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	int len_of_str;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("Rename to File by handle"));
+	cFYI(1, "Rename to File by handle");
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
 			(void **) &pSMBr);
 	if (rc)
@@ -2035,7 +2057,7 @@
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&pTcon->num_t2renames);
 	if (rc)
-		cFYI(1, ("Send error in Rename (by file handle) = %d", rc));
+		cFYI(1, "Send error in Rename (by file handle) = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2057,7 +2079,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBCopy"));
+	cFYI(1, "In CIFSSMBCopy");
copyRetry:
 	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2102,8 +2124,8 @@ copyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in copy = %d with %d files copied",
-			rc, le16_to_cpu(pSMBr->CopyCount)));
+		cFYI(1, "Send error in copy = %d with %d files copied",
+			rc, le16_to_cpu(pSMBr->CopyCount));
 	}
 	cifs_buf_release(pSMB);
 
@@ -2127,7 +2149,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Symlink Unix style"));
+	cFYI(1, "In Symlink Unix style");
createSymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2192,7 +2214,7 @@ createSymLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_symlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
+		cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2216,7 +2238,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Create Hard link Unix style"));
+	cFYI(1, "In Create Hard link Unix style");
createHardLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2278,7 +2300,7 @@ createHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc));
+		cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2299,7 +2321,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSCreateHardLink"));
+	cFYI(1, "In CIFSCreateHardLink");
winCreateHardLinkRetry:
 
 	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2350,7 +2372,7 @@ winCreateHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in hard link (NT rename) = %d", rc));
+		cFYI(1, "Send error in hard link (NT rename) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2373,7 +2395,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, byte_count;
 	char *data_start;
 
-	cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
+	cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
 
querySymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2420,7 +2442,7 @@ querySymLinkRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc));
+		cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2521,21 +2543,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	/* should we also check that parm and data areas do not overlap? */
 	if (*ppparm > end_of_smb) {
-		cFYI(1, ("parms start after end of smb"));
+		cFYI(1, "parms start after end of smb");
 		return -EINVAL;
 	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, ("parm end after end of smb"));
+		cFYI(1, "parm end after end of smb");
 		return -EINVAL;
 	} else if (*ppdata > end_of_smb) {
-		cFYI(1, ("data starts after end of smb"));
+		cFYI(1, "data starts after end of smb");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p",
+		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
 			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr));
+			end_of_smb, pSMBr);
 		return -EINVAL;
 	} else if (parm_count + data_count > pSMBr->ByteCount) {
-		cFYI(1, ("parm count and data count larger than SMB"));
+		cFYI(1, "parm count and data count larger than SMB");
 		return -EINVAL;
 	}
 	*pdatalen = data_count;
@@ -2554,7 +2576,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
 
-	cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName));
+	cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -2583,7 +2605,7 @@
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc));
+		cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2607,7 +2629,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			if ((reparse_buf->LinkNamesBuf +
 				reparse_buf->TargetNameOffset +
 				reparse_buf->TargetNameLen) > end_of_smb) {
-				cFYI(1, ("reparse buf beyond SMB"));
+				cFYI(1, "reparse buf beyond SMB");
 				rc = -EIO;
 				goto qreparse_out;
 			}
@@ -2628,12 +2650,12 @@
 			}
 		} else {
 			rc = -EIO;
-			cFYI(1, ("Invalid return data count on "
-				 "get reparse info ioctl"));
+			cFYI(1, "Invalid return data count on "
+				 "get reparse info ioctl");
 		}
 		symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-		cFYI(1, ("readlink result - %s", symlinkinfo));
+		cFYI(1, "readlink result - %s", symlinkinfo);
 	}
 
 qreparse_out:
@@ -2656,7 +2678,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
 	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
 	ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
 	ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-/*	cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */
+/*	cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
 
 	return;
 }
@@ -2682,8 +2704,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
 		size += sizeof(struct cifs_posix_ace) * count;
 		/* check if we would go beyond end of SMB */
 		if (size_of_data_area < size) {
-			cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d",
-				size_of_data_area, size));
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
 			return -EINVAL;
 		}
 	} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2730,7 +2752,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 		cifs_ace->cifs_uid = cpu_to_le64(-1);
 	} else
 		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-	/*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
 	return rc;
 }
 
@@ -2748,12 +2770,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 
 	count = posix_acl_xattr_count((size_t)buflen);
-	cFYI(1, ("setting acl with %d entries from buf of length %d and "
-		"version of %d",
-		count, buflen, le32_to_cpu(local_acl->a_version)));
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
+		"version of %d",
+		count, buflen, le32_to_cpu(local_acl->a_version));
 	if (le32_to_cpu(local_acl->a_version) != 2) {
-		cFYI(1, ("unknown POSIX ACL version %d",
-			le32_to_cpu(local_acl->a_version)));
+		cFYI(1, "unknown POSIX ACL version %d",
+			le32_to_cpu(local_acl->a_version));
 		return 0;
2758 } 2780 }
2759 cifs_acl->version = cpu_to_le16(1); 2781 cifs_acl->version = cpu_to_le16(1);
@@ -2762,7 +2784,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
2762 else if (acl_type == ACL_TYPE_DEFAULT) 2784 else if (acl_type == ACL_TYPE_DEFAULT)
2763 cifs_acl->default_entry_count = cpu_to_le16(count); 2785 cifs_acl->default_entry_count = cpu_to_le16(count);
2764 else { 2786 else {
2765 cFYI(1, ("unknown ACL type %d", acl_type)); 2787 cFYI(1, "unknown ACL type %d", acl_type);
2766 return 0; 2788 return 0;
2767 } 2789 }
2768 for (i = 0; i < count; i++) { 2790 for (i = 0; i < count; i++) {
@@ -2795,7 +2817,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
2795 int name_len; 2817 int name_len;
2796 __u16 params, byte_count; 2818 __u16 params, byte_count;
2797 2819
2798 cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName)); 2820 cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
2799 2821
2800queryAclRetry: 2822queryAclRetry:
2801 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2823 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2847,7 +2869,7 @@ queryAclRetry:
2847 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2869 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2848 cifs_stats_inc(&tcon->num_acl_get); 2870 cifs_stats_inc(&tcon->num_acl_get);
2849 if (rc) { 2871 if (rc) {
2850 cFYI(1, ("Send error in Query POSIX ACL = %d", rc)); 2872 cFYI(1, "Send error in Query POSIX ACL = %d", rc);
2851 } else { 2873 } else {
2852 /* decode response */ 2874 /* decode response */
2853 2875
@@ -2884,7 +2906,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
2884 int bytes_returned = 0; 2906 int bytes_returned = 0;
2885 __u16 params, byte_count, data_count, param_offset, offset; 2907 __u16 params, byte_count, data_count, param_offset, offset;
2886 2908
2887 cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName)); 2909 cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
2888setAclRetry: 2910setAclRetry:
2889 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 2911 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
2890 (void **) &pSMBr); 2912 (void **) &pSMBr);
@@ -2939,7 +2961,7 @@ setAclRetry:
2939 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 2961 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2940 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 2962 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
2941 if (rc) 2963 if (rc)
2942 cFYI(1, ("Set POSIX ACL returned %d", rc)); 2964 cFYI(1, "Set POSIX ACL returned %d", rc);
2943 2965
2944setACLerrorExit: 2966setACLerrorExit:
2945 cifs_buf_release(pSMB); 2967 cifs_buf_release(pSMB);
@@ -2959,7 +2981,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
2959 int bytes_returned; 2981 int bytes_returned;
2960 __u16 params, byte_count; 2982 __u16 params, byte_count;
2961 2983
2962 cFYI(1, ("In GetExtAttr")); 2984 cFYI(1, "In GetExtAttr");
2963 if (tcon == NULL) 2985 if (tcon == NULL)
2964 return -ENODEV; 2986 return -ENODEV;
2965 2987
@@ -2998,7 +3020,7 @@ GetExtAttrRetry:
2998 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3020 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
2999 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3021 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3000 if (rc) { 3022 if (rc) {
3001 cFYI(1, ("error %d in GetExtAttr", rc)); 3023 cFYI(1, "error %d in GetExtAttr", rc);
3002 } else { 3024 } else {
3003 /* decode response */ 3025 /* decode response */
3004 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3026 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3013,7 +3035,7 @@ GetExtAttrRetry:
3013 struct file_chattr_info *pfinfo; 3035 struct file_chattr_info *pfinfo;
3014 /* BB Do we need a cast or hash here ? */ 3036 /* BB Do we need a cast or hash here ? */
3015 if (count != 16) { 3037 if (count != 16) {
3016 cFYI(1, ("Illegal size ret in GetExtAttr")); 3038 cFYI(1, "Illegal size ret in GetExtAttr");
3017 rc = -EIO; 3039 rc = -EIO;
3018 goto GetExtAttrOut; 3040 goto GetExtAttrOut;
3019 } 3041 }
@@ -3043,7 +3065,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3043 QUERY_SEC_DESC_REQ *pSMB; 3065 QUERY_SEC_DESC_REQ *pSMB;
3044 struct kvec iov[1]; 3066 struct kvec iov[1];
3045 3067
3046 cFYI(1, ("GetCifsACL")); 3068 cFYI(1, "GetCifsACL");
3047 3069
3048 *pbuflen = 0; 3070 *pbuflen = 0;
3049 *acl_inf = NULL; 3071 *acl_inf = NULL;
@@ -3068,7 +3090,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3068 CIFS_STD_OP); 3090 CIFS_STD_OP);
3069 cifs_stats_inc(&tcon->num_acl_get); 3091 cifs_stats_inc(&tcon->num_acl_get);
3070 if (rc) { 3092 if (rc) {
3071 cFYI(1, ("Send error in QuerySecDesc = %d", rc)); 3093 cFYI(1, "Send error in QuerySecDesc = %d", rc);
3072 } else { /* decode response */ 3094 } else { /* decode response */
3073 __le32 *parm; 3095 __le32 *parm;
3074 __u32 parm_len; 3096 __u32 parm_len;
@@ -3083,7 +3105,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3083 goto qsec_out; 3105 goto qsec_out;
3084 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base; 3106 pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
3085 3107
3086 cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf)); 3108 cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
3087 3109
3088 if (le32_to_cpu(pSMBr->ParameterCount) != 4) { 3110 if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
3089 rc = -EIO; /* bad smb */ 3111 rc = -EIO; /* bad smb */
@@ -3095,8 +3117,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3095 3117
3096 acl_len = le32_to_cpu(*parm); 3118 acl_len = le32_to_cpu(*parm);
3097 if (acl_len != *pbuflen) { 3119 if (acl_len != *pbuflen) {
3098 cERROR(1, ("acl length %d does not match %d", 3120 cERROR(1, "acl length %d does not match %d",
3099 acl_len, *pbuflen)); 3121 acl_len, *pbuflen);
3100 if (*pbuflen > acl_len) 3122 if (*pbuflen > acl_len)
3101 *pbuflen = acl_len; 3123 *pbuflen = acl_len;
3102 } 3124 }
@@ -3105,7 +3127,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
3105 header followed by the smallest SID */ 3127 header followed by the smallest SID */
3106 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || 3128 if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
3107 (*pbuflen >= 64 * 1024)) { 3129 (*pbuflen >= 64 * 1024)) {
3108 cERROR(1, ("bad acl length %d", *pbuflen)); 3130 cERROR(1, "bad acl length %d", *pbuflen);
3109 rc = -EINVAL; 3131 rc = -EINVAL;
3110 *pbuflen = 0; 3132 *pbuflen = 0;
3111 } else { 3133 } else {
@@ -3179,9 +3201,9 @@ setCifsAclRetry:
3179 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3201 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3180 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3202 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3181 3203
3182 cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc)); 3204 cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
3183 if (rc) 3205 if (rc)
3184 cFYI(1, ("Set CIFS ACL returned %d", rc)); 3206 cFYI(1, "Set CIFS ACL returned %d", rc);
3185 cifs_buf_release(pSMB); 3207 cifs_buf_release(pSMB);
3186 3208
3187 if (rc == -EAGAIN) 3209 if (rc == -EAGAIN)
@@ -3205,7 +3227,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
3205 int bytes_returned; 3227 int bytes_returned;
3206 int name_len; 3228 int name_len;
3207 3229
3208 cFYI(1, ("In SMBQPath path %s", searchName)); 3230 cFYI(1, "In SMBQPath path %s", searchName);
3209QInfRetry: 3231QInfRetry:
3210 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, 3232 rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
3211 (void **) &pSMBr); 3233 (void **) &pSMBr);
@@ -3231,7 +3253,7 @@ QInfRetry:
3231 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3253 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3232 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3254 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3233 if (rc) { 3255 if (rc) {
3234 cFYI(1, ("Send error in QueryInfo = %d", rc)); 3256 cFYI(1, "Send error in QueryInfo = %d", rc);
3235 } else if (pFinfo) { 3257 } else if (pFinfo) {
3236 struct timespec ts; 3258 struct timespec ts;
3237 __u32 time = le32_to_cpu(pSMBr->last_write_time); 3259 __u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3305,7 +3327,7 @@ QFileInfoRetry:
3305 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3327 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3306 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3328 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3307 if (rc) { 3329 if (rc) {
3308 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3330 cFYI(1, "Send error in QPathInfo = %d", rc);
3309 } else { /* decode response */ 3331 } else { /* decode response */
3310 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3332 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3311 3333
@@ -3343,7 +3365,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
3343 int name_len; 3365 int name_len;
3344 __u16 params, byte_count; 3366 __u16 params, byte_count;
3345 3367
3346/* cFYI(1, ("In QPathInfo path %s", searchName)); */ 3368/* cFYI(1, "In QPathInfo path %s", searchName); */
3347QPathInfoRetry: 3369QPathInfoRetry:
3348 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3370 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3349 (void **) &pSMBr); 3371 (void **) &pSMBr);
@@ -3393,7 +3415,7 @@ QPathInfoRetry:
3393 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3415 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3394 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3416 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3395 if (rc) { 3417 if (rc) {
3396 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3418 cFYI(1, "Send error in QPathInfo = %d", rc);
3397 } else { /* decode response */ 3419 } else { /* decode response */
3398 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3420 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3399 3421
@@ -3473,14 +3495,14 @@ UnixQFileInfoRetry:
3473 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3495 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3474 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3496 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3475 if (rc) { 3497 if (rc) {
3476 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3498 cFYI(1, "Send error in QPathInfo = %d", rc);
3477 } else { /* decode response */ 3499 } else { /* decode response */
3478 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3500 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3479 3501
3480 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3502 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3481 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3503 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3482 "Unix Extensions can be disabled on mount " 3504 "Unix Extensions can be disabled on mount "
3483 "by specifying the nosfu mount option.")); 3505 "by specifying the nosfu mount option.");
3484 rc = -EIO; /* bad smb */ 3506 rc = -EIO; /* bad smb */
3485 } else { 3507 } else {
3486 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3508 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3512,7 +3534,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3512 int name_len; 3534 int name_len;
3513 __u16 params, byte_count; 3535 __u16 params, byte_count;
3514 3536
3515 cFYI(1, ("In QPathInfo (Unix) the path %s", searchName)); 3537 cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
3516UnixQPathInfoRetry: 3538UnixQPathInfoRetry:
3517 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3539 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3518 (void **) &pSMBr); 3540 (void **) &pSMBr);
@@ -3559,14 +3581,14 @@ UnixQPathInfoRetry:
3559 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3581 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3560 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3582 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3561 if (rc) { 3583 if (rc) {
3562 cFYI(1, ("Send error in QPathInfo = %d", rc)); 3584 cFYI(1, "Send error in QPathInfo = %d", rc);
3563 } else { /* decode response */ 3585 } else { /* decode response */
3564 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3586 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3565 3587
3566 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { 3588 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3567 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n" 3589 cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
3568 "Unix Extensions can be disabled on mount " 3590 "Unix Extensions can be disabled on mount "
3569 "by specifying the nosfu mount option.")); 3591 "by specifying the nosfu mount option.");
3570 rc = -EIO; /* bad smb */ 3592 rc = -EIO; /* bad smb */
3571 } else { 3593 } else {
3572 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 3594 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3600,7 +3622,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
3600 int name_len; 3622 int name_len;
3601 __u16 params, byte_count; 3623 __u16 params, byte_count;
3602 3624
3603 cFYI(1, ("In FindFirst for %s", searchName)); 3625 cFYI(1, "In FindFirst for %s", searchName);
3604 3626
3605findFirstRetry: 3627findFirstRetry:
3606 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 3628 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3677,7 +3699,7 @@ findFirstRetry:
3677 if (rc) {/* BB add logic to retry regular search if Unix search 3699 if (rc) {/* BB add logic to retry regular search if Unix search
3678 rejected unexpectedly by server */ 3700 rejected unexpectedly by server */
3679 /* BB Add code to handle unsupported level rc */ 3701 /* BB Add code to handle unsupported level rc */
3680 cFYI(1, ("Error in FindFirst = %d", rc)); 3702 cFYI(1, "Error in FindFirst = %d", rc);
3681 3703
3682 cifs_buf_release(pSMB); 3704 cifs_buf_release(pSMB);
3683 3705
@@ -3716,7 +3738,7 @@ findFirstRetry:
3716 lnoff = le16_to_cpu(parms->LastNameOffset); 3738 lnoff = le16_to_cpu(parms->LastNameOffset);
3717 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3739 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3718 lnoff) { 3740 lnoff) {
3719 cERROR(1, ("ignoring corrupt resume name")); 3741 cERROR(1, "ignoring corrupt resume name");
3720 psrch_inf->last_entry = NULL; 3742 psrch_inf->last_entry = NULL;
3721 return rc; 3743 return rc;
3722 } 3744 }
@@ -3744,7 +3766,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3744 int bytes_returned, name_len; 3766 int bytes_returned, name_len;
3745 __u16 params, byte_count; 3767 __u16 params, byte_count;
3746 3768
3747 cFYI(1, ("In FindNext")); 3769 cFYI(1, "In FindNext");
3748 3770
3749 if (psrch_inf->endOfSearch) 3771 if (psrch_inf->endOfSearch)
3750 return -ENOENT; 3772 return -ENOENT;
@@ -3808,7 +3830,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3808 cifs_buf_release(pSMB); 3830 cifs_buf_release(pSMB);
3809 rc = 0; /* search probably was closed at end of search*/ 3831 rc = 0; /* search probably was closed at end of search*/
3810 } else 3832 } else
3811 cFYI(1, ("FindNext returned = %d", rc)); 3833 cFYI(1, "FindNext returned = %d", rc);
3812 } else { /* decode response */ 3834 } else { /* decode response */
3813 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3835 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3814 3836
@@ -3844,15 +3866,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3844 lnoff = le16_to_cpu(parms->LastNameOffset); 3866 lnoff = le16_to_cpu(parms->LastNameOffset);
3845 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < 3867 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3846 lnoff) { 3868 lnoff) {
3847 cERROR(1, ("ignoring corrupt resume name")); 3869 cERROR(1, "ignoring corrupt resume name");
3848 psrch_inf->last_entry = NULL; 3870 psrch_inf->last_entry = NULL;
3849 return rc; 3871 return rc;
3850 } else 3872 } else
3851 psrch_inf->last_entry = 3873 psrch_inf->last_entry =
3852 psrch_inf->srch_entries_start + lnoff; 3874 psrch_inf->srch_entries_start + lnoff;
3853 3875
3854/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3876/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
3855 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3877 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
3856 3878
3857 /* BB fixme add unlock here */ 3879 /* BB fixme add unlock here */
3858 } 3880 }
@@ -3877,7 +3899,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3877 int rc = 0; 3899 int rc = 0;
3878 FINDCLOSE_REQ *pSMB = NULL; 3900 FINDCLOSE_REQ *pSMB = NULL;
3879 3901
3880 cFYI(1, ("In CIFSSMBFindClose")); 3902 cFYI(1, "In CIFSSMBFindClose");
3881 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); 3903 rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
3882 3904
3883 /* no sense returning error if session restarted 3905 /* no sense returning error if session restarted
@@ -3891,7 +3913,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
3891 pSMB->ByteCount = 0; 3913 pSMB->ByteCount = 0;
3892 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 3914 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
3893 if (rc) 3915 if (rc)
3894 cERROR(1, ("Send error in FindClose = %d", rc)); 3916 cERROR(1, "Send error in FindClose = %d", rc);
3895 3917
3896 cifs_stats_inc(&tcon->num_fclose); 3918 cifs_stats_inc(&tcon->num_fclose);
3897 3919
@@ -3914,7 +3936,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
3914 int name_len, bytes_returned; 3936 int name_len, bytes_returned;
3915 __u16 params, byte_count; 3937 __u16 params, byte_count;
3916 3938
3917 cFYI(1, ("In GetSrvInodeNum for %s", searchName)); 3939 cFYI(1, "In GetSrvInodeNum for %s", searchName);
3918 if (tcon == NULL) 3940 if (tcon == NULL)
3919 return -ENODEV; 3941 return -ENODEV;
3920 3942
@@ -3964,7 +3986,7 @@ GetInodeNumberRetry:
3964 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 3986 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3965 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 3987 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3966 if (rc) { 3988 if (rc) {
3967 cFYI(1, ("error %d in QueryInternalInfo", rc)); 3989 cFYI(1, "error %d in QueryInternalInfo", rc);
3968 } else { 3990 } else {
3969 /* decode response */ 3991 /* decode response */
3970 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3992 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3979,7 +4001,7 @@ GetInodeNumberRetry:
3979 struct file_internal_info *pfinfo; 4001 struct file_internal_info *pfinfo;
3980 /* BB Do we need a cast or hash here ? */ 4002 /* BB Do we need a cast or hash here ? */
3981 if (count < 8) { 4003 if (count < 8) {
3982 cFYI(1, ("Illegal size ret in QryIntrnlInf")); 4004 cFYI(1, "Illegal size ret in QryIntrnlInf");
3983 rc = -EIO; 4005 rc = -EIO;
3984 goto GetInodeNumOut; 4006 goto GetInodeNumOut;
3985 } 4007 }
@@ -4020,16 +4042,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4020 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); 4042 *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
4021 4043
4022 if (*num_of_nodes < 1) { 4044 if (*num_of_nodes < 1) {
4023 cERROR(1, ("num_referrals: must be at least > 0," 4045 cERROR(1, "num_referrals: must be at least > 0,"
4024 "but we get num_referrals = %d\n", *num_of_nodes)); 4046 "but we get num_referrals = %d\n", *num_of_nodes);
4025 rc = -EINVAL; 4047 rc = -EINVAL;
4026 goto parse_DFS_referrals_exit; 4048 goto parse_DFS_referrals_exit;
4027 } 4049 }
4028 4050
4029 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); 4051 ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
4030 if (ref->VersionNumber != cpu_to_le16(3)) { 4052 if (ref->VersionNumber != cpu_to_le16(3)) {
4031 cERROR(1, ("Referrals of V%d version are not supported," 4053 cERROR(1, "Referrals of V%d version are not supported,"
4032 "should be V3", le16_to_cpu(ref->VersionNumber))); 4054 "should be V3", le16_to_cpu(ref->VersionNumber));
4033 rc = -EINVAL; 4055 rc = -EINVAL;
4034 goto parse_DFS_referrals_exit; 4056 goto parse_DFS_referrals_exit;
4035 } 4057 }
@@ -4038,14 +4060,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4038 data_end = (char *)(&(pSMBr->PathConsumed)) + 4060 data_end = (char *)(&(pSMBr->PathConsumed)) +
4039 le16_to_cpu(pSMBr->t2.DataCount); 4061 le16_to_cpu(pSMBr->t2.DataCount);
4040 4062
4041 cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n", 4063 cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
4042 *num_of_nodes, 4064 *num_of_nodes,
4043 le32_to_cpu(pSMBr->DFSFlags))); 4065 le32_to_cpu(pSMBr->DFSFlags));
4044 4066
4045 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * 4067 *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
4046 *num_of_nodes, GFP_KERNEL); 4068 *num_of_nodes, GFP_KERNEL);
4047 if (*target_nodes == NULL) { 4069 if (*target_nodes == NULL) {
4048 cERROR(1, ("Failed to allocate buffer for target_nodes\n")); 4070 cERROR(1, "Failed to allocate buffer for target_nodes\n");
4049 rc = -ENOMEM; 4071 rc = -ENOMEM;
4050 goto parse_DFS_referrals_exit; 4072 goto parse_DFS_referrals_exit;
4051 } 4073 }
@@ -4121,7 +4143,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
4121 *num_of_nodes = 0; 4143 *num_of_nodes = 0;
4122 *target_nodes = NULL; 4144 *target_nodes = NULL;
4123 4145
4124 cFYI(1, ("In GetDFSRefer the path %s", searchName)); 4146 cFYI(1, "In GetDFSRefer the path %s", searchName);
4125 if (ses == NULL) 4147 if (ses == NULL)
4126 return -ENODEV; 4148 return -ENODEV;
4127getDFSRetry: 4149getDFSRetry:
@@ -4188,7 +4210,7 @@ getDFSRetry:
4188 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, 4210 rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
4189 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4211 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4190 if (rc) { 4212 if (rc) {
4191 cFYI(1, ("Send error in GetDFSRefer = %d", rc)); 4213 cFYI(1, "Send error in GetDFSRefer = %d", rc);
4192 goto GetDFSRefExit; 4214 goto GetDFSRefExit;
4193 } 4215 }
4194 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4216 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4199,9 +4221,9 @@ getDFSRetry:
4199 goto GetDFSRefExit; 4221 goto GetDFSRefExit;
4200 } 4222 }
4201 4223
4202 cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d", 4224 cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
4203 pSMBr->ByteCount, 4225 pSMBr->ByteCount,
4204 le16_to_cpu(pSMBr->t2.DataOffset))); 4226 le16_to_cpu(pSMBr->t2.DataOffset));
4205 4227
4206 /* parse returned result into more usable form */ 4228 /* parse returned result into more usable form */
4207 rc = parse_DFS_referrals(pSMBr, num_of_nodes, 4229 rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4229,7 +4251,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4229 int bytes_returned = 0; 4251 int bytes_returned = 0;
4230 __u16 params, byte_count; 4252 __u16 params, byte_count;
4231 4253
4232 cFYI(1, ("OldQFSInfo")); 4254 cFYI(1, "OldQFSInfo");
4233oldQFSInfoRetry: 4255oldQFSInfoRetry:
4234 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4256 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4235 (void **) &pSMBr); 4257 (void **) &pSMBr);
@@ -4262,7 +4284,7 @@ oldQFSInfoRetry:
4262 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4284 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4263 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4285 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4264 if (rc) { 4286 if (rc) {
4265 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4287 cFYI(1, "Send error in QFSInfo = %d", rc);
4266 } else { /* decode response */ 4288 } else { /* decode response */
4267 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4289 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4268 4290
@@ -4270,8 +4292,8 @@ oldQFSInfoRetry:
4270 rc = -EIO; /* bad smb */ 4292 rc = -EIO; /* bad smb */
4271 else { 4293 else {
4272 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 4294 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
4273 cFYI(1, ("qfsinf resp BCC: %d Offset %d", 4295 cFYI(1, "qfsinf resp BCC: %d Offset %d",
4274 pSMBr->ByteCount, data_offset)); 4296 pSMBr->ByteCount, data_offset);
4275 4297
4276 response_data = (FILE_SYSTEM_ALLOC_INFO *) 4298 response_data = (FILE_SYSTEM_ALLOC_INFO *)
4277 (((char *) &pSMBr->hdr.Protocol) + data_offset); 4299 (((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4283,11 +4305,10 @@ oldQFSInfoRetry:
4283 le32_to_cpu(response_data->TotalAllocationUnits); 4305 le32_to_cpu(response_data->TotalAllocationUnits);
4284 FSData->f_bfree = FSData->f_bavail = 4306 FSData->f_bfree = FSData->f_bavail =
4285 le32_to_cpu(response_data->FreeAllocationUnits); 4307 le32_to_cpu(response_data->FreeAllocationUnits);
4286 cFYI(1, 4308 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4287 ("Blocks: %lld Free: %lld Block size %ld", 4309 (unsigned long long)FSData->f_blocks,
4288 (unsigned long long)FSData->f_blocks, 4310 (unsigned long long)FSData->f_bfree,
4289 (unsigned long long)FSData->f_bfree, 4311 FSData->f_bsize);
4290 FSData->f_bsize));
4291 } 4312 }
4292 } 4313 }
4293 cifs_buf_release(pSMB); 4314 cifs_buf_release(pSMB);
@@ -4309,7 +4330,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
4309 int bytes_returned = 0; 4330 int bytes_returned = 0;
4310 __u16 params, byte_count; 4331 __u16 params, byte_count;
4311 4332
4312 cFYI(1, ("In QFSInfo")); 4333 cFYI(1, "In QFSInfo");
4313QFSInfoRetry: 4334QFSInfoRetry:
4314 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4335 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4315 (void **) &pSMBr); 4336 (void **) &pSMBr);
@@ -4342,7 +4363,7 @@ QFSInfoRetry:
4342 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4363 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4343 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4364 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4344 if (rc) { 4365 if (rc) {
4345 cFYI(1, ("Send error in QFSInfo = %d", rc)); 4366 cFYI(1, "Send error in QFSInfo = %d", rc);
4346 } else { /* decode response */ 4367 } else { /* decode response */
4347 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4368 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4348 4369
@@ -4363,11 +4384,10 @@ QFSInfoRetry:
4363 le64_to_cpu(response_data->TotalAllocationUnits); 4384 le64_to_cpu(response_data->TotalAllocationUnits);
4364 FSData->f_bfree = FSData->f_bavail = 4385 FSData->f_bfree = FSData->f_bavail =
4365 le64_to_cpu(response_data->FreeAllocationUnits); 4386 le64_to_cpu(response_data->FreeAllocationUnits);
4366 cFYI(1, 4387 cFYI(1, "Blocks: %lld Free: %lld Block size %ld",
4367 ("Blocks: %lld Free: %lld Block size %ld", 4388 (unsigned long long)FSData->f_blocks,
4368 (unsigned long long)FSData->f_blocks, 4389 (unsigned long long)FSData->f_bfree,
4369 (unsigned long long)FSData->f_bfree, 4390 FSData->f_bsize);
4370 FSData->f_bsize));
4371 } 4391 }
4372 } 4392 }
4373 cifs_buf_release(pSMB); 4393 cifs_buf_release(pSMB);
@@ -4389,7 +4409,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
4389 int bytes_returned = 0; 4409 int bytes_returned = 0;
4390 __u16 params, byte_count; 4410 __u16 params, byte_count;
4391 4411
4392 cFYI(1, ("In QFSAttributeInfo")); 4412 cFYI(1, "In QFSAttributeInfo");
4393QFSAttributeRetry: 4413QFSAttributeRetry:
4394 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4414 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4395 (void **) &pSMBr); 4415 (void **) &pSMBr);
@@ -4423,7 +4443,7 @@ QFSAttributeRetry:
4423 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4443 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4424 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4444 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4425 if (rc) { 4445 if (rc) {
4426 cERROR(1, ("Send error in QFSAttributeInfo = %d", rc)); 4446 cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
4427 } else { /* decode response */ 4447 } else { /* decode response */
4428 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4448 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4429 4449
@@ -4459,7 +4479,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
4459 int bytes_returned = 0; 4479 int bytes_returned = 0;
4460 __u16 params, byte_count; 4480 __u16 params, byte_count;
4461 4481
4462 cFYI(1, ("In QFSDeviceInfo")); 4482 cFYI(1, "In QFSDeviceInfo");
4463QFSDeviceRetry: 4483QFSDeviceRetry:
4464 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4484 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4465 (void **) &pSMBr); 4485 (void **) &pSMBr);
@@ -4494,7 +4514,7 @@ QFSDeviceRetry:
4494 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4514 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4495 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4515 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4496 if (rc) { 4516 if (rc) {
4497 cFYI(1, ("Send error in QFSDeviceInfo = %d", rc)); 4517 cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
4498 } else { /* decode response */ 4518 } else { /* decode response */
4499 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4519 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4500 4520
@@ -4529,10 +4549,10 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
4529 int bytes_returned = 0; 4549 int bytes_returned = 0;
4530 __u16 params, byte_count; 4550 __u16 params, byte_count;
4531 4551
4532 cFYI(1, ("In QFSUnixInfo")); 4552 cFYI(1, "In QFSUnixInfo");
4533QFSUnixRetry: 4553QFSUnixRetry:
4534 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4554 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4535 (void **) &pSMBr); 4555 (void **) &pSMB, (void **) &pSMBr);
4536 if (rc) 4556 if (rc)
4537 return rc; 4557 return rc;
4538 4558
@@ -4563,7 +4583,7 @@ QFSUnixRetry:
4563 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4583 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4564 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4584 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4565 if (rc) { 4585 if (rc) {
4566 cERROR(1, ("Send error in QFSUnixInfo = %d", rc)); 4586 cERROR(1, "Send error in QFSUnixInfo = %d", rc);
4567 } else { /* decode response */ 4587 } else { /* decode response */
4568 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4588 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4569 4589
@@ -4598,11 +4618,11 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
4598 int bytes_returned = 0; 4618 int bytes_returned = 0;
4599 __u16 params, param_offset, offset, byte_count; 4619 __u16 params, param_offset, offset, byte_count;
4600 4620
4601 cFYI(1, ("In SETFSUnixInfo")); 4621 cFYI(1, "In SETFSUnixInfo");
4602SETFSUnixRetry: 4622SETFSUnixRetry:
4603 /* BB switch to small buf init to save memory */ 4623 /* BB switch to small buf init to save memory */
4604 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4624 rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
4605 (void **) &pSMBr); 4625 (void **) &pSMB, (void **) &pSMBr);
4606 if (rc) 4626 if (rc)
4607 return rc; 4627 return rc;
4608 4628
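Two of the hunks above make a change that is easy to miss among the message conversions: CIFSSMBQFSUnixInfo() and CIFSSMBSetFSUnixInfo() now build their requests with smb_init_no_reconnect() instead of smb_init(). Judging purely from the call sites, the variant shares smb_init()'s signature but skips the implicit reconnect attempt, plausibly so that these calls, issued while a tcon is being set up or re-established, cannot recurse back into the reconnect path. Assumed declaration, inferred only from the arguments visible here:

/* Inferred from the call sites above; the real prototype lives elsewhere
 * in the cifs headers and may differ. */
int smb_init_no_reconnect(int smb_command, int wct,
			  struct cifsTconInfo *tcon,
			  void **request_buf, void **response_buf);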
@@ -4646,7 +4666,7 @@ SETFSUnixRetry:
4646 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4666 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4647 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4667 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4648 if (rc) { 4668 if (rc) {
4649 cERROR(1, ("Send error in SETFSUnixInfo = %d", rc)); 4669 cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
4650 } else { /* decode response */ 4670 } else { /* decode response */
4651 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4671 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4652 if (rc) 4672 if (rc)
@@ -4674,7 +4694,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
4674 int bytes_returned = 0; 4694 int bytes_returned = 0;
4675 __u16 params, byte_count; 4695 __u16 params, byte_count;
4676 4696
4677 cFYI(1, ("In QFSPosixInfo")); 4697 cFYI(1, "In QFSPosixInfo");
4678QFSPosixRetry: 4698QFSPosixRetry:
4679 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4699 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4680 (void **) &pSMBr); 4700 (void **) &pSMBr);
@@ -4708,7 +4728,7 @@ QFSPosixRetry:
4708 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4728 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4709 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4729 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4710 if (rc) { 4730 if (rc) {
4711 cFYI(1, ("Send error in QFSUnixInfo = %d", rc)); 4731 cFYI(1, "Send error in QFSUnixInfo = %d", rc);
4712 } else { /* decode response */ 4732 } else { /* decode response */
4713 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 4733 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
4714 4734
@@ -4768,7 +4788,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
4768 int bytes_returned = 0; 4788 int bytes_returned = 0;
4769 __u16 params, byte_count, data_count, param_offset, offset; 4789 __u16 params, byte_count, data_count, param_offset, offset;
4770 4790
4771 cFYI(1, ("In SetEOF")); 4791 cFYI(1, "In SetEOF");
4772SetEOFRetry: 4792SetEOFRetry:
4773 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 4793 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
4774 (void **) &pSMBr); 4794 (void **) &pSMBr);
@@ -4834,7 +4854,7 @@ SetEOFRetry:
4834 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 4854 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
4835 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 4855 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
4836 if (rc) 4856 if (rc)
4837 cFYI(1, ("SetPathInfo (file size) returned %d", rc)); 4857 cFYI(1, "SetPathInfo (file size) returned %d", rc);
4838 4858
4839 cifs_buf_release(pSMB); 4859 cifs_buf_release(pSMB);
4840 4860
@@ -4854,8 +4874,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4854 int rc = 0; 4874 int rc = 0;
4855 __u16 params, param_offset, offset, byte_count, count; 4875 __u16 params, param_offset, offset, byte_count, count;
4856 4876
4857 cFYI(1, ("SetFileSize (via SetFileInfo) %lld", 4877 cFYI(1, "SetFileSize (via SetFileInfo) %lld",
4858 (long long)size)); 4878 (long long)size);
4859 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4879 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4860 4880
4861 if (rc) 4881 if (rc)
@@ -4914,9 +4934,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 pSMB->ByteCount = cpu_to_le16(byte_count); 4934 pSMB->ByteCount = cpu_to_le16(byte_count);
4915 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 4935 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4916 if (rc) { 4936 if (rc) {
4917 cFYI(1, 4937 cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
4918 ("Send error in SetFileInfo (SetFileSize) = %d",
4919 rc));
4920 } 4938 }
4921 4939
4922 /* Note: On -EAGAIN error only caller can retry on handle based calls 4940 /* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4940,7 +4958,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4940 int rc = 0; 4958 int rc = 0;
4941 __u16 params, param_offset, offset, byte_count, count; 4959 __u16 params, param_offset, offset, byte_count, count;
4942 4960
4943 cFYI(1, ("Set Times (via SetFileInfo)")); 4961 cFYI(1, "Set Times (via SetFileInfo)");
4944 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 4962 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4945 4963
4946 if (rc) 4964 if (rc)
@@ -4985,7 +5003,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4985 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); 5003 memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
4986 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5004 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4987 if (rc) 5005 if (rc)
4988 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 5006 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
4989 5007
4990 /* Note: On -EAGAIN error only caller can retry on handle based calls 5008 /* Note: On -EAGAIN error only caller can retry on handle based calls
4991 since file handle passed in no longer valid */ 5009 since file handle passed in no longer valid */
@@ -5002,7 +5020,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5002 int rc = 0; 5020 int rc = 0;
5003 __u16 params, param_offset, offset, byte_count, count; 5021 __u16 params, param_offset, offset, byte_count, count;
5004 5022
5005 cFYI(1, ("Set File Disposition (via SetFileInfo)")); 5023 cFYI(1, "Set File Disposition (via SetFileInfo)");
5006 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5024 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5007 5025
5008 if (rc) 5026 if (rc)
@@ -5044,7 +5062,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
5044 *data_offset = delete_file ? 1 : 0; 5062 *data_offset = delete_file ? 1 : 0;
5045 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5063 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5046 if (rc) 5064 if (rc)
5047 cFYI(1, ("Send error in SetFileDisposition = %d", rc)); 5065 cFYI(1, "Send error in SetFileDisposition = %d", rc);
5048 5066
5049 return rc; 5067 return rc;
5050} 5068}
@@ -5062,7 +5080,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
5062 char *data_offset; 5080 char *data_offset;
5063 __u16 params, param_offset, offset, byte_count, count; 5081 __u16 params, param_offset, offset, byte_count, count;
5064 5082
5065 cFYI(1, ("In SetTimes")); 5083 cFYI(1, "In SetTimes");
5066 5084
5067SetTimesRetry: 5085SetTimesRetry:
5068 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5086 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5118,7 +5136,7 @@ SetTimesRetry:
5118 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5136 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5119 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5137 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5120 if (rc) 5138 if (rc)
5121 cFYI(1, ("SetPathInfo (times) returned %d", rc)); 5139 cFYI(1, "SetPathInfo (times) returned %d", rc);
5122 5140
5123 cifs_buf_release(pSMB); 5141 cifs_buf_release(pSMB);
5124 5142
@@ -5143,7 +5161,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
5143 int bytes_returned; 5161 int bytes_returned;
5144 int name_len; 5162 int name_len;
5145 5163
5146 cFYI(1, ("In SetAttrLegacy")); 5164 cFYI(1, "In SetAttrLegacy");
5147 5165
5148SetAttrLgcyRetry: 5166SetAttrLgcyRetry:
5149 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, 5167 rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5169,7 +5187,7 @@ SetAttrLgcyRetry:
5169 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5187 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5170 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5188 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5171 if (rc) 5189 if (rc)
5172 cFYI(1, ("Error in LegacySetAttr = %d", rc)); 5190 cFYI(1, "Error in LegacySetAttr = %d", rc);
5173 5191
5174 cifs_buf_release(pSMB); 5192 cifs_buf_release(pSMB);
5175 5193
@@ -5231,7 +5249,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5231 int rc = 0; 5249 int rc = 0;
5232 u16 params, param_offset, offset, byte_count, count; 5250 u16 params, param_offset, offset, byte_count, count;
5233 5251
5234 cFYI(1, ("Set Unix Info (via SetFileInfo)")); 5252 cFYI(1, "Set Unix Info (via SetFileInfo)");
5235 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); 5253 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
5236 5254
5237 if (rc) 5255 if (rc)
@@ -5276,7 +5294,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
5276 5294
5277 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); 5295 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
5278 if (rc) 5296 if (rc)
5279 cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); 5297 cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
5280 5298
5281 /* Note: On -EAGAIN error only caller can retry on handle based calls 5299 /* Note: On -EAGAIN error only caller can retry on handle based calls
5282 since file handle passed in no longer valid */ 5300 since file handle passed in no longer valid */
@@ -5297,7 +5315,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
5297 FILE_UNIX_BASIC_INFO *data_offset; 5315 FILE_UNIX_BASIC_INFO *data_offset;
5298 __u16 params, param_offset, offset, count, byte_count; 5316 __u16 params, param_offset, offset, count, byte_count;
5299 5317
5300 cFYI(1, ("In SetUID/GID/Mode")); 5318 cFYI(1, "In SetUID/GID/Mode");
5301setPermsRetry: 5319setPermsRetry:
5302 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5320 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5303 (void **) &pSMBr); 5321 (void **) &pSMBr);
@@ -5353,7 +5371,7 @@ setPermsRetry:
5353 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5371 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5354 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5372 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5355 if (rc) 5373 if (rc)
5356 cFYI(1, ("SetPathInfo (perms) returned %d", rc)); 5374 cFYI(1, "SetPathInfo (perms) returned %d", rc);
5357 5375
5358 cifs_buf_release(pSMB); 5376 cifs_buf_release(pSMB);
5359 if (rc == -EAGAIN) 5377 if (rc == -EAGAIN)
@@ -5372,7 +5390,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5372 struct dir_notify_req *dnotify_req; 5390 struct dir_notify_req *dnotify_req;
5373 int bytes_returned; 5391 int bytes_returned;
5374 5392
5375 cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid)); 5393 cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
5376 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, 5394 rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
5377 (void **) &pSMBr); 5395 (void **) &pSMBr);
5378 if (rc) 5396 if (rc)
@@ -5406,7 +5424,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
5406 (struct smb_hdr *)pSMBr, &bytes_returned, 5424 (struct smb_hdr *)pSMBr, &bytes_returned,
5407 CIFS_ASYNC_OP); 5425 CIFS_ASYNC_OP);
5408 if (rc) { 5426 if (rc) {
5409 cFYI(1, ("Error in Notify = %d", rc)); 5427 cFYI(1, "Error in Notify = %d", rc);
5410 } else { 5428 } else {
5411 /* Add file to outstanding requests */ 5429 /* Add file to outstanding requests */
5412 /* BB change to kmem cache alloc */ 5430 /* BB change to kmem cache alloc */
@@ -5462,7 +5480,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
5462 char *end_of_smb; 5480 char *end_of_smb;
5463 __u16 params, byte_count, data_offset; 5481 __u16 params, byte_count, data_offset;
5464 5482
5465 cFYI(1, ("In Query All EAs path %s", searchName)); 5483 cFYI(1, "In Query All EAs path %s", searchName);
5466QAllEAsRetry: 5484QAllEAsRetry:
5467 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5485 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5468 (void **) &pSMBr); 5486 (void **) &pSMBr);
@@ -5509,7 +5527,7 @@ QAllEAsRetry:
5509 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5527 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5510 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5528 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5511 if (rc) { 5529 if (rc) {
5512 cFYI(1, ("Send error in QueryAllEAs = %d", rc)); 5530 cFYI(1, "Send error in QueryAllEAs = %d", rc);
5513 goto QAllEAsOut; 5531 goto QAllEAsOut;
5514 } 5532 }
5515 5533
@@ -5537,16 +5555,16 @@ QAllEAsRetry:
5537 (((char *) &pSMBr->hdr.Protocol) + data_offset); 5555 (((char *) &pSMBr->hdr.Protocol) + data_offset);
5538 5556
5539 list_len = le32_to_cpu(ea_response_data->list_len); 5557 list_len = le32_to_cpu(ea_response_data->list_len);
5540 cFYI(1, ("ea length %d", list_len)); 5558 cFYI(1, "ea length %d", list_len);
5541 if (list_len <= 8) { 5559 if (list_len <= 8) {
5542 cFYI(1, ("empty EA list returned from server")); 5560 cFYI(1, "empty EA list returned from server");
5543 goto QAllEAsOut; 5561 goto QAllEAsOut;
5544 } 5562 }
5545 5563
5546 /* make sure list_len doesn't go past end of SMB */ 5564 /* make sure list_len doesn't go past end of SMB */
5547 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr); 5565 end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
5548 if ((char *)ea_response_data + list_len > end_of_smb) { 5566 if ((char *)ea_response_data + list_len > end_of_smb) {
5549 cFYI(1, ("EA list appears to go beyond SMB")); 5567 cFYI(1, "EA list appears to go beyond SMB");
5550 rc = -EIO; 5568 rc = -EIO;
5551 goto QAllEAsOut; 5569 goto QAllEAsOut;
5552 } 5570 }
@@ -5563,7 +5581,7 @@ QAllEAsRetry:
5563 temp_ptr += 4; 5581 temp_ptr += 4;
5564 /* make sure we can read name_len and value_len */ 5582 /* make sure we can read name_len and value_len */
5565 if (list_len < 0) { 5583 if (list_len < 0) {
5566 cFYI(1, ("EA entry goes beyond length of list")); 5584 cFYI(1, "EA entry goes beyond length of list");
5567 rc = -EIO; 5585 rc = -EIO;
5568 goto QAllEAsOut; 5586 goto QAllEAsOut;
5569 } 5587 }
@@ -5572,7 +5590,7 @@ QAllEAsRetry:
5572 value_len = le16_to_cpu(temp_fea->value_len); 5590 value_len = le16_to_cpu(temp_fea->value_len);
5573 list_len -= name_len + 1 + value_len; 5591 list_len -= name_len + 1 + value_len;
5574 if (list_len < 0) { 5592 if (list_len < 0) {
5575 cFYI(1, ("EA entry goes beyond length of list")); 5593 cFYI(1, "EA entry goes beyond length of list");
5576 rc = -EIO; 5594 rc = -EIO;
5577 goto QAllEAsOut; 5595 goto QAllEAsOut;
5578 } 5596 }
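The QAllEAsRetry hunks above guard a walk over the server's EA list: list_len is decremented as each entry is consumed, and a negative intermediate value means an entry claims more bytes than the SMB actually carries. A compact restatement of that walk; the wire layout assumed here (a 4-byte fixed header holding flags, name_len, and a little-endian 16-bit value_len, followed by the name, a NUL, and the value) is inferred from the decrements visible in the diff, not taken from the real structures:

/* Returns 0 if every EA entry fits inside list_len, -1 otherwise. */
static int ea_list_fits(const unsigned char *p, int list_len)
{
	while (list_len > 0) {
		unsigned char name_len;
		unsigned short value_len;

		list_len -= 4;			/* fixed per-entry header */
		if (list_len < 0)
			return -1;		/* header runs past the list */
		name_len = p[1];
		value_len = (unsigned short)(p[2] | (p[3] << 8));
		p += 4;

		list_len -= name_len + 1 + value_len;	/* name, NUL, value */
		if (list_len < 0)
			return -1;		/* entry body runs past the list */
		p += name_len + 1 + value_len;
	}
	return 0;
}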
@@ -5639,7 +5657,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
5639 int bytes_returned = 0; 5657 int bytes_returned = 0;
5640 __u16 params, param_offset, byte_count, offset, count; 5658 __u16 params, param_offset, byte_count, offset, count;
5641 5659
5642 cFYI(1, ("In SetEA")); 5660 cFYI(1, "In SetEA");
5643SetEARetry: 5661SetEARetry:
5644 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, 5662 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
5645 (void **) &pSMBr); 5663 (void **) &pSMBr);
@@ -5721,7 +5739,7 @@ SetEARetry:
5721 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 5739 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
5722 (struct smb_hdr *) pSMBr, &bytes_returned, 0); 5740 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
5723 if (rc) 5741 if (rc)
5724 cFYI(1, ("SetPathInfo (EA) returned %d", rc)); 5742 cFYI(1, "SetPathInfo (EA) returned %d", rc);
5725 5743
5726 cifs_buf_release(pSMB); 5744 cifs_buf_release(pSMB);
5727 5745
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d9566bf8f917..88c84a38bccb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -48,6 +48,7 @@
48#include "nterr.h" 48#include "nterr.h"
49#include "rfc1002pdu.h" 49#include "rfc1002pdu.h"
50#include "cn_cifs.h" 50#include "cn_cifs.h"
51#include "fscache.h"
51 52
52#define CIFS_PORT 445 53#define CIFS_PORT 445
53#define RFC1001_PORT 139 54#define RFC1001_PORT 139
@@ -66,6 +67,7 @@ struct smb_vol {
66 char *iocharset; /* local code page for mapping to and from Unicode */ 67 char *iocharset; /* local code page for mapping to and from Unicode */
67 char source_rfc1001_name[16]; /* netbios name of client */ 68 char source_rfc1001_name[16]; /* netbios name of client */
68 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ 69 char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
70 uid_t cred_uid;
69 uid_t linux_uid; 71 uid_t linux_uid;
70 gid_t linux_gid; 72 gid_t linux_gid;
71 mode_t file_mode; 73 mode_t file_mode;
@@ -97,11 +99,13 @@ struct smb_vol {
97 bool noblocksnd:1; 99 bool noblocksnd:1;
98 bool noautotune:1; 100 bool noautotune:1;
99 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ 101 bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
102 bool fsc:1; /* enable fscache */
100 unsigned int rsize; 103 unsigned int rsize;
101 unsigned int wsize; 104 unsigned int wsize;
102 bool sockopt_tcp_nodelay:1; 105 bool sockopt_tcp_nodelay:1;
103 unsigned short int port; 106 unsigned short int port;
104 char *prepath; 107 char *prepath;
108 struct nls_table *local_nls;
105}; 109};
106 110
107static int ipv4_connect(struct TCP_Server_Info *server); 111static int ipv4_connect(struct TCP_Server_Info *server);
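Alongside the message conversions, the struct smb_vol hunk above grows three fields: a cred_uid, an fsc bit that pairs with the new #include "fscache.h" at the top of the file, and a struct nls_table *local_nls pointer. A small stand-alone sketch of how a mount-option walker would set the new flag; the struct, helper name, and the "fsc" option spelling are assumptions for illustration, not the actual cifs_parse_mount_options() code:

#include <stdbool.h>
#include <string.h>

struct smb_vol_sketch {
	bool fsc;		/* enable fscache, mirroring the new bitfield */
};

/* Flip vol->fsc when the token "fsc" appears among the mount options. */
static void parse_one_option(struct smb_vol_sketch *vol, const char *opt)
{
	if (strcmp(opt, "fsc") == 0)
		vol->fsc = true;
}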
@@ -135,7 +139,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
135 spin_unlock(&GlobalMid_Lock); 139 spin_unlock(&GlobalMid_Lock);
136 server->maxBuf = 0; 140 server->maxBuf = 0;
137 141
138 cFYI(1, ("Reconnecting tcp session")); 142 cFYI(1, "Reconnecting tcp session");
139 143
140 /* before reconnecting the tcp session, mark the smb session (uid) 144 /* before reconnecting the tcp session, mark the smb session (uid)
141 and the tid bad so they are not used until reconnected */ 145 and the tid bad so they are not used until reconnected */
@@ -153,12 +157,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
153 /* do not want to be sending data on a socket we are freeing */ 157 /* do not want to be sending data on a socket we are freeing */
154 mutex_lock(&server->srv_mutex); 158 mutex_lock(&server->srv_mutex);
155 if (server->ssocket) { 159 if (server->ssocket) {
156 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state, 160 cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
157 server->ssocket->flags)); 161 server->ssocket->flags);
158 kernel_sock_shutdown(server->ssocket, SHUT_WR); 162 kernel_sock_shutdown(server->ssocket, SHUT_WR);
159 cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx", 163 cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
160 server->ssocket->state, 164 server->ssocket->state,
161 server->ssocket->flags)); 165 server->ssocket->flags);
162 sock_release(server->ssocket); 166 sock_release(server->ssocket);
163 server->ssocket = NULL; 167 server->ssocket = NULL;
164 } 168 }
@@ -187,7 +191,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
187 else 191 else
188 rc = ipv4_connect(server); 192 rc = ipv4_connect(server);
189 if (rc) { 193 if (rc) {
190 cFYI(1, ("reconnect error %d", rc)); 194 cFYI(1, "reconnect error %d", rc);
191 msleep(3000); 195 msleep(3000);
192 } else { 196 } else {
193 atomic_inc(&tcpSesReconnectCount); 197 atomic_inc(&tcpSesReconnectCount);
@@ -223,7 +227,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
223 /* check for plausible wct, bcc and t2 data and parm sizes */ 227 /* check for plausible wct, bcc and t2 data and parm sizes */
224 /* check for parm and data offset going beyond end of smb */ 228 /* check for parm and data offset going beyond end of smb */
225 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ 229 if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
226 cFYI(1, ("invalid transact2 word count")); 230 cFYI(1, "invalid transact2 word count");
227 return -EINVAL; 231 return -EINVAL;
228 } 232 }
229 233
@@ -237,15 +241,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
237 if (remaining == 0) 241 if (remaining == 0)
238 return 0; 242 return 0;
239 else if (remaining < 0) { 243 else if (remaining < 0) {
240 cFYI(1, ("total data %d smaller than data in frame %d", 244 cFYI(1, "total data %d smaller than data in frame %d",
241 total_data_size, data_in_this_rsp)); 245 total_data_size, data_in_this_rsp);
242 return -EINVAL; 246 return -EINVAL;
243 } else { 247 } else {
244 cFYI(1, ("missing %d bytes from transact2, check next response", 248 cFYI(1, "missing %d bytes from transact2, check next response",
245 remaining)); 249 remaining);
246 if (total_data_size > maxBufSize) { 250 if (total_data_size > maxBufSize) {
247 cERROR(1, ("TotalDataSize %d is over maximum buffer %d", 251 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
248 total_data_size, maxBufSize)); 252 total_data_size, maxBufSize);
249 return -EINVAL; 253 return -EINVAL;
250 } 254 }
251 return remaining; 255 return remaining;
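The check2ndT2() hunk above only converts messages, but it is a good place to restate the arithmetic those messages report on: for a multi-part transact2 response, the data still owed is TotalDataCount minus what this frame carries; zero means the response is complete, a negative value means the frame holds more than the advertised total, and a total beyond the negotiated buffer size can never be satisfied. The same checks as a self-contained function (the name and the user-space errno.h are illustrative):

#include <errno.h>

static int t2_remaining(int total_data_size, int data_in_this_rsp,
			int max_buf_size)
{
	int remaining = total_data_size - data_in_this_rsp;

	if (remaining == 0)
		return 0;		/* response is complete */
	if (remaining < 0)
		return -EINVAL;		/* frame exceeds advertised total */
	if (total_data_size > max_buf_size)
		return -EINVAL;		/* total would overflow our buffer */
	return remaining;		/* expect more secondary responses */
}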
@@ -267,7 +271,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
267 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount); 271 total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
268 272
269 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) { 273 if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
270 cFYI(1, ("total data size of primary and secondary t2 differ")); 274 cFYI(1, "total data size of primary and secondary t2 differ");
271 } 275 }
272 276
273 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount); 277 total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -282,7 +286,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
282 286
283 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount); 287 total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
284 if (remaining < total_in_buf2) { 288 if (remaining < total_in_buf2) {
285 cFYI(1, ("transact2 2nd response contains too much data")); 289 cFYI(1, "transact2 2nd response contains too much data");
286 } 290 }
287 291
288 /* find end of first SMB data area */ 292 /* find end of first SMB data area */
@@ -311,7 +315,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
311 pTargetSMB->smb_buf_length = byte_count; 315 pTargetSMB->smb_buf_length = byte_count;
312 316
313 if (remaining == total_in_buf2) { 317 if (remaining == total_in_buf2) {
314 cFYI(1, ("found the last secondary response")); 318 cFYI(1, "found the last secondary response");
315 return 0; /* we are done */ 319 return 0; /* we are done */
316 } else /* more responses to go */ 320 } else /* more responses to go */
317 return 1; 321 return 1;
@@ -339,7 +343,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
339 int reconnect; 343 int reconnect;
340 344
341 current->flags |= PF_MEMALLOC; 345 current->flags |= PF_MEMALLOC;
342 cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); 346 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
343 347
344 length = atomic_inc_return(&tcpSesAllocCount); 348 length = atomic_inc_return(&tcpSesAllocCount);
345 if (length > 1) 349 if (length > 1)
@@ -353,7 +357,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
353 if (bigbuf == NULL) { 357 if (bigbuf == NULL) {
354 bigbuf = cifs_buf_get(); 358 bigbuf = cifs_buf_get();
355 if (!bigbuf) { 359 if (!bigbuf) {
356 cERROR(1, ("No memory for large SMB response")); 360 cERROR(1, "No memory for large SMB response");
357 msleep(3000); 361 msleep(3000);
358 /* retry will check if exiting */ 362 /* retry will check if exiting */
359 continue; 363 continue;
@@ -366,7 +370,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
366 if (smallbuf == NULL) { 370 if (smallbuf == NULL) {
367 smallbuf = cifs_small_buf_get(); 371 smallbuf = cifs_small_buf_get();
368 if (!smallbuf) { 372 if (!smallbuf) {
369 cERROR(1, ("No memory for SMB response")); 373 cERROR(1, "No memory for SMB response");
370 msleep(1000); 374 msleep(1000);
371 /* retry will check if exiting */ 375 /* retry will check if exiting */
372 continue; 376 continue;
@@ -391,12 +395,14 @@ incomplete_rcv:
391 if (server->tcpStatus == CifsExiting) { 395 if (server->tcpStatus == CifsExiting) {
392 break; 396 break;
393 } else if (server->tcpStatus == CifsNeedReconnect) { 397 } else if (server->tcpStatus == CifsNeedReconnect) {
394 cFYI(1, ("Reconnect after server stopped responding")); 398 cFYI(1, "Reconnect after server stopped responding");
395 cifs_reconnect(server); 399 cifs_reconnect(server);
396 cFYI(1, ("call to reconnect done")); 400 cFYI(1, "call to reconnect done");
397 csocket = server->ssocket; 401 csocket = server->ssocket;
398 continue; 402 continue;
399 } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { 403 } else if (length == -ERESTARTSYS ||
404 length == -EAGAIN ||
405 length == -EINTR) {
400 msleep(1); /* minimum sleep to prevent looping 406 msleep(1); /* minimum sleep to prevent looping
401 allowing socket to clear and app threads to set 407 allowing socket to clear and app threads to set
402 tcpStatus CifsNeedReconnect if server hung */ 408 tcpStatus CifsNeedReconnect if server hung */
@@ -410,27 +416,15 @@ incomplete_rcv:
410 } else 416 } else
411 continue; 417 continue;
412 } else if (length <= 0) { 418 } else if (length <= 0) {
413 if (server->tcpStatus == CifsNew) { 419 cFYI(1, "Reconnect after unexpected peek error %d",
414 cFYI(1, ("tcp session abend after SMBnegprot")); 420 length);
415 /* some servers kill the TCP session rather than
416 returning an SMB negprot error, in which
417 case reconnecting here is not going to help,
418 and so simply return error to mount */
419 break;
420 }
421 if (!try_to_freeze() && (length == -EINTR)) {
422 cFYI(1, ("cifsd thread killed"));
423 break;
424 }
425 cFYI(1, ("Reconnect after unexpected peek error %d",
426 length));
427 cifs_reconnect(server); 421 cifs_reconnect(server);
428 csocket = server->ssocket; 422 csocket = server->ssocket;
429 wake_up(&server->response_q); 423 wake_up(&server->response_q);
430 continue; 424 continue;
431 } else if (length < pdu_length) { 425 } else if (length < pdu_length) {
432 cFYI(1, ("requested %d bytes but only got %d bytes", 426 cFYI(1, "requested %d bytes but only got %d bytes",
433 pdu_length, length)); 427 pdu_length, length);
434 pdu_length -= length; 428 pdu_length -= length;
435 msleep(1); 429 msleep(1);
436 goto incomplete_rcv; 430 goto incomplete_rcv;
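These hunks change the error policy of the receive loop: -EINTR is now grouped with -ERESTARTSYS and -EAGAIN as a transient condition that just sleeps and retries, and the old special cases (bailing out of a CifsNew session after a failed negprot, and exiting the thread on an unfrozen -EINTR) are dropped. Expressed as a predicate, the retry rule the loop converges on looks like this; the helper name is invented for illustration, since connect.c open-codes the test:

    /* Illustrative only: connect.c open-codes this classification. */
    static bool cifs_recv_retryable(int rc)
    {
            /* signal-interrupted or would-block reads are transient;
             * sleep briefly so other threads can mark the session
             * CifsNeedReconnect, then peek the socket again */
            return rc == -ERESTARTSYS || rc == -EAGAIN || rc == -EINTR;
    }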
@@ -450,41 +444,33 @@ incomplete_rcv:
 pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 smb_buffer->smb_buf_length = pdu_length;

- cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+ cFYI(1, "rfc1002 length 0x%x", pdu_length+4);

 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 continue;
 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
- cFYI(1, ("Good RFC 1002 session rsp"));
+ cFYI(1, "Good RFC 1002 session rsp");
 continue;
 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 /* we get this from Windows 98 instead of
 an error on SMB negprot response */
- cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
- pdu_length));
+ cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+ pdu_length);
- if (server->tcpStatus == CifsNew) {
- /* if nack on negprot (rather than
- ret of smb negprot error) reconnecting
- not going to help, ret error to mount */
- break;
- } else {
- /* give server a second to
- clean up before reconnect attempt */
- msleep(1000);
- /* always try 445 first on reconnect
- since we get NACK on some if we ever
- connected to port 139 (the NACK is
- since we do not begin with RFC1001
- session initialize frame) */
- server->addr.sockAddr.sin_port =
- htons(CIFS_PORT);
- cifs_reconnect(server);
- csocket = server->ssocket;
- wake_up(&server->response_q);
- continue;
- }
+ /* give server a second to clean up */
+ msleep(1000);
+ /* always try 445 first on reconnect since we get NACK
+ * on some if we ever connected to port 139 (the NACK
+ * is since we do not begin with RFC1001 session
+ * initialize frame)
+ */
+ cifs_set_port((struct sockaddr *)
+ &server->addr.sockAddr, CIFS_PORT);
+ cifs_reconnect(server);
+ csocket = server->ssocket;
+ wake_up(&server->response_q);
+ continue;
 } else if (temp != (char) 0) {
- cERROR(1, ("Unknown RFC 1002 frame"));
+ cERROR(1, "Unknown RFC 1002 frame");
 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 length);
 cifs_reconnect(server);
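The open-coded sin_port assignment is replaced here by cifs_set_port(), whose definition is not part of this diff. Assuming it simply generalizes the line it replaces to both address families, its shape would be roughly:

    /* Sketch under that assumption; the real helper is defined
     * elsewhere in this patch series. */
    static void cifs_set_port(struct sockaddr *addr, unsigned short port)
    {
            switch (addr->sa_family) {
            case AF_INET:
                    ((struct sockaddr_in *)addr)->sin_port = htons(port);
                    break;
            case AF_INET6:
                    ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                    break;
            }
    }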
@@ -495,8 +481,8 @@ incomplete_rcv:
 /* else we have an SMB response */
 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
- cERROR(1, ("Invalid size SMB length %d pdu_length %d",
- length, pdu_length+4));
+ cERROR(1, "Invalid size SMB length %d pdu_length %d",
+ length, pdu_length+4);
 cifs_reconnect(server);
 csocket = server->ssocket;
 wake_up(&server->response_q);
@@ -518,8 +504,7 @@ incomplete_rcv:
 total_read += length) {
 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 pdu_length - total_read, 0);
- if ((server->tcpStatus == CifsExiting) ||
- (length == -EINTR)) {
+ if (server->tcpStatus == CifsExiting) {
 /* then will exit */
 reconnect = 2;
 break;
@@ -530,8 +515,9 @@ incomplete_rcv:
 /* Now we will reread sock */
 reconnect = 1;
 break;
- } else if ((length == -ERESTARTSYS) ||
- (length == -EAGAIN)) {
+ } else if (length == -ERESTARTSYS ||
+ length == -EAGAIN ||
+ length == -EINTR) {
 msleep(1); /* minimum sleep to prevent looping,
 allowing socket to clear and app
 threads to set tcpStatus
@@ -539,8 +525,8 @@ incomplete_rcv:
 length = 0;
 continue;
 } else if (length <= 0) {
- cERROR(1, ("Received no data, expecting %d",
- pdu_length - total_read));
+ cERROR(1, "Received no data, expecting %d",
+ pdu_length - total_read);
 cifs_reconnect(server);
 csocket = server->ssocket;
 reconnect = 1;
@@ -588,7 +574,7 @@ incomplete_rcv:
 }
 } else {
 if (!isLargeBuf) {
- cERROR(1,("1st trans2 resp needs bigbuf"));
+ cERROR(1, "1st trans2 resp needs bigbuf");
 /* BB maybe we can fix this up, switch
 to already allocated large buffer? */
 } else {
@@ -630,8 +616,8 @@ multi_t2_fnd:
 wake_up_process(task_to_wake);
 } else if (!is_valid_oplock_break(smb_buffer, server) &&
 !isMultiRsp) {
- cERROR(1, ("No task to wake, unknown frame received! "
- "NumMids %d", midCount.counter));
+ cERROR(1, "No task to wake, unknown frame received! "
+ "NumMids %d", midCount.counter);
 cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -708,8 +694,8 @@ multi_t2_fnd:
 list_for_each(tmp, &server->pending_mid_q) {
 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
- cFYI(1, ("Clearing Mid 0x%x - waking up ",
- mid_entry->mid));
+ cFYI(1, "Clearing Mid 0x%x - waking up ",
+ mid_entry->mid);
 task_to_wake = mid_entry->tsk;
 if (task_to_wake)
 wake_up_process(task_to_wake);
@@ -728,7 +714,7 @@ multi_t2_fnd:
 to wait at least 45 seconds before giving up
 on a request getting a response and going ahead
 and killing cifsd */
- cFYI(1, ("Wait for exit from demultiplex thread"));
+ cFYI(1, "Wait for exit from demultiplex thread");
 msleep(46000);
 /* if threads still have not exited they are probably never
 coming home not much else we can do but free the memory */
@@ -829,7 +815,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 /* null target name indicates to use *SMBSERVR default called name
 if we end up sending RFC1001 session initialize */
 vol->target_rfc1001_name[0] = 0;
- vol->linux_uid = current_uid(); /* use current_euid() instead? */
+ vol->cred_uid = current_uid();
+ vol->linux_uid = current_uid();
 vol->linux_gid = current_gid();

 /* default to only allowing write access to owner of the mount */
@@ -849,7 +836,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 separator[0] = options[4];
 options += 5;
 } else {
- cFYI(1, ("Null separator not allowed"));
+ cFYI(1, "Null separator not allowed");
 }
 }

@@ -974,7 +961,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 } else if (strnicmp(data, "sec", 3) == 0) {
 if (!value || !*value) {
- cERROR(1, ("no security value specified"));
+ cERROR(1, "no security value specified");
 continue;
 } else if (strnicmp(value, "krb5i", 5) == 0) {
 vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -982,7 +969,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(value, "krb5p", 5) == 0) {
 /* vol->secFlg |= CIFSSEC_MUST_SEAL |
 CIFSSEC_MAY_KRB5; */
- cERROR(1, ("Krb5 cifs privacy not supported"));
+ cERROR(1, "Krb5 cifs privacy not supported");
 return 1;
 } else if (strnicmp(value, "krb5", 4) == 0) {
 vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1014,7 +1001,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(value, "none", 4) == 0) {
 vol->nullauth = 1;
 } else {
- cERROR(1, ("bad security option: %s", value));
+ cERROR(1, "bad security option: %s", value);
 return 1;
 }
 } else if ((strnicmp(data, "unc", 3) == 0)
@@ -1053,7 +1040,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 a domain name and need special handling? */
 if (strnlen(value, 256) < 256) {
 vol->domainname = value;
- cFYI(1, ("Domain name set"));
+ cFYI(1, "Domain name set");
 } else {
 printk(KERN_WARNING "CIFS: domain name too "
 "long\n");
@@ -1076,7 +1063,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 strcpy(vol->prepath+1, value);
 } else
 strcpy(vol->prepath, value);
- cFYI(1, ("prefix path %s", vol->prepath));
+ cFYI(1, "prefix path %s", vol->prepath);
 } else {
 printk(KERN_WARNING "CIFS: prefix too long\n");
 return 1;
@@ -1092,7 +1079,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 vol->iocharset = value;
 /* if iocharset not set then load_nls_default
 is used by caller */
- cFYI(1, ("iocharset set to %s", value));
+ cFYI(1, "iocharset set to %s", value);
 } else {
 printk(KERN_WARNING "CIFS: iocharset name "
 "too long.\n");
@@ -1144,14 +1131,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 } else if (strnicmp(data, "sockopt", 5) == 0) {
 if (!value || !*value) {
- cERROR(1, ("no socket option specified"));
+ cERROR(1, "no socket option specified");
 continue;
 } else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
 vol->sockopt_tcp_nodelay = 1;
 }
 } else if (strnicmp(data, "netbiosname", 4) == 0) {
 if (!value || !*value || (*value == ' ')) {
- cFYI(1, ("invalid (empty) netbiosname"));
+ cFYI(1, "invalid (empty) netbiosname");
 } else {
 memset(vol->source_rfc1001_name, 0x20, 15);
 for (i = 0; i < 15; i++) {
@@ -1175,7 +1162,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if (strnicmp(data, "servern", 7) == 0) {
 /* servernetbiosname specified override *SMBSERVER */
 if (!value || !*value || (*value == ' ')) {
- cFYI(1, ("empty server netbiosname specified"));
+ cFYI(1, "empty server netbiosname specified");
 } else {
 /* last byte, type, is 0x20 for servr type */
 memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1256,6 +1243,12 @@ cifs_parse_mount_options(char *options, const char *devname,
 } else if ((strnicmp(data, "nocase", 6) == 0) ||
 (strnicmp(data, "ignorecase", 10) == 0)) {
 vol->nocase = 1;
+ } else if (strnicmp(data, "mand", 4) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "nomand", 6) == 0) {
+ /* ignore */
+ } else if (strnicmp(data, "_netdev", 7) == 0) {
+ /* ignore */
 } else if (strnicmp(data, "brl", 3) == 0) {
 vol->nobrl = 0;
 } else if ((strnicmp(data, "nobrl", 5) == 0) ||
@@ -1330,6 +1323,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 printk(KERN_WARNING "CIFS: Mount option noac not "
 "supported. Instead set "
 "/proc/fs/cifs/LookupCacheEnabled to 0\n");
+ } else if (strnicmp(data, "fsc", 3) == 0) {
+ vol->fsc = true;
 } else
 printk(KERN_WARNING "CIFS: Unknown mount option %s\n",
 data);
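Two kinds of option-table additions here: mand, nomand and _netdev are recognized purely so the final else branch stops warning about flags that mount(8) and fstab processing pass through, while fsc actually records state (vol->fsc) that a later hunk in setup_cifs_sb() translates into CIFS_MOUNT_FSCACHE. A stand-alone restatement of the recognize-and-ignore pattern, using the userspace strncasecmp in place of the kernel's strnicmp:

    #include <stdbool.h>
    #include <strings.h>

    /* Illustration only: a token must be consumed by some branch or
     * the parser logs "Unknown mount option" for it. */
    static bool consume_ignored_option(const char *data)
    {
            return strncasecmp(data, "mand", 4) == 0 ||
                   strncasecmp(data, "nomand", 6) == 0 ||
                   strncasecmp(data, "_netdev", 7) == 0;
    }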
@@ -1379,18 +1374,92 @@ cifs_parse_mount_options(char *options, const char *devname,
 return 0;
 }

+static bool
+match_address(struct TCP_Server_Info *server, struct sockaddr *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr !=
+ server->addr.sockAddr.sin_addr.s_addr)
+ return false;
+ if (addr4->sin_port &&
+ addr4->sin_port != server->addr.sockAddr.sin_port)
+ return false;
+ break;
+ case AF_INET6:
+ if (!ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr))
+ return false;
+ if (addr6->sin6_scope_id !=
+ server->addr.sockAddr6.sin6_scope_id)
+ return false;
+ if (addr6->sin6_port &&
+ addr6->sin6_port != server->addr.sockAddr6.sin6_port)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+static bool
+match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
+{
+ unsigned int secFlags;
+
+ if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+ secFlags = vol->secFlg;
+ else
+ secFlags = global_secflags | vol->secFlg;
+
+ switch (server->secType) {
+ case LANMAN:
+ if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
+ return false;
+ break;
+ case NTLMv2:
+ if (!(secFlags & CIFSSEC_MAY_NTLMV2))
+ return false;
+ break;
+ case NTLM:
+ if (!(secFlags & CIFSSEC_MAY_NTLM))
+ return false;
+ break;
+ case Kerberos:
+ if (!(secFlags & CIFSSEC_MAY_KRB5))
+ return false;
+ break;
+ case RawNTLMSSP:
+ if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
+ return false;
+ break;
+ default:
+ /* shouldn't happen */
+ return false;
+ }
+
+ /* now check if signing mode is acceptible */
+ if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
+ (server->secMode & SECMODE_SIGN_REQUIRED))
+ return false;
+ else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
+ (server->secMode &
+ (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+ return false;
+
+ return true;
+}
+
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
+cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 {
- struct list_head *tmp;
 struct TCP_Server_Info *server;
- struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;

 write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &cifs_tcp_ses_list) {
- server = list_entry(tmp, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
 /*
 * the demux thread can exit on its own while still in CifsNew
 * so don't accept any sockets in that state. Since the
@@ -1400,41 +1469,15 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 if (server->tcpStatus == CifsNew)
 continue;

- switch (addr->ss_family) {
- case AF_INET:
- if (addr4->sin_addr.s_addr ==
- server->addr.sockAddr.sin_addr.s_addr) {
- addr4->sin_port = htons(port);
- /* user overrode default port? */
- if (addr4->sin_port) {
- if (addr4->sin_port !=
- server->addr.sockAddr.sin_port)
- continue;
- }
- break;
- } else
- continue;
+ if (!match_address(server, addr))
+ continue;

- case AF_INET6:
- if (ipv6_addr_equal(&addr6->sin6_addr,
- &server->addr.sockAddr6.sin6_addr) &&
- (addr6->sin6_scope_id ==
- server->addr.sockAddr6.sin6_scope_id)) {
- addr6->sin6_port = htons(port);
- /* user overrode default port? */
- if (addr6->sin6_port) {
- if (addr6->sin6_port !=
- server->addr.sockAddr6.sin6_port)
- continue;
- }
- break;
- } else
- continue;
- }
+ if (!match_security(server, vol))
+ continue;

 ++server->srv_count;
 write_unlock(&cifs_tcp_ses_lock);
- cFYI(1, ("Existing tcp session with server found"));
+ cFYI(1, "Existing tcp session with server found");
 return server;
 }
 write_unlock(&cifs_tcp_ses_lock);
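With the port folded into the sockaddr by cifs_fill_sockaddr() (see the cifs_get_tcp_session() hunk below), the lookup reduces to two predicates per candidate server. A condensed sketch of how the pieces now compose, using only signatures visible in this diff and not compilable outside fs/cifs/connect.c:

    /* Sketch: find an existing server for a parsed mount request. */
    static struct TCP_Server_Info *
    find_existing_server(struct smb_vol *vol)
    {
            struct sockaddr_storage addr;

            memset(&addr, 0, sizeof(addr));
            /* treated as returning nonzero on success, matching the
             * caller below; the port lands in the sockaddr itself */
            if (!cifs_fill_sockaddr((struct sockaddr *)&addr, vol->UNCip,
                                    strlen(vol->UNCip), vol->port))
                    return NULL;

            /* match_address() and match_security() must both accept
             * the candidate before its srv_count is bumped */
            return cifs_find_tcp_session((struct sockaddr *)&addr, vol);
    }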
@@ -1459,6 +1502,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 server->tcpStatus = CifsExiting;
 spin_unlock(&GlobalMid_Lock);

+ cifs_fscache_release_client_cookie(server);
+
 task = xchg(&server->tsk, NULL);
 if (task)
 force_sig(SIGKILL, task);
@@ -1475,10 +1520,13 @@ cifs_get_tcp_session(struct smb_vol *volume_info)

 memset(&addr, 0, sizeof(struct sockaddr_storage));

- cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+ cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);

 if (volume_info->UNCip && volume_info->UNC) {
- rc = cifs_convert_address(volume_info->UNCip, &addr);
+ rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
+ volume_info->UNCip,
+ strlen(volume_info->UNCip),
+ volume_info->port);
 if (!rc) {
 /* we failed translating address */
 rc = -EINVAL;
@@ -1487,19 +1535,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 } else if (volume_info->UNCip) {
 /* BB using ip addr as tcp_ses name to connect to the
 DFS root below */
- cERROR(1, ("Connecting to DFS root not implemented yet"));
+ cERROR(1, "Connecting to DFS root not implemented yet");
 rc = -EINVAL;
 goto out_err;
 } else /* which tcp_sess DFS root would we conect to */ {
- cERROR(1,
- ("CIFS mount error: No UNC path (e.g. -o "
- "unc=//192.168.1.100/public) specified"));
+ cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+ "unc=//192.168.1.100/public) specified");
 rc = -EINVAL;
 goto out_err;
 }

 /* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
+ tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
 if (tcp_ses)
 return tcp_ses;

@@ -1540,21 +1587,19 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 ++tcp_ses->srv_count;

 if (addr.ss_family == AF_INET6) {
- cFYI(1, ("attempting ipv6 connect"));
+ cFYI(1, "attempting ipv6 connect");
 /* BB should we allow ipv6 on port 139? */
 /* other OS never observed in Wild doing 139 with v6 */
- sin_server6->sin6_port = htons(volume_info->port);
 memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
 sizeof(struct sockaddr_in6));
 rc = ipv6_connect(tcp_ses);
 } else {
- sin_server->sin_port = htons(volume_info->port);
 memcpy(&tcp_ses->addr.sockAddr, sin_server,
 sizeof(struct sockaddr_in));
 rc = ipv4_connect(tcp_ses);
 }
 if (rc < 0) {
- cERROR(1, ("Error connecting to socket. Aborting operation"));
+ cERROR(1, "Error connecting to socket. Aborting operation");
 goto out_err;
 }

@@ -1567,7 +1612,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 tcp_ses, "cifsd");
 if (IS_ERR(tcp_ses->tsk)) {
 rc = PTR_ERR(tcp_ses->tsk);
- cERROR(1, ("error %d create cifsd thread", rc));
+ cERROR(1, "error %d create cifsd thread", rc);
 module_put(THIS_MODULE);
 goto out_err;
 }
@@ -1577,6 +1622,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
 write_unlock(&cifs_tcp_ses_lock);

+ cifs_fscache_get_client_cookie(tcp_ses);
+
 return tcp_ses;

out_err:
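The two fscache hooks added in this pair of hunks bracket the life of the shared server: the client cookie is taken only after the new tcp_ses is linked into cifs_tcp_ses_list, and released in cifs_put_tcp_session() once tcpStatus has been set to CifsExiting. Their declarations, to make the pairing explicit (the bodies are assumed to live in the FS-Cache support files added elsewhere in this series, which are not part of this diff):

    struct TCP_Server_Info;

    /* paired per server lifetime: get after list insertion,
     * release after the session is marked CifsExiting */
    void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server);
    void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server);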
@@ -1591,17 +1638,29 @@ out_err:
 }

 static struct cifsSesInfo *
-cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
- struct list_head *tmp;
 struct cifsSesInfo *ses;

 write_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
- if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
- continue;
-
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ switch (server->secType) {
+ case Kerberos:
+ if (vol->cred_uid != ses->cred_uid)
+ continue;
+ break;
+ default:
+ /* anything else takes username/password */
+ if (strncmp(ses->userName, vol->username,
+ MAX_USERNAME_SIZE))
+ continue;
+ if (strlen(vol->username) != 0 &&
+ ses->password != NULL &&
+ strncmp(ses->password,
+ vol->password ? vol->password : "",
+ MAX_PASSWORD_SIZE))
+ continue;
+ }
 ++ses->ses_count;
 write_unlock(&cifs_tcp_ses_lock);
 return ses;
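Session sharing now keys on the authentication type: Kerberos sessions are matched purely by the requesting credential uid (which is why the parser hunk above starts recording vol->cred_uid), everything else by username plus, when one was given, password. The same rule restated as a stand-alone predicate, for illustration only since the loop above open-codes it:

    static bool smb_ses_matches(struct cifsSesInfo *ses, struct smb_vol *vol,
                                enum securityEnum sectype)
    {
            if (sectype == Kerberos)
                    return vol->cred_uid == ses->cred_uid;

            /* anything else takes username/password */
            if (strncmp(ses->userName, vol->username, MAX_USERNAME_SIZE))
                    return false;
            if (strlen(vol->username) != 0 && ses->password != NULL &&
                strncmp(ses->password, vol->password ? vol->password : "",
                        MAX_PASSWORD_SIZE))
                    return false;
            return true;
    }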
@@ -1616,6 +1675,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 int xid;
 struct TCP_Server_Info *server = ses->server;

+ cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 write_lock(&cifs_tcp_ses_lock);
 if (--ses->ses_count > 0) {
 write_unlock(&cifs_tcp_ses_lock);
@@ -1634,6 +1694,103 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 cifs_put_tcp_session(server);
 }

+static struct cifsSesInfo *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
+{
+ int rc = -ENOMEM, xid;
+ struct cifsSesInfo *ses;
+
+ xid = GetXid();
+
+ ses = cifs_find_smb_ses(server, volume_info);
+ if (ses) {
+ cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ /* problem -- put our ses reference */
+ cifs_put_smb_ses(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+ }
+ if (ses->need_reconnect) {
+ cFYI(1, "Session needs reconnect");
+ rc = cifs_setup_session(xid, ses,
+ volume_info->local_nls);
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ /* problem -- put our reference */
+ cifs_put_smb_ses(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+ }
+ }
+ mutex_unlock(&ses->session_mutex);
+
+ /* existing SMB ses has a server reference already */
+ cifs_put_tcp_session(server);
+ FreeXid(xid);
+ return ses;
+ }
+
+ cFYI(1, "Existing smb sess not found");
+ ses = sesInfoAlloc();
+ if (ses == NULL)
+ goto get_ses_fail;
+
+ /* new SMB session uses our server ref */
+ ses->server = server;
+ if (server->addr.sockAddr6.sin6_family == AF_INET6)
+ sprintf(ses->serverName, "%pI6",
+ &server->addr.sockAddr6.sin6_addr);
+ else
+ sprintf(ses->serverName, "%pI4",
+ &server->addr.sockAddr.sin_addr.s_addr);
+
+ if (volume_info->username)
+ strncpy(ses->userName, volume_info->username,
+ MAX_USERNAME_SIZE);
+
+ /* volume_info->password freed at unmount */
+ if (volume_info->password) {
+ ses->password = kstrdup(volume_info->password, GFP_KERNEL);
+ if (!ses->password)
+ goto get_ses_fail;
+ }
+ if (volume_info->domainname) {
+ int len = strlen(volume_info->domainname);
+ ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+ if (ses->domainName)
+ strcpy(ses->domainName, volume_info->domainname);
+ }
+ ses->cred_uid = volume_info->cred_uid;
+ ses->linux_uid = volume_info->linux_uid;
+ ses->overrideSecFlg = volume_info->secFlg;
+
+ mutex_lock(&ses->session_mutex);
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (!rc)
+ rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+ mutex_unlock(&ses->session_mutex);
+ if (rc)
+ goto get_ses_fail;
+
+ /* success, put it on the list */
+ write_lock(&cifs_tcp_ses_lock);
+ list_add(&ses->smb_ses_list, &server->smb_ses_list);
+ write_unlock(&cifs_tcp_ses_lock);
+
+ FreeXid(xid);
+ return ses;
+
+get_ses_fail:
+ sesInfoFree(ses);
+ FreeXid(xid);
+ return ERR_PTR(rc);
+}
+
 static struct cifsTconInfo *
 cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 {
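cifs_get_smb_ses() concentrates what used to be open-coded in cifs_mount(): it reuses a matching session when possible, renegotiates and reconnects it under session_mutex, and otherwise builds a new one, returning ERR_PTR(rc) on failure. The reference rules matter for callers: on reuse the extra server reference is put inside the getter, while on failure the caller still owns its server reference. The expected caller pattern appears verbatim in the cifs_mount() hunk near the end of this diff:

    pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
    if (IS_ERR(pSesInfo)) {
            rc = PTR_ERR(pSesInfo);
            pSesInfo = NULL;
            /* the srvTcp reference is still ours to drop on this path */
            goto mount_fail_check;
    }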
@@ -1662,6 +1819,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 int xid;
 struct cifsSesInfo *ses = tcon->ses;

+ cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
 write_lock(&cifs_tcp_ses_lock);
 if (--tcon->tc_count > 0) {
 write_unlock(&cifs_tcp_ses_lock);
@@ -1675,10 +1833,87 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 CIFSSMBTDis(xid, tcon);
 _FreeXid(xid);

+ cifs_fscache_release_super_cookie(tcon);
 tconInfoFree(tcon);
 cifs_put_smb_ses(ses);
 }

+static struct cifsTconInfo *
+cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+{
+ int rc, xid;
+ struct cifsTconInfo *tcon;
+
+ tcon = cifs_find_tcon(ses, volume_info->UNC);
+ if (tcon) {
+ cFYI(1, "Found match on UNC path");
+ /* existing tcon already has a reference */
+ cifs_put_smb_ses(ses);
+ if (tcon->seal != volume_info->seal)
+ cERROR(1, "transport encryption setting "
+ "conflicts with existing tid");
+ return tcon;
+ }
+
+ tcon = tconInfoAlloc();
+ if (tcon == NULL) {
+ rc = -ENOMEM;
+ goto out_fail;
+ }
+
+ tcon->ses = ses;
+ if (volume_info->password) {
+ tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
+ if (!tcon->password) {
+ rc = -ENOMEM;
+ goto out_fail;
+ }
+ }
+
+ if (strchr(volume_info->UNC + 3, '\\') == NULL
+ && strchr(volume_info->UNC + 3, '/') == NULL) {
+ cERROR(1, "Missing share name");
+ rc = -ENODEV;
+ goto out_fail;
+ }
+
+ /* BB Do we need to wrap session_mutex around
+ * this TCon call and Unix SetFS as
+ * we do on SessSetup and reconnect? */
+ xid = GetXid();
+ rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
+ FreeXid(xid);
+ cFYI(1, "CIFS Tcon rc = %d", rc);
+ if (rc)
+ goto out_fail;
+
+ if (volume_info->nodfs) {
+ tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+ cFYI(1, "DFS disabled (%d)", tcon->Flags);
+ }
+ tcon->seal = volume_info->seal;
+ /* we can have only one retry value for a connection
+ to a share so for resources mounted more than once
+ to the same server share the last value passed in
+ for the retry flag is used */
+ tcon->retry = volume_info->retry;
+ tcon->nocase = volume_info->nocase;
+ tcon->local_lease = volume_info->local_lease;
+
+ write_lock(&cifs_tcp_ses_lock);
+ list_add(&tcon->tcon_list, &ses->tcon_list);
+ write_unlock(&cifs_tcp_ses_lock);
+
+ cifs_fscache_get_super_cookie(tcon);
+
+ return tcon;
+
+out_fail:
+ tconInfoFree(tcon);
+ return ERR_PTR(rc);
+}
+
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1703,8 +1938,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 strcpy(temp_unc + 2, pSesInfo->serverName);
 strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
 rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
- cFYI(1,
- ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
+ cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
 kfree(temp_unc);
 }
 if (rc == 0)
@@ -1777,12 +2011,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 rc = sock_create_kern(PF_INET, SOCK_STREAM,
 IPPROTO_TCP, &socket);
 if (rc < 0) {
- cERROR(1, ("Error %d creating socket", rc));
+ cERROR(1, "Error %d creating socket", rc);
 return rc;
 }

 /* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, ("Socket created"));
+ cFYI(1, "Socket created");
 server->ssocket = socket;
 socket->sk->sk_allocation = GFP_NOFS;
 cifs_reclassify_socket4(socket);
@@ -1827,7 +2061,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 if (!connected) {
 if (orig_port)
 server->addr.sockAddr.sin_port = orig_port;
- cFYI(1, ("Error %d connecting to server via ipv4", rc));
+ cFYI(1, "Error %d connecting to server via ipv4", rc);
 sock_release(socket);
 server->ssocket = NULL;
 return rc;
@@ -1855,12 +2089,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 (char *)&val, sizeof(val));
 if (rc)
- cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+ cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 }

- cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+ cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
 socket->sk->sk_sndbuf,
- socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
+ socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);

 /* send RFC1001 sessinit */
 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1938,13 +2172,13 @@ ipv6_connect(struct TCP_Server_Info *server)
 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
 IPPROTO_TCP, &socket);
 if (rc < 0) {
- cERROR(1, ("Error %d creating ipv6 socket", rc));
+ cERROR(1, "Error %d creating ipv6 socket", rc);
 socket = NULL;
 return rc;
 }

 /* BB other socket options to set KEEPALIVE, NODELAY? */
- cFYI(1, ("ipv6 Socket created"));
+ cFYI(1, "ipv6 Socket created");
 server->ssocket = socket;
 socket->sk->sk_allocation = GFP_NOFS;
 cifs_reclassify_socket6(socket);
@@ -1988,7 +2222,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 if (!connected) {
 if (orig_port)
 server->addr.sockAddr6.sin6_port = orig_port;
- cFYI(1, ("Error %d connecting to server via ipv6", rc));
+ cFYI(1, "Error %d connecting to server via ipv6", rc);
 sock_release(socket);
 server->ssocket = NULL;
 return rc;
@@ -2007,7 +2241,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 (char *)&val, sizeof(val));
 if (rc)
- cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+ cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 }

 server->ssocket = socket;
@@ -2032,13 +2266,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->no_linux_ext) {
 tcon->fsUnixInfo.Capability = 0;
 tcon->unix_ext = 0; /* Unix Extensions disabled */
- cFYI(1, ("Linux protocol extensions disabled"));
+ cFYI(1, "Linux protocol extensions disabled");
 return;
 } else if (vol_info)
 tcon->unix_ext = 1; /* Unix Extensions supported */

 if (tcon->unix_ext == 0) {
- cFYI(1, ("Unix extensions disabled so not set on reconnect"));
+ cFYI(1, "Unix extensions disabled so not set on reconnect");
 return;
 }

@@ -2054,12 +2288,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cERROR(1, ("POSIXPATH support change"));
+ cERROR(1, "POSIXPATH support change");
 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
- cERROR(1, ("possible reconnect error"));
- cERROR(1,
- ("server disabled POSIX path support"));
+ cERROR(1, "possible reconnect error");
+ cERROR(1, "server disabled POSIX path support");
 }
 }

@@ -2067,7 +2300,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->no_psx_acl)
 cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
- cFYI(1, ("negotiated posix acl support"));
+ cFYI(1, "negotiated posix acl support");
 if (sb)
 sb->s_flags |= MS_POSIXACL;
 }
@@ -2075,7 +2308,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (vol_info && vol_info->posix_paths == 0)
 cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
- cFYI(1, ("negotiate posix pathnames"));
+ cFYI(1, "negotiate posix pathnames");
 if (sb)
 CIFS_SB(sb)->mnt_cifs_flags |=
 CIFS_MOUNT_POSIX_PATHS;
@@ -2090,39 +2323,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
 if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
 CIFS_SB(sb)->rsize = 127 * 1024;
- cFYI(DBG2,
- ("larger reads not supported by srv"));
+ cFYI(DBG2, "larger reads not supported by srv");
 }
 }


- cFYI(1, ("Negotiate caps 0x%x", (int)cap));
+ cFYI(1, "Negotiate caps 0x%x", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
 if (cap & CIFS_UNIX_FCNTL_CAP)
- cFYI(1, ("FCNTL cap"));
+ cFYI(1, "FCNTL cap");
 if (cap & CIFS_UNIX_EXTATTR_CAP)
- cFYI(1, ("EXTATTR cap"));
+ cFYI(1, "EXTATTR cap");
 if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
- cFYI(1, ("POSIX path cap"));
+ cFYI(1, "POSIX path cap");
 if (cap & CIFS_UNIX_XATTR_CAP)
- cFYI(1, ("XATTR cap"));
+ cFYI(1, "XATTR cap");
 if (cap & CIFS_UNIX_POSIX_ACL_CAP)
- cFYI(1, ("POSIX ACL cap"));
+ cFYI(1, "POSIX ACL cap");
 if (cap & CIFS_UNIX_LARGE_READ_CAP)
- cFYI(1, ("very large read cap"));
+ cFYI(1, "very large read cap");
 if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
- cFYI(1, ("very large write cap"));
+ cFYI(1, "very large write cap");
 #endif /* CIFS_DEBUG2 */
 if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
 if (vol_info == NULL) {
- cFYI(1, ("resetting capabilities failed"));
+ cFYI(1, "resetting capabilities failed");
 } else
- cERROR(1, ("Negotiating Unix capabilities "
+ cERROR(1, "Negotiating Unix capabilities "
 "with the server failed. Consider "
 "mounting with the Unix Extensions\n"
 "disabled, if problems are found, "
 "by specifying the nounix mount "
- "option."));
+ "option.");

 }
 }
@@ -2152,8 +2384,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 struct cifs_sb_info *cifs_sb)
 {
 if (pvolume_info->rsize > CIFSMaxBufSize) {
- cERROR(1, ("rsize %d too large, using MaxBufSize",
- pvolume_info->rsize));
+ cERROR(1, "rsize %d too large, using MaxBufSize",
+ pvolume_info->rsize);
 cifs_sb->rsize = CIFSMaxBufSize;
 } else if ((pvolume_info->rsize) &&
 (pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2162,8 +2394,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->rsize = CIFSMaxBufSize;

 if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
- cERROR(1, ("wsize %d too large, using 4096 instead",
- pvolume_info->wsize));
+ cERROR(1, "wsize %d too large, using 4096 instead",
+ pvolume_info->wsize);
 cifs_sb->wsize = 4096;
 } else if (pvolume_info->wsize)
 cifs_sb->wsize = pvolume_info->wsize;
@@ -2181,7 +2413,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 if (cifs_sb->rsize < 2048) {
 cifs_sb->rsize = 2048;
 /* Windows ME may prefer this */
- cFYI(1, ("readsize set to minimum: 2048"));
+ cFYI(1, "readsize set to minimum: 2048");
 }
 /* calculate prepath */
 cifs_sb->prepath = pvolume_info->prepath;
@@ -2199,8 +2431,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->mnt_gid = pvolume_info->linux_gid;
 cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cFYI(1, ("file mode: 0x%x dir mode: 0x%x",
- cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+ cFYI(1, "file mode: 0x%x dir mode: 0x%x",
+ cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);

 if (pvolume_info->noperm)
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2228,14 +2460,16 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
 if (pvolume_info->dynperm)
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+ if (pvolume_info->fsc)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE;
 if (pvolume_info->direct_io) {
- cFYI(1, ("mounting share using direct i/o"));
+ cFYI(1, "mounting share using direct i/o");
 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 }

 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
- cERROR(1, ("mount option dynperm ignored if cifsacl "
- "mount option supported"));
+ cERROR(1, "mount option dynperm ignored if cifsacl "
+ "mount option supported");
 }

 static int
@@ -2262,7 +2496,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
 {
 struct smb_vol *volume_info;

- if (!pvolume_info && !*pvolume_info)
+ if (!pvolume_info || !*pvolume_info)
 return;

 volume_info = *pvolume_info;
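The one-character change above is a real bug fix, not churn: with &&, a NULL pvolume_info makes the guard go on to evaluate !*pvolume_info and dereference the NULL pointer, while a valid pvolume_info holding a NULL smb_vol slips past the guard entirely and is dereferenced a few lines later. With || the short-circuit works in the caller's favor. In miniature:

    /* old: if (!p && !*p)
     *   p == NULL  -> !p is true, so !*p is evaluated: NULL deref
     *   *p == NULL -> !p is false, whole test false: no early return
     * new: if (!p || !*p)
     *   *p is only read once p is known non-NULL, and either
     *   empty case returns early as intended
     */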
@@ -2344,11 +2578,11 @@ try_mount_again:
2344 } 2578 }
2345 2579
2346 if (volume_info->nullauth) { 2580 if (volume_info->nullauth) {
2347 cFYI(1, ("null user")); 2581 cFYI(1, "null user");
2348 volume_info->username = ""; 2582 volume_info->username = "";
2349 } else if (volume_info->username) { 2583 } else if (volume_info->username) {
2350 /* BB fixme parse for domain name here */ 2584 /* BB fixme parse for domain name here */
2351 cFYI(1, ("Username: %s", volume_info->username)); 2585 cFYI(1, "Username: %s", volume_info->username);
2352 } else { 2586 } else {
2353 cifserror("No username specified"); 2587 cifserror("No username specified");
2354 /* In userspace mount helper we can get user name from alternate 2588 /* In userspace mount helper we can get user name from alternate
@@ -2357,20 +2591,20 @@ try_mount_again:
2357 goto out; 2591 goto out;
2358 } 2592 }
2359 2593
2360
2361 /* this is needed for ASCII cp to Unicode converts */ 2594 /* this is needed for ASCII cp to Unicode converts */
2362 if (volume_info->iocharset == NULL) { 2595 if (volume_info->iocharset == NULL) {
2363 cifs_sb->local_nls = load_nls_default(); 2596 /* load_nls_default cannot return null */
2364 /* load_nls_default can not return null */ 2597 volume_info->local_nls = load_nls_default();
2365 } else { 2598 } else {
2366 cifs_sb->local_nls = load_nls(volume_info->iocharset); 2599 volume_info->local_nls = load_nls(volume_info->iocharset);
2367 if (cifs_sb->local_nls == NULL) { 2600 if (volume_info->local_nls == NULL) {
2368 cERROR(1, ("CIFS mount error: iocharset %s not found", 2601 cERROR(1, "CIFS mount error: iocharset %s not found",
2369 volume_info->iocharset)); 2602 volume_info->iocharset);
2370 rc = -ELIBACC; 2603 rc = -ELIBACC;
2371 goto out; 2604 goto out;
2372 } 2605 }
2373 } 2606 }
2607 cifs_sb->local_nls = volume_info->local_nls;
2374 2608
2375 /* get a reference to a tcp session */ 2609 /* get a reference to a tcp session */
2376 srvTcp = cifs_get_tcp_session(volume_info); 2610 srvTcp = cifs_get_tcp_session(volume_info);
@@ -2379,148 +2613,30 @@ try_mount_again:
2379 goto out; 2613 goto out;
2380 } 2614 }
2381 2615
2382 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username); 2616 /* get a reference to a SMB session */
2383 if (pSesInfo) { 2617 pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
2384 cFYI(1, ("Existing smb sess found (status=%d)", 2618 if (IS_ERR(pSesInfo)) {
2385 pSesInfo->status)); 2619 rc = PTR_ERR(pSesInfo);
2386 /* 2620 pSesInfo = NULL;
2387 * The existing SMB session already has a reference to srvTcp, 2621 goto mount_fail_check;
2388 * so we can put back the extra one we got before
2389 */
2390 cifs_put_tcp_session(srvTcp);
2391
2392 mutex_lock(&pSesInfo->session_mutex);
2393 if (pSesInfo->need_reconnect) {
2394 cFYI(1, ("Session needs reconnect"));
2395 rc = cifs_setup_session(xid, pSesInfo,
2396 cifs_sb->local_nls);
2397 }
2398 mutex_unlock(&pSesInfo->session_mutex);
2399 } else if (!rc) {
2400 cFYI(1, ("Existing smb sess not found"));
2401 pSesInfo = sesInfoAlloc();
2402 if (pSesInfo == NULL) {
2403 rc = -ENOMEM;
2404 goto mount_fail_check;
2405 }
2406
2407 /* new SMB session uses our srvTcp ref */
2408 pSesInfo->server = srvTcp;
2409 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2410 sprintf(pSesInfo->serverName, "%pI6",
2411 &srvTcp->addr.sockAddr6.sin6_addr);
2412 else
2413 sprintf(pSesInfo->serverName, "%pI4",
2414 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2415
2416 write_lock(&cifs_tcp_ses_lock);
2417 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2418 write_unlock(&cifs_tcp_ses_lock);
2419
2420 /* volume_info->password freed at unmount */
2421 if (volume_info->password) {
2422 pSesInfo->password = kstrdup(volume_info->password,
2423 GFP_KERNEL);
2424 if (!pSesInfo->password) {
2425 rc = -ENOMEM;
2426 goto mount_fail_check;
2427 }
2428 }
2429 if (volume_info->username)
2430 strncpy(pSesInfo->userName, volume_info->username,
2431 MAX_USERNAME_SIZE);
2432 if (volume_info->domainname) {
2433 int len = strlen(volume_info->domainname);
2434 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2435 if (pSesInfo->domainName)
2436 strcpy(pSesInfo->domainName,
2437 volume_info->domainname);
2438 }
2439 pSesInfo->linux_uid = volume_info->linux_uid;
2440 pSesInfo->overrideSecFlg = volume_info->secFlg;
2441 mutex_lock(&pSesInfo->session_mutex);
2442
2443 /* BB FIXME need to pass vol->secFlgs BB */
2444 rc = cifs_setup_session(xid, pSesInfo,
2445 cifs_sb->local_nls);
2446 mutex_unlock(&pSesInfo->session_mutex);
2447 } 2622 }
2448 2623
2449 /* search for existing tcon to this server share */ 2624 setup_cifs_sb(volume_info, cifs_sb);
2450 if (!rc) { 2625 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2451 setup_cifs_sb(volume_info, cifs_sb); 2626 sb->s_maxbytes = MAX_LFS_FILESIZE;
2452 2627 else
2453 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC); 2628 sb->s_maxbytes = MAX_NON_LFS;
2454 if (tcon) {
2455 cFYI(1, ("Found match on UNC path"));
2456 /* existing tcon already has a reference */
2457 cifs_put_smb_ses(pSesInfo);
2458 if (tcon->seal != volume_info->seal)
2459 cERROR(1, ("transport encryption setting "
2460 "conflicts with existing tid"));
2461 } else {
2462 tcon = tconInfoAlloc();
2463 if (tcon == NULL) {
2464 rc = -ENOMEM;
2465 goto mount_fail_check;
2466 }
2467
2468 tcon->ses = pSesInfo;
2469 if (volume_info->password) {
2470 tcon->password = kstrdup(volume_info->password,
2471 GFP_KERNEL);
2472 if (!tcon->password) {
2473 rc = -ENOMEM;
2474 goto mount_fail_check;
2475 }
2476 }
2477
2478 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2479 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2480 cERROR(1, ("Missing share name"));
2481 rc = -ENODEV;
2482 goto mount_fail_check;
2483 } else {
2484 /* BB Do we need to wrap sesSem around
2485 * this TCon call and Unix SetFS as
2486 * we do on SessSetup and reconnect? */
2487 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2488 tcon, cifs_sb->local_nls);
2489 cFYI(1, ("CIFS Tcon rc = %d", rc));
2490 if (volume_info->nodfs) {
2491 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2492 cFYI(1, ("DFS disabled (%d)",
2493 tcon->Flags));
2494 }
2495 }
2496 if (rc)
2497 goto remote_path_check;
2498 tcon->seal = volume_info->seal;
2499 write_lock(&cifs_tcp_ses_lock);
2500 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2501 write_unlock(&cifs_tcp_ses_lock);
2502 }
2503
2504 /* we can have only one retry value for a connection
2505 to a share so for resources mounted more than once
2506 to the same server share the last value passed in
2507 for the retry flag is used */
2508 tcon->retry = volume_info->retry;
2509 tcon->nocase = volume_info->nocase;
2510 tcon->local_lease = volume_info->local_lease;
2511 }
2512 if (pSesInfo) {
2513 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2514 sb->s_maxbytes = MAX_LFS_FILESIZE;
2515 else
2516 sb->s_maxbytes = MAX_NON_LFS;
2517 }
2518 2629
2519 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2630 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
2520 sb->s_time_gran = 100; 2631 sb->s_time_gran = 100;
2521 2632
2522 if (rc) 2633 /* search for existing tcon to this server share */
2634 tcon = cifs_get_tcon(pSesInfo, volume_info);
2635 if (IS_ERR(tcon)) {
2636 rc = PTR_ERR(tcon);
2637 tcon = NULL;
2523 goto remote_path_check; 2638 goto remote_path_check;
2639 }
2524 2640
2525 cifs_sb->tcon = tcon; 2641 cifs_sb->tcon = tcon;
2526 2642
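
Editorial note: the hunks above replace the hand-rolled find/alloc session code with cifs_get_smb_ses() and cifs_get_tcon(), which return ERR_PTR-encoded errnos instead of a NULL pointer plus a separate rc. A minimal sketch of that convention follows; cifs_get_smb_ses() is real, find_or_create_ses() is a hypothetical stand-in for its body:

#include <linux/err.h>

struct cifsSesInfo *get_ses_sketch(struct TCP_Server_Info *server)
{
	struct cifsSesInfo *ses;

	ses = find_or_create_ses(server);	/* hypothetical helper */
	if (!ses)
		return ERR_PTR(-ENOMEM);	/* errno travels inside the pointer */
	return ses;
}

/* caller side, mirroring the new mount path above: */
	pSesInfo = get_ses_sketch(srvTcp);
	if (IS_ERR(pSesInfo)) {
		rc = PTR_ERR(pSesInfo);		/* recover the negative errno */
		pSesInfo = NULL;		/* never free an error pointer */
		goto mount_fail_check;
	}
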
@@ -2544,7 +2660,7 @@ try_mount_again:
2544 2660
2545 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) { 2661 if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
2546 cifs_sb->rsize = 1024 * 127; 2662 cifs_sb->rsize = 1024 * 127;
2547 cFYI(DBG2, ("no very large read support, rsize now 127K")); 2663 cFYI(DBG2, "no very large read support, rsize now 127K");
2548 } 2664 }
2549 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X)) 2665 if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
2550 cifs_sb->wsize = min(cifs_sb->wsize, 2666 cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2593,7 +2709,7 @@ remote_path_check:
2593 goto mount_fail_check; 2709 goto mount_fail_check;
2594 } 2710 }
2595 2711
2596 cFYI(1, ("Getting referral for: %s", full_path)); 2712 cFYI(1, "Getting referral for: %s", full_path);
2597 rc = get_dfs_path(xid, pSesInfo , full_path + 1, 2713 rc = get_dfs_path(xid, pSesInfo , full_path + 1,
2598 cifs_sb->local_nls, &num_referrals, &referrals, 2714 cifs_sb->local_nls, &num_referrals, &referrals,
2599 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 2715 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2707,7 +2823,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2707 by Samba (not sure whether other servers allow 2823 by Samba (not sure whether other servers allow
2708 NTLMv2 password here) */ 2824 NTLMv2 password here) */
2709#ifdef CONFIG_CIFS_WEAK_PW_HASH 2825#ifdef CONFIG_CIFS_WEAK_PW_HASH
2710 if ((extended_security & CIFSSEC_MAY_LANMAN) && 2826 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
2711 (ses->server->secType == LANMAN)) 2827 (ses->server->secType == LANMAN))
2712 calc_lanman_hash(tcon->password, ses->server->cryptKey, 2828 calc_lanman_hash(tcon->password, ses->server->cryptKey,
2713 ses->server->secMode & 2829 ses->server->secMode &
@@ -2778,13 +2894,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2778 if (length == 3) { 2894 if (length == 3) {
2779 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && 2895 if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
2780 (bcc_ptr[2] == 'C')) { 2896 (bcc_ptr[2] == 'C')) {
2781 cFYI(1, ("IPC connection")); 2897 cFYI(1, "IPC connection");
2782 tcon->ipc = 1; 2898 tcon->ipc = 1;
2783 } 2899 }
2784 } else if (length == 2) { 2900 } else if (length == 2) {
2785 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { 2901 if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
2786 /* the most common case */ 2902 /* the most common case */
2787 cFYI(1, ("disk share connection")); 2903 cFYI(1, "disk share connection");
2788 } 2904 }
2789 } 2905 }
2790 bcc_ptr += length + 1; 2906 bcc_ptr += length + 1;
@@ -2797,7 +2913,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2797 bytes_left, is_unicode, 2913 bytes_left, is_unicode,
2798 nls_codepage); 2914 nls_codepage);
2799 2915
2800 cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem)); 2916 cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
2801 2917
2802 if ((smb_buffer_response->WordCount == 3) || 2918 if ((smb_buffer_response->WordCount == 3) ||
2803 (smb_buffer_response->WordCount == 7)) 2919 (smb_buffer_response->WordCount == 7))
@@ -2805,7 +2921,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2805 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); 2921 tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
2806 else 2922 else
2807 tcon->Flags = 0; 2923 tcon->Flags = 0;
2808 cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); 2924 cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
2809 } else if ((rc == 0) && tcon == NULL) { 2925 } else if ((rc == 0) && tcon == NULL) {
2810 /* all we need to save for IPC$ connection */ 2926 /* all we need to save for IPC$ connection */
2811 ses->ipc_tid = smb_buffer_response->Tid; 2927 ses->ipc_tid = smb_buffer_response->Tid;
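
Editorial note: the pervasive cFYI(1, ("fmt", args)) to cFYI(1, "fmt", args) rewrites throughout this diff reflect the CIFS debug macros becoming variadic. A sketch of the form the converted call sites assume; the real cFYI/cERROR definitions live in fs/cifs/cifs_debug.h and also honour the cifsFYI tunable, so treat this as illustrative only:

#define cFYI_sketch(set, fmt, ...)					\
do {									\
	if (set)							\
		printk(KERN_DEBUG "CIFS: " fmt "\n", ##__VA_ARGS__);	\
} while (0)

/* Before: format and args had to hide inside one macro argument, hence
 * the doubled parentheses:  cFYI(1, ("Tcon flags: 0x%x", flags));
 * After: the macro is variadic:  cFYI(1, "Tcon flags: 0x%x", flags);  */
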
@@ -2833,57 +2949,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
2833 return rc; 2949 return rc;
2834} 2950}
2835 2951
2836int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, 2952int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
2837 struct nls_table *nls_info)
2838{ 2953{
2839 int rc = 0; 2954 int rc = 0;
2840 int first_time = 0; 2955 struct TCP_Server_Info *server = ses->server;
2841 struct TCP_Server_Info *server = pSesInfo->server; 2956
2842 2957 /* only send once per connect */
2843 /* what if server changes its buffer size after dropping the session? */ 2958 if (server->maxBuf != 0)
2844 if (server->maxBuf == 0) /* no need to send on reconnect */ { 2959 return 0;
2845 rc = CIFSSMBNegotiate(xid, pSesInfo); 2960
2846 if (rc == -EAGAIN) { 2961 rc = CIFSSMBNegotiate(xid, ses);
2847 /* retry only once on 1st time connection */ 2962 if (rc == -EAGAIN) {
2848 rc = CIFSSMBNegotiate(xid, pSesInfo); 2963 /* retry only once on 1st time connection */
2849 if (rc == -EAGAIN) 2964 rc = CIFSSMBNegotiate(xid, ses);
2850 rc = -EHOSTDOWN; 2965 if (rc == -EAGAIN)
2851 } 2966 rc = -EHOSTDOWN;
2852 if (rc == 0) { 2967 }
2853 spin_lock(&GlobalMid_Lock); 2968 if (rc == 0) {
2854 if (server->tcpStatus != CifsExiting) 2969 spin_lock(&GlobalMid_Lock);
2855 server->tcpStatus = CifsGood; 2970 if (server->tcpStatus != CifsExiting)
2856 else 2971 server->tcpStatus = CifsGood;
2857 rc = -EHOSTDOWN; 2972 else
2858 spin_unlock(&GlobalMid_Lock); 2973 rc = -EHOSTDOWN;
2974 spin_unlock(&GlobalMid_Lock);
2859 2975
2860 }
2861 first_time = 1;
2862 } 2976 }
2863 2977
2864 if (rc) 2978 return rc;
2865 goto ss_err_exit; 2979}
2980
2981
2982int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
2983 struct nls_table *nls_info)
2984{
2985 int rc = 0;
2986 struct TCP_Server_Info *server = ses->server;
2866 2987
2867 pSesInfo->flags = 0; 2988 ses->flags = 0;
2868 pSesInfo->capabilities = server->capabilities; 2989 ses->capabilities = server->capabilities;
2869 if (linuxExtEnabled == 0) 2990 if (linuxExtEnabled == 0)
2870 pSesInfo->capabilities &= (~CAP_UNIX); 2991 ses->capabilities &= (~CAP_UNIX);
2871 2992
2872 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 2993 cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
2873 server->secMode, server->capabilities, server->timeAdj)); 2994 server->secMode, server->capabilities, server->timeAdj);
2874 2995
2875 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 2996 rc = CIFS_SessSetup(xid, ses, nls_info);
2876 if (rc) { 2997 if (rc) {
2877 cERROR(1, ("Send error in SessSetup = %d", rc)); 2998 cERROR(1, "Send error in SessSetup = %d", rc);
2878 } else { 2999 } else {
2879 cFYI(1, ("CIFS Session Established successfully")); 3000 cFYI(1, "CIFS Session Established successfully");
2880 spin_lock(&GlobalMid_Lock); 3001 spin_lock(&GlobalMid_Lock);
2881 pSesInfo->status = CifsGood; 3002 ses->status = CifsGood;
2882 pSesInfo->need_reconnect = false; 3003 ses->need_reconnect = false;
2883 spin_unlock(&GlobalMid_Lock); 3004 spin_unlock(&GlobalMid_Lock);
2884 } 3005 }
2885 3006
2886ss_err_exit:
2887 return rc; 3007 return rc;
2888} 3008}
2889 3009
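
Editorial note: the old cifs_setup_session() both negotiated the protocol and set up the SMB session, tracking "first_time" by hand. The new code splits these: cifs_negotiate_protocol() is idempotent per TCP connection, keyed off server->maxBuf, which stays zero until a NEGOTIATE succeeds. A condensed sketch of that guard (the tcpStatus update under GlobalMid_Lock is omitted here):

int negotiate_once_sketch(unsigned int xid, struct cifsSesInfo *ses)
{
	struct TCP_Server_Info *server = ses->server;
	int rc;

	if (server->maxBuf != 0)	/* this socket already negotiated */
		return 0;

	rc = CIFSSMBNegotiate(xid, ses);
	if (rc == -EAGAIN)		/* retry once on a 1st time connection */
		rc = CIFSSMBNegotiate(xid, ses);
	if (rc == -EAGAIN)
		rc = -EHOSTDOWN;
	return rc;			/* caller then runs session setup */
}
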
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e9f7ecc2714b..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/file.h>
28#include "cifsfs.h" 29#include "cifsfs.h"
29#include "cifspdu.h" 30#include "cifspdu.h"
30#include "cifsglob.h" 31#include "cifsglob.h"
@@ -73,7 +74,7 @@ cifs_bp_rename_retry:
73 namelen += (1 + temp->d_name.len); 74 namelen += (1 + temp->d_name.len);
74 temp = temp->d_parent; 75 temp = temp->d_parent;
75 if (temp == NULL) { 76 if (temp == NULL) {
76 cERROR(1, ("corrupt dentry")); 77 cERROR(1, "corrupt dentry");
77 return NULL; 78 return NULL;
78 } 79 }
79 } 80 }
@@ -90,19 +91,18 @@ cifs_bp_rename_retry:
90 full_path[namelen] = dirsep; 91 full_path[namelen] = dirsep;
91 strncpy(full_path + namelen + 1, temp->d_name.name, 92 strncpy(full_path + namelen + 1, temp->d_name.name,
92 temp->d_name.len); 93 temp->d_name.len);
93 cFYI(0, ("name: %s", full_path + namelen)); 94 cFYI(0, "name: %s", full_path + namelen);
94 } 95 }
95 temp = temp->d_parent; 96 temp = temp->d_parent;
96 if (temp == NULL) { 97 if (temp == NULL) {
97 cERROR(1, ("corrupt dentry")); 98 cERROR(1, "corrupt dentry");
98 kfree(full_path); 99 kfree(full_path);
99 return NULL; 100 return NULL;
100 } 101 }
101 } 102 }
102 if (namelen != pplen + dfsplen) { 103 if (namelen != pplen + dfsplen) {
103 cERROR(1, 104 cERROR(1, "did not end path lookup where expected namelen is %d",
104 ("did not end path lookup where expected namelen is %d", 105 namelen);
105 namelen));
106 /* presumably this is only possible if racing with a rename 106 /* presumably this is only possible if racing with a rename
107 of one of the parent directories (we can not lock the dentries 107 of one of the parent directories (we can not lock the dentries
108 above us to prevent this, but retrying should be harmless) */ 108 above us to prevent this, but retrying should be harmless) */
@@ -157,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
157 mutex_init(&pCifsFile->lock_mutex); 157 mutex_init(&pCifsFile->lock_mutex);
158 INIT_LIST_HEAD(&pCifsFile->llist); 158 INIT_LIST_HEAD(&pCifsFile->llist);
159 atomic_set(&pCifsFile->count, 1); 159 atomic_set(&pCifsFile->count, 1);
160 slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); 160 INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
161 161
162 write_lock(&GlobalSMBSeslock); 162 write_lock(&GlobalSMBSeslock);
163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); 163 list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
@@ -173,26 +173,28 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 173 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
174 pCifsInode->clientCanCacheAll = true; 174 pCifsInode->clientCanCacheAll = true;
175 pCifsInode->clientCanCacheRead = true; 175 pCifsInode->clientCanCacheRead = true;
176 cFYI(1, ("Exclusive Oplock inode %p", newinode)); 176 cFYI(1, "Exclusive Oplock inode %p", newinode);
177 } else if ((oplock & 0xF) == OPLOCK_READ) 177 } else if ((oplock & 0xF) == OPLOCK_READ)
178 pCifsInode->clientCanCacheRead = true; 178 pCifsInode->clientCanCacheRead = true;
179 } 179 }
180 write_unlock(&GlobalSMBSeslock); 180 write_unlock(&GlobalSMBSeslock);
181 181
182 file->private_data = pCifsFile;
183
182 return pCifsFile; 184 return pCifsFile;
183} 185}
184 186
185int cifs_posix_open(char *full_path, struct inode **pinode, 187int cifs_posix_open(char *full_path, struct inode **pinode,
186 struct vfsmount *mnt, int mode, int oflags, 188 struct super_block *sb, int mode, int oflags,
187 __u32 *poplock, __u16 *pnetfid, int xid) 189 __u32 *poplock, __u16 *pnetfid, int xid)
188{ 190{
189 int rc; 191 int rc;
190 FILE_UNIX_BASIC_INFO *presp_data; 192 FILE_UNIX_BASIC_INFO *presp_data;
191 __u32 posix_flags = 0; 193 __u32 posix_flags = 0;
192 struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); 194 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
193 struct cifs_fattr fattr; 195 struct cifs_fattr fattr;
194 196
195 cFYI(1, ("posix open %s", full_path)); 197 cFYI(1, "posix open %s", full_path);
196 198
197 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 199 presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
198 if (presp_data == NULL) 200 if (presp_data == NULL)
@@ -242,7 +244,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
242 244
243 /* get new inode and set it up */ 245 /* get new inode and set it up */
244 if (*pinode == NULL) { 246 if (*pinode == NULL) {
245 *pinode = cifs_iget(mnt->mnt_sb, &fattr); 247 cifs_fill_uniqueid(sb, &fattr);
248 *pinode = cifs_iget(sb, &fattr);
246 if (!*pinode) { 249 if (!*pinode) {
247 rc = -ENOMEM; 250 rc = -ENOMEM;
248 goto posix_open_ret; 251 goto posix_open_ret;
@@ -251,8 +254,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
251 cifs_fattr_to_inode(*pinode, &fattr); 254 cifs_fattr_to_inode(*pinode, &fattr);
252 } 255 }
253 256
254 cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
255
256posix_open_ret: 257posix_open_ret:
257 kfree(presp_data); 258 kfree(presp_data);
258 return rc; 259 return rc;
@@ -280,7 +281,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
280 int create_options = CREATE_NOT_DIR; 281 int create_options = CREATE_NOT_DIR;
281 __u32 oplock = 0; 282 __u32 oplock = 0;
282 int oflags; 283 int oflags;
283 bool posix_create = false;
284 /* 284 /*
285 * BB below access is probably too much for mknod to request 285 * BB below access is probably too much for mknod to request
286 * but we have to do query and setpathinfo so requesting 286 * but we have to do query and setpathinfo so requesting
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
305 full_path = build_path_from_dentry(direntry); 305 full_path = build_path_from_dentry(direntry);
306 if (full_path == NULL) { 306 if (full_path == NULL) {
307 rc = -ENOMEM; 307 rc = -ENOMEM;
308 FreeXid(xid); 308 goto cifs_create_out;
309 return rc;
310 } 309 }
311 310
312 if (oplockEnabled) 311 if (oplockEnabled)
@@ -315,20 +314,19 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
315 if (nd && (nd->flags & LOOKUP_OPEN)) 314 if (nd && (nd->flags & LOOKUP_OPEN))
316 oflags = nd->intent.open.flags; 315 oflags = nd->intent.open.flags;
317 else 316 else
318 oflags = FMODE_READ; 317 oflags = FMODE_READ | SMB_O_CREAT;
319 318
320 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && 319 if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
321 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 320 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
322 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 321 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
323 rc = cifs_posix_open(full_path, &newinode, nd->path.mnt, 322 rc = cifs_posix_open(full_path, &newinode,
324 mode, oflags, &oplock, &fileHandle, xid); 323 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
325 /* EIO could indicate that (posix open) operation is not 324 /* EIO could indicate that (posix open) operation is not
326 supported, despite what server claimed in capability 325 supported, despite what server claimed in capability
327 negotiation. EREMOTE indicates DFS junction, which is not 326 negotiation. EREMOTE indicates DFS junction, which is not
328 handled in posix open */ 327 handled in posix open */
329 328
330 if (rc == 0) { 329 if (rc == 0) {
331 posix_create = true;
332 if (newinode == NULL) /* query inode info */ 330 if (newinode == NULL) /* query inode info */
333 goto cifs_create_get_file_info; 331 goto cifs_create_get_file_info;
334 else /* success, no need to query */ 332 else /* success, no need to query */
@@ -358,7 +356,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
358 else if ((oflags & O_CREAT) == O_CREAT) 356 else if ((oflags & O_CREAT) == O_CREAT)
359 disposition = FILE_OPEN_IF; 357 disposition = FILE_OPEN_IF;
360 else 358 else
361 cFYI(1, ("Create flag not set in create function")); 359 cFYI(1, "Create flag not set in create function");
362 } 360 }
363 361
364 /* BB add processing to set equivalent of mode - e.g. via CreateX with 362 /* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -366,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
366 364
367 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 365 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
368 if (buf == NULL) { 366 if (buf == NULL) {
369 kfree(full_path); 367 rc = -ENOMEM;
370 FreeXid(xid); 368 goto cifs_create_out;
371 return -ENOMEM;
372 } 369 }
373 370
374 /* 371 /*
@@ -394,7 +391,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
394 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 391 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
395 } 392 }
396 if (rc) { 393 if (rc) {
397 cFYI(1, ("cifs_create returned 0x%x", rc)); 394 cFYI(1, "cifs_create returned 0x%x", rc);
398 goto cifs_create_out; 395 goto cifs_create_out;
399 } 396 }
400 397
@@ -457,16 +454,30 @@ cifs_create_set_dentry:
457 if (rc == 0) 454 if (rc == 0)
458 setup_cifs_dentry(tcon, direntry, newinode); 455 setup_cifs_dentry(tcon, direntry, newinode);
459 else 456 else
460 cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); 457 cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
458
459 if (newinode && nd && (nd->flags & LOOKUP_OPEN)) {
460 struct cifsFileInfo *pfile_info;
461 struct file *filp;
462
463 filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
464 if (IS_ERR(filp)) {
465 rc = PTR_ERR(filp);
466 CIFSSMBClose(xid, tcon, fileHandle);
467 goto cifs_create_out;
468 }
461 469
462 /* nfsd case - nfs srv does not set nd */ 470 pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp,
463 if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { 471 nd->path.mnt, oflags);
464 /* mknod case - do not leave file open */ 472 if (pfile_info == NULL) {
473 fput(filp);
474 CIFSSMBClose(xid, tcon, fileHandle);
475 rc = -ENOMEM;
476 }
477 } else {
465 CIFSSMBClose(xid, tcon, fileHandle); 478 CIFSSMBClose(xid, tcon, fileHandle);
466 } else if (!(posix_create) && (newinode)) {
467 cifs_new_fileinfo(newinode, fileHandle, NULL,
468 nd->path.mnt, oflags);
469 } 479 }
480
470cifs_create_out: 481cifs_create_out:
471 kfree(buf); 482 kfree(buf);
472 kfree(full_path); 483 kfree(full_path);
@@ -483,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
483 struct cifsTconInfo *pTcon; 494 struct cifsTconInfo *pTcon;
484 char *full_path = NULL; 495 char *full_path = NULL;
485 struct inode *newinode = NULL; 496 struct inode *newinode = NULL;
497 int oplock = 0;
498 u16 fileHandle;
499 FILE_ALL_INFO *buf = NULL;
500 unsigned int bytes_written;
501 struct win_dev *pdev;
486 502
487 if (!old_valid_dev(device_number)) 503 if (!old_valid_dev(device_number))
488 return -EINVAL; 504 return -EINVAL;
@@ -493,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
493 pTcon = cifs_sb->tcon; 509 pTcon = cifs_sb->tcon;
494 510
495 full_path = build_path_from_dentry(direntry); 511 full_path = build_path_from_dentry(direntry);
496 if (full_path == NULL) 512 if (full_path == NULL) {
497 rc = -ENOMEM; 513 rc = -ENOMEM;
498 else if (pTcon->unix_ext) { 514 goto mknod_out;
515 }
516
517 if (pTcon->unix_ext) {
499 struct cifs_unix_set_info_args args = { 518 struct cifs_unix_set_info_args args = {
500 .mode = mode & ~current_umask(), 519 .mode = mode & ~current_umask(),
501 .ctime = NO_CHANGE_64, 520 .ctime = NO_CHANGE_64,
@@ -514,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
514 cifs_sb->local_nls, 533 cifs_sb->local_nls,
515 cifs_sb->mnt_cifs_flags & 534 cifs_sb->mnt_cifs_flags &
516 CIFS_MOUNT_MAP_SPECIAL_CHR); 535 CIFS_MOUNT_MAP_SPECIAL_CHR);
536 if (rc)
537 goto mknod_out;
517 538
518 if (!rc) { 539 rc = cifs_get_inode_info_unix(&newinode, full_path,
519 rc = cifs_get_inode_info_unix(&newinode, full_path,
520 inode->i_sb, xid); 540 inode->i_sb, xid);
521 if (pTcon->nocase) 541 if (pTcon->nocase)
522 direntry->d_op = &cifs_ci_dentry_ops; 542 direntry->d_op = &cifs_ci_dentry_ops;
523 else 543 else
524 direntry->d_op = &cifs_dentry_ops; 544 direntry->d_op = &cifs_dentry_ops;
525 if (rc == 0)
526 d_instantiate(direntry, newinode);
527 }
528 } else {
529 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
530 int oplock = 0;
531 u16 fileHandle;
532 FILE_ALL_INFO *buf;
533 545
534 cFYI(1, ("sfu compat create special file")); 546 if (rc == 0)
547 d_instantiate(direntry, newinode);
548 goto mknod_out;
549 }
535 550
536 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 551 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
537 if (buf == NULL) { 552 goto mknod_out;
538 kfree(full_path);
539 rc = -ENOMEM;
540 FreeXid(xid);
541 return rc;
542 }
543 553
544 rc = CIFSSMBOpen(xid, pTcon, full_path, 554
545 FILE_CREATE, /* fail if exists */ 555 cFYI(1, "sfu compat create special file");
546 GENERIC_WRITE /* BB would 556
547 WRITE_OWNER | WRITE_DAC be better? */, 557 buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
548 /* Create a file and set the 558 if (buf == NULL) {
549 file attribute to SYSTEM */ 559 kfree(full_path);
550 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, 560 rc = -ENOMEM;
551 &fileHandle, &oplock, buf, 561 FreeXid(xid);
552 cifs_sb->local_nls, 562 return rc;
553 cifs_sb->mnt_cifs_flags &
554 CIFS_MOUNT_MAP_SPECIAL_CHR);
555
556 /* BB FIXME - add handling for backlevel servers
557 which need legacy open and check for all
558 calls to SMBOpen for fallback to SMBLegacyOpen */
559 if (!rc) {
560 /* BB Do not bother to decode buf since no
561 local inode yet to put timestamps in,
562 but we can reuse it safely */
563 unsigned int bytes_written;
564 struct win_dev *pdev;
565 pdev = (struct win_dev *)buf;
566 if (S_ISCHR(mode)) {
567 memcpy(pdev->type, "IntxCHR", 8);
568 pdev->major =
569 cpu_to_le64(MAJOR(device_number));
570 pdev->minor =
571 cpu_to_le64(MINOR(device_number));
572 rc = CIFSSMBWrite(xid, pTcon,
573 fileHandle,
574 sizeof(struct win_dev),
575 0, &bytes_written, (char *)pdev,
576 NULL, 0);
577 } else if (S_ISBLK(mode)) {
578 memcpy(pdev->type, "IntxBLK", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } /* else if(S_ISFIFO */
589 CIFSSMBClose(xid, pTcon, fileHandle);
590 d_drop(direntry);
591 }
592 kfree(buf);
593 /* add code here to set EAs */
594 }
595 } 563 }
596 564
565 /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
566 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
567 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
568 &fileHandle, &oplock, buf, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (rc)
571 goto mknod_out;
572
573 /* BB Do not bother to decode buf since no local inode yet to put
574 * timestamps in, but we can reuse it safely */
575
576 pdev = (struct win_dev *)buf;
577 if (S_ISCHR(mode)) {
578 memcpy(pdev->type, "IntxCHR", 8);
579 pdev->major =
580 cpu_to_le64(MAJOR(device_number));
581 pdev->minor =
582 cpu_to_le64(MINOR(device_number));
583 rc = CIFSSMBWrite(xid, pTcon,
584 fileHandle,
585 sizeof(struct win_dev),
586 0, &bytes_written, (char *)pdev,
587 NULL, 0);
588 } else if (S_ISBLK(mode)) {
589 memcpy(pdev->type, "IntxBLK", 8);
590 pdev->major =
591 cpu_to_le64(MAJOR(device_number));
592 pdev->minor =
593 cpu_to_le64(MINOR(device_number));
594 rc = CIFSSMBWrite(xid, pTcon,
595 fileHandle,
596 sizeof(struct win_dev),
597 0, &bytes_written, (char *)pdev,
598 NULL, 0);
599 } /* else if (S_ISFIFO) */
600 CIFSSMBClose(xid, pTcon, fileHandle);
601 d_drop(direntry);
602
603 /* FIXME: add code here to set EAs */
604
605mknod_out:
597 kfree(full_path); 606 kfree(full_path);
607 kfree(buf);
598 FreeXid(xid); 608 FreeXid(xid);
599 return rc; 609 return rc;
600} 610}
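
Editorial note: the cifs_mknod() rewrite above flattens three nesting levels into early "goto mknod_out" exits with a single cleanup block. Because kfree(NULL) is a no-op, pointers initialised to NULL can be freed unconditionally at the label. The shape of the idiom, with a hypothetical allocator standing in for build_path_from_dentry():

#include <linux/slab.h>

int single_exit_sketch(void)
{
	char *full_path = NULL;		/* NULL so kfree() is always safe */
	void *buf = NULL;
	int rc = 0;

	full_path = alloc_path();	/* hypothetical allocator */
	if (full_path == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	buf = kmalloc(64, GFP_KERNEL);
	if (buf == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	/* ... device-node creation work ... */
out:
	kfree(full_path);		/* kfree(NULL) is a no-op */
	kfree(buf);
	return rc;
}
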
@@ -610,14 +620,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
610 bool posix_open = false; 620 bool posix_open = false;
611 struct cifs_sb_info *cifs_sb; 621 struct cifs_sb_info *cifs_sb;
612 struct cifsTconInfo *pTcon; 622 struct cifsTconInfo *pTcon;
623 struct cifsFileInfo *cfile;
613 struct inode *newInode = NULL; 624 struct inode *newInode = NULL;
614 char *full_path = NULL; 625 char *full_path = NULL;
615 struct file *filp; 626 struct file *filp;
616 627
617 xid = GetXid(); 628 xid = GetXid();
618 629
619 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p", 630 cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
620 parent_dir_inode, direntry->d_name.name, direntry)); 631 parent_dir_inode, direntry->d_name.name, direntry);
621 632
622 /* check whether path exists */ 633 /* check whether path exists */
623 634
@@ -632,7 +643,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
632 int i; 643 int i;
633 for (i = 0; i < direntry->d_name.len; i++) 644 for (i = 0; i < direntry->d_name.len; i++)
634 if (direntry->d_name.name[i] == '\\') { 645 if (direntry->d_name.name[i] == '\\') {
635 cFYI(1, ("Invalid file name")); 646 cFYI(1, "Invalid file name");
636 FreeXid(xid); 647 FreeXid(xid);
637 return ERR_PTR(-EINVAL); 648 return ERR_PTR(-EINVAL);
638 } 649 }
@@ -657,11 +668,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
657 } 668 }
658 669
659 if (direntry->d_inode != NULL) { 670 if (direntry->d_inode != NULL) {
660 cFYI(1, ("non-NULL inode in lookup")); 671 cFYI(1, "non-NULL inode in lookup");
661 } else { 672 } else {
662 cFYI(1, ("NULL inode in lookup")); 673 cFYI(1, "NULL inode in lookup");
663 } 674 }
664 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); 675 cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
665 676
666 /* Posix open is only called (at lookup time) for file create now. 677 /* Posix open is only called (at lookup time) for file create now.
667 * For opens (rather than creates), because we do not know if it 678 * For opens (rather than creates), because we do not know if it
@@ -677,7 +688,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
677 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && 688 if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
678 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && 689 (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
679 (nd->intent.open.flags & O_CREAT)) { 690 (nd->intent.open.flags & O_CREAT)) {
680 rc = cifs_posix_open(full_path, &newInode, nd->path.mnt, 691 rc = cifs_posix_open(full_path, &newInode,
692 parent_dir_inode->i_sb,
681 nd->intent.open.create_mode, 693 nd->intent.open.create_mode,
682 nd->intent.open.flags, &oplock, 694 nd->intent.open.flags, &oplock,
683 &fileHandle, xid); 695 &fileHandle, xid);
@@ -706,8 +718,25 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
706 else 718 else
707 direntry->d_op = &cifs_dentry_ops; 719 direntry->d_op = &cifs_dentry_ops;
708 d_add(direntry, newInode); 720 d_add(direntry, newInode);
709 if (posix_open) 721 if (posix_open) {
710 filp = lookup_instantiate_filp(nd, direntry, NULL); 722 filp = lookup_instantiate_filp(nd, direntry,
723 generic_file_open);
724 if (IS_ERR(filp)) {
725 rc = PTR_ERR(filp);
726 CIFSSMBClose(xid, pTcon, fileHandle);
727 goto lookup_out;
728 }
729
730 cfile = cifs_new_fileinfo(newInode, fileHandle, filp,
731 nd->path.mnt,
732 nd->intent.open.flags);
733 if (cfile == NULL) {
734 fput(filp);
735 CIFSSMBClose(xid, pTcon, fileHandle);
736 rc = -ENOMEM;
737 goto lookup_out;
738 }
739 }
711 /* since paths are not looked up by component - the parent 740 /* since paths are not looked up by component - the parent
712 directories are presumed to be good here */ 741 directories are presumed to be good here */
713 renew_parental_timestamps(direntry); 742 renew_parental_timestamps(direntry);
@@ -723,11 +752,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
723 /* if it was once a directory (but how can we tell?) we could do 752 /* if it was once a directory (but how can we tell?) we could do
724 shrink_dcache_parent(direntry); */ 753 shrink_dcache_parent(direntry); */
725 } else if (rc != -EACCES) { 754 } else if (rc != -EACCES) {
726 cERROR(1, ("Unexpected lookup error %d", rc)); 755 cERROR(1, "Unexpected lookup error %d", rc);
727 /* We special case check for Access Denied - since that 756 /* We special case check for Access Denied - since that
728 is a common return code */ 757 is a common return code */
729 } 758 }
730 759
760lookup_out:
731 kfree(full_path); 761 kfree(full_path);
732 FreeXid(xid); 762 FreeXid(xid);
733 return ERR_PTR(rc); 763 return ERR_PTR(rc);
@@ -742,8 +772,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
742 if (cifs_revalidate_dentry(direntry)) 772 if (cifs_revalidate_dentry(direntry))
743 return 0; 773 return 0;
744 } else { 774 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 775 cFYI(1, "neg dentry 0x%p name = %s",
746 direntry, direntry->d_name.name)); 776 direntry, direntry->d_name.name);
747 if (time_after(jiffies, direntry->d_time + HZ) || 777 if (time_after(jiffies, direntry->d_time + HZ) ||
748 !lookupCacheEnabled) { 778 !lookupCacheEnabled) {
749 d_drop(direntry); 779 d_drop(direntry);
@@ -758,7 +788,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
758{ 788{
759 int rc = 0; 789 int rc = 0;
760 790
761 cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name)); 791 cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
762 792
763 return rc; 793 return rc;
764} */ 794} */
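
Editorial note: both cifs_create() and cifs_lookup() now materialise the struct file at lookup time via lookup_instantiate_filp() and then attach a cifsFileInfo; on failure, each step releases exactly what was acquired before it. The unwind ordering, condensed from the hunks above (names follow the diff; this is not a drop-in function):

	filp = lookup_instantiate_filp(nd, direntry, generic_file_open);
	if (IS_ERR(filp)) {
		rc = PTR_ERR(filp);
		CIFSSMBClose(xid, tcon, fileHandle); /* only the server open exists */
		goto out;
	}

	cfile = cifs_new_fileinfo(newinode, fileHandle, filp,
				  nd->path.mnt, oflags);
	if (cfile == NULL) {
		fput(filp);				/* drop the struct file first */
		CIFSSMBClose(xid, tcon, fileHandle);	/* then the server handle */
		rc = -ENOMEM;
		goto out;
	}
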
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 6f8a0e3fb25b..0eb87026cad3 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
4 * Copyright (c) 2007 Igor Mammedov 4 * Copyright (c) 2007 Igor Mammedov
5 * Author(s): Igor Mammedov (niallain@gmail.com) 5 * Author(s): Igor Mammedov (niallain@gmail.com)
6 * Steve French (sfrench@us.ibm.com) 6 * Steve French (sfrench@us.ibm.com)
7 * Wang Lei (wang840925@gmail.com)
8 * David Howells (dhowells@redhat.com)
7 * 9 *
8 * Contains the CIFS DFS upcall routines used for hostname to 10 * Contains the CIFS DFS upcall routines used for hostname to
9 * IP address translation. 11 * IP address translation.
@@ -24,145 +26,73 @@
24 */ 26 */
25 27
26#include <linux/slab.h> 28#include <linux/slab.h>
27#include <keys/user-type.h> 29#include <linux/dns_resolver.h>
28#include "dns_resolve.h" 30#include "dns_resolve.h"
29#include "cifsglob.h" 31#include "cifsglob.h"
30#include "cifsproto.h" 32#include "cifsproto.h"
31#include "cifs_debug.h" 33#include "cifs_debug.h"
32 34
33/* Checks if supplied name is IP address 35/**
34 * returns: 36 * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
35 * 1 - name is IP 37 * @unc: UNC path specifying the server
36 * 0 - name is not IP 38 * @ip_addr: Where to return the IP address.
37 */ 39 *
38static int 40 * The IP address will be returned in string form, and the caller is
39is_ip(char *name) 41 * responsible for freeing it.
40{ 42 *
41 struct sockaddr_storage ss; 43 * Returns length of result on success, -ve on error.
42
43 return cifs_convert_address(name, &ss);
44}
45
46static int
47dns_resolver_instantiate(struct key *key, const void *data,
48 size_t datalen)
49{
50 int rc = 0;
51 char *ip;
52
53 ip = kmalloc(datalen + 1, GFP_KERNEL);
54 if (!ip)
55 return -ENOMEM;
56
57 memcpy(ip, data, datalen);
58 ip[datalen] = '\0';
59
60 /* make sure this looks like an address */
61 if (!is_ip(ip)) {
62 kfree(ip);
63 return -EINVAL;
64 }
65
66 key->type_data.x[0] = datalen;
67 key->payload.data = ip;
68
69 return rc;
70}
71
72static void
73dns_resolver_destroy(struct key *key)
74{
75 kfree(key->payload.data);
76}
77
78struct key_type key_type_dns_resolver = {
79 .name = "dns_resolver",
80 .def_datalen = sizeof(struct in_addr),
81 .describe = user_describe,
82 .instantiate = dns_resolver_instantiate,
83 .destroy = dns_resolver_destroy,
84 .match = user_match,
85};
86
87/* Resolves server name to ip address.
88 * input:
89 * unc - server UNC
90 * output:
91 * *ip_addr - pointer to server ip, caller responcible for freeing it.
92 * return 0 on success
93 */ 44 */
94int 45int
95dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) 46dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
96{ 47{
97 int rc = -EAGAIN; 48 struct sockaddr_storage ss;
98 struct key *rkey = ERR_PTR(-EAGAIN); 49 const char *hostname, *sep;
99 char *name; 50 char *name;
100 char *data = NULL; 51 int len, rc;
101 int len;
102 52
103 if (!ip_addr || !unc) 53 if (!ip_addr || !unc)
104 return -EINVAL; 54 return -EINVAL;
105 55
106 /* search for server name delimiter */
107 len = strlen(unc); 56 len = strlen(unc);
108 if (len < 3) { 57 if (len < 3) {
109 cFYI(1, ("%s: unc is too short: %s", __func__, unc)); 58 cFYI(1, "%s: unc is too short: %s", __func__, unc);
110 return -EINVAL; 59 return -EINVAL;
111 } 60 }
61
62 /* Discount leading slashes for cifs */
112 len -= 2; 63 len -= 2;
113 name = memchr(unc+2, '\\', len); 64 hostname = unc + 2;
114 if (!name) { 65
115 cFYI(1, ("%s: probably server name is whole unc: %s", 66 /* Search for server name delimiter */
116 __func__, unc)); 67 sep = memchr(hostname, '\\', len);
117 } else { 68 if (sep)
118 len = (name - unc) - 2/* leading // */; 69 len = sep - unc;
119 } 70 else
71 cFYI(1, "%s: probably server name is whole unc: %s",
72 __func__, unc);
73
74 /* Try to interpret hostname as an IPv4 or IPv6 address */
75 rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
76 if (rc > 0)
77 goto name_is_IP_address;
78
79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname);
84 else
85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr);
87 return rc;
120 88
121 name = kmalloc(len+1, GFP_KERNEL); 89name_is_IP_address:
122 if (!name) { 90 name = kmalloc(len + 1, GFP_KERNEL);
123 rc = -ENOMEM; 91 if (!name)
124 return rc; 92 return -ENOMEM;
125 } 93 memcpy(name, hostname, len);
126 memcpy(name, unc+2, len);
127 name[len] = 0; 94 name[len] = 0;
128 95 cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
129 if (is_ip(name)) { 96 *ip_addr = name;
130 cFYI(1, ("%s: it is IP, skipping dns upcall: %s", 97 return 0;
131 __func__, name));
132 data = name;
133 goto skip_upcall;
134 }
135
136 rkey = request_key(&key_type_dns_resolver, name, "");
137 if (!IS_ERR(rkey)) {
138 len = rkey->type_data.x[0];
139 data = rkey->payload.data;
140 } else {
141 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
142 goto out;
143 }
144
145skip_upcall:
146 if (data) {
147 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
148 if (*ip_addr) {
149 memcpy(*ip_addr, data, len + 1);
150 if (!IS_ERR(rkey))
151 cFYI(1, ("%s: resolved: %s to %s", __func__,
152 name,
153 *ip_addr
154 ));
155 rc = 0;
156 } else {
157 rc = -ENOMEM;
158 }
159 if (!IS_ERR(rkey))
160 key_put(rkey);
161 }
162
163out:
164 kfree(name);
165 return rc;
166} 98}
167
168
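
Editorial note: the rewrite above drops the private dns_resolver key type in favour of the kernel's shared resolver upcall, dns_query() from <linux/dns_resolver.h>. As in the new call in the diff, a NULL type requests a plain hostname-to-address lookup; the result is a kmalloc'd string the caller must free, and the return value is its length. A hedged usage sketch (hostname and length are illustrative):

#include <linux/dns_resolver.h>
#include <linux/slab.h>

	char *ip = NULL;
	int rc;

	rc = dns_query(NULL, "server.example.com", 18, NULL, &ip, NULL);
	if (rc < 0)
		return rc;	/* upcall failed or the name did not resolve */

	/* rc is the length of the address string in ip */
	kfree(ip);
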
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 966e9288930b..d3f5d27f4d06 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
24#define _DNS_RESOLVE_H 24#define _DNS_RESOLVE_H
25 25
26#ifdef __KERNEL__ 26#ifdef __KERNEL__
27#include <linux/key-type.h>
28extern struct key_type key_type_dns_resolver;
29extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); 27extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
30#endif /* KERNEL */ 28#endif /* KERNEL */
31 29
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16a..993f82045bf6 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
49static struct dentry *cifs_get_parent(struct dentry *dentry) 49static struct dentry *cifs_get_parent(struct dentry *dentry)
50{ 50{
51 /* BB need to add code here eventually to enable export via NFSD */ 51 /* BB need to add code here eventually to enable export via NFSD */
52 cFYI(1, ("get parent for %p", dentry)); 52 cFYI(1, "get parent for %p", dentry);
53 return ERR_PTR(-EACCES); 53 return ERR_PTR(-EACCES);
54} 54}
55 55
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9b11a8f56f3a..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * vfs operations that deal with files 4 * vfs operations that deal with files
5 * 5 *
6 * Copyright (C) International Business Machines Corp., 2002,2007 6 * Copyright (C) International Business Machines Corp., 2002,2010
7 * Author(s): Steve French (sfrench@us.ibm.com) 7 * Author(s): Steve French (sfrench@us.ibm.com)
8 * Jeremy Allison (jra@samba.org) 8 * Jeremy Allison (jra@samba.org)
9 * 9 *
@@ -40,6 +40,7 @@
40#include "cifs_unicode.h" 40#include "cifs_unicode.h"
41#include "cifs_debug.h" 41#include "cifs_debug.h"
42#include "cifs_fs_sb.h" 42#include "cifs_fs_sb.h"
43#include "fscache.h"
43 44
44static inline int cifs_convert_flags(unsigned int flags) 45static inline int cifs_convert_flags(unsigned int flags)
45{ 46{
@@ -108,8 +109,7 @@ static inline int cifs_get_disposition(unsigned int flags)
108/* all arguments to this function must be checked for validity in caller */ 109/* all arguments to this function must be checked for validity in caller */
109static inline int 110static inline int
110cifs_posix_open_inode_helper(struct inode *inode, struct file *file, 111cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
111 struct cifsInodeInfo *pCifsInode, 112 struct cifsInodeInfo *pCifsInode, __u32 oplock,
112 struct cifsFileInfo *pCifsFile, __u32 oplock,
113 u16 netfid) 113 u16 netfid)
114{ 114{
115 115
@@ -136,15 +136,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 136 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
137 (file->f_path.dentry->d_inode->i_size == 137 (file->f_path.dentry->d_inode->i_size ==
138 (loff_t)le64_to_cpu(buf->EndOfFile))) { 138 (loff_t)le64_to_cpu(buf->EndOfFile))) {
139 cFYI(1, ("inode unchanged on server")); 139 cFYI(1, "inode unchanged on server");
140 } else { 140 } else {
141 if (file->f_path.dentry->d_inode->i_mapping) { 141 if (file->f_path.dentry->d_inode->i_mapping) {
142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 142 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
143 if (rc != 0) 143 if (rc != 0)
144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 144 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
145 } 145 }
146 cFYI(1, ("invalidating remote inode since open detected it " 146 cFYI(1, "invalidating remote inode since open detected it "
147 "changed")); 147 "changed");
148 invalidate_remote_inode(file->f_path.dentry->d_inode); 148 invalidate_remote_inode(file->f_path.dentry->d_inode);
149 } */ 149 } */
150 150
@@ -152,8 +152,8 @@ psx_client_can_cache:
152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 152 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
153 pCifsInode->clientCanCacheAll = true; 153 pCifsInode->clientCanCacheAll = true;
154 pCifsInode->clientCanCacheRead = true; 154 pCifsInode->clientCanCacheRead = true;
155 cFYI(1, ("Exclusive Oplock granted on inode %p", 155 cFYI(1, "Exclusive Oplock granted on inode %p",
156 file->f_path.dentry->d_inode)); 156 file->f_path.dentry->d_inode);
157 } else if ((oplock & 0xF) == OPLOCK_READ) 157 } else if ((oplock & 0xF) == OPLOCK_READ)
158 pCifsInode->clientCanCacheRead = true; 158 pCifsInode->clientCanCacheRead = true;
159 159
@@ -163,44 +163,12 @@ psx_client_can_cache:
163 return 0; 163 return 0;
164} 164}
165 165
166static struct cifsFileInfo *
167cifs_fill_filedata(struct file *file)
168{
169 struct list_head *tmp;
170 struct cifsFileInfo *pCifsFile = NULL;
171 struct cifsInodeInfo *pCifsInode = NULL;
172
173 /* search inode for this file and fill in file->private_data */
174 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
175 read_lock(&GlobalSMBSeslock);
176 list_for_each(tmp, &pCifsInode->openFileList) {
177 pCifsFile = list_entry(tmp, struct cifsFileInfo, flist);
178 if ((pCifsFile->pfile == NULL) &&
179 (pCifsFile->pid == current->tgid)) {
180 /* mode set in cifs_create */
181
182 /* needed for writepage */
183 pCifsFile->pfile = file;
184 file->private_data = pCifsFile;
185 break;
186 }
187 }
188 read_unlock(&GlobalSMBSeslock);
189
190 if (file->private_data != NULL) {
191 return pCifsFile;
192 } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
193 cERROR(1, ("could not find file instance for "
194 "new file %p", file));
195 return NULL;
196}
197
198/* all arguments to this function must be checked for validity in caller */ 166/* all arguments to this function must be checked for validity in caller */
199static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, 167static inline int cifs_open_inode_helper(struct inode *inode,
200 struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
201 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf, 168 struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
202 char *full_path, int xid) 169 char *full_path, int xid)
203{ 170{
171 struct cifsInodeInfo *pCifsInode = CIFS_I(inode);
204 struct timespec temp; 172 struct timespec temp;
205 int rc; 173 int rc;
206 174
@@ -214,36 +182,35 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
214 /* if not oplocked, invalidate inode pages if mtime or file 182 /* if not oplocked, invalidate inode pages if mtime or file
215 size changed */ 183 size changed */
216 temp = cifs_NTtimeToUnix(buf->LastWriteTime); 184 temp = cifs_NTtimeToUnix(buf->LastWriteTime);
217 if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && 185 if (timespec_equal(&inode->i_mtime, &temp) &&
218 (file->f_path.dentry->d_inode->i_size == 186 (inode->i_size ==
219 (loff_t)le64_to_cpu(buf->EndOfFile))) { 187 (loff_t)le64_to_cpu(buf->EndOfFile))) {
220 cFYI(1, ("inode unchanged on server")); 188 cFYI(1, "inode unchanged on server");
221 } else { 189 } else {
222 if (file->f_path.dentry->d_inode->i_mapping) { 190 if (inode->i_mapping) {
223 /* BB no need to lock inode until after invalidate 191 /* BB no need to lock inode until after invalidate
224 since namei code should already have it locked? */ 192 since namei code should already have it locked? */
225 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 193 rc = filemap_write_and_wait(inode->i_mapping);
226 if (rc != 0) 194 if (rc != 0)
227 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 195 pCifsInode->write_behind_rc = rc;
228 } 196 }
229 cFYI(1, ("invalidating remote inode since open detected it " 197 cFYI(1, "invalidating remote inode since open detected it "
230 "changed")); 198 "changed");
231 invalidate_remote_inode(file->f_path.dentry->d_inode); 199 invalidate_remote_inode(inode);
232 } 200 }
233 201
234client_can_cache: 202client_can_cache:
235 if (pTcon->unix_ext) 203 if (pTcon->unix_ext)
236 rc = cifs_get_inode_info_unix(&file->f_path.dentry->d_inode, 204 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
237 full_path, inode->i_sb, xid); 205 xid);
238 else 206 else
239 rc = cifs_get_inode_info(&file->f_path.dentry->d_inode, 207 rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
240 full_path, buf, inode->i_sb, xid, NULL); 208 xid, NULL);
241 209
242 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) { 210 if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
243 pCifsInode->clientCanCacheAll = true; 211 pCifsInode->clientCanCacheAll = true;
244 pCifsInode->clientCanCacheRead = true; 212 pCifsInode->clientCanCacheRead = true;
245 cFYI(1, ("Exclusive Oplock granted on inode %p", 213 cFYI(1, "Exclusive Oplock granted on inode %p", inode);
246 file->f_path.dentry->d_inode));
247 } else if ((*oplock & 0xF) == OPLOCK_READ) 214 } else if ((*oplock & 0xF) == OPLOCK_READ)
248 pCifsInode->clientCanCacheRead = true; 215 pCifsInode->clientCanCacheRead = true;
249 216
@@ -257,7 +224,7 @@ int cifs_open(struct inode *inode, struct file *file)
257 __u32 oplock; 224 __u32 oplock;
258 struct cifs_sb_info *cifs_sb; 225 struct cifs_sb_info *cifs_sb;
259 struct cifsTconInfo *tcon; 226 struct cifsTconInfo *tcon;
260 struct cifsFileInfo *pCifsFile; 227 struct cifsFileInfo *pCifsFile = NULL;
261 struct cifsInodeInfo *pCifsInode; 228 struct cifsInodeInfo *pCifsInode;
262 char *full_path = NULL; 229 char *full_path = NULL;
263 int desiredAccess; 230 int desiredAccess;
@@ -271,22 +238,15 @@ int cifs_open(struct inode *inode, struct file *file)
271 tcon = cifs_sb->tcon; 238 tcon = cifs_sb->tcon;
272 239
273 pCifsInode = CIFS_I(file->f_path.dentry->d_inode); 240 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
274 pCifsFile = cifs_fill_filedata(file);
275 if (pCifsFile) {
276 rc = 0;
277 FreeXid(xid);
278 return rc;
279 }
280 241
281 full_path = build_path_from_dentry(file->f_path.dentry); 242 full_path = build_path_from_dentry(file->f_path.dentry);
282 if (full_path == NULL) { 243 if (full_path == NULL) {
283 rc = -ENOMEM; 244 rc = -ENOMEM;
284 FreeXid(xid); 245 goto out;
285 return rc;
286 } 246 }
287 247
288 cFYI(1, ("inode = 0x%p file flags are 0x%x for %s", 248 cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
289 inode, file->f_flags, full_path)); 249 inode, file->f_flags, full_path);
290 250
291 if (oplockEnabled) 251 if (oplockEnabled)
292 oplock = REQ_OPLOCK; 252 oplock = REQ_OPLOCK;
@@ -298,27 +258,42 @@ int cifs_open(struct inode *inode, struct file *file)
298 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 258 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
299 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 259 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
300 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 260 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
261 oflags |= SMB_O_CREAT;
301 /* can not refresh inode info since size could be stale */ 262 /* can not refresh inode info since size could be stale */
302 rc = cifs_posix_open(full_path, &inode, file->f_path.mnt, 263 rc = cifs_posix_open(full_path, &inode, inode->i_sb,
303 cifs_sb->mnt_file_mode /* ignored */, 264 cifs_sb->mnt_file_mode /* ignored */,
304 oflags, &oplock, &netfid, xid); 265 oflags, &oplock, &netfid, xid);
305 if (rc == 0) { 266 if (rc == 0) {
306 cFYI(1, ("posix open succeeded")); 267 cFYI(1, "posix open succeeded");
307 /* no need for special case handling of setting mode 268 /* no need for special case handling of setting mode
308 on read only files needed here */ 269 on read only files needed here */
309 270
310 pCifsFile = cifs_fill_filedata(file); 271 rc = cifs_posix_open_inode_helper(inode, file,
311 cifs_posix_open_inode_helper(inode, file, pCifsInode, 272 pCifsInode, oplock, netfid);
312 pCifsFile, oplock, netfid); 273 if (rc != 0) {
274 CIFSSMBClose(xid, tcon, netfid);
275 goto out;
276 }
277
278 pCifsFile = cifs_new_fileinfo(inode, netfid, file,
279 file->f_path.mnt,
280 oflags);
281 if (pCifsFile == NULL) {
282 CIFSSMBClose(xid, tcon, netfid);
283 rc = -ENOMEM;
284 }
285
286 cifs_fscache_set_inode_cookie(inode, file);
287
313 goto out; 288 goto out;
314 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 289 } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
315 if (tcon->ses->serverNOS) 290 if (tcon->ses->serverNOS)
316 cERROR(1, ("server %s of type %s returned" 291 cERROR(1, "server %s of type %s returned"
317 " unexpected error on SMB posix open" 292 " unexpected error on SMB posix open"
318 ", disabling posix open support." 293 ", disabling posix open support."
319 " Check if server update available.", 294 " Check if server update available.",
320 tcon->ses->serverName, 295 tcon->ses->serverName,
321 tcon->ses->serverNOS)); 296 tcon->ses->serverNOS);
322 tcon->broken_posix_open = true; 297 tcon->broken_posix_open = true;
323 } else if ((rc != -EIO) && (rc != -EREMOTE) && 298 } else if ((rc != -EIO) && (rc != -EREMOTE) &&
324 (rc != -EOPNOTSUPP)) /* path not found or net err */ 299 (rc != -EOPNOTSUPP)) /* path not found or net err */
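
Editorial note: cifs_fill_filedata() could go away because cifs_new_fileinfo() now sets file->private_data itself (see the dir.c hunk earlier), so cifs_open() simply builds the fileinfo and, new in this release, attaches an FS-Cache cookie via the fscache.h hook included at the top of file.c. The successful posix-open tail, condensed from the hunk above:

	pCifsFile = cifs_new_fileinfo(inode, netfid, file,
				      file->f_path.mnt, oflags);
	if (pCifsFile == NULL) {
		CIFSSMBClose(xid, tcon, netfid);	/* undo the server open */
		rc = -ENOMEM;
	}

	cifs_fscache_set_inode_cookie(inode, file);	/* new fscache hook */
	goto out;
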
@@ -386,20 +361,22 @@ int cifs_open(struct inode *inode, struct file *file)
386 & CIFS_MOUNT_MAP_SPECIAL_CHR); 361 & CIFS_MOUNT_MAP_SPECIAL_CHR);
387 } 362 }
388 if (rc) { 363 if (rc) {
389 cFYI(1, ("cifs_open returned 0x%x", rc)); 364 cFYI(1, "cifs_open returned 0x%x", rc);
390 goto out; 365 goto out;
391 } 366 }
392 367
368 rc = cifs_open_inode_helper(inode, tcon, &oplock, buf, full_path, xid);
369 if (rc != 0)
370 goto out;
371
393 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, 372 pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
394 file->f_flags); 373 file->f_flags);
395 file->private_data = pCifsFile; 374 if (pCifsFile == NULL) {
396 if (file->private_data == NULL) {
397 rc = -ENOMEM; 375 rc = -ENOMEM;
398 goto out; 376 goto out;
399 } 377 }
400 378
401 rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon, 379 cifs_fscache_set_inode_cookie(inode, file);
402 &oplock, buf, full_path, xid);
403 380
404 if (oplock & CIFS_CREATE_ACTION) { 381 if (oplock & CIFS_CREATE_ACTION) {
405 /* time to set mode which we can not set earlier due to 382 /* time to set mode which we can not set earlier due to
@@ -455,7 +432,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
455 __u16 netfid; 432 __u16 netfid;
456 433
457 if (file->private_data) 434 if (file->private_data)
458 pCifsFile = (struct cifsFileInfo *)file->private_data; 435 pCifsFile = file->private_data;
459 else 436 else
460 return -EBADF; 437 return -EBADF;
461 438
@@ -469,7 +446,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
469 } 446 }
470 447
471 if (file->f_path.dentry == NULL) { 448 if (file->f_path.dentry == NULL) {
472 cERROR(1, ("no valid name if dentry freed")); 449 cERROR(1, "no valid name if dentry freed");
473 dump_stack(); 450 dump_stack();
474 rc = -EBADF; 451 rc = -EBADF;
475 goto reopen_error_exit; 452 goto reopen_error_exit;
@@ -477,7 +454,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
477 454
478 inode = file->f_path.dentry->d_inode; 455 inode = file->f_path.dentry->d_inode;
479 if (inode == NULL) { 456 if (inode == NULL) {
480 cERROR(1, ("inode not valid")); 457 cERROR(1, "inode not valid");
481 dump_stack(); 458 dump_stack();
482 rc = -EBADF; 459 rc = -EBADF;
483 goto reopen_error_exit; 460 goto reopen_error_exit;
@@ -499,8 +476,8 @@ reopen_error_exit:
499 return rc; 476 return rc;
500 } 477 }
501 478
502 cFYI(1, ("inode = 0x%p file flags 0x%x for %s", 479 cFYI(1, "inode = 0x%p file flags 0x%x for %s",
503 inode, file->f_flags, full_path)); 480 inode, file->f_flags, full_path);
504 481
505 if (oplockEnabled) 482 if (oplockEnabled)
506 oplock = REQ_OPLOCK; 483 oplock = REQ_OPLOCK;
@@ -512,11 +489,11 @@ reopen_error_exit:
512 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 489 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
513 int oflags = (int) cifs_posix_convert_flags(file->f_flags); 490 int oflags = (int) cifs_posix_convert_flags(file->f_flags);
514 /* can not refresh inode info since size could be stale */ 491 /* can not refresh inode info since size could be stale */
515 rc = cifs_posix_open(full_path, NULL, file->f_path.mnt, 492 rc = cifs_posix_open(full_path, NULL, inode->i_sb,
516 cifs_sb->mnt_file_mode /* ignored */, 493 cifs_sb->mnt_file_mode /* ignored */,
517 oflags, &oplock, &netfid, xid); 494 oflags, &oplock, &netfid, xid);
518 if (rc == 0) { 495 if (rc == 0) {
519 cFYI(1, ("posix reopen succeeded")); 496 cFYI(1, "posix reopen succeeded");
520 goto reopen_success; 497 goto reopen_success;
521 } 498 }
522 /* fallthrough to retry open the old way on errors, especially 499 /* fallthrough to retry open the old way on errors, especially
@@ -537,8 +514,8 @@ reopen_error_exit:
537 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
538 if (rc) { 515 if (rc) {
539 mutex_unlock(&pCifsFile->fh_mutex); 516 mutex_unlock(&pCifsFile->fh_mutex);
540 cFYI(1, ("cifs_open returned 0x%x", rc)); 517 cFYI(1, "cifs_open returned 0x%x", rc);
541 cFYI(1, ("oplock: %d", oplock)); 518 cFYI(1, "oplock: %d", oplock);
542 } else { 519 } else {
543reopen_success: 520reopen_success:
544 pCifsFile->netfid = netfid; 521 pCifsFile->netfid = netfid;
@@ -570,8 +547,8 @@ reopen_success:
570 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { 547 if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
571 pCifsInode->clientCanCacheAll = true; 548 pCifsInode->clientCanCacheAll = true;
572 pCifsInode->clientCanCacheRead = true; 549 pCifsInode->clientCanCacheRead = true;
573 cFYI(1, ("Exclusive Oplock granted on inode %p", 550 cFYI(1, "Exclusive Oplock granted on inode %p",
574 file->f_path.dentry->d_inode)); 551 file->f_path.dentry->d_inode);
575 } else if ((oplock & 0xF) == OPLOCK_READ) { 552 } else if ((oplock & 0xF) == OPLOCK_READ) {
576 pCifsInode->clientCanCacheRead = true; 553 pCifsInode->clientCanCacheRead = true;
577 pCifsInode->clientCanCacheAll = false; 554 pCifsInode->clientCanCacheAll = false;
@@ -593,8 +570,7 @@ int cifs_close(struct inode *inode, struct file *file)
593 int xid, timeout; 570 int xid, timeout;
594 struct cifs_sb_info *cifs_sb; 571 struct cifs_sb_info *cifs_sb;
595 struct cifsTconInfo *pTcon; 572 struct cifsTconInfo *pTcon;
596 struct cifsFileInfo *pSMBFile = 573 struct cifsFileInfo *pSMBFile = file->private_data;
597 (struct cifsFileInfo *)file->private_data;
598 574
599 xid = GetXid(); 575 xid = GetXid();
600 576
@@ -619,8 +595,7 @@ int cifs_close(struct inode *inode, struct file *file)
619 the struct would be in each open file, 595 the struct would be in each open file,
620 but this should give enough time to 596 but this should give enough time to
621 clear the socket */ 597 clear the socket */
622 cFYI(DBG2, 598 cFYI(DBG2, "close delay, write pending");
623 ("close delay, write pending"));
624 msleep(timeout); 599 msleep(timeout);
625 timeout *= 4; 600 timeout *= 4;
626 } 601 }
@@ -653,7 +628,7 @@ int cifs_close(struct inode *inode, struct file *file)
653 628
654 read_lock(&GlobalSMBSeslock); 629 read_lock(&GlobalSMBSeslock);
655 if (list_empty(&(CIFS_I(inode)->openFileList))) { 630 if (list_empty(&(CIFS_I(inode)->openFileList))) {
656 cFYI(1, ("closing last open instance for inode %p", inode)); 631 cFYI(1, "closing last open instance for inode %p", inode);
657 /* if the file is not open we do not know if we can cache info 632 /* if the file is not open we do not know if we can cache info
658 on this inode, much less write behind and read ahead */ 633 on this inode, much less write behind and read ahead */
659 CIFS_I(inode)->clientCanCacheRead = false; 634 CIFS_I(inode)->clientCanCacheRead = false;
@@ -670,11 +645,10 @@ int cifs_closedir(struct inode *inode, struct file *file)
670{ 645{
671 int rc = 0; 646 int rc = 0;
672 int xid; 647 int xid;
673 struct cifsFileInfo *pCFileStruct = 648 struct cifsFileInfo *pCFileStruct = file->private_data;
674 (struct cifsFileInfo *)file->private_data;
675 char *ptmp; 649 char *ptmp;
676 650
677 cFYI(1, ("Closedir inode = 0x%p", inode)); 651 cFYI(1, "Closedir inode = 0x%p", inode);
678 652
679 xid = GetXid(); 653 xid = GetXid();
680 654
@@ -685,22 +659,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
685 659
686 pTcon = cifs_sb->tcon; 660 pTcon = cifs_sb->tcon;
687 661
688 cFYI(1, ("Freeing private data in close dir")); 662 cFYI(1, "Freeing private data in close dir");
689 write_lock(&GlobalSMBSeslock); 663 write_lock(&GlobalSMBSeslock);
690 if (!pCFileStruct->srch_inf.endOfSearch && 664 if (!pCFileStruct->srch_inf.endOfSearch &&
691 !pCFileStruct->invalidHandle) { 665 !pCFileStruct->invalidHandle) {
692 pCFileStruct->invalidHandle = true; 666 pCFileStruct->invalidHandle = true;
693 write_unlock(&GlobalSMBSeslock); 667 write_unlock(&GlobalSMBSeslock);
694 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid); 668 rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
695 cFYI(1, ("Closing uncompleted readdir with rc %d", 669 cFYI(1, "Closing uncompleted readdir with rc %d",
696 rc)); 670 rc);
697 /* not much we can do if it fails anyway, ignore rc */ 671 /* not much we can do if it fails anyway, ignore rc */
698 rc = 0; 672 rc = 0;
699 } else 673 } else
700 write_unlock(&GlobalSMBSeslock); 674 write_unlock(&GlobalSMBSeslock);
701 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start; 675 ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
702 if (ptmp) { 676 if (ptmp) {
703 cFYI(1, ("closedir free smb buf in srch struct")); 677 cFYI(1, "closedir free smb buf in srch struct");
704 pCFileStruct->srch_inf.ntwrk_buf_start = NULL; 678 pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
705 if (pCFileStruct->srch_inf.smallBuf) 679 if (pCFileStruct->srch_inf.smallBuf)
706 cifs_small_buf_release(ptmp); 680 cifs_small_buf_release(ptmp);
@@ -748,49 +722,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
748 rc = -EACCES; 722 rc = -EACCES;
749 xid = GetXid(); 723 xid = GetXid();
750 724
751 cFYI(1, ("Lock parm: 0x%x flockflags: " 725 cFYI(1, "Lock parm: 0x%x flockflags: "
752 "0x%x flocktype: 0x%x start: %lld end: %lld", 726 "0x%x flocktype: 0x%x start: %lld end: %lld",
753 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start, 727 cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
754 pfLock->fl_end)); 728 pfLock->fl_end);
755 729
756 if (pfLock->fl_flags & FL_POSIX) 730 if (pfLock->fl_flags & FL_POSIX)
757 cFYI(1, ("Posix")); 731 cFYI(1, "Posix");
758 if (pfLock->fl_flags & FL_FLOCK) 732 if (pfLock->fl_flags & FL_FLOCK)
759 cFYI(1, ("Flock")); 733 cFYI(1, "Flock");
760 if (pfLock->fl_flags & FL_SLEEP) { 734 if (pfLock->fl_flags & FL_SLEEP) {
761 cFYI(1, ("Blocking lock")); 735 cFYI(1, "Blocking lock");
762 wait_flag = true; 736 wait_flag = true;
763 } 737 }
764 if (pfLock->fl_flags & FL_ACCESS) 738 if (pfLock->fl_flags & FL_ACCESS)
765 cFYI(1, ("Process suspended by mandatory locking - " 739 cFYI(1, "Process suspended by mandatory locking - "
766 "not implemented yet")); 740 "not implemented yet");
767 if (pfLock->fl_flags & FL_LEASE) 741 if (pfLock->fl_flags & FL_LEASE)
768 cFYI(1, ("Lease on file - not implemented yet")); 742 cFYI(1, "Lease on file - not implemented yet");
769 if (pfLock->fl_flags & 743 if (pfLock->fl_flags &
770 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE))) 744 (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
771 cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags)); 745 cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
772 746
773 if (pfLock->fl_type == F_WRLCK) { 747 if (pfLock->fl_type == F_WRLCK) {
774 cFYI(1, ("F_WRLCK ")); 748 cFYI(1, "F_WRLCK ");
775 numLock = 1; 749 numLock = 1;
776 } else if (pfLock->fl_type == F_UNLCK) { 750 } else if (pfLock->fl_type == F_UNLCK) {
777 cFYI(1, ("F_UNLCK")); 751 cFYI(1, "F_UNLCK");
778 numUnlock = 1; 752 numUnlock = 1;
779 /* Check if unlock includes more than 753 /* Check if unlock includes more than
780 one lock range */ 754 one lock range */
781 } else if (pfLock->fl_type == F_RDLCK) { 755 } else if (pfLock->fl_type == F_RDLCK) {
782 cFYI(1, ("F_RDLCK")); 756 cFYI(1, "F_RDLCK");
783 lockType |= LOCKING_ANDX_SHARED_LOCK; 757 lockType |= LOCKING_ANDX_SHARED_LOCK;
784 numLock = 1; 758 numLock = 1;
785 } else if (pfLock->fl_type == F_EXLCK) { 759 } else if (pfLock->fl_type == F_EXLCK) {
786 cFYI(1, ("F_EXLCK")); 760 cFYI(1, "F_EXLCK");
787 numLock = 1; 761 numLock = 1;
788 } else if (pfLock->fl_type == F_SHLCK) { 762 } else if (pfLock->fl_type == F_SHLCK) {
789 cFYI(1, ("F_SHLCK")); 763 cFYI(1, "F_SHLCK");
790 lockType |= LOCKING_ANDX_SHARED_LOCK; 764 lockType |= LOCKING_ANDX_SHARED_LOCK;
791 numLock = 1; 765 numLock = 1;
792 } else 766 } else
793 cFYI(1, ("Unknown type of lock")); 767 cFYI(1, "Unknown type of lock");
794 768
795 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 769 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
796 tcon = cifs_sb->tcon; 770 tcon = cifs_sb->tcon;
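
The block above maps VFS lock types onto SMB LOCKING_ANDX semantics. A
condensed restatement of that classification (constants as in the CIFS
headers; F_EXLCK/F_SHLCK are the legacy flock-style types):

    static void cifs_classify_lock(int fl_type, int *numLock,
                                   int *numUnlock, __u8 *lockType)
    {
            switch (fl_type) {
            case F_WRLCK:
            case F_EXLCK:
                    *numLock = 1;                           /* exclusive */
                    break;
            case F_RDLCK:
            case F_SHLCK:
                    *lockType |= LOCKING_ANDX_SHARED_LOCK;  /* shared */
                    *numLock = 1;
                    break;
            case F_UNLCK:
                    *numUnlock = 1;  /* may cover several stored ranges */
                    break;
            }
    }
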
@@ -833,8 +807,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
833 0 /* wait flag */ ); 807 0 /* wait flag */ );
834 pfLock->fl_type = F_UNLCK; 808 pfLock->fl_type = F_UNLCK;
835 if (rc != 0) 809 if (rc != 0)
836 cERROR(1, ("Error unlocking previously locked " 810 cERROR(1, "Error unlocking previously locked "
837 "range %d during test of lock", rc)); 811 "range %d during test of lock", rc);
838 rc = 0; 812 rc = 0;
839 813
840 } else { 814 } else {
@@ -856,9 +830,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
856 0 /* wait flag */); 830 0 /* wait flag */);
857 pfLock->fl_type = F_RDLCK; 831 pfLock->fl_type = F_RDLCK;
858 if (rc != 0) 832 if (rc != 0)
859 cERROR(1, ("Error unlocking " 833 cERROR(1, "Error unlocking "
860 "previously locked range %d " 834 "previously locked range %d "
861 "during test of lock", rc)); 835 "during test of lock", rc);
862 rc = 0; 836 rc = 0;
863 } else { 837 } else {
864 pfLock->fl_type = F_WRLCK; 838 pfLock->fl_type = F_WRLCK;
@@ -892,8 +866,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
892 length, pfLock, 866 length, pfLock,
893 posix_lock_type, wait_flag); 867 posix_lock_type, wait_flag);
894 } else { 868 } else {
895 struct cifsFileInfo *fid = 869 struct cifsFileInfo *fid = file->private_data;
896 (struct cifsFileInfo *)file->private_data;
897 870
898 if (numLock) { 871 if (numLock) {
899 rc = CIFSSMBLock(xid, tcon, netfid, length, 872 rc = CIFSSMBLock(xid, tcon, netfid, length,
@@ -923,9 +896,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
923 1, 0, li->type, false); 896 1, 0, li->type, false);
924 if (stored_rc) 897 if (stored_rc)
925 rc = stored_rc; 898 rc = stored_rc;
926 899 else {
927 list_del(&li->llist); 900 list_del(&li->llist);
928 kfree(li); 901 kfree(li);
902 }
929 } 903 }
930 } 904 }
931 mutex_unlock(&fid->lock_mutex); 905 mutex_unlock(&fid->lock_mutex);
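
The last hunk above is a behavior fix, not just cleanup: a tracked
byte-range lock now leaves fid->llist only when the server-side unlock
succeeded, so a failure keeps the entry around. Roughly (CIFSSMBLock
arguments elided; cifsLockInfo is the list-node type):

    struct cifsLockInfo *li, *tmp;
    int rc = 0, stored_rc;

    list_for_each_entry_safe(li, tmp, &fid->llist, llist) {
            stored_rc = CIFSSMBLock(/* ... unlock li's range ... */);
            if (stored_rc) {
                    rc = stored_rc;        /* keep li; still held */
            } else {
                    list_del(&li->llist);  /* server dropped it */
                    kfree(li);
            }
    }
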
@@ -988,13 +962,12 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
988 962
989 pTcon = cifs_sb->tcon; 963 pTcon = cifs_sb->tcon;
990 964
991 /* cFYI(1, 965 /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
992 (" write %d bytes to offset %lld of %s", write_size, 966 *poffset, file->f_path.dentry->d_name.name); */
993 *poffset, file->f_path.dentry->d_name.name)); */
994 967
995 if (file->private_data == NULL) 968 if (file->private_data == NULL)
996 return -EBADF; 969 return -EBADF;
997 open_file = (struct cifsFileInfo *) file->private_data; 970 open_file = file->private_data;
998 971
999 rc = generic_write_checks(file, poffset, &write_size, 0); 972 rc = generic_write_checks(file, poffset, &write_size, 0);
1000 if (rc) 973 if (rc)
@@ -1091,12 +1064,12 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
1091 1064
1092 pTcon = cifs_sb->tcon; 1065 pTcon = cifs_sb->tcon;
1093 1066
1094 cFYI(1, ("write %zd bytes to offset %lld of %s", write_size, 1067 cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
1095 *poffset, file->f_path.dentry->d_name.name)); 1068 *poffset, file->f_path.dentry->d_name.name);
1096 1069
1097 if (file->private_data == NULL) 1070 if (file->private_data == NULL)
1098 return -EBADF; 1071 return -EBADF;
1099 open_file = (struct cifsFileInfo *)file->private_data; 1072 open_file = file->private_data;
1100 1073
1101 xid = GetXid(); 1074 xid = GetXid();
1102 1075
@@ -1233,7 +1206,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1233 it being zero) during stress testcases so we need to check for it */ 1206 it being zero) during stress testcases so we need to check for it */
1234 1207
1235 if (cifs_inode == NULL) { 1208 if (cifs_inode == NULL) {
1236 cERROR(1, ("Null inode passed to cifs_writeable_file")); 1209 cERROR(1, "Null inode passed to cifs_writeable_file");
1237 dump_stack(); 1210 dump_stack();
1238 return NULL; 1211 return NULL;
1239 } 1212 }
@@ -1277,7 +1250,7 @@ refind_writable:
1277 again. Note that it would be bad 1250 again. Note that it would be bad
1278 to hold up writepages here (rather than 1251 to hold up writepages here (rather than
1279 in caller) with continuous retries */ 1252 in caller) with continuous retries */
1280 cFYI(1, ("wp failed on reopen file")); 1253 cFYI(1, "wp failed on reopen file");
1281 read_lock(&GlobalSMBSeslock); 1254 read_lock(&GlobalSMBSeslock);
1282 /* can not use this handle, no write 1255 /* can not use this handle, no write
1283 pending on this one after all */ 1256 pending on this one after all */
@@ -1353,7 +1326,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1353 else if (bytes_written < 0) 1326 else if (bytes_written < 0)
1354 rc = bytes_written; 1327 rc = bytes_written;
1355 } else { 1328 } else {
1356 cFYI(1, ("No writeable filehandles for inode")); 1329 cFYI(1, "No writeable filehandles for inode");
1357 rc = -EIO; 1330 rc = -EIO;
1358 } 1331 }
1359 1332
@@ -1525,7 +1498,7 @@ retry:
1525 */ 1498 */
1526 open_file = find_writable_file(CIFS_I(mapping->host)); 1499 open_file = find_writable_file(CIFS_I(mapping->host));
1527 if (!open_file) { 1500 if (!open_file) {
1528 cERROR(1, ("No writable handles for inode")); 1501 cERROR(1, "No writable handles for inode");
1529 rc = -EBADF; 1502 rc = -EBADF;
1530 } else { 1503 } else {
1531 long_op = cifs_write_timeout(cifsi, offset); 1504 long_op = cifs_write_timeout(cifsi, offset);
@@ -1538,8 +1511,8 @@ retry:
1538 cifs_update_eof(cifsi, offset, bytes_written); 1511 cifs_update_eof(cifsi, offset, bytes_written);
1539 1512
1540 if (rc || bytes_written < bytes_to_write) { 1513 if (rc || bytes_written < bytes_to_write) {
1541 cERROR(1, ("Write2 ret %d, wrote %d", 1514 cERROR(1, "Write2 ret %d, wrote %d",
1542 rc, bytes_written)); 1515 rc, bytes_written);
1543 /* BB what if continued retry is 1516 /* BB what if continued retry is
1544 requested via mount flags? */ 1517 requested via mount flags? */
1545 if (rc == -ENOSPC) 1518 if (rc == -ENOSPC)
@@ -1600,7 +1573,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1600/* BB add check for wbc flags */ 1573/* BB add check for wbc flags */
1601 page_cache_get(page); 1574 page_cache_get(page);
1602 if (!PageUptodate(page)) 1575 if (!PageUptodate(page))
1603 cFYI(1, ("ppw - page not up to date")); 1576 cFYI(1, "ppw - page not up to date");
1604 1577
1605 /* 1578 /*
1606 * Set the "writeback" flag, and clear "dirty" in the radix tree. 1579 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1629,8 +1602,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1629 int rc; 1602 int rc;
1630 struct inode *inode = mapping->host; 1603 struct inode *inode = mapping->host;
1631 1604
1632 cFYI(1, ("write_end for page %p from pos %lld with %d bytes", 1605 cFYI(1, "write_end for page %p from pos %lld with %d bytes",
1633 page, pos, copied)); 1606 page, pos, copied);
1634 1607
1635 if (PageChecked(page)) { 1608 if (PageChecked(page)) {
1636 if (copied == len) 1609 if (copied == len)
@@ -1675,19 +1648,18 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1675 return rc; 1648 return rc;
1676} 1649}
1677 1650
1678int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1651int cifs_fsync(struct file *file, int datasync)
1679{ 1652{
1680 int xid; 1653 int xid;
1681 int rc = 0; 1654 int rc = 0;
1682 struct cifsTconInfo *tcon; 1655 struct cifsTconInfo *tcon;
1683 struct cifsFileInfo *smbfile = 1656 struct cifsFileInfo *smbfile = file->private_data;
1684 (struct cifsFileInfo *)file->private_data;
1685 struct inode *inode = file->f_path.dentry->d_inode; 1657 struct inode *inode = file->f_path.dentry->d_inode;
1686 1658
1687 xid = GetXid(); 1659 xid = GetXid();
1688 1660
1689 cFYI(1, ("Sync file - name: %s datasync: 0x%x", 1661 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1690 dentry->d_name.name, datasync)); 1662 file->f_path.dentry->d_name.name, datasync);
1691 1663
1692 rc = filemap_write_and_wait(inode->i_mapping); 1664 rc = filemap_write_and_wait(inode->i_mapping);
1693 if (rc == 0) { 1665 if (rc == 0) {
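
This hunk tracks the 2.6.36 VFS change that removed the dentry argument
from ->fsync(); the name is now reached through file->f_path.dentry. The
new skeleton, condensed from the lines above:

    int cifs_fsync(struct file *file, int datasync)
    {
            struct inode *inode = file->f_path.dentry->d_inode;
            int rc;

            cFYI(1, "Sync file - name: %s datasync: 0x%x",
                    file->f_path.dentry->d_name.name, datasync);

            rc = filemap_write_and_wait(inode->i_mapping);
            /* ... then flush the open handle on the server, as before ... */
            return rc;
    }
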
@@ -1711,7 +1683,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1711 unsigned int rpages = 0; 1683 unsigned int rpages = 0;
1712 int rc = 0; 1684 int rc = 0;
1713 1685
1714 cFYI(1, ("sync page %p",page)); 1686 cFYI(1, "sync page %p", page);
1715 mapping = page->mapping; 1687 mapping = page->mapping;
1716 if (!mapping) 1688 if (!mapping)
1717 return 0; 1689 return 0;
@@ -1722,7 +1694,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1722/* fill in rpages then 1694/* fill in rpages then
1723 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */ 1695 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1724 1696
1725/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index)); 1697/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1726 1698
1727#if 0 1699#if 0
1728 if (rc < 0) 1700 if (rc < 0)
@@ -1756,7 +1728,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
1756 CIFS_I(inode)->write_behind_rc = 0; 1728 CIFS_I(inode)->write_behind_rc = 0;
1757 } 1729 }
1758 1730
1759 cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc)); 1731 cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
1760 1732
1761 return rc; 1733 return rc;
1762} 1734}
@@ -1785,10 +1757,10 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
1785 FreeXid(xid); 1757 FreeXid(xid);
1786 return rc; 1758 return rc;
1787 } 1759 }
1788 open_file = (struct cifsFileInfo *)file->private_data; 1760 open_file = file->private_data;
1789 1761
1790 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1762 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1791 cFYI(1, ("attempting read on write only file instance")); 1763 cFYI(1, "attempting read on write only file instance");
1792 1764
1793 for (total_read = 0, current_offset = read_data; 1765 for (total_read = 0, current_offset = read_data;
1794 read_size > total_read; 1766 read_size > total_read;
@@ -1866,10 +1838,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1866 FreeXid(xid); 1838 FreeXid(xid);
1867 return rc; 1839 return rc;
1868 } 1840 }
1869 open_file = (struct cifsFileInfo *)file->private_data; 1841 open_file = file->private_data;
1870 1842
1871 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 1843 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
1872 cFYI(1, ("attempting read on write only file instance")); 1844 cFYI(1, "attempting read on write only file instance");
1873 1845
1874 for (total_read = 0, current_offset = read_data; 1846 for (total_read = 0, current_offset = read_data;
1875 read_size > total_read; 1847 read_size > total_read;
@@ -1920,7 +1892,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1920 xid = GetXid(); 1892 xid = GetXid();
1921 rc = cifs_revalidate_file(file); 1893 rc = cifs_revalidate_file(file);
1922 if (rc) { 1894 if (rc) {
1923 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1895 cFYI(1, "Validation prior to mmap failed, error=%d", rc);
1924 FreeXid(xid); 1896 FreeXid(xid);
1925 return rc; 1897 return rc;
1926 } 1898 }
@@ -1931,8 +1903,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1931 1903
1932 1904
1933static void cifs_copy_cache_pages(struct address_space *mapping, 1905static void cifs_copy_cache_pages(struct address_space *mapping,
1934 struct list_head *pages, int bytes_read, char *data, 1906 struct list_head *pages, int bytes_read, char *data)
1935 struct pagevec *plru_pvec)
1936{ 1907{
1937 struct page *page; 1908 struct page *page;
1938 char *target; 1909 char *target;
@@ -1944,14 +1915,15 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1944 page = list_entry(pages->prev, struct page, lru); 1915 page = list_entry(pages->prev, struct page, lru);
1945 list_del(&page->lru); 1916 list_del(&page->lru);
1946 1917
1947 if (add_to_page_cache(page, mapping, page->index, 1918 if (add_to_page_cache_lru(page, mapping, page->index,
1948 GFP_KERNEL)) { 1919 GFP_KERNEL)) {
1949 page_cache_release(page); 1920 page_cache_release(page);
1950 cFYI(1, ("Add page cache failed")); 1921 cFYI(1, "Add page cache failed");
1951 data += PAGE_CACHE_SIZE; 1922 data += PAGE_CACHE_SIZE;
1952 bytes_read -= PAGE_CACHE_SIZE; 1923 bytes_read -= PAGE_CACHE_SIZE;
1953 continue; 1924 continue;
1954 } 1925 }
1926 page_cache_release(page);
1955 1927
1956 target = kmap_atomic(page, KM_USER0); 1928 target = kmap_atomic(page, KM_USER0);
1957 1929
@@ -1970,9 +1942,10 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1970 flush_dcache_page(page); 1942 flush_dcache_page(page);
1971 SetPageUptodate(page); 1943 SetPageUptodate(page);
1972 unlock_page(page); 1944 unlock_page(page);
1973 if (!pagevec_add(plru_pvec, page))
1974 __pagevec_lru_add_file(plru_pvec);
1975 data += PAGE_CACHE_SIZE; 1945 data += PAGE_CACHE_SIZE;
1946
1947 /* add page to FS-Cache */
1948 cifs_readpage_to_fscache(mapping->host, page);
1976 } 1949 }
1977 return; 1950 return;
1978} 1951}
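
Two things change in cifs_copy_cache_pages: add_to_page_cache_lru() now
does the LRU insertion that the pagevec used to batch (so the caller's
page reference is dropped right after a successful add), and every
populated page is mirrored into FS-Cache. The new per-page flow, in
outline:

    if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) {
            page_cache_release(page);  /* insert failed: skip this page
                                          (real code also advances the
                                          data pointer first) */
            continue;
    }
    page_cache_release(page);          /* page cache + LRU hold it now */

    /* ... copy the SMB payload in, SetPageUptodate(), unlock_page() ... */

    cifs_readpage_to_fscache(mapping->host, page);  /* mirror to cache */
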
@@ -1990,7 +1963,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
1990 unsigned int read_size, i; 1963 unsigned int read_size, i;
1991 char *smb_read_data = NULL; 1964 char *smb_read_data = NULL;
1992 struct smb_com_read_rsp *pSMBr; 1965 struct smb_com_read_rsp *pSMBr;
1993 struct pagevec lru_pvec;
1994 struct cifsFileInfo *open_file; 1966 struct cifsFileInfo *open_file;
1995 int buf_type = CIFS_NO_BUFFER; 1967 int buf_type = CIFS_NO_BUFFER;
1996 1968
@@ -2000,12 +1972,20 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2000 FreeXid(xid); 1972 FreeXid(xid);
2001 return rc; 1973 return rc;
2002 } 1974 }
2003 open_file = (struct cifsFileInfo *)file->private_data; 1975 open_file = file->private_data;
2004 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 1976 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2005 pTcon = cifs_sb->tcon; 1977 pTcon = cifs_sb->tcon;
2006 1978
2007 pagevec_init(&lru_pvec, 0); 1979 /*
2008 cFYI(DBG2, ("rpages: num pages %d", num_pages)); 1980 * Reads as many pages as possible from fscache. Returns -ENOBUFS
1981 * immediately if the cookie is negative
1982 */
1983 rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
1984 &num_pages);
1985 if (rc == 0)
1986 goto read_complete;
1987
1988 cFYI(DBG2, "rpages: num pages %d", num_pages);
2009 for (i = 0; i < num_pages; ) { 1989 for (i = 0; i < num_pages; ) {
2010 unsigned contig_pages; 1990 unsigned contig_pages;
2011 struct page *tmp_page; 1991 struct page *tmp_page;
@@ -2038,8 +2018,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2038 /* Read size needs to be in multiples of one page */ 2018 /* Read size needs to be in multiples of one page */
2039 read_size = min_t(const unsigned int, read_size, 2019 read_size = min_t(const unsigned int, read_size,
2040 cifs_sb->rsize & PAGE_CACHE_MASK); 2020 cifs_sb->rsize & PAGE_CACHE_MASK);
2041 cFYI(DBG2, ("rpages: read size 0x%x contiguous pages %d", 2021 cFYI(DBG2, "rpages: read size 0x%x contiguous pages %d",
2042 read_size, contig_pages)); 2022 read_size, contig_pages);
2043 rc = -EAGAIN; 2023 rc = -EAGAIN;
2044 while (rc == -EAGAIN) { 2024 while (rc == -EAGAIN) {
2045 if ((open_file->invalidHandle) && 2025 if ((open_file->invalidHandle) &&
@@ -2066,14 +2046,14 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2066 } 2046 }
2067 } 2047 }
2068 if ((rc < 0) || (smb_read_data == NULL)) { 2048 if ((rc < 0) || (smb_read_data == NULL)) {
2069 cFYI(1, ("Read error in readpages: %d", rc)); 2049 cFYI(1, "Read error in readpages: %d", rc);
2070 break; 2050 break;
2071 } else if (bytes_read > 0) { 2051 } else if (bytes_read > 0) {
2072 task_io_account_read(bytes_read); 2052 task_io_account_read(bytes_read);
2073 pSMBr = (struct smb_com_read_rsp *)smb_read_data; 2053 pSMBr = (struct smb_com_read_rsp *)smb_read_data;
2074 cifs_copy_cache_pages(mapping, page_list, bytes_read, 2054 cifs_copy_cache_pages(mapping, page_list, bytes_read,
2075 smb_read_data + 4 /* RFC1001 hdr */ + 2055 smb_read_data + 4 /* RFC1001 hdr */ +
2076 le16_to_cpu(pSMBr->DataOffset), &lru_pvec); 2056 le16_to_cpu(pSMBr->DataOffset));
2077 2057
2078 i += bytes_read >> PAGE_CACHE_SHIFT; 2058 i += bytes_read >> PAGE_CACHE_SHIFT;
2079 cifs_stats_bytes_read(pTcon, bytes_read); 2059 cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2089,9 +2069,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2089 /* break; */ 2069 /* break; */
2090 } 2070 }
2091 } else { 2071 } else {
2092 cFYI(1, ("No bytes read (%d) at offset %lld . " 2072 cFYI(1, "No bytes read (%d) at offset %lld . "
2093 "Cleaning remaining pages from readahead list", 2073 "Cleaning remaining pages from readahead list",
2094 bytes_read, offset)); 2074 bytes_read, offset);
2095 /* BB turn off caching and do new lookup on 2075 /* BB turn off caching and do new lookup on
2096 file size at server? */ 2076 file size at server? */
2097 break; 2077 break;
@@ -2106,8 +2086,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2106 bytes_read = 0; 2086 bytes_read = 0;
2107 } 2087 }
2108 2088
2109 pagevec_lru_add_file(&lru_pvec);
2110
2111/* need to free smb_read_data buf before exit */ 2089/* need to free smb_read_data buf before exit */
2112 if (smb_read_data) { 2090 if (smb_read_data) {
2113 if (buf_type == CIFS_SMALL_BUFFER) 2091 if (buf_type == CIFS_SMALL_BUFFER)
@@ -2117,6 +2095,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
2117 smb_read_data = NULL; 2095 smb_read_data = NULL;
2118 } 2096 }
2119 2097
2098read_complete:
2120 FreeXid(xid); 2099 FreeXid(xid);
2121 return rc; 2100 return rc;
2122} 2101}
@@ -2127,6 +2106,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2127 char *read_data; 2106 char *read_data;
2128 int rc; 2107 int rc;
2129 2108
2109 /* Is the page cached? */
2110 rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
2111 if (rc == 0)
2112 goto read_complete;
2113
2130 page_cache_get(page); 2114 page_cache_get(page);
2131 read_data = kmap(page); 2115 read_data = kmap(page);
2132 /* for reads over a certain size could initiate async read ahead */ 2116 /* for reads over a certain size could initiate async read ahead */
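
cifs_readpage_worker gets the single-page version of the same idea:
consult the cache before going to the wire, and (further down in this
function) push freshly read pages back into it. Condensed:

    rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
    if (rc == 0)
            goto read_complete;  /* cache completes and unlocks the page */

    /* ... otherwise read over SMB into the kmapped page ... */
    SetPageUptodate(page);
    cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
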
@@ -2136,7 +2120,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2136 if (rc < 0) 2120 if (rc < 0)
2137 goto io_error; 2121 goto io_error;
2138 else 2122 else
2139 cFYI(1, ("Bytes read %d", rc)); 2123 cFYI(1, "Bytes read %d", rc);
2140 2124
2141 file->f_path.dentry->d_inode->i_atime = 2125 file->f_path.dentry->d_inode->i_atime =
2142 current_fs_time(file->f_path.dentry->d_inode->i_sb); 2126 current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2146,11 +2130,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
2146 2130
2147 flush_dcache_page(page); 2131 flush_dcache_page(page);
2148 SetPageUptodate(page); 2132 SetPageUptodate(page);
2133
2134 /* send this page to the cache */
2135 cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
2136
2149 rc = 0; 2137 rc = 0;
2150 2138
2151io_error: 2139io_error:
2152 kunmap(page); 2140 kunmap(page);
2153 page_cache_release(page); 2141 page_cache_release(page);
2142
2143read_complete:
2154 return rc; 2144 return rc;
2155} 2145}
2156 2146
@@ -2168,8 +2158,8 @@ static int cifs_readpage(struct file *file, struct page *page)
2168 return rc; 2158 return rc;
2169 } 2159 }
2170 2160
2171 cFYI(1, ("readpage %p at offset %d 0x%x\n", 2161 cFYI(1, "readpage %p at offset %d 0x%x\n",
2172 page, (int)offset, (int)offset)); 2162 page, (int)offset, (int)offset);
2173 2163
2174 rc = cifs_readpage_worker(file, page, &offset); 2164 rc = cifs_readpage_worker(file, page, &offset);
2175 2165
@@ -2239,7 +2229,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2239 struct page *page; 2229 struct page *page;
2240 int rc = 0; 2230 int rc = 0;
2241 2231
2242 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2232 cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
2243 2233
2244 page = grab_cache_page_write_begin(mapping, index, flags); 2234 page = grab_cache_page_write_begin(mapping, index, flags);
2245 if (!page) { 2235 if (!page) {
@@ -2300,8 +2290,23 @@ out:
2300 return rc; 2290 return rc;
2301} 2291}
2302 2292
2303static void 2293static int cifs_release_page(struct page *page, gfp_t gfp)
2304cifs_oplock_break(struct slow_work *work) 2294{
2295 if (PagePrivate(page))
2296 return 0;
2297
2298 return cifs_fscache_release_page(page, gfp);
2299}
2300
2301static void cifs_invalidate_page(struct page *page, unsigned long offset)
2302{
2303 struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
2304
2305 if (offset == 0)
2306 cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
2307}
2308
2309void cifs_oplock_break(struct work_struct *work)
2305{ 2310{
2306 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 2311 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2307 oplock_break); 2312 oplock_break);
@@ -2311,12 +2316,10 @@ cifs_oplock_break(struct slow_work *work)
2311 int rc, waitrc = 0; 2316 int rc, waitrc = 0;
2312 2317
2313 if (inode && S_ISREG(inode->i_mode)) { 2318 if (inode && S_ISREG(inode->i_mode)) {
2314#ifdef CONFIG_CIFS_EXPERIMENTAL 2319 if (cinode->clientCanCacheRead)
2315 if (cinode->clientCanCacheAll == 0)
2316 break_lease(inode, O_RDONLY); 2320 break_lease(inode, O_RDONLY);
2317 else if (cinode->clientCanCacheRead == 0) 2321 else
2318 break_lease(inode, O_WRONLY); 2322 break_lease(inode, O_WRONLY);
2319#endif
2320 rc = filemap_fdatawrite(inode->i_mapping); 2323 rc = filemap_fdatawrite(inode->i_mapping);
2321 if (cinode->clientCanCacheRead == 0) { 2324 if (cinode->clientCanCacheRead == 0) {
2322 waitrc = filemap_fdatawait(inode->i_mapping); 2325 waitrc = filemap_fdatawait(inode->i_mapping);
@@ -2326,7 +2329,7 @@ cifs_oplock_break(struct slow_work *work)
2326 rc = waitrc; 2329 rc = waitrc;
2327 if (rc) 2330 if (rc)
2328 cinode->write_behind_rc = rc; 2331 cinode->write_behind_rc = rc;
2329 cFYI(1, ("Oplock flush inode %p rc %d", inode, rc)); 2332 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
2330 } 2333 }
2331 2334
2332 /* 2335 /*
@@ -2338,35 +2341,32 @@ cifs_oplock_break(struct slow_work *work)
2338 if (!cfile->closePend && !cfile->oplock_break_cancelled) { 2341 if (!cfile->closePend && !cfile->oplock_break_cancelled) {
2339 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, 2342 rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
2340 LOCKING_ANDX_OPLOCK_RELEASE, false); 2343 LOCKING_ANDX_OPLOCK_RELEASE, false);
2341 cFYI(1, ("Oplock release rc = %d", rc)); 2344 cFYI(1, "Oplock release rc = %d", rc);
2342 } 2345 }
2346
2347 /*
2348 * We might have kicked in before is_valid_oplock_break()
2349 * finished grabbing reference for us. Make sure it's done by
 2350 * waiting for GlobalSMBSeslock.
2351 */
2352 write_lock(&GlobalSMBSeslock);
2353 write_unlock(&GlobalSMBSeslock);
2354
2355 cifs_oplock_break_put(cfile);
2343} 2356}
2344 2357
2345static int 2358void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2346cifs_oplock_break_get(struct slow_work *work)
2347{ 2359{
2348 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2349 oplock_break);
2350 mntget(cfile->mnt); 2360 mntget(cfile->mnt);
2351 cifsFileInfo_get(cfile); 2361 cifsFileInfo_get(cfile);
2352 return 0;
2353} 2362}
2354 2363
2355static void 2364void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2356cifs_oplock_break_put(struct slow_work *work)
2357{ 2365{
2358 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
2359 oplock_break);
2360 mntput(cfile->mnt); 2366 mntput(cfile->mnt);
2361 cifsFileInfo_put(cfile); 2367 cifsFileInfo_put(cfile);
2362} 2368}
2363 2369
2364const struct slow_work_ops cifs_oplock_break_ops = {
2365 .get_ref = cifs_oplock_break_get,
2366 .put_ref = cifs_oplock_break_put,
2367 .execute = cifs_oplock_break,
2368};
2369
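
With the slow_work ops table gone, oplock breaks are plain work items:
cifs_oplock_break() now takes a struct work_struct *, and the get/put
reference helpers are called explicitly around queueing rather than via
slow_work_ops. A hedged sketch of the queueing side (which workqueue is
used, and where INIT_WORK runs, are assumptions here):

    static void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
    {
            /* assumes INIT_WORK(&cfile->oplock_break, cifs_oplock_break)
             * ran once when cfile was created */
            cifs_oplock_break_get(cfile);   /* pin mount + file handle */
            if (!queue_work(system_wq, &cfile->oplock_break))
                    cifs_oplock_break_put(cfile);  /* already queued */
    }
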
2370const struct address_space_operations cifs_addr_ops = { 2370const struct address_space_operations cifs_addr_ops = {
2371 .readpage = cifs_readpage, 2371 .readpage = cifs_readpage,
2372 .readpages = cifs_readpages, 2372 .readpages = cifs_readpages,
@@ -2375,6 +2375,8 @@ const struct address_space_operations cifs_addr_ops = {
2375 .write_begin = cifs_write_begin, 2375 .write_begin = cifs_write_begin,
2376 .write_end = cifs_write_end, 2376 .write_end = cifs_write_end,
2377 .set_page_dirty = __set_page_dirty_nobuffers, 2377 .set_page_dirty = __set_page_dirty_nobuffers,
2378 .releasepage = cifs_release_page,
2379 .invalidatepage = cifs_invalidate_page,
2378 /* .sync_page = cifs_sync_page, */ 2380 /* .sync_page = cifs_sync_page, */
2379 /* .direct_IO = */ 2381 /* .direct_IO = */
2380}; 2382};
@@ -2391,6 +2393,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2391 .write_begin = cifs_write_begin, 2393 .write_begin = cifs_write_begin,
2392 .write_end = cifs_write_end, 2394 .write_end = cifs_write_end,
2393 .set_page_dirty = __set_page_dirty_nobuffers, 2395 .set_page_dirty = __set_page_dirty_nobuffers,
2396 .releasepage = cifs_release_page,
2397 .invalidatepage = cifs_invalidate_page,
2394 /* .sync_page = cifs_sync_page, */ 2398 /* .sync_page = cifs_sync_page, */
2395 /* .direct_IO = */ 2399 /* .direct_IO = */
2396}; 2400};
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
new file mode 100644
index 000000000000..9f3f5c4be161
--- /dev/null
+++ b/fs/cifs/fscache.c
@@ -0,0 +1,236 @@
1/*
2 * fs/cifs/fscache.c - CIFS filesystem cache interface
3 *
4 * Copyright (c) 2010 Novell, Inc.
  5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#include "fscache.h"
22#include "cifsglob.h"
23#include "cifs_debug.h"
24#include "cifs_fs_sb.h"
25
26void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
27{
28 server->fscache =
29 fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
30 &cifs_fscache_server_index_def, server);
31 cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server,
32 server->fscache);
33}
34
35void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
36{
37 cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server,
38 server->fscache);
39 fscache_relinquish_cookie(server->fscache, 0);
40 server->fscache = NULL;
41}
42
43void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon)
44{
45 struct TCP_Server_Info *server = tcon->ses->server;
46
47 tcon->fscache =
48 fscache_acquire_cookie(server->fscache,
49 &cifs_fscache_super_index_def, tcon);
50 cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)",
51 server->fscache, tcon->fscache);
52}
53
54void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon)
55{
56 cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache);
57 fscache_relinquish_cookie(tcon->fscache, 0);
58 tcon->fscache = NULL;
59}
60
61static void cifs_fscache_enable_inode_cookie(struct inode *inode)
62{
63 struct cifsInodeInfo *cifsi = CIFS_I(inode);
64 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
65
66 if (cifsi->fscache)
67 return;
68
69 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
70 &cifs_fscache_inode_object_def,
71 cifsi);
72 cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)",
73 cifs_sb->tcon->fscache, cifsi->fscache);
74}
75
76void cifs_fscache_release_inode_cookie(struct inode *inode)
77{
78 struct cifsInodeInfo *cifsi = CIFS_I(inode);
79
80 if (cifsi->fscache) {
81 cFYI(1, "CIFS releasing inode cookie (0x%p)",
82 cifsi->fscache);
83 fscache_relinquish_cookie(cifsi->fscache, 0);
84 cifsi->fscache = NULL;
85 }
86}
87
88static void cifs_fscache_disable_inode_cookie(struct inode *inode)
89{
90 struct cifsInodeInfo *cifsi = CIFS_I(inode);
91
92 if (cifsi->fscache) {
93 cFYI(1, "CIFS disabling inode cookie (0x%p)",
94 cifsi->fscache);
95 fscache_relinquish_cookie(cifsi->fscache, 1);
96 cifsi->fscache = NULL;
97 }
98}
99
100void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
101{
102 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
103 cifs_fscache_disable_inode_cookie(inode);
104 else {
105 cifs_fscache_enable_inode_cookie(inode);
106 cFYI(1, "CIFS: fscache inode cookie set");
107 }
108}
109
110void cifs_fscache_reset_inode_cookie(struct inode *inode)
111{
112 struct cifsInodeInfo *cifsi = CIFS_I(inode);
113 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
114 struct fscache_cookie *old = cifsi->fscache;
115
116 if (cifsi->fscache) {
117 /* retire the current fscache cache and get a new one */
118 fscache_relinquish_cookie(cifsi->fscache, 1);
119
120 cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache,
121 &cifs_fscache_inode_object_def,
122 cifsi);
123 cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p",
124 cifsi->fscache, old);
125 }
126}
127
128int cifs_fscache_release_page(struct page *page, gfp_t gfp)
129{
130 if (PageFsCache(page)) {
131 struct inode *inode = page->mapping->host;
132 struct cifsInodeInfo *cifsi = CIFS_I(inode);
133
134 cFYI(1, "CIFS: fscache release page (0x%p/0x%p)",
135 page, cifsi->fscache);
136 if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
137 return 0;
138 }
139
140 return 1;
141}
142
143static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
144 int error)
145{
146	cFYI(1, "CIFS: readpage_from_fscache_complete (0x%p/%d)",
147 page, error);
148 if (!error)
149 SetPageUptodate(page);
150 unlock_page(page);
151}
152
153/*
154 * Retrieve a page from FS-Cache
155 */
156int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
157{
158 int ret;
159
160	cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p)",
161 CIFS_I(inode)->fscache, page, inode);
162 ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
163 cifs_readpage_from_fscache_complete,
164 NULL,
165 GFP_KERNEL);
166 switch (ret) {
167
168 case 0: /* page found in fscache, read submitted */
169 cFYI(1, "CIFS: readpage_from_fscache: submitted");
170 return ret;
171 case -ENOBUFS: /* page won't be cached */
172 case -ENODATA: /* page not in cache */
173 cFYI(1, "CIFS: readpage_from_fscache %d", ret);
174 return 1;
175
176 default:
177 cERROR(1, "unknown error ret = %d", ret);
178 }
179 return ret;
180}
181
182/*
183 * Retrieve a set of pages from FS-Cache
184 */
185int __cifs_readpages_from_fscache(struct inode *inode,
186 struct address_space *mapping,
187 struct list_head *pages,
188 unsigned *nr_pages)
189{
190 int ret;
191
192 cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)",
193 CIFS_I(inode)->fscache, *nr_pages, inode);
194 ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
195 pages, nr_pages,
196 cifs_readpage_from_fscache_complete,
197 NULL,
198 mapping_gfp_mask(mapping));
199 switch (ret) {
200 case 0: /* read submitted to the cache for all pages */
201 cFYI(1, "CIFS: readpages_from_fscache: submitted");
202 return ret;
203
204 case -ENOBUFS: /* some pages are not cached and can't be */
205 case -ENODATA: /* some pages are not cached */
206 cFYI(1, "CIFS: readpages_from_fscache: no page");
207 return 1;
208
209 default:
210 cFYI(1, "unknown error ret = %d", ret);
211 }
212
213 return ret;
214}
215
216void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
217{
218 int ret;
219
220	cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p)",
221 CIFS_I(inode)->fscache, page, inode);
222 ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
223 if (ret != 0)
224 fscache_uncache_page(CIFS_I(inode)->fscache, page);
225}
226
227void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
228{
229 struct cifsInodeInfo *cifsi = CIFS_I(inode);
230 struct fscache_cookie *cookie = cifsi->fscache;
231
232 cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie);
233 fscache_wait_on_page_write(cookie, page);
234 fscache_uncache_page(cookie, page);
235}
236
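
The new fscache.c builds a three-level cookie chain: a per-server index
cookie, a per-share (tcon) index cookie under it, and a per-inode data
cookie at the bottom, enabled only for read-only opens. An illustrative
grouping of the setup calls (not one real code path):

    static void cifs_fscache_attach(struct TCP_Server_Info *server,
                                    struct cifsTconInfo *tcon,
                                    struct inode *inode, struct file *filp)
    {
            cifs_fscache_get_client_cookie(server);     /* server index */
            cifs_fscache_get_super_cookie(tcon);        /* share index  */
            cifs_fscache_set_inode_cookie(inode, filp); /* data cookie,
                                                           O_RDONLY only */
    }
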
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
new file mode 100644
index 000000000000..31b88ec2341e
--- /dev/null
+++ b/fs/cifs/fscache.h
@@ -0,0 +1,136 @@
1/*
2 * fs/cifs/fscache.h - CIFS filesystem cache interface definitions
3 *
4 * Copyright (c) 2010 Novell, Inc.
  5 * Author(s): Suresh Jayaraman <sjayaraman@suse.de>
6 *
7 * This library is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published
9 * by the Free Software Foundation; either version 2.1 of the License, or
10 * (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15 * the GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21#ifndef _CIFS_FSCACHE_H
22#define _CIFS_FSCACHE_H
23
24#include <linux/fscache.h>
25
26#include "cifsglob.h"
27
28#ifdef CONFIG_CIFS_FSCACHE
29
30extern struct fscache_netfs cifs_fscache_netfs;
31extern const struct fscache_cookie_def cifs_fscache_server_index_def;
32extern const struct fscache_cookie_def cifs_fscache_super_index_def;
33extern const struct fscache_cookie_def cifs_fscache_inode_object_def;
34
35extern int cifs_fscache_register(void);
36extern void cifs_fscache_unregister(void);
37
38/*
39 * fscache.c
40 */
41extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *);
42extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *);
43extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *);
44extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *);
45
46extern void cifs_fscache_release_inode_cookie(struct inode *);
47extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
48extern void cifs_fscache_reset_inode_cookie(struct inode *);
49
50extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
51extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
52extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
53extern int __cifs_readpages_from_fscache(struct inode *,
54 struct address_space *,
55 struct list_head *,
56 unsigned *);
57
58extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
59
60static inline void cifs_fscache_invalidate_page(struct page *page,
61 struct inode *inode)
62{
63 if (PageFsCache(page))
64 __cifs_fscache_invalidate_page(page, inode);
65}
66
67static inline int cifs_readpage_from_fscache(struct inode *inode,
68 struct page *page)
69{
70 if (CIFS_I(inode)->fscache)
71 return __cifs_readpage_from_fscache(inode, page);
72
73 return -ENOBUFS;
74}
75
76static inline int cifs_readpages_from_fscache(struct inode *inode,
77 struct address_space *mapping,
78 struct list_head *pages,
79 unsigned *nr_pages)
80{
81 if (CIFS_I(inode)->fscache)
82 return __cifs_readpages_from_fscache(inode, mapping, pages,
83 nr_pages);
84 return -ENOBUFS;
85}
86
87static inline void cifs_readpage_to_fscache(struct inode *inode,
88 struct page *page)
89{
90 if (PageFsCache(page))
91 __cifs_readpage_to_fscache(inode, page);
92}
93
94#else /* CONFIG_CIFS_FSCACHE */
95static inline int cifs_fscache_register(void) { return 0; }
96static inline void cifs_fscache_unregister(void) {}
97
98static inline void
99cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {}
100static inline void
101cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {}
102static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {}
103static inline void
104cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {}
105
106static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
107static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
108 struct file *filp) {}
109static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
110static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
111{
112 return 1; /* May release page */
113}
114
115static inline void cifs_fscache_invalidate_page(struct page *page,
116 struct inode *inode) {}
117static inline int
118cifs_readpage_from_fscache(struct inode *inode, struct page *page)
119{
120 return -ENOBUFS;
121}
122
123static inline int cifs_readpages_from_fscache(struct inode *inode,
124 struct address_space *mapping,
125 struct list_head *pages,
126 unsigned *nr_pages)
127{
128 return -ENOBUFS;
129}
130
131static inline void cifs_readpage_to_fscache(struct inode *inode,
132 struct page *page) {}
133
134#endif /* CONFIG_CIFS_FSCACHE */
135
136#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 29b9ea244c81..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2008 4 * Copyright (C) International Business Machines Corp., 2002,2010
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31#include "cifs_fs_sb.h" 31#include "cifs_fs_sb.h"
32#include "fscache.h"
32 33
33 34
34static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) 35static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
@@ -86,30 +87,30 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
86{ 87{
87 struct cifsInodeInfo *cifs_i = CIFS_I(inode); 88 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
88 89
89 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid)); 90 cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
90 91
91 if (inode->i_state & I_NEW) { 92 if (inode->i_state & I_NEW) {
92 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid)); 93 cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
93 return; 94 return;
94 } 95 }
95 96
96 /* don't bother with revalidation if we have an oplock */ 97 /* don't bother with revalidation if we have an oplock */
97 if (cifs_i->clientCanCacheRead) { 98 if (cifs_i->clientCanCacheRead) {
98 cFYI(1, ("%s: inode %llu is oplocked", __func__, 99 cFYI(1, "%s: inode %llu is oplocked", __func__,
99 cifs_i->uniqueid)); 100 cifs_i->uniqueid);
100 return; 101 return;
101 } 102 }
102 103
103 /* revalidate if mtime or size have changed */ 104 /* revalidate if mtime or size have changed */
104 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && 105 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
105 cifs_i->server_eof == fattr->cf_eof) { 106 cifs_i->server_eof == fattr->cf_eof) {
106 cFYI(1, ("%s: inode %llu is unchanged", __func__, 107 cFYI(1, "%s: inode %llu is unchanged", __func__,
107 cifs_i->uniqueid)); 108 cifs_i->uniqueid);
108 return; 109 return;
109 } 110 }
110 111
111 cFYI(1, ("%s: invalidating inode %llu mapping", __func__, 112 cFYI(1, "%s: invalidating inode %llu mapping", __func__,
112 cifs_i->uniqueid)); 113 cifs_i->uniqueid);
113 cifs_i->invalid_mapping = true; 114 cifs_i->invalid_mapping = true;
114} 115}
115 116
@@ -137,15 +138,14 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
137 inode->i_mode = fattr->cf_mode; 138 inode->i_mode = fattr->cf_mode;
138 139
139 cifs_i->cifsAttrs = fattr->cf_cifsattrs; 140 cifs_i->cifsAttrs = fattr->cf_cifsattrs;
140 cifs_i->uniqueid = fattr->cf_uniqueid;
141 141
142 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) 142 if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
143 cifs_i->time = 0; 143 cifs_i->time = 0;
144 else 144 else
145 cifs_i->time = jiffies; 145 cifs_i->time = jiffies;
146 146
147 cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode, 147 cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
148 oldtime, cifs_i->time)); 148 oldtime, cifs_i->time);
149 149
150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; 150 cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
151 151
@@ -170,6 +170,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); 170 cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
171} 171}
172 172
173void
174cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
175{
176 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
177
178 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
179 return;
180
181 fattr->cf_uniqueid = iunique(sb, ROOT_I);
182}
183
173/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ 184/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
174void 185void
175cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, 186cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
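
cifs_fill_uniqueid gives callers one place to mint a local inode number
when the mount does not trust server inode numbers (it is a no-op when
CIFS_MOUNT_SERVER_INUM is set). Usage sketch matching the
cifs_get_inode_info_unix change below:

    cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
    if (*pinode == NULL) {
            cifs_fill_uniqueid(sb, &fattr);  /* iunique() if needed */
            *pinode = cifs_iget(sb, &fattr);
    }
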
@@ -227,7 +238,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
227 /* safest to call it a file if we do not know */ 238 /* safest to call it a file if we do not know */
228 fattr->cf_mode |= S_IFREG; 239 fattr->cf_mode |= S_IFREG;
229 fattr->cf_dtype = DT_REG; 240 fattr->cf_dtype = DT_REG;
230 cFYI(1, ("unknown type %d", le32_to_cpu(info->Type))); 241 cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
231 break; 242 break;
232 } 243 }
233 244
@@ -256,7 +267,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
256{ 267{
257 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 268 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
258 269
259 cFYI(1, ("creating fake fattr for DFS referral")); 270 cFYI(1, "creating fake fattr for DFS referral");
260 271
261 memset(fattr, 0, sizeof(*fattr)); 272 memset(fattr, 0, sizeof(*fattr));
262 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; 273 fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -278,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp)
278 struct inode *inode = filp->f_path.dentry->d_inode; 289 struct inode *inode = filp->f_path.dentry->d_inode;
279 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 290 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
280 struct cifsTconInfo *tcon = cifs_sb->tcon; 291 struct cifsTconInfo *tcon = cifs_sb->tcon;
281 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; 292 struct cifsFileInfo *cfile = filp->private_data;
282 293
283 xid = GetXid(); 294 xid = GetXid();
284 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); 295 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -305,7 +316,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
305 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 316 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
306 317
307 tcon = cifs_sb->tcon; 318 tcon = cifs_sb->tcon;
308 cFYI(1, ("Getting info on %s", full_path)); 319 cFYI(1, "Getting info on %s", full_path);
309 320
310 /* could have done a find first instead but this returns more info */ 321 /* could have done a find first instead but this returns more info */
311 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, 322 rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -323,6 +334,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
323 334
324 if (*pinode == NULL) { 335 if (*pinode == NULL) {
325 /* get new inode */ 336 /* get new inode */
337 cifs_fill_uniqueid(sb, &fattr);
326 *pinode = cifs_iget(sb, &fattr); 338 *pinode = cifs_iget(sb, &fattr);
327 if (!*pinode) 339 if (!*pinode)
328 rc = -ENOMEM; 340 rc = -ENOMEM;
@@ -373,7 +385,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
373 &bytes_read, &pbuf, &buf_type); 385 &bytes_read, &pbuf, &buf_type);
374 if ((rc == 0) && (bytes_read >= 8)) { 386 if ((rc == 0) && (bytes_read >= 8)) {
375 if (memcmp("IntxBLK", pbuf, 8) == 0) { 387 if (memcmp("IntxBLK", pbuf, 8) == 0) {
376 cFYI(1, ("Block device")); 388 cFYI(1, "Block device");
377 fattr->cf_mode |= S_IFBLK; 389 fattr->cf_mode |= S_IFBLK;
378 fattr->cf_dtype = DT_BLK; 390 fattr->cf_dtype = DT_BLK;
379 if (bytes_read == 24) { 391 if (bytes_read == 24) {
@@ -385,7 +397,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
385 fattr->cf_rdev = MKDEV(mjr, mnr); 397 fattr->cf_rdev = MKDEV(mjr, mnr);
386 } 398 }
387 } else if (memcmp("IntxCHR", pbuf, 8) == 0) { 399 } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
388 cFYI(1, ("Char device")); 400 cFYI(1, "Char device");
389 fattr->cf_mode |= S_IFCHR; 401 fattr->cf_mode |= S_IFCHR;
390 fattr->cf_dtype = DT_CHR; 402 fattr->cf_dtype = DT_CHR;
391 if (bytes_read == 24) { 403 if (bytes_read == 24) {
@@ -397,7 +409,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
397 fattr->cf_rdev = MKDEV(mjr, mnr); 409 fattr->cf_rdev = MKDEV(mjr, mnr);
398 } 410 }
399 } else if (memcmp("IntxLNK", pbuf, 7) == 0) { 411 } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
400 cFYI(1, ("Symlink")); 412 cFYI(1, "Symlink");
401 fattr->cf_mode |= S_IFLNK; 413 fattr->cf_mode |= S_IFLNK;
402 fattr->cf_dtype = DT_LNK; 414 fattr->cf_dtype = DT_LNK;
403 } else { 415 } else {
@@ -439,10 +451,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
439 else if (rc > 3) { 451 else if (rc > 3) {
440 mode = le32_to_cpu(*((__le32 *)ea_value)); 452 mode = le32_to_cpu(*((__le32 *)ea_value));
441 fattr->cf_mode &= ~SFBITS_MASK; 453 fattr->cf_mode &= ~SFBITS_MASK;
442 cFYI(1, ("special bits 0%o org mode 0%o", mode, 454 cFYI(1, "special bits 0%o org mode 0%o", mode,
443 fattr->cf_mode)); 455 fattr->cf_mode);
444 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; 456 fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
445 cFYI(1, ("special mode bits 0%o", mode)); 457 cFYI(1, "special mode bits 0%o", mode);
446 } 458 }
447 459
448 return 0; 460 return 0;
@@ -504,7 +516,7 @@ int cifs_get_file_info(struct file *filp)
504 struct inode *inode = filp->f_path.dentry->d_inode; 516 struct inode *inode = filp->f_path.dentry->d_inode;
505 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 517 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
506 struct cifsTconInfo *tcon = cifs_sb->tcon; 518 struct cifsTconInfo *tcon = cifs_sb->tcon;
507 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; 519 struct cifsFileInfo *cfile = filp->private_data;
508 520
509 xid = GetXid(); 521 xid = GetXid();
510 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); 522 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
@@ -548,11 +560,11 @@ int cifs_get_inode_info(struct inode **pinode,
548 struct cifs_fattr fattr; 560 struct cifs_fattr fattr;
549 561
550 pTcon = cifs_sb->tcon; 562 pTcon = cifs_sb->tcon;
551 cFYI(1, ("Getting info on %s", full_path)); 563 cFYI(1, "Getting info on %s", full_path);
552 564
553 if ((pfindData == NULL) && (*pinode != NULL)) { 565 if ((pfindData == NULL) && (*pinode != NULL)) {
554 if (CIFS_I(*pinode)->clientCanCacheRead) { 566 if (CIFS_I(*pinode)->clientCanCacheRead) {
555 cFYI(1, ("No need to revalidate cached inode sizes")); 567 cFYI(1, "No need to revalidate cached inode sizes");
556 return rc; 568 return rc;
557 } 569 }
558 } 570 }
@@ -618,7 +630,7 @@ int cifs_get_inode_info(struct inode **pinode,
618 cifs_sb->mnt_cifs_flags & 630 cifs_sb->mnt_cifs_flags &
619 CIFS_MOUNT_MAP_SPECIAL_CHR); 631 CIFS_MOUNT_MAP_SPECIAL_CHR);
620 if (rc1 || !fattr.cf_uniqueid) { 632 if (rc1 || !fattr.cf_uniqueid) {
621 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 633 cFYI(1, "GetSrvInodeNum rc %d", rc1);
622 fattr.cf_uniqueid = iunique(sb, ROOT_I); 634 fattr.cf_uniqueid = iunique(sb, ROOT_I);
623 cifs_autodisable_serverino(cifs_sb); 635 cifs_autodisable_serverino(cifs_sb);
624 } 636 }
@@ -634,13 +646,13 @@ int cifs_get_inode_info(struct inode **pinode,
634 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { 646 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
635 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); 647 tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
636 if (tmprc) 648 if (tmprc)
637 cFYI(1, ("cifs_sfu_type failed: %d", tmprc)); 649 cFYI(1, "cifs_sfu_type failed: %d", tmprc);
638 } 650 }
639 651
640#ifdef CONFIG_CIFS_EXPERIMENTAL 652#ifdef CONFIG_CIFS_EXPERIMENTAL
641 /* fill in 0777 bits from ACL */ 653 /* fill in 0777 bits from ACL */
642 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { 654 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
643 cFYI(1, ("Getting mode bits from ACL")); 655 cFYI(1, "Getting mode bits from ACL");
644 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); 656 cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
645 } 657 }
646#endif 658#endif
@@ -712,18 +724,17 @@ cifs_find_inode(struct inode *inode, void *opaque)
712{ 724{
713 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; 725 struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
714 726
727 /* don't match inode with different uniqueid */
715 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 728 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
716 return 0; 729 return 0;
717 730
718 /* 731 /* don't match inode of different type */
719 * uh oh -- it's a directory. We can't use it since hardlinked dirs are 732 if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
720 * verboten. Disable serverino and return it as if it were found, the 733 return 0;
721 * caller can discard it, generate a uniqueid and retry the find 734
722 */ 735 /* if it's not a directory or has no dentries, then flag it */
723 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { 736 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
724 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; 737 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
725 cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
726 }
727 738
728 return 1; 739 return 1;
729} 740}
@@ -737,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
737 return 0; 748 return 0;
738} 749}
739 750
751/*
752 * walk dentry list for an inode and report whether it has aliases that
753 * are hashed. We use this to determine if a directory inode can actually
754 * be used.
755 */
756static bool
757inode_has_hashed_dentries(struct inode *inode)
758{
759 struct dentry *dentry;
760
761 spin_lock(&dcache_lock);
762 list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
763 if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
764 spin_unlock(&dcache_lock);
765 return true;
766 }
767 }
768 spin_unlock(&dcache_lock);
769 return false;
770}
771
740/* Given fattrs, get a corresponding inode */ 772/* Given fattrs, get a corresponding inode */
741struct inode * 773struct inode *
742cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) 774cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -745,19 +777,23 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
745 struct inode *inode; 777 struct inode *inode;
746 778
747retry_iget5_locked: 779retry_iget5_locked:
748 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); 780 cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
749 781
750 /* hash down to 32-bits on 32-bit arch */ 782 /* hash down to 32-bits on 32-bit arch */
751 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); 783 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
752 784
753 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); 785 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
754 if (inode) { 786 if (inode) {
755 /* was there a problematic inode number collision? */ 787 /* was there a potentially problematic inode collision? */
756 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { 788 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
757 iput(inode);
758 fattr->cf_uniqueid = iunique(sb, ROOT_I);
759 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; 789 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
760 goto retry_iget5_locked; 790
791 if (inode_has_hashed_dentries(inode)) {
792 cifs_autodisable_serverino(CIFS_SB(sb));
793 iput(inode);
794 fattr->cf_uniqueid = iunique(sb, ROOT_I);
795 goto retry_iget5_locked;
796 }
761 } 797 }
762 798
763 cifs_fattr_to_inode(inode, fattr); 799 cifs_fattr_to_inode(inode, fattr);
@@ -765,6 +801,12 @@ retry_iget5_locked:
765 inode->i_flags |= S_NOATIME | S_NOCMTIME; 801 inode->i_flags |= S_NOATIME | S_NOCMTIME;
766 if (inode->i_state & I_NEW) { 802 if (inode->i_state & I_NEW) {
767 inode->i_ino = hash; 803 inode->i_ino = hash;
804 if (S_ISREG(inode->i_mode))
805 inode->i_data.backing_dev_info = sb->s_bdi;
806#ifdef CONFIG_CIFS_FSCACHE
807 /* initialize per-inode cache cookie pointer */
808 CIFS_I(inode)->fscache = NULL;
809#endif
768 unlock_new_inode(inode); 810 unlock_new_inode(inode);
769 } 811 }
770 } 812 }
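
In cifs_iget(), the collision flag is now only acted on when the colliding inode actually has hashed dentries: in that case serverino is autodisabled, the inode is dropped, a fresh uniqueid comes from iunique(), and the lookup restarts; otherwise the cached inode is simply reused. A toy model of just that retry shape (the one-shot flag and the constant stand in for iget5_locked() and iunique()):

#include <stdbool.h>
#include <stdio.h>

static bool collision_pending = true;   /* set once by the match callback */

/* models only the control flow: on a flagged collision, switch to
 * locally generated ids and redo the lookup from the top */
static void iget_retry_demo(unsigned long long *uniqueid)
{
        for (;;) {
                printf("looking for uniqueid=%llu\n", *uniqueid);
                if (collision_pending) {
                        collision_pending = false;  /* autodisable serverino */
                        *uniqueid = 12345;          /* iunique()-style id */
                        continue;                   /* retry_iget5_locked */
                }
                return;                             /* inode found/created */
        }
}

int main(void)
{
        unsigned long long id = 42;

        iget_retry_demo(&id);
        printf("settled on uniqueid=%llu\n", id);
        return 0;
}
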
@@ -794,10 +836,15 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
794 xid, NULL); 836 xid, NULL);
795 837
796 if (!inode) 838 if (!inode)
797 return ERR_PTR(-ENOMEM); 839 return ERR_PTR(rc);
840
841#ifdef CONFIG_CIFS_FSCACHE
842 /* populate tcon->resource_id */
843 cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid;
844#endif
798 845
799 if (rc && cifs_sb->tcon->ipc) { 846 if (rc && cifs_sb->tcon->ipc) {
800 cFYI(1, ("ipc connection - fake read inode")); 847 cFYI(1, "ipc connection - fake read inode");
801 inode->i_mode |= S_IFDIR; 848 inode->i_mode |= S_IFDIR;
802 inode->i_nlink = 2; 849 inode->i_nlink = 2;
803 inode->i_op = &cifs_ipc_inode_ops; 850 inode->i_op = &cifs_ipc_inode_ops;
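
cifs_root_iget() now propagates the caller's errno in the failure pointer instead of hardcoding -ENOMEM, and populates the fscache resource_id from the root inode's uniqueid. For reference, a userspace rendition of the ERR_PTR()/IS_ERR() encoding this relies on (rc must be a nonzero negative errno for the encoded pointer to carry information):

#include <errno.h>
#include <stdio.h>

/* userspace rendition of the kernel ERR_PTR()/IS_ERR() idiom */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-4095;
}

static void *get_root_inode(int fail)
{
        static int inode = 1;          /* placeholder "inode" */

        if (fail)
                return ERR_PTR(-ENOMEM);  /* encode errno in the pointer */
        return &inode;
}

int main(void)
{
        void *inode = get_root_inode(1);

        if (IS_ERR(inode))
                printf("iget failed: %ld\n", PTR_ERR(inode));
        return 0;
}
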
@@ -859,7 +906,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
859 * server times. 906 * server times.
860 */ 907 */
861 if (set_time && (attrs->ia_valid & ATTR_CTIME)) { 908 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
862 cFYI(1, ("CIFS - CTIME changed")); 909 cFYI(1, "CIFS - CTIME changed");
863 info_buf.ChangeTime = 910 info_buf.ChangeTime =
864 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); 911 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
865 } else 912 } else
@@ -894,8 +941,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
894 goto out; 941 goto out;
895 } 942 }
896 943
897 cFYI(1, ("calling SetFileInfo since SetPathInfo for " 944 cFYI(1, "calling SetFileInfo since SetPathInfo for "
898 "times not supported by this server")); 945 "times not supported by this server");
899 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 946 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
900 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, 947 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
901 CREATE_NOT_DIR, &netfid, &oplock, 948 CREATE_NOT_DIR, &netfid, &oplock,
@@ -1053,7 +1100,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1053 struct iattr *attrs = NULL; 1100 struct iattr *attrs = NULL;
1054 __u32 dosattr = 0, origattr = 0; 1101 __u32 dosattr = 0, origattr = 0;
1055 1102
1056 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); 1103 cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
1057 1104
1058 xid = GetXid(); 1105 xid = GetXid();
1059 1106
@@ -1072,7 +1119,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1072 rc = CIFSPOSIXDelFile(xid, tcon, full_path, 1119 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
1073 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 1120 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
1074 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1121 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1075 cFYI(1, ("posix del rc %d", rc)); 1122 cFYI(1, "posix del rc %d", rc);
1076 if ((rc == 0) || (rc == -ENOENT)) 1123 if ((rc == 0) || (rc == -ENOENT))
1077 goto psx_del_no_retry; 1124 goto psx_del_no_retry;
1078 } 1125 }
@@ -1146,7 +1193,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1146 struct inode *newinode = NULL; 1193 struct inode *newinode = NULL;
1147 struct cifs_fattr fattr; 1194 struct cifs_fattr fattr;
1148 1195
1149 cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); 1196 cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
1150 1197
1151 xid = GetXid(); 1198 xid = GetXid();
1152 1199
@@ -1181,7 +1228,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1181 kfree(pInfo); 1228 kfree(pInfo);
1182 goto mkdir_retry_old; 1229 goto mkdir_retry_old;
1183 } else if (rc) { 1230 } else if (rc) {
1184 cFYI(1, ("posix mkdir returned 0x%x", rc)); 1231 cFYI(1, "posix mkdir returned 0x%x", rc);
1185 d_drop(direntry); 1232 d_drop(direntry);
1186 } else { 1233 } else {
1187 if (pInfo->Type == cpu_to_le32(-1)) { 1234 if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1198,6 +1245,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1198 direntry->d_op = &cifs_dentry_ops; 1245 direntry->d_op = &cifs_dentry_ops;
1199 1246
1200 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); 1247 cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
1248 cifs_fill_uniqueid(inode->i_sb, &fattr);
1201 newinode = cifs_iget(inode->i_sb, &fattr); 1249 newinode = cifs_iget(inode->i_sb, &fattr);
1202 if (!newinode) { 1250 if (!newinode) {
1203 kfree(pInfo); 1251 kfree(pInfo);
@@ -1207,12 +1255,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
1207 d_instantiate(direntry, newinode); 1255 d_instantiate(direntry, newinode);
1208 1256
1209#ifdef CONFIG_CIFS_DEBUG2 1257#ifdef CONFIG_CIFS_DEBUG2
1210 cFYI(1, ("instantiated dentry %p %s to inode %p", 1258 cFYI(1, "instantiated dentry %p %s to inode %p",
1211 direntry, direntry->d_name.name, newinode)); 1259 direntry, direntry->d_name.name, newinode);
1212 1260
1213 if (newinode->i_nlink != 2) 1261 if (newinode->i_nlink != 2)
1214 cFYI(1, ("unexpected number of links %d", 1262 cFYI(1, "unexpected number of links %d",
1215 newinode->i_nlink)); 1263 newinode->i_nlink);
1216#endif 1264#endif
1217 } 1265 }
1218 kfree(pInfo); 1266 kfree(pInfo);
@@ -1223,7 +1271,7 @@ mkdir_retry_old:
1223 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, 1271 rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
1224 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1272 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
1225 if (rc) { 1273 if (rc) {
1226 cFYI(1, ("cifs_mkdir returned 0x%x", rc)); 1274 cFYI(1, "cifs_mkdir returned 0x%x", rc);
1227 d_drop(direntry); 1275 d_drop(direntry);
1228 } else { 1276 } else {
1229mkdir_get_info: 1277mkdir_get_info:
@@ -1326,7 +1374,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1326 char *full_path = NULL; 1374 char *full_path = NULL;
1327 struct cifsInodeInfo *cifsInode; 1375 struct cifsInodeInfo *cifsInode;
1328 1376
1329 cFYI(1, ("cifs_rmdir, inode = 0x%p", inode)); 1377 cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
1330 1378
1331 xid = GetXid(); 1379 xid = GetXid();
1332 1380
@@ -1389,6 +1437,10 @@ cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1389 if (rc == 0 || rc != -ETXTBSY) 1437 if (rc == 0 || rc != -ETXTBSY)
1390 return rc; 1438 return rc;
1391 1439
1440 /* open-file renames don't work across directories */
1441 if (to_dentry->d_parent != from_dentry->d_parent)
1442 return rc;
1443
1392 /* open the file to be renamed -- we need DELETE perms */ 1444 /* open the file to be renamed -- we need DELETE perms */
1393 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, 1445 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
1394 CREATE_NOT_DIR, &srcfid, &oplock, NULL, 1446 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
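
The new guard in cifs_do_rename() skips the open-file rename fallback when source and target live in different directories, handing back the earlier error (typically -ETXTBSY) untouched. The shape of that guard, with a hypothetical dentry type:

#include <errno.h>
#include <stdio.h>

struct demo_dentry { struct demo_dentry *parent; };

/* keep the earlier failure when the by-handle fallback cannot apply */
static int rename_fallback(struct demo_dentry *from, struct demo_dentry *to,
                           int rc)
{
        /* open-file renames don't work across directories */
        if (to->parent != from->parent)
                return rc;

        /* ... open the source with DELETE access and rename by fid ... */
        return 0;
}

int main(void)
{
        struct demo_dentry dir1 = { 0 }, dir2 = { 0 };
        struct demo_dentry f = { &dir1 }, t = { &dir2 };

        printf("rc = %d\n", rename_fallback(&f, &t, -ETXTBSY));
        return 0;
}
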
@@ -1412,29 +1464,18 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1412{ 1464{
1413 char *fromName = NULL; 1465 char *fromName = NULL;
1414 char *toName = NULL; 1466 char *toName = NULL;
1415 struct cifs_sb_info *cifs_sb_source; 1467 struct cifs_sb_info *cifs_sb;
1416 struct cifs_sb_info *cifs_sb_target;
1417 struct cifsTconInfo *tcon; 1468 struct cifsTconInfo *tcon;
1418 FILE_UNIX_BASIC_INFO *info_buf_source = NULL; 1469 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1419 FILE_UNIX_BASIC_INFO *info_buf_target; 1470 FILE_UNIX_BASIC_INFO *info_buf_target;
1420 int xid, rc, tmprc; 1471 int xid, rc, tmprc;
1421 1472
1422 cifs_sb_target = CIFS_SB(target_dir->i_sb); 1473 cifs_sb = CIFS_SB(source_dir->i_sb);
1423 cifs_sb_source = CIFS_SB(source_dir->i_sb); 1474 tcon = cifs_sb->tcon;
1424 tcon = cifs_sb_source->tcon;
1425 1475
1426 xid = GetXid(); 1476 xid = GetXid();
1427 1477
1428 /* 1478 /*
1429 * BB: this might be allowed if same server, but different share.
1430 * Consider adding support for this
1431 */
1432 if (tcon != cifs_sb_target->tcon) {
1433 rc = -EXDEV;
1434 goto cifs_rename_exit;
1435 }
1436
1437 /*
1438 * we already have the rename sem so we do not need to 1479 * we already have the rename sem so we do not need to
1439 * grab it again here to protect the path integrity 1480 * grab it again here to protect the path integrity
1440 */ 1481 */
@@ -1469,17 +1510,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1469 info_buf_target = info_buf_source + 1; 1510 info_buf_target = info_buf_source + 1;
1470 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, 1511 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
1471 info_buf_source, 1512 info_buf_source,
1472 cifs_sb_source->local_nls, 1513 cifs_sb->local_nls,
1473 cifs_sb_source->mnt_cifs_flags & 1514 cifs_sb->mnt_cifs_flags &
1474 CIFS_MOUNT_MAP_SPECIAL_CHR); 1515 CIFS_MOUNT_MAP_SPECIAL_CHR);
1475 if (tmprc != 0) 1516 if (tmprc != 0)
1476 goto unlink_target; 1517 goto unlink_target;
1477 1518
1478 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, 1519 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName,
1479 toName, info_buf_target, 1520 info_buf_target,
1480 cifs_sb_target->local_nls, 1521 cifs_sb->local_nls,
1481 /* remap based on source sb */ 1522 cifs_sb->mnt_cifs_flags &
1482 cifs_sb_source->mnt_cifs_flags &
1483 CIFS_MOUNT_MAP_SPECIAL_CHR); 1523 CIFS_MOUNT_MAP_SPECIAL_CHR);
1484 1524
1485 if (tmprc == 0 && (info_buf_source->UniqueId == 1525 if (tmprc == 0 && (info_buf_source->UniqueId ==
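
Since the VFS only ever hands cifs_rename() dentries from one superblock, the cross-tcon -EXDEV check and the separate target cifs_sb are gone; a single cifs_sb now serves both path conversions. The UniqueId comparison this hunk ends on is the usual way to detect that source and target already resolve to the same server inode (hardlinks), which changes how a "target exists" failure should be handled. A trivial sketch of that test:

#include <stdint.h>
#include <stdio.h>

struct demo_unix_info { uint64_t unique_id; };

/* matching server-side UniqueIds mean the two paths are hardlinks
 * of one inode, so "target already exists" needs special handling */
static int same_server_file(const struct demo_unix_info *src,
                            const struct demo_unix_info *dst)
{
        return src->unique_id == dst->unique_id;
}

int main(void)
{
        struct demo_unix_info a = { 7 }, b = { 7 };

        printf("same file: %d\n", same_server_file(&a, &b));
        return 0;
}
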
@@ -1528,6 +1568,11 @@ cifs_inode_needs_reval(struct inode *inode)
1528 if (time_after_eq(jiffies, cifs_i->time + HZ)) 1568 if (time_after_eq(jiffies, cifs_i->time + HZ))
1529 return true; 1569 return true;
1530 1570
1571 /* hardlinked files w/ noserverino get "special" treatment */
1572 if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
1573 S_ISREG(inode->i_mode) && inode->i_nlink != 1)
1574 return true;
1575
1531 return false; 1576 return false;
1532} 1577}
1533 1578
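
cifs_inode_needs_reval() gains a rule: on noserverino mounts, regular files with more than one link always revalidate, since without server inode numbers hardlinks cannot be tied together reliably. The predicate modeled in plain C; the demo fields stand in for the mount flag, S_ISREG(), i_nlink, and the jiffies arithmetic:

#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
        bool server_inum;    /* CIFS_MOUNT_SERVER_INUM still set? */
        bool is_reg;         /* S_ISREG(inode->i_mode) */
        unsigned int nlink;  /* inode->i_nlink */
        long age;            /* jiffies - cifs_i->time */
        long hz;             /* HZ */
};

static bool needs_reval(const struct demo_inode *i)
{
        if (i->age >= i->hz)   /* time_after_eq(jiffies, time + HZ) */
                return true;

        /* hardlinked files w/ noserverino get "special" treatment */
        if (!i->server_inum && i->is_reg && i->nlink != 1)
                return true;

        return false;
}

int main(void)
{
        struct demo_inode fresh_hardlink = { false, true, 2, 0, 100 };

        printf("revalidate: %d\n", needs_reval(&fresh_hardlink));
        return 0;
}
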
@@ -1547,6 +1592,7 @@ cifs_invalidate_mapping(struct inode *inode)
1547 cifs_i->write_behind_rc = rc; 1592 cifs_i->write_behind_rc = rc;
1548 } 1593 }
1549 invalidate_remote_inode(inode); 1594 invalidate_remote_inode(inode);
1595 cifs_fscache_reset_inode_cookie(inode);
1550} 1596}
1551 1597
1552int cifs_revalidate_file(struct file *filp) 1598int cifs_revalidate_file(struct file *filp)
@@ -1594,9 +1640,9 @@ int cifs_revalidate_dentry(struct dentry *dentry)
1594 goto check_inval; 1640 goto check_inval;
1595 } 1641 }
1596 1642
1597 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " 1643 cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1598 "jiffies %ld", full_path, inode, inode->i_count.counter, 1644 "jiffies %ld", full_path, inode, inode->i_count.counter,
1599 dentry, dentry->d_time, jiffies)); 1645 dentry, dentry->d_time, jiffies);
1600 1646
1601 if (CIFS_SB(sb)->tcon->unix_ext) 1647 if (CIFS_SB(sb)->tcon->unix_ext)
1602 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); 1648 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
@@ -1642,26 +1688,16 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
1642 return rc; 1688 return rc;
1643} 1689}
1644 1690
1645static int cifs_vmtruncate(struct inode *inode, loff_t offset) 1691static void cifs_setsize(struct inode *inode, loff_t offset)
1646{ 1692{
1647 loff_t oldsize; 1693 loff_t oldsize;
1648 int err;
1649 1694
1650 spin_lock(&inode->i_lock); 1695 spin_lock(&inode->i_lock);
1651 err = inode_newsize_ok(inode, offset);
1652 if (err) {
1653 spin_unlock(&inode->i_lock);
1654 goto out;
1655 }
1656
1657 oldsize = inode->i_size; 1696 oldsize = inode->i_size;
1658 i_size_write(inode, offset); 1697 i_size_write(inode, offset);
1659 spin_unlock(&inode->i_lock); 1698 spin_unlock(&inode->i_lock);
1699
1660 truncate_pagecache(inode, oldsize, offset); 1700 truncate_pagecache(inode, oldsize, offset);
1661 if (inode->i_op->truncate)
1662 inode->i_op->truncate(inode);
1663out:
1664 return err;
1665} 1701}
1666 1702
1667static int 1703static int
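
cifs_vmtruncate() shrinks into cifs_setsize(): by the time this runs the server has already accepted the new size, so the local inode_newsize_ok() check and the i_op->truncate() callback are dropped, leaving an i_size update under i_lock plus a page-cache trim. A userspace model with a mutex standing in for i_lock:

#include <pthread.h>
#include <stdio.h>

struct demo_inode {
        pthread_mutex_t lock;   /* i_lock stand-in */
        long long size;         /* i_size */
};

static void demo_setsize(struct demo_inode *inode, long long offset)
{
        long long oldsize;

        pthread_mutex_lock(&inode->lock);
        oldsize = inode->size;
        inode->size = offset;           /* i_size_write() */
        pthread_mutex_unlock(&inode->lock);

        /* truncate_pagecache(inode, oldsize, offset) would run here */
        printf("resized %lld -> %lld\n", oldsize, offset);
}

int main(void)
{
        struct demo_inode i = { PTHREAD_MUTEX_INITIALIZER, 4096 };

        demo_setsize(&i, 0);
        return 0;
}
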
@@ -1690,12 +1726,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1690 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1726 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1691 npid, false); 1727 npid, false);
1692 cifsFileInfo_put(open_file); 1728 cifsFileInfo_put(open_file);
1693 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1729 cFYI(1, "SetFSize for attrs rc = %d", rc);
1694 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1730 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1695 unsigned int bytes_written; 1731 unsigned int bytes_written;
1696 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, 1732 rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
1697 &bytes_written, NULL, NULL, 1); 1733 &bytes_written, NULL, NULL, 1);
1698 cFYI(1, ("Wrt seteof rc %d", rc)); 1734 cFYI(1, "Wrt seteof rc %d", rc);
1699 } 1735 }
1700 } else 1736 } else
1701 rc = -EINVAL; 1737 rc = -EINVAL;
@@ -1709,7 +1745,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1709 false, cifs_sb->local_nls, 1745 false, cifs_sb->local_nls,
1710 cifs_sb->mnt_cifs_flags & 1746 cifs_sb->mnt_cifs_flags &
1711 CIFS_MOUNT_MAP_SPECIAL_CHR); 1747 CIFS_MOUNT_MAP_SPECIAL_CHR);
1712 cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); 1748 cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
1713 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1749 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1714 __u16 netfid; 1750 __u16 netfid;
1715 int oplock = 0; 1751 int oplock = 0;
@@ -1726,7 +1762,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1726 attrs->ia_size, 1762 attrs->ia_size,
1727 &bytes_written, NULL, 1763 &bytes_written, NULL,
1728 NULL, 1); 1764 NULL, 1);
1729 cFYI(1, ("wrt seteof rc %d", rc)); 1765 cFYI(1, "wrt seteof rc %d", rc);
1730 CIFSSMBClose(xid, pTcon, netfid); 1766 CIFSSMBClose(xid, pTcon, netfid);
1731 } 1767 }
1732 } 1768 }
@@ -1734,7 +1770,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1734 1770
1735 if (rc == 0) { 1771 if (rc == 0) {
1736 cifsInode->server_eof = attrs->ia_size; 1772 cifsInode->server_eof = attrs->ia_size;
1737 rc = cifs_vmtruncate(inode, attrs->ia_size); 1773 cifs_setsize(inode, attrs->ia_size);
1738 cifs_truncate_page(inode->i_mapping, inode->i_size); 1774 cifs_truncate_page(inode->i_mapping, inode->i_size);
1739 } 1775 }
1740 1776
@@ -1754,19 +1790,17 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1754 struct cifs_unix_set_info_args *args = NULL; 1790 struct cifs_unix_set_info_args *args = NULL;
1755 struct cifsFileInfo *open_file; 1791 struct cifsFileInfo *open_file;
1756 1792
1757 cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", 1793 cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
1758 direntry->d_name.name, attrs->ia_valid)); 1794 direntry->d_name.name, attrs->ia_valid);
1759 1795
1760 xid = GetXid(); 1796 xid = GetXid();
1761 1797
1762 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1798 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1763 /* check if we have permission to change attrs */ 1799 attrs->ia_valid |= ATTR_FORCE;
1764 rc = inode_change_ok(inode, attrs); 1800
1765 if (rc < 0) 1801 rc = inode_change_ok(inode, attrs);
1766 goto out; 1802 if (rc < 0)
1767 else 1803 goto out;
1768 rc = 0;
1769 }
1770 1804
1771 full_path = build_path_from_dentry(direntry); 1805 full_path = build_path_from_dentry(direntry);
1772 if (full_path == NULL) { 1806 if (full_path == NULL) {
@@ -1852,18 +1886,24 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1852 CIFS_MOUNT_MAP_SPECIAL_CHR); 1886 CIFS_MOUNT_MAP_SPECIAL_CHR);
1853 } 1887 }
1854 1888
1855 if (!rc) { 1889 if (rc)
1856 rc = inode_setattr(inode, attrs); 1890 goto out;
1857 1891
1858 /* force revalidate when any of these times are set since some 1892 if ((attrs->ia_valid & ATTR_SIZE) &&
1859 of the fs types (eg ext3, fat) do not have fine enough 1893 attrs->ia_size != i_size_read(inode))
1860 time granularity to match protocol, and we do not have a 1894 truncate_setsize(inode, attrs->ia_size);
1861 a way (yet) to query the server fs's time granularity (and 1895
1862 whether it rounds times down). 1896 setattr_copy(inode, attrs);
1863 */ 1897 mark_inode_dirty(inode);
1864 if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))) 1898
1865 cifsInode->time = 0; 1899 /* force revalidate when any of these times are set since some
1866 } 1900 of the fs types (eg ext3, fat) do not have fine enough
 1901 time granularity to match protocol, and we do not have
1902 a way (yet) to query the server fs's time granularity (and
1903 whether it rounds times down).
1904 */
1905 if (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME))
1906 cifsInode->time = 0;
1867out: 1907out:
1868 kfree(args); 1908 kfree(args);
1869 kfree(full_path); 1909 kfree(full_path);
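
cifs_setattr_unix() now routes every request through inode_change_ok() (NO_PERM mounts simply add ATTR_FORCE so the ownership checks are waived), and on success replaces the removed inode_setattr() with an explicit sequence: truncate_setsize() only when ATTR_SIZE actually changes the size, then setattr_copy() plus mark_inode_dirty(). cifs_setattr_nounix() below gets the same treatment. A condensed model of that success-path ordering; the types and stubs are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_ATTR_SIZE 0x1

struct demo_attrs { unsigned int valid; long long size; };
struct demo_inode { long long size; bool dirty; };

static void demo_truncate_setsize(struct demo_inode *i, long long s)
{
        i->size = s;                  /* i_size update + pagecache trim */
}

static void demo_setattr_copy(struct demo_inode *i, struct demo_attrs *a)
{
        (void)i; (void)a;             /* uid/gid/times/mode copy */
}

/* success path, run only after the server accepted the change */
static void apply_attrs(struct demo_inode *inode, struct demo_attrs *attrs)
{
        if ((attrs->valid & DEMO_ATTR_SIZE) && attrs->size != inode->size)
                demo_truncate_setsize(inode, attrs->size);

        demo_setattr_copy(inode, attrs);
        inode->dirty = true;          /* mark_inode_dirty() */
}

int main(void)
{
        struct demo_inode i = { 100, false };
        struct demo_attrs a = { DEMO_ATTR_SIZE, 0 };

        apply_attrs(&i, &a);
        printf("size=%lld dirty=%d\n", i.size, i.dirty);
        return 0;
}
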
@@ -1885,17 +1925,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1885 1925
1886 xid = GetXid(); 1926 xid = GetXid();
1887 1927
1888 cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", 1928 cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
1889 direntry->d_name.name, attrs->ia_valid)); 1929 direntry->d_name.name, attrs->ia_valid);
1890 1930
1891 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { 1931 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
1892 /* check if we have permission to change attrs */ 1932 attrs->ia_valid |= ATTR_FORCE;
1893 rc = inode_change_ok(inode, attrs); 1933
1894 if (rc < 0) { 1934 rc = inode_change_ok(inode, attrs);
1895 FreeXid(xid); 1935 if (rc < 0) {
1896 return rc; 1936 FreeXid(xid);
1897 } else 1937 return rc;
1898 rc = 0;
1899 } 1938 }
1900 1939
1901 full_path = build_path_from_dentry(direntry); 1940 full_path = build_path_from_dentry(direntry);
@@ -1943,7 +1982,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
1943 attrs->ia_valid &= ~ATTR_MODE; 1982 attrs->ia_valid &= ~ATTR_MODE;
1944 1983
1945 if (attrs->ia_valid & ATTR_MODE) { 1984 if (attrs->ia_valid & ATTR_MODE) {
1946 cFYI(1, ("Mode changed to 0%o", attrs->ia_mode)); 1985 cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
1947 mode = attrs->ia_mode; 1986 mode = attrs->ia_mode;
1948 } 1987 }
1949 1988
@@ -2003,8 +2042,17 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2003 2042
2004 /* do not need local check to inode_check_ok since the server does 2043 /* do not need local check to inode_check_ok since the server does
2005 that */ 2044 that */
2006 if (!rc) 2045 if (rc)
2007 rc = inode_setattr(inode, attrs); 2046 goto cifs_setattr_exit;
2047
2048 if ((attrs->ia_valid & ATTR_SIZE) &&
2049 attrs->ia_size != i_size_read(inode))
2050 truncate_setsize(inode, attrs->ia_size);
2051
2052 setattr_copy(inode, attrs);
2053 mark_inode_dirty(inode);
2054 return 0;
2055
2008cifs_setattr_exit: 2056cifs_setattr_exit:
2009 kfree(full_path); 2057 kfree(full_path);
2010 FreeXid(xid); 2058 FreeXid(xid);
@@ -2029,7 +2077,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
2029#if 0 2077#if 0
2030void cifs_delete_inode(struct inode *inode) 2078void cifs_delete_inode(struct inode *inode)
2031{ 2079{
2032 cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); 2080 cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
2033 /* may have to add back in if and when safe distributed caching of 2081 /* may have to add back in if and when safe distributed caching of
2034 directories added e.g. via FindNotify */ 2082 directories added e.g. via FindNotify */
2035} 2083}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a00..9d38a71c8e14 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -41,13 +41,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
41 __u64 ExtAttrMask = 0; 41 __u64 ExtAttrMask = 0;
42 __u64 caps; 42 __u64 caps;
43 struct cifsTconInfo *tcon; 43 struct cifsTconInfo *tcon;
44 struct cifsFileInfo *pSMBFile = 44 struct cifsFileInfo *pSMBFile = filep->private_data;
45 (struct cifsFileInfo *)filep->private_data;
46#endif /* CONFIG_CIFS_POSIX */ 45#endif /* CONFIG_CIFS_POSIX */
47 46
48 xid = GetXid(); 47 xid = GetXid();
49 48
50 cFYI(1, ("ioctl file %p cmd %u arg %lu", filep, command, arg)); 49 cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg);
51 50
52 cifs_sb = CIFS_SB(inode->i_sb); 51 cifs_sb = CIFS_SB(inode->i_sb);
53 52
@@ -64,12 +63,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
64 63
65 switch (command) { 64 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 65 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 66 cFYI(1, "User unmount attempted");
68 if (cifs_sb->mnt_uid == current_uid()) 67 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 68 rc = 0;
70 else { 69 else {
71 rc = -EACCES; 70 rc = -EACCES;
72 cFYI(1, ("uids do not match")); 71 cFYI(1, "uids do not match");
73 } 72 }
74 break; 73 break;
75#ifdef CONFIG_CIFS_POSIX 74#ifdef CONFIG_CIFS_POSIX
@@ -97,11 +96,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
97 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, 96 /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
98 extAttrBits, &ExtAttrMask);*/ 97 extAttrBits, &ExtAttrMask);*/
99 } 98 }
100 cFYI(1, ("set flags not implemented yet")); 99 cFYI(1, "set flags not implemented yet");
101 break; 100 break;
102#endif /* CONFIG_CIFS_POSIX */ 101#endif /* CONFIG_CIFS_POSIX */
103 default: 102 default:
104 cFYI(1, ("unsupported ioctl")); 103 cFYI(1, "unsupported ioctl");
105 break; 104 break;
106 } 105 }
107 106
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index c1a9d4236a8c..473ca8033656 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -139,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
139 if (!full_path) 139 if (!full_path)
140 goto out; 140 goto out;
141 141
142 cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); 142 cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
143 143
144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, 144 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
145 cifs_sb->local_nls); 145 cifs_sb->local_nls);
@@ -178,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
178 return rc; 178 return rc;
179 } 179 }
180 180
181 cFYI(1, ("Full path: %s", full_path)); 181 cFYI(1, "Full path: %s", full_path);
182 cFYI(1, ("symname is %s", symname)); 182 cFYI(1, "symname is %s", symname);
183 183
184 /* BB what if DFS and this volume is on different share? BB */ 184 /* BB what if DFS and this volume is on different share? BB */
185 if (pTcon->unix_ext) 185 if (pTcon->unix_ext)
@@ -198,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
198 inode->i_sb, xid, NULL); 198 inode->i_sb, xid, NULL);
199 199
200 if (rc != 0) { 200 if (rc != 0) {
201 cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", 201 cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
202 rc)); 202 rc);
203 } else { 203 } else {
204 if (pTcon->nocase) 204 if (pTcon->nocase)
205 direntry->d_op = &cifs_ci_dentry_ops; 205 direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a812..3ccadc1326d6 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
51 if (GlobalTotalActiveXid > GlobalMaxActiveXid) 51 if (GlobalTotalActiveXid > GlobalMaxActiveXid)
52 GlobalMaxActiveXid = GlobalTotalActiveXid; 52 GlobalMaxActiveXid = GlobalTotalActiveXid;
53 if (GlobalTotalActiveXid > 65000) 53 if (GlobalTotalActiveXid > 65000)
54 cFYI(1, ("warning: more than 65000 requests active")); 54 cFYI(1, "warning: more than 65000 requests active");
55 xid = GlobalCurrentXid++; 55 xid = GlobalCurrentXid++;
56 spin_unlock(&GlobalMid_Lock); 56 spin_unlock(&GlobalMid_Lock);
57 return xid; 57 return xid;
@@ -88,7 +88,7 @@ void
88sesInfoFree(struct cifsSesInfo *buf_to_free) 88sesInfoFree(struct cifsSesInfo *buf_to_free)
89{ 89{
90 if (buf_to_free == NULL) { 90 if (buf_to_free == NULL) {
91 cFYI(1, ("Null buffer passed to sesInfoFree")); 91 cFYI(1, "Null buffer passed to sesInfoFree");
92 return; 92 return;
93 } 93 }
94 94
@@ -126,7 +126,7 @@ void
126tconInfoFree(struct cifsTconInfo *buf_to_free) 126tconInfoFree(struct cifsTconInfo *buf_to_free)
127{ 127{
128 if (buf_to_free == NULL) { 128 if (buf_to_free == NULL) {
129 cFYI(1, ("Null buffer passed to tconInfoFree")); 129 cFYI(1, "Null buffer passed to tconInfoFree");
130 return; 130 return;
131 } 131 }
132 atomic_dec(&tconInfoAllocCount); 132 atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
166cifs_buf_release(void *buf_to_free) 166cifs_buf_release(void *buf_to_free)
167{ 167{
168 if (buf_to_free == NULL) { 168 if (buf_to_free == NULL) {
169 /* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/ 169 /* cFYI(1, "Null buffer passed to cifs_buf_release");*/
170 return; 170 return;
171 } 171 }
172 mempool_free(buf_to_free, cifs_req_poolp); 172 mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
202{ 202{
203 203
204 if (buf_to_free == NULL) { 204 if (buf_to_free == NULL) {
205 cFYI(1, ("Null buffer passed to cifs_small_buf_release")); 205 cFYI(1, "Null buffer passed to cifs_small_buf_release");
206 return; 206 return;
207 } 207 }
208 mempool_free(buf_to_free, cifs_sm_req_poolp); 208 mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
345 /* with userid/password pairs found on the smb session */ 345 /* with userid/password pairs found on the smb session */
346 /* for other target tcp/ip addresses BB */ 346 /* for other target tcp/ip addresses BB */
347 if (current_fsuid() != treeCon->ses->linux_uid) { 347 if (current_fsuid() != treeCon->ses->linux_uid) {
348 cFYI(1, ("Multiuser mode and UID " 348 cFYI(1, "Multiuser mode and UID "
349 "did not match tcon uid")); 349 "did not match tcon uid");
350 read_lock(&cifs_tcp_ses_lock); 350 read_lock(&cifs_tcp_ses_lock);
351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 351 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 352 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
353 if (ses->linux_uid == current_fsuid()) { 353 if (ses->linux_uid == current_fsuid()) {
354 if (ses->server == treeCon->ses->server) { 354 if (ses->server == treeCon->ses->server) {
355 cFYI(1, ("found matching uid substitute right smb_uid")); 355 cFYI(1, "found matching uid substitute right smb_uid");
356 buffer->Uid = ses->Suid; 356 buffer->Uid = ses->Suid;
357 break; 357 break;
358 } else { 358 } else {
359 /* BB eventually call cifs_setup_session here */ 359 /* BB eventually call cifs_setup_session here */
360 cFYI(1, ("local UID found but no smb sess with this server exists")); 360 cFYI(1, "local UID found but no smb sess with this server exists");
361 } 361 }
362 } 362 }
363 } 363 }
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 394 if (smb->Command == SMB_COM_LOCKING_ANDX)
395 return 0; 395 return 0;
396 else 396 else
397 cERROR(1, ("Received Request not response")); 397 cERROR(1, "Received Request not response");
398 } 398 }
399 } else { /* bad signature or mid */ 399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) 400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, 401 cERROR(1, "Bad protocol string signature header %x",
402 ("Bad protocol string signature header %x", 402 *(unsigned int *) smb->Protocol);
403 *(unsigned int *) smb->Protocol));
404 if (mid != smb->Mid) 403 if (mid != smb->Mid)
405 cERROR(1, ("Mids do not match")); 404 cERROR(1, "Mids do not match");
406 } 405 }
407 cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid)); 406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
408 return 1; 407 return 1;
409} 408}
410 409
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
413{ 412{
414 __u32 len = smb->smb_buf_length; 413 __u32 len = smb->smb_buf_length;
415 __u32 clc_len; /* calculated length */ 414 __u32 clc_len; /* calculated length */
416 cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); 415 cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
417 416
418 if (length < 2 + sizeof(struct smb_hdr)) { 417 if (length < 2 + sizeof(struct smb_hdr)) {
419 if ((length >= sizeof(struct smb_hdr) - 1) 418 if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
437 tmp[sizeof(struct smb_hdr)+1] = 0; 436 tmp[sizeof(struct smb_hdr)+1] = 0;
438 return 0; 437 return 0;
439 } 438 }
440 cERROR(1, ("rcvd invalid byte count (bcc)")); 439 cERROR(1, "rcvd invalid byte count (bcc)");
441 } else { 440 } else {
442 cERROR(1, ("Length less than smb header size")); 441 cERROR(1, "Length less than smb header size");
443 } 442 }
444 return 1; 443 return 1;
445 } 444 }
446 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 445 if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
447 cERROR(1, ("smb length greater than MaxBufSize, mid=%d", 446 cERROR(1, "smb length greater than MaxBufSize, mid=%d",
448 smb->Mid)); 447 smb->Mid);
449 return 1; 448 return 1;
450 } 449 }
451 450
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
454 clc_len = smbCalcSize_LE(smb); 453 clc_len = smbCalcSize_LE(smb);
455 454
456 if (4 + len != length) { 455 if (4 + len != length) {
457 cERROR(1, ("Length read does not match RFC1001 length %d", 456 cERROR(1, "Length read does not match RFC1001 length %d",
458 len)); 457 len);
459 return 1; 458 return 1;
460 } 459 }
461 460
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
466 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
467 return 0; /* bcc wrapped */ 466 return 0; /* bcc wrapped */
468 } 467 }
469 cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", 468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
470 clc_len, 4 + len, smb->Mid)); 469 clc_len, 4 + len, smb->Mid);
471 /* Windows XP can return a few bytes too much, presumably 470 /* Windows XP can return a few bytes too much, presumably
472 an illegal pad, at the end of byte range lock responses 471 an illegal pad, at the end of byte range lock responses
473 so we allow for that three byte pad, as long as actual 472 so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
482 if ((4+len > clc_len) && (len <= clc_len + 512)) 481 if ((4+len > clc_len) && (len <= clc_len + 512))
483 return 0; 482 return 0;
484 else { 483 else {
485 cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", 484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
486 len, smb->Mid)); 485 len, smb->Mid);
487 return 1; 486 return 1;
488 } 487 }
489 } 488 }
@@ -499,9 +498,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
499 struct cifsTconInfo *tcon; 498 struct cifsTconInfo *tcon;
500 struct cifsInodeInfo *pCifsInode; 499 struct cifsInodeInfo *pCifsInode;
501 struct cifsFileInfo *netfile; 500 struct cifsFileInfo *netfile;
502 int rc;
503 501
504 cFYI(1, ("Checking for oplock break or dnotify response")); 502 cFYI(1, "Checking for oplock break or dnotify response");
505 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && 503 if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
506 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { 504 (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
507 struct smb_com_transaction_change_notify_rsp *pSMBr = 505 struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +511,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
513 511
514 pnotify = (struct file_notify_information *) 512 pnotify = (struct file_notify_information *)
515 ((char *)&pSMBr->hdr.Protocol + data_offset); 513 ((char *)&pSMBr->hdr.Protocol + data_offset);
516 cFYI(1, ("dnotify on %s Action: 0x%x", 514 cFYI(1, "dnotify on %s Action: 0x%x",
517 pnotify->FileName, pnotify->Action)); 515 pnotify->FileName, pnotify->Action);
518 /* cifs_dump_mem("Rcvd notify Data: ",buf, 516 /* cifs_dump_mem("Rcvd notify Data: ",buf,
519 sizeof(struct smb_hdr)+60); */ 517 sizeof(struct smb_hdr)+60); */
520 return true; 518 return true;
521 } 519 }
522 if (pSMBr->hdr.Status.CifsError) { 520 if (pSMBr->hdr.Status.CifsError) {
523 cFYI(1, ("notify err 0x%d", 521 cFYI(1, "notify err 0x%d",
524 pSMBr->hdr.Status.CifsError)); 522 pSMBr->hdr.Status.CifsError);
525 return true; 523 return true;
526 } 524 }
527 return false; 525 return false;
@@ -535,7 +533,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
535 large dirty files cached on the client */ 533 large dirty files cached on the client */
536 if ((NT_STATUS_INVALID_HANDLE) == 534 if ((NT_STATUS_INVALID_HANDLE) ==
537 le32_to_cpu(pSMB->hdr.Status.CifsError)) { 535 le32_to_cpu(pSMB->hdr.Status.CifsError)) {
538 cFYI(1, ("invalid handle on oplock break")); 536 cFYI(1, "invalid handle on oplock break");
539 return true; 537 return true;
540 } else if (ERRbadfid == 538 } else if (ERRbadfid ==
541 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { 539 le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +545,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
547 if (pSMB->hdr.WordCount != 8) 545 if (pSMB->hdr.WordCount != 8)
548 return false; 546 return false;
549 547
550 cFYI(1, ("oplock type 0x%d level 0x%d", 548 cFYI(1, "oplock type 0x%d level 0x%d",
551 pSMB->LockType, pSMB->OplockLevel)); 549 pSMB->LockType, pSMB->OplockLevel);
552 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 550 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
553 return false; 551 return false;
554 552
@@ -579,30 +577,35 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
579 return true; 577 return true;
580 } 578 }
581 579
582 cFYI(1, ("file id match, oplock break")); 580 cFYI(1, "file id match, oplock break");
583 pCifsInode = CIFS_I(netfile->pInode); 581 pCifsInode = CIFS_I(netfile->pInode);
584 pCifsInode->clientCanCacheAll = false; 582 pCifsInode->clientCanCacheAll = false;
585 if (pSMB->OplockLevel == 0) 583 if (pSMB->OplockLevel == 0)
586 pCifsInode->clientCanCacheRead = false; 584 pCifsInode->clientCanCacheRead = false;
587 rc = slow_work_enqueue(&netfile->oplock_break); 585
588 if (rc) { 586 /*
589 cERROR(1, ("failed to enqueue oplock " 587 * cifs_oplock_break_put() can't be called
590 "break: %d\n", rc)); 588 * from here. Get reference after queueing
591 } else { 589 * succeeded. cifs_oplock_break() will
592 netfile->oplock_break_cancelled = false; 590 * synchronize using GlobalSMSSeslock.
593 } 591 */
592 if (queue_work(system_nrt_wq,
593 &netfile->oplock_break))
594 cifs_oplock_break_get(netfile);
595 netfile->oplock_break_cancelled = false;
596
594 read_unlock(&GlobalSMBSeslock); 597 read_unlock(&GlobalSMBSeslock);
595 read_unlock(&cifs_tcp_ses_lock); 598 read_unlock(&cifs_tcp_ses_lock);
596 return true; 599 return true;
597 } 600 }
598 read_unlock(&GlobalSMBSeslock); 601 read_unlock(&GlobalSMBSeslock);
599 read_unlock(&cifs_tcp_ses_lock); 602 read_unlock(&cifs_tcp_ses_lock);
600 cFYI(1, ("No matching file for oplock break")); 603 cFYI(1, "No matching file for oplock break");
601 return true; 604 return true;
602 } 605 }
603 } 606 }
604 read_unlock(&cifs_tcp_ses_lock); 607 read_unlock(&cifs_tcp_ses_lock);
605 cFYI(1, ("Can not process oplock break for non-existent connection")); 608 cFYI(1, "Can not process oplock break for non-existent connection");
606 return true; 609 return true;
607} 610}
608 611
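
The oplock-break path moves from the slow_work engine to queue_work() on system_nrt_wq, and per the new comment the extra file reference is taken only after queueing succeeds: queue_work() returns nonzero unless the work item was already pending, so the worker's put always pairs with exactly one get. That discipline, modeled with a boolean pending flag in place of the real workqueue:

#include <stdbool.h>
#include <stdio.h>

struct demo_netfile {
        int refcount;
        bool work_pending;      /* what queue_work() tests internally */
};

/* returns true only if the work was not already queued */
static bool demo_queue_work(struct demo_netfile *f)
{
        if (f->work_pending)
                return false;
        f->work_pending = true;
        return true;
}

static void demo_oplock_break_get(struct demo_netfile *f) { f->refcount++; }

int main(void)
{
        struct demo_netfile f = { 1, false };

        /* pair the get with a successful enqueue only */
        if (demo_queue_work(&f))
                demo_oplock_break_get(&f);
        if (demo_queue_work(&f))        /* second enqueue is a no-op */
                demo_oplock_break_get(&f);

        printf("refcount=%d (one worker, one extra ref)\n", f.refcount);
        return 0;
}
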
@@ -721,11 +724,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
721{ 724{
722 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { 725 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
723 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; 726 cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
724 cERROR(1, ("Autodisabling the use of server inode numbers on " 727 cERROR(1, "Autodisabling the use of server inode numbers on "
725 "%s. This server doesn't seem to support them " 728 "%s. This server doesn't seem to support them "
726 "properly. Hardlinks will not be recognized on this " 729 "properly. Hardlinks will not be recognized on this "
727 "mount. Consider mounting with the \"noserverino\" " 730 "mount. Consider mounting with the \"noserverino\" "
728 "option to silence this message.", 731 "option to silence this message.",
729 cifs_sb->tcon->treeName)); 732 cifs_sb->tcon->treeName);
730 } 733 }
731} 734}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730d..9aad47a2d62f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
61 {ERRremcd, -EACCES}, 61 {ERRremcd, -EACCES},
62 {ERRdiffdevice, -EXDEV}, 62 {ERRdiffdevice, -EXDEV},
63 {ERRnofiles, -ENOENT}, 63 {ERRnofiles, -ENOENT},
64 {ERRwriteprot, -EROFS},
64 {ERRbadshare, -ETXTBSY}, 65 {ERRbadshare, -ETXTBSY},
65 {ERRlock, -EACCES}, 66 {ERRlock, -EACCES},
66 {ERRunsup, -EINVAL}, 67 {ERRunsup, -EINVAL},
@@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
139 * Returns 0 on failure. 140 * Returns 0 on failure.
140 */ 141 */
141static int 142static int
142cifs_inet_pton(const int address_family, const char *cp, void *dst) 143cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
143{ 144{
144 int ret = 0; 145 int ret = 0;
145 146
146 /* calculate length by finding first slash or NULL */ 147 /* calculate length by finding first slash or NULL */
147 if (address_family == AF_INET) 148 if (address_family == AF_INET)
148 ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); 149 ret = in4_pton(cp, len, dst, '\\', NULL);
149 else if (address_family == AF_INET6) 150 else if (address_family == AF_INET6)
150 ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); 151 ret = in6_pton(cp, len, dst , '\\', NULL);
151 152
152 cFYI(DBG2, ("address conversion returned %d for %s", ret, cp)); 153 cFYI(DBG2, "address conversion returned %d for %*.*s",
154 ret, len, len, cp);
153 if (ret > 0) 155 if (ret > 0)
154 ret = 1; 156 ret = 1;
155 return ret; 157 return ret;
@@ -164,43 +166,70 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
164 * Returns 0 on failure. 166 * Returns 0 on failure.
165 */ 167 */
166int 168int
167cifs_convert_address(char *src, void *dst) 169cifs_convert_address(struct sockaddr *dst, const char *src, int len)
168{ 170{
169 int rc; 171 int rc, alen, slen;
170 char *pct, *endp; 172 const char *pct;
173 char *endp, scope_id[13];
171 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
172 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
173 176
174 /* IPv4 address */ 177 /* IPv4 address */
175 if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { 178 if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
176 s4->sin_family = AF_INET; 179 s4->sin_family = AF_INET;
177 return 1; 180 return 1;
178 } 181 }
179 182
180 /* temporarily terminate string */ 183 /* attempt to exclude the scope ID from the address part */
181 pct = strchr(src, '%'); 184 pct = memchr(src, '%', len);
182 if (pct) 185 alen = pct ? pct - src : len;
183 *pct = '\0';
184
185 rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
186
187 /* repair temp termination (if any) and make pct point to scopeid */
188 if (pct)
189 *pct++ = '%';
190 186
187 rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
191 if (!rc) 188 if (!rc)
192 return rc; 189 return rc;
193 190
194 s6->sin6_family = AF_INET6; 191 s6->sin6_family = AF_INET6;
195 if (pct) { 192 if (pct) {
193 /* grab the scope ID */
194 slen = len - (alen + 1);
195 if (slen <= 0 || slen > 12)
196 return 0;
197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0';
199
196 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
197 if (!*pct || *endp) 201 if (endp != scope_id + slen)
198 return 0; 202 return 0;
199 } 203 }
200 204
201 return rc; 205 return rc;
202} 206}
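
cifs_convert_address() now takes an explicit length and a const source: instead of writing a temporary NUL over the '%', it computes the address length up to the '%' and copies at most 12 scope-ID characters into a local buffer. A standalone parse of the same shape using inet_pton() in place of the kernel's in6_pton(); note that this sketch runs strtoul() over the bounded scope_id copy so the end-pointer check lines up with the buffer it parsed:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* split "addr%scope" without modifying src; returns 0 on failure */
static int parse_v6_with_scope(const char *src, int len,
                               struct sockaddr_in6 *s6)
{
        char buf[INET6_ADDRSTRLEN], scope_id[13], *endp;
        const char *pct = memchr(src, '%', len);
        int alen = pct ? (int)(pct - src) : len;
        int slen;

        if (alen <= 0 || alen >= (int)sizeof(buf))
                return 0;
        memcpy(buf, src, alen);
        buf[alen] = '\0';

        if (inet_pton(AF_INET6, buf, &s6->sin6_addr) != 1)
                return 0;
        s6->sin6_family = AF_INET6;

        if (pct) {
                /* grab the scope ID, bounded to 12 characters */
                slen = len - (alen + 1);
                if (slen <= 0 || slen > 12)
                        return 0;
                memcpy(scope_id, pct + 1, slen);
                scope_id[slen] = '\0';
                s6->sin6_scope_id = (uint32_t)strtoul(scope_id, &endp, 0);
                if (endp != scope_id + slen)
                        return 0;
        }
        return 1;
}

int main(void)
{
        struct sockaddr_in6 s6;
        const char *s = "fe80::1%2";

        printf("ok=%d scope=%u\n",
               parse_v6_with_scope(s, (int)strlen(s), &s6),
               s6.sin6_scope_id);
        return 0;
}
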
203 207
208int
209cifs_set_port(struct sockaddr *addr, const unsigned short int port)
210{
211 switch (addr->sa_family) {
212 case AF_INET:
213 ((struct sockaddr_in *)addr)->sin_port = htons(port);
214 break;
215 case AF_INET6:
216 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
217 break;
218 default:
219 return 0;
220 }
221 return 1;
222}
223
224int
225cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
226 const unsigned short int port)
227{
228 if (!cifs_convert_address(dst, src, len))
229 return 0;
230 return cifs_set_port(dst, port);
231}
232
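
cifs_set_port() dispatches on sa_family so callers can set the port without caring whether they hold an IPv4 or IPv6 sockaddr, and cifs_fill_sockaddr() simply composes the two helpers. The same dispatch in standalone C:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

static int set_port(struct sockaddr *addr, unsigned short port)
{
        switch (addr->sa_family) {
        case AF_INET:
                ((struct sockaddr_in *)addr)->sin_port = htons(port);
                return 1;
        case AF_INET6:
                ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
                return 1;
        default:
                return 0;       /* unknown family: report failure */
        }
}

int main(void)
{
        struct sockaddr_in s4;

        memset(&s4, 0, sizeof(s4));
        s4.sin_family = AF_INET;
        inet_pton(AF_INET, "192.0.2.1", &s4.sin_addr);

        if (set_port((struct sockaddr *)&s4, 445))
                printf("port=%u\n", ntohs(s4.sin_port));
        return 0;
}
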
204/***************************************************************************** 233/*****************************************************************************
205convert a NT status code to a dos class/code 234convert a NT status code to a dos class/code
206 *****************************************************************************/ 235 *****************************************************************************/
@@ -870,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
870 } 899 }
871 /* else ERRHRD class errors or junk - return EIO */ 900 /* else ERRHRD class errors or junk - return EIO */
872 901
873 cFYI(1, ("Mapping smb error code %d to POSIX err %d", 902 cFYI(1, "Mapping smb error code %d to POSIX err %d",
874 smberrcode, rc)); 903 smberrcode, rc);
875 904
876 /* generic corrective action e.g. reconnect SMB session on 905 /* generic corrective action e.g. reconnect SMB session on
877 * ERRbaduid could be added */ 906 * ERRbaduid could be added */
@@ -940,20 +969,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
940 SMB_TIME *st = (SMB_TIME *)&time; 969 SMB_TIME *st = (SMB_TIME *)&time;
941 SMB_DATE *sd = (SMB_DATE *)&date; 970 SMB_DATE *sd = (SMB_DATE *)&date;
942 971
943 cFYI(1, ("date %d time %d", date, time)); 972 cFYI(1, "date %d time %d", date, time);
944 973
945 sec = 2 * st->TwoSeconds; 974 sec = 2 * st->TwoSeconds;
946 min = st->Minutes; 975 min = st->Minutes;
947 if ((sec > 59) || (min > 59)) 976 if ((sec > 59) || (min > 59))
948 cERROR(1, ("illegal time min %d sec %d", min, sec)); 977 cERROR(1, "illegal time min %d sec %d", min, sec);
949 sec += (min * 60); 978 sec += (min * 60);
950 sec += 60 * 60 * st->Hours; 979 sec += 60 * 60 * st->Hours;
951 if (st->Hours > 24) 980 if (st->Hours > 24)
952 cERROR(1, ("illegal hours %d", st->Hours)); 981 cERROR(1, "illegal hours %d", st->Hours);
953 days = sd->Day; 982 days = sd->Day;
954 month = sd->Month; 983 month = sd->Month;
955 if ((days > 31) || (month > 12)) { 984 if ((days > 31) || (month > 12)) {
956 cERROR(1, ("illegal date, month %d day: %d", month, days)); 985 cERROR(1, "illegal date, month %d day: %d", month, days);
957 if (month > 12) 986 if (month > 12)
958 month = 12; 987 month = 12;
959 } 988 }
@@ -979,7 +1008,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
979 1008
980 ts.tv_sec = sec + offset; 1009 ts.tv_sec = sec + offset;
981 1010
982 /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ 1011 /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
983 1012
984 ts.tv_nsec = 0; 1013 ts.tv_nsec = 0;
985 return ts; 1014 return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 18e0bc1fb593..d5e591fab475 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -47,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
47 if (file) { 47 if (file) {
48 cf = file->private_data; 48 cf = file->private_data;
49 if (cf == NULL) { 49 if (cf == NULL) {
50 cFYI(1, ("empty cifs private file data")); 50 cFYI(1, "empty cifs private file data");
51 return; 51 return;
52 } 52 }
53 if (cf->invalidHandle) 53 if (cf->invalidHandle)
54 cFYI(1, ("invalid handle")); 54 cFYI(1, "invalid handle");
55 if (cf->srch_inf.endOfSearch) 55 if (cf->srch_inf.endOfSearch)
56 cFYI(1, ("end of search")); 56 cFYI(1, "end of search");
57 if (cf->srch_inf.emptyDir) 57 if (cf->srch_inf.emptyDir)
58 cFYI(1, ("empty dir")); 58 cFYI(1, "empty dir");
59 } 59 }
60} 60}
61#else 61#else
@@ -76,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
76 struct inode *inode; 76 struct inode *inode;
77 struct super_block *sb = parent->d_inode->i_sb; 77 struct super_block *sb = parent->d_inode->i_sb;
78 78
79 cFYI(1, ("For %s", name->name)); 79 cFYI(1, "For %s", name->name);
80 80
81 if (parent->d_op && parent->d_op->d_hash) 81 if (parent->d_op && parent->d_op->d_hash)
82 parent->d_op->d_hash(parent, name); 82 parent->d_op->d_hash(parent, name);
@@ -214,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
214 fid, 214 fid,
215 cifs_sb->local_nls); 215 cifs_sb->local_nls);
216 if (CIFSSMBClose(xid, ptcon, fid)) { 216 if (CIFSSMBClose(xid, ptcon, fid)) {
217 cFYI(1, ("Error closing temporary reparsepoint open)")); 217 cFYI(1, "Error closing temporary reparsepoint open");
218 } 218 }
219 } 219 }
220} 220}
@@ -252,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
252 if (full_path == NULL) 252 if (full_path == NULL)
253 return -ENOMEM; 253 return -ENOMEM;
254 254
255 cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos)); 255 cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
256 256
257ffirst_retry: 257ffirst_retry:
258 /* test for Unix extensions */ 258 /* test for Unix extensions */
@@ -297,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
297 if (ustr[len] == 0) 297 if (ustr[len] == 0)
298 return len << 1; 298 return len << 1;
299 } 299 }
300 cFYI(1, ("Unicode string longer than PATH_MAX found")); 300 cFYI(1, "Unicode string longer than PATH_MAX found");
301 return len << 1; 301 return len << 1;
302} 302}
303 303
@@ -314,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
314 pfData->FileNameLength; 314 pfData->FileNameLength;
315 } else 315 } else
316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); 316 new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
317 cFYI(1, ("new entry %p old entry %p", new_entry, old_entry)); 317 cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
318 /* validate that new_entry is not past end of SMB */ 318 /* validate that new_entry is not past end of SMB */
319 if (new_entry >= end_of_smb) { 319 if (new_entry >= end_of_smb) {
320 cERROR(1, 320 cERROR(1, "search entry %p began after end of SMB %p old entry %p",
321 ("search entry %p began after end of SMB %p old entry %p", 321 new_entry, end_of_smb, old_entry);
322 new_entry, end_of_smb, old_entry));
323 return NULL; 322 return NULL;
324 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && 323 } else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
325 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) 324 (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
326 || ((level != SMB_FIND_FILE_INFO_STANDARD) && 325 || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
327 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { 326 (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
328 cERROR(1, ("search entry %p extends after end of SMB %p", 327 cERROR(1, "search entry %p extends after end of SMB %p",
329 new_entry, end_of_smb)); 328 new_entry, end_of_smb);
330 return NULL; 329 return NULL;
331 } else 330 } else
332 return new_entry; 331 return new_entry;
@@ -380,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
380 filename = &pFindData->FileName[0]; 379 filename = &pFindData->FileName[0];
381 len = pFindData->FileNameLength; 380 len = pFindData->FileNameLength;
382 } else { 381 } else {
383 cFYI(1, ("Unknown findfirst level %d", 382 cFYI(1, "Unknown findfirst level %d",
384 cfile->srch_inf.info_level)); 383 cfile->srch_inf.info_level);
385 } 384 }
386 385
387 if (filename) { 386 if (filename) {
@@ -481,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
481 len = (unsigned int)pFindData->FileNameLength; 480 len = (unsigned int)pFindData->FileNameLength;
482 cifsFile->srch_inf.resume_key = pFindData->ResumeKey; 481 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
483 } else { 482 } else {
484 cFYI(1, ("Unknown findfirst level %d", level)); 483 cFYI(1, "Unknown findfirst level %d", level);
485 return -EINVAL; 484 return -EINVAL;
486 } 485 }
487 cifsFile->srch_inf.resume_name_len = len; 486 cifsFile->srch_inf.resume_name_len = len;
@@ -525,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
525 is_dir_changed(file)) || 524 is_dir_changed(file)) ||
526 (index_to_find < first_entry_in_buffer)) { 525 (index_to_find < first_entry_in_buffer)) {
527 /* close and restart search */ 526 /* close and restart search */
528 cFYI(1, ("search backing up - close and restart search")); 527 cFYI(1, "search backing up - close and restart search");
529 write_lock(&GlobalSMBSeslock); 528 write_lock(&GlobalSMBSeslock);
530 if (!cifsFile->srch_inf.endOfSearch && 529 if (!cifsFile->srch_inf.endOfSearch &&
531 !cifsFile->invalidHandle) { 530 !cifsFile->invalidHandle) {
@@ -535,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
535 } else 534 } else
536 write_unlock(&GlobalSMBSeslock); 535 write_unlock(&GlobalSMBSeslock);
537 if (cifsFile->srch_inf.ntwrk_buf_start) { 536 if (cifsFile->srch_inf.ntwrk_buf_start) {
538 cFYI(1, ("freeing SMB ff cache buf on search rewind")); 537 cFYI(1, "freeing SMB ff cache buf on search rewind");
539 if (cifsFile->srch_inf.smallBuf) 538 if (cifsFile->srch_inf.smallBuf)
540 cifs_small_buf_release(cifsFile->srch_inf. 539 cifs_small_buf_release(cifsFile->srch_inf.
541 ntwrk_buf_start); 540 ntwrk_buf_start);
@@ -546,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
546 } 545 }
547 rc = initiate_cifs_search(xid, file); 546 rc = initiate_cifs_search(xid, file);
548 if (rc) { 547 if (rc) {
549 cFYI(1, ("error %d reinitiating a search on rewind", 548 cFYI(1, "error %d reinitiating a search on rewind",
550 rc)); 549 rc);
551 return rc; 550 return rc;
552 } 551 }
553 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 552 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -555,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
555 554
556 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 555 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
557 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 556 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
558 cFYI(1, ("calling findnext2")); 557 cFYI(1, "calling findnext2");
559 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 558 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
560 &cifsFile->srch_inf); 559 &cifsFile->srch_inf);
561 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); 560 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -575,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
575 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry 574 first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
576 - cifsFile->srch_inf.entries_in_buffer; 575 - cifsFile->srch_inf.entries_in_buffer;
577 pos_in_buf = index_to_find - first_entry_in_buffer; 576 pos_in_buf = index_to_find - first_entry_in_buffer;
578 cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf)); 577 cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
579 578
 		for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
 			/* go entry by entry figuring out which is first */
@@ -584,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		if ((current_entry == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
-			cERROR(1, ("reached end of buf searching for pos in buf"
+			cERROR(1, "reached end of buf searching for pos in buf"
 				" %d index to find %lld rc %d",
-				pos_in_buf, index_to_find, rc));
+				pos_in_buf, index_to_find, rc);
 		}
 		rc = 0;
 		*ppCurrentEntry = current_entry;
 	} else {
-		cFYI(1, ("index not in buffer - could not findnext into it"));
+		cFYI(1, "index not in buffer - could not findnext into it");
 		return 0;
 	}
 
 	if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
-		cFYI(1, ("can not return entries pos_in_buf beyond last"));
+		cFYI(1, "can not return entries pos_in_buf beyond last");
 		*num_to_ret = 0;
 	} else
 		*num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -656,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 
 	if (len > max_len) {
-		cERROR(1, ("bad search response length %d past smb end", len));
+		cERROR(1, "bad search response length %d past smb end", len);
 		return -EINVAL;
 	}
 
@@ -754,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 	 * case already. Why should we be clobbering other errors from it?
 	 */
 	if (rc) {
-		cFYI(1, ("filldir rc = %d", rc));
+		cFYI(1, "filldir rc = %d", rc);
 		rc = -EOVERFLOW;
 	}
 	dput(tmp_dentry);
@@ -786,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 0:
 		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed"));
+			cERROR(1, "Filldir for current dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -794,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 1:
 		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for parent dir failed"));
+			cERROR(1, "Filldir for parent dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -807,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	if (file->private_data == NULL) {
 		rc = initiate_cifs_search(xid, file);
-		cFYI(1, ("initiate cifs search rc %d", rc));
+		cFYI(1, "initiate cifs search rc %d", rc);
 		if (rc) {
 			FreeXid(xid);
 			return rc;
@@ -821,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	cifsFile = file->private_data;
 	if (cifsFile->srch_inf.endOfSearch) {
 		if (cifsFile->srch_inf.emptyDir) {
-			cFYI(1, ("End of search, empty dir"));
+			cFYI(1, "End of search, empty dir");
 			rc = 0;
 			break;
 		}
@@ -833,26 +832,31 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-			cFYI(1, ("fce error %d", rc));
+			cFYI(1, "fce error %d", rc);
 			goto rddir2_exit;
 		} else if (current_entry != NULL) {
-			cFYI(1, ("entry %lld found", file->f_pos));
+			cFYI(1, "entry %lld found", file->f_pos);
 		} else {
-			cFYI(1, ("could not find entry"));
+			cFYI(1, "could not find entry");
 			goto rddir2_exit;
 		}
-		cFYI(1, ("loop through %d times filling dir for net buf %p",
-			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start));
+		cFYI(1, "loop through %d times filling dir for net buf %p",
+			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
 		max_len = smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
 		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
 
 		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+		if (tmp_buf == NULL) {
+			rc = -ENOMEM;
+			break;
+		}
+
 		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 			if (current_entry == NULL) {
 				/* evaluate whether this case is an error */
-				cERROR(1, ("past SMB end, num to fill %d i %d",
-					num_to_fill, i));
+				cERROR(1, "past SMB end, num to fill %d i %d",
+					num_to_fill, i);
 				break;
 			}
 			/* if buggy server returns . and .. late do
@@ -867,8 +871,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			file->f_pos++;
 			if (file->f_pos ==
 				cifsFile->srch_inf.index_of_last_entry) {
-				cFYI(1, ("last entry in buf at pos %lld %s",
-					file->f_pos, tmp_buf));
+				cFYI(1, "last entry in buf at pos %lld %s",
+					file->f_pos, tmp_buf);
 				cifs_save_resume_key(current_entry, cifsFile);
 				break;
 			} else
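
Note on the readdir.c hunks: they do two things, add a missing NULL check after the kmalloc() of tmp_buf, and strip the extra parentheses from cFYI()/cERROR() calls, a call shape that only works once those macros are variadic. A minimal userspace sketch of that macro conversion (these are simplified stand-ins, not the kernel's actual definitions):

#include <stdio.h>

/* Old style: a single macro parameter forces double parentheses,
 * because the preprocessor cannot split "(fmt, args...)" itself. */
#define cFYI_OLD(set, msg) do { if (set) printf msg; } while (0)

/* New style: a C99 variadic macro takes the format and arguments
 * directly, so callers drop the inner parentheses. ##__VA_ARGS__ is
 * a GNU extension that swallows the comma when no args are given. */
#define cFYI(set, fmt, ...) \
	do { if (set) printf(fmt "\n", ##__VA_ARGS__); } while (0)

int main(void)
{
	int rc = -22;

	cFYI_OLD(1, ("fce error %d\n", rc));	/* old call shape */
	cFYI(1, "fce error %d", rc);		/* new call shape */
	return 0;
}

The variadic form also lets the compiler type-check the format string against its arguments, which the double-parenthesized form hid.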
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7c3fd7463f44..0a57cb7db5dd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -35,9 +35,11 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
-/* Checks if this is the first smb session to be reconnected after
-   the socket has been reestablished (so we know whether to use vc 0).
-   Called while holding the cifs_tcp_ses_lock, so do not block */
+/*
+ * Checks if this is the first smb session to be reconnected after
+ * the socket has been reestablished (so we know whether to use vc 0).
+ * Called while holding the cifs_tcp_ses_lock, so do not block
+ */
 static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
 {
 	struct list_head *tmp;
@@ -284,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	int len;
 	char *data = *pbcc_area;
 
-	cFYI(1, ("bleft %d", bleft));
+	cFYI(1, "bleft %d", bleft);
 
 	/*
 	 * Windows servers do not always double null terminate their final
@@ -301,7 +303,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverOS);
 	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverOS=%s", ses->serverOS));
+	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -310,7 +312,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverNOS);
 	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverNOS=%s", ses->serverNOS));
+	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -319,7 +321,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverDomain);
 	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverDomain=%s", ses->serverDomain));
+	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
 }
@@ -332,7 +334,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	int len;
 	char *bcc_ptr = *pbcc_area;
 
-	cFYI(1, ("decode sessetup ascii. bleft %d", bleft));
+	cFYI(1, "decode sessetup ascii. bleft %d", bleft);
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
@@ -344,7 +346,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
 	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
-		cFYI(1, ("OS/2 server"));
+		cFYI(1, "OS/2 server");
 		ses->flags |= CIFS_SES_OS2;
 	}
 
@@ -373,7 +375,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	/* BB For newer servers which do not support Unicode,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
-	cFYI(1, ("ascii: bytes left %d", bleft));
+	cFYI(1, "ascii: bytes left %d", bleft);
 
 	return rc;
 }
@@ -384,16 +386,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
-		cERROR(1, ("challenge blob len %d too small", blob_len));
+		cERROR(1, "challenge blob len %d too small", blob_len);
 		return -EINVAL;
 	}
 
 	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
-		cERROR(1, ("blob signature incorrect %s", pblob->Signature));
+		cERROR(1, "blob signature incorrect %s", pblob->Signature);
 		return -EINVAL;
 	}
 	if (pblob->MessageType != NtLmChallenge) {
-		cERROR(1, ("Incorrect message type %d", pblob->MessageType));
+		cERROR(1, "Incorrect message type %d", pblob->MessageType);
 		return -EINVAL;
 	}
 
@@ -447,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 					struct cifsSesInfo *ses,
-					const struct nls_table *nls_cp, int first)
+					const struct nls_table *nls_cp, bool first)
 {
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
@@ -546,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, int first_time)
+				  const struct nls_table *nls, bool first_time)
 {
 	int bloblen;
 
@@ -559,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 #endif
 
 int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	       const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int wct;
@@ -577,13 +579,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+	bool first_time;
 
 	if (ses == NULL)
 		return -EINVAL;
 
+	read_lock(&cifs_tcp_ses_lock);
+	first_time = is_first_ses_reconnect(ses);
+	read_unlock(&cifs_tcp_ses_lock);
+
 	type = ses->server->secType;
 
-	cFYI(1, ("sess setup type %d", type));
+	cFYI(1, "sess setup type %d", type);
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -664,7 +671,7 @@ ssetup_ntlmssp_authenticate:
 		   changed to do higher than lanman dialect and
 		   we reconnected would we ever calc signing_key? */
 
-		cFYI(1, ("Negotiating LANMAN setting up strings"));
+		cFYI(1, "Negotiating LANMAN setting up strings");
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
@@ -723,15 +730,7 @@ ssetup_ntlmssp_authenticate:
 
 		/* calculate session key */
 		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
-		if (first_time) /* should this be moved into common code
-				   with similar ntlmv2 path? */
-		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
-				response BB FIXME, v2_sess_key); */
-
-		/* copy session key */
-
-	/*	memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
-		bcc_ptr += LM2_SESS_KEY_SIZE; */
+		/* FIXME: calculate MAC key */
 		memcpy(bcc_ptr, (char *)v2_sess_key,
 		       sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
@@ -744,7 +743,7 @@ ssetup_ntlmssp_authenticate:
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos || type == MSKerberos) {
+	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -758,17 +757,17 @@ ssetup_ntlmssp_authenticate:
 		/* check version field to make sure that cifs.upcall is
 		   sending us a response in an expected form */
 		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
-			cERROR(1, ("incorrect version of cifs.upcall (expected"
+			cERROR(1, "incorrect version of cifs.upcall (expected"
 				   " %d but got %d)",
-				      CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+				      CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
-			cERROR(1, ("Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len));
+			cERROR(1, "Kerberos signing key too long (%u bytes)",
+				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
@@ -796,7 +795,7 @@ ssetup_ntlmssp_authenticate:
 		/* BB: is this right? */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #else /* ! CONFIG_CIFS_UPCALL */
-		cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+		cERROR(1, "Kerberos negotiated but upcall support disabled!");
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
@@ -804,12 +803,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (type == RawNTLMSSP) {
 		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-			cERROR(1, ("NTLMSSP requires Unicode support"));
+			cERROR(1, "NTLMSSP requires Unicode support");
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
 
-		cFYI(1, ("ntlmssp session setup phase %d", phase));
+		cFYI(1, "ntlmssp session setup phase %d", phase);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -827,7 +826,7 @@ ssetup_ntlmssp_authenticate:
 			   on the response (challenge) */
 			smb_buf->Uid = ses->Suid;
 		} else {
-			cERROR(1, ("invalid phase %d", phase));
+			cERROR(1, "invalid phase %d", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
@@ -839,12 +838,12 @@ ssetup_ntlmssp_authenticate:
 		}
 		unicode_oslm_strings(&bcc_ptr, nls_cp);
 	} else {
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 	}
 #else
-	cERROR(1, ("secType %d not supported!", type));
+	cERROR(1, "secType %d not supported!", type);
 	rc = -ENOSYS;
 	goto ssetup_exit;
 #endif
@@ -862,7 +861,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -870,7 +869,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -882,14 +881,14 @@ ssetup_ntlmssp_authenticate:
 
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -899,7 +898,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -933,7 +932,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
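
Note on the sess.c hunks: besides the cFYI/cERROR cleanup, they fold the old first_time parameter into CIFS_SessSetup() itself; the flag is now computed once, under cifs_tcp_ses_lock, via is_first_ses_reconnect(), so callers no longer have to thread it through. A toy illustration of the pattern, using a pthread rwlock as a stand-in for the kernel's read lock (all names here are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the session list and the lock that protects it. */
static pthread_rwlock_t ses_lock = PTHREAD_RWLOCK_INITIALIZER;
static int sessions_established;	/* toy substitute for the list walk */

/* Must be called with ses_lock held for reading; must not block. */
static bool is_first_reconnect(void)
{
	return sessions_established == 0;
}

static int sess_setup(void)
{
	bool first_time;

	/* Take the snapshot once, under the lock, instead of asking
	 * every caller to pass the flag in. */
	pthread_rwlock_rdlock(&ses_lock);
	first_time = is_first_reconnect();
	pthread_rwlock_unlock(&ses_lock);

	printf("first_time=%d\n", first_time);
	return 0;
}

int main(void)
{
	return sess_setup();
}

Pushing the computation into the callee removes one parameter from a hot public entry point and guarantees the lock is actually held when the list is inspected.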
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index c5084d27db7c..7f16cb825fe5 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -76,6 +76,7 @@
 #define ERRnofiles		18	/* A File Search command can find no
 					   more files matching the specified
 					   criteria. */
+#define ERRwriteprot		19	/* media is write protected */
 #define ERRgeneral		31
 #define ERRbadshare		32	/* The sharing mode specified for an
 					   Open conflicts with existing FIDs on
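
Note on the smberr.h hunk: ERRwriteprot (19) fills a gap in the DOS-class error space between ERRnofiles and ERRgeneral. Codes like these ultimately get mapped to errno values; a hypothetical sketch of such a table (the mapping below is illustrative, not lifted from the driver):

#include <errno.h>
#include <stdio.h>

#define ERRnofiles	18	/* no more files match the search */
#define ERRwriteprot	19	/* media is write protected */
#define ERRgeneral	31

/* Toy mapping from SMB DOS-class error codes to errno values. */
static int smb_to_errno(int smb_err)
{
	switch (smb_err) {
	case ERRnofiles:	return ENOENT;
	case ERRwriteprot:	return EROFS;
	default:		return EIO;
	}
}

int main(void)
{
	printf("ERRwriteprot -> errno %d\n", smb_to_errno(ERRwriteprot));
	return 0;
}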
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ad081fe7eb18..82f78c4d6978 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -35,7 +35,6 @@
35#include "cifs_debug.h" 35#include "cifs_debug.h"
36 36
37extern mempool_t *cifs_mid_poolp; 37extern mempool_t *cifs_mid_poolp;
38extern struct kmem_cache *cifs_oplock_cachep;
39 38
40static struct mid_q_entry * 39static struct mid_q_entry *
41AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
@@ -43,7 +42,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
43 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
44 43
45 if (server == NULL) { 44 if (server == NULL) {
46 cERROR(1, ("Null TCP session in AllocMidQEntry")); 45 cERROR(1, "Null TCP session in AllocMidQEntry");
47 return NULL; 46 return NULL;
48 } 47 }
49 48
@@ -55,7 +54,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
55 temp->mid = smb_buffer->Mid; /* always LE */ 54 temp->mid = smb_buffer->Mid; /* always LE */
56 temp->pid = current->pid; 55 temp->pid = current->pid;
57 temp->command = smb_buffer->Command; 56 temp->command = smb_buffer->Command;
58 cFYI(1, ("For smb_command %d", temp->command)); 57 cFYI(1, "For smb_command %d", temp->command);
59 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
60 /* when mid allocated can be before when sent */ 59 /* when mid allocated can be before when sent */
61 temp->when_alloc = jiffies; 60 temp->when_alloc = jiffies;
@@ -140,7 +139,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
140 total_len += iov[i].iov_len; 139 total_len += iov[i].iov_len;
141 140
142 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); 141 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
143 cFYI(1, ("Sending smb: total_len %d", total_len)); 142 cFYI(1, "Sending smb: total_len %d", total_len);
144 dump_smb(smb_buffer, len); 143 dump_smb(smb_buffer, len);
145 144
146 i = 0; 145 i = 0;
@@ -168,9 +167,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
168 reconnect which may clear the network problem. 167 reconnect which may clear the network problem.
169 */ 168 */
170 if ((i >= 14) || (!server->noblocksnd && (i > 2))) { 169 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
171 cERROR(1, 170 cERROR(1, "sends on sock %p stuck for 15 seconds",
172 ("sends on sock %p stuck for 15 seconds", 171 ssocket);
173 ssocket));
174 rc = -EAGAIN; 172 rc = -EAGAIN;
175 break; 173 break;
176 } 174 }
@@ -184,13 +182,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
184 total_len = 0; 182 total_len = 0;
185 break; 183 break;
186 } else if (rc > total_len) { 184 } else if (rc > total_len) {
187 cERROR(1, ("sent %d requested %d", rc, total_len)); 185 cERROR(1, "sent %d requested %d", rc, total_len);
188 break; 186 break;
189 } 187 }
190 if (rc == 0) { 188 if (rc == 0) {
191 /* should never happen, letting socket clear before 189 /* should never happen, letting socket clear before
192 retrying is our only obvious option here */ 190 retrying is our only obvious option here */
193 cERROR(1, ("tcp sent no data")); 191 cERROR(1, "tcp sent no data");
194 msleep(500); 192 msleep(500);
195 continue; 193 continue;
196 } 194 }
@@ -213,8 +211,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
213 } 211 }
214 212
215 if ((total_len > 0) && (total_len != smb_buf_length + 4)) { 213 if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
216 cFYI(1, ("partial send (%d remaining), terminating session", 214 cFYI(1, "partial send (%d remaining), terminating session",
217 total_len)); 215 total_len);
218 /* If we have only sent part of an SMB then the next SMB 216 /* If we have only sent part of an SMB then the next SMB
219 could be taken as the remainder of this one. We need 217 could be taken as the remainder of this one. We need
220 to kill the socket so the server throws away the partial 218 to kill the socket so the server throws away the partial
@@ -223,7 +221,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
223 } 221 }
224 222
225 if (rc < 0) { 223 if (rc < 0) {
226 cERROR(1, ("Error %d sending data on socket to server", rc)); 224 cERROR(1, "Error %d sending data on socket to server", rc);
227 } else 225 } else
228 rc = 0; 226 rc = 0;
229 227
@@ -296,7 +294,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
296 } 294 }
297 295
298 if (ses->server->tcpStatus == CifsNeedReconnect) { 296 if (ses->server->tcpStatus == CifsNeedReconnect) {
299 cFYI(1, ("tcp session dead - return to caller to retry")); 297 cFYI(1, "tcp session dead - return to caller to retry");
300 return -EAGAIN; 298 return -EAGAIN;
301 } 299 }
302 300
@@ -348,7 +346,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
348 lrt += time_to_wait; 346 lrt += time_to_wait;
349 if (time_after(jiffies, lrt)) { 347 if (time_after(jiffies, lrt)) {
350 /* No replies for time_to_wait. */ 348 /* No replies for time_to_wait. */
351 cERROR(1, ("server not responding")); 349 cERROR(1, "server not responding");
352 return -1; 350 return -1;
353 } 351 }
354 } else { 352 } else {
@@ -379,7 +377,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
379 iov[0].iov_len = in_buf->smb_buf_length + 4; 377 iov[0].iov_len = in_buf->smb_buf_length + 4;
380 flags |= CIFS_NO_RESP; 378 flags |= CIFS_NO_RESP;
381 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); 379 rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
382 cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc)); 380 cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
383 381
384 return rc; 382 return rc;
385} 383}
@@ -402,7 +400,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
402 400
403 if ((ses == NULL) || (ses->server == NULL)) { 401 if ((ses == NULL) || (ses->server == NULL)) {
404 cifs_small_buf_release(in_buf); 402 cifs_small_buf_release(in_buf);
405 cERROR(1, ("Null session")); 403 cERROR(1, "Null session");
406 return -EIO; 404 return -EIO;
407 } 405 }
408 406
@@ -471,7 +469,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
471 else if (long_op == CIFS_BLOCKING_OP) 469 else if (long_op == CIFS_BLOCKING_OP)
472 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */ 470 timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
473 else { 471 else {
474 cERROR(1, ("unknown timeout flag %d", long_op)); 472 cERROR(1, "unknown timeout flag %d", long_op);
475 rc = -EIO; 473 rc = -EIO;
476 goto out; 474 goto out;
477 } 475 }
@@ -490,8 +488,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
490 spin_lock(&GlobalMid_Lock); 488 spin_lock(&GlobalMid_Lock);
491 489
492 if (midQ->resp_buf == NULL) { 490 if (midQ->resp_buf == NULL) {
493 cERROR(1, ("No response to cmd %d mid %d", 491 cERROR(1, "No response to cmd %d mid %d",
494 midQ->command, midQ->mid)); 492 midQ->command, midQ->mid);
495 if (midQ->midState == MID_REQUEST_SUBMITTED) { 493 if (midQ->midState == MID_REQUEST_SUBMITTED) {
496 if (ses->server->tcpStatus == CifsExiting) 494 if (ses->server->tcpStatus == CifsExiting)
497 rc = -EHOSTDOWN; 495 rc = -EHOSTDOWN;
@@ -504,7 +502,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
504 if (rc != -EHOSTDOWN) { 502 if (rc != -EHOSTDOWN) {
505 if (midQ->midState == MID_RETRY_NEEDED) { 503 if (midQ->midState == MID_RETRY_NEEDED) {
506 rc = -EAGAIN; 504 rc = -EAGAIN;
507 cFYI(1, ("marking request for retry")); 505 cFYI(1, "marking request for retry");
508 } else { 506 } else {
509 rc = -EIO; 507 rc = -EIO;
510 } 508 }
@@ -521,8 +519,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
521 receive_len = midQ->resp_buf->smb_buf_length; 519 receive_len = midQ->resp_buf->smb_buf_length;
522 520
523 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 521 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
524 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 522 cERROR(1, "Frame too large received. Length: %d Xid: %d",
525 receive_len, xid)); 523 receive_len, xid);
526 rc = -EIO; 524 rc = -EIO;
527 goto out; 525 goto out;
528 } 526 }
@@ -548,7 +546,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
548 &ses->server->mac_signing_key, 546 &ses->server->mac_signing_key,
549 midQ->sequence_number+1); 547 midQ->sequence_number+1);
550 if (rc) { 548 if (rc) {
551 cERROR(1, ("Unexpected SMB signature")); 549 cERROR(1, "Unexpected SMB signature");
552 /* BB FIXME add code to kill session */ 550 /* BB FIXME add code to kill session */
553 } 551 }
554 } 552 }
@@ -569,7 +567,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
569 DeleteMidQEntry */ 567 DeleteMidQEntry */
570 } else { 568 } else {
571 rc = -EIO; 569 rc = -EIO;
572 cFYI(1, ("Bad MID state?")); 570 cFYI(1, "Bad MID state?");
573 } 571 }
574 572
575out: 573out:
@@ -591,11 +589,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
591 struct mid_q_entry *midQ; 589 struct mid_q_entry *midQ;
592 590
593 if (ses == NULL) { 591 if (ses == NULL) {
594 cERROR(1, ("Null smb session")); 592 cERROR(1, "Null smb session");
595 return -EIO; 593 return -EIO;
596 } 594 }
597 if (ses->server == NULL) { 595 if (ses->server == NULL) {
598 cERROR(1, ("Null tcp session")); 596 cERROR(1, "Null tcp session");
599 return -EIO; 597 return -EIO;
600 } 598 }
601 599
@@ -607,8 +605,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
607 use ses->maxReq */ 605 use ses->maxReq */
608 606
609 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 607 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
610 cERROR(1, ("Illegal length, greater than maximum frame, %d", 608 cERROR(1, "Illegal length, greater than maximum frame, %d",
611 in_buf->smb_buf_length)); 609 in_buf->smb_buf_length);
612 return -EIO; 610 return -EIO;
613 } 611 }
614 612
@@ -665,7 +663,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
665 else if (long_op == CIFS_BLOCKING_OP) 663 else if (long_op == CIFS_BLOCKING_OP)
666 timeout = 0x7FFFFFFF; /* large but no so large as to wrap */ 664 timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
667 else { 665 else {
668 cERROR(1, ("unknown timeout flag %d", long_op)); 666 cERROR(1, "unknown timeout flag %d", long_op);
669 rc = -EIO; 667 rc = -EIO;
670 goto out; 668 goto out;
671 } 669 }
@@ -681,8 +679,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
681 679
682 spin_lock(&GlobalMid_Lock); 680 spin_lock(&GlobalMid_Lock);
683 if (midQ->resp_buf == NULL) { 681 if (midQ->resp_buf == NULL) {
684 cERROR(1, ("No response for cmd %d mid %d", 682 cERROR(1, "No response for cmd %d mid %d",
685 midQ->command, midQ->mid)); 683 midQ->command, midQ->mid);
686 if (midQ->midState == MID_REQUEST_SUBMITTED) { 684 if (midQ->midState == MID_REQUEST_SUBMITTED) {
687 if (ses->server->tcpStatus == CifsExiting) 685 if (ses->server->tcpStatus == CifsExiting)
688 rc = -EHOSTDOWN; 686 rc = -EHOSTDOWN;
@@ -695,7 +693,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
695 if (rc != -EHOSTDOWN) { 693 if (rc != -EHOSTDOWN) {
696 if (midQ->midState == MID_RETRY_NEEDED) { 694 if (midQ->midState == MID_RETRY_NEEDED) {
697 rc = -EAGAIN; 695 rc = -EAGAIN;
698 cFYI(1, ("marking request for retry")); 696 cFYI(1, "marking request for retry");
699 } else { 697 } else {
700 rc = -EIO; 698 rc = -EIO;
701 } 699 }
@@ -712,8 +710,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
712 receive_len = midQ->resp_buf->smb_buf_length; 710 receive_len = midQ->resp_buf->smb_buf_length;
713 711
714 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 712 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
715 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 713 cERROR(1, "Frame too large received. Length: %d Xid: %d",
716 receive_len, xid)); 714 receive_len, xid);
717 rc = -EIO; 715 rc = -EIO;
718 goto out; 716 goto out;
719 } 717 }
@@ -736,7 +734,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
736 &ses->server->mac_signing_key, 734 &ses->server->mac_signing_key,
737 midQ->sequence_number+1); 735 midQ->sequence_number+1);
738 if (rc) { 736 if (rc) {
739 cERROR(1, ("Unexpected SMB signature")); 737 cERROR(1, "Unexpected SMB signature");
740 /* BB FIXME add code to kill session */ 738 /* BB FIXME add code to kill session */
741 } 739 }
742 } 740 }
@@ -753,7 +751,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
753 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 751 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
754 } else { 752 } else {
755 rc = -EIO; 753 rc = -EIO;
756 cERROR(1, ("Bad MID state?")); 754 cERROR(1, "Bad MID state?");
757 } 755 }
758 756
759out: 757out:
@@ -824,13 +822,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
824 struct cifsSesInfo *ses; 822 struct cifsSesInfo *ses;
825 823
826 if (tcon == NULL || tcon->ses == NULL) { 824 if (tcon == NULL || tcon->ses == NULL) {
827 cERROR(1, ("Null smb session")); 825 cERROR(1, "Null smb session");
828 return -EIO; 826 return -EIO;
829 } 827 }
830 ses = tcon->ses; 828 ses = tcon->ses;
831 829
832 if (ses->server == NULL) { 830 if (ses->server == NULL) {
833 cERROR(1, ("Null tcp session")); 831 cERROR(1, "Null tcp session");
834 return -EIO; 832 return -EIO;
835 } 833 }
836 834
@@ -842,8 +840,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
842 use ses->maxReq */ 840 use ses->maxReq */
843 841
844 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 842 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
845 cERROR(1, ("Illegal length, greater than maximum frame, %d", 843 cERROR(1, "Illegal length, greater than maximum frame, %d",
846 in_buf->smb_buf_length)); 844 in_buf->smb_buf_length);
847 return -EIO; 845 return -EIO;
848 } 846 }
849 847
@@ -933,8 +931,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
933 spin_unlock(&GlobalMid_Lock); 931 spin_unlock(&GlobalMid_Lock);
934 receive_len = midQ->resp_buf->smb_buf_length; 932 receive_len = midQ->resp_buf->smb_buf_length;
935 } else { 933 } else {
936 cERROR(1, ("No response for cmd %d mid %d", 934 cERROR(1, "No response for cmd %d mid %d",
937 midQ->command, midQ->mid)); 935 midQ->command, midQ->mid);
938 if (midQ->midState == MID_REQUEST_SUBMITTED) { 936 if (midQ->midState == MID_REQUEST_SUBMITTED) {
939 if (ses->server->tcpStatus == CifsExiting) 937 if (ses->server->tcpStatus == CifsExiting)
940 rc = -EHOSTDOWN; 938 rc = -EHOSTDOWN;
@@ -947,7 +945,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
947 if (rc != -EHOSTDOWN) { 945 if (rc != -EHOSTDOWN) {
948 if (midQ->midState == MID_RETRY_NEEDED) { 946 if (midQ->midState == MID_RETRY_NEEDED) {
949 rc = -EAGAIN; 947 rc = -EAGAIN;
950 cFYI(1, ("marking request for retry")); 948 cFYI(1, "marking request for retry");
951 } else { 949 } else {
952 rc = -EIO; 950 rc = -EIO;
953 } 951 }
@@ -958,8 +956,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
958 } 956 }
959 957
960 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 958 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
961 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 959 cERROR(1, "Frame too large received. Length: %d Xid: %d",
962 receive_len, xid)); 960 receive_len, xid);
963 rc = -EIO; 961 rc = -EIO;
964 goto out; 962 goto out;
965 } 963 }
@@ -968,7 +966,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
968 966
969 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) { 967 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
970 rc = -EIO; 968 rc = -EIO;
971 cERROR(1, ("Bad MID state?")); 969 cERROR(1, "Bad MID state?");
972 goto out; 970 goto out;
973 } 971 }
974 972
@@ -986,7 +984,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
986 &ses->server->mac_signing_key, 984 &ses->server->mac_signing_key,
987 midQ->sequence_number+1); 985 midQ->sequence_number+1);
988 if (rc) { 986 if (rc) {
989 cERROR(1, ("Unexpected SMB signature")); 987 cERROR(1, "Unexpected SMB signature");
990 /* BB FIXME add code to kill session */ 988 /* BB FIXME add code to kill session */
991 } 989 }
992 } 990 }
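
Note on the transport.c hunks: most of the churn is the same cFYI/cERROR conversion (plus dropping the unused cifs_oplock_cachep extern), but the smb_sendv() hunks also show its bounded-stall policy: a blocked socket is retried only a limited number of times before the send is declared stuck and the caller is told to retry at a higher level. A standalone sketch of that policy under stated assumptions (plain POSIX write(), illustrative retry bound):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Push a whole buffer out, tolerating short writes and transient
 * EAGAIN/EINTR, but give up after a bounded number of stalls, much
 * as the SMB send path above gives up after ~15 seconds. */
static int send_all(int fd, const char *buf, size_t len)
{
	int stalls = 0;

	while (len > 0) {
		ssize_t n = write(fd, buf, len);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			if (errno == EAGAIN && ++stalls < 15) {
				usleep(1000);	/* let the socket drain */
				continue;
			}
			return -1;	/* stuck: report the partial send */
		}
		buf += n;
		len -= (size_t)n;
	}
	return 0;
}

int main(void)
{
	const char msg[] = "hello\n";

	return send_all(STDOUT_FILENO, msg, strlen(msg)) ? 1 : 0;
}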
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f555ce077d4f..a1509207bfa6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -70,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		     ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+			ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -131,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 	   search server for EAs or streams to
 	   returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}
 
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");
 
 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -169,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_ACCESS, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("set POSIX ACL rc %d", rc));
+		cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-		cFYI(1, ("set POSIX ACL not supported"));
+		cFYI(1, "set POSIX ACL not supported");
 #endif
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 		   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -182,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 				ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("set POSIX default ACL rc %d", rc));
+		cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-		cFYI(1, ("set default POSIX ACL not supported"));
+		cFYI(1, "set default POSIX ACL not supported");
 #endif
 	} else {
-		cFYI(1, ("illegal xattr request %s (only user namespace"
-			" supported)", ea_name));
+		cFYI(1, "illegal xattr request %s (only user namespace"
+			" supported)", ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		   system and perhaps security prefixes? */
@@ -235,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
@@ -287,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -299,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		  ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		  "illegal xattr request %s (only user namespace supported)",
+			ea_name);
 
 	/* We could add an additional check for streams ie
 	   if proc/fs/cifs/streamstoxattr is set then
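
Note on the xattr.c hunks: the policy they preserve is that only the user. (and os2.) namespaces are passed through to the server; trusted., security., and unprefixed names are logged and refused. A minimal userspace version of that prefix check (macro names are illustrative stand-ins for the driver's):

#include <stdio.h>
#include <string.h>

#define XATTR_USER_PREFIX	"user."
#define XATTR_OS2_PREFIX	"os2."

/* Accept only the namespaces the code above handles; anything else
 * is rejected the way cifs_setxattr logs an illegal request. */
static int xattr_name_ok(const char *name)
{
	if (name == NULL)
		return 0;
	return strncmp(name, XATTR_USER_PREFIX, strlen(XATTR_USER_PREFIX)) == 0 ||
	       strncmp(name, XATTR_OS2_PREFIX, strlen(XATTR_OS2_PREFIX)) == 0;
}

int main(void)
{
	printf("user.comment: %d\n", xattr_name_ok("user.comment"));
	printf("trusted.foo:  %d\n", xattr_name_ok("trusted.foo"));
	return 0;
}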
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
 
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
-	       int datasync);
+int coda_fsync(struct file *coda_file, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
+int coda_fsync(struct file *coda_file, int datasync)
 {
 	struct file *host_file;
-	struct inode *coda_inode = coda_dentry->d_inode;
+	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
 	struct coda_file_info *cfi;
 	int err = 0;
 
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
+	err = vfs_fsync(host_file, datasync);
 	if ( !err && !datasync ) {
 		lock_kernel();
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
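
Note on the coda fsync hunks: with the dentry argument dropped from the fsync path, coda_fsync() recovers the inode from the file itself via file->f_path.dentry->d_inode. A toy model of that derivation using stand-in structs (not the real VFS types):

#include <stdio.h>

/* Minimal stand-ins for the VFS objects involved. */
struct inode { long i_ino; };
struct dentry { struct inode *d_inode; };
struct path { struct dentry *dentry; };
struct file { struct path f_path; };

/* After the change, fsync no longer receives a dentry: the one
 * interesting object, the inode, is reachable from the file. */
static int toy_fsync(struct file *file, int datasync)
{
	struct inode *inode = file->f_path.dentry->d_inode;

	printf("fsync inode %ld datasync=%d\n", inode->i_ino, datasync);
	return 0;
}

int main(void)
{
	struct inode i = { .i_ino = 42 };
	struct dentry d = { .d_inode = &i };
	struct file f = { .f_path = { .dentry = &d } };

	return toy_fsync(&f, 0);
}

Narrowing the signature removes a redundant parameter that every caller had to derive from the file anyway.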
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d97f9935a028..6526e6f21ecf 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -35,7 +35,7 @@
35#include "coda_int.h" 35#include "coda_int.h"
36 36
37/* VFS super_block ops */ 37/* VFS super_block ops */
38static void coda_clear_inode(struct inode *); 38static void coda_evict_inode(struct inode *);
39static void coda_put_super(struct super_block *); 39static void coda_put_super(struct super_block *);
40static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); 40static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
41 41
@@ -93,7 +93,7 @@ static const struct super_operations coda_super_operations =
93{ 93{
94 .alloc_inode = coda_alloc_inode, 94 .alloc_inode = coda_alloc_inode,
95 .destroy_inode = coda_destroy_inode, 95 .destroy_inode = coda_destroy_inode,
96 .clear_inode = coda_clear_inode, 96 .evict_inode = coda_evict_inode,
97 .put_super = coda_put_super, 97 .put_super = coda_put_super,
98 .statfs = coda_statfs, 98 .statfs = coda_statfs,
99 .remount_fs = coda_remount, 99 .remount_fs = coda_remount,
@@ -224,8 +224,10 @@ static void coda_put_super(struct super_block *sb)
224 printk("Coda: Bye bye.\n"); 224 printk("Coda: Bye bye.\n");
225} 225}
226 226
227static void coda_clear_inode(struct inode *inode) 227static void coda_evict_inode(struct inode *inode)
228{ 228{
229 truncate_inode_pages(&inode->i_data, 0);
230 end_writeback(inode);
229 coda_cache_clear_inode(inode); 231 coda_cache_clear_inode(inode);
230} 232}
231 233
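
Note on the coda inode.c hunks: renaming .clear_inode to .evict_inode also changes the contract, since the filesystem now performs the page-cache truncation and writeback termination itself, in that order, before dropping its own state. A userspace sketch of an ops table with that ordered teardown (stub functions and struct names are illustrative, not the VFS API):

#include <stdio.h>

struct inode { int dirty_pages; int private_state; };

/* Teardown must happen in order: drop cached pages first, then mark
 * writeback finished, and only then free fs-private state. */
static void truncate_pages(struct inode *inode) { inode->dirty_pages = 0; }
static void end_writeback_stub(struct inode *inode) { (void)inode; }
static void clear_private(struct inode *inode) { inode->private_state = 0; }

struct super_ops {
	void (*evict_inode)(struct inode *);
};

static void toy_evict_inode(struct inode *inode)
{
	truncate_pages(inode);
	end_writeback_stub(inode);
	clear_private(inode);
}

/* Designated initializers keep the table readable when a callback is
 * renamed, exactly like .clear_inode becoming .evict_inode above. */
static const struct super_ops ops = {
	.evict_inode = toy_evict_inode,
};

int main(void)
{
	struct inode i = { .dirty_pages = 3, .private_state = 1 };

	ops.evict_inode(&i);
	printf("pages=%d private=%d\n", i.dirty_pages, i.private_state);
	return 0;
}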
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
 /*
  * Pioctl operations for Coda.
  * Original version: (C) 1996 Peter Braam
  * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
  *
  * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
 
+#include <linux/smp_lock.h>
+
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
-static int coda_pioctl(struct inode * inode, struct file * filp,
-		       unsigned int cmd, unsigned long user_data);
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data);
 
 /* exported from this file */
-const struct inode_operations coda_ioctl_inode_operations =
-{
+const struct inode_operations coda_ioctl_inode_operations = {
 	.permission	= coda_ioctl_permission,
 	.setattr	= coda_setattr,
 };
 
 const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
-	.ioctl		= coda_pioctl,
+	.unlocked_ioctl	= coda_pioctl,
 };
 
 /* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
 	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
-static int coda_pioctl(struct inode * inode, struct file * filp,
-		       unsigned int cmd, unsigned long user_data)
+static long coda_pioctl(struct file *filp, unsigned int cmd,
+			unsigned long user_data)
 {
 	struct path path;
 	int error;
 	struct PioctlData data;
-	struct inode *target_inode = NULL;
-	struct coda_inode_info *cnp;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *target_inode = NULL;
+	struct coda_inode_info *cnp;
 
-	/* get the Pioctl data arguments from user space */
-	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-		return -EINVAL;
-	}
-
-	/*
-	 * Look up the pathname. Note that the pathname is in
-	 * user memory, and namei takes care of this
-	 */
-	if (data.follow) {
-		error = user_path(data.path, &path);
-	} else {
-		error = user_lpath(data.path, &path);
+	lock_kernel();
+
+	/* get the Pioctl data arguments from user space */
+	if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
+		error = -EINVAL;
+		goto out;
 	}
 
-	if ( error ) {
-		return error;
-	} else {
+	/*
+	 * Look up the pathname. Note that the pathname is in
+	 * user memory, and namei takes care of this
	 */
+	if (data.follow)
+		error = user_path(data.path, &path);
+	else
+		error = user_lpath(data.path, &path);
+
+	if (error)
+		goto out;
+	else
 		target_inode = path.dentry->d_inode;
-	}
-
+
 	/* return if it is not a Coda inode */
-	if ( target_inode->i_sb != inode->i_sb ) {
+	if (target_inode->i_sb != inode->i_sb) {
 		path_put(&path);
-		return -EINVAL;
+		error = -EINVAL;
+		goto out;
 	}
 
 	/* now proceed to make the upcall */
 	cnp = ITOC(target_inode);
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
 
 	path_put(&path);
-	return error;
-}
 
+out:
+	unlock_kernel();
+	return error;
+}
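
Note on the pioctl.c hunks: converting coda_pioctl() to .unlocked_ioctl means the handler takes the big kernel lock itself, so every error path has to funnel through a single unlock site; hence the goto out rewrite above. A small sketch of that single-exit shape, with a pthread mutex standing in for lock_kernel() (error values and names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

/* Single-exit error handling: every failure path jumps to "out" so
 * the lock taken at the top is always released, mirroring the
 * converted coda_pioctl above. */
static long toy_ioctl(unsigned int cmd, unsigned long arg)
{
	long error = 0;

	pthread_mutex_lock(&big_lock);

	if (cmd == 0) {
		error = -22;		/* stands in for -EINVAL */
		goto out;
	}
	if (arg == 0) {
		error = -14;		/* stands in for -EFAULT */
		goto out;
	}
	/* ... the actual work would go here ... */
out:
	pthread_mutex_unlock(&big_lock);
	return error;
}

int main(void)
{
	printf("bad cmd -> %ld\n", toy_ioctl(0, 1));
	printf("ok      -> %ld\n", toy_ioctl(1, 1));
	return 0;
}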
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..116af7546cf0 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp,
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -178,15 +177,15 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
 		nbytes = req->uc_outSize; /* don't have more space! */
 	}
 	if (copy_from_user(req->uc_data, buf, nbytes)) {
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 		retval = -EFAULT;
 		goto out;
 	}
 
 	/* adjust outsize. is this useful ?? */
 	req->uc_outSize = nbytes;
-	req->uc_flags |= REQ_WRITE;
+	req->uc_flags |= CODA_REQ_WRITE;
 	count = nbytes;
 
 	/* Convert filedescriptor into a file handle */
@@ -255,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 		retval = -EFAULT;
 
 	/* If request was not a signal, enqueue and don't free */
-	if (!(req->uc_flags & REQ_ASYNC)) {
-		req->uc_flags |= REQ_READ;
+	if (!(req->uc_flags & CODA_REQ_ASYNC)) {
+		req->uc_flags |= CODA_REQ_READ;
 		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
@@ -316,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 		list_del(&req->uc_chain);
 
 		/* Async requests need to be freed here */
-		if (req->uc_flags & REQ_ASYNC) {
+		if (req->uc_flags & CODA_REQ_ASYNC) {
 			CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr));
 			kfree(req);
 			continue;
 		}
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
 	list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) {
 		list_del(&req->uc_chain);
 
-		req->uc_flags |= REQ_ABORT;
+		req->uc_flags |= CODA_REQ_ABORT;
 		wake_up(&req->uc_sleep);
 	}
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
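
Note on the psdev.c hunks: apart from the unlocked_ioctl conversion, this is a mechanical rename of the upcall flags into a CODA_REQ_* namespace (the bare REQ_* names clash with the block layer's request flags). A self-contained illustration of the flag handling, with illustrative values:

#include <stdio.h>

/* Namespaced request flags; plain REQ_* names collided with other
 * kernel headers, hence the CODA_REQ_* prefix in the hunks above.
 * The values here are illustrative. */
#define CODA_REQ_ASYNC	0x1
#define CODA_REQ_READ	0x2
#define CODA_REQ_WRITE	0x4
#define CODA_REQ_ABORT	0x8

struct upc_req { unsigned int uc_flags; };

int main(void)
{
	struct upc_req req = { 0 };

	req.uc_flags |= CODA_REQ_READ;	/* daemon read the request */
	req.uc_flags |= CODA_REQ_WRITE;	/* daemon wrote the reply */

	if (req.uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
		printf("reply or abort seen (flags=0x%x)\n", req.uc_flags);
	return 0;
}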
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index f09c5ed76f6c..b8893ab6f9e6 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old)
604 (((r)->uc_opcode != CODA_CLOSE && \ 604 (((r)->uc_opcode != CODA_CLOSE && \
605 (r)->uc_opcode != CODA_STORE && \ 605 (r)->uc_opcode != CODA_STORE && \
606 (r)->uc_opcode != CODA_RELEASE) || \ 606 (r)->uc_opcode != CODA_RELEASE) || \
607 (r)->uc_flags & REQ_READ)) 607 (r)->uc_flags & CODA_REQ_READ))
608 608
609static inline void coda_waitfor_upcall(struct upc_req *req) 609static inline void coda_waitfor_upcall(struct upc_req *req)
610{ 610{
@@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req)
624 set_current_state(TASK_UNINTERRUPTIBLE); 624 set_current_state(TASK_UNINTERRUPTIBLE);
625 625
626 /* got a reply */ 626 /* got a reply */
627 if (req->uc_flags & (REQ_WRITE | REQ_ABORT)) 627 if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT))
628 break; 628 break;
629 629
630 if (blocked && time_after(jiffies, timeout) && 630 if (blocked && time_after(jiffies, timeout) &&
@@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp,
708 coda_waitfor_upcall(req); 708 coda_waitfor_upcall(req);
709 709
710 /* Op went through, interrupt or not... */ 710 /* Op went through, interrupt or not... */
711 if (req->uc_flags & REQ_WRITE) { 711 if (req->uc_flags & CODA_REQ_WRITE) {
712 out = (union outputArgs *)req->uc_data; 712 out = (union outputArgs *)req->uc_data;
713 /* here we map positive Venus errors to kernel errors */ 713 /* here we map positive Venus errors to kernel errors */
714 error = -out->oh.result; 714 error = -out->oh.result;
@@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp,
717 } 717 }
718 718
719 error = -EINTR; 719 error = -EINTR;
720 if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) { 720 if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
721 printk(KERN_WARNING "coda: Unexpected interruption.\n"); 721 printk(KERN_WARNING "coda: Unexpected interruption.\n");
722 goto exit; 722 goto exit;
723 } 723 }
724 724
725 /* Interrupted before venus read it. */ 725 /* Interrupted before venus read it. */
726 if (!(req->uc_flags & REQ_READ)) 726 if (!(req->uc_flags & CODA_REQ_READ))
727 goto exit; 727 goto exit;
728 728
729 /* Venus saw the upcall, make sure we can send interrupt signal */ 729 /* Venus saw the upcall, make sure we can send interrupt signal */
@@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp,
747 sig_inputArgs->ih.opcode = CODA_SIGNAL; 747 sig_inputArgs->ih.opcode = CODA_SIGNAL;
748 sig_inputArgs->ih.unique = req->uc_unique; 748 sig_inputArgs->ih.unique = req->uc_unique;
749 749
750 sig_req->uc_flags = REQ_ASYNC; 750 sig_req->uc_flags = CODA_REQ_ASYNC;
751 sig_req->uc_opcode = sig_inputArgs->ih.opcode; 751 sig_req->uc_opcode = sig_inputArgs->ih.opcode;
752 sig_req->uc_unique = sig_inputArgs->ih.unique; 752 sig_req->uc_unique = sig_inputArgs->ih.unique;
753 sig_req->uc_inSize = sizeof(struct coda_in_hdr); 753 sig_req->uc_inSize = sizeof(struct coda_in_hdr);
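
The REQ_* to CODA_REQ_* renames in these hunks track the header change that prefixed Coda's upcall flags so they no longer collide with the block layer's REQ_* request flags; only the names change, not the semantics. For orientation, the flag set looks roughly like this (values as believed defined in include/linux/coda_psdev.h; treat them as illustrative):

	#define CODA_REQ_ASYNC	0x1	/* fire-and-forget upcall, freed by psdev */
	#define CODA_REQ_READ	0x2	/* venus has read the request */
	#define CODA_REQ_WRITE	0x4	/* venus has written the reply */
	#define CODA_REQ_ABORT	0x8	/* request aborted, waiter must bail out */
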
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..0644a154672b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -8,13 +8,14 @@
8 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) 8 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
9 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) 9 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
10 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs 10 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
11 * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) 11 * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
12 * 12 *
13 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2 as 14 * it under the terms of the GNU General Public License version 2 as
15 * published by the Free Software Foundation. 15 * published by the Free Software Foundation.
16 */ 16 */
17 17
18#include <linux/stddef.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/linkage.h> 20#include <linux/linkage.h>
20#include <linux/compat.h> 21#include <linux/compat.h>
@@ -76,7 +77,8 @@ int compat_printk(const char *fmt, ...)
76 * Not all architectures have sys_utime, so implement this in terms 77 * Not all architectures have sys_utime, so implement this in terms
77 * of sys_utimes. 78 * of sys_utimes.
78 */ 79 */
79asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __user *t) 80asmlinkage long compat_sys_utime(const char __user *filename,
81 struct compat_utimbuf __user *t)
80{ 82{
81 struct timespec tv[2]; 83 struct timespec tv[2];
82 84
@@ -90,7 +92,7 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
90 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); 92 return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
91} 93}
92 94
93asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags) 95asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
94{ 96{
95 struct timespec tv[2]; 97 struct timespec tv[2];
96 98
@@ -105,7 +107,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
105 return do_utimes(dfd, filename, t ? tv : NULL, flags); 107 return do_utimes(dfd, filename, t ? tv : NULL, flags);
106} 108}
107 109
108asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) 110asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
109{ 111{
110 struct timespec tv[2]; 112 struct timespec tv[2];
111 113
@@ -124,7 +126,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, st
124 return do_utimes(dfd, filename, t ? tv : NULL, 0); 126 return do_utimes(dfd, filename, t ? tv : NULL, 0);
125} 127}
126 128
127asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval __user *t) 129asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
128{ 130{
129 return compat_sys_futimesat(AT_FDCWD, filename, t); 131 return compat_sys_futimesat(AT_FDCWD, filename, t);
130} 132}
@@ -168,7 +170,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
168 return err; 170 return err;
169} 171}
170 172
171asmlinkage long compat_sys_newstat(char __user * filename, 173asmlinkage long compat_sys_newstat(const char __user * filename,
172 struct compat_stat __user *statbuf) 174 struct compat_stat __user *statbuf)
173{ 175{
174 struct kstat stat; 176 struct kstat stat;
@@ -180,7 +182,7 @@ asmlinkage long compat_sys_newstat(char __user * filename,
180 return cp_compat_stat(&stat, statbuf); 182 return cp_compat_stat(&stat, statbuf);
181} 183}
182 184
183asmlinkage long compat_sys_newlstat(char __user * filename, 185asmlinkage long compat_sys_newlstat(const char __user * filename,
184 struct compat_stat __user *statbuf) 186 struct compat_stat __user *statbuf)
185{ 187{
186 struct kstat stat; 188 struct kstat stat;
@@ -193,7 +195,8 @@ asmlinkage long compat_sys_newlstat(char __user * filename,
193} 195}
194 196
195#ifndef __ARCH_WANT_STAT64 197#ifndef __ARCH_WANT_STAT64
196asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, 198asmlinkage long compat_sys_newfstatat(unsigned int dfd,
199 const char __user *filename,
197 struct compat_stat __user *statbuf, int flag) 200 struct compat_stat __user *statbuf, int flag)
198{ 201{
199 struct kstat stat; 202 struct kstat stat;
@@ -266,7 +269,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
266 error = user_path(pathname, &path); 269 error = user_path(pathname, &path);
267 if (!error) { 270 if (!error) {
268 struct kstatfs tmp; 271 struct kstatfs tmp;
269 error = vfs_statfs(path.dentry, &tmp); 272 error = vfs_statfs(&path, &tmp);
270 if (!error) 273 if (!error)
271 error = put_compat_statfs(buf, &tmp); 274 error = put_compat_statfs(buf, &tmp);
272 path_put(&path); 275 path_put(&path);
@@ -284,7 +287,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
284 file = fget(fd); 287 file = fget(fd);
285 if (!file) 288 if (!file)
286 goto out; 289 goto out;
287 error = vfs_statfs(file->f_path.dentry, &tmp); 290 error = vfs_statfs(&file->f_path, &tmp);
288 if (!error) 291 if (!error)
289 error = put_compat_statfs(buf, &tmp); 292 error = put_compat_statfs(buf, &tmp);
290 fput(file); 293 fput(file);
@@ -334,7 +337,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
334 error = user_path(pathname, &path); 337 error = user_path(pathname, &path);
335 if (!error) { 338 if (!error) {
336 struct kstatfs tmp; 339 struct kstatfs tmp;
337 error = vfs_statfs(path.dentry, &tmp); 340 error = vfs_statfs(&path, &tmp);
338 if (!error) 341 if (!error)
339 error = put_compat_statfs64(buf, &tmp); 342 error = put_compat_statfs64(buf, &tmp);
340 path_put(&path); 343 path_put(&path);
@@ -355,7 +358,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
355 file = fget(fd); 358 file = fget(fd);
356 if (!file) 359 if (!file)
357 goto out; 360 goto out;
358 error = vfs_statfs(file->f_path.dentry, &tmp); 361 error = vfs_statfs(&file->f_path, &tmp);
359 if (!error) 362 if (!error)
360 error = put_compat_statfs64(buf, &tmp); 363 error = put_compat_statfs64(buf, &tmp);
361 fput(file); 364 fput(file);
@@ -378,7 +381,7 @@ asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
378 sb = user_get_super(new_decode_dev(dev)); 381 sb = user_get_super(new_decode_dev(dev));
379 if (!sb) 382 if (!sb)
380 return -EINVAL; 383 return -EINVAL;
381 err = vfs_statfs(sb->s_root, &sbuf); 384 err = statfs_by_dentry(sb->s_root, &sbuf);
382 drop_super(sb); 385 drop_super(sb);
383 if (err) 386 if (err)
384 return err; 387 return err;
@@ -568,6 +571,79 @@ out:
568 return ret; 571 return ret;
569} 572}
570 573
574/* A write operation does a read from user space and vice versa */
575#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
576
577ssize_t compat_rw_copy_check_uvector(int type,
578 const struct compat_iovec __user *uvector, unsigned long nr_segs,
579 unsigned long fast_segs, struct iovec *fast_pointer,
580 struct iovec **ret_pointer)
581{
582 compat_ssize_t tot_len;
583 struct iovec *iov = *ret_pointer = fast_pointer;
584 ssize_t ret = 0;
585 int seg;
586
587 /*
588 * SuS says "The readv() function *may* fail if the iovcnt argument
589 * was less than or equal to 0, or greater than {IOV_MAX}." Linux has
590 * traditionally returned zero for zero segments, so...
591 */
592 if (nr_segs == 0)
593 goto out;
594
595 ret = -EINVAL;
596 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
597 goto out;
598 if (nr_segs > fast_segs) {
599 ret = -ENOMEM;
600 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
601 if (iov == NULL) {
602 *ret_pointer = fast_pointer;
603 goto out;
604 }
605 }
606 *ret_pointer = iov;
607
608 /*
609 * Single unix specification:
610 * We should return -EINVAL if an element length is < 0 or does not
611 * fit in an ssize_t, and the total length must also fit in an ssize_t.
612 *
613 * Be careful here because iov_len is a size_t not an ssize_t
614 */
615 tot_len = 0;
616 ret = -EINVAL;
617 for (seg = 0; seg < nr_segs; seg++) {
618 compat_ssize_t tmp = tot_len;
619 compat_uptr_t buf;
620 compat_ssize_t len;
621
622 if (__get_user(len, &uvector->iov_len) ||
623 __get_user(buf, &uvector->iov_base)) {
624 ret = -EFAULT;
625 goto out;
626 }
627 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
628 goto out;
629 tot_len += len;
630 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
631 goto out;
632 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
633 ret = -EFAULT;
634 goto out;
635 }
636 iov->iov_base = compat_ptr(buf);
637 iov->iov_len = (compat_size_t) len;
638 uvector++;
639 iov++;
640 }
641 ret = tot_len;
642
643out:
644 return ret;
645}
646
571static inline long 647static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 648copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 649{
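
compat_rw_copy_check_uvector() factors the 32-bit iovec import and validation out of compat_do_readv_writev() (see the deletions further down) so other compat paths can share it. A hedged usage sketch mirroring the call site later in this file; the wrapper name is hypothetical:

	static ssize_t example_import(const struct compat_iovec __user *uvector,
				      unsigned long nr_segs)
	{
		struct iovec iovstack[UIO_FASTIOV];
		struct iovec *iov = iovstack;
		ssize_t tot_len;

		tot_len = compat_rw_copy_check_uvector(READ, uvector, nr_segs,
						       UIO_FASTIOV, iovstack, &iov);
		if (tot_len > 0) {
			/* ... drive the I/O over iov[0..nr_segs) ... */
		}
		if (iov != iovstack)
			kfree(iov);	/* helper fell back to kmalloc() */
		return tot_len;
	}
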
@@ -600,7 +676,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 676 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 677 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 678 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 679 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 680 return ret;
605} 681}
606 682
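
The call change above routes compat io_submit through do_io_submit() with a compat flag (the trailing 1), letting fs/aio.c apply the 32-bit iovec conversion for vectored iocbs. The signature this assumes, per the matching fs/aio.c change in this merge:

	long do_io_submit(aio_context_t ctx_id, long nr,
			  struct iocb __user * __user *iocbpp, bool compat);
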
@@ -763,9 +839,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
763#define NCPFS_NAME "ncpfs" 839#define NCPFS_NAME "ncpfs"
764#define NFS4_NAME "nfs4" 840#define NFS4_NAME "nfs4"
765 841
766asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, 842asmlinkage long compat_sys_mount(const char __user * dev_name,
767 char __user * type, unsigned long flags, 843 const char __user * dir_name,
768 void __user * data) 844 const char __user * type, unsigned long flags,
845 const void __user * data)
769{ 846{
770 char *kernel_type; 847 char *kernel_type;
771 unsigned long data_page; 848 unsigned long data_page;
@@ -818,8 +895,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
818 return retval; 895 return retval;
819} 896}
820 897
821#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
822
823struct compat_old_linux_dirent { 898struct compat_old_linux_dirent {
824 compat_ulong_t d_ino; 899 compat_ulong_t d_ino;
825 compat_ulong_t d_offset; 900 compat_ulong_t d_offset;
@@ -908,7 +983,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
908 struct compat_linux_dirent __user * dirent; 983 struct compat_linux_dirent __user * dirent;
909 struct compat_getdents_callback *buf = __buf; 984 struct compat_getdents_callback *buf = __buf;
910 compat_ulong_t d_ino; 985 compat_ulong_t d_ino;
911 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t)); 986 int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
987 namlen + 2, sizeof(compat_long_t));
912 988
913 buf->error = -EINVAL; /* only used if we fail.. */ 989 buf->error = -EINVAL; /* only used if we fail.. */
914 if (reclen > buf->count) 990 if (reclen > buf->count)
@@ -995,8 +1071,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
995{ 1071{
996 struct linux_dirent64 __user *dirent; 1072 struct linux_dirent64 __user *dirent;
997 struct compat_getdents_callback64 *buf = __buf; 1073 struct compat_getdents_callback64 *buf = __buf;
998 int jj = NAME_OFFSET(dirent); 1074 int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
999 int reclen = ALIGN(jj + namlen + 1, sizeof(u64)); 1075 sizeof(u64));
1000 u64 off; 1076 u64 off;
1001 1077
1002 buf->error = -EINVAL; /* only used if we fail.. */ 1078 buf->error = -EINVAL; /* only used if we fail.. */
@@ -1077,70 +1153,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1153{
1078 compat_ssize_t tot_len; 1154 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1155 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1156 struct iovec *iov = iovstack;
1081 ssize_t ret; 1157 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1158 io_fn_t fn;
1084 iov_fn_t fnv; 1159 iov_fn_t fnv;
1085 1160
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1161 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1162 if (!file->f_op)
1103 goto out; 1163 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1164
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1165 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1166 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1167 goto out;
1113 1168
1114 /* 1169 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1170 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1171 if (tot_len == 0) {
1145 ret = 0; 1172 ret = 0;
1146 goto out; 1173 goto out;
@@ -1169,11 +1196,10 @@ out:
1169 if (iov != iovstack) 1196 if (iov != iovstack)
1170 kfree(iov); 1197 kfree(iov);
1171 if ((ret + (type == READ)) > 0) { 1198 if ((ret + (type == READ)) > 0) {
1172 struct dentry *dentry = file->f_path.dentry;
1173 if (type == READ) 1199 if (type == READ)
1174 fsnotify_access(dentry); 1200 fsnotify_access(file);
1175 else 1201 else
1176 fsnotify_modify(dentry); 1202 fsnotify_modify(file);
1177 } 1203 }
1178 return ret; 1204 return ret;
1179} 1205}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 641640dc7ae5..03e59aa318eb 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) 4 * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) 5 * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs 6 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
7 * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) 7 * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
8 * 8 *
9 * These routines maintain argument size conversion between 32bit and 64bit 9 * These routines maintain argument size conversion between 32bit and 64bit
10 * ioctls. 10 * ioctls.
@@ -131,23 +131,6 @@ static int w_long(unsigned int fd, unsigned int cmd,
131 return err; 131 return err;
132} 132}
133 133
134static int rw_long(unsigned int fd, unsigned int cmd,
135 compat_ulong_t __user *argp)
136{
137 mm_segment_t old_fs = get_fs();
138 int err;
139 unsigned long val;
140
141 if(get_user(val, argp))
142 return -EFAULT;
143 set_fs (KERNEL_DS);
144 err = sys_ioctl(fd, cmd, (unsigned long)&val);
145 set_fs (old_fs);
146 if (!err && put_user(val, argp))
147 return -EFAULT;
148 return err;
149}
150
151struct compat_video_event { 134struct compat_video_event {
152 int32_t type; 135 int32_t type;
153 compat_time_t timestamp; 136 compat_time_t timestamp;
@@ -594,15 +577,12 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
594 return err; 577 return err;
595} 578}
596 579
597static int ioc_settimeout(unsigned int fd, unsigned int cmd,
598 compat_ulong_t __user *argp)
599{
600 return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp);
601}
602
603/* Bluetooth ioctls */ 580/* Bluetooth ioctls */
604#define HCIUARTSETPROTO _IOW('U', 200, int) 581#define HCIUARTSETPROTO _IOW('U', 200, int)
605#define HCIUARTGETPROTO _IOR('U', 201, int) 582#define HCIUARTGETPROTO _IOR('U', 201, int)
583#define HCIUARTGETDEVICE _IOR('U', 202, int)
584#define HCIUARTSETFLAGS _IOW('U', 203, int)
585#define HCIUARTGETFLAGS _IOR('U', 204, int)
606 586
607#define BNEPCONNADD _IOW('B', 200, int) 587#define BNEPCONNADD _IOW('B', 200, int)
608#define BNEPCONNDEL _IOW('B', 201, int) 588#define BNEPCONNDEL _IOW('B', 201, int)
@@ -966,6 +946,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP)
966COMPATIBLE_IOCTL(TIOCGPTN) 946COMPATIBLE_IOCTL(TIOCGPTN)
967COMPATIBLE_IOCTL(TIOCSPTLCK) 947COMPATIBLE_IOCTL(TIOCSPTLCK)
968COMPATIBLE_IOCTL(TIOCSERGETLSR) 948COMPATIBLE_IOCTL(TIOCSERGETLSR)
949COMPATIBLE_IOCTL(TIOCSIG)
969#ifdef TCGETS2 950#ifdef TCGETS2
970COMPATIBLE_IOCTL(TCGETS2) 951COMPATIBLE_IOCTL(TCGETS2)
971COMPATIBLE_IOCTL(TCSETS2) 952COMPATIBLE_IOCTL(TCSETS2)
@@ -1281,13 +1262,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
1281COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) 1262COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
1282COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) 1263COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
1283COMPATIBLE_IOCTL(OSS_GETVERSION) 1264COMPATIBLE_IOCTL(OSS_GETVERSION)
1284/* AUTOFS */
1285COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC)
1286COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
1287COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
1288COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
1289COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
1290COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
1291/* Raw devices */ 1265/* Raw devices */
1292COMPATIBLE_IOCTL(RAW_SETBIND) 1266COMPATIBLE_IOCTL(RAW_SETBIND)
1293COMPATIBLE_IOCTL(RAW_GETBIND) 1267COMPATIBLE_IOCTL(RAW_GETBIND)
@@ -1328,6 +1302,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL)
1328COMPATIBLE_IOCTL(HCISETLINKMODE) 1302COMPATIBLE_IOCTL(HCISETLINKMODE)
1329COMPATIBLE_IOCTL(HCISETACLMTU) 1303COMPATIBLE_IOCTL(HCISETACLMTU)
1330COMPATIBLE_IOCTL(HCISETSCOMTU) 1304COMPATIBLE_IOCTL(HCISETSCOMTU)
1305COMPATIBLE_IOCTL(HCIBLOCKADDR)
1306COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
1331COMPATIBLE_IOCTL(HCIINQUIRY) 1307COMPATIBLE_IOCTL(HCIINQUIRY)
1332COMPATIBLE_IOCTL(HCIUARTSETPROTO) 1308COMPATIBLE_IOCTL(HCIUARTSETPROTO)
1333COMPATIBLE_IOCTL(HCIUARTGETPROTO) 1309COMPATIBLE_IOCTL(HCIUARTGETPROTO)
@@ -1552,9 +1528,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1552 case RAW_GETBIND: 1528 case RAW_GETBIND:
1553 return raw_ioctl(fd, cmd, argp); 1529 return raw_ioctl(fd, cmd, argp);
1554#endif 1530#endif
1555#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
1556 case AUTOFS_IOC_SETTIMEOUT32:
1557 return ioc_settimeout(fd, cmd, argp);
1558 /* One SMB ioctl needs translations. */ 1531 /* One SMB ioctl needs translations. */
1559#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) 1532#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
1560 case SMB_IOC_GETMOUNTUID_32: 1533 case SMB_IOC_GETMOUNTUID_32:
@@ -1609,9 +1582,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
1609 case KDSKBMETA: 1582 case KDSKBMETA:
1610 case KDSKBLED: 1583 case KDSKBLED:
1611 case KDSETLED: 1584 case KDSETLED:
1612 /* AUTOFS */
1613 case AUTOFS_IOC_READY:
1614 case AUTOFS_IOC_FAIL:
1615 /* NBD */ 1585 /* NBD */
1616 case NBD_SET_SOCK: 1586 case NBD_SET_SOCK:
1617 case NBD_SET_BLKSIZE: 1587 case NBD_SET_BLKSIZE:
@@ -1729,8 +1699,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
1729 goto out_fput; 1699 goto out_fput;
1730 } 1700 }
1731 1701
1732 if (!filp->f_op || 1702 if (!filp->f_op || !filp->f_op->unlocked_ioctl)
1733 (!filp->f_op->ioctl && !filp->f_op->unlocked_ioctl))
1734 goto do_ioctl; 1703 goto do_ioctl;
1735 break; 1704 break;
1736 } 1705 }
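
With file_operations::ioctl gone, compat_sys_ioctl() above only needs to probe ->unlocked_ioctl before falling back to the translation table; the AUTOFS entries removed from that table correspond to autofs providing its own ->compat_ioctl. Illustrative wiring for a driver whose ioctl commands are already 32/64-bit clean (names hypothetical):

	static const struct file_operations example_fops = {
		.owner		= THIS_MODULE,
		.unlocked_ioctl	= example_unlocked_ioctl,
	#ifdef CONFIG_COMPAT
		.compat_ioctl	= example_unlocked_ioctl,	/* no translation needed */
	#endif
	};
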
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 sd_iattr = sd->s_iattr;
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error)
83 return error;
84
85 if (!sd_iattr) { 76 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 77 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 78 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
94 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 85 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
95 sd->s_iattr = sd_iattr; 86 sd->s_iattr = sd_iattr;
96 } 87 }
97
98 /* attributes were changed at least once in the past */ 88
99 89
90 error = simple_setattr(dentry, iattr);
91 if (error)
92 return error;
93
100 if (ia_valid & ATTR_UID) 94 if (ia_valid & ATTR_UID)
101 sd_iattr->ia_uid = iattr->ia_uid; 95 sd_iattr->ia_uid = iattr->ia_uid;
102 if (ia_valid & ATTR_GID) 96 if (ia_valid & ATTR_GID)
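
The configfs_setattr() reordering replaces the open-coded inode_change_ok()/inode_setattr() pair with simple_setattr() and, more importantly, moves the attribute application after the sd_iattr allocation, so an allocation failure no longer leaves the in-core inode updated while the backing iattr is lost. The resulting shape, in outline:

	/* 1. allocate/prepare private state; failure here is side-effect free */
	/* 2. let the VFS check and apply the change to the in-core inode */
	error = simple_setattr(dentry, iattr);
	if (error)
		return error;
	/* 3. mirror the now-accepted change into sd->s_iattr */
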
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index dd3634e4c967..1e7a33028d33 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,66 +39,55 @@ static DEFINE_MUTEX(read_mutex);
39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) 39#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
40#define OFFSET(x) ((x)->i_ino) 40#define OFFSET(x) ((x)->i_ino)
41 41
42 42static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
43static int cramfs_iget5_test(struct inode *inode, void *opaque)
44{
45 struct cramfs_inode *cramfs_inode = opaque;
46 return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
47}
48
49static int cramfs_iget5_set(struct inode *inode, void *opaque)
50{ 43{
51 struct cramfs_inode *cramfs_inode = opaque; 44 static struct timespec zerotime;
52 inode->i_ino = CRAMINO(cramfs_inode); 45 inode->i_mode = cramfs_inode->mode;
53 return 0; 46 inode->i_uid = cramfs_inode->uid;
47 inode->i_size = cramfs_inode->size;
48 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
49 inode->i_gid = cramfs_inode->gid;
50 /* Struct copy intentional */
51 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
52 /* inode->i_nlink is left 1 - arguably wrong for directories,
53 but it's the best we can do without reading the directory
54 contents. 1 yields the right result in GNU find, even
55 without the -noleaf option. */
56 if (S_ISREG(inode->i_mode)) {
57 inode->i_fop = &generic_ro_fops;
58 inode->i_data.a_ops = &cramfs_aops;
59 } else if (S_ISDIR(inode->i_mode)) {
60 inode->i_op = &cramfs_dir_inode_operations;
61 inode->i_fop = &cramfs_directory_operations;
62 } else if (S_ISLNK(inode->i_mode)) {
63 inode->i_op = &page_symlink_inode_operations;
64 inode->i_data.a_ops = &cramfs_aops;
65 } else {
66 init_special_inode(inode, inode->i_mode,
67 old_decode_dev(cramfs_inode->size));
68 }
54} 69}
55 70
56static struct inode *get_cramfs_inode(struct super_block *sb, 71static struct inode *get_cramfs_inode(struct super_block *sb,
57 struct cramfs_inode * cramfs_inode) 72 struct cramfs_inode * cramfs_inode)
58{ 73{
59 struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), 74 struct inode *inode;
60 cramfs_iget5_test, cramfs_iget5_set, 75 if (CRAMINO(cramfs_inode) == 1) {
61 cramfs_inode); 76 inode = new_inode(sb);
62 static struct timespec zerotime; 77 if (inode) {
63 78 inode->i_ino = 1;
64 if (inode && (inode->i_state & I_NEW)) { 79 setup_inode(inode, cramfs_inode);
65 inode->i_mode = cramfs_inode->mode; 80 }
66 inode->i_uid = cramfs_inode->uid; 81 } else {
67 inode->i_size = cramfs_inode->size; 82 inode = iget_locked(sb, CRAMINO(cramfs_inode));
68 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; 83 if (inode && (inode->i_state & I_NEW)) {
69 inode->i_gid = cramfs_inode->gid; 84 setup_inode(inode, cramfs_inode);
70 /* Struct copy intentional */ 85 unlock_new_inode(inode);
71 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
72 /* inode->i_nlink is left 1 - arguably wrong for directories,
73 but it's the best we can do without reading the directory
74 contents. 1 yields the right result in GNU find, even
75 without -noleaf option. */
76 if (S_ISREG(inode->i_mode)) {
77 inode->i_fop = &generic_ro_fops;
78 inode->i_data.a_ops = &cramfs_aops;
79 } else if (S_ISDIR(inode->i_mode)) {
80 inode->i_op = &cramfs_dir_inode_operations;
81 inode->i_fop = &cramfs_directory_operations;
82 } else if (S_ISLNK(inode->i_mode)) {
83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops;
85 } else {
86 init_special_inode(inode, inode->i_mode,
87 old_decode_dev(cramfs_inode->size));
88 } 86 }
89 unlock_new_inode(inode);
90 } 87 }
91 return inode; 88 return inode;
92} 89}
93 90
94static void cramfs_drop_inode(struct inode *inode)
95{
96 if (inode->i_ino == 1)
97 generic_delete_inode(inode);
98 else
99 generic_drop_inode(inode);
100}
101
102/* 91/*
103 * We have our own block cache: don't fill up the buffer cache 92 * We have our own block cache: don't fill up the buffer cache
104 * with the rom-image, because the way the filesystem is set 93 * with the rom-image, because the way the filesystem is set
@@ -542,7 +531,6 @@ static const struct super_operations cramfs_ops = {
542 .put_super = cramfs_put_super, 531 .put_super = cramfs_put_super,
543 .remount_fs = cramfs_remount, 532 .remount_fs = cramfs_remount,
544 .statfs = cramfs_statfs, 533 .statfs = cramfs_statfs,
545 .drop_inode = cramfs_drop_inode,
546}; 534};
547 535
548static int cramfs_get_sb(struct file_system_type *fs_type, 536static int cramfs_get_sb(struct file_system_type *fs_type,
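
Why the inode-1 special case in get_cramfs_inode() above: CRAMINO() collapses every inode without data to ino 1, so distinct empty files would alias one hash entry. Keeping those inodes out of the hash via new_inode() avoids the aliasing and makes the deleted cramfs_drop_inode() hook unnecessary, since unhashed inodes are already destroyed on final iput(). The collision in miniature:

	/* as defined at the top of this file */
	#define CRAMINO(x) (((x)->offset && (x)->size) ? (x)->offset << 2 : 1)
	/* offset == 0 or size == 0  =>  ino 1 for *every* such inode */
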
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *p = NULL;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry(sb, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -589,16 +590,16 @@ restart:
589 up_read(&sb->s_umount); 590 up_read(&sb->s_umount);
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
593 if (p)
594 __put_super(p);
592 count -= pruned; 595 count -= pruned;
593 /* 596 p = sb;
594 * restart only when sb is no longer on the list and 597 /* more work left to do? */
595 * we have more work to do. 598 if (count <= 0)
596 */ 599 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 600 }
601 if (p)
602 __put_super(p);
602 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 604 spin_unlock(&dcache_lock);
604} 605}
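
The prune_dcache() rework drops the restart dance in favour of a defer-one-put iteration: keep the previous superblock pinned (s_count) until the cursor has advanced, then release it while still holding sb_lock, so list_for_each_entry() never walks a freed entry. A sketch of the pattern, as a fragment under the same locking assumptions:

	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;	/* skip dying superblocks */
		sb->s_count++;
		/* ... drop sb_lock, do per-sb work, retake sb_lock ... */
		if (p)
			__put_super(p);	/* safe: cursor already moved past p */
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
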
@@ -897,7 +898,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
897 * 898 *
898 * In this case we return -1 to tell the caller that we baled. 899 * In this case we return -1 to tell the caller that we baled.
899 */ 900 */
900static int shrink_dcache_memory(int nr, gfp_t gfp_mask) 901static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
901{ 902{
902 if (nr) { 903 if (nr) {
903 if (!(gfp_mask & __GFP_FS)) 904 if (!(gfp_mask & __GFP_FS))
@@ -1331,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
1331 * d_lookup - search for a dentry 1332 * d_lookup - search for a dentry
1332 * @parent: parent dentry 1333 * @parent: parent dentry
1333 * @name: qstr of name we wish to find 1334 * @name: qstr of name we wish to find
1335 * Returns: dentry, or NULL
1334 * 1336 *
1335 * Searches the children of the parent dentry for the name in question. If 1337 * d_lookup searches the children of the parent dentry for the name in
1336 * the dentry is found its reference count is incremented and the dentry 1338 * question. If the dentry is found its reference count is incremented and the
1337 * is returned. The caller must use dput to free the entry when it has 1339 * dentry is returned. The caller must use dput to free the entry when it has
1338 * finished using it. %NULL is returned on failure. 1340 * finished using it. %NULL is returned if the dentry does not exist.
1339 *
1340 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
1341 * Memory barriers are used while updating and doing lockless traversal.
1342 * To avoid races with d_move while rename is happening, d_lock is used.
1343 *
1344 * Overflows in memcmp(), while d_move, are avoided by keeping the length
1345 * and name pointer in one structure pointed by d_qstr.
1346 *
1347 * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
1348 * lookup is going on.
1349 *
1350 * The dentry unused LRU is not updated even if lookup finds the required dentry
1351 * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
1352 * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
1353 * acquisition.
1354 *
1355 * d_lookup() is protected against the concurrent renames in some unrelated
1356 * directory using the seqlockt_t rename_lock.
1357 */ 1341 */
1358
1359struct dentry * d_lookup(struct dentry * parent, struct qstr * name) 1342struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1360{ 1343{
1361 struct dentry * dentry = NULL; 1344 struct dentry * dentry = NULL;
@@ -1371,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
1371} 1354}
1372EXPORT_SYMBOL(d_lookup); 1355EXPORT_SYMBOL(d_lookup);
1373 1356
1357/*
1358 * __d_lookup - search for a dentry (racy)
1359 * @parent: parent dentry
1360 * @name: qstr of name we wish to find
1361 * Returns: dentry, or NULL
1362 *
1363 * __d_lookup is like d_lookup, however it may (rarely) return a
1364 * false-negative result due to unrelated rename activity.
1365 *
1366 * __d_lookup is slightly faster by avoiding the rename_lock read seqlock;
1367 * however, it must be used carefully, e.g. with a following d_lookup in
1368 * the case of failure.
1369 *
1370 * __d_lookup callers must be commented.
1371 */
1374struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) 1372struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1375{ 1373{
1376 unsigned int len = name->len; 1374 unsigned int len = name->len;
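
The comment added above spells out the contract; the canonical caller pattern is a fast racy probe followed by a seqlock-protected retry, roughly:

	dentry = __d_lookup(parent, name);	/* fast; may falsely miss */
	if (!dentry)
		dentry = d_lookup(parent, name);	/* rename-safe slow path */
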
@@ -1381,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1381 struct hlist_node *node; 1379 struct hlist_node *node;
1382 struct dentry *dentry; 1380 struct dentry *dentry;
1383 1381
1382 /*
1383 * The hash list is protected using RCU.
1384 *
1385 * Take d_lock when comparing a candidate dentry, to avoid races
1386 * with d_move().
1387 *
1388 * It is possible that concurrent renames can mess up our list
1389 * walk here and result in missing our dentry, resulting in the
1390 * false-negative result. d_lookup() protects against concurrent
1391 * renames using rename_lock seqlock.
1392 *
1393 * See Documentation/vfs/dcache-locking.txt for more details.
1394 */
1384 rcu_read_lock(); 1395 rcu_read_lock();
1385 1396
1386 hlist_for_each_entry_rcu(dentry, node, head, d_hash) { 1397 hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1395,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1395 1406
1396 /* 1407 /*
1397 * Recheck the dentry after taking the lock - d_move may have 1408 * Recheck the dentry after taking the lock - d_move may have
1398 * changed things. Don't bother checking the hash because we're 1409 * changed things. Don't bother checking the hash because
1399 * about to compare the whole name anyway. 1410 * we're about to compare the whole name anyway.
1400 */ 1411 */
1401 if (dentry->d_parent != parent) 1412 if (dentry->d_parent != parent)
1402 goto next; 1413 goto next;
@@ -1529,6 +1540,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1540 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1541 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1542 if (atomic_read(&dentry->d_count) == 1) {
1543 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1544 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1545 fsnotify_nameremove(dentry, isdir);
1534 return; 1546 return;
@@ -1903,48 +1915,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1903} 1915}
1904 1916
1905/** 1917/**
1906 * __d_path - return the path of a dentry 1918 * prepend_path - Prepend path string to a buffer
1919 *
1907 * @path: the dentry/vfsmount to report 1920 * @path: the dentry/vfsmount to report
1908 * @root: root vfsmnt/dentry (may be modified by this function) 1921 * @root: root vfsmnt/dentry (may be modified by this function)
1909 * @buffer: buffer to return value in 1922 * @buffer: pointer to the end of the buffer
1910 * @buflen: buffer length 1923 * @buflen: pointer to buffer length
1911 *
1912 * Convert a dentry into an ASCII path name. If the entry has been deleted
1913 * the string " (deleted)" is appended. Note that this is ambiguous.
1914 * 1924 *
1915 * Returns a pointer into the buffer or an error code if the 1925 * Caller holds the dcache_lock.
1916 * path was too long.
1917 *
1918 * "buflen" should be positive. Caller holds the dcache_lock.
1919 * 1926 *
1920 * If path is not reachable from the supplied root, then the value of 1927 * If path is not reachable from the supplied root, then the value of
1921 * root is changed (without modifying refcounts). 1928 * root is changed (without modifying refcounts).
1922 */ 1929 */
1923char *__d_path(const struct path *path, struct path *root, 1930static int prepend_path(const struct path *path, struct path *root,
1924 char *buffer, int buflen) 1931 char **buffer, int *buflen)
1925{ 1932{
1926 struct dentry *dentry = path->dentry; 1933 struct dentry *dentry = path->dentry;
1927 struct vfsmount *vfsmnt = path->mnt; 1934 struct vfsmount *vfsmnt = path->mnt;
1928 char *end = buffer + buflen; 1935 bool slash = false;
1929 char *retval; 1936 int error = 0;
1930
1931 spin_lock(&vfsmount_lock);
1932 prepend(&end, &buflen, "\0", 1);
1933 if (d_unlinked(dentry) &&
1934 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1935 goto Elong;
1936 1937
1937 if (buflen < 1) 1938 br_read_lock(vfsmount_lock);
1938 goto Elong; 1939 while (dentry != root->dentry || vfsmnt != root->mnt) {
1939 /* Get '/' right */
1940 retval = end-1;
1941 *retval = '/';
1942
1943 for (;;) {
1944 struct dentry * parent; 1940 struct dentry * parent;
1945 1941
1946 if (dentry == root->dentry && vfsmnt == root->mnt)
1947 break;
1948 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 1942 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
1949 /* Global root? */ 1943 /* Global root? */
1950 if (vfsmnt->mnt_parent == vfsmnt) { 1944 if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1956,28 +1950,88 @@ char *__d_path(const struct path *path, struct path *root,
1956 } 1950 }
1957 parent = dentry->d_parent; 1951 parent = dentry->d_parent;
1958 prefetch(parent); 1952 prefetch(parent);
1959 if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || 1953 error = prepend_name(buffer, buflen, &dentry->d_name);
1960 (prepend(&end, &buflen, "/", 1) != 0)) 1954 if (!error)
1961 goto Elong; 1955 error = prepend(buffer, buflen, "/", 1);
1962 retval = end; 1956 if (error)
1957 break;
1958
1959 slash = true;
1963 dentry = parent; 1960 dentry = parent;
1964 } 1961 }
1965 1962
1966out: 1963out:
1967 spin_unlock(&vfsmount_lock); 1964 if (!error && !slash)
1968 return retval; 1965 error = prepend(buffer, buflen, "/", 1);
1966
1967 br_read_unlock(vfsmount_lock);
1968 return error;
1969 1969
1970global_root: 1970global_root:
1971 retval += 1; /* hit the slash */ 1971 /*
1972 if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) 1972 * Filesystems needing to implement special "root names"
1973 goto Elong; 1973 * should do so with ->d_dname()
1974 */
1975 if (IS_ROOT(dentry) &&
1976 (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
1977 WARN(1, "Root dentry has weird name <%.*s>\n",
1978 (int) dentry->d_name.len, dentry->d_name.name);
1979 }
1974 root->mnt = vfsmnt; 1980 root->mnt = vfsmnt;
1975 root->dentry = dentry; 1981 root->dentry = dentry;
1976 goto out; 1982 goto out;
1983}
1977 1984
1978Elong: 1985/**
1979 retval = ERR_PTR(-ENAMETOOLONG); 1986 * __d_path - return the path of a dentry
1980 goto out; 1987 * @path: the dentry/vfsmount to report
1988 * @root: root vfsmnt/dentry (may be modified by this function)
1989 * @buf: buffer to return value in
1990 * @buflen: buffer length
1991 *
1992 * Convert a dentry into an ASCII path name.
1993 *
1994 * Returns a pointer into the buffer or an error code if the
1995 * path was too long.
1996 *
1997 * "buflen" should be positive. Caller holds the dcache_lock.
1998 *
1999 * If path is not reachable from the supplied root, then the value of
2000 * root is changed (without modifying refcounts).
2001 */
2002char *__d_path(const struct path *path, struct path *root,
2003 char *buf, int buflen)
2004{
2005 char *res = buf + buflen;
2006 int error;
2007
2008 prepend(&res, &buflen, "\0", 1);
2009 error = prepend_path(path, root, &res, &buflen);
2010 if (error)
2011 return ERR_PTR(error);
2012
2013 return res;
2014}
2015
2016/*
2017 * same as __d_path but appends "(deleted)" for unlinked files.
2018 */
2019static int path_with_deleted(const struct path *path, struct path *root,
2020 char **buf, int *buflen)
2021{
2022 prepend(buf, buflen, "\0", 1);
2023 if (d_unlinked(path->dentry)) {
2024 int error = prepend(buf, buflen, " (deleted)", 10);
2025 if (error)
2026 return error;
2027 }
2028
2029 return prepend_path(path, root, buf, buflen);
2030}
2031
2032static int prepend_unreachable(char **buffer, int *buflen)
2033{
2034 return prepend(buffer, buflen, "(unreachable)", 13);
1981} 2035}
1982 2036
1983/** 2037/**
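
All of these helpers share one convention: the buffer fills from the end backwards, with *buffer and *buflen stepping left as each component is prepended, which is why callers pass buf + buflen and read the result out of the updated pointer. For reference, the prepend() primitive they build on is essentially this (reconstructed from the unchanged code earlier in this file):

	static int prepend(char **buffer, int *buflen, const char *str, int namelen)
	{
		*buflen -= namelen;
		if (*buflen < 0)
			return -ENAMETOOLONG;
		*buffer -= namelen;
		memcpy(*buffer, str, namelen);
		return 0;
	}
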
@@ -1998,9 +2052,10 @@ Elong:
1998 */ 2052 */
1999char *d_path(const struct path *path, char *buf, int buflen) 2053char *d_path(const struct path *path, char *buf, int buflen)
2000{ 2054{
2001 char *res; 2055 char *res = buf + buflen;
2002 struct path root; 2056 struct path root;
2003 struct path tmp; 2057 struct path tmp;
2058 int error;
2004 2059
2005 /* 2060 /*
2006 * We have various synthetic filesystems that never get mounted. On 2061 * We have various synthetic filesystems that never get mounted. On
@@ -2012,19 +2067,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
2012 if (path->dentry->d_op && path->dentry->d_op->d_dname) 2067 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2013 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 2068 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2014 2069
2015 read_lock(&current->fs->lock); 2070 get_fs_root(current->fs, &root);
2016 root = current->fs->root;
2017 path_get(&root);
2018 read_unlock(&current->fs->lock);
2019 spin_lock(&dcache_lock); 2071 spin_lock(&dcache_lock);
2020 tmp = root; 2072 tmp = root;
2021 res = __d_path(path, &tmp, buf, buflen); 2073 error = path_with_deleted(path, &tmp, &res, &buflen);
2074 if (error)
2075 res = ERR_PTR(error);
2022 spin_unlock(&dcache_lock); 2076 spin_unlock(&dcache_lock);
2023 path_put(&root); 2077 path_put(&root);
2024 return res; 2078 return res;
2025} 2079}
2026EXPORT_SYMBOL(d_path); 2080EXPORT_SYMBOL(d_path);
2027 2081
2082/**
2083 * d_path_with_unreachable - return the path of a dentry
2084 * @path: path to report
2085 * @buf: buffer to return value in
2086 * @buflen: buffer length
2087 *
2088 * The difference from d_path() is that this prepends "(unreachable)"
2089 * to paths which are unreachable from the current process' root.
2090 */
2091char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
2092{
2093 char *res = buf + buflen;
2094 struct path root;
2095 struct path tmp;
2096 int error;
2097
2098 if (path->dentry->d_op && path->dentry->d_op->d_dname)
2099 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
2100
2101 get_fs_root(current->fs, &root);
2102 spin_lock(&dcache_lock);
2103 tmp = root;
2104 error = path_with_deleted(path, &tmp, &res, &buflen);
2105 if (!error && !path_equal(&tmp, &root))
2106 error = prepend_unreachable(&res, &buflen);
2107 spin_unlock(&dcache_lock);
2108 path_put(&root);
2109 if (error)
2110 res = ERR_PTR(error);
2111
2112 return res;
2113}
2114
2028/* 2115/*
2029 * Helper function for dentry_operations.d_dname() members 2116 * Helper function for dentry_operations.d_dname() members
2030 */ 2117 */
@@ -2049,16 +2136,12 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
2049/* 2136/*
2050 * Write full pathname from the root of the filesystem into the buffer. 2137 * Write full pathname from the root of the filesystem into the buffer.
2051 */ 2138 */
2052char *dentry_path(struct dentry *dentry, char *buf, int buflen) 2139char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
2053{ 2140{
2054 char *end = buf + buflen; 2141 char *end = buf + buflen;
2055 char *retval; 2142 char *retval;
2056 2143
2057 spin_lock(&dcache_lock);
2058 prepend(&end, &buflen, "\0", 1); 2144 prepend(&end, &buflen, "\0", 1);
2059 if (d_unlinked(dentry) &&
2060 (prepend(&end, &buflen, "//deleted", 9) != 0))
2061 goto Elong;
2062 if (buflen < 1) 2145 if (buflen < 1)
2063 goto Elong; 2146 goto Elong;
2064 /* Get '/' right */ 2147 /* Get '/' right */
@@ -2076,7 +2159,28 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2076 retval = end; 2159 retval = end;
2077 dentry = parent; 2160 dentry = parent;
2078 } 2161 }
2162 return retval;
2163Elong:
2164 return ERR_PTR(-ENAMETOOLONG);
2165}
2166EXPORT_SYMBOL(__dentry_path);
2167
2168char *dentry_path(struct dentry *dentry, char *buf, int buflen)
2169{
2170 char *p = NULL;
2171 char *retval;
2172
2173 spin_lock(&dcache_lock);
2174 if (d_unlinked(dentry)) {
2175 p = buf + buflen;
2176 if (prepend(&p, &buflen, "//deleted", 10) != 0)
2177 goto Elong;
2178 buflen++;
2179 }
2180 retval = __dentry_path(dentry, buf, buflen);
2079 spin_unlock(&dcache_lock); 2181 spin_unlock(&dcache_lock);
2182 if (!IS_ERR(retval) && p)
2183 *p = '/'; /* restore '/' overwritten with '\0' */
2080 return retval; 2184 return retval;
2081Elong: 2185Elong:
2082 spin_unlock(&dcache_lock); 2186 spin_unlock(&dcache_lock);
@@ -2110,27 +2214,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2110 if (!page) 2214 if (!page)
2111 return -ENOMEM; 2215 return -ENOMEM;
2112 2216
2113 read_lock(&current->fs->lock); 2217 get_fs_root_and_pwd(current->fs, &root, &pwd);
2114 pwd = current->fs->pwd;
2115 path_get(&pwd);
2116 root = current->fs->root;
2117 path_get(&root);
2118 read_unlock(&current->fs->lock);
2119 2218
2120 error = -ENOENT; 2219 error = -ENOENT;
2121 spin_lock(&dcache_lock); 2220 spin_lock(&dcache_lock);
2122 if (!d_unlinked(pwd.dentry)) { 2221 if (!d_unlinked(pwd.dentry)) {
2123 unsigned long len; 2222 unsigned long len;
2124 struct path tmp = root; 2223 struct path tmp = root;
2125 char * cwd; 2224 char *cwd = page + PAGE_SIZE;
2225 int buflen = PAGE_SIZE;
2126 2226
2127 cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); 2227 prepend(&cwd, &buflen, "\0", 1);
2228 error = prepend_path(&pwd, &tmp, &cwd, &buflen);
2128 spin_unlock(&dcache_lock); 2229 spin_unlock(&dcache_lock);
2129 2230
2130 error = PTR_ERR(cwd); 2231 if (error)
2131 if (IS_ERR(cwd))
2132 goto out; 2232 goto out;
2133 2233
2234 /* Unreachable from current root */
2235 if (!path_equal(&tmp, &root)) {
2236 error = prepend_unreachable(&cwd, &buflen);
2237 if (error)
2238 goto out;
2239 }
2240
2134 error = -ERANGE; 2241 error = -ERANGE;
2135 len = PAGE_SIZE + page - cwd; 2242 len = PAGE_SIZE + page - cwd;
2136 if (len <= size) { 2243 if (len <= size) {
@@ -2195,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
2195 struct vfsmount *mnt = path1->mnt; 2302 struct vfsmount *mnt = path1->mnt;
2196 struct dentry *dentry = path1->dentry; 2303 struct dentry *dentry = path1->dentry;
2197 int res; 2304 int res;
2198 spin_lock(&vfsmount_lock); 2305
2306 br_read_lock(vfsmount_lock);
2199 if (mnt != path2->mnt) { 2307 if (mnt != path2->mnt) {
2200 for (;;) { 2308 for (;;) {
2201 if (mnt->mnt_parent == mnt) { 2309 if (mnt->mnt_parent == mnt) {
2202 spin_unlock(&vfsmount_lock); 2310 br_read_unlock(vfsmount_lock);
2203 return 0; 2311 return 0;
2204 } 2312 }
2205 if (mnt->mnt_parent == path2->mnt) 2313 if (mnt->mnt_parent == path2->mnt)
@@ -2209,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
2209 dentry = mnt->mnt_mountpoint; 2317 dentry = mnt->mnt_mountpoint;
2210 } 2318 }
2211 res = is_subdir(dentry, path2->dentry); 2319 res = is_subdir(dentry, path2->dentry);
2212 spin_unlock(&vfsmount_lock); 2320 br_read_unlock(vfsmount_lock);
2213 return res; 2321 return res;
2214} 2322}
2215EXPORT_SYMBOL(path_is_under); 2323EXPORT_SYMBOL(path_is_under);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
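
Using the new debugfs_create_x64() mirrors its 8/16/32-bit siblings; a minimal sketch with hypothetical names:

	#include <linux/debugfs.h>

	static u64 example_stat;

	static void example_debugfs_init(struct dentry *parent)
	{
		/* reads/writes example_stat as 0x%016llx */
		debugfs_create_x64("stat", 0644, parent, &example_stat);
	}
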
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
384 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
385 } 385 }
386 386
387 simple_set_mnt(mnt, s);
388
389 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
390 388
391 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
392 if (error) 390 if (error)
393 goto out_dput; 391 goto out_undo_sget;
394 392
395 return 0; 393 simple_set_mnt(mnt, s);
396 394
397out_dput: 395 return 0;
398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
399 396
400out_undo_sget: 397out_undo_sget:
401 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..48d74c7391d1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -215,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio)
215 * filesystems can use it to hold additional state between get_block calls and 218 * filesystems can use it to hold additional state between get_block calls and
216 * dio_complete. 219 * dio_complete.
217 */ 220 */
218static int dio_complete(struct dio *dio, loff_t offset, int ret) 221static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
219{ 222{
220 ssize_t transferred = 0; 223 ssize_t transferred = 0;
221 224
@@ -236,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
236 transferred = dio->i_size - offset; 239 transferred = dio->i_size - offset;
237 } 240 }
238 241
239 if (dio->end_io && dio->result)
240 dio->end_io(dio->iocb, offset, transferred,
241 dio->map_bh.b_private);
242
243 if (dio->flags & DIO_LOCKING)
244 /* lockdep: non-owner release */
245 up_read_non_owner(&dio->inode->i_alloc_sem);
246
247 if (ret == 0) 242 if (ret == 0)
248 ret = dio->page_errors; 243 ret = dio->page_errors;
249 if (ret == 0) 244 if (ret == 0)
@@ -251,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
251 if (ret == 0) 246 if (ret == 0)
252 ret = transferred; 247 ret = transferred;
253 248
249 if (dio->end_io && dio->result) {
250 dio->end_io(dio->iocb, offset, transferred,
251 dio->map_bh.b_private, ret, is_async);
252 } else if (is_async) {
253 aio_complete(dio->iocb, ret, 0);
254 }
255
256 if (dio->flags & DIO_LOCKING)
257 /* lockdep: non-owner release */
258 up_read_non_owner(&dio->inode->i_alloc_sem);
259
254 return ret; 260 return ret;
255} 261}
256 262
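
dio_complete() now hands ->end_io the final return code plus an is_async flag, and the callback, not generic code, completes async iocbs; dio_bio_end_aio() below loses its aio_complete() accordingly. The callback shape this implies (body illustrative):

	static void example_end_io(struct kiocb *iocb, loff_t offset,
				   ssize_t size, void *private,
				   int ret, bool is_async)
	{
		/* ... fs work, e.g. convert extents in [offset, offset + size) ... */
		if (is_async)
			aio_complete(iocb, ret, 0);
	}
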
@@ -274,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
274 spin_unlock_irqrestore(&dio->bio_lock, flags); 280 spin_unlock_irqrestore(&dio->bio_lock, flags);
275 281
276 if (remaining == 0) { 282 if (remaining == 0) {
277 int ret = dio_complete(dio, dio->iocb->ki_pos, 0); 283 dio_complete(dio, dio->iocb->ki_pos, 0, true);
278 aio_complete(dio->iocb, ret, 0);
279 kfree(dio); 284 kfree(dio);
280 } 285 }
281} 286}
@@ -300,6 +305,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 305 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 306}
302 307
308/**
309 * dio_end_io - handle the end io action for the given bio
310 * @bio: The direct io bio that's being completed
311 * @error: Error if there was one
312 *
313 * This is meant to be called by any filesystem that uses its own dio_submit_t
314 * so that the DIO specific endio actions are dealt with after the filesystem
315 * has done its completion work.
316 */
317void dio_end_io(struct bio *bio, int error)
318{
319 struct dio *dio = bio->bi_private;
320
321 if (dio->is_async)
322 dio_bio_end_aio(bio, error);
323 else
324 dio_bio_end_io(bio, error);
325}
326EXPORT_SYMBOL_GPL(dio_end_io);
327
303static int 328static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 329dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 330 sector_t first_sector, int nr_vecs)
@@ -316,6 +341,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 341 bio->bi_end_io = dio_bio_end_io;
317 342
318 dio->bio = bio; 343 dio->bio = bio;
344 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 345 return 0;
320} 346}
321 347
@@ -340,10 +366,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 366 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 367 bio_set_pages_dirty(bio);
342 368
343 submit_bio(dio->rw, bio); 369 if (dio->submit_io)
370 dio->submit_io(dio->rw, bio, dio->inode,
371 dio->logical_offset_in_bio);
372 else
373 submit_bio(dio->rw, bio);
344 374
345 dio->bio = NULL; 375 dio->bio = NULL;
346 dio->boundary = 0; 376 dio->boundary = 0;
377 dio->logical_offset_in_bio = 0;
347} 378}
348 379
349/* 380/*
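
A filesystem that sets the new submit_io hook takes over bio submission but
must still hand each bio back to the generic completion path via
dio_end_io(). A minimal sketch of such a hook, with hypothetical my_fs_*
names that are not part of this commit:

	/* Matches the new dio_submit_t signature. */
	static void my_fs_submit_io(int rw, struct bio *bio,
				    struct inode *inode, loff_t file_offset)
	{
		/*
		 * bio->bi_private still points at the struct dio set up by
		 * dio_bio_alloc(). A filesystem that overrides bi_end_io
		 * must eventually call dio_end_io(bio, error) so that
		 * dio_bio_end_aio()/dio_bio_end_io() still run.
		 */
		/* ... remap the bio for (inode, file_offset) as needed ... */
		submit_bio(rw, bio);
	}
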
@@ -603,16 +634,32 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 634 int ret = 0;
604 635
605 if (dio->bio) { 636 if (dio->bio) {
637 loff_t cur_offset = dio->cur_page_fs_offset;
638 loff_t bio_next_offset = dio->logical_offset_in_bio +
639 dio->bio->bi_size;
640
606 /* 641 /*
607 * See whether this new request is contiguous with the old 642 * See whether this new request is contiguous with the old.
643 *
 644 * Btrfs cannot handle having logically non-contiguous requests
 645 * submitted. For example, if you have
646 *
647 * Logical: [0-4095][HOLE][8192-12287]
 648 * Physical: [0-4095] [4096-8191]
649 *
650 * We cannot submit those pages together as one BIO. So if our
651 * current logical offset in the file does not equal what would
652 * be the next logical offset in the bio, submit the bio we
653 * have.
608 */ 654 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 655 if (dio->final_block_in_bio != dio->cur_page_block ||
656 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 657 dio_bio_submit(dio);
611 /* 658 /*
612 * Submit now if the underlying fs is about to perform a 659 * Submit now if the underlying fs is about to perform a
613 * metadata read 660 * metadata read
614 */ 661 */
615 if (dio->boundary) 662 else if (dio->boundary)
616 dio_bio_submit(dio); 663 dio_bio_submit(dio);
617 } 664 }
618 665
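
A worked instance of the new test for the hole example above, assuming
512-byte blocks (blkbits = 9); the concrete values are illustrative and not
part of the commit:

	/* After the [0-4095] page was added to an empty bio at offset 0: */
	loff_t logical_offset_in_bio = 0;
	unsigned int bi_size = 4096;			/* one page in the bio */
	loff_t bio_next_offset = logical_offset_in_bio + bi_size;  /* 4096 */
	loff_t cur_offset = 8192;			/* first page after the hole */
	sector_t final_block_in_bio = 4096 >> 9;	/* next physical block: 8 */
	sector_t cur_page_block = 4096 >> 9;		/* also physical block 8 */

	/* The physical test alone would merge; the logical test submits: */
	if (final_block_in_bio != cur_page_block ||
	    cur_offset != bio_next_offset)
		dio_bio_submit(dio);			/* taken: 8192 != 4096 */
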
@@ -701,6 +748,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 748 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 749 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 750 dio->cur_page_block = blocknr;
751 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 752out:
705 return ret; 753 return ret;
706} 754}
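
cur_page_fs_offset records the deferred page's file position in bytes; the
shift is the usual block-to-byte conversion. With illustrative values:

	/* blkbits = 12 (4096-byte blocks), block_in_file = 3: */
	loff_t cur_page_fs_offset = (loff_t)3 << 12;	/* 12288, byte offset of block 3 */
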
@@ -935,7 +983,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 983direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 984 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 985 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 986 dio_submit_t submit_io, struct dio *dio)
939{ 987{
940 unsigned long user_addr; 988 unsigned long user_addr;
941 unsigned long flags; 989 unsigned long flags;
@@ -952,6 +1000,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 1000
953 dio->get_block = get_block; 1001 dio->get_block = get_block;
954 dio->end_io = end_io; 1002 dio->end_io = end_io;
1003 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1004 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1005 dio->next_block_for_io = -1;
957 1006
@@ -1008,7 +1057,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1057 }
1009 } /* end iovec loop */ 1058 } /* end iovec loop */
1010 1059
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1060 if (ret == -ENOTBLK) {
1012 /* 1061 /*
1013 * The remaining part of the request will be 1062 * The remaining part of the request will be
1014 * handled by buffered I/O when we return 1063 * handled by buffered I/O when we return
@@ -1079,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1079 spin_unlock_irqrestore(&dio->bio_lock, flags); 1128 spin_unlock_irqrestore(&dio->bio_lock, flags);
1080 1129
1081 if (ret2 == 0) { 1130 if (ret2 == 0) {
1082 ret = dio_complete(dio, offset, ret); 1131 ret = dio_complete(dio, offset, ret, false);
1083 kfree(dio); 1132 kfree(dio);
1084 } else 1133 } else
1085 BUG_ON(ret != -EIOCBQUEUED); 1134 BUG_ON(ret != -EIOCBQUEUED);
@@ -1110,7 +1159,7 @@ ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1159__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1160 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1161 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1162 dio_submit_t submit_io, int flags)
1114{ 1163{
1115 int seg; 1164 int seg;
1116 size_t size; 1165 size_t size;
@@ -1197,22 +1246,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1246 (end > i_size_read(inode)));
1198 1247
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1248 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1249 nr_segs, blkbits, get_block, end_io,
1201 1250 submit_io, dio);
1202 /*
1203 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1205 *
1206 * NOTE: filesystems with their own locking have to handle this
1207 * on their own.
1208 */
1209 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode);
1212 if (end > isize)
1213 vmtruncate(inode, isize);
1214 }
1215 }
1216 1251
1217out: 1252out:
1218 return retval; 1253 return retval;
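
Every caller of __blockdev_direct_IO() grows the extra dio_submit_t
argument; a filesystem that wants the stock submission path simply passes
NULL. A sketch of a typical call site after this change (my_get_block is a
placeholder):

	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
				   iov, offset, nr_segs, my_get_block,
				   NULL,	/* end_io */
				   NULL,	/* submit_io: plain submit_bio() */
				   DIO_LOCKING | DIO_SKIP_HOLES);
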
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 17903b491298..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
733 if (lkb->lkb_rqmode < mode) 733 if (lkb->lkb_rqmode < mode)
734 break; 734 break;
735 735
736 if (!lkb) 736 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
737 list_add_tail(new, head);
738 else
739 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
740} 737}
741 738
742/* add/remove lkb to rsb's grant/convert/wait queue */ 739/* add/remove lkb to rsb's grant/convert/wait queue */
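
The deleted !lkb branch was dead code: when list_for_each_entry() runs off
the end of the list, the cursor's embedded list_head is the list head
itself, so the remaining __list_add() degenerates into list_add_tail().
In sketch form (illustration only):

	/* After the loop terminates without hitting the break: */
	/*	&lkb->lkb_statequeue == head                       */
	__list_add(new, head->prev, head);	/* == list_add_tail(new, head) */
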
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c620526..37a34c2c622a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
248 248
249 for (i = 0 ; i < CONN_HASH_SIZE; i++) { 249 for (i = 0 ; i < CONN_HASH_SIZE; i++) {
250 hlist_for_each_entry(con, h, &connection_hash[i], list) { 250 hlist_for_each_entry(con, h, &connection_hash[i], list) {
251 if (con && con->sctp_assoc == assoc_id) { 251 if (con->sctp_assoc == assoc_id) {
252 mutex_unlock(&connections_lock); 252 mutex_unlock(&connections_lock);
253 return con; 253 return con;
254 } 254 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100d..ef17e0169da1 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
81 81
82int __init dlm_netlink_init(void) 82int __init dlm_netlink_init(void)
83{ 83{
84 int rv; 84 return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
85
86 rv = genl_register_family(&family);
87 if (rv)
88 return rv;
89
90 rv = genl_register_ops(&family, &dlm_nl_ops);
91 if (rv < 0)
92 goto err;
93 return 0;
94 err:
95 genl_unregister_family(&family);
96 return rv;
97} 85}
98 86
99void dlm_netlink_exit(void) 87void dlm_netlink_exit(void)
100{ 88{
101 genl_unregister_ops(&family, &dlm_nl_ops);
102 genl_unregister_family(&family); 89 genl_unregister_family(&family);
103} 90}
104 91
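
genl_register_family_with_ops() bundles the register-family/register-ops
pair, including the unwind on failure, and genl_unregister_family() already
drops any attached ops. The helper behaves roughly like this (a sketch, not
the exact net/netlink/genetlink.c source):

	int genl_register_family_with_ops(struct genl_family *family,
					  struct genl_ops *ops, size_t n_ops)
	{
		int err, i;

		err = genl_register_family(family);
		if (err)
			return err;
		for (i = 0; i < n_ops; i++) {
			err = genl_register_ops(family, &ops[i]);
			if (err)
				goto err_out;
		}
		return 0;
	err_out:
		genl_unregister_family(family);
		return err;
	}
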
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 8b6e73c47435..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
215 if (!ast_type) { 215 if (!ast_type) {
216 kref_get(&lkb->lkb_ref); 216 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 217 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
218 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
219 } 220 }
220 if (type == AST_COMP && (ast_type & AST_COMP)) 221 if (type == AST_COMP && (ast_type & AST_COMP))
@@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
223 224
224 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type); 225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
225 if (eol) { 226 if (eol) {
226 lkb->lkb_ast_type &= ~AST_BAST;
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; 227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 } 228 }
229 229
@@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
706} 706}
707 707
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
709 int bmode, char __user *buf, size_t count) 709 int mode, char __user *buf, size_t count)
710{ 710{
711#ifdef CONFIG_COMPAT 711#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 712 struct dlm_lock_result32 result32;
@@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
733 if (type == AST_BAST) { 733 if (type == AST_BAST) {
734 result.user_astaddr = ua->bastaddr; 734 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 735 result.user_astparam = ua->bastparam;
736 result.bast_mode = bmode; 736 result.bast_mode = mode;
737 } else { 737 } else {
738 result.user_astaddr = ua->castaddr; 738 result.user_astaddr = ua->castaddr;
739 result.user_astparam = ua->castparam; 739 result.user_astparam = ua->castparam;
@@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 801 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 802 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 803 DECLARE_WAITQUEUE(wait, current);
804 int error, type=0, bmode=0, removed = 0; 804 int error = 0, removed;
805 int ret_type, ret_mode;
806 int bastmode, castmode, do_bast, do_cast;
805 807
806 if (count == sizeof(struct dlm_device_version)) { 808 if (count == sizeof(struct dlm_device_version)) {
807 error = copy_version_to_user(buf, count); 809 error = copy_version_to_user(buf, count);
@@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
820#endif 822#endif
821 return -EINVAL; 823 return -EINVAL;
822 824
825 try_another:
826
823 /* do we really need this? can a read happen after a close? */ 827 /* do we really need this? can a read happen after a close? */
824 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) 828 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
825 return -EINVAL; 829 return -EINVAL;
@@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
855 859
856 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
857 861
858 if (lkb->lkb_ast_type & AST_COMP) { 862 removed = 0;
859 lkb->lkb_ast_type &= ~AST_COMP; 863 ret_type = 0;
860 type = AST_COMP; 864 ret_mode = 0;
861 } else if (lkb->lkb_ast_type & AST_BAST) { 865 do_bast = lkb->lkb_ast_type & AST_BAST;
862 lkb->lkb_ast_type &= ~AST_BAST; 866 do_cast = lkb->lkb_ast_type & AST_COMP;
863 type = AST_BAST; 867 bastmode = lkb->lkb_bastmode;
864 bmode = lkb->lkb_bastmode; 868 castmode = lkb->lkb_castmode;
869
 870 /* when both are queued, figure out which to do first and
871 switch first so the other goes in the next read */
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 }
891
892 /* if we're doing a bast but the bast is unnecessary, then
893 switch to do nothing or do a cast if that was needed next */
894
895 if ((ret_type == AST_BAST) &&
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 }
907
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
909 log_print("device_read %x ast_first %x ast_type %x",
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
865 } 911 }
866 912
867 if (!lkb->lkb_ast_type) { 913 if (!lkb->lkb_ast_type) {
@@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
870 } 916 }
871 spin_unlock(&proc->asts_spin); 917 spin_unlock(&proc->asts_spin);
872 918
873 error = copy_result_to_user(lkb->lkb_ua, 919 if (ret_type) {
874 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 920 error = copy_result_to_user(lkb->lkb_ua,
875 type, bmode, buf, count); 921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
922 ret_type, ret_mode, buf, count);
923
924 if (ret_type == AST_COMP)
925 lkb->lkb_castmode_done = castmode;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 }
876 929
877 /* removes reference for the proc->asts lists added by 930 /* removes reference for the proc->asts lists added by
878 dlm_user_add_ast() and may result in the lkb being freed */ 931 dlm_user_add_ast() and may result in the lkb being freed */
932
879 if (removed) 933 if (removed)
880 dlm_put_lkb(lkb); 934 dlm_put_lkb(lkb);
881 935
936 /* the bast that was queued was eliminated (see unnecessary above),
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
882 return error; 942 return error;
883} 943}
884 944
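
The rework keys off the new lkb_ast_first field: dlm_user_add_ast() records
which AST type was queued first, and device_read() now delivers one AST per
read in that order. A hypothetical sequence, assuming a bast is queued
before a cast:

	/* Queue order: AST_BAST first, then AST_COMP.
	 *
	 * read 1: do_cast && do_bast, lkb_ast_first == AST_BAST
	 *         -> returns the bast, lkb_ast_first becomes AST_COMP
	 * read 2: only AST_COMP left
	 *         -> returns the cast, lkb_ast_first cleared to 0
	 *
	 * If dlm_modes_compat(bastmode, lkb_castmode_done) makes the bast
	 * unnecessary, read 1 instead delivers the pending cast directly,
	 * or loops via "goto try_another" when nothing is left to return.
	 */
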
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..2195c213ab2f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,13 +12,13 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
19 spin_lock(&inode_lock); 19 spin_lock(&inode_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
22 continue; 22 continue;
23 if (inode->i_mapping->nrpages == 0) 23 if (inode->i_mapping->nrpages == 0)
24 continue; 24 continue;
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
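
iterate_supers() (fs/super.c) factors out exactly the walk that
drop_pagecache() open-coded: pin each superblock under sb_lock, take
s_umount, and invoke the callback only when sb->s_root is set. Its
signature at this point in the tree:

	void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
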
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 1cc087635a5e..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 762
763/** 763/**
764 * ecryptfs_init_crypt_ctx 764 * ecryptfs_init_crypt_ctx
765 * @crypt_stat: Uninitilized crypt stats structure 765 * @crypt_stat: Uninitialized crypt stats structure
766 * 766 *
767 * Initialize the crypto context. 767 * Initialize the crypto context.
768 * 768 *
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
1793static struct list_head key_tfm_list; 1793static struct list_head key_tfm_list;
1794struct mutex key_tfm_list_mutex; 1794struct mutex key_tfm_list_mutex;
1795 1795
1796int ecryptfs_init_crypto(void) 1796int __init ecryptfs_init_crypto(void)
1797{ 1797{
1798 mutex_init(&key_tfm_list_mutex); 1798 mutex_init(&key_tfm_list_mutex);
1799 INIT_LIST_HEAD(&key_tfm_list); 1799 INIT_LIST_HEAD(&key_tfm_list);
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 2169 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2170 + encoded_name_no_prefix_size); 2170 + encoded_name_no_prefix_size);
2171 (*encoded_name)[(*encoded_name_size)] = '\0'; 2171 (*encoded_name)[(*encoded_name_size)] = '\0';
2172 (*encoded_name_size)++;
2173 } else { 2172 } else {
2174 rc = -EOPNOTSUPP; 2173 rc = -EOPNOTSUPP;
2175 } 2174 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
732 struct page *page_for_lower, 732 struct page *page_for_lower,
733 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
734int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
735 size_t size);
736int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
737 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
738int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
739 pgoff_t page_index, 738 pgoff_t page_index,
740 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
741 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
742struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
743int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
744int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
745 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..622c95140802 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -199,7 +199,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
199 "the persistent file for the dentry with name " 199 "the persistent file for the dentry with name "
200 "[%s]; rc = [%d]\n", __func__, 200 "[%s]; rc = [%d]\n", __func__,
201 ecryptfs_dentry->d_name.name, rc); 201 ecryptfs_dentry->d_name.name, rc);
202 goto out; 202 goto out_free;
203 } 203 }
204 } 204 }
205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) 205 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
@@ -207,7 +207,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
207 rc = -EPERM; 207 rc = -EPERM;
208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " 208 printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
209 "file must hence be opened RO\n", __func__); 209 "file must hence be opened RO\n", __func__);
210 goto out; 210 goto out_free;
211 } 211 }
212 ecryptfs_set_file_lower( 212 ecryptfs_set_file_lower(
213 file, ecryptfs_inode_to_private(inode)->lower_file); 213 file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -274,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280 ecryptfs_dentry_to_lower(dentry),
281 datasync);
282} 280}
283 281
284static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -294,12 +292,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
294 return rc; 292 return rc;
295} 293}
296 294
297static int ecryptfs_ioctl(struct inode *inode, struct file *file, 295static long
298 unsigned int cmd, unsigned long arg); 296ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
297{
298 struct file *lower_file = NULL;
299 long rc = -ENOTTY;
300
301 if (ecryptfs_file_to_private(file))
302 lower_file = ecryptfs_file_to_lower(file);
303 if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
304 rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
305 return rc;
306}
307
308#ifdef CONFIG_COMPAT
309static long
310ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
311{
312 struct file *lower_file = NULL;
313 long rc = -ENOIOCTLCMD;
314
315 if (ecryptfs_file_to_private(file))
316 lower_file = ecryptfs_file_to_lower(file);
317 if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
318 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
319 return rc;
320}
321#endif
299 322
300const struct file_operations ecryptfs_dir_fops = { 323const struct file_operations ecryptfs_dir_fops = {
301 .readdir = ecryptfs_readdir, 324 .readdir = ecryptfs_readdir,
302 .ioctl = ecryptfs_ioctl, 325 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
326#ifdef CONFIG_COMPAT
327 .compat_ioctl = ecryptfs_compat_ioctl,
328#endif
303 .open = ecryptfs_open, 329 .open = ecryptfs_open,
304 .flush = ecryptfs_flush, 330 .flush = ecryptfs_flush,
305 .release = ecryptfs_release, 331 .release = ecryptfs_release,
@@ -315,7 +341,10 @@ const struct file_operations ecryptfs_main_fops = {
315 .write = do_sync_write, 341 .write = do_sync_write,
316 .aio_write = generic_file_aio_write, 342 .aio_write = generic_file_aio_write,
317 .readdir = ecryptfs_readdir, 343 .readdir = ecryptfs_readdir,
318 .ioctl = ecryptfs_ioctl, 344 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
345#ifdef CONFIG_COMPAT
346 .compat_ioctl = ecryptfs_compat_ioctl,
347#endif
319 .mmap = generic_file_mmap, 348 .mmap = generic_file_mmap,
320 .open = ecryptfs_open, 349 .open = ecryptfs_open,
321 .flush = ecryptfs_flush, 350 .flush = ecryptfs_flush,
@@ -324,20 +353,3 @@ const struct file_operations ecryptfs_main_fops = {
324 .fasync = ecryptfs_fasync, 353 .fasync = ecryptfs_fasync,
325 .splice_read = generic_file_splice_read, 354 .splice_read = generic_file_splice_read,
326}; 355};
327
328static int
329ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
330 unsigned long arg)
331{
332 int rc = 0;
333 struct file *lower_file = NULL;
334
335 if (ecryptfs_file_to_private(file))
336 lower_file = ecryptfs_file_to_lower(file);
337 if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
338 rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
339 lower_file, cmd, arg);
340 else
341 rc = -ENOTTY;
342 return rc;
343}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
142static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
143{ 143{
144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
145 struct file fake_file;
146 struct ecryptfs_file_info tmp_file_info;
147 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
148 int rc = 0; 146 int rc = 0;
149 147
150 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
151 fake_file.f_path.dentry = ecryptfs_dentry;
152 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
153 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
154 ecryptfs_set_file_lower(
155 &fake_file,
156 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
157 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
158 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
159 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
160 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -273,7 +264,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
273 printk(KERN_ERR "%s: Out of memory whilst attempting " 264 printk(KERN_ERR "%s: Out of memory whilst attempting "
274 "to allocate ecryptfs_dentry_info struct\n", 265 "to allocate ecryptfs_dentry_info struct\n",
275 __func__); 266 __func__);
276 goto out_dput; 267 goto out_put;
277 } 268 }
278 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); 269 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
279 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); 270 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -348,14 +339,84 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
348out_free_kmem: 339out_free_kmem:
349 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 340 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
350 goto out; 341 goto out;
351out_dput: 342out_put:
352 dput(lower_dentry); 343 dput(lower_dentry);
344 mntput(lower_mnt);
353 d_drop(ecryptfs_dentry); 345 d_drop(ecryptfs_dentry);
354out: 346out:
355 return rc; 347 return rc;
356} 348}
357 349
358/** 350/**
351 * ecryptfs_new_lower_dentry
352 * @name: The name of the new dentry.
353 * @lower_dir_dentry: Parent directory of the new dentry.
354 * @nd: nameidata from last lookup.
355 *
 356 * Create a new dentry or get it from the lower parent dir.
357 */
358static struct dentry *
359ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
360 struct nameidata *nd)
361{
362 struct dentry *new_dentry;
363 struct dentry *tmp;
364 struct inode *lower_dir_inode;
365
366 lower_dir_inode = lower_dir_dentry->d_inode;
367
368 tmp = d_alloc(lower_dir_dentry, name);
369 if (!tmp)
370 return ERR_PTR(-ENOMEM);
371
372 mutex_lock(&lower_dir_inode->i_mutex);
373 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
374 mutex_unlock(&lower_dir_inode->i_mutex);
375
376 if (!new_dentry)
377 new_dentry = tmp;
378 else
379 dput(tmp);
380
381 return new_dentry;
382}
383
384
385/**
386 * ecryptfs_lookup_one_lower
387 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
388 * @lower_dir_dentry: lower parent directory
389 * @name: lower file name
390 *
 391 * Get the lower dentry from the VFS. If the lower dentry does not exist yet,
392 * create it.
393 */
394static struct dentry *
395ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
396 struct dentry *lower_dir_dentry, struct qstr *name)
397{
398 struct nameidata nd;
399 struct vfsmount *lower_mnt;
400 int err;
401
402 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
403 ecryptfs_dentry->d_parent));
 404 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name, 0, &nd);
405 mntput(lower_mnt);
406
407 if (!err) {
 408 /* we don't need the mount */
409 mntput(nd.path.mnt);
410 return nd.path.dentry;
411 }
412 if (err != -ENOENT)
413 return ERR_PTR(err);
414
415 /* create a new lower dentry */
416 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
417}
418
419/**
359 * ecryptfs_lookup 420 * ecryptfs_lookup
360 * @ecryptfs_dir_inode: The eCryptfs directory inode 421 * @ecryptfs_dir_inode: The eCryptfs directory inode
361 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 422 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -372,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
372 size_t encrypted_and_encoded_name_size; 433 size_t encrypted_and_encoded_name_size;
373 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 434 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
374 struct dentry *lower_dir_dentry, *lower_dentry; 435 struct dentry *lower_dir_dentry, *lower_dentry;
436 struct qstr lower_name;
375 int rc = 0; 437 int rc = 0;
376 438
377 ecryptfs_dentry->d_op = &ecryptfs_dops; 439 ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -382,14 +444,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
382 goto out_d_drop; 444 goto out_d_drop;
383 } 445 }
384 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
385 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 447 lower_name.name = ecryptfs_dentry->d_name.name;
386 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, 448 lower_name.len = ecryptfs_dentry->d_name.len;
387 lower_dir_dentry, 449 lower_name.hash = ecryptfs_dentry->d_name.hash;
388 ecryptfs_dentry->d_name.len); 450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
452 &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
390 if (IS_ERR(lower_dentry)) { 458 if (IS_ERR(lower_dentry)) {
391 rc = PTR_ERR(lower_dentry); 459 rc = PTR_ERR(lower_dentry);
392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
393 "[%d] on lower_dentry = [%s]\n", __func__, rc, 461 "[%d] on lower_dentry = [%s]\n", __func__, rc,
394 encrypted_and_encoded_name); 462 encrypted_and_encoded_name);
395 goto out_d_drop; 463 goto out_d_drop;
@@ -411,14 +479,20 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
411 "filename; rc = [%d]\n", __func__, rc); 479 "filename; rc = [%d]\n", __func__, rc);
412 goto out_d_drop; 480 goto out_d_drop;
413 } 481 }
414 mutex_lock(&lower_dir_dentry->d_inode->i_mutex); 482 lower_name.name = encrypted_and_encoded_name;
415 lower_dentry = lookup_one_len(encrypted_and_encoded_name, 483 lower_name.len = encrypted_and_encoded_name_size;
416 lower_dir_dentry, 484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
417 encrypted_and_encoded_name_size - 1); 485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
487 &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
419 if (IS_ERR(lower_dentry)) { 493 if (IS_ERR(lower_dentry)) {
420 rc = PTR_ERR(lower_dentry); 494 rc = PTR_ERR(lower_dentry);
421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
422 "[%d] on lower_dentry = [%s]\n", __func__, rc, 496 "[%d] on lower_dentry = [%s]\n", __func__, rc,
423 encrypted_and_encoded_name); 497 encrypted_and_encoded_name);
424 goto out_d_drop; 498 goto out_d_drop;
@@ -784,8 +858,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
784{ 858{
785 int rc = 0; 859 int rc = 0;
786 struct inode *inode = dentry->d_inode; 860 struct inode *inode = dentry->d_inode;
787 struct dentry *lower_dentry;
788 struct file fake_ecryptfs_file;
789 struct ecryptfs_crypt_stat *crypt_stat; 861 struct ecryptfs_crypt_stat *crypt_stat;
790 loff_t i_size = i_size_read(inode); 862 loff_t i_size = i_size_read(inode);
791 loff_t lower_size_before_truncate; 863 loff_t lower_size_before_truncate;
@@ -796,23 +868,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
796 goto out; 868 goto out;
797 } 869 }
798 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 870 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
799 /* Set up a fake ecryptfs file, this is used to interface with
800 * the file in the underlying filesystem so that the
801 * truncation has an effect there as well. */
802 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
803 fake_ecryptfs_file.f_path.dentry = dentry;
804 /* Released at out_free: label */
805 ecryptfs_set_file_private(&fake_ecryptfs_file,
806 kmem_cache_alloc(ecryptfs_file_info_cache,
807 GFP_KERNEL));
808 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
809 rc = -ENOMEM;
810 goto out;
811 }
812 lower_dentry = ecryptfs_dentry_to_lower(dentry);
813 ecryptfs_set_file_lower(
814 &fake_ecryptfs_file,
815 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
816 /* Switch on growing or shrinking file */ 871 /* Switch on growing or shrinking file */
817 if (ia->ia_size > i_size) { 872 if (ia->ia_size > i_size) {
818 char zero[] = { 0x00 }; 873 char zero[] = { 0x00 };
@@ -822,7 +877,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 * this triggers code that will fill in 0's throughout 877 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 878 * the intermediate portion of the previous end of the
824 * file and the new end of the file */ 879 * file and the new end of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 880 rc = ecryptfs_write(inode, zero,
826 (ia->ia_size - 1), 1); 881 (ia->ia_size - 1), 1);
827 } else { /* ia->ia_size < i_size_read(inode) */ 882 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down to the page 883 /* We're chopping off all the pages down to the page
@@ -832,13 +887,23 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
832 size_t num_zeros = (PAGE_CACHE_SIZE 887 size_t num_zeros = (PAGE_CACHE_SIZE
833 - (ia->ia_size & ~PAGE_CACHE_MASK)); 888 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 889
890
891 /*
 892 * XXX(truncate) this should really happen at the beginning
 893 * of ->setattr. But the code is too messy to do that as part
894 * of a larger patch. ecryptfs is also totally missing out
895 * on the inode_change_ok check at the beginning of
 896 * ->setattr which would include this.
897 */
898 rc = inode_newsize_ok(inode, ia->ia_size);
899 if (rc)
900 goto out;
901
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 902 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, ia->ia_size); 903 truncate_setsize(inode, ia->ia_size);
837 if (rc)
838 goto out_free;
839 lower_ia->ia_size = ia->ia_size; 904 lower_ia->ia_size = ia->ia_size;
840 lower_ia->ia_valid |= ATTR_SIZE; 905 lower_ia->ia_valid |= ATTR_SIZE;
841 goto out_free; 906 goto out;
842 } 907 }
843 if (num_zeros) { 908 if (num_zeros) {
844 char *zeros_virt; 909 char *zeros_virt;
@@ -846,25 +911,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
846 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 911 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
847 if (!zeros_virt) { 912 if (!zeros_virt) {
848 rc = -ENOMEM; 913 rc = -ENOMEM;
849 goto out_free; 914 goto out;
850 } 915 }
851 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 916 rc = ecryptfs_write(inode, zeros_virt,
852 ia->ia_size, num_zeros); 917 ia->ia_size, num_zeros);
853 kfree(zeros_virt); 918 kfree(zeros_virt);
854 if (rc) { 919 if (rc) {
855 printk(KERN_ERR "Error attempting to zero out " 920 printk(KERN_ERR "Error attempting to zero out "
856 "the remainder of the end page on " 921 "the remainder of the end page on "
857 "reducing truncate; rc = [%d]\n", rc); 922 "reducing truncate; rc = [%d]\n", rc);
858 goto out_free; 923 goto out;
859 } 924 }
860 } 925 }
861 vmtruncate(inode, ia->ia_size); 926 truncate_setsize(inode, ia->ia_size);
862 rc = ecryptfs_write_inode_size_to_metadata(inode); 927 rc = ecryptfs_write_inode_size_to_metadata(inode);
863 if (rc) { 928 if (rc) {
864 printk(KERN_ERR "Problem with " 929 printk(KERN_ERR "Problem with "
865 "ecryptfs_write_inode_size_to_metadata; " 930 "ecryptfs_write_inode_size_to_metadata; "
866 "rc = [%d]\n", rc); 931 "rc = [%d]\n", rc);
867 goto out_free; 932 goto out;
868 } 933 }
869 /* We are reducing the size of the ecryptfs file, and need to 934 /* We are reducing the size of the ecryptfs file, and need to
870 * know if we need to reduce the size of the lower file. */ 935 * know if we need to reduce the size of the lower file. */
@@ -878,10 +943,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
878 } else 943 } else
879 lower_ia->ia_valid &= ~ATTR_SIZE; 944 lower_ia->ia_valid &= ~ATTR_SIZE;
880 } 945 }
881out_free:
882 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
883 kmem_cache_free(ecryptfs_file_info_cache,
884 ecryptfs_file_to_private(&fake_ecryptfs_file));
885out: 946out:
886 return rc; 947 return rc;
887} 948}
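
vmtruncate() is being phased out: the new sequence validates the size
change up front with inode_newsize_ok() and then applies it with
truncate_setsize(), which cannot fail. The shape ->setattr implementations
are converging toward looks roughly like this (a sketch, not ecryptfs's
final code; my_fs_setattr is a placeholder):

	static int my_fs_setattr(struct dentry *dentry, struct iattr *ia)
	{
		struct inode *inode = dentry->d_inode;
		int rc;

		rc = inode_change_ok(inode, ia);   /* includes inode_newsize_ok() */
		if (rc)
			return rc;

		if (ia->ia_valid & ATTR_SIZE)
			truncate_setsize(inode, ia->ia_size);   /* void, cannot fail */

		/* ... apply the remaining attributes, mark the inode dirty ... */
		return 0;
	}
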
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
515 if (!s) { 515 if (!s) {
516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 516 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 517 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
518 rc = -ENOMEM;
518 goto out; 519 goto out;
519 } 520 }
520 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 521 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
806 if (!s) { 807 if (!s) {
807 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " 808 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
808 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); 809 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
810 rc = -ENOMEM;
809 goto out; 811 goto out;
810 } 812 }
811 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 813 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
86 return 0; 86 return 0;
87} 87}
88 88
89int ecryptfs_init_kthread(void) 89int __init ecryptfs_init_kthread(void)
90{ 90{
91 int rc = 0; 91 int rc = 0;
92 92
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
281 * 281 *
282 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
283 */ 283 */
284static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
285{ 285{
286 char *p; 286 char *p;
287 int rc = 0; 287 int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
293 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
294 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
296 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
297 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
298 int token; 298 int token;
299 char *sig_src; 299 char *sig_src;
@@ -483,68 +483,7 @@ out:
483} 483}
484 484
485struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
486 486static struct file_system_type ecryptfs_fs_type;
487/**
488 * ecryptfs_fill_super
489 * @sb: The ecryptfs super block
490 * @raw_data: The options passed to mount
491 * @silent: Not used but required by function prototype
492 *
493 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
494 *
495 * Returns zero on success; non-zero otherwise
496 */
497static int
498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
499{
500 struct ecryptfs_sb_info *esi;
501 int rc = 0;
502
503 /* Released in ecryptfs_put_super() */
504 ecryptfs_set_superblock_private(sb,
505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
506 GFP_KERNEL));
507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
510 rc = -ENOMEM;
511 goto out;
512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
519 sb->s_op = &ecryptfs_sops;
520 /* Released through deactivate_super(sb) from get_sb_nodev */
521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
522 .hash = 0,.name = "/",.len = 1});
523 if (!sb->s_root) {
524 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
525 rc = -ENOMEM;
526 goto out;
527 }
528 sb->s_root->d_op = &ecryptfs_dops;
529 sb->s_root->d_sb = sb;
530 sb->s_root->d_parent = sb->s_root;
531 /* Released in d_release when dput(sb->s_root) is called */
532 /* through deactivate_super(sb) from get_sb_nodev() */
533 ecryptfs_set_dentry_private(sb->s_root,
534 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
535 GFP_KERNEL));
536 if (!ecryptfs_dentry_to_private(sb->s_root)) {
537 ecryptfs_printk(KERN_ERR,
538 "dentry_info_cache alloc failed\n");
539 rc = -ENOMEM;
540 goto out;
541 }
542 rc = 0;
543out:
544 /* Should be able to rely on deactivate_super called from
545 * get_sb_nodev */
546 return rc;
547}
548 487
549/** 488/**
550 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
565 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
566 goto out; 505 goto out;
567 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
568 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
569 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
570 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
588 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
589 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
590 * 536 *
591 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
592 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
593 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
594 * with as much information as it can before needing
595 * the lower filesystem.
596 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
597 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
598 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
601 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
602 struct vfsmount *mnt) 545 struct vfsmount *mnt)
603{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
604 int rc; 551 int rc;
605 struct super_block *sb;
606 552
607 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
608 if (rc < 0) { 554 if (!sbi) {
609 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
610 goto out; 556 goto out;
611 } 557 }
612 sb = mnt->mnt_sb; 558
613 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
614 if (rc) { 560 if (rc) {
615 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
616 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
617 } 569 }
618 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
619 if (rc) { 573 if (rc) {
620 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
621 goto out_abort; 575 goto out;
622 } 576 }
623 goto out; 577
624out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
625 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
626 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
627out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
628 return rc; 619 return rc;
629} 620}
630 621
@@ -633,11 +624,16 @@ out:
633 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
634 * 625 *
635 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
636 * Private data is free'd in ecryptfs_put_super()
637 */ 627 */
638static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
639{ 629{
640 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
641} 637}
642 638
643static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
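
The error-handling convention after sget() is worth spelling out: once
ecryptfs_set_superblock_private(s, sbi) has run, the superblock owns sbi,
every later failure goes through deactivate_locked_super(s), and that lands
in the rewritten ecryptfs_kill_block_super() above, which frees everything
exactly once. In sketch form (illustration only):

	/* Ownership handoff in ecryptfs_get_sb():
	 *
	 *   sbi = kmem_cache_zalloc(...);            caller owns sbi
	 *   ecryptfs_set_superblock_private(s, sbi);
	 *   sbi = NULL;                              ->kill_sb() owns it now
	 *   ...
	 *   deactivate_locked_super(s);              on any later error
	 *     -> ecryptfs_kill_block_super(s)        frees crypt_stat, bdi, sbi
	 *
	 * The "if (sbi)" in the out: label therefore only fires for failures
	 * that happen before the handoff.
	 */
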
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 2d8dbce9d485..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
31 31
32static struct hlist_head *ecryptfs_daemon_hash; 32static struct hlist_head *ecryptfs_daemon_hash;
33struct mutex ecryptfs_daemon_hash_mux; 33struct mutex ecryptfs_daemon_hash_mux;
34static int ecryptfs_hash_buckets; 34static int ecryptfs_hash_bits;
35#define ecryptfs_uid_hash(uid) \ 35#define ecryptfs_uid_hash(uid) \
36 hash_long((unsigned long)uid, ecryptfs_hash_buckets) 36 hash_long((unsigned long)uid, ecryptfs_hash_bits)
37 37
38static u32 ecryptfs_msg_counter; 38static u32 ecryptfs_msg_counter;
39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; 39static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -274,7 +274,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
274 struct user_namespace *user_ns, struct pid *pid, 274 struct user_namespace *user_ns, struct pid *pid,
275 u32 seq) 275 u32 seq)
276{ 276{
277 struct ecryptfs_daemon *daemon; 277 struct ecryptfs_daemon *uninitialized_var(daemon);
278 struct ecryptfs_msg_ctx *msg_ctx; 278 struct ecryptfs_msg_ctx *msg_ctx;
279 size_t msg_size; 279 size_t msg_size;
280 struct nsproxy *nsproxy; 280 struct nsproxy *nsproxy;
@@ -473,7 +473,7 @@ sleep:
473 return rc; 473 return rc;
474} 474}
475 475
476int ecryptfs_init_messaging(void) 476int __init ecryptfs_init_messaging(void)
477{ 477{
478 int i; 478 int i;
479 int rc = 0; 479 int rc = 0;
@@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void)
486 } 486 }
487 mutex_init(&ecryptfs_daemon_hash_mux); 487 mutex_init(&ecryptfs_daemon_hash_mux);
488 mutex_lock(&ecryptfs_daemon_hash_mux); 488 mutex_lock(&ecryptfs_daemon_hash_mux);
489 ecryptfs_hash_buckets = 1; 489 ecryptfs_hash_bits = 1;
490 while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) 490 while (ecryptfs_number_of_users >> ecryptfs_hash_bits)
491 ecryptfs_hash_buckets++; 491 ecryptfs_hash_bits++;
492 ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) 492 ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head)
493 * ecryptfs_hash_buckets), GFP_KERNEL); 493 * (1 << ecryptfs_hash_bits)),
494 GFP_KERNEL);
494 if (!ecryptfs_daemon_hash) { 495 if (!ecryptfs_daemon_hash) {
495 rc = -ENOMEM; 496 rc = -ENOMEM;
496 printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); 497 printk(KERN_ERR "%s: Failed to allocate memory\n", __func__);
497 mutex_unlock(&ecryptfs_daemon_hash_mux); 498 mutex_unlock(&ecryptfs_daemon_hash_mux);
498 goto out; 499 goto out;
499 } 500 }
500 for (i = 0; i < ecryptfs_hash_buckets; i++) 501 for (i = 0; i < (1 << ecryptfs_hash_bits); i++)
501 INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); 502 INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]);
502 mutex_unlock(&ecryptfs_daemon_hash_mux); 503 mutex_unlock(&ecryptfs_daemon_hash_mux);
503 ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) 504 ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx)
@@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void)
554 int i; 555 int i;
555 556
556 mutex_lock(&ecryptfs_daemon_hash_mux); 557 mutex_lock(&ecryptfs_daemon_hash_mux);
557 for (i = 0; i < ecryptfs_hash_buckets; i++) { 558 for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
558 int rc; 559 int rc;
559 560
560 hlist_for_each_entry(daemon, elem, 561 hlist_for_each_entry(daemon, elem,
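
The rename from ecryptfs_hash_buckets to ecryptfs_hash_bits is not
cosmetic: hash_long(x, bits) returns a value in [0, 1 << bits), so the
table needs 1 << bits buckets, while the old code sized the array with the
bit count itself. With illustrative numbers:

	/* ecryptfs_number_of_users = 4:
	 *
	 *   bits = 1; while (4 >> bits) bits++;         ->  bits == 3
	 *   hash_long(uid, 3) returns 0..7              ->  needs 8 buckets
	 *
	 * old: kmalloc(sizeof(struct hlist_head) * 3)         -> under-allocated
	 * new: kmalloc(sizeof(struct hlist_head) * (1 << 3))  -> 8 buckets
	 */
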
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
500 * 500 *
501 * Returns zero on success; non-zero otherwise 501 * Returns zero on success; non-zero otherwise
502 */ 502 */
503int ecryptfs_init_ecryptfs_miscdev(void) 503int __init ecryptfs_init_ecryptfs_miscdev(void)
504{ 504{
505 int rc; 505 int rc;
506 506
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
44 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
45 * refcnt. 45 * refcnt.
46 */ 46 */
47struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
48{ 48{
49 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
50 struct inode *inode;
51 struct address_space *mapping;
52 struct page *page;
53
54 dentry = file->f_path.dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_mapping_page(mapping, index, (void *)file);
58 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
59 lock_page(page); 51 lock_page(page);
60 return page; 52 return page;
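
This hunk and the ecryptfs_readpage() change below are linked:
read_mapping_page(..., NULL) ends up invoking ->readpage() with a NULL file
pointer, so the read path must pull crypt_stat from the page's mapping
rather than from file->f_path.dentry. The dependency in sketch form:

	/* ecryptfs_get_locked_page(inode, idx)
	 *   -> read_mapping_page(inode->i_mapping, idx, NULL)
	 *      -> a_ops->readpage(filp = NULL, page)
	 *         == ecryptfs_readpage(NULL, page)
	 *            old: file->f_path.dentry->d_inode   -> NULL dereference
	 *            new: page->mapping->host            -> always valid
	 */
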
@@ -198,7 +190,7 @@ out:
198static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
199{ 191{
200 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
201 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
202 int rc = 0; 194 int rc = 0;
203 195
204 if (!crypt_stat 196 if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
300 292
301 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
302 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
303 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
304 file->f_path.dentry->d_inode)->crypt_stat;
305 296
306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
307 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
487 unsigned to = from + copied; 478 unsigned to = from + copied;
488 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
489 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
490 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
491 int rc; 482 int rc;
492 483
493 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..f7fc286a3aa9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
127 ecryptfs_set_superblock_private(sb, NULL);
128
129 unlock_kernel();
130}
131
132/**
133 * ecryptfs_statfs 112 * ecryptfs_statfs
134 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
135 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -139,11 +118,15 @@ static void ecryptfs_put_super(struct super_block *sb)
139 */ 118 */
140static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) 119static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
141{ 120{
142 return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf); 121 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
122
123 if (!lower_dentry->d_sb->s_op->statfs)
124 return -ENOSYS;
125 return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
143} 126}
144 127
145/** 128/**
146 * ecryptfs_clear_inode 129 * ecryptfs_evict_inode
147 * @inode - The ecryptfs inode 130 * @inode - The ecryptfs inode
148 * 131 *
149 * Called by iput() when the inode reference count reaches zero 132
@@ -152,8 +135,10 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
152 * on the inode free list. We use this to drop our reference to the 135 * on the inode free list. We use this to drop our reference to the
153 * lower inode. 136 * lower inode.
154 */ 137 */
155static void ecryptfs_clear_inode(struct inode *inode) 138static void ecryptfs_evict_inode(struct inode *inode)
156{ 139{
140 truncate_inode_pages(&inode->i_data, 0);
141 end_writeback(inode);
157 iput(ecryptfs_inode_to_lower(inode)); 142 iput(ecryptfs_inode_to_lower(inode));
158} 143}
159 144
@@ -203,9 +188,8 @@ const struct super_operations ecryptfs_sops = {
203 .alloc_inode = ecryptfs_alloc_inode, 188 .alloc_inode = ecryptfs_alloc_inode,
204 .destroy_inode = ecryptfs_destroy_inode, 189 .destroy_inode = ecryptfs_destroy_inode,
205 .drop_inode = generic_delete_inode, 190 .drop_inode = generic_delete_inode,
206 .put_super = ecryptfs_put_super,
207 .statfs = ecryptfs_statfs, 191 .statfs = ecryptfs_statfs,
208 .remount_fs = NULL, 192 .remount_fs = NULL,
209 .clear_inode = ecryptfs_clear_inode, 193 .evict_inode = ecryptfs_evict_inode,
210 .show_options = ecryptfs_show_options 194 .show_options = ecryptfs_show_options
211}; 195};
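
The super.c hunks track the 2.6.36 switch from ->clear_inode() to ->evict_inode(): the VFS no longer truncates the page cache or ends writeback on the implementation's behalf, so every ->evict_inode() must do both before releasing private state. A sketch of the contract for a stacking filesystem, with stackfs_evict_inode and lower_inode_of() as hypothetical names standing in for the filesystem's own functions:

        #include <linux/fs.h>
        #include <linux/mm.h>

        static void stackfs_evict_inode(struct inode *inode)
        {
                truncate_inode_pages(&inode->i_data, 0); /* drop all cached pages */
                end_writeback(inode);                    /* mark the inode clean and freeable */
                iput(lower_inode_of(inode));             /* hypothetical accessor: drop our
                                                          * reference to the lower inode */
        }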
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
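
The eventpoll hunk swaps the open-coded flag-then-enqueue pair for __add_wait_queue_exclusive(). From my reading of <linux/wait.h> in this release, the helper is just those two steps folded together (shown here as an approximation; the caller is still expected to hold the waitqueue lock):

        static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
                                                      wait_queue_t *wait)
        {
                wait->flags |= WQ_FLAG_EXCLUSIVE; /* wake only one exclusive waiter */
                __add_wait_queue(q, wait);
        }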
diff --git a/fs/exec.c b/fs/exec.c
index 029308754eea..56536ad0e7cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/stat.h> 29#include <linux/stat.h>
30#include <linux/fcntl.h> 30#include <linux/fcntl.h>
31#include <linux/smp_lock.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
33#include <linux/string.h> 32#include <linux/string.h>
34#include <linux/init.h> 33#include <linux/init.h>
@@ -131,7 +130,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
131 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 130 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
132 goto exit; 131 goto exit;
133 132
134 fsnotify_open(file->f_path.dentry); 133 fsnotify_open(file);
135 134
136 error = -ENOEXEC; 135 error = -ENOEXEC;
137 if(file->f_op) { 136 if(file->f_op) {
@@ -244,9 +243,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
244 * use STACK_TOP because that can depend on attributes which aren't 243 * use STACK_TOP because that can depend on attributes which aren't
245 * configured yet. 244 * configured yet.
246 */ 245 */
246 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
247 vma->vm_end = STACK_TOP_MAX; 247 vma->vm_end = STACK_TOP_MAX;
248 vma->vm_start = vma->vm_end - PAGE_SIZE; 248 vma->vm_start = vma->vm_end - PAGE_SIZE;
249 vma->vm_flags = VM_STACK_FLAGS; 249 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
250 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 250 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
251 INIT_LIST_HEAD(&vma->anon_vma_chain); 251 INIT_LIST_HEAD(&vma->anon_vma_chain);
252 err = insert_vm_struct(mm, vma); 252 err = insert_vm_struct(mm, vma);
@@ -363,13 +363,13 @@ err:
363/* 363/*
364 * count() counts the number of strings in array ARGV. 364 * count() counts the number of strings in array ARGV.
365 */ 365 */
366static int count(char __user * __user * argv, int max) 366static int count(const char __user * const __user * argv, int max)
367{ 367{
368 int i = 0; 368 int i = 0;
369 369
370 if (argv != NULL) { 370 if (argv != NULL) {
371 for (;;) { 371 for (;;) {
372 char __user * p; 372 const char __user * p;
373 373
374 if (get_user(p, argv)) 374 if (get_user(p, argv))
375 return -EFAULT; 375 return -EFAULT;
@@ -378,6 +378,9 @@ static int count(char __user * __user * argv, int max)
378 argv++; 378 argv++;
379 if (i++ >= max) 379 if (i++ >= max)
380 return -E2BIG; 380 return -E2BIG;
381
382 if (fatal_signal_pending(current))
383 return -ERESTARTNOHAND;
381 cond_resched(); 384 cond_resched();
382 } 385 }
383 } 386 }
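
This hunk and the matching one in copy_strings() below add the same guard: argv/envp lengths are controlled by userspace, so the walk must abort on a fatal signal and yield the CPU between iterations. The pattern in isolation (a generic illustration with a hypothetical function name, not kernel code):

        #include <linux/sched.h>
        #include <linux/errno.h>

        static int walk_user_controlled_list(unsigned long n)
        {
                while (n--) {
                        if (fatal_signal_pending(current))
                                return -ERESTARTNOHAND; /* dying: stop immediately */
                        cond_resched();                 /* be polite on long walks */
                        /* ... process one user-supplied element ... */
                }
                return 0;
        }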
@@ -389,7 +392,7 @@ static int count(char __user * __user * argv, int max)
389 * process's memory to the new process's stack. The call to get_user_pages() 392
390 * ensures the destination page is created and not swapped out. 393 * ensures the destination page is created and not swapped out.
391 */ 394 */
392static int copy_strings(int argc, char __user * __user * argv, 395static int copy_strings(int argc, const char __user *const __user *argv,
393 struct linux_binprm *bprm) 396 struct linux_binprm *bprm)
394{ 397{
395 struct page *kmapped_page = NULL; 398 struct page *kmapped_page = NULL;
@@ -398,7 +401,7 @@ static int copy_strings(int argc, char __user * __user * argv,
398 int ret; 401 int ret;
399 402
400 while (argc-- > 0) { 403 while (argc-- > 0) {
401 char __user *str; 404 const char __user *str;
402 int len; 405 int len;
403 unsigned long pos; 406 unsigned long pos;
404 407
@@ -421,6 +424,12 @@ static int copy_strings(int argc, char __user * __user * argv,
421 while (len > 0) { 424 while (len > 0) {
422 int offset, bytes_to_copy; 425 int offset, bytes_to_copy;
423 426
427 if (fatal_signal_pending(current)) {
428 ret = -ERESTARTNOHAND;
429 goto out;
430 }
431 cond_resched();
432
424 offset = pos % PAGE_SIZE; 433 offset = pos % PAGE_SIZE;
425 if (offset == 0) 434 if (offset == 0)
426 offset = PAGE_SIZE; 435 offset = PAGE_SIZE;
@@ -472,12 +481,13 @@ out:
472/* 481/*
473 * Like copy_strings, but get argv and its values from kernel memory. 482 * Like copy_strings, but get argv and its values from kernel memory.
474 */ 483 */
475int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) 484int copy_strings_kernel(int argc, const char *const *argv,
485 struct linux_binprm *bprm)
476{ 486{
477 int r; 487 int r;
478 mm_segment_t oldfs = get_fs(); 488 mm_segment_t oldfs = get_fs();
479 set_fs(KERNEL_DS); 489 set_fs(KERNEL_DS);
480 r = copy_strings(argc, (char __user * __user *)argv, bprm); 490 r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
481 set_fs(oldfs); 491 set_fs(oldfs);
482 return r; 492 return r;
483} 493}
@@ -595,6 +605,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
595#else 605#else
596 stack_top = arch_align_stack(stack_top); 606 stack_top = arch_align_stack(stack_top);
597 stack_top = PAGE_ALIGN(stack_top); 607 stack_top = PAGE_ALIGN(stack_top);
608
609 if (unlikely(stack_top < mmap_min_addr) ||
610 unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
611 return -ENOMEM;
612
598 stack_shift = vma->vm_end - stack_top; 613 stack_shift = vma->vm_end - stack_top;
599 614
600 bprm->p -= stack_shift; 615 bprm->p -= stack_shift;
@@ -618,6 +633,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
618 else if (executable_stack == EXSTACK_DISABLE_X) 633 else if (executable_stack == EXSTACK_DISABLE_X)
619 vm_flags &= ~VM_EXEC; 634 vm_flags &= ~VM_EXEC;
620 vm_flags |= mm->def_flags; 635 vm_flags |= mm->def_flags;
636 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
621 637
622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 638 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
623 vm_flags); 639 vm_flags);
@@ -632,6 +648,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
632 goto out_unlock; 648 goto out_unlock;
633 } 649 }
634 650
651 /* mprotect_fixup is overkill to remove the temporary stack flags */
652 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
653
635 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 654 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
636 stack_size = vma->vm_end - vma->vm_start; 655 stack_size = vma->vm_end - vma->vm_start;
637 /* 656 /*
@@ -650,6 +669,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
650 else 669 else
651 stack_base = vma->vm_start - stack_expand; 670 stack_base = vma->vm_start - stack_expand;
652#endif 671#endif
672 current->mm->start_stack = bprm->p;
653 ret = expand_stack(vma, stack_base); 673 ret = expand_stack(vma, stack_base);
654 if (ret) 674 if (ret)
655 ret = -EFAULT; 675 ret = -EFAULT;
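
Read together, the __bprm_mm_init() and setup_arg_pages() hunks give the temporary VM_STACK_INCOMPLETE_SETUP flag a complete lifecycle; my condensed reading of the flow (a summary, not literal kernel code):

        /*
         * __bprm_mm_init():  vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
         *                      the stack VMA exists, but page faults must treat
         *                      it as not fully set up yet
         * setup_arg_pages(): vm_flags |= VM_STACK_INCOMPLETE_SETUP;
         *                      re-asserted across the mprotect_fixup() call
         *                    vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
         *                      setup complete; the stack VMA is now ordinary
         */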
@@ -680,7 +700,7 @@ struct file *open_exec(const char *name)
680 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) 700 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
681 goto exit; 701 goto exit;
682 702
683 fsnotify_open(file->f_path.dentry); 703 fsnotify_open(file);
684 704
685 err = deny_write_access(file); 705 err = deny_write_access(file);
686 if (err) 706 if (err)
@@ -765,7 +785,6 @@ static int de_thread(struct task_struct *tsk)
765 struct signal_struct *sig = tsk->signal; 785 struct signal_struct *sig = tsk->signal;
766 struct sighand_struct *oldsighand = tsk->sighand; 786 struct sighand_struct *oldsighand = tsk->sighand;
767 spinlock_t *lock = &oldsighand->siglock; 787 spinlock_t *lock = &oldsighand->siglock;
768 int count;
769 788
770 if (thread_group_empty(tsk)) 789 if (thread_group_empty(tsk))
771 goto no_thread_group; 790 goto no_thread_group;
@@ -782,13 +801,13 @@ static int de_thread(struct task_struct *tsk)
782 spin_unlock_irq(lock); 801 spin_unlock_irq(lock);
783 return -EAGAIN; 802 return -EAGAIN;
784 } 803 }
804
785 sig->group_exit_task = tsk; 805 sig->group_exit_task = tsk;
786 zap_other_threads(tsk); 806 sig->notify_count = zap_other_threads(tsk);
807 if (!thread_group_leader(tsk))
808 sig->notify_count--;
787 809
788 /* Account for the thread group leader hanging around: */ 810 while (sig->notify_count) {
789 count = thread_group_leader(tsk) ? 1 : 2;
790 sig->notify_count = count;
791 while (atomic_read(&sig->count) > count) {
792 __set_current_state(TASK_UNINTERRUPTIBLE); 811 __set_current_state(TASK_UNINTERRUPTIBLE);
793 spin_unlock_irq(lock); 812 spin_unlock_irq(lock);
794 schedule(); 813 schedule();
@@ -995,7 +1014,7 @@ EXPORT_SYMBOL(flush_old_exec);
995void setup_new_exec(struct linux_binprm * bprm) 1014void setup_new_exec(struct linux_binprm * bprm)
996{ 1015{
997 int i, ch; 1016 int i, ch;
998 char * name; 1017 const char *name;
999 char tcomm[sizeof(current->comm)]; 1018 char tcomm[sizeof(current->comm)];
1000 1019
1001 arch_pick_mmap_layout(current->mm); 1020 arch_pick_mmap_layout(current->mm);
@@ -1115,7 +1134,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1115 bprm->unsafe = tracehook_unsafe_exec(p); 1134 bprm->unsafe = tracehook_unsafe_exec(p);
1116 1135
1117 n_fs = 1; 1136 n_fs = 1;
1118 write_lock(&p->fs->lock); 1137 spin_lock(&p->fs->lock);
1119 rcu_read_lock(); 1138 rcu_read_lock();
1120 for (t = next_thread(p); t != p; t = next_thread(t)) { 1139 for (t = next_thread(p); t != p; t = next_thread(t)) {
1121 if (t->fs == p->fs) 1140 if (t->fs == p->fs)
@@ -1132,7 +1151,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
1132 res = 1; 1151 res = 1;
1133 } 1152 }
1134 } 1153 }
1135 write_unlock(&p->fs->lock); 1154 spin_unlock(&p->fs->lock);
1136 1155
1137 return res; 1156 return res;
1138} 1157}
@@ -1314,9 +1333,9 @@ EXPORT_SYMBOL(search_binary_handler);
1314/* 1333/*
1315 * sys_execve() executes a new program. 1334 * sys_execve() executes a new program.
1316 */ 1335 */
1317int do_execve(char * filename, 1336int do_execve(const char * filename,
1318 char __user *__user *argv, 1337 const char __user *const __user *argv,
1319 char __user *__user *envp, 1338 const char __user *const __user *envp,
1320 struct pt_regs * regs) 1339 struct pt_regs * regs)
1321{ 1340{
1322 struct linux_binprm *bprm; 1341 struct linux_binprm *bprm;
@@ -1660,12 +1679,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1660 struct task_struct *tsk = current; 1679 struct task_struct *tsk = current;
1661 struct mm_struct *mm = tsk->mm; 1680 struct mm_struct *mm = tsk->mm;
1662 struct completion *vfork_done; 1681 struct completion *vfork_done;
1663 int core_waiters; 1682 int core_waiters = -EBUSY;
1664 1683
1665 init_completion(&core_state->startup); 1684 init_completion(&core_state->startup);
1666 core_state->dumper.task = tsk; 1685 core_state->dumper.task = tsk;
1667 core_state->dumper.next = NULL; 1686 core_state->dumper.next = NULL;
1668 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1687
1688 down_write(&mm->mmap_sem);
1689 if (!mm->core_state)
1690 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1669 up_write(&mm->mmap_sem); 1691 up_write(&mm->mmap_sem);
1670 1692
1671 if (unlikely(core_waiters < 0)) 1693 if (unlikely(core_waiters < 0))
@@ -1785,21 +1807,61 @@ static void wait_for_dump_helpers(struct file *file)
1785} 1807}
1786 1808
1787 1809
1810/*
1811 * umh_pipe_setup
1812 * helper function to customize the process used
1813 * to collect the core in userspace. Specifically
1814 * it sets up a pipe and installs it as fd 0 (stdin)
1815 * for the process. Returns 0 on success, or
1816 * PTR_ERR on failure.
1817 * Note that it also sets the core limit to 1. This
1818 * is a special value that we use to trap recursive
1819 * core dumps
1820 */
1821static int umh_pipe_setup(struct subprocess_info *info)
1822{
1823 struct file *rp, *wp;
1824 struct fdtable *fdt;
1825 struct coredump_params *cp = (struct coredump_params *)info->data;
1826 struct files_struct *cf = current->files;
1827
1828 wp = create_write_pipe(0);
1829 if (IS_ERR(wp))
1830 return PTR_ERR(wp);
1831
1832 rp = create_read_pipe(wp, 0);
1833 if (IS_ERR(rp)) {
1834 free_write_pipe(wp);
1835 return PTR_ERR(rp);
1836 }
1837
1838 cp->file = wp;
1839
1840 sys_close(0);
1841 fd_install(0, rp);
1842 spin_lock(&cf->file_lock);
1843 fdt = files_fdtable(cf);
1844 FD_SET(0, fdt->open_fds);
1845 FD_CLR(0, fdt->close_on_exec);
1846 spin_unlock(&cf->file_lock);
1847
1848 /* and disallow core files too */
1849 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1850
1851 return 0;
1852}
1853
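
umh_pipe_setup() exists because call_usermodehelper_fns() lets the caller run an init callback inside the helper process before it execs, with ->data carrying whatever pointer the caller passed. A minimal caller-side sketch (my_setup, my_ctx, and launch_helper are hypothetical names):

        #include <linux/kmod.h>
        #include <linux/errno.h>

        struct my_ctx { int token; };

        static int my_setup(struct subprocess_info *info)
        {
                struct my_ctx *ctx = info->data; /* the ctx pointer passed below */

                /* runs in the helper process, before exec: fds, rlimits, ... */
                return ctx->token ? 0 : -EINVAL;
        }

        static int launch_helper(char **argv, struct my_ctx *ctx)
        {
                return call_usermodehelper_fns(argv[0], argv, NULL, UMH_WAIT_EXEC,
                                               my_setup, NULL /* no cleanup */, ctx);
        }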
1788void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1854void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1789{ 1855{
1790 struct core_state core_state; 1856 struct core_state core_state;
1791 char corename[CORENAME_MAX_SIZE + 1]; 1857 char corename[CORENAME_MAX_SIZE + 1];
1792 struct mm_struct *mm = current->mm; 1858 struct mm_struct *mm = current->mm;
1793 struct linux_binfmt * binfmt; 1859 struct linux_binfmt * binfmt;
1794 struct inode * inode;
1795 const struct cred *old_cred; 1860 const struct cred *old_cred;
1796 struct cred *cred; 1861 struct cred *cred;
1797 int retval = 0; 1862 int retval = 0;
1798 int flag = 0; 1863 int flag = 0;
1799 int ispipe = 0; 1864 int ispipe;
1800 char **helper_argv = NULL;
1801 int helper_argc = 0;
1802 int dump_count = 0;
1803 static atomic_t core_dump_count = ATOMIC_INIT(0); 1865 static atomic_t core_dump_count = ATOMIC_INIT(0);
1804 struct coredump_params cprm = { 1866 struct coredump_params cprm = {
1805 .signr = signr, 1867 .signr = signr,
@@ -1818,23 +1880,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1818 binfmt = mm->binfmt; 1880 binfmt = mm->binfmt;
1819 if (!binfmt || !binfmt->core_dump) 1881 if (!binfmt || !binfmt->core_dump)
1820 goto fail; 1882 goto fail;
1821 1883 if (!__get_dumpable(cprm.mm_flags))
1822 cred = prepare_creds();
1823 if (!cred) {
1824 retval = -ENOMEM;
1825 goto fail; 1884 goto fail;
1826 }
1827 1885
1828 down_write(&mm->mmap_sem); 1886 cred = prepare_creds();
1829 /* 1887 if (!cred)
1830 * If another thread got here first, or we are not dumpable, bail out.
1831 */
1832 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1833 up_write(&mm->mmap_sem);
1834 put_cred(cred);
1835 goto fail; 1888 goto fail;
1836 }
1837
1838 /* 1889 /*
1839 * We cannot trust fsuid as being the "true" uid of the 1890 * We cannot trust fsuid as being the "true" uid of the
1840 * process nor do we know its entire history. We only know it 1891 * process nor do we know its entire history. We only know it
@@ -1847,10 +1898,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1847 } 1898 }
1848 1899
1849 retval = coredump_wait(exit_code, &core_state); 1900 retval = coredump_wait(exit_code, &core_state);
1850 if (retval < 0) { 1901 if (retval < 0)
1851 put_cred(cred); 1902 goto fail_creds;
1852 goto fail;
1853 }
1854 1903
1855 old_cred = override_creds(cred); 1904 old_cred = override_creds(cred);
1856 1905
@@ -1860,27 +1909,21 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1860 */ 1909 */
1861 clear_thread_flag(TIF_SIGPENDING); 1910 clear_thread_flag(TIF_SIGPENDING);
1862 1911
1863 /*
1864 * lock_kernel() because format_corename() is controlled by sysctl, which
1865 * uses lock_kernel()
1866 */
1867 lock_kernel();
1868 ispipe = format_corename(corename, signr); 1912 ispipe = format_corename(corename, signr);
1869 unlock_kernel();
1870
1871 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1872 goto fail_unlock;
1873 1913
1874 if (ispipe) { 1914 if (ispipe) {
1875 if (cprm.limit == 0) { 1915 int dump_count;
1916 char **helper_argv;
1917
1918 if (cprm.limit == 1) {
1876 /* 1919 /*
1877 * Normally core limits are irrelevant to pipes, since 1920 * Normally core limits are irrelevant to pipes, since
1878 * we're not writing to the file system, but we use 1921 * we're not writing to the file system, but we use
1879 * cprm.limit of 0 here as a speacial value. Any 1922 * cprm.limit of 1 here as a speacial value. Any
1880 * non-zero limit gets set to RLIM_INFINITY below, but 1923 * non-1 limit gets set to RLIM_INFINITY below, but
1881 * a limit of 0 skips the dump. This is a consistent 1924 * a limit of 0 skips the dump. This is a consistent
1882 * way to catch recursive crashes. We can still crash 1925 * way to catch recursive crashes. We can still crash
1883 * if the core_pattern binary sets RLIM_CORE = !0 1926 * if the core_pattern binary sets RLIM_CORE = !1
1884 * but it runs as root, and can do lots of stupid things 1927 * but it runs as root, and can do lots of stupid things
1885 * Note that we use task_tgid_vnr here to grab the pid 1928 * Note that we use task_tgid_vnr here to grab the pid
1886 * of the process group leader. That way we get the 1929 * of the process group leader. That way we get the
@@ -1888,11 +1931,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1888 * core_pattern process dies. 1931 * core_pattern process dies.
1889 */ 1932 */
1890 printk(KERN_WARNING 1933 printk(KERN_WARNING
1891 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1934 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1892 task_tgid_vnr(current), current->comm); 1935 task_tgid_vnr(current), current->comm);
1893 printk(KERN_WARNING "Aborting core\n"); 1936 printk(KERN_WARNING "Aborting core\n");
1894 goto fail_unlock; 1937 goto fail_unlock;
1895 } 1938 }
1939 cprm.limit = RLIM_INFINITY;
1896 1940
1897 dump_count = atomic_inc_return(&core_dump_count); 1941 dump_count = atomic_inc_return(&core_dump_count);
1898 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1942 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1902,71 +1946,114 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1902 goto fail_dropcount; 1946 goto fail_dropcount;
1903 } 1947 }
1904 1948
1905 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1949 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1906 if (!helper_argv) { 1950 if (!helper_argv) {
1907 printk(KERN_WARNING "%s failed to allocate memory\n", 1951 printk(KERN_WARNING "%s failed to allocate memory\n",
1908 __func__); 1952 __func__);
1909 goto fail_dropcount; 1953 goto fail_dropcount;
1910 } 1954 }
1911 1955
1912 cprm.limit = RLIM_INFINITY; 1956 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1913 1957 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1914 /* SIGPIPE can happen, but it's just never processed */ 1958 NULL, &cprm);
1915 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1959 argv_free(helper_argv);
1916 &cprm.file)) { 1960 if (retval) {
1917 printk(KERN_INFO "Core dump to %s pipe failed\n", 1961 printk(KERN_INFO "Core dump to %s pipe failed\n",
1918 corename); 1962 corename);
1919 goto fail_dropcount; 1963 goto close_fail;
1920 } 1964 }
1921 } else 1965 } else {
1966 struct inode *inode;
1967
1968 if (cprm.limit < binfmt->min_coredump)
1969 goto fail_unlock;
1970
1922 cprm.file = filp_open(corename, 1971 cprm.file = filp_open(corename,
1923 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1972 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1924 0600); 1973 0600);
1925 if (IS_ERR(cprm.file)) 1974 if (IS_ERR(cprm.file))
1926 goto fail_dropcount; 1975 goto fail_unlock;
1927 inode = cprm.file->f_path.dentry->d_inode;
1928 if (inode->i_nlink > 1)
1929 goto close_fail; /* multiple links - don't dump */
1930 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1931 goto close_fail;
1932
1933 /* AK: actually i see no reason to not allow this for named pipes etc.,
1934 but keep the previous behaviour for now. */
1935 if (!ispipe && !S_ISREG(inode->i_mode))
1936 goto close_fail;
1937 /*
1938 * Don't allow local users to get cute and trick others to coredump
1939 * into their pre-created files:
1940 * Note, this is not relevant for pipes
1941 */
1942 if (!ispipe && (inode->i_uid != current_fsuid()))
1943 goto close_fail;
1944 if (!cprm.file->f_op)
1945 goto close_fail;
1946 if (!cprm.file->f_op->write)
1947 goto close_fail;
1948 if (!ispipe &&
1949 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1950 goto close_fail;
1951 1976
1952 retval = binfmt->core_dump(&cprm); 1977 inode = cprm.file->f_path.dentry->d_inode;
1978 if (inode->i_nlink > 1)
1979 goto close_fail;
1980 if (d_unhashed(cprm.file->f_path.dentry))
1981 goto close_fail;
1982 /*
1983 * AK: actually i see no reason to not allow this for named
1984 * pipes etc, but keep the previous behaviour for now.
1985 */
1986 if (!S_ISREG(inode->i_mode))
1987 goto close_fail;
1988 /*
1989 * Don't allow local users to get cute and trick others to coredump
1990 * into their pre-created files.
1991 */
1992 if (inode->i_uid != current_fsuid())
1993 goto close_fail;
1994 if (!cprm.file->f_op || !cprm.file->f_op->write)
1995 goto close_fail;
1996 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1997 goto close_fail;
1998 }
1953 1999
2000 retval = binfmt->core_dump(&cprm);
1954 if (retval) 2001 if (retval)
1955 current->signal->group_exit_code |= 0x80; 2002 current->signal->group_exit_code |= 0x80;
1956close_fail: 2003
1957 if (ispipe && core_pipe_limit) 2004 if (ispipe && core_pipe_limit)
1958 wait_for_dump_helpers(cprm.file); 2005 wait_for_dump_helpers(cprm.file);
1959 filp_close(cprm.file, NULL); 2006close_fail:
2007 if (cprm.file)
2008 filp_close(cprm.file, NULL);
1960fail_dropcount: 2009fail_dropcount:
1961 if (dump_count) 2010 if (ispipe)
1962 atomic_dec(&core_dump_count); 2011 atomic_dec(&core_dump_count);
1963fail_unlock: 2012fail_unlock:
1964 if (helper_argv) 2013 coredump_finish(mm);
1965 argv_free(helper_argv);
1966
1967 revert_creds(old_cred); 2014 revert_creds(old_cred);
2015fail_creds:
1968 put_cred(cred); 2016 put_cred(cred);
1969 coredump_finish(mm);
1970fail: 2017fail:
1971 return; 2018 return;
1972} 2019}
2020
2021/*
2022 * Core dumping helper functions. These are the only things you should
2023 * do on a core-file: use only these functions to write out all the
2024 * necessary info.
2025 */
2026int dump_write(struct file *file, const void *addr, int nr)
2027{
2028 return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
2029}
2030EXPORT_SYMBOL(dump_write);
2031
2032int dump_seek(struct file *file, loff_t off)
2033{
2034 int ret = 1;
2035
2036 if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
2037 if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
2038 return 0;
2039 } else {
2040 char *buf = (char *)get_zeroed_page(GFP_KERNEL);
2041
2042 if (!buf)
2043 return 0;
2044 while (off > 0) {
2045 unsigned long n = off;
2046
2047 if (n > PAGE_SIZE)
2048 n = PAGE_SIZE;
2049 if (!dump_write(file, buf, n)) {
2050 ret = 0;
2051 break;
2052 }
2053 off -= n;
2054 }
2055 free_page((unsigned long)buf);
2056 }
2057 return ret;
2058}
2059EXPORT_SYMBOL(dump_seek);
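
dump_write() and dump_seek() above become the only sanctioned way for a binfmt ->core_dump() handler to emit core data; both return non-zero on success. A hypothetical consumer (foo_core_dump and the "FOOCORE" header are illustrative, not a real handler):

        #include <linux/fs.h>
        #include <linux/mm.h>
        #include <linux/binfmts.h>
        #include <linux/coredump.h>

        static int foo_core_dump(struct coredump_params *cprm)
        {
                static const char hdr[8] = "FOOCORE";

                if (!dump_write(cprm->file, hdr, sizeof(hdr)))
                        return 0;       /* short write: report failure */
                if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(hdr)))
                        return 0;       /* pad out to a page boundary */
                return 1;               /* non-zero means the dump succeeded */
        }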
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
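
The one-character fix above is worth spelling out: kunmap_atomic() must be handed the kernel virtual address returned by kmap_atomic(), never the struct page that was mapped. The pairing rule in isolation (a generic illustration with a hypothetical function name):

        #include <linux/highmem.h>

        static void zero_first_byte(struct page *page)
        {
                char *kaddr = kmap_atomic(page, KM_USER0);

                kaddr[0] = 0;                   /* use the temporary mapping */
                kunmap_atomic(kaddr, KM_USER0); /* pass the address, not the page */
        }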
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 22721b2fd890..2dc925fa1010 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -256,7 +256,6 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259void exofs_truncate(struct inode *inode);
260int exofs_setattr(struct dentry *, struct iattr *); 259int exofs_setattr(struct dentry *, struct iattr *);
261int exofs_write_begin(struct file *file, struct address_space *mapping, 260int exofs_write_begin(struct file *file, struct address_space *mapping,
262 loff_t pos, unsigned len, unsigned flags, 261 loff_t pos, unsigned len, unsigned flags,
@@ -264,7 +263,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
264extern struct inode *exofs_iget(struct super_block *, unsigned long); 263extern struct inode *exofs_iget(struct super_block *, unsigned long);
265struct inode *exofs_new_inode(struct inode *, int); 264struct inode *exofs_new_inode(struct inode *, int);
266extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); 265extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
267extern void exofs_delete_inode(struct inode *); 266extern void exofs_evict_inode(struct inode *);
268 267
269/* dir.c: */ 268/* dir.c: */
270int exofs_add_link(struct dentry *, struct inode *); 269int exofs_add_link(struct dentry *, struct inode *);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..68cb23e3bb98 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33
34#include <linux/buffer_head.h>
35
36#include "exofs.h" 33#include "exofs.h"
37 34
38static int exofs_release_file(struct inode *inode, struct file *filp) 35static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,20 +37,27 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 37 return 0;
41} 38}
42 39
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 40/* exofs_file_fsync - flush the inode to disk
44 int datasync) 41 *
42 * Note, in exofs all metadata is written as part of inode, regardless.
43 * The writeout is synchronous
44 */
45static int exofs_file_fsync(struct file *filp, int datasync)
45{ 46{
46 int ret; 47 int ret;
47 struct address_space *mapping = filp->f_mapping; 48 struct inode *inode = filp->f_mapping->host;
48 struct inode *inode = dentry->d_inode; 49 struct writeback_control wbc = {
50 .sync_mode = WB_SYNC_ALL,
51 .nr_to_write = 0, /* metadata-only; caller takes care of data */
52 };
49 struct super_block *sb; 53 struct super_block *sb;
50 54
51 ret = filemap_write_and_wait(mapping); 55 if (!(inode->i_state & I_DIRTY))
52 if (ret) 56 return 0;
53 return ret; 57 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
58 return 0;
54 59
55 /* sync the inode attributes */ 60 ret = sync_inode(inode, &wbc);
56 ret = write_inode_now(inode, 1);
57 61
58 /* This is a good place to write the sb */ 62 /* This is a good place to write the sb */
59 /* TODO: Schedule an sb-sync on create */ 63
@@ -66,9 +70,9 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 70
67static int exofs_flush(struct file *file, fl_owner_t id) 71static int exofs_flush(struct file *file, fl_owner_t id)
68{ 72{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 73 int ret = vfs_fsync(file, 0);
70 /* TODO: Flush the OSD target */ 74 /* TODO: Flush the OSD target */
71 return 0; 75 return ret;
72} 76}
73 77
74const struct file_operations exofs_file_operations = { 78const struct file_operations exofs_file_operations = {
@@ -87,6 +91,5 @@ const struct file_operations exofs_file_operations = {
87}; 91};
88 92
89const struct inode_operations exofs_file_inode_operations = { 93const struct inode_operations exofs_file_inode_operations = {
90 .truncate = exofs_truncate,
91 .setattr = exofs_setattr, 94 .setattr = exofs_setattr,
92}; 95};
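
Two API shifts meet in this file: the ->fsync() prototype drops its dentry argument, and data writeout is now the caller's job, so the handler only pushes inode metadata. A stripped-down sketch of the shape exofs_file_fsync() converges on (simplified; meta_only_fsync is a hypothetical name and the superblock write is omitted):

        #include <linux/fs.h>
        #include <linux/writeback.h>

        static int meta_only_fsync(struct file *filp, int datasync)
        {
                struct inode *inode = filp->f_mapping->host;
                struct writeback_control wbc = {
                        .sync_mode   = WB_SYNC_ALL,
                        .nr_to_write = 0, /* metadata only; data already written */
                };

                if (!(inode->i_state & I_DIRTY))
                        return 0;       /* nothing to push */
                if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                        return 0;       /* fdatasync: timestamps don't matter */
                return sync_inode(inode, &wbc);
        }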
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
32 */ 32 */
33 33
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/writeback.h>
36#include <linux/buffer_head.h>
37#include <scsi/scsi_device.h>
38 35
39#include "exofs.h" 36#include "exofs.h"
40 37
@@ -57,6 +54,9 @@ struct page_collect {
57 unsigned nr_pages; 54 unsigned nr_pages;
58 unsigned long length; 55 unsigned long length;
59 loff_t pg_first; /* keep 64bit also in 32-arches */ 56 loff_t pg_first; /* keep 64bit also in 32-arches */
57 bool read_4_write; /* This means two things: that the read is sync
58 * and that the pages should not be unlocked.
59 */
60}; 60};
61 61
62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 62static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -74,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
74 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
75 pcol->length = 0; 75 pcol->length = 0;
76 pcol->pg_first = -1; 76 pcol->pg_first = -1;
77 pcol->read_4_write = false;
77} 78}
78 79
79static void _pcol_reset(struct page_collect *pcol) 80static void _pcol_reset(struct page_collect *pcol)
@@ -350,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
350 if (PageError(page)) 351 if (PageError(page))
351 ClearPageError(page); 352 ClearPageError(page);
352 353
353 unlock_page(page); 354 if (!pcol->read_4_write)
355 unlock_page(page);
354 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 356 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
355 " splitting\n", inode->i_ino, page->index); 357 " splitting\n", inode->i_ino, page->index);
356 358
@@ -431,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
431 /* readpage_strip might call read_exec(,is_sync==false) at several 433 /* readpage_strip might call read_exec(,is_sync==false) at several
432 * places but not if we have a single page. 434 * places but not if we have a single page.
433 */ 435 */
436 pcol.read_4_write = is_sync;
434 ret = readpage_strip(&pcol, page); 437 ret = readpage_strip(&pcol, page);
435 if (ret) { 438 if (ret) {
436 EXOFS_ERR("_readpage => %d\n", ret); 439 EXOFS_ERR("_readpage => %d\n", ret);
@@ -697,6 +700,13 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
697 return write_exec(&pcol); 700 return write_exec(&pcol);
698} 701}
699 702
703/* i_mutex is held, so we can use inode->i_size directly */
704static void _write_failed(struct inode *inode, loff_t to)
705{
706 if (to > inode->i_size)
707 truncate_pagecache(inode, to, inode->i_size);
708}
709
700int exofs_write_begin(struct file *file, struct address_space *mapping, 710int exofs_write_begin(struct file *file, struct address_space *mapping,
701 loff_t pos, unsigned len, unsigned flags, 711 loff_t pos, unsigned len, unsigned flags,
702 struct page **pagep, void **fsdata) 712 struct page **pagep, void **fsdata)
@@ -710,7 +720,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
710 fsdata); 720 fsdata);
711 if (ret) { 721 if (ret) {
712 EXOFS_DBGMSG("simple_write_begin faild\n"); 722 EXOFS_DBGMSG("simple_write_begin faild\n");
713 return ret; 723 goto out;
714 } 724 }
715 725
716 page = *pagep; 726 page = *pagep;
@@ -725,6 +735,9 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
725 EXOFS_DBGMSG("__readpage_filler faild\n"); 735 EXOFS_DBGMSG("__readpage_filler faild\n");
726 } 736 }
727 } 737 }
738out:
739 if (unlikely(ret))
740 _write_failed(mapping->host, pos + len);
728 741
729 return ret; 742 return ret;
730} 743}
@@ -750,11 +763,28 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
750 int ret; 763 int ret;
751 764
752 ret = simple_write_end(file, mapping, pos, len, copied, page, fsdata); 765
766 if (unlikely(ret))
767 _write_failed(inode, pos + len);
768
769 /* TODO: once simple_write_end marks inode dirty remove */
753 if (i_size != inode->i_size) 770 if (i_size != inode->i_size)
754 mark_inode_dirty(inode); 771 mark_inode_dirty(inode);
755 return ret; 772 return ret;
756} 773}
757 774
775static int exofs_releasepage(struct page *page, gfp_t gfp)
776{
777 EXOFS_DBGMSG("page 0x%lx\n", page->index);
778 WARN_ON(1);
779 return 0;
780}
781
782static void exofs_invalidatepage(struct page *page, unsigned long offset)
783{
784 EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
785 WARN_ON(1);
786}
787
758const struct address_space_operations exofs_aops = { 788const struct address_space_operations exofs_aops = {
759 .readpage = exofs_readpage, 789 .readpage = exofs_readpage,
760 .readpages = exofs_readpages, 790 .readpages = exofs_readpages,
@@ -762,6 +792,21 @@ const struct address_space_operations exofs_aops = {
762 .writepages = exofs_writepages, 792 .writepages = exofs_writepages,
763 .write_begin = exofs_write_begin_export, 793 .write_begin = exofs_write_begin_export,
764 .write_end = exofs_write_end, 794 .write_end = exofs_write_end,
795 .releasepage = exofs_releasepage,
796 .set_page_dirty = __set_page_dirty_nobuffers,
797 .invalidatepage = exofs_invalidatepage,
798
799 /* Not implemented yet */
800 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
801 .direct_IO = NULL, /* TODO: Should be trivial to do */
802
803 /* With these, NULL has special meaning or the default is not exported */
804 .sync_page = NULL,
805 .get_xip_mem = NULL,
806 .migratepage = NULL,
807 .launder_page = NULL,
808 .is_partially_uptodate = NULL,
809 .error_remove_page = NULL,
765}; 810};
766 811
767/****************************************************************************** 812/******************************************************************************
@@ -778,87 +823,55 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
778 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 823 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
779} 824}
780 825
781/*
782 * get_block_t - Fill in a buffer_head
783 * An OSD takes care of block allocation so we just fake an allocation by
784 * putting in the inode's sector_t in the buffer_head.
785 * TODO: What about the case of create==0 and @iblock does not exist in the
786 * object?
787 */
788static int exofs_get_block(struct inode *inode, sector_t iblock,
789 struct buffer_head *bh_result, int create)
790{
791 map_bh(bh_result, inode->i_sb, iblock);
792 return 0;
793}
794
795const struct osd_attr g_attr_logical_length = ATTR_DEF( 826const struct osd_attr g_attr_logical_length = ATTR_DEF(
796 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 827 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
797 828
798static int _do_truncate(struct inode *inode) 829static int _do_truncate(struct inode *inode, loff_t newsize)
799{ 830{
800 struct exofs_i_info *oi = exofs_i(inode); 831 struct exofs_i_info *oi = exofs_i(inode);
801 loff_t isize = i_size_read(inode);
802 int ret; 832 int ret;
803 833
804 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 834 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
805 835
806 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block); 836 ret = exofs_oi_truncate(oi, (u64)newsize);
837 if (likely(!ret))
838 truncate_setsize(inode, newsize);
807 839
808 ret = exofs_oi_truncate(oi, (u64)isize); 840 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
809 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize); 841 inode->i_ino, newsize, ret);
810 return ret; 842 return ret;
811} 843}
812 844
813/* 845/*
814 * Truncate a file to the specified size - all we have to do is set the size 846 * Set inode attributes - update size attribute on OSD if needed,
815 * attribute. We make sure the object exists first. 847 * otherwise just call generic functions.
816 */
817void exofs_truncate(struct inode *inode)
818{
819 struct exofs_i_info *oi = exofs_i(inode);
820 int ret;
821
822 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
823 || S_ISLNK(inode->i_mode)))
824 return;
825 if (exofs_inode_is_fast_symlink(inode))
826 return;
827 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
828 return;
829
830 /* if we are about to truncate an object, and it hasn't been
831 * created yet, wait
832 */
833 if (unlikely(wait_obj_created(oi)))
834 goto fail;
835
836 ret = _do_truncate(inode);
837 if (ret)
838 goto fail;
839
840out:
841 mark_inode_dirty(inode);
842 return;
843fail:
844 make_bad_inode(inode);
845 goto out;
846}
847
848/*
849 * Set inode attributes - just call generic functions.
850 */ 848 */
851int exofs_setattr(struct dentry *dentry, struct iattr *iattr) 849int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
852{ 850{
853 struct inode *inode = dentry->d_inode; 851 struct inode *inode = dentry->d_inode;
854 int error; 852 int error;
855 853
854 /* if we are about to modify an object, and it hasn't been
855 * created yet, wait
856 */
857 error = wait_obj_created(exofs_i(inode));
858 if (unlikely(error))
859 return error;
860
856 error = inode_change_ok(inode, iattr); 861 error = inode_change_ok(inode, iattr);
857 if (error) 862 if (unlikely(error))
858 return error; 863 return error;
859 864
860 error = inode_setattr(inode, iattr); 865 if ((iattr->ia_valid & ATTR_SIZE) &&
861 return error; 866 iattr->ia_size != i_size_read(inode)) {
867 error = _do_truncate(inode, iattr->ia_size);
868 if (unlikely(error))
869 return error;
870 }
871
872 setattr_copy(inode, iattr);
873 mark_inode_dirty(inode);
874 return 0;
862} 875}
863 876
864static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( 877static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
@@ -1123,16 +1136,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1123 sbi = sb->s_fs_info; 1136 sbi = sb->s_fs_info;
1124 1137
1125 sb->s_dirt = 1; 1138 sb->s_dirt = 1;
1126 inode->i_uid = current->cred->fsuid; 1139 inode_init_owner(inode, dir, mode);
1127 if (dir->i_mode & S_ISGID) {
1128 inode->i_gid = dir->i_gid;
1129 if (S_ISDIR(mode))
1130 mode |= S_ISGID;
1131 } else {
1132 inode->i_gid = current->cred->fsgid;
1133 }
1134 inode->i_mode = mode;
1135
1136 inode->i_ino = sbi->s_nextid++; 1140 inode->i_ino = sbi->s_nextid++;
1137 inode->i_blkbits = EXOFS_BLKSHIFT; 1141 inode->i_blkbits = EXOFS_BLKSHIFT;
1138 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1142 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -1304,7 +1308,7 @@ static void delete_done(struct exofs_io_state *ios, void *p)
1304 * from the OSD here. We make sure the object was created before we try and 1308 * from the OSD here. We make sure the object was created before we try and
1305 * delete it. 1309 * delete it.
1306 */ 1310 */
1307void exofs_delete_inode(struct inode *inode) 1311void exofs_evict_inode(struct inode *inode)
1308{ 1312{
1309 struct exofs_i_info *oi = exofs_i(inode); 1313 struct exofs_i_info *oi = exofs_i(inode);
1310 struct super_block *sb = inode->i_sb; 1314 struct super_block *sb = inode->i_sb;
@@ -1314,30 +1318,27 @@ void exofs_delete_inode(struct inode *inode)
1314 1318
1315 truncate_inode_pages(&inode->i_data, 0); 1319 truncate_inode_pages(&inode->i_data, 0);
1316 1320
1317 if (is_bad_inode(inode)) 1321 /* TODO: should do better here */
1322 if (inode->i_nlink || is_bad_inode(inode))
1318 goto no_delete; 1323 goto no_delete;
1319 1324
1320 mark_inode_dirty(inode);
1321 exofs_update_inode(inode, inode_needs_sync(inode));
1322
1323 inode->i_size = 0; 1325 inode->i_size = 0;
1324 if (inode->i_blocks) 1326 end_writeback(inode);
1325 exofs_truncate(inode);
1326 1327
1327 clear_inode(inode); 1328 /* if we are deleting an obj that hasn't been created yet, wait */
1329 if (!obj_created(oi)) {
1330 BUG_ON(!obj_2bcreated(oi));
1331 wait_event(oi->i_wq, obj_created(oi));
1332 /* ignore the error attempt a remove anyway */
1333 }
1328 1334
1335 /* Now Remove the OSD objects */
1329 ret = exofs_get_io_state(&sbi->layout, &ios); 1336 ret = exofs_get_io_state(&sbi->layout, &ios);
1330 if (unlikely(ret)) { 1337 if (unlikely(ret)) {
1331 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1338 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1332 return; 1339 return;
1333 } 1340 }
1334 1341
1335 /* if we are deleting an obj that hasn't been created yet, wait */
1336 if (!obj_created(oi)) {
1337 BUG_ON(!obj_2bcreated(oi));
1338 wait_event(oi->i_wq, obj_created(oi));
1339 }
1340
1341 ios->obj.id = exofs_oi_objno(oi); 1342 ios->obj.id = exofs_oi_objno(oi);
1342 ios->done = delete_done; 1343 ios->done = delete_done;
1343 ios->private = sbi; 1344 ios->private = sbi;
@@ -1353,5 +1354,5 @@ void exofs_delete_inode(struct inode *inode)
1353 return; 1354 return;
1354 1355
1355no_delete: 1356no_delete:
1356 clear_inode(inode); 1357 end_writeback(inode);
1357} 1358}
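
The inode.c conversion follows the post-2.6.36 ->setattr() recipe: validate, perform the filesystem's own on-disk truncate, shrink the in-core size and page cache with truncate_setsize(), then copy the remaining attributes. The skeleton under those assumptions (skeleton_setattr is a hypothetical name; the OSD-specific truncate is reduced to a placeholder comment):

        #include <linux/fs.h>
        #include <linux/mm.h>

        static int skeleton_setattr(struct dentry *dentry, struct iattr *iattr)
        {
                struct inode *inode = dentry->d_inode;
                int error = inode_change_ok(inode, iattr);

                if (error)
                        return error;

                if ((iattr->ia_valid & ATTR_SIZE) &&
                    iattr->ia_size != i_size_read(inode)) {
                        /* filesystem-specific on-disk truncate goes here */
                        truncate_setsize(inode, iattr->ia_size);
                }

                setattr_copy(inode, iattr);     /* mode, uid, times, ... */
                mark_inode_dirty(inode);
                return 0;
        }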
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4337cad7777b..6550bf70e41d 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 305struct _striping_info {
306 u64 obj_offset; 306 u64 obj_offset;
307 u64 group_length; 307 u64 group_length;
308 u64 total_group_length;
309 u64 Major;
310 unsigned dev; 308 unsigned dev;
311 unsigned unit_off; 309 unsigned unit_off;
312}; 310};
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
343 (M * group_depth * stripe_unit); 341 (M * group_depth * stripe_unit);
344 342
345 si->group_length = T - H; 343 si->group_length = T - H;
346 si->total_group_length = T;
347 si->Major = M;
348} 344}
349 345
350static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
392} 388}
393 389
394static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 390static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
395 struct _striping_info *si, unsigned first_comp) 391 struct _striping_info *si)
396{ 392{
397 unsigned stripe_unit = ios->layout->stripe_unit; 393 unsigned stripe_unit = ios->layout->stripe_unit;
398 unsigned mirrors_p1 = ios->layout->mirrors_p1; 394 unsigned mirrors_p1 = ios->layout->mirrors_p1;
399 unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 395 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
400 unsigned dev = si->dev; 396 unsigned dev = si->dev;
401 unsigned first_dev = dev - (dev % devs_in_group); 397 unsigned first_dev = dev - (dev % devs_in_group);
402 unsigned comp = first_comp + (dev - first_dev);
403 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; 398 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
404 unsigned cur_pg = ios->pages_consumed; 399 unsigned cur_pg = ios->pages_consumed;
405 int ret = 0; 400 int ret = 0;
406 401
407 while (length) { 402 while (length) {
408 struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; 403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
409 unsigned cur_len, page_off = 0; 404 unsigned cur_len, page_off = 0;
410 405
411 if (!per_dev->length) { 406 if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
424 cur_len = stripe_unit; 419 cur_len = stripe_unit;
425 } 420 }
426 421
427 if (max_comp < comp) 422 if (max_comp < dev)
428 max_comp = comp; 423 max_comp = dev;
429
430 dev += mirrors_p1;
431 dev = (dev % devs_in_group) + first_dev;
432 } else { 424 } else {
433 cur_len = stripe_unit; 425 cur_len = stripe_unit;
434 } 426 }
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
440 if (unlikely(ret)) 432 if (unlikely(ret))
441 goto out; 433 goto out;
442 434
443 comp += mirrors_p1; 435 dev += mirrors_p1;
444 comp = (comp % devs_in_group) + first_comp; 436 dev = (dev % devs_in_group) + first_dev;
445 437
446 length -= cur_len; 438 length -= cur_len;
447 } 439 }
@@ -454,18 +446,15 @@ out:
454static int _prepare_for_striping(struct exofs_io_state *ios) 446static int _prepare_for_striping(struct exofs_io_state *ios)
455{ 447{
456 u64 length = ios->length; 448 u64 length = ios->length;
449 u64 offset = ios->offset;
457 struct _striping_info si; 450 struct _striping_info si;
458 unsigned devs_in_group = ios->layout->group_width *
459 ios->layout->mirrors_p1;
460 unsigned first_comp = 0;
461 int ret = 0; 451 int ret = 0;
462 452
463 _calc_stripe_info(ios, ios->offset, &si);
464
465 if (!ios->pages) { 453 if (!ios->pages) {
466 if (ios->kern_buff) { 454 if (ios->kern_buff) {
467 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
468 456
457 _calc_stripe_info(ios, ios->offset, &si);
469 per_dev->offset = si.obj_offset; 458 per_dev->offset = si.obj_offset;
470 per_dev->dev = si.dev; 459 per_dev->dev = si.dev;
471 460
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
479 } 468 }
480 469
481 while (length) { 470 while (length) {
471 _calc_stripe_info(ios, offset, &si);
472
482 if (length < si.group_length) 473 if (length < si.group_length)
483 si.group_length = length; 474 si.group_length = length;
484 475
485 ret = _prepare_one_group(ios, si.group_length, &si, first_comp); 476 ret = _prepare_one_group(ios, si.group_length, &si);
486 if (unlikely(ret)) 477 if (unlikely(ret))
487 goto out; 478 goto out;
488 479
480 offset += si.group_length;
489 length -= si.group_length; 481 length -= si.group_length;
490
491 si.group_length = si.total_group_length;
492 si.unit_off = 0;
493 ++si.Major;
494 si.obj_offset = si.Major * ios->layout->stripe_unit *
495 ios->layout->group_depth;
496
497 si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
498 si.dev %= ios->layout->s_numdevs;
499
500 first_comp += devs_in_group;
501 first_comp %= ios->layout->s_numdevs;
502 } 482 }
503 483
504out: 484out:
@@ -599,7 +579,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
599 } else { 579 } else {
600 bio = master_dev->bio; 580 bio = master_dev->bio;
601 /* FIXME: bio_set_dir() */ 581 /* FIXME: bio_set_dir() */
602 bio->bi_rw |= (1 << BIO_RW); 582 bio->bi_rw |= REQ_WRITE;
603 } 583 }
604 584
605 osd_req_write(or, &ios->obj, per_dev->offset, bio, 585 osd_req_write(or, &ios->obj, per_dev->offset, bio,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 03149b9a5178..047e92fa3af8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33 33
34#include <linux/smp_lock.h>
35#include <linux/string.h> 34#include <linux/string.h>
36#include <linux/parser.h> 35#include <linux/parser.h>
37#include <linux/vfs.h> 36#include <linux/vfs.h>
@@ -743,7 +742,7 @@ static const struct super_operations exofs_sops = {
743 .alloc_inode = exofs_alloc_inode, 742 .alloc_inode = exofs_alloc_inode,
744 .destroy_inode = exofs_destroy_inode, 743 .destroy_inode = exofs_destroy_inode,
745 .write_inode = exofs_write_inode, 744 .write_inode = exofs_write_inode,
746 .delete_inode = exofs_delete_inode, 745 .evict_inode = exofs_evict_inode,
747 .put_super = exofs_put_super, 746 .put_super = exofs_put_super,
748 .write_super = exofs_write_super, 747 .write_super = exofs_write_super,
749 .sync_fs = exofs_sync_fs, 748 .sync_fs = exofs_sync_fs,
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..2bcc0431bada 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -200,6 +200,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
200 return error; 200 return error;
201 else { 201 else {
202 inode->i_mode = mode; 202 inode->i_mode = mode;
203 inode->i_ctime = CURRENT_TIME_SEC;
203 mark_inode_dirty(inode); 204 mark_inode_dirty(inode);
204 if (error == 0) 205 if (error == 0)
205 acl = NULL; 206 acl = NULL;
@@ -420,7 +421,7 @@ release_and_out:
420 return error; 421 return error;
421} 422}
422 423
423struct xattr_handler ext2_xattr_acl_access_handler = { 424const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 425 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 426 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 427 .list = ext2_xattr_list_acl_access,
@@ -428,7 +429,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 429 .set = ext2_xattr_set_acl,
429}; 430};
430 431
431struct xattr_handler ext2_xattr_acl_default_handler = { 432const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 433 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 434 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 435 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 3cf038c055d7..c6c684b44ea1 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -571,7 +571,7 @@ do_more:
571error_return: 571error_return:
572 brelse(bitmap_bh); 572 brelse(bitmap_bh);
573 release_blocks(sb, freed); 573 release_blocks(sb, freed);
574 dquot_free_block(inode, freed); 574 dquot_free_block_nodirty(inode, freed);
575} 575}
576 576
577/** 577/**
@@ -1332,6 +1332,12 @@ retry_alloc:
1332 1332
1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); 1333 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1334 /* 1334 /*
1335 * skip this group (and avoid loading bitmap) if there
1336 * are no free blocks
1337 */
1338 if (!free_blocks)
1339 continue;
1340 /*
1335 * skip this group if the number of 1341 * skip this group if the number of
1336 * free blocks is less than half of the reservation 1342 * free blocks is less than half of the reservation
1337 * window size. 1343 * window size.
@@ -1412,7 +1418,8 @@ allocated:
1412 1418
1413 *errp = 0; 1419 *errp = 0;
1414 brelse(bitmap_bh); 1420 brelse(bitmap_bh);
1415 dquot_free_block(inode, *count-num); 1421 dquot_free_block_nodirty(inode, *count-num);
1422 mark_inode_dirty(inode);
1416 *count = num; 1423 *count = num;
1417 return ret_block; 1424 return ret_block;
1418 1425
@@ -1422,8 +1429,10 @@ out:
1422 /* 1429 /*
1423 * Undo the block allocation 1430 * Undo the block allocation
1424 */ 1431 */
1425 if (!performed_allocation) 1432 if (!performed_allocation) {
1426 dquot_free_block(inode, *count); 1433 dquot_free_block_nodirty(inode, *count);
1434 mark_inode_dirty(inode);
1435 }
1427 brelse(bitmap_bh); 1436 brelse(bitmap_bh);
1428 return 0; 1437 return 0;
1429} 1438}
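
The balloc.c hunks switch to the _nodirty quota variant so that quota accounting and the caller's own inode update dirty the inode once instead of twice. The pattern in isolation (an illustrative fragment with a hypothetical name, not ext2 code):

        #include <linux/fs.h>
        #include <linux/quotaops.h>

        static void give_back_blocks(struct inode *inode, qsize_t nr)
        {
                dquot_free_block_nodirty(inode, nr); /* adjust quota, defer dirtying */
                mark_inode_dirty(inode);             /* one explicit, batched dirty */
        }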
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957273ed..764109886ec0 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -448,6 +448,11 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 	return res;
 }
 
+static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	return __block_write_begin(page, pos, len, ext2_get_block);
+}
+
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 		   struct page *page, struct inode *inode, int update_times)
@@ -458,8 +463,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	int err;
 
 	lock_page(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, len,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	err = ext2_prepare_chunk(page, pos, len);
 	BUG_ON(err);
 	de->inode = cpu_to_le32(inode->i_ino);
 	ext2_set_de_type(de, inode);
@@ -542,8 +546,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 got_it:
 	pos = page_offset(page) +
 		(char*)de - (char*)page_address(page);
-	err = __ext2_write_begin(NULL, page->mapping, pos, rec_len, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, pos, rec_len);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
@@ -576,8 +579,7 @@ out_unlock:
  */
 int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 {
-	struct address_space *mapping = page->mapping;
-	struct inode *inode = mapping->host;
+	struct inode *inode = page->mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1);
 	unsigned to = ((char *)dir - kaddr) +
@@ -601,8 +603,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page )
 		from = (char*)pde - (char*)page_address(page);
 		pos = page_offset(page) + from;
 		lock_page(page);
-		err = __ext2_write_begin(NULL, page->mapping, pos, to - from, 0,
-							&page, NULL);
+		err = ext2_prepare_chunk(page, pos, to - from);
 		BUG_ON(err);
 		if (pde)
 			pde->rec_len = ext2_rec_len_to_disk(to - from);
@@ -621,8 +622,7 @@ out:
  */
 int ext2_make_empty(struct inode *inode, struct inode *parent)
 {
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page = grab_cache_page(mapping, 0);
+	struct page *page = grab_cache_page(inode->i_mapping, 0);
 	unsigned chunk_size = ext2_chunk_size(inode);
 	struct ext2_dir_entry_2 * de;
 	int err;
@@ -631,8 +631,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = __ext2_write_begin(NULL, page->mapping, 0, chunk_size, 0,
-							&page, NULL);
+	err = ext2_prepare_chunk(page, 0, chunk_size);
 	if (err) {
 		unlock_page(page);
 		goto fail;
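
ext2_prepare_chunk() replaces the old __ext2_write_begin() wrapper: the directory code already holds a locked page, so it can call __block_write_begin() directly instead of going through the pagecache-lookup write_begin path. All of the callers above follow the same locked-page pattern, roughly (a sketch; ext2_commit_chunk() is the existing ext2 helper that completes the write, not something added by this patch):

	lock_page(page);
	err = ext2_prepare_chunk(page, pos, len);	/* map buffers for [pos, pos+len) */
	BUG_ON(err);		/* page is locked and already in the page cache */
	/* ... edit the directory entry at page_address(page) ... */
	err = ext2_commit_chunk(page, pos, len);	/* dirty buffers, update i_size */
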
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..416daa62242c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -119,18 +119,14 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
 extern int ext2_write_inode (struct inode *, struct writeback_control *);
-extern void ext2_delete_inode (struct inode *);
+extern void ext2_evict_inode(struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		u64 start, u64 len);
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
 
 /* ioctl.c */
 extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
@@ -155,7 +151,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
-extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ext2_fsync(struct file *file, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
 	return 0;
 }
 
-int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ext2_fsync(struct file *file, int datasync)
 {
 	int ret;
-	struct super_block *sb = dentry->d_inode->i_sb;
+	struct super_block *sb = file->f_mapping->host->i_sb;
 	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
 
-	ret = simple_fsync(file, dentry, datasync);
+	ret = generic_file_fsync(file, datasync);
 	if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
 		/* We don't really know where the IO error happened... */
 		ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
 #endif
 
 const struct inode_operations ext2_file_inode_operations = {
-	.truncate	= ext2_truncate,
#ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
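
The ->fsync() prototype dropped its dentry argument in this kernel generation; everything the method needs is reachable from the struct file. The minimal shape of a post-change implementation, using only calls visible in the patch (a sketch; example_fsync is a hypothetical name):

int example_fsync(struct file *file, int datasync)
{
	/* the inode is reached through the file, not through a dentry */
	struct inode *inode = file->f_mapping->host;

	if (inode->i_sb->s_flags & MS_RDONLY)
		return 0;
	/* generic_file_fsync() writes the inode and its dirty buffers */
	return generic_file_fsync(file, datasync);
}
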
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..ad70479aabff 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
 	struct super_block * sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bitmap_bh;
 	unsigned long block_group;
 	unsigned long bit;
 	struct ext2_super_block * es;
@@ -118,31 +118,25 @@ void ext2_free_inode (struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	if (!is_bad_inode(inode)) {
-		/* Quota is already initialized in iput() */
-		ext2_xattr_delete_inode(inode);
-		dquot_free_inode(inode);
-		dquot_drop(inode);
-	}
+	/* Quota is already initialized in iput() */
+	ext2_xattr_delete_inode(inode);
+	dquot_free_inode(inode);
+	dquot_drop(inode);
 
 	es = EXT2_SB(sb)->s_es;
 	is_directory = S_ISDIR(inode->i_mode);
 
-	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
-
 	if (ino < EXT2_FIRST_INO(sb) ||
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
+		return;
 	}
 	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
-	brelse(bitmap_bh);
 	bitmap_bh = read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
-		goto error_return;
+		return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +148,7 @@ void ext2_free_inode (struct inode * inode)
 	mark_buffer_dirty(bitmap_bh);
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
-error_return:
+
 	brelse(bitmap_bh);
 }
 
@@ -550,16 +544,12 @@ got:
 
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh2);
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
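
inode_init_owner(), used in the last hunk above, centralizes the uid/gid/mode setup that ext2 and ext3 used to open-code, including setgid-directory inheritance. It behaves roughly like the following sketch (paraphrased from the generic VFS helper of this era, not copied from this patch):

void inode_init_owner(struct inode *inode, const struct inode *dir,
		      mode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;	/* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new subdirectories stay setgid */
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
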
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..940c96168868 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
  *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
@@ -55,29 +54,57 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
 		inode->i_blocks - ea_blocks == 0);
 }
 
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, to, inode->i_size);
+		ext2_truncate_blocks(inode, inode->i_size);
+	}
+}
+
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext2_delete_inode (struct inode * inode)
+void ext2_evict_inode(struct inode * inode)
 {
-	if (!is_bad_inode(inode))
+	struct ext2_block_alloc_info *rsv;
+	int want_delete = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
 		dquot_initialize(inode);
+	} else {
+		dquot_drop(inode);
+	}
+
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
-		goto no_delete;
+	if (want_delete) {
+		/* set dtime */
 		EXT2_I(inode)->i_dtime	= get_seconds();
 		mark_inode_dirty(inode);
 		__ext2_write_inode(inode, inode_needs_sync(inode));
+		/* truncate to 0 */
+		inode->i_size = 0;
+		if (inode->i_blocks)
+			ext2_truncate_blocks(inode, 0);
+	}
+
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 
-	inode->i_size = 0;
-	if (inode->i_blocks)
-		ext2_truncate (inode);
-	ext2_free_inode (inode);
+	ext2_discard_reservation(inode);
+	rsv = EXT2_I(inode)->i_block_alloc_info;
+	EXT2_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
 
-	return;
-no_delete:
-	clear_inode(inode);	/* We must guarantee clearing of inode... */
+	if (want_delete)
+		ext2_free_inode(inode);
 }
 
 typedef struct {
@@ -412,6 +439,8 @@ static int ext2_alloc_blocks(struct inode *inode,
 failed_out:
 	for (i = 0; i <index; i++)
 		ext2_free_blocks(inode, new_blocks[i], 1);
+	if (index)
+		mark_inode_dirty(inode);
 	return ret;
 }
 
@@ -754,21 +783,30 @@ ext2_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
 }
 
-int __ext2_write_begin(struct file *file, struct address_space *mapping,
+static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
-static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata)
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
 {
-	*pagep = NULL;
-	return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
+	int ret;
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (ret < len)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int
@@ -776,13 +814,13 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	/*
-	 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
-	 * directory handling code to pass around offsets rather than struct
-	 * pages in order to make this work easily.
-	 */
-	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext2_get_block);
+	int ret;
+
+	ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
+			       ext2_get_block);
+	if (ret < 0)
+		ext2_write_failed(mapping, pos + len);
+	return ret;
 }
 
 static int ext2_nobh_writepage(struct page *page,
@@ -801,10 +839,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		loff_t offset, unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_block, NULL);
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				iov, offset, nr_segs, ext2_get_block, NULL);
+	if (ret < 0 && (rw & WRITE))
+		ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+	return ret;
 }
 
 static int
@@ -819,7 +862,7 @@ const struct address_space_operations ext2_aops = {
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
 	.write_begin		= ext2_write_begin,
-	.write_end		= generic_write_end,
+	.write_end		= ext2_write_end,
 	.bmap			= ext2_bmap,
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
@@ -968,8 +1011,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 		else if (block_to_free == nr - count)
 			count++;
 		else {
-			mark_inode_dirty(inode);
 			ext2_free_blocks (inode, block_to_free, count);
+			mark_inode_dirty(inode);
 		free_this:
 			block_to_free = nr;
 			count = 1;
@@ -977,8 +1020,8 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
 		}
 	}
 	if (count > 0) {
-		mark_inode_dirty(inode);
 		ext2_free_blocks (inode, block_to_free, count);
+		mark_inode_dirty(inode);
 	}
 }
 
@@ -1028,7 +1071,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
 		ext2_free_data(inode, p, q);
 }
 
-void ext2_truncate(struct inode *inode)
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
 	__le32 *i_data = EXT2_I(inode)->i_data;
 	struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1040,27 +1083,8 @@ void ext2_truncate(struct inode *inode)
 	int n;
 	long iblock;
 	unsigned blocksize;
-
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	    S_ISLNK(inode->i_mode)))
-		return;
-	if (ext2_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
-
 	blocksize = inode->i_sb->s_blocksize;
-	iblock = (inode->i_size + blocksize-1)
-					>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-
-	if (mapping_is_xip(inode->i_mapping))
-		xip_truncate_page(inode->i_mapping, inode->i_size);
-	else if (test_opt(inode->i_sb, NOBH))
-		nobh_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
-	else
-		block_truncate_page(inode->i_mapping,
-				inode->i_size, ext2_get_block);
+	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
@@ -1128,6 +1152,54 @@ do_indirects:
 	ext2_discard_reservation(inode);
 
 	mutex_unlock(&ei->truncate_mutex);
+}
+
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+	/*
+	 * XXX: it seems like a bug here that we don't allow
+	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
+	 * review and fix this.
+	 *
+	 * Also would be nice to be able to handle IO errors and such,
+	 * but that's probably too much to ask.
+	 */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+	if (ext2_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	__ext2_truncate_blocks(inode, offset);
+}
+
+static int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+	int error;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return -EINVAL;
+	if (ext2_inode_is_fast_symlink(inode))
+		return -EINVAL;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+
+	if (mapping_is_xip(inode->i_mapping))
+		error = xip_truncate_page(inode->i_mapping, newsize);
+	else if (test_opt(inode->i_sb, NOBH))
+		error = nobh_truncate_page(inode->i_mapping,
+					   newsize, ext2_get_block);
+	else
+		error = block_truncate_page(inode->i_mapping,
+					    newsize, ext2_get_block);
+	if (error)
+		return error;
+
+	truncate_setsize(inode, newsize);
+	__ext2_truncate_blocks(inode, newsize);
+
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	if (inode_needs_sync(inode)) {
 		sync_mapping_buffers(inode->i_mapping);
@@ -1135,6 +1207,8 @@ do_indirects:
 	} else {
 		mark_inode_dirty(inode);
 	}
+
+	return 0;
 }
 
 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1406,11 +1480,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 			/* If this is the first large file
 			 * created, add a flag to the superblock.
 			 */
-			lock_kernel();
+			spin_lock(&EXT2_SB(sb)->s_lock);
 			ext2_update_dynamic_rev(sb);
 			EXT2_SET_RO_COMPAT_FEATURE(sb,
 				EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
-			unlock_kernel();
+			spin_unlock(&EXT2_SB(sb)->s_lock);
 			ext2_write_super(sb);
 		}
 	}
@@ -1467,7 +1541,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -1475,8 +1549,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (error)
 			return error;
 	}
-	error = inode_setattr(inode, iattr);
-	if (!error && (iattr->ia_valid & ATTR_MODE))
+	if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
+		error = ext2_setsize(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+	setattr_copy(inode, iattr);
+	if (iattr->ia_valid & ATTR_MODE)
 		error = ext2_acl_chmod(inode);
+	mark_inode_dirty(inode);
+
 	return error;
 }
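
The inode.c conversion above illustrates the 2.6.36 VFS change that folded .delete_inode and .clear_inode into a single .evict_inode method: it runs for every inode leaving the icache, decides internally whether on-disk deletion is wanted, and must call end_writeback() exactly once after dropping the page cache. A generic skeleton of that contract (a hedged sketch, not ext2's actual code; example_evict_inode is a hypothetical name):

static void example_evict_inode(struct inode *inode)
{
	int want_delete = !inode->i_nlink && !is_bad_inode(inode);

	truncate_inode_pages(&inode->i_data, 0);	/* always drop page cache */
	if (want_delete) {
		/* on-disk deletion work: set dtime, truncate blocks, etc. */
	}
	invalidate_inode_buffers(inode);
	end_writeback(inode);		/* required: detaches the inode from writeback */
	/* old ->clear_inode teardown (reservations, private state) goes here */
	if (want_delete) {
		/* finally release the on-disk inode */
	}
}
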
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..1ec602673ea8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
 #include <linux/random.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
@@ -39,7 +38,7 @@
 #include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es);
+			    struct ext2_super_block *es, int wait);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
 	struct ext2_super_block *es = sbi->s_es;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
 		sbi->s_mount_state |= EXT2_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 
 	va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 	va_end(args);
 }
 
+/*
+ * This must be called with sbi->s_lock held.
+ */
 void ext2_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,7 +119,7 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	lock_kernel();
+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 	if (sb->s_dirt)
 		ext2_write_super(sb);
@@ -124,8 +128,10 @@ static void ext2_put_super (struct super_block * sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
+		spin_lock(&sbi->s_lock);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
@@ -140,8 +146,6 @@ static void ext2_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
@@ -191,17 +195,6 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
-static void ext2_clear_inode(struct inode *inode)
-{
-	struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info;
-
-	dquot_drop(inode);
-	ext2_discard_reservation(inode);
-	EXT2_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
-}
-
 static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
@@ -209,6 +202,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
 
+	spin_lock(&sbi->s_lock);
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 
 	if (sbi->s_sb_block != 1)
@@ -281,6 +275,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
@@ -293,13 +288,12 @@ static const struct super_operations ext2_sops = {
 	.alloc_inode	= ext2_alloc_inode,
 	.destroy_inode	= ext2_destroy_inode,
 	.write_inode	= ext2_write_inode,
-	.delete_inode	= ext2_delete_inode,
+	.evict_inode	= ext2_evict_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
 	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
-	.clear_inode	= ext2_clear_inode,
 	.show_options	= ext2_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
@@ -606,7 +600,6 @@ static int ext2_setup_super (struct super_block * sb,
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
-	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
 		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
 			"bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +760,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
+	spin_lock_init(&sbi->s_lock);
+
 	/*
 	 * See what the current blocksize for the device is, and
 	 * use that as the blocksize.  Otherwise (or if the blocksize
@@ -1058,6 +1053,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op = &ext2_sops;
 	sb->s_export_op = &ext2_export_ops;
 	sb->s_xattr = ext2_xattr_handlers;
+
+#ifdef CONFIG_QUOTA
+	sb->dq_op = &dquot_operations;
+	sb->s_qcop = &dquot_quotactl_ops;
+#endif
+
 	root = ext2_iget(sb, EXT2_ROOT_INO);
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
@@ -1079,7 +1080,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
-	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
 	return 0;
 
 cantfind_ext2:
@@ -1120,30 +1123,26 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
-				"superblock detected", sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+			"previous I/O error to superblock detected\n");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 }
 
-static void ext2_commit_super (struct super_block * sb,
-			       struct ext2_super_block * es)
-{
-	ext2_clear_super_error(sb);
-	es->s_wtime = cpu_to_le32(get_seconds());
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sb->s_dirt = 0;
-}
-
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
 {
 	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
 }
 
@@ -1157,43 +1156,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * may have been checked while mounted and e2fsck may have
  * set s_state to EXT2_VALID_FS after some corrections.
  */
-
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
-
-	lock_kernel();
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext2_msg(sb, KERN_ERR,
-			"previous I/O error to superblock detected\n");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
 
+	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-		es->s_free_blocks_count =
-			cpu_to_le32(ext2_count_free_blocks(sb));
-		es->s_free_inodes_count =
-			cpu_to_le32(ext2_count_free_inodes(sb));
-		es->s_mtime = cpu_to_le32(get_seconds());
-		ext2_sync_super(sb, es);
-	} else {
-		ext2_commit_super(sb, es);
 	}
-	sb->s_dirt = 0;
-	unlock_kernel();
-
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
 	return 0;
 }
 
@@ -1215,7 +1189,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
-	lock_kernel();
+	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
@@ -1254,21 +1228,31 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
+		spin_unlock(&sbi->s_lock);
 		return 0;
 	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
 		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
-			unlock_kernel();
+			spin_unlock(&sbi->s_lock);
 			return 0;
 		}
+
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
 		 */
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 		es->s_mtime = cpu_to_le32(get_seconds());
+		spin_unlock(&sbi->s_lock);
+
+		err = dquot_suspend(sb, -1);
+		if (err < 0) {
+			spin_lock(&sbi->s_lock);
+			goto restore_opts;
+		}
+
+		ext2_sync_super(sb, es, 1);
 	} else {
 		__le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
 					       ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1272,20 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_state = le16_to_cpu(es->s_state);
 		if (!ext2_setup_super (sb, es, 0))
 			sb->s_flags &= ~MS_RDONLY;
+		spin_unlock(&sbi->s_lock);
+
+		ext2_write_super(sb);
+
+		dquot_resume(sb, -1);
 	}
-	ext2_sync_super(sb, es);
-	unlock_kernel();
+
 	return 0;
restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
-	unlock_kernel();
+	spin_unlock(&sbi->s_lock);
 	return err;
 }
 
@@ -1308,6 +1296,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct ext2_super_block *es = sbi->s_es;
 	u64 fsid;
 
+	spin_lock(&sbi->s_lock);
+
 	if (test_opt (sb, MINIX_DF))
 		sbi->s_overhead_last = 0;
 	else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1352,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 		le64_to_cpup((void *)es->s_uuid + sizeof(u64));
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
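
Throughout super.c the big kernel lock is replaced by a per-superblock spinlock, sbi->s_lock, which serializes updates to the in-core ext2_super_block fields. Because a spinlock cannot be held across buffer I/O, the recurring pattern is update, drop the lock, then write, as in this condensed sketch (illustrative, assembled from the calls in the patch):

	spin_lock(&sbi->s_lock);		/* protects s_es / sbi fields */
	es->s_state = cpu_to_le16(sbi->s_mount_state);
	spin_unlock(&sbi->s_lock);		/* must drop before sleeping I/O */
	mark_buffer_dirty(sbi->s_sbh);
	sync_dirty_buffer(sbi->s_sbh);		/* only when a synchronous write is wanted */
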
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..8c29ae15129e 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 
 static struct mb_cache *ext2_xattr_cache;
 
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
 };
 
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext2_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
 		handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 	/* list the attribute names */
 	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
 	     entry = EXT2_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 }
@@ -672,6 +674,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		new_bh = sb_getblk(sb, block);
 		if (!new_bh) {
 			ext2_free_blocks(inode, block, 1);
+			mark_inode_dirty(inode);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -701,8 +704,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * written (only some dirty data were not) so we just proceed
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
-			if (new_bh && new_bh != old_bh)
-				dquot_free_block(inode, 1);
+			if (new_bh && new_bh != old_bh) {
+				dquot_free_block_nodirty(inode, 1);
+				mark_inode_dirty(inode);
+			}
 			goto cleanup;
 		}
 	} else
@@ -725,6 +730,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			mb_cache_entry_free(ce);
 		ea_bdebug(old_bh, "freeing");
 		ext2_free_blocks(inode, old_bh->b_blocknr, 1);
+		mark_inode_dirty(inode);
 		/* We let our caller release old_bh, so we
 		 * need to duplicate the buffer before. */
 		get_bh(old_bh);
@@ -734,7 +740,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 		if (ce)
 			mb_cache_entry_release(ce);
-		dquot_free_block(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
+		mark_inode_dirty(inode);
 		mark_buffer_dirty(old_bh);
 		ea_bdebug(old_bh, "refcount now=%d",
 			le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -797,7 +804,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		dquot_free_block(inode, 1);
+		dquot_free_block_nodirty(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
@@ -836,7 +843,7 @@ ext2_xattr_cache_insert(struct buffer_head *bh)
 	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
 	if (!ce)
 		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
 	if (error) {
 		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
@@ -910,8 +917,8 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, 0,
-				       inode->i_sb->s_bdev, hash);
+	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
+				       hash);
 	while (ce) {
 		struct buffer_head *bh;
 
@@ -943,7 +950,7 @@ again:
 			unlock_buffer(bh);
 			brelse(bh);
 		}
-		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
 }
@@ -1019,9 +1026,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
int __init
init_ext2_xattr(void)
{
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
-		sizeof(struct mb_cache_entry) +
-		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
 	if (!ext2_xattr_cache)
 		return -ENOMEM;
 	return 0;
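
The mb_cache changes track an API simplification in 2.6.36: a cache now carries exactly one index, so the index count, per-index entry sizing and the index arguments all disappear, and hashes are passed by value rather than by pointer. The before/after call shapes, condensed (a sketch using only calls visible in the patch):

	/* old: mb_cache_create(name, bdev, entry_size, index_count, bucket_bits) */
	cache = mb_cache_create("ext2_xattr", 6);		/* just name + bucket bits */

	error = mb_cache_entry_insert(ce, bdev, block, hash);	/* hash by value, no &hash */
	ce = mb_cache_entry_find_first(cache, bdev, hash);	/* no index argument */
	ce = mb_cache_entry_find_next(ce, bdev, hash);
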
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
 
# ifdef CONFIG_EXT2_FS_XATTR
 
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
 
 extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
 
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
 extern int init_ext2_xattr(void);
 extern void exit_ext2_xattr(void);
 
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
 
# else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext2_xattr_security_list,
 	.get	= ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext2_xattr_trusted_list,
 	.get	= ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext2_xattr_user_list,
 	.get	= ext2_xattr_user_get,
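
All five ext2 handler definitions, the handler map and the ext2_xattr_handlers[] table gain const in the hunks above because the VFS made the xattr handler pointers hung off sb->s_xattr const. The resulting end state, condensed (a sketch; example_handler and example_handlers are hypothetical names):

static const struct xattr_handler example_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= ext2_xattr_user_list,
	.get	= ext2_xattr_user_get,
	.set	= ext2_xattr_user_set,
};

/* NULL-terminated table wired into sb->s_xattr at mount time */
static const struct xattr_handler *example_handlers[] = {
	&example_handler,
	NULL,
};
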
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f45..e8c6ba0e4a3e 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
config EXT3_DEFAULTS_TO_ORDERED
 	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
+	default y
 	help
 	  The journal mode options for ext3 have different tradeoffs
 	  between when data is guaranteed to be on disk and
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..8a11fe212183 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -205,6 +205,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 			return error;
 		else {
 			inode->i_mode = mode;
+			inode->i_ctime = CURRENT_TIME_SEC;
 			ext3_mark_inode_dirty(handle, inode);
 			if (error == 0)
 				acl = NULL;
@@ -456,7 +457,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext3_xattr_list_acl_access,
@@ -464,7 +465,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
 	.set	= ext3_xattr_set_acl,
 };
 
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a177122a1b25..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1584,6 +1584,12 @@ retry_alloc:
 			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 			kfree (old);
 		}
 		if (!parent)
-			root->rb_node = NULL;
+			*root = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,12 +43,12 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret = 0; 51 int ret, needs_barrier = 0;
52 tid_t commit_tid; 52 tid_t commit_tid;
53 53
54 if (inode->i_sb->s_flags & MS_RDONLY) 54 if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
70 * (they were dirtied by commit). But that's OK - the blocks are 70 * (they were dirtied by commit). But that's OK - the blocks are
71 * safe in-journal, which is all fsync() needs to ensure. 71 * safe in-journal, which is all fsync() needs to ensure.
72 */ 72 */
73 if (ext3_should_journal_data(inode)) { 73 if (ext3_should_journal_data(inode))
74 ret = ext3_force_commit(inode->i_sb); 74 return ext3_force_commit(inode->i_sb);
75 goto out;
76 }
77 75
78 if (datasync) 76 if (datasync)
79 commit_tid = atomic_read(&ei->i_datasync_tid); 77 commit_tid = atomic_read(&ei->i_datasync_tid);
80 else 78 else
81 commit_tid = atomic_read(&ei->i_sync_tid); 79 commit_tid = atomic_read(&ei->i_sync_tid);
82 80
83 if (log_start_commit(journal, commit_tid)) { 81 if (test_opt(inode->i_sb, BARRIER) &&
84 log_wait_commit(journal, commit_tid); 82 !journal_trans_will_send_data_barrier(journal, commit_tid))
85 goto out; 83 needs_barrier = 1;
86 } 84 log_start_commit(journal, commit_tid);
85 ret = log_wait_commit(journal, commit_tid);
87 86
88 /* 87 /*
89 * In case we didn't commit a transaction, we have to flush 88 * In case we didn't commit a transaction, we have to flush
90 * disk caches manually so that data really is on persistent 89 * disk caches manually so that data really is on persistent
91 * storage 90 * storage
92 */ 91 */
93 if (test_opt(inode->i_sb, BARRIER)) 92 if (needs_barrier)
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95out: 94 BLKDEV_IFL_WAIT);
96 return ret; 95 return ret;
97} 96}
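Condensing the rewritten ext3_sync_file() above: the explicit cache flush is now issued only when a barrier-enabled mount cannot rely on the journal commit to flush the drive cache for it. Schematically, using the same calls as the hunk with comments added:

        tid_t tid = datasync ? atomic_read(&ei->i_datasync_tid)
                             : atomic_read(&ei->i_sync_tid);
        int needs_barrier = 0;

        /* If the commit we are about to wait on will itself issue a
         * barrier, a second explicit cache flush would be redundant. */
        if (test_opt(inode->i_sb, BARRIER) &&
            !journal_trans_will_send_data_barrier(journal, tid))
                needs_barrier = 1;

        log_start_commit(journal, tid);
        ret = log_wait_commit(journal, tid);
        if (needs_barrier)
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
                                   BLKDEV_IFL_WAIT);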
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..4ab72db3559e 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -119,20 +119,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
119 ino = inode->i_ino; 119 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino); 120 ext3_debug ("freeing inode %lu\n", ino);
121 121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 dquot_initialize(inode);
127 ext3_xattr_delete_inode(handle, inode);
128 dquot_free_inode(inode);
129 dquot_drop(inode);
130
131 is_directory = S_ISDIR(inode->i_mode); 122 is_directory = S_ISDIR(inode->i_mode);
132 123
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT3_SB(sb)->s_es; 124 es = EXT3_SB(sb)->s_es;
137 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 125 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext3_error (sb, "ext3_free_inode", 126 ext3_error (sb, "ext3_free_inode",
@@ -538,16 +526,13 @@ got:
538 if (S_ISDIR(mode)) 526 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 527 percpu_counter_inc(&sbi->s_dirs_counter);
540 528
541 inode->i_uid = current_fsuid(); 529
542 if (test_opt (sb, GRPID)) 530 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 531 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 532 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 533 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 534 } else
549 inode->i_gid = current_fsgid(); 535 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 536
552 inode->i_ino = ino; 537 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 538 /* This is the optimal IO size (for stat), not the fs block size */
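The ialloc hunk above replaces the open-coded uid/gid/setgid logic with the VFS helper, keeping only the GRPID special case in the filesystem. For reference, inode_init_owner() as introduced around 2.6.35 looks roughly like this (a sketch from memory, not the authoritative source):

        void inode_init_owner(struct inode *inode, const struct inode *dir,
                              mode_t mode)
        {
                inode->i_uid = current_fsuid();
                if (dir && dir->i_mode & S_ISGID) {
                        inode->i_gid = dir->i_gid;
                        /* directories always inherit S_ISGID */
                        if (S_ISDIR(mode))
                                mode |= S_ISGID;
                } else
                        inode->i_gid = current_fsgid();
                inode->i_mode = mode;
        }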
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf0a300..5e0faf4cda79 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -190,18 +190,28 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
190} 190}
191 191
192/* 192/*
193 * Called at the last iput() if i_nlink is zero. 193 * Called at inode eviction from icache
194 */ 194 */
195void ext3_delete_inode (struct inode * inode) 195void ext3_evict_inode (struct inode *inode)
196{ 196{
197 struct ext3_block_alloc_info *rsv;
197 handle_t *handle; 198 handle_t *handle;
199 int want_delete = 0;
198 200
199 if (!is_bad_inode(inode)) 201 if (!inode->i_nlink && !is_bad_inode(inode)) {
200 dquot_initialize(inode); 202 dquot_initialize(inode);
203 want_delete = 1;
204 }
201 205
202 truncate_inode_pages(&inode->i_data, 0); 206 truncate_inode_pages(&inode->i_data, 0);
203 207
204 if (is_bad_inode(inode)) 208 ext3_discard_reservation(inode);
209 rsv = EXT3_I(inode)->i_block_alloc_info;
210 EXT3_I(inode)->i_block_alloc_info = NULL;
211 if (unlikely(rsv))
212 kfree(rsv);
213
214 if (!want_delete)
205 goto no_delete; 215 goto no_delete;
206 216
207 handle = start_transaction(inode); 217 handle = start_transaction(inode);
@@ -238,15 +248,22 @@ void ext3_delete_inode (struct inode * inode)
238 * having errors), but we can't free the inode if the mark_dirty 248 * having errors), but we can't free the inode if the mark_dirty
239 * fails. 249 * fails.
240 */ 250 */
241 if (ext3_mark_inode_dirty(handle, inode)) 251 if (ext3_mark_inode_dirty(handle, inode)) {
242 /* If that failed, just do the required in-core inode clear. */ 252 /* If that failed, just dquot_drop() and be done with that */
243 clear_inode(inode); 253 dquot_drop(inode);
244 else 254 end_writeback(inode);
255 } else {
256 ext3_xattr_delete_inode(handle, inode);
257 dquot_free_inode(inode);
258 dquot_drop(inode);
259 end_writeback(inode);
245 ext3_free_inode(handle, inode); 260 ext3_free_inode(handle, inode);
261 }
246 ext3_journal_stop(handle); 262 ext3_journal_stop(handle);
247 return; 263 return;
248no_delete: 264no_delete:
249 clear_inode(inode); /* We must guarantee clearing of inode... */ 265 end_writeback(inode);
266 dquot_drop(inode);
250} 267}
251 268
252typedef struct { 269typedef struct {
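The conversion above follows the 2.6.36 ->evict_inode() contract: decide up front whether the inode is actually being deleted, tear down per-inode state unconditionally (the old ->clear_inode() work), and call end_writeback() exactly once on every path. A hedged skeleton of that contract, with illustrative names rather than the full ext3 logic:

        static void example_evict_inode(struct inode *inode)
        {
                int want_delete = !inode->i_nlink && !is_bad_inode(inode);

                truncate_inode_pages(&inode->i_data, 0);
                /* release fs-private per-inode state here, delete or not */

                if (want_delete) {
                        /* free the on-disk inode under a transaction,
                         * calling end_writeback() before the final free */
                        end_writeback(inode);
                } else {
                        end_writeback(inode);   /* replaces clear_inode() */
                        dquot_drop(inode);
                }
        }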
@@ -1149,9 +1166,25 @@ static int walk_page_buffers( handle_t *handle,
1149static int do_journal_get_write_access(handle_t *handle, 1166static int do_journal_get_write_access(handle_t *handle,
1150 struct buffer_head *bh) 1167 struct buffer_head *bh)
1151{ 1168{
1169 int dirty = buffer_dirty(bh);
1170 int ret;
1171
1152 if (!buffer_mapped(bh) || buffer_freed(bh)) 1172 if (!buffer_mapped(bh) || buffer_freed(bh))
1153 return 0; 1173 return 0;
1154 return ext3_journal_get_write_access(handle, bh); 1174 /*
1175 * __block_prepare_write() could have dirtied some buffers. Clean
 1176 * the dirty bit as journal_get_write_access() could complain
1177 * otherwise about fs integrity issues. Setting of the dirty bit
1178 * by __block_prepare_write() isn't a real problem here as we clear
1179 * the bit before releasing a page lock and thus writeback cannot
1180 * ever write the buffer.
1181 */
1182 if (dirty)
1183 clear_buffer_dirty(bh);
1184 ret = ext3_journal_get_write_access(handle, bh);
1185 if (!ret && dirty)
1186 ret = ext3_journal_dirty_metadata(handle, bh);
1187 return ret;
1155} 1188}
1156 1189
1157/* 1190/*
@@ -1196,8 +1229,7 @@ retry:
1196 ret = PTR_ERR(handle); 1229 ret = PTR_ERR(handle);
1197 goto out; 1230 goto out;
1198 } 1231 }
1199 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1232 ret = __block_write_begin(page, pos, len, ext3_get_block);
1200 ext3_get_block);
1201 if (ret) 1233 if (ret)
1202 goto write_begin_failed; 1234 goto write_begin_failed;
1203 1235
@@ -1625,10 +1657,7 @@ static int ext3_writeback_writepage(struct page *page,
1625 goto out_fail; 1657 goto out_fail;
1626 } 1658 }
1627 1659
1628 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1660 ret = block_write_full_page(page, ext3_get_block, wbc);
1629 ret = nobh_writepage(page, ext3_get_block, wbc);
1630 else
1631 ret = block_write_full_page(page, ext3_get_block, wbc);
1632 1661
1633 err = ext3_journal_stop(handle); 1662 err = ext3_journal_stop(handle);
1634 if (!ret) 1663 if (!ret)
@@ -1785,6 +1814,17 @@ retry:
1785 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1814 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1786 offset, nr_segs, 1815 offset, nr_segs,
1787 ext3_get_block, NULL); 1816 ext3_get_block, NULL);
1817 /*
 1818 * In case of error, an extending write may have instantiated a few
1819 * blocks outside i_size. Trim these off again.
1820 */
1821 if (unlikely((rw & WRITE) && ret < 0)) {
1822 loff_t isize = i_size_read(inode);
1823 loff_t end = offset + iov_length(iov, nr_segs);
1824
1825 if (end > isize)
1826 vmtruncate(inode, isize);
1827 }
1788 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1828 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1789 goto retry; 1829 goto retry;
1790 1830
@@ -1922,17 +1962,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1922 length = blocksize - (offset & (blocksize - 1)); 1962 length = blocksize - (offset & (blocksize - 1));
1923 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1963 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1924 1964
1925 /*
1926 * For "nobh" option, we can only work if we don't need to
1927 * read-in the page - otherwise we create buffers to do the IO.
1928 */
1929 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1930 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1931 zero_user(page, offset, length);
1932 set_page_dirty(page);
1933 goto unlock;
1934 }
1935
1936 if (!page_has_buffers(page)) 1965 if (!page_has_buffers(page))
1937 create_empty_buffers(page, blocksize, 0); 1966 create_empty_buffers(page, blocksize, 0);
1938 1967
@@ -2284,27 +2313,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2284 depth); 2313 depth);
2285 2314
2286 /* 2315 /*
2287 * We've probably journalled the indirect block several
2288 * times during the truncate. But it's no longer
2289 * needed and we now drop it from the transaction via
2290 * journal_revoke().
2291 *
2292 * That's easy if it's exclusively part of this
2293 * transaction. But if it's part of the committing
2294 * transaction then journal_forget() will simply
2295 * brelse() it. That means that if the underlying
2296 * block is reallocated in ext3_get_block(),
2297 * unmap_underlying_metadata() will find this block
2298 * and will try to get rid of it. damn, damn.
2299 *
2300 * If this block has already been committed to the
2301 * journal, a revoke record will be written. And
2302 * revoke records must be emitted *before* clearing
2303 * this block's bit in the bitmaps.
2304 */
2305 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2306
2307 /*
 2308 * Everything below this pointer has been 2316 * Everything below this pointer has been
2309 * released. Now let this top-of-subtree go. 2317 * released. Now let this top-of-subtree go.
2310 * 2318 *
@@ -2327,6 +2335,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2327 truncate_restart_transaction(handle, inode); 2335 truncate_restart_transaction(handle, inode);
2328 } 2336 }
2329 2337
2338 /*
2339 * We've probably journalled the indirect block several
2340 * times during the truncate. But it's no longer
2341 * needed and we now drop it from the transaction via
2342 * journal_revoke().
2343 *
2344 * That's easy if it's exclusively part of this
2345 * transaction. But if it's part of the committing
2346 * transaction then journal_forget() will simply
2347 * brelse() it. That means that if the underlying
2348 * block is reallocated in ext3_get_block(),
2349 * unmap_underlying_metadata() will find this block
2350 * and will try to get rid of it. damn, damn. Thus
2351 * we don't allow a block to be reallocated until
2352 * a transaction freeing it has fully committed.
2353 *
2354 * We also have to make sure journal replay after a
2355 * crash does not overwrite non-journaled data blocks
2356 * with old metadata when the block got reallocated for
2357 * data. Thus we have to store a revoke record for a
2358 * block in the same transaction in which we free the
2359 * block.
2360 */
2361 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2362
2330 ext3_free_blocks(handle, inode, nr, 1); 2363 ext3_free_blocks(handle, inode, nr, 1);
2331 2364
2332 if (parent_bh) { 2365 if (parent_bh) {
@@ -2554,7 +2587,7 @@ out_stop:
2554 * If this was a simple ftruncate(), and the file will remain alive 2587 * If this was a simple ftruncate(), and the file will remain alive
2555 * then we need to clear up the orphan record which we created above. 2588 * then we need to clear up the orphan record which we created above.
2556 * However, if this was a real unlink then we were called by 2589 * However, if this was a real unlink then we were called by
2557 * ext3_delete_inode(), and we allow that function to clean up the 2590 * ext3_evict_inode(), and we allow that function to clean up the
2558 * orphan info for us. 2591 * orphan info for us.
2559 */ 2592 */
2560 if (inode->i_nlink) 2593 if (inode->i_nlink)
@@ -3151,7 +3184,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3151 if (error) 3184 if (error)
3152 return error; 3185 return error;
3153 3186
3154 if (ia_valid & ATTR_SIZE) 3187 if (is_quota_modification(inode, attr))
3155 dquot_initialize(inode); 3188 dquot_initialize(inode);
3156 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3189 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3157 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3190 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -3198,9 +3231,17 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3198 ext3_journal_stop(handle); 3231 ext3_journal_stop(handle);
3199 } 3232 }
3200 3233
3201 rc = inode_setattr(inode, attr); 3234 if ((attr->ia_valid & ATTR_SIZE) &&
3235 attr->ia_size != i_size_read(inode)) {
3236 rc = vmtruncate(inode, attr->ia_size);
3237 if (rc)
3238 goto err_out;
3239 }
3240
3241 setattr_copy(inode, attr);
3242 mark_inode_dirty(inode);
3202 3243
3203 if (!rc && (ia_valid & ATTR_MODE)) 3244 if (ia_valid & ATTR_MODE)
3204 rc = ext3_acl_chmod(inode); 3245 rc = ext3_acl_chmod(inode);
3205 3246
3206err_out: 3247err_out:
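The setattr hunk above open-codes what the removed inode_setattr() used to do: truncate first if the size changes, then copy the remaining attributes and mark the inode dirty. For reference, setattr_copy() circa 2.6.36 is roughly the following (abridged sketch, not the authoritative source):

        void setattr_copy(struct inode *inode, const struct iattr *attr)
        {
                unsigned int ia_valid = attr->ia_valid;

                if (ia_valid & ATTR_UID)
                        inode->i_uid = attr->ia_uid;
                if (ia_valid & ATTR_GID)
                        inode->i_gid = attr->ia_gid;
                if (ia_valid & ATTR_ATIME)
                        inode->i_atime = timespec_trunc(attr->ia_atime,
                                                inode->i_sb->s_time_gran);
                /* ATTR_MTIME and ATTR_CTIME are handled the same way */
                if (ia_valid & ATTR_MODE) {
                        umode_t mode = attr->ia_mode;

                        /* drop setgid unless the caller may keep it */
                        if (!in_group_p(inode->i_gid) &&
                            !capable(CAP_FSETID))
                                mode &= ~S_ISGID;
                        inode->i_mode = mode;
                }
        }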
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca42..2b35ddb70d65 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset;
1451 struct buffer_head * bh; 1450 struct buffer_head * bh;
1452 struct ext3_dir_entry_2 *de; 1451 struct ext3_dir_entry_2 *de;
1453 struct super_block * sb; 1452 struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1469 ext3_mark_inode_dirty(handle, dir); 1468 ext3_mark_inode_dirty(handle, dir);
1470 } 1469 }
1471 blocks = dir->i_size >> sb->s_blocksize_bits; 1470 blocks = dir->i_size >> sb->s_blocksize_bits;
1472 for (block = 0, offset = 0; block < blocks; block++) { 1471 for (block = 0; block < blocks; block++) {
1473 bh = ext3_bread(handle, dir, block, 0, &retval); 1472 bh = ext3_bread(handle, dir, block, 0, &retval);
1474 if(!bh) 1473 if(!bh)
1475 return retval; 1474 return retval;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef9..0ccd7b12b73c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
964 ext3_fsblk_t n_blocks_count) 964 ext3_fsblk_t n_blocks_count)
965{ 965{
966 ext3_fsblk_t o_blocks_count; 966 ext3_fsblk_t o_blocks_count;
967 unsigned long o_groups_count;
968 ext3_grpblk_t last; 967 ext3_grpblk_t last;
969 ext3_grpblk_t add; 968 ext3_grpblk_t add;
970 struct buffer_head * bh; 969 struct buffer_head * bh;
@@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
976 * yet: we're going to revalidate es->s_blocks_count after 975 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */ 976 * taking the s_resize_lock below. */
978 o_blocks_count = le32_to_cpu(es->s_blocks_count); 977 o_blocks_count = le32_to_cpu(es->s_blocks_count);
979 o_groups_count = EXT3_SB(sb)->s_groups_count;
980 978
981 if (test_opt(sb, DEBUG)) 979 if (test_opt(sb, DEBUG))
 982 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" up to "E3FSBLK" blocks\n", 980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" up to "E3FSBLK" blocks\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..5dbf4dba03c4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -525,17 +527,6 @@ static void destroy_inodecache(void)
525 kmem_cache_destroy(ext3_inode_cachep); 527 kmem_cache_destroy(ext3_inode_cachep);
526} 528}
527 529
528static void ext3_clear_inode(struct inode *inode)
529{
530 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
531
532 dquot_drop(inode);
533 ext3_discard_reservation(inode);
534 EXT3_I(inode)->i_block_alloc_info = NULL;
535 if (unlikely(rsv))
536 kfree(rsv);
537}
538
539static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb) 530static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
540{ 531{
541#if defined(CONFIG_QUOTA) 532#if defined(CONFIG_QUOTA)
@@ -653,11 +644,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
653 seq_printf(seq, ",commit=%u", 644 seq_printf(seq, ",commit=%u",
654 (unsigned) (sbi->s_commit_interval / HZ)); 645 (unsigned) (sbi->s_commit_interval / HZ));
655 } 646 }
656 if (test_opt(sb, BARRIER))
657 seq_puts(seq, ",barrier=1");
658 if (test_opt(sb, NOBH))
659 seq_puts(seq, ",nobh");
660 647
648 /*
649 * Always display barrier state so it's clear what the status is.
650 */
651 seq_puts(seq, ",barrier=");
652 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
661 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); 653 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
662 if (test_opt(sb, DATA_ERR_ABORT)) 654 if (test_opt(sb, DATA_ERR_ABORT))
663 seq_puts(seq, ",data_err=abort"); 655 seq_puts(seq, ",data_err=abort");
@@ -744,7 +736,7 @@ static int ext3_release_dquot(struct dquot *dquot);
744static int ext3_mark_dquot_dirty(struct dquot *dquot); 736static int ext3_mark_dquot_dirty(struct dquot *dquot);
745static int ext3_write_info(struct super_block *sb, int type); 737static int ext3_write_info(struct super_block *sb, int type);
746static int ext3_quota_on(struct super_block *sb, int type, int format_id, 738static int ext3_quota_on(struct super_block *sb, int type, int format_id,
747 char *path, int remount); 739 char *path);
748static int ext3_quota_on_mount(struct super_block *sb, int type); 740static int ext3_quota_on_mount(struct super_block *sb, int type);
749static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 741static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
750 size_t len, loff_t off); 742 size_t len, loff_t off);
@@ -763,12 +755,12 @@ static const struct dquot_operations ext3_quota_operations = {
763 755
764static const struct quotactl_ops ext3_qctl_operations = { 756static const struct quotactl_ops ext3_qctl_operations = {
765 .quota_on = ext3_quota_on, 757 .quota_on = ext3_quota_on,
766 .quota_off = vfs_quota_off, 758 .quota_off = dquot_quota_off,
767 .quota_sync = vfs_quota_sync, 759 .quota_sync = dquot_quota_sync,
768 .get_info = vfs_get_dqinfo, 760 .get_info = dquot_get_dqinfo,
769 .set_info = vfs_set_dqinfo, 761 .set_info = dquot_set_dqinfo,
770 .get_dqblk = vfs_get_dqblk, 762 .get_dqblk = dquot_get_dqblk,
771 .set_dqblk = vfs_set_dqblk 763 .set_dqblk = dquot_set_dqblk
772}; 764};
773#endif 765#endif
774 766
@@ -777,14 +769,13 @@ static const struct super_operations ext3_sops = {
777 .destroy_inode = ext3_destroy_inode, 769 .destroy_inode = ext3_destroy_inode,
778 .write_inode = ext3_write_inode, 770 .write_inode = ext3_write_inode,
779 .dirty_inode = ext3_dirty_inode, 771 .dirty_inode = ext3_dirty_inode,
780 .delete_inode = ext3_delete_inode, 772 .evict_inode = ext3_evict_inode,
781 .put_super = ext3_put_super, 773 .put_super = ext3_put_super,
782 .sync_fs = ext3_sync_fs, 774 .sync_fs = ext3_sync_fs,
783 .freeze_fs = ext3_freeze, 775 .freeze_fs = ext3_freeze,
784 .unfreeze_fs = ext3_unfreeze, 776 .unfreeze_fs = ext3_unfreeze,
785 .statfs = ext3_statfs, 777 .statfs = ext3_statfs,
786 .remount_fs = ext3_remount, 778 .remount_fs = ext3_remount,
787 .clear_inode = ext3_clear_inode,
788 .show_options = ext3_show_options, 779 .show_options = ext3_show_options,
789#ifdef CONFIG_QUOTA 780#ifdef CONFIG_QUOTA
790 .quota_read = ext3_quota_read, 781 .quota_read = ext3_quota_read,
@@ -810,8 +801,8 @@ enum {
810 Opt_data_err_abort, Opt_data_err_ignore, 801 Opt_data_err_abort, Opt_data_err_ignore,
811 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 802 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
812 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 803 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
813 Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, 804 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
814 Opt_usrquota, Opt_grpquota 805 Opt_resize, Opt_usrquota, Opt_grpquota
815}; 806};
816 807
817static const match_table_t tokens = { 808static const match_table_t tokens = {
@@ -865,6 +856,8 @@ static const match_table_t tokens = {
865 {Opt_quota, "quota"}, 856 {Opt_quota, "quota"},
866 {Opt_usrquota, "usrquota"}, 857 {Opt_usrquota, "usrquota"},
867 {Opt_barrier, "barrier=%u"}, 858 {Opt_barrier, "barrier=%u"},
859 {Opt_barrier, "barrier"},
860 {Opt_nobarrier, "nobarrier"},
868 {Opt_resize, "resize"}, 861 {Opt_resize, "resize"},
869 {Opt_err, NULL}, 862 {Opt_err, NULL},
870}; 863};
@@ -967,7 +960,11 @@ static int parse_options (char *options, struct super_block *sb,
967 int token; 960 int token;
968 if (!*p) 961 if (!*p)
969 continue; 962 continue;
970 963 /*
964 * Initialize args struct so we know whether arg was
965 * found; some options take optional arguments.
966 */
967 args[0].to = args[0].from = 0;
971 token = match_token(p, tokens, args); 968 token = match_token(p, tokens, args);
972 switch (token) { 969 switch (token) {
973 case Opt_bsd_df: 970 case Opt_bsd_df:
@@ -1215,9 +1212,15 @@ set_qf_format:
1215 case Opt_abort: 1212 case Opt_abort:
1216 set_opt(sbi->s_mount_opt, ABORT); 1213 set_opt(sbi->s_mount_opt, ABORT);
1217 break; 1214 break;
1215 case Opt_nobarrier:
1216 clear_opt(sbi->s_mount_opt, BARRIER);
1217 break;
1218 case Opt_barrier: 1218 case Opt_barrier:
1219 if (match_int(&args[0], &option)) 1219 if (args[0].from) {
1220 return 0; 1220 if (match_int(&args[0], &option))
1221 return 0;
1222 } else
1223 option = 1; /* No argument, default to 1 */
1221 if (option) 1224 if (option)
1222 set_opt(sbi->s_mount_opt, BARRIER); 1225 set_opt(sbi->s_mount_opt, BARRIER);
1223 else 1226 else
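Two details make the optional barrier argument in the parse_options() hunks above work: args[0] is reset before match_token() so a bare "barrier" can be told apart from "barrier=%u", and the bare form defaults to on. The pattern in isolation (a sketch using the <linux/parser.h> types; p and sbi are assumed from the surrounding parser loop):

        substring_t args[MAX_OPT_ARGS];
        int token, option;

        args[0].to = args[0].from = NULL;       /* mark "no argument seen" */
        token = match_token(p, tokens, args);
        if (token == Opt_barrier) {
                if (args[0].from) {             /* matched "barrier=%u" */
                        if (match_int(&args[0], &option))
                                return 0;       /* malformed number */
                } else {                        /* matched bare "barrier" */
                        option = 1;
                }
                if (option)
                        set_opt(sbi->s_mount_opt, BARRIER);
                else
                        clear_opt(sbi->s_mount_opt, BARRIER);
        }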
@@ -1237,10 +1240,12 @@ set_qf_format:
1237 *n_blocks_count = option; 1240 *n_blocks_count = option;
1238 break; 1241 break;
1239 case Opt_nobh: 1242 case Opt_nobh:
1240 set_opt(sbi->s_mount_opt, NOBH); 1243 ext3_msg(sb, KERN_WARNING,
1244 "warning: ignoring deprecated nobh option");
1241 break; 1245 break;
1242 case Opt_bh: 1246 case Opt_bh:
1243 clear_opt(sbi->s_mount_opt, NOBH); 1247 ext3_msg(sb, KERN_WARNING,
1248 "warning: ignoring deprecated bh option");
1244 break; 1249 break;
1245 default: 1250 default:
1246 ext3_msg(sb, KERN_ERR, 1251 ext3_msg(sb, KERN_ERR,
@@ -1511,7 +1516,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1511 /* Turn quotas off */ 1516 /* Turn quotas off */
1512 for (i = 0; i < MAXQUOTAS; i++) { 1517 for (i = 0; i < MAXQUOTAS; i++) {
1513 if (sb_dqopt(sb)->files[i]) 1518 if (sb_dqopt(sb)->files[i])
1514 vfs_quota_off(sb, i, 0); 1519 dquot_quota_off(sb, i);
1515 } 1520 }
1516#endif 1521#endif
1517 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1522 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -1890,21 +1895,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1890 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 1895 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1891 spin_lock_init(&sbi->s_next_gen_lock); 1896 spin_lock_init(&sbi->s_next_gen_lock);
1892 1897
1893 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1894 ext3_count_free_blocks(sb));
1895 if (!err) {
1896 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1897 ext3_count_free_inodes(sb));
1898 }
1899 if (!err) {
1900 err = percpu_counter_init(&sbi->s_dirs_counter,
1901 ext3_count_dirs(sb));
1902 }
1903 if (err) {
1904 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1905 goto failed_mount3;
1906 }
1907
 1908 /* per filesystem reservation list head & lock */ 1898 /* per filesystem reservation list head & lock */
1909 spin_lock_init(&sbi->s_rsv_window_lock); 1899 spin_lock_init(&sbi->s_rsv_window_lock);
1910 sbi->s_rsv_window_root = RB_ROOT; 1900 sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1935,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1945 if (!test_opt(sb, NOLOAD) && 1935 if (!test_opt(sb, NOLOAD) &&
1946 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { 1936 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1947 if (ext3_load_journal(sb, es, journal_devnum)) 1937 if (ext3_load_journal(sb, es, journal_devnum))
1948 goto failed_mount3; 1938 goto failed_mount2;
1949 } else if (journal_inum) { 1939 } else if (journal_inum) {
1950 if (ext3_create_journal(sb, es, journal_inum)) 1940 if (ext3_create_journal(sb, es, journal_inum))
1951 goto failed_mount3; 1941 goto failed_mount2;
1952 } else { 1942 } else {
1953 if (!silent) 1943 if (!silent)
1954 ext3_msg(sb, KERN_ERR, 1944 ext3_msg(sb, KERN_ERR,
1955 "error: no journal found. " 1945 "error: no journal found. "
1956 "mounting ext3 over ext2?"); 1946 "mounting ext3 over ext2?");
1947 goto failed_mount2;
1948 }
1949 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1950 ext3_count_free_blocks(sb));
1951 if (!err) {
1952 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1953 ext3_count_free_inodes(sb));
1954 }
1955 if (!err) {
1956 err = percpu_counter_init(&sbi->s_dirs_counter,
1957 ext3_count_dirs(sb));
1958 }
1959 if (err) {
1960 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
1957 goto failed_mount3; 1961 goto failed_mount3;
1958 } 1962 }
1959 1963
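The reordering above exists so that each failed_mount label undoes exactly what has been set up: the journal is now loaded before the percpu counters are initialized, so a journal failure jumps to failed_mount2 with the counters untouched, while a counter failure jumps to failed_mount3, which also destroys the live journal. The unwind shape, as a schematic fragment (condensed from the hunks; error paths abbreviated):

        if (ext3_load_journal(sb, es, journal_devnum))
                goto failed_mount2;     /* no counters to tear down yet */

        err = percpu_counter_init(&sbi->s_freeblocks_counter,
                                  ext3_count_free_blocks(sb));
        if (err)
                goto failed_mount3;     /* journal is live: destroy it too */

failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        journal_destroy(sbi->s_journal);
failed_mount2:
        /* release group descriptors, brelse() the superblock, ... */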
@@ -1978,20 +1982,12 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1978 ext3_msg(sb, KERN_ERR, 1982 ext3_msg(sb, KERN_ERR,
1979 "error: journal does not support " 1983 "error: journal does not support "
1980 "requested data journaling mode"); 1984 "requested data journaling mode");
1981 goto failed_mount4; 1985 goto failed_mount3;
1982 } 1986 }
1983 default: 1987 default:
1984 break; 1988 break;
1985 } 1989 }
1986 1990
1987 if (test_opt(sb, NOBH)) {
1988 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1989 ext3_msg(sb, KERN_WARNING,
1990 "warning: ignoring nobh option - "
1991 "it is supported only with writeback mode");
1992 clear_opt(sbi->s_mount_opt, NOBH);
1993 }
1994 }
1995 /* 1991 /*
1996 * The journal_load will have done any necessary log recovery, 1992 * The journal_load will have done any necessary log recovery,
1997 * so we can safely mount the rest of the filesystem now. 1993 * so we can safely mount the rest of the filesystem now.
@@ -2001,19 +1997,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 if (IS_ERR(root)) { 1997 if (IS_ERR(root)) {
2002 ext3_msg(sb, KERN_ERR, "error: get root inode failed"); 1998 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2003 ret = PTR_ERR(root); 1999 ret = PTR_ERR(root);
2004 goto failed_mount4; 2000 goto failed_mount3;
2005 } 2001 }
2006 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 2002 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2007 iput(root); 2003 iput(root);
2008 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); 2004 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2009 goto failed_mount4; 2005 goto failed_mount3;
2010 } 2006 }
2011 sb->s_root = d_alloc_root(root); 2007 sb->s_root = d_alloc_root(root);
2012 if (!sb->s_root) { 2008 if (!sb->s_root) {
2013 ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); 2009 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2014 iput(root); 2010 iput(root);
2015 ret = -ENOMEM; 2011 ret = -ENOMEM;
2016 goto failed_mount4; 2012 goto failed_mount3;
2017 } 2013 }
2018 2014
2019 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); 2015 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2035,11 @@ cantfind_ext3:
2039 sb->s_id); 2035 sb->s_id);
2040 goto failed_mount; 2036 goto failed_mount;
2041 2037
2042failed_mount4:
2043 journal_destroy(sbi->s_journal);
2044failed_mount3: 2038failed_mount3:
2045 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2039 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2046 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2040 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2047 percpu_counter_destroy(&sbi->s_dirs_counter); 2041 percpu_counter_destroy(&sbi->s_dirs_counter);
2042 journal_destroy(sbi->s_journal);
2048failed_mount2: 2043failed_mount2:
2049 for (i = 0; i < db_count; i++) 2044 for (i = 0; i < db_count; i++)
2050 brelse(sbi->s_group_desc[i]); 2045 brelse(sbi->s_group_desc[i]);
@@ -2278,6 +2273,9 @@ static int ext3_load_journal(struct super_block *sb,
2278 return -EINVAL; 2273 return -EINVAL;
2279 } 2274 }
2280 2275
2276 if (!(journal->j_flags & JFS_BARRIER))
2277 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2278
2281 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2279 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2282 err = journal_update_format(journal); 2280 err = journal_update_format(journal);
2283 if (err) { 2281 if (err) {
@@ -2534,6 +2532,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2534 ext3_fsblk_t n_blocks_count = 0; 2532 ext3_fsblk_t n_blocks_count = 0;
2535 unsigned long old_sb_flags; 2533 unsigned long old_sb_flags;
2536 struct ext3_mount_options old_opts; 2534 struct ext3_mount_options old_opts;
2535 int enable_quota = 0;
2537 int err; 2536 int err;
2538#ifdef CONFIG_QUOTA 2537#ifdef CONFIG_QUOTA
2539 int i; 2538 int i;
@@ -2580,6 +2579,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2580 } 2579 }
2581 2580
2582 if (*flags & MS_RDONLY) { 2581 if (*flags & MS_RDONLY) {
2582 err = dquot_suspend(sb, -1);
2583 if (err < 0)
2584 goto restore_opts;
2585
2583 /* 2586 /*
2584 * First of all, the unconditional stuff we have to do 2587 * First of all, the unconditional stuff we have to do
2585 * to disable replay of the journal when we next remount 2588 * to disable replay of the journal when we next remount
@@ -2634,6 +2637,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2634 goto restore_opts; 2637 goto restore_opts;
2635 if (!ext3_setup_super (sb, es, 0)) 2638 if (!ext3_setup_super (sb, es, 0))
2636 sb->s_flags &= ~MS_RDONLY; 2639 sb->s_flags &= ~MS_RDONLY;
2640 enable_quota = 1;
2637 } 2641 }
2638 } 2642 }
2639#ifdef CONFIG_QUOTA 2643#ifdef CONFIG_QUOTA
@@ -2645,6 +2649,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645#endif 2649#endif
2646 unlock_super(sb); 2650 unlock_super(sb);
2647 unlock_kernel(); 2651 unlock_kernel();
2652
2653 if (enable_quota)
2654 dquot_resume(sb, -1);
2648 return 0; 2655 return 0;
2649restore_opts: 2656restore_opts:
2650 sb->s_flags = old_sb_flags; 2657 sb->s_flags = old_sb_flags;
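The remount path above gains quota suspend/resume: going read-only suspends every quota type (the -1 argument selects all types) before the journal is quiesced, and going read-write resumes them, but only after unlock_super()/unlock_kernel() are dropped. The shape of the pattern (schematic; was_readonly stands in for the full remount condition):

        if (*flags & MS_RDONLY) {
                err = dquot_suspend(sb, -1);    /* -1: all quota types */
                if (err < 0)
                        goto restore_opts;
        } else if (was_readonly) {
                enable_quota = 1;               /* resume after unlock */
        }
        /* ... perform the remount under lock_super()/lock_kernel() ... */
        if (enable_quota)
                dquot_resume(sb, -1);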
@@ -2834,24 +2841,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2834 */ 2841 */
2835static int ext3_quota_on_mount(struct super_block *sb, int type) 2842static int ext3_quota_on_mount(struct super_block *sb, int type)
2836{ 2843{
2837 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2844 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2838 EXT3_SB(sb)->s_jquota_fmt, type); 2845 EXT3_SB(sb)->s_jquota_fmt, type);
2839} 2846}
2840 2847
2841/* 2848/*
2842 * Standard function to be called on quota_on 2849 * Standard function to be called on quota_on
2843 */ 2850 */
2844static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2851static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2845 char *name, int remount) 2852 char *name)
2846{ 2853{
2847 int err; 2854 int err;
2848 struct path path; 2855 struct path path;
2849 2856
2850 if (!test_opt(sb, QUOTA)) 2857 if (!test_opt(sb, QUOTA))
2851 return -EINVAL; 2858 return -EINVAL;
2852 /* When remounting, no checks are needed and in fact, name is NULL */
2853 if (remount)
2854 return vfs_quota_on(sb, type, format_id, name, remount);
2855 2859
2856 err = kern_path(name, LOOKUP_FOLLOW, &path); 2860 err = kern_path(name, LOOKUP_FOLLOW, &path);
2857 if (err) 2861 if (err)
@@ -2889,7 +2893,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2889 } 2893 }
2890 } 2894 }
2891 2895
2892 err = vfs_quota_on_path(sb, type, format_id, &path); 2896 err = dquot_quota_on_path(sb, type, format_id, &path);
2893 path_put(&path); 2897 path_put(&path);
2894 return err; 2898 return err;
2895} 2899}
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..e69dc6dfaa89 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
@@ -1139,7 +1139,7 @@ ext3_xattr_cache_insert(struct buffer_head *bh)
1139 ea_bdebug(bh, "out of memory"); 1139 ea_bdebug(bh, "out of memory");
1140 return; 1140 return;
1141 } 1141 }
1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1142 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1143 if (error) { 1143 if (error) {
1144 mb_cache_entry_free(ce); 1144 mb_cache_entry_free(ce);
1145 if (error == -EBUSY) { 1145 if (error == -EBUSY) {
@@ -1211,8 +1211,8 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1211 return NULL; /* never share */ 1211 return NULL; /* never share */
1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1212 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1213again: 1213again:
1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, 1214 ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
1215 inode->i_sb->s_bdev, hash); 1215 hash);
1216 while (ce) { 1216 while (ce) {
1217 struct buffer_head *bh; 1217 struct buffer_head *bh;
1218 1218
@@ -1237,7 +1237,7 @@ again:
1237 return bh; 1237 return bh;
1238 } 1238 }
1239 brelse(bh); 1239 brelse(bh);
1240 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1240 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1241 } 1241 }
1242 return NULL; 1242 return NULL;
1243} 1243}
@@ -1313,9 +1313,7 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1313int __init 1313int __init
1314init_ext3_xattr(void) 1314init_ext3_xattr(void)
1315{ 1315{
1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, 1316 ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
1317 sizeof(struct mb_cache_entry) +
1318 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1319 if (!ext3_xattr_cache) 1317 if (!ext3_xattr_cache)
1320 return -ENOMEM; 1318 return -ENOMEM;
1321 return 0; 1319 return 0;
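The xattr.c hunks above track the 2.6.36 mb_cache simplification: a cache now has exactly one index, keyed by (block device, hash), so the index-number and entry-size arguments are gone and mb_cache_create() takes just a name and a hash-bucket order. Usage matching the calls in this diff (bdev and hash assumed from the caller):

        struct mb_cache *cache;
        struct mb_cache_entry *ce;

        cache = mb_cache_create("ext3_xattr", 6);       /* 2^6 hash buckets */
        if (!cache)
                return -ENOMEM;

        /* lookups now pass the bdev and hash directly, no index number */
        ce = mb_cache_entry_find_first(cache, bdev, hash);
        while (ce) {
                /* ... inspect the candidate block, release or reuse ... */
                ce = mb_cache_entry_find_next(ce, bdev, hash);
        }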
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
75 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..5e2ed4504ead 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
204 return error; 204 return error;
205 else { 205 else {
206 inode->i_mode = mode; 206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode);
207 ext4_mark_inode_dirty(handle, inode); 208 ext4_mark_inode_dirty(handle, inode);
208 if (error == 0) 209 if (error == 0)
209 acl = NULL; 210 acl = NULL;
@@ -454,7 +455,7 @@ release_and_out:
454 return error; 455 return error;
455} 456}
456 457
457struct xattr_handler ext4_xattr_acl_access_handler = { 458const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 459 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 460 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 461 .list = ext4_xattr_list_acl_access,
@@ -462,7 +463,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 463 .set = ext4_xattr_set_acl,
463}; 464};
464 465
465struct xattr_handler ext4_xattr_acl_default_handler = { 466const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 467 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 468 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 469 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..bd30799a43ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
377 ext4_grpblk_t bit; 377 ext4_grpblk_t bit;
378 unsigned int i; 378 unsigned int i;
379 struct ext4_group_desc *desc; 379 struct ext4_group_desc *desc;
380 struct ext4_super_block *es; 380 struct ext4_sb_info *sbi = EXT4_SB(sb);
381 struct ext4_sb_info *sbi;
382 int err = 0, ret, blk_free_count; 381 int err = 0, ret, blk_free_count;
383 ext4_grpblk_t blocks_freed; 382 ext4_grpblk_t blocks_freed;
384 struct ext4_group_info *grp; 383 struct ext4_group_info *grp;
385 384
386 sbi = EXT4_SB(sb);
387 es = sbi->s_es;
388 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 385 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
389 386
390 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 387 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
477 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 474 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
478 if (!err) 475 if (!err)
479 err = ret; 476 err = ret;
480 sb->s_dirt = 1;
481 477
482error_return: 478error_return:
483 brelse(bitmap_bh); 479 brelse(bitmap_bh);
@@ -591,14 +587,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 587 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 588 if (count)
593 *count = ar.len; 589 *count = ar.len;
594
595 /* 590 /*
596 * Account for the allocated meta blocks 591 * Account for the allocated meta blocks. We will never
 592 * fail EDQUOT for metadata, but we do account for it.
597 */ 593 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 594 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 595 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 596 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 597 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
598 dquot_alloc_block_nofail(inode, ar.len);
602 } 599 }
603 return ret; 600 return ret;
604} 601}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..3db5084db9bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
@@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
229 229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count < start_blk) || 231 (start_blk + count < start_blk) ||
232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 (start_blk + count > ext4_blocks_count(sbi->s_es))) {
233 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
233 return 0; 234 return 0;
235 }
234 while (n) { 236 while (n) {
235 entry = rb_entry(n, struct ext4_system_zone, node); 237 entry = rb_entry(n, struct ext4_system_zone, node);
236 if (start_blk + count - 1 < entry->start_blk) 238 if (start_blk + count - 1 < entry->start_blk)
237 n = n->rb_left; 239 n = n->rb_left;
238 else if (start_blk >= (entry->start_blk + entry->count)) 240 else if (start_blk >= (entry->start_blk + entry->count))
239 n = n->rb_right; 241 n = n->rb_right;
240 else 242 else {
243 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
241 return 0; 244 return 0;
245 }
242 } 246 }
243 return 1; 247 return 1;
244} 248}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..374510f72baa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct ext4_dir_entry_2 *de, 65 struct inode *dir,
66 struct buffer_head *bh, 66 struct ext4_dir_entry_2 *de,
67 unsigned int offset) 67 struct buffer_head *bh,
68 unsigned int offset)
68{ 69{
69 const char *error_msg = NULL; 70 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 71 const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,12 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 84 error_msg = "inode out of bounds";
84 85
85 if (error_msg != NULL) 86 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 87 ext4_error_inode(dir, function, line, bh->b_blocknr,
87 "bad entry in directory #%lu: %s - block=%llu" 88 "bad entry in directory: %s - "
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 90 error_msg, (unsigned) (offset%bh->b_size), offset,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0; 93 return error_msg == NULL ? 1 : 0;
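The rename to __ext4_check_dir_entry(), with the function name and line number up front, pairs with a wrapper macro so callers keep a four-argument form (as the ext4_readdir() hunk below shows) while the error report pinpoints the call site. The wrapper in ext4.h presumably looks like this (a sketch; the macro itself is outside this diff):

        #define ext4_check_dir_entry(dir, de, bh, offset) \
                __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), \
                                       (bh), (offset))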
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,21 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
125 EXT4_INODE_INDEX);
126 } 126 }
127 stored = 0; 127 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 128 offset = filp->f_pos & (sb->s_blocksize - 1);
129 129
130 while (!error && !stored && filp->f_pos < inode->i_size) { 130 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 131 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 132 struct buffer_head *bh = NULL;
134 133
135 map_bh.b_state = 0; 134 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 135 map.m_len = 1;
136 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 137 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 138 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 139 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 140 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 141 page_cache_sync_readahead(
@@ -143,7 +143,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 143 &filp->f_ra, filp,
144 index, 1); 144 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 146 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 147 }
148 148
149 /* 149 /*
@@ -152,9 +152,8 @@ static int ext4_readdir(struct file *filp,
152 */ 152 */
153 if (!bh) { 153 if (!bh) {
154 if (!dir_has_error) { 154 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 155 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 156 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 157 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 158 dir_has_error = 1;
160 } 159 }
@@ -195,7 +194,7 @@ revalidate:
195 while (!error && filp->f_pos < inode->i_size 194 while (!error && filp->f_pos < inode->i_size
196 && offset < sb->s_blocksize) { 195 && offset < sb->s_blocksize) {
197 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
198 if (!ext4_check_dir_entry("ext4_readdir", inode, de, 197 if (!ext4_check_dir_entry(inode, de,
199 bh, offset)) { 198 bh, offset)) {
200 /* 199 /*
201 * On error, skip the f_pos to the next block 200 * On error, skip the f_pos to the next block
@@ -345,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
345 struct dir_private_info *info; 344 struct dir_private_info *info;
346 int len; 345 int len;
347 346
348 info = (struct dir_private_info *) dir_file->private_data; 347 info = dir_file->private_data;
349 p = &info->root.rb_node; 348 p = &info->root.rb_node;
350 349
351 /* Create and allocate the fname structure */ 350 /* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..889ec9d5e6ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,13 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
61
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
58 64
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
61 67
62/* data type for block offset of block group */ 68/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -72,7 +78,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 78typedef unsigned int ext4_group_t;
73 79
74/* 80/*
75 * Flags used in mballoc's allocation_context flags field. 81 * Flags used in mballoc's allocation_context flags field.
76 * 82 *
77 * Also used to show what's going on for debugging purposes when the 83 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 84 * flag field is exported via the traceport interface
@@ -126,6 +132,29 @@ struct ext4_allocation_request {
126}; 132};
127 133
128/* 134/*
135 * Logical to physical block mapping, used by ext4_map_blocks()
136 *
137 * This structure is used to pass requests into ext4_map_blocks() as
138 * well as to store the information returned by ext4_map_blocks(). It
139 * takes less room on the stack than a struct buffer_head.
140 */
141#define EXT4_MAP_NEW (1 << BH_New)
142#define EXT4_MAP_MAPPED (1 << BH_Mapped)
143#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
144#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
145#define EXT4_MAP_UNINIT (1 << BH_Uninit)
146#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
147 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
148 EXT4_MAP_UNINIT)
149
150struct ext4_map_blocks {
151 ext4_fsblk_t m_pblk;
152 ext4_lblk_t m_lblk;
153 unsigned int m_len;
154 unsigned int m_flags;
155};
156
157/*
129 * For delayed allocation tracking 158 * For delayed allocation tracking
130 */ 159 */
131struct mpage_da_data { 160struct mpage_da_data {
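struct ext4_map_blocks, introduced above, replaces the dummy buffer_head that callers used to pass to ext4_get_blocks(); the ext4_readdir() hunk earlier in this diff shows the new convention. In isolation, a read-only lookup with no transaction handle looks like this (lblk assumed from the caller):

        struct ext4_map_blocks map;
        int err;

        map.m_lblk = lblk;      /* logical block to resolve */
        map.m_len = 1;
        err = ext4_map_blocks(NULL, inode, &map, 0);
        if (err > 0) {
                /* err is the number of blocks mapped; map.m_pblk holds
                 * the physical block, map.m_flags the EXT4_MAP_* bits */
        }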
@@ -141,13 +170,15 @@ struct mpage_da_data {
141}; 170};
142#define EXT4_IO_UNWRITTEN 0x1 171#define EXT4_IO_UNWRITTEN 0x1
143typedef struct ext4_io_end { 172typedef struct ext4_io_end {
144 struct list_head list; /* per-file finished AIO list */ 173 struct list_head list; /* per-file finished IO list */
145 struct inode *inode; /* file being written to */ 174 struct inode *inode; /* file being written to */
146 unsigned int flag; /* unwritten or not */ 175 unsigned int flag; /* unwritten or not */
147 struct page *page; /* page struct for buffer write */ 176 struct page *page; /* page struct for buffer write */
148 loff_t offset; /* offset in the file */ 177 loff_t offset; /* offset in the file */
149 ssize_t size; /* size of the extent */ 178 ssize_t size; /* size of the extent */
150 struct work_struct work; /* data work queue */ 179 struct work_struct work; /* data work queue */
180 struct kiocb *iocb; /* iocb struct for AIO */
181 int result; /* error value for AIO */
151} ext4_io_end_t; 182} ext4_io_end_t;
152 183
153/* 184/*
@@ -321,6 +352,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 352 return flags & EXT4_OTHER_FLMASK;
322} 353}
323 354
355/*
356 * Inode flags used for atomic set/get
357 */
358enum {
359 EXT4_INODE_SECRM = 0, /* Secure deletion */
360 EXT4_INODE_UNRM = 1, /* Undelete */
361 EXT4_INODE_COMPR = 2, /* Compress file */
362 EXT4_INODE_SYNC = 3, /* Synchronous updates */
363 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
364 EXT4_INODE_APPEND = 5, /* writes to file may only append */
365 EXT4_INODE_NODUMP = 6, /* do not dump file */
366 EXT4_INODE_NOATIME = 7, /* do not update atime */
367/* Reserved for compression usage... */
368 EXT4_INODE_DIRTY = 8,
369 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
370 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
371 EXT4_INODE_ECOMPR = 11, /* Compression error */
372/* End compression flags --- maybe not all used */
373 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
374 EXT4_INODE_IMAGIC = 13, /* AFS directory */
375 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
376 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
377 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
378 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies */
379 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
380 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
381 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
382 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
383 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
384};
385
386#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
387#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
388 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
389 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
390
391/*
392 * Since it's pretty easy to mix up bit numbers and hex values, and we
393 * can't do a compile-time test for ENUM values, we use a run-time
394 * test to make sure that EXT4_XXX_FL is consistent with respect to
395 * EXT4_INODE_XXX. If all is well, the printk and BUG_ON will both drop
396 * out so it won't cost any extra space in the compiled kernel image.
397 * But it's important that these values are the same, since we are
398 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XXX_FL
399 * must be consistent with the values of FS_XXX_FL defined in
400 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
401 * ext4 filesystems, and of course the values defined in e2fsprogs.
402 *
403 * It's not paranoia if Murphy's Law really *is* out to get you. :-)
404 */
405static inline void ext4_check_flag_values(void)
406{
407 CHECK_FLAG_VALUE(SECRM);
408 CHECK_FLAG_VALUE(UNRM);
409 CHECK_FLAG_VALUE(COMPR);
410 CHECK_FLAG_VALUE(SYNC);
411 CHECK_FLAG_VALUE(IMMUTABLE);
412 CHECK_FLAG_VALUE(APPEND);
413 CHECK_FLAG_VALUE(NODUMP);
414 CHECK_FLAG_VALUE(NOATIME);
415 CHECK_FLAG_VALUE(DIRTY);
416 CHECK_FLAG_VALUE(COMPRBLK);
417 CHECK_FLAG_VALUE(NOCOMPR);
418 CHECK_FLAG_VALUE(ECOMPR);
419 CHECK_FLAG_VALUE(INDEX);
420 CHECK_FLAG_VALUE(IMAGIC);
421 CHECK_FLAG_VALUE(JOURNAL_DATA);
422 CHECK_FLAG_VALUE(NOTAIL);
423 CHECK_FLAG_VALUE(DIRSYNC);
424 CHECK_FLAG_VALUE(TOPDIR);
425 CHECK_FLAG_VALUE(HUGE_FILE);
426 CHECK_FLAG_VALUE(EXTENTS);
427 CHECK_FLAG_VALUE(EA_INODE);
428 CHECK_FLAG_VALUE(EOFBLOCKS);
429 CHECK_FLAG_VALUE(RESERVED);
430}
431
324/* Used to pass group descriptor data when online resize is done */ 432/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 433struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 434 __u32 group; /* Group number for this data */
@@ -332,6 +440,18 @@ struct ext4_new_group_input {
332 __u16 unused; 440 __u16 unused;
333}; 441};
334 442
443#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
444struct compat_ext4_new_group_input {
445 u32 group;
446 compat_u64 block_bitmap;
447 compat_u64 inode_bitmap;
448 compat_u64 inode_table;
449 u32 blocks_count;
450 u16 reserved_blocks;
451 u16 unused;
452};
453#endif
454
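The compat variant mirrors the native struct with explicitly 32-bit-safe field types so a 64-bit kernel can accept this ioctl from 32-bit userspace. A hedged sketch of the field-by-field widening the compat ioctl path would have to perform (copy_from_user() is the standard helper; the actual call site lives in ioctl.c, outside this hunk, and uinput is an illustrative user-pointer name):

	struct compat_ext4_new_group_input cinput;
	struct ext4_new_group_input input;

	if (copy_from_user(&cinput, uinput, sizeof(cinput)))
		return -EFAULT;

	input.group           = cinput.group;
	input.block_bitmap    = cinput.block_bitmap;	/* compat_u64 -> __u64 */
	input.inode_bitmap    = cinput.inode_bitmap;
	input.inode_table     = cinput.inode_table;
	input.blocks_count    = cinput.blocks_count;
	input.reserved_blocks = cinput.reserved_blocks;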
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 455/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 456struct ext4_new_group_data {
337 __u32 group; 457 __u32 group;
@@ -345,7 +465,7 @@ struct ext4_new_group_data {
345}; 465};
346 466
347/* 467/*
348 * Flags used by ext4_get_blocks() 468 * Flags used by ext4_map_blocks()
349 */ 469 */
350 /* Allocate any needed blocks and/or convert an uninitialized 470 /* Allocate any needed blocks and/or convert an uninitialized
351 extent to an initialized extent */ 471 extent to an initialized extent */
@@ -355,7 +475,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 475#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 476 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 477 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 478 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore */ 479 inode allocation semaphore */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 480#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, requesting creation of an 481 /* caller is from the direct IO path, requesting creation of an
@@ -398,6 +518,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 518#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 519#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 520
521#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 522/*
402 * ioctl commands in 32 bit emulation 523 * ioctl commands in 32 bit emulation
403 */ 524 */
@@ -408,11 +529,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 529#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 530#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 531#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
532#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 533#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 534#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 535#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 536#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 537#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
538#endif
416 539
417 540
418/* 541/*
@@ -616,9 +739,8 @@ struct ext4_ext_cache {
616 */ 739 */
617struct ext4_inode_info { 740struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 741 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 742 __u32 i_dtime;
743 ext4_fsblk_t i_file_acl;
622 744
623 /* 745 /*
624 * i_block_group is the number of the block group which contains 746 * i_block_group is the number of the block group which contains
@@ -629,6 +751,7 @@ struct ext4_inode_info {
629 */ 751 */
630 ext4_group_t i_block_group; 752 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 753 unsigned long i_state_flags; /* Dynamic state flags */
754 unsigned long i_flags;
632 755
633 ext4_lblk_t i_dir_start_lookup; 756 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 757#ifdef CONFIG_EXT4_FS_XATTR
@@ -755,7 +878,6 @@ struct ext4_inode_info {
755#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 878#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
756#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ 879#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
757#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 880#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
758#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
759#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 881#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
760#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 882#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
761#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 883#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
@@ -864,7 +986,7 @@ struct ext4_super_block {
864 __le32 s_last_orphan; /* start of list of inodes to delete */ 986 __le32 s_last_orphan; /* start of list of inodes to delete */
865 __le32 s_hash_seed[4]; /* HTREE hash seed */ 987 __le32 s_hash_seed[4]; /* HTREE hash seed */
866 __u8 s_def_hash_version; /* Default hash version to use */ 988 __u8 s_def_hash_version; /* Default hash version to use */
867 __u8 s_reserved_char_pad; 989 __u8 s_jnl_backup_type;
868 __le16 s_desc_size; /* size of group descriptor */ 990 __le16 s_desc_size; /* size of group descriptor */
869/*100*/ __le32 s_default_mount_opts; 991/*100*/ __le32 s_default_mount_opts;
870 __le32 s_first_meta_bg; /* First metablock block group */ 992 __le32 s_first_meta_bg; /* First metablock block group */
@@ -882,12 +1004,34 @@ struct ext4_super_block {
882 __le64 s_mmp_block; /* Block for multi-mount protection */ 1004 __le64 s_mmp_block; /* Block for multi-mount protection */
883 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1005 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
884 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1006 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
885 __u8 s_reserved_char_pad2; 1007 __u8 s_reserved_char_pad;
886 __le16 s_reserved_pad; 1008 __le16 s_reserved_pad;
887 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1009 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
888 __u32 s_reserved[160]; /* Padding to the end of the block */ 1010 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1011 __le32 s_snapshot_id; /* sequential ID of active snapshot */
1012 __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
1013 snapshot's future use */
1014 __le32 s_snapshot_list; /* inode number of the head of the
1015 on-disk snapshot list */
1016#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
1017 __le32 s_error_count; /* number of fs errors */
1018 __le32 s_first_error_time; /* first time an error happened */
1019 __le32 s_first_error_ino; /* inode involved in first error */
1020 __le64 s_first_error_block; /* block involved of first error */
1021 __u8 s_first_error_func[32]; /* function where the error happened */
1022 __le32 s_first_error_line; /* line number where error happened */
1023 __le32 s_last_error_time; /* most recent time of an error */
1024 __le32 s_last_error_ino; /* inode involved in last error */
1025 __le32 s_last_error_line; /* line number where error happened */
1026 __le64 s_last_error_block; /* block involved of last error */
1027 __u8 s_last_error_func[32]; /* function where the error happened */
1028#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1029 __u8 s_mount_opts[64];
1030 __le32 s_reserved[112]; /* Padding to the end of the block */
889}; 1031};
890 1032
1033#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
1034
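EXT4_S_ERR_START and EXT4_S_ERR_END bracket the new error-tracking fields as one contiguous byte range, so callers can handle the whole error record as a unit. An illustrative use, assuming two mapped superblock images old_es and new_es; how super.c actually consumes the range is outside this hunk:

	/* Sketch: preserve the error history across a superblock rewrite. */
	memcpy((char *)new_es + EXT4_S_ERR_START,
	       (char *)old_es + EXT4_S_ERR_START,
	       EXT4_S_ERR_LEN);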
891#ifdef __KERNEL__ 1035#ifdef __KERNEL__
892 1036
893/* 1037/*
@@ -1025,6 +1169,9 @@ struct ext4_sb_info {
1025 1169
1026 /* workqueue for dio unwritten */ 1170 /* workqueue for dio unwritten */
1027 struct workqueue_struct *dio_unwritten_wq; 1171 struct workqueue_struct *dio_unwritten_wq;
1172
1173 /* timer for periodic error stats printing */
1174 struct timer_list s_err_report;
1028}; 1175};
1029 1176
1030static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1177static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1062,22 +1209,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1209 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1210 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1211 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1212 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1213};
1066 1214
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1215#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1216static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1217{ \
1218 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1219} \
1220static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1221{ \
1222 set_bit(bit, &EXT4_I(inode)->i_##field); \
1223} \
1224static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1225{ \
1226 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1070} 1227}
1071 1228
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1229EXT4_INODE_BIT_FNS(flag, flags)
1073{ 1230EXT4_INODE_BIT_FNS(state, state_flags)
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags);
1075}
1076
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1078{
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
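For reference, the EXT4_INODE_BIT_FNS(flag, flags) invocation above generates exactly the trio that the deleted i_state_flags helpers used to spell out by hand, only over i_flags:

	/* Expansion sketch of EXT4_INODE_BIT_FNS(flag, flags): */
	static inline int ext4_test_inode_flag(struct inode *inode, int bit)
	{
		return test_bit(bit, &EXT4_I(inode)->i_flags);
	}
	/* ...plus ext4_set_inode_flag() and ext4_clear_inode_flag(); the
	 * second invocation emits the same trio over i_state_flags.
	 * Typical call: ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS). */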
1081#else 1231#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1232/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1233 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1192,6 +1342,10 @@ static inline void ext4_clear_inode_state(struct inode *inode, int bit)
1192#define EXT4_DEFM_JMODE_DATA 0x0020 1342#define EXT4_DEFM_JMODE_DATA 0x0020
1193#define EXT4_DEFM_JMODE_ORDERED 0x0040 1343#define EXT4_DEFM_JMODE_ORDERED 0x0040
1194#define EXT4_DEFM_JMODE_WBACK 0x0060 1344#define EXT4_DEFM_JMODE_WBACK 0x0060
1345#define EXT4_DEFM_NOBARRIER 0x0100
1346#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
1347#define EXT4_DEFM_DISCARD 0x0400
1348#define EXT4_DEFM_NODELALLOC 0x0800
1195 1349
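The new EXT4_DEFM_* bits extend the s_default_mount_opts word in the superblock; the mount path is expected to fold them into the in-core options when the administrator supplies no overriding mount option. A hedged sketch of such a test, assuming the set_opt()/clear_opt() helpers used by super.c:

	__u32 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);

	if (def_mount_opts & EXT4_DEFM_DISCARD)
		set_opt(sbi->s_mount_opt, DISCARD);	/* discard by default */
	if (def_mount_opts & EXT4_DEFM_NODELALLOC)
		clear_opt(sbi->s_mount_opt, DELALLOC);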
1196/* 1350/*
1197 * Default journal batch times 1351 * Default journal batch times
@@ -1258,13 +1412,50 @@ struct ext4_dir_entry_2 {
1258#define EXT4_MAX_REC_LEN ((1<<16)-1) 1412#define EXT4_MAX_REC_LEN ((1<<16)-1)
1259 1413
1260/* 1414/*
1415 * If we ever get support for fs block sizes > page_size, we'll need
1416 * to remove the #if statements in the next two functions...
1417 */
1418static inline unsigned int
1419ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
1420{
1421 unsigned len = le16_to_cpu(dlen);
1422
1423#if (PAGE_CACHE_SIZE >= 65536)
1424 if (len == EXT4_MAX_REC_LEN || len == 0)
1425 return blocksize;
1426 return (len & 65532) | ((len & 3) << 16);
1427#else
1428 return len;
1429#endif
1430}
1431
1432static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
1433{
1434 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
1435 BUG();
1436#if (PAGE_CACHE_SIZE >= 65536)
1437 if (len < 65536)
1438 return cpu_to_le16(len);
1439 if (len == blocksize) {
1440 if (blocksize == 65536)
1441 return cpu_to_le16(EXT4_MAX_REC_LEN);
1442 else
1443 return cpu_to_le16(0);
1444 }
1445 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
1446#else
1447 return cpu_to_le16(len);
1448#endif
1449}
1450
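A worked example of the rec_len encoding under the PAGE_CACHE_SIZE >= 65536 branch: a 16-bit field cannot hold 65536, so a record spanning a full 64KB block is stored as EXT4_MAX_REC_LEN, while larger (hypothetical) block sizes, which the guard permits up to 1 << 18, borrow the low two bits — always zero, since lengths are 4-byte multiples:

	/* len 65536, blocksize 65536:
	 *   to_disk   -> EXT4_MAX_REC_LEN (65535)
	 *   from_disk -> 65535 reads back as blocksize = 65536
	 *
	 * len 70000 (multiple of 4), blocksize 131072:
	 *   to_disk   -> (70000 & 65532) | ((70000 >> 16) & 3) = 4464 | 1 = 4465
	 *   from_disk -> (4465 & 65532) | ((4465 & 3) << 16) = 4464 + 65536 = 70000
	 */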
1451/*
1261 * Hash Tree Directory indexing 1452 * Hash Tree Directory indexing
1262 * (c) Daniel Phillips, 2001 1453 * (c) Daniel Phillips, 2001
1263 */ 1454 */
1264 1455
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1456#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1457 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1458 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1459#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1460#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1461
@@ -1389,16 +1580,18 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1389 ext4_init_block_bitmap(sb, NULL, group, desc) 1580 ext4_init_block_bitmap(sb, NULL, group, desc)
1390 1581
1391/* dir.c */ 1582/* dir.c */
1392extern int ext4_check_dir_entry(const char *, struct inode *, 1583extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1393 struct ext4_dir_entry_2 *, 1584 struct ext4_dir_entry_2 *,
1394 struct buffer_head *, unsigned int); 1585 struct buffer_head *, unsigned int);
1586#define ext4_check_dir_entry(dir, de, bh, offset) \
1587 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
1395extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1588extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1396 __u32 minor_hash, 1589 __u32 minor_hash,
1397 struct ext4_dir_entry_2 *dirent); 1590 struct ext4_dir_entry_2 *dirent);
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1591extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1592
1400/* fsync.c */ 1593/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1594extern int ext4_sync_file(struct file *, int);
1402 1595
1403/* hash.c */ 1596/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1597extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1450,7 +1643,8 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
1450extern int ext4_setattr(struct dentry *, struct iattr *); 1643extern int ext4_setattr(struct dentry *, struct iattr *);
1451extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1644extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1452 struct kstat *stat); 1645 struct kstat *stat);
1453extern void ext4_delete_inode(struct inode *); 1646extern void ext4_evict_inode(struct inode *);
1647extern void ext4_clear_inode(struct inode *);
1454extern int ext4_sync_inode(handle_t *, struct inode *); 1648extern int ext4_sync_inode(handle_t *, struct inode *);
1455extern void ext4_dirty_inode(struct inode *); 1649extern void ext4_dirty_inode(struct inode *);
1456extern int ext4_change_inode_journal_flag(struct inode *, int); 1650extern int ext4_change_inode_journal_flag(struct inode *, int);
@@ -1480,8 +1674,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1480extern int ext4_ext_migrate(struct inode *); 1674extern int ext4_ext_migrate(struct inode *);
1481 1675
1482/* namei.c */ 1676/* namei.c */
1483extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1484extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1485extern int ext4_orphan_add(handle_t *, struct inode *); 1677extern int ext4_orphan_add(handle_t *, struct inode *);
1486extern int ext4_orphan_del(handle_t *, struct inode *); 1678extern int ext4_orphan_del(handle_t *, struct inode *);
1487extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1679extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1495,25 +1687,38 @@ extern int ext4_group_extend(struct super_block *sb,
1495 ext4_fsblk_t n_blocks_count); 1687 ext4_fsblk_t n_blocks_count);
1496 1688
1497/* super.c */ 1689/* super.c */
1498extern void __ext4_error(struct super_block *, const char *, const char *, ...) 1690extern void __ext4_error(struct super_block *, const char *, unsigned int,
1499 __attribute__ ((format (printf, 3, 4))); 1691 const char *, ...)
1500#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) 1692 __attribute__ ((format (printf, 4, 5)));
1501extern void ext4_error_inode(const char *, struct inode *, const char *, ...) 1693#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1502 __attribute__ ((format (printf, 3, 4))); 1694 __LINE__, ## message)
1503extern void ext4_error_file(const char *, struct file *, const char *, ...) 1695extern void ext4_error_inode(struct inode *, const char *, unsigned int,
1504 __attribute__ ((format (printf, 3, 4))); 1696 ext4_fsblk_t, const char *, ...)
1505extern void __ext4_std_error(struct super_block *, const char *, int); 1697 __attribute__ ((format (printf, 5, 6)));
1506extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1698extern void ext4_error_file(struct file *, const char *, unsigned int,
1507 __attribute__ ((format (printf, 3, 4))); 1699 const char *, ...)
1508extern void __ext4_warning(struct super_block *, const char *, 1700 __attribute__ ((format (printf, 4, 5)));
1701extern void __ext4_std_error(struct super_block *, const char *,
1702 unsigned int, int);
1703extern void __ext4_abort(struct super_block *, const char *, unsigned int,
1704 const char *, ...)
1705 __attribute__ ((format (printf, 4, 5)));
1706#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1707 __LINE__, ## message)
1708extern void __ext4_warning(struct super_block *, const char *, unsigned int,
1509 const char *, ...) 1709 const char *, ...)
1510 __attribute__ ((format (printf, 3, 4))); 1710 __attribute__ ((format (printf, 4, 5)));
1511#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) 1711#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1712 __LINE__, ## message)
1512extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1713extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1513 __attribute__ ((format (printf, 3, 4))); 1714 __attribute__ ((format (printf, 3, 4)));
1514extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1715extern void __ext4_grp_locked_error(const char *, unsigned int, \
1515 const char *, const char *, ...) 1716 struct super_block *, ext4_group_t, \
1516 __attribute__ ((format (printf, 4, 5))); 1717 unsigned long, ext4_fsblk_t, \
1718 const char *, ...)
1719 __attribute__ ((format (printf, 7, 8)));
1720#define ext4_grp_locked_error(sb, grp, message...) \
1721 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1517extern void ext4_update_dynamic_rev(struct super_block *sb); 1722extern void ext4_update_dynamic_rev(struct super_block *sb);
1518extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1723extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1519 __u32 compat); 1724 __u32 compat);
@@ -1647,7 +1852,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1647#define ext4_std_error(sb, errno) \ 1852#define ext4_std_error(sb, errno) \
1648do { \ 1853do { \
1649 if ((errno)) \ 1854 if ((errno)) \
1650 __ext4_std_error((sb), __func__, (errno)); \ 1855 __ext4_std_error((sb), __func__, __LINE__, (errno)); \
1651} while (0) 1856} while (0)
1652 1857
1653#ifdef CONFIG_SMP 1858#ifdef CONFIG_SMP
@@ -1678,6 +1883,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1883 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1884 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1885 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1886 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1887 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1888#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1889 void *bb_bitmap;
@@ -1738,6 +1944,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
1738 spin_unlock(ext4_group_lock_ptr(sb, group)); 1944 spin_unlock(ext4_group_lock_ptr(sb, group));
1739} 1945}
1740 1946
1947static inline void ext4_mark_super_dirty(struct super_block *sb)
1948{
1949 if (EXT4_SB(sb)->s_journal == NULL)
1950 sb->s_dirt = 1;
1951}
1952
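ext4_mark_super_dirty() captures the no-journal fallback in one place: with a journal, superblock updates travel inside a transaction, and only without one does s_dirt ask periodic writeback to flush the block. The intended call pattern after touching an in-core superblock field, with an illustrative update:

	/* Sketch: after modifying the in-core superblock... */
	es->s_free_inodes_count = cpu_to_le32(freed);	/* illustrative field */
	ext4_mark_super_dirty(sb);	/* no-op when journaled; else sets s_dirt */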
1741/* 1953/*
1742 * Inodes and files operations 1954 * Inodes and files operations
1743 */ 1955 */
@@ -1772,9 +1984,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1984extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1985extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1986 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1987extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1988 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1989extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1990extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1991extern void ext4_ext_release(struct super_block *);
@@ -1782,9 +1993,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1993 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1994extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1995 ssize_t len);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1996extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1997 struct ext4_map_blocks *map, int flags);
1787 struct buffer_head *bh, int flags);
1788extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1998extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1789 __u64 start, __u64 len); 1999 __u64 start, __u64 len);
1790/* move_extent.c */ 2000/* move_extent.c */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71ca..6e272ef6ba96 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
11{ 11{
12 int err = 0; 12 int err = 0;
13 13
14 if (ext4_handle_valid(handle)) { 14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh); 15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err) 16 if (err)
17 ext4_journal_abort_handle(where, __func__, bh, 17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err); 18 handle, err);
19 } 19 }
20 return err; 20 return err;
21} 21}
22 22
23int __ext4_journal_get_write_access(const char *where, handle_t *handle, 23int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 struct buffer_head *bh) 24 handle_t *handle, struct buffer_head *bh)
25{ 25{
26 int err = 0; 26 int err = 0;
27 27
28 if (ext4_handle_valid(handle)) { 28 if (ext4_handle_valid(handle)) {
29 err = jbd2_journal_get_write_access(handle, bh); 29 err = jbd2_journal_get_write_access(handle, bh);
30 if (err) 30 if (err)
31 ext4_journal_abort_handle(where, __func__, bh, 31 ext4_journal_abort_handle(where, line, __func__, bh,
32 handle, err); 32 handle, err);
33 } 33 }
34 return err; 34 return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
46 * If the handle isn't valid we're not journaling, but we still need to 46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head. 47 * call into ext4_journal_revoke() to put the buffer head.
48 */ 48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 49int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
50 struct inode *inode, struct buffer_head *bh, 50 int is_metadata, struct inode *inode,
51 ext4_fsblk_t blocknr) 51 struct buffer_head *bh, ext4_fsblk_t blocknr)
52{ 52{
53 int err; 53 int err;
54 54
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
79 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
80 err = jbd2_journal_forget(handle, bh); 80 err = jbd2_journal_forget(handle, bh);
81 if (err) 81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh, 82 ext4_journal_abort_handle(where, line, __func__,
83 handle, err); 83 bh, handle, err);
84 return err; 84 return err;
85 } 85 }
86 return 0; 86 return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh); 93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) { 94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err); 95 ext4_journal_abort_handle(where, line, __func__,
96 ext4_abort(inode->i_sb, __func__, 96 bh, handle, err);
97 __ext4_abort(inode->i_sb, where, line,
97 "error %d when attempting revoke", err); 98 "error %d when attempting revoke", err);
98 } 99 }
99 BUFFER_TRACE(bh, "exit"); 100 BUFFER_TRACE(bh, "exit");
100 return err; 101 return err;
101} 102}
102 103
103int __ext4_journal_get_create_access(const char *where, 104int __ext4_journal_get_create_access(const char *where, unsigned int line,
104 handle_t *handle, struct buffer_head *bh) 105 handle_t *handle, struct buffer_head *bh)
105{ 106{
106 int err = 0; 107 int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
108 if (ext4_handle_valid(handle)) { 109 if (ext4_handle_valid(handle)) {
109 err = jbd2_journal_get_create_access(handle, bh); 110 err = jbd2_journal_get_create_access(handle, bh);
110 if (err) 111 if (err)
111 ext4_journal_abort_handle(where, __func__, bh, 112 ext4_journal_abort_handle(where, line, __func__,
112 handle, err); 113 bh, handle, err);
113 } 114 }
114 return err; 115 return err;
115} 116}
116 117
117int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 118int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
118 struct inode *inode, struct buffer_head *bh) 119 handle_t *handle, struct inode *inode,
120 struct buffer_head *bh)
119{ 121{
120 int err = 0; 122 int err = 0;
121 123
122 if (ext4_handle_valid(handle)) { 124 if (ext4_handle_valid(handle)) {
123 err = jbd2_journal_dirty_metadata(handle, bh); 125 err = jbd2_journal_dirty_metadata(handle, bh);
124 if (err) 126 if (err)
125 ext4_journal_abort_handle(where, __func__, bh, 127 ext4_journal_abort_handle(where, line, __func__,
126 handle, err); 128 bh, handle, err);
127 } else { 129 } else {
128 if (inode) 130 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 131 mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
132 if (inode && inode_needs_sync(inode)) { 134 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 135 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 136 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, 137 struct ext4_super_block *es;
136 "IO error syncing inode, " 138
137 "inode=%lu, block=%llu", 139 es = EXT4_SB(inode->i_sb)->s_es;
138 inode->i_ino, 140 es->s_last_error_block =
139 (unsigned long long) bh->b_blocknr); 141 cpu_to_le64(bh->b_blocknr);
142 ext4_error_inode(inode, where, line,
143 bh->b_blocknr,
144 "IO error syncing itable block");
140 err = -EIO; 145 err = -EIO;
141 } 146 }
142 } 147 }
143 } 148 }
144 return err; 149 return err;
145} 150}
151
152int __ext4_handle_dirty_super(const char *where, unsigned int line,
153 handle_t *handle, struct super_block *sb)
154{
155 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
156 int err = 0;
157
158 if (ext4_handle_valid(handle)) {
159 err = jbd2_journal_dirty_metadata(handle, bh);
160 if (err)
161 ext4_journal_abort_handle(where, line, __func__,
162 bh, handle, err);
163 } else
164 sb->s_dirt = 1;
165 return err;
166}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..b0bd792c58c5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122/* 122/*
123 * Wrapper functions with which ext4 calls into JBD. 123 * Wrapper functions with which ext4 calls into JBD.
124 */ 124 */
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
127 128
128int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
129 struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
130 131
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132int __ext4_journal_get_write_access(const char *where, unsigned int line,
132 struct buffer_head *bh); 133 handle_t *handle, struct buffer_head *bh);
133 134
134int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
135 struct inode *inode, struct buffer_head *bh, 136 int is_metadata, struct inode *inode,
136 ext4_fsblk_t blocknr); 137 struct buffer_head *bh, ext4_fsblk_t blocknr);
137 138
138int __ext4_journal_get_create_access(const char *where, 139int __ext4_journal_get_create_access(const char *where, unsigned int line,
139 handle_t *handle, struct buffer_head *bh); 140 handle_t *handle, struct buffer_head *bh);
140 141
141int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 142int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
142 struct inode *inode, struct buffer_head *bh); 143 handle_t *handle, struct inode *inode,
144 struct buffer_head *bh);
145
146int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb);
143 148
144#define ext4_journal_get_undo_access(handle, bh) \ 149#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 151#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__func__, (handle), (bh)) 152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
150 (block_nr)) 155 (bh), (block_nr))
151#define ext4_journal_get_create_access(handle, bh) \ 156#define ext4_journal_get_create_access(handle, bh) \
152 __ext4_journal_get_create_access(__func__, (handle), (bh)) 157 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
153#define ext4_handle_dirty_metadata(handle, inode, bh) \ 158#define ext4_handle_dirty_metadata(handle, inode, bh) \
154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 159 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
160 (bh))
161#define ext4_handle_dirty_super(handle, sb) \
162 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
155 163
156handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 164handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
157int __ext4_journal_stop(const char *where, handle_t *handle); 165int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
158 166
159#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 167#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
160 168
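All of these wrappers now thread __func__ and __LINE__ from the call site into the error path. What a single call expands to after preprocessing, with an illustrative caller and line number:

	/* In ext4_mkdir() at (say) line 1234:
	 *     err = ext4_journal_get_write_access(handle, bh);
	 * becomes:
	 *     err = __ext4_journal_get_write_access("ext4_mkdir", 1234,
	 *                                           (handle), (bh));
	 * so ext4_journal_abort_handle() can report the exact call site. */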
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
207} 215}
208 216
209#define ext4_journal_stop(handle) \ 217#define ext4_journal_stop(handle) \
210 __ext4_journal_stop(__func__, (handle)) 218 __ext4_journal_stop(__func__, __LINE__, (handle))
211 219
212static inline handle_t *ext4_journal_current_handle(void) 220static inline handle_t *ext4_journal_current_handle(void)
213{ 221{
@@ -273,7 +281,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 281 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 282 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 283 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 284 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 285 return 1;
278 return 0; 286 return 0;
279} 287}
@@ -284,7 +292,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 292 return 0;
285 if (!S_ISREG(inode->i_mode)) 293 if (!S_ISREG(inode->i_mode))
286 return 0; 294 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 295 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 296 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 297 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 298 return 1;
@@ -297,7 +305,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 305 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 306 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 307 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 308 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 309 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 310 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 311 return 1;
@@ -308,20 +316,18 @@ static inline int ext4_should_writeback_data(struct inode *inode)
308 * This function controls whether or not we should try to go down the 316 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking 317 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based 318 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is 319 * files, and it doesn't work if data journaling is enabled, since the
312 * enabled, since the dioread_nolock code uses b_private to pass 320 * dioread_nolock code uses b_private to pass information back to the
313 * information back to the I/O completion handler, and this conflicts 321 * I/O completion handler, and this conflicts with the jbd's use of
314 * with the jbd's use of b_private. 322 * b_private.
315 */ 323 */
316static inline int ext4_should_dioread_nolock(struct inode *inode) 324static inline int ext4_should_dioread_nolock(struct inode *inode)
317{ 325{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) 326 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0; 327 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode)) 328 if (!S_ISREG(inode->i_mode))
323 return 0; 329 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 330 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 331 return 0;
326 if (ext4_should_journal_data(inode)) 332 if (ext4_should_journal_data(inode))
327 return 0; 333 return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..06328d3e5717 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -404,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
404 return 1; 401 return 1;
405} 402}
406 403
407static int __ext4_ext_check(const char *function, struct inode *inode, 404static int __ext4_ext_check(const char *function, unsigned int line,
408 struct ext4_extent_header *eh, 405 struct inode *inode, struct ext4_extent_header *eh,
409 int depth) 406 int depth)
410{ 407{
411 const char *error_msg; 408 const char *error_msg;
412 int max = 0; 409 int max = 0;
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(inode, function, line, 0,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -450,7 +447,7 @@ corrupted:
450} 447}
451 448
452#define ext4_ext_check(inode, eh, depth) \ 449#define ext4_ext_check(inode, eh, depth) \
453 __ext4_ext_check(__func__, inode, eh, depth) 450 __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
454 451
455int ext4_ext_check_inode(struct inode *inode) 452int ext4_ext_check_inode(struct inode *inode)
456{ 453{
@@ -1086,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1086{ 1083{
1087 struct ext4_ext_path *curp = path; 1084 struct ext4_ext_path *curp = path;
1088 struct ext4_extent_header *neh; 1085 struct ext4_extent_header *neh;
1089 struct ext4_extent_idx *fidx;
1090 struct buffer_head *bh; 1086 struct buffer_head *bh;
1091 ext4_fsblk_t newblock; 1087 ext4_fsblk_t newblock;
1092 int err = 0; 1088 int err = 0;
@@ -1147,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1147 ext4_idx_store_pblock(curp->p_idx, newblock); 1143 ext4_idx_store_pblock(curp->p_idx, newblock);
1148 1144
1149 neh = ext_inode_hdr(inode); 1145 neh = ext_inode_hdr(inode);
1150 fidx = EXT_FIRST_INDEX(neh);
1151 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1152 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1153 le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); 1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh)));
1154 1150
1155 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1156 err = ext4_ext_dirty(handle, inode, curp); 1152 err = ext4_ext_dirty(handle, inode, curp);
@@ -1622,9 +1618,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1618 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1619 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1620 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1621 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1622 }
1629 1623
1630 return merge_done; 1624 return merge_done;
@@ -2039,7 +2033,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2033 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2034 int ret = EXT4_EXT_CACHE_NO;
2041 2035
2042 /* 2036 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2037 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2038 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2039 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2355,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2355 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2356 struct ext4_ext_path *path;
2363 handle_t *handle; 2357 handle_t *handle;
2364 int i = 0, err = 0; 2358 int i, err;
2365 2359
2366 ext_debug("truncate since %u\n", start); 2360 ext_debug("truncate since %u\n", start);
2367 2361
@@ -2370,23 +2364,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2364 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2365 return PTR_ERR(handle);
2372 2366
2367again:
2373 ext4_ext_invalidate_cache(inode); 2368 ext4_ext_invalidate_cache(inode);
2374 2369
2375 /* 2370 /*
2376 * We start scanning from right side, freeing all the blocks 2371 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2372 * after i_size and walking into the tree depth-wise.
2378 */ 2373 */
2374 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2375 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2376 if (path == NULL) {
2381 ext4_journal_stop(handle); 2377 ext4_journal_stop(handle);
2382 return -ENOMEM; 2378 return -ENOMEM;
2383 } 2379 }
2380 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2381 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2382 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2383 err = -EIO;
2387 goto out; 2384 goto out;
2388 } 2385 }
2389 path[0].p_depth = depth; 2386 i = err = 0;
2390 2387
2391 while (i >= 0 && err == 0) { 2388 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2389 if (i == depth) {
@@ -2480,6 +2477,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2477out:
2481 ext4_ext_drop_refs(path); 2478 ext4_ext_drop_refs(path);
2482 kfree(path); 2479 kfree(path);
2480 if (err == -EAGAIN)
2481 goto again;
2483 ext4_journal_stop(handle); 2482 ext4_journal_stop(handle);
2484 2483
2485 return err; 2484 return err;
@@ -2544,7 +2543,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2543/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2544static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2545{
2547 int ret = -EIO; 2546 int ret;
2548 struct bio *bio; 2547 struct bio *bio;
2549 int blkbits, blocksize; 2548 int blkbits, blocksize;
2550 sector_t ee_pblock; 2549 sector_t ee_pblock;
@@ -2568,6 +2567,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2567 len = ee_len;
2569 2568
2570 bio = bio_alloc(GFP_NOIO, len); 2569 bio = bio_alloc(GFP_NOIO, len);
2570 if (!bio)
2571 return -ENOMEM;
2572
2571 bio->bi_sector = ee_pblock; 2573 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2574 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2575
@@ -2595,22 +2597,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2597 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2598 wait_for_completion(&event);
2597 2599
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2600 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2601 bio_put(bio);
2600 else { 2602 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2603 }
2604 bio_put(bio); 2604 bio_put(bio);
2605 ee_len -= done; 2605 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2606 ee_pblock += done << (blkbits - 9);
2607 } 2607 }
2608 return ret; 2608 return 0;
2609} 2609}
2610 2610
2611#define EXT4_EXT_ZERO_LEN 7 2611#define EXT4_EXT_ZERO_LEN 7
2612/* 2612/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2613 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2614 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (up to three - one initialized and two 2615 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2616 * uninitialized).
@@ -2620,39 +2620,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2620 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2621 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2622static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2623 struct inode *inode,
2624 struct ext4_ext_path *path, 2624 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2625 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2626{
2628 struct ext4_extent *ex, newex, orig_ex; 2627 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2628 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2629 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2630 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2631 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2632 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2633 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2634 ext4_fsblk_t newblock;
2636 int err = 0; 2635 int err = 0;
2637 int ret = 0; 2636 int ret = 0;
2637 int may_zeroout;
2638
2639 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2640 "block %llu, max_blocks %u\n", inode->i_ino,
2641 (unsigned long long)map->m_lblk, map->m_len);
2642
2643 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2644 inode->i_sb->s_blocksize_bits;
2645 if (eof_block < map->m_lblk + map->m_len)
2646 eof_block = map->m_lblk + map->m_len;
2638 2647
2639 depth = ext_depth(inode); 2648 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2649 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2650 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2651 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2652 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2653 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2654 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2655
2646 ex2 = ex; 2656 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2657 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2658 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2659 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2660
2661 /*
2662 * It is safe to convert an extent to initialized via explicit
2663 * zeroout only if the extent is fully inside i_size or new_size.
2664 */
2665 may_zeroout = ee_block + ee_len <= eof_block;
2666
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2667 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2668 if (err)
2653 goto out; 2669 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */ 2670 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2671 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2672 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2673 if (err)
2658 goto fix_extent_len; 2674 goto fix_extent_len;
@@ -2665,10 +2681,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2681 return allocated;
2666 } 2682 }
2667 2683
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2684 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2685 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2686 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2687 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2688 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2689 ex2 = &newex;
2674 } 2690 }
@@ -2677,15 +2693,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2693 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2694 * overlap of blocks.
2679 */ 2695 */
2680 if (!ex1 && allocated > max_blocks) 2696 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2697 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialized */ 2698 /* ex3: to ee_block + ee_len : uninitialized */
2683 if (allocated > max_blocks) { 2699 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2700 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */ 2701 /* If extent has less than EXT4_EXT_ZERO_LEN zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2702 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2703 /*
2688 * iblock == ee_block is handled by the zeroout 2704 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2705 * at the beginning.
2690 * Mark first half uninitialized. 2706 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2707 * Mark second half initialized and zero out the
@@ -2698,7 +2714,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2714 ext4_ext_dirty(handle, inode, path + depth);
2699 2715
2700 ex3 = &newex; 2716 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2717 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2718 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2719 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2720 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2727,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2727 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2728 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2729 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2730 /* blocks available from map->m_lblk */
2715 return allocated; 2731 return allocated;
2716 2732
2717 } else if (err) 2733 } else if (err)
@@ -2733,8 +2749,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2749 */
2734 depth = ext_depth(inode); 2750 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2751 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2752 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2753 path);
2738 if (IS_ERR(path)) { 2754 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2755 err = PTR_ERR(path);
2740 return err; 2756 return err;
@@ -2754,12 +2770,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2770 return allocated;
2755 } 2771 }
2756 ex3 = &newex; 2772 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2773 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2774 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2775 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2776 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2777 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2778 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2779 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2780 if (err)
2765 goto fix_extent_len; 2781 goto fix_extent_len;
@@ -2769,7 +2785,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2786 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2787 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2788 /* blocks available from map->m_lblk */
2773 return allocated; 2789 return allocated;
2774 2790
2775 } else if (err) 2791 } else if (err)
@@ -2783,11 +2799,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2799 * update the extent length after successful insert of the
2784 * split extent 2800 * split extent
2785 */ 2801 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2802 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2803 orig_ex.ee_len = cpu_to_le16(ee_len);
2804 may_zeroout = ee_block + ee_len <= eof_block;
2805
2788 depth = newdepth; 2806 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2807 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2808 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2809 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2810 err = PTR_ERR(path);
2793 goto out; 2811 goto out;
@@ -2801,14 +2819,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2819 if (err)
2802 goto out; 2820 goto out;
2803 2821
2804 allocated = max_blocks; 2822 allocated = map->m_len;
2805 2823
2806 /* If extent has fewer than EXT4_EXT_ZERO_LEN blocks and we are trying 2824 /* If extent has fewer than EXT4_EXT_ZERO_LEN blocks and we are trying
2807 * to insert an extent in the middle, zero out directly, 2825 * to insert an extent in the middle, zero out directly,
2808 * otherwise give the extent a chance to merge to the left 2826 * otherwise give the extent a chance to merge to the left
2809 */ 2827 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2828 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2829 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2830 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2831 if (err)
2814 goto fix_extent_len; 2832 goto fix_extent_len;
@@ -2818,7 +2836,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2836 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2837 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2838 /* zero out the first half */
2821 /* blocks available from iblock */ 2839 /* blocks available from map->m_lblk */
2822 return allocated; 2840 return allocated;
2823 } 2841 }
2824 } 2842 }
@@ -2829,12 +2847,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2847 */
2830 if (ex1 && ex1 != ex) { 2848 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2849 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2850 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2851 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2852 ex2 = &newex;
2835 } 2853 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2854 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2855 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2856 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2857 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2858 if (ex2 != ex)
@@ -2877,7 +2895,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2895 goto out;
2878insert: 2896insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2897 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2898 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2899 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2900 if (err)
2883 goto fix_extent_len; 2901 goto fix_extent_len;
@@ -2904,7 +2922,7 @@ fix_extent_len:
2904} 2922}
2905 2923
2906/* 2924/*
2907 * This function is called by ext4_ext_get_blocks() from 2925 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when direct I/O is used to write 2926 * ext4_get_blocks_dio_write() when direct I/O is used to write
2909 * to an uninitialized extent. 2927 * to an uninitialized extent.
2910 * 2928 *
@@ -2918,7 +2936,7 @@ fix_extent_len:
2918 * One or more index blocks may be needed if the extent tree grows after 2936 * One or more index blocks may be needed if the extent tree grows after
2919 * the uninitialized extent split. To prevent ENOSPC from occurring at IO 2937 * the uninitialized extent split. To prevent ENOSPC from occurring at IO
2920 * completion, we need to split the uninitialized extent before the DIO 2938 * completion, we need to split the uninitialized extent before the DIO
2921 * is submitted. The uninitilized extent handled here will be split 2939 * is submitted. The uninitialized extent handled here will be split
2922 * into (at most) three uninitialized extents. After IO completes, the part 2940 * into (at most) three uninitialized extents. After IO completes, the part
2923 * being filled will be converted to initialized by the end_io callback function 2941 * being filled will be converted to initialized by the end_io callback function
2924 * via ext4_convert_unwritten_extents(). 2942 * via ext4_convert_unwritten_extents().
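Doing the split before the direct I/O is submitted front-loads any index-block growth, so an ENOSPC surfaces at submission rather than in the completion path; completion then only has to flip the middle piece to written. A sketch of the at-most-three pieces, with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ee_block = 100, ee_len = 50; /* original extent */
            unsigned int m_lblk = 110, m_len = 20;    /* DIO target range */

            unsigned int ex1_len = m_lblk - ee_block;          /* head: 10 */
            unsigned int ex2_len = m_len;                      /* I/O:  20 */
            unsigned int ex3_len = ee_len - ex1_len - ex2_len; /* tail: 20 */

            printf("head %u, io %u, tail %u blocks, all uninitialized\n",
                   ex1_len, ex2_len, ex3_len);
            return 0;
    }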
@@ -2927,51 +2945,62 @@ fix_extent_len:
2927 */ 2945 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2946static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2947 struct inode *inode,
2948 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2949 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2950 int flags)
2934{ 2951{
2935 struct ext4_extent *ex, newex, orig_ex; 2952 struct ext4_extent *ex, newex, orig_ex;
2936 struct ext4_extent *ex1 = NULL; 2953 struct ext4_extent *ex1 = NULL;
2937 struct ext4_extent *ex2 = NULL; 2954 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2955 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2956 ext4_lblk_t ee_block, eof_block;
2940 ext4_lblk_t ee_block;
2941 unsigned int allocated, ee_len, depth; 2957 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2958 ext4_fsblk_t newblock;
2943 int err = 0; 2959 int err = 0;
2960 int may_zeroout;
2961
2962 ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
2963 "block %llu, max_blocks %u\n", inode->i_ino,
2964 (unsigned long long)map->m_lblk, map->m_len);
2965
2966 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2967 inode->i_sb->s_blocksize_bits;
2968 if (eof_block < map->m_lblk + map->m_len)
2969 eof_block = map->m_lblk + map->m_len;
2944 2970
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2971 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2972 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2973 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2974 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2975 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2976 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2977
2955 ex2 = ex; 2978 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2979 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2980 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2981 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2982
2960 /* 2983 /*
2984 * It is safe to convert extent to initialized via explicit
2985 * zeroout only if extent is fully inside i_size or new_size.
2986 */
2987 may_zeroout = ee_block + ee_len <= eof_block;
2988
2989 /*
2961 * If the uninitialized extent begins at the same logical 2990 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2991 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2992 * covers the extent, then we don't need to split it.
2964 */ 2993 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2994 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2995 return allocated;
2967 2996
2968 err = ext4_ext_get_access(handle, inode, path + depth); 2997 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 2998 if (err)
2970 goto out; 2999 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3000 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3001 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3002 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3003 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3004 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3005 ex2 = &newex;
2977 } 3006 }
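The eof_block computation introduced above rounds i_size up to a whole block, then widens it to at least cover the write target, so that may_zeroout means "the extent lies fully inside i_size or the new size". A worked userspace sketch with illustrative values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long i_size = 10000;  /* bytes */
            unsigned int blkbits = 12;          /* 4096-byte blocks */
            unsigned int m_lblk = 5, m_len = 4; /* write target */

            /* Round i_size up to a block boundary: (10000 + 4095) >> 12 == 3. */
            unsigned long long eof_block =
                    (i_size + (1ULL << blkbits) - 1) >> blkbits;
            /* The write extends the file, so cover it too: eof_block == 9. */
            if (eof_block < m_lblk + m_len)
                    eof_block = m_lblk + m_len;

            printf("eof_block = %llu\n", eof_block);
            return 0;
    }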
@@ -2980,18 +3009,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3009 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3010 * overlap of blocks.
2982 */ 3011 */
2983 if (!ex1 && allocated > max_blocks) 3012 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3013 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3014 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3015 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3016 unsigned int newdepth;
2988 ex3 = &newex; 3017 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3018 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3019 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3020 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3021 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3022 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3023 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3024 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3025 if (err)
2997 goto fix_extent_len; 3026 goto fix_extent_len;
@@ -3001,7 +3030,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3030 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3031 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3032 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3033 /* blocks available from map->m_lblk */
3005 return allocated; 3034 return allocated;
3006 3035
3007 } else if (err) 3036 } else if (err)
@@ -3015,16 +3044,17 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3044 * update the extent length after successful insert of the
3016 * split extent 3045 * split extent
3017 */ 3046 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3047 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3048 orig_ex.ee_len = cpu_to_le16(ee_len);
3049 may_zeroout = ee_block + ee_len <= eof_block;
3050
3020 depth = newdepth; 3051 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3052 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3053 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3054 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3055 err = PTR_ERR(path);
3025 goto out; 3056 goto out;
3026 } 3057 }
3027 eh = path[depth].p_hdr;
3028 ex = path[depth].p_ext; 3058 ex = path[depth].p_ext;
3029 if (ex2 != &newex) 3059 if (ex2 != &newex)
3030 ex2 = ex; 3060 ex2 = ex;
@@ -3033,7 +3063,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3063 if (err)
3034 goto out; 3064 goto out;
3035 3065
3036 allocated = max_blocks; 3066 allocated = map->m_len;
3037 } 3067 }
3038 /* 3068 /*
3039 * If there was a change of depth as part of the 3069 * If there was a change of depth as part of the
@@ -3042,15 +3072,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3072 */
3043 if (ex1 && ex1 != ex) { 3073 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3074 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3075 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3076 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3077 ex2 = &newex;
3048 } 3078 }
3049 /* 3079 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3080 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3081 * using direct I/O, uninitialised still.
3052 */ 3082 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3083 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3084 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3085 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3086 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3092,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3092 goto out;
3063insert: 3093insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3094 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3095 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3096 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3097 if (err)
3068 goto fix_extent_len; 3098 goto fix_extent_len;
@@ -3152,10 +3182,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3182
3153static int 3183static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3184ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3185 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3186 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3187 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3188{
3160 int ret = 0; 3189 int ret = 0;
3161 int err = 0; 3190 int err = 0;
@@ -3163,15 +3192,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3192
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3193 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3194 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3195 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3196 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3197 ext4_ext_show_leaf(inode, path);
3169 3198
3170 /* get_block() called before submitting the IO: split the extent */ 3199 /* get_block() called before submitting the IO: split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3200 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3201 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3202 path, flags);
3174 max_blocks, flags);
3175 /* 3203 /*
3176 * Flag the inode (non-AIO case) or end_io struct (AIO case) 3204 * Flag the inode (non-AIO case) or end_io struct (AIO case)
3177 * to record that this IO needs conversion to written when IO is 3205 * to record that this IO needs conversion to written when IO is
@@ -3182,7 +3210,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3210 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3211 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3212 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3213 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3214 goto out;
3187 } 3215 }
3188 /* IO end_io complete, convert the filled extent to written */ 3216 /* IO end_io complete, convert the filled extent to written */
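Whether the "convert to written on completion" marker lives on a per-request io_end object (AIO) or on the inode itself (synchronous DIO) is decided in this hunk. A simplified model with stand-in types, not the kernel's structures:

    #include <stdbool.h>
    #include <stddef.h>

    struct io_end  { bool unwritten; };      /* stands in for ext4_io_end_t */
    struct inode_s { bool dio_unwritten; };  /* stands in for inode state bit */

    static void flag_unwritten(struct io_end *io, struct inode_s *inode)
    {
            if (io)
                    io->unwritten = true;        /* AIO: flag the request */
            else
                    inode->dio_unwritten = true; /* sync DIO: flag the inode */
    }

    int main(void)
    {
            struct io_end io = { false };
            struct inode_s ino = { false };

            flag_unwritten(&io, &ino);  /* AIO path */
            flag_unwritten(NULL, &ino); /* synchronous DIO path */
            return io.unwritten && ino.dio_unwritten ? 0 : 1;
    }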
@@ -3210,14 +3238,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3238 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3239 * a read from the block returns 0s.
3212 */ 3240 */
3213 set_buffer_unwritten(bh_result); 3241 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3242 goto out1;
3215 } 3243 }
3216 3244
3217 /* buffered write, writepage time, convert */ 3245 /* buffered write, writepage time, convert */
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3246 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3247 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3248 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3249out:
@@ -3226,7 +3252,7 @@ out:
3226 goto out2; 3252 goto out2;
3227 } else 3253 } else
3228 allocated = ret; 3254 allocated = ret;
3229 set_buffer_new(bh_result); 3255 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3256 /*
3231 * if we allocated more blocks than requested 3257 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3258 * we need to make sure we unmap the extra block
@@ -3234,11 +3260,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3260 * unmapped later when we find the buffer_head marked
3235 * new. 3261 * new.
3236 */ 3262 */
3237 if (allocated > max_blocks) { 3263 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3264 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3265 newblock + map->m_len,
3240 allocated - max_blocks); 3266 allocated - map->m_len);
3241 allocated = max_blocks; 3267 allocated = map->m_len;
3242 } 3268 }
3243 3269
3244 /* 3270 /*
@@ -3252,13 +3278,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3278 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3279
3254map_out: 3280map_out:
3255 set_buffer_mapped(bh_result); 3281 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3282out1:
3257 if (allocated > max_blocks) 3283 if (allocated > map->m_len)
3258 allocated = max_blocks; 3284 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3285 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3286 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3287 map->m_len = allocated;
3262out2: 3288out2:
3263 if (path) { 3289 if (path) {
3264 ext4_ext_drop_refs(path); 3290 ext4_ext_drop_refs(path);
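From here on, results flow back through the mapping structure instead of a buffer_head. A minimal model of the struct this patch threads around, using the field names visible in the hunks above; the flag bit values are illustrative, not the kernel's:

    #include <stdint.h>

    #define EXT4_MAP_NEW       (1u << 0) /* blocks freshly allocated */
    #define EXT4_MAP_MAPPED    (1u << 1) /* m_pblk/m_len are valid */
    #define EXT4_MAP_UNWRITTEN (1u << 2) /* extent exists but is unwritten */
    #define EXT4_MAP_UNINIT    (1u << 3) /* dioread_nolock bookkeeping */

    struct ext4_map_blocks_model {
            uint64_t m_pblk;  /* out: first physical block of the mapping */
            uint32_t m_lblk;  /* in: first logical block requested */
            uint32_t m_len;   /* in: blocks wanted; out: blocks mapped */
            uint32_t m_flags; /* out: EXT4_MAP_* result bits */
    };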
@@ -3284,26 +3310,23 @@ out2:
3284 * 3310 *
3285 * return < 0, error case. 3311 * return < 0, error case.
3286 */ 3312 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3313int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3314 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3315{
3292 struct ext4_ext_path *path = NULL; 3316 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3317 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3318 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3319 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3320 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3321 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3322 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3323 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3324
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3325 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3326 map->m_lblk, map->m_len, inode->i_ino);
3304 3327
3305 /* check in cache */ 3328 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3329 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3330 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3331 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3339,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3339 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3340 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3341 /* block is already allocated */
3319 newblock = iblock 3342 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3343 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3344 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3345 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3346 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3347 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3348 goto out;
3326 } else { 3349 } else {
3327 BUG(); 3350 BUG();
@@ -3329,7 +3352,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3352 }
3330 3353
3331 /* find extent for this block */ 3354 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3355 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3356 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3357 err = PTR_ERR(path);
3335 path = NULL; 3358 path = NULL;
@@ -3345,8 +3368,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3368 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3369 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3370 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3371 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3372 (unsigned long) map->m_lblk, depth,
3373 path[depth].p_block);
3350 err = -EIO; 3374 err = -EIO;
3351 goto out2; 3375 goto out2;
3352 } 3376 }
@@ -3364,12 +3388,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3388 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3389 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3390 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3391 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3392 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3393 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3394 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3395 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3396 ee_block, ee_len, newblock);
3373 3397
3374 /* Do not put uninitialized extent in the cache */ 3398 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3399 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3403,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3403 goto out;
3380 } 3404 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3405 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3406 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3407 newblock);
3384 return ret; 3408 return ret;
3385 } 3409 }
3386 } 3410 }
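A hit in the tree (or the one-extent cache) turns into a physical address by offsetting into the extent, and the usable length is whatever remains of that extent. A worked sketch with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned int ee_block = 200, ee_len = 16; /* logical extent */
            unsigned long long ee_start = 9000;       /* its physical start */
            unsigned int m_lblk = 205;                /* requested block */

            unsigned long long newblock = m_lblk - ee_block + ee_start; /* 9005 */
            unsigned int allocated = ee_len - (m_lblk - ee_block);      /* 11 */

            printf("pblk %llu, %u blocks remain in the extent\n",
                   newblock, allocated);
            return 0;
    }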
@@ -3394,7 +3418,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3418 * put just found gap into cache to speed up
3395 * subsequent requests 3419 * subsequent requests
3396 */ 3420 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3421 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3422 goto out2;
3399 } 3423 }
3400 /* 3424 /*
@@ -3402,11 +3426,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3426 */
3403 3427
3404 /* find neighbour allocated blocks */ 3428 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3429 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3430 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3431 if (err)
3408 goto out2; 3432 goto out2;
3409 ar.lright = iblock; 3433 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3434 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3435 if (err)
3412 goto out2; 3436 goto out2;
@@ -3417,26 +3441,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3441 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3442 * EXT_UNINIT_MAX_LEN.
3419 */ 3443 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3444 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3445 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3446 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3447 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3448 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3449 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3450
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3451 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3452 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3453 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3454 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3455 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3456 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3457 else
3434 allocated = max_blocks; 3458 allocated = map->m_len;
3435 3459
3436 /* allocate new block */ 3460 /* allocate new block */
3437 ar.inode = inode; 3461 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3462 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3463 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3464 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3465 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3466 ar.flags = EXT4_MB_HINT_DATA;
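The two limits in the clamp above come from the on-disk extent format: ee_len is a 16-bit field whose top bit marks an extent as uninitialized, so an initialized extent can span 32768 blocks but an uninitialized one only 32767. A sketch:

    #include <stdio.h>

    #define EXT_INIT_MAX_LEN   (1u << 15)             /* 32768 */
    #define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) /* 32767 */

    static unsigned int clamp_request(unsigned int m_len, int want_uninit)
    {
            unsigned int max = want_uninit ? EXT_UNINIT_MAX_LEN
                                           : EXT_INIT_MAX_LEN;
            return m_len > max ? max : m_len;
    }

    int main(void)
    {
            printf("%u\n", clamp_request(100000, 0)); /* 32768 */
            printf("%u\n", clamp_request(100000, 1)); /* 32767 */
            return 0;
    }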
@@ -3470,21 +3494,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3494 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3495 }
3472 if (ext4_should_dioread_nolock(inode)) 3496 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3497 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3498 }
3475 3499
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3500 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3501 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3502 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3503 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3504 "EOFBLOCKS_FL set");
3481 err = -EIO; 3505 err = -EIO;
3482 goto out2; 3506 goto out2;
3483 } 3507 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3508 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3509 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3510 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3511 * the last index block all the way down the tree, and
3512 * we are extending the inode beyond the last extent
3513 * in the current leaf block, then clear the
3514 * EOFBLOCKS_FL flag.
3515 */
3516 for (i = depth-1; i >= 0; i--) {
3517 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3518 break;
3519 }
3520 if ((i < 0) &&
3521 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3522 ext4_ext_get_actual_len(last_ex)))
3523 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3524 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3525 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3526 if (err) {
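The loop added above only permits clearing EOFBLOCKS when the leaf was reached through the last index at every interior level, i.e. it really is the rightmost leaf. A sketch of the test with stand-in types for the extent-tree path:

    /* Each level of the lookup path, reduced to "which index did we take"
     * and "which index is the last one in that node". */
    struct path_level { int p_idx, last_idx; };

    static int on_rightmost_path(const struct path_level *path, int depth)
    {
            int i;

            for (i = depth - 1; i >= 0; i--)
                    if (path[i].p_idx != path[i].last_idx)
                            return 0; /* an interior node has a right sibling */
            return 1;
    }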
@@ -3500,9 +3536,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3536 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3537 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3538 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3539 if (allocated > map->m_len)
3504 allocated = max_blocks; 3540 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3541 map->m_flags |= EXT4_MAP_NEW;
3506 3542
3507 /* 3543 /*
3508 * Update reserved blocks/metadata blocks after successful 3544 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3552,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3552 * when it is _not_ an uninitialized extent.
3517 */ 3553 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3554 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3555 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3556 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3557 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3558 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3559 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3560out:
3525 if (allocated > max_blocks) 3561 if (allocated > map->m_len)
3526 allocated = max_blocks; 3562 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3563 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3564 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3565 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3566 map->m_len = allocated;
3531out2: 3567out2:
3532 if (path) { 3568 if (path) {
3533 ext4_ext_drop_refs(path); 3569 ext4_ext_drop_refs(path);
@@ -3625,7 +3661,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3661 * can proceed even if the new size is the same as i_size.
3626 */ 3662 */
3627 if (new_size > i_size_read(inode)) 3663 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3664 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3665 }
3630 3666
3631} 3667}
@@ -3640,55 +3676,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3676long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3677{
3642 handle_t *handle; 3678 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3679 loff_t new_size;
3645 unsigned int max_blocks; 3680 unsigned int max_blocks;
3646 int ret = 0; 3681 int ret = 0;
3647 int ret2 = 0; 3682 int ret2 = 0;
3648 int retries = 0; 3683 int retries = 0;
3649 struct buffer_head map_bh; 3684 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3685 unsigned int credits, blkbits = inode->i_blkbits;
3651 3686
3652 /* 3687 /*
3653 * currently supporting (pre)allocate mode for extent-based 3688 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3689 * files _only_
3655 */ 3690 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3691 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3692 return -EOPNOTSUPP;
3658 3693
3659 /* preallocation to directories is currently not supported */ 3694 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3695 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3696 return -ENODEV;
3662 3697
3663 block = offset >> blkbits; 3698 map.m_lblk = offset >> blkbits;
3664 /* 3699 /*
3665 * We can't just convert len to max_blocks because 3700 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3701 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3702 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3703 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3704 - map.m_lblk;
3670 /* 3705 /*
3671 * credits to insert 1 extent into extent tree 3706 * credits to insert 1 extent into extent tree
3672 */ 3707 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3708 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3709 mutex_lock(&inode->i_mutex);
3710 ret = inode_newsize_ok(inode, (len + offset));
3711 if (ret) {
3712 mutex_unlock(&inode->i_mutex);
3713 return ret;
3714 }
3675retry: 3715retry:
3676 while (ret >= 0 && ret < max_blocks) { 3716 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3717 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3718 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3719 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3720 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3721 ret = PTR_ERR(handle);
3682 break; 3722 break;
3683 } 3723 }
3684 map_bh.b_state = 0; 3724 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3725 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3726 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3727#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3728 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3729 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3730 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3731 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3732 inode->i_ino, block, max_blocks);
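The comment's example is worth working through: with blocksize 4096, offset 3072 and len 2048 the write touches blocks 0 and 1, yet len >> blkbits is 0, so max_blocks has to come from the block-aligned end minus the start block. A sketch (EXT4_BLOCK_ALIGN rounds up; modeled here with a mask):

    #include <stdio.h>

    int main(void)
    {
            unsigned int blkbits = 12; /* blocksize 4096 */
            unsigned long long offset = 3072, len = 2048;
            unsigned long long mask = (1ULL << blkbits) - 1;

            unsigned long long start = offset >> blkbits;              /* 0 */
            unsigned long long end = (offset + len + mask) >> blkbits; /* 2 */

            printf("max_blocks = %llu, len >> blkbits = %llu\n",
                   end - start, len >> blkbits);
            return 0;
    }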
@@ -3697,14 +3735,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3735 ret2 = ext4_journal_stop(handle);
3698 break; 3736 break;
3699 } 3737 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3738 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3739 blkbits) >> blkbits))
3702 new_size = offset + len; 3740 new_size = offset + len;
3703 else 3741 else
3704 new_size = (block + ret) << blkbits; 3742 new_size = (map.m_lblk + ret) << blkbits;
3705 3743
3706 ext4_falloc_update_inode(inode, mode, new_size, 3744 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3745 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3746 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3747 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3748 if (ret2)
@@ -3733,42 +3771,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3771 ssize_t len)
3734{ 3772{
3735 handle_t *handle; 3773 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3774 unsigned int max_blocks;
3738 int ret = 0; 3775 int ret = 0;
3739 int ret2 = 0; 3776 int ret2 = 0;
3740 struct buffer_head map_bh; 3777 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3778 unsigned int credits, blkbits = inode->i_blkbits;
3742 3779
3743 block = offset >> blkbits; 3780 map.m_lblk = offset >> blkbits;
3744 /* 3781 /*
3745 * We can't just convert len to max_blocks because the range 3782 * We can't just convert len to max_blocks because the range
3746 * may straddle a block boundary, e.g. blocksize = 4096, offset = 3072, len = 2048 3783 * may straddle a block boundary, e.g. blocksize = 4096, offset = 3072, len = 2048
3747 */ 3784 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3785 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3786 map.m_lblk);
3750 /* 3787 /*
3751 * credits to insert 1 extent into extent tree 3788 * credits to insert 1 extent into extent tree
3752 */ 3789 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3790 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3791 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3792 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3793 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3794 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3795 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3796 ret = PTR_ERR(handle);
3760 break; 3797 break;
3761 } 3798 }
3762 map_bh.b_state = 0; 3799 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3800 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3801 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3802 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3803 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3804 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3805 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3806 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3807 }
3773 ext4_mark_inode_dirty(handle, inode); 3808 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3809 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3933,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3933 int error = 0;
3899 3934
3900 /* fallback to generic here if not in extents fmt */ 3935 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3936 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3937 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3938 ext4_get_block);
3904 3939
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..ee92b66d4558 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,11 +66,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
73 if (pos > sbi->s_bitmap_maxbytes) 73 if ((pos > sbi->s_bitmap_maxbytes ||
74 (pos == sbi->s_bitmap_maxbytes && length > 0)))
74 return -EFBIG; 75 return -EFBIG;
75 76
76 if (pos + length > sbi->s_bitmap_maxbytes) { 77 if (pos + length > sbi->s_bitmap_maxbytes) {
@@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
123 if (!IS_ERR(cp)) { 124 if (!IS_ERR(cp)) {
124 memcpy(sbi->s_es->s_last_mounted, cp, 125 memcpy(sbi->s_es->s_last_mounted, cp,
125 sizeof(sbi->s_es->s_last_mounted)); 126 sizeof(sbi->s_es->s_last_mounted));
126 sb->s_dirt = 1; 127 ext4_mark_super_dirty(sb);
127 } 128 }
128 } 129 }
129 return dquot_file_open(inode, filp); 130 return dquot_file_open(inode, filp);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
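The walk in ext4_sync_parent() stops at the first ancestor not flagged as newly created, so directories that were already durable are not re-synced. A simplified userspace model with stand-in structures (the real code follows dentry->d_parent and calls sync_mapping_buffers() on each parent inode):

    #include <stdbool.h>
    #include <stddef.h>

    struct dir {
            bool new_entry;     /* models EXT4_STATE_NEWENTRY */
            struct dir *parent; /* NULL at the filesystem root */
    };

    static void sync_parents(struct dir *d)
    {
            while (d && d->new_entry) {
                    d->new_entry = false;
                    /* sync_mapping_buffers(d->inode->i_mapping) here */
                    d = d->parent;
            }
    }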
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 127 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 jbd2_log_wait_commit(journal, commit_tid); 131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
107 return ret; 136 return ret;
108} 137}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..45853e0d1f21 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -222,7 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
222 is_directory = S_ISDIR(inode->i_mode); 222 is_directory = S_ISDIR(inode->i_mode);
223 223
224 /* Do this BEFORE marking the inode not in use or returning an error */ 224 /* Do this BEFORE marking the inode not in use or returning an error */
225 clear_inode(inode); 225 ext4_clear_inode(inode);
226 226
227 es = EXT4_SB(sb)->s_es; 227 es = EXT4_SB(sb)->s_es;
228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 228 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter); 262 }
276 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
277 if (sbi->s_log_groups_per_flex) { 264 ext4_unlock_group(sb, block_group);
278 ext4_group_t f; 265
279 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
280 f = ext4_flex_group(sbi, block_group); 267 if (sbi->s_log_groups_per_flex) {
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes); 268 ext4_group_t f = ext4_flex_group(sbi, block_group);
282 } 269
283 } 270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 271 if (is_directory)
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2); 272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
286 if (!fatal) fatal = err;
287 } 273 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
290 if (!fatal) 276out:
291 fatal = err; 277 if (cleared) {
292 sb->s_dirt = 1; 278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 ext4_mark_super_dirty(sb);
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -972,23 +965,19 @@ got:
972 percpu_counter_dec(&sbi->s_freeinodes_counter); 965 percpu_counter_dec(&sbi->s_freeinodes_counter);
973 if (S_ISDIR(mode)) 966 if (S_ISDIR(mode))
974 percpu_counter_inc(&sbi->s_dirs_counter); 967 percpu_counter_inc(&sbi->s_dirs_counter);
975 sb->s_dirt = 1; 968 ext4_mark_super_dirty(sb);
976 969
977 if (sbi->s_log_groups_per_flex) { 970 if (sbi->s_log_groups_per_flex) {
978 flex_group = ext4_flex_group(sbi, group); 971 flex_group = ext4_flex_group(sbi, group);
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 972 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 973 }
981 974
982 inode->i_uid = current_fsuid(); 975 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 976 inode->i_mode = mode;
984 inode->i_gid = dir->i_gid; 977 inode->i_uid = current_fsuid();
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid; 978 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 979 } else
990 inode->i_gid = current_fsgid(); 980 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 981
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 982 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 983 /* This is the optimal IO size (for stat), not the fs block size */
@@ -1045,7 +1034,7 @@ got:
1045 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1046 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1047 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1048 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1049 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1050 } 1039 }
1051 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d605412844..4b8debeb3965 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -167,11 +167,16 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
167/* 167/*
168 * Called at the last iput() if i_nlink is zero. 168 * Called at the last iput() if i_nlink is zero.
169 */ 169 */
170void ext4_delete_inode(struct inode *inode) 170void ext4_evict_inode(struct inode *inode)
171{ 171{
172 handle_t *handle; 172 handle_t *handle;
173 int err; 173 int err;
174 174
175 if (inode->i_nlink) {
176 truncate_inode_pages(&inode->i_data, 0);
177 goto no_delete;
178 }
179
175 if (!is_bad_inode(inode)) 180 if (!is_bad_inode(inode))
176 dquot_initialize(inode); 181 dquot_initialize(inode);
177 182
@@ -221,6 +226,7 @@ void ext4_delete_inode(struct inode *inode)
221 "couldn't extend journal (err %d)", err); 226 "couldn't extend journal (err %d)", err);
222 stop_handle: 227 stop_handle:
223 ext4_journal_stop(handle); 228 ext4_journal_stop(handle);
229 ext4_orphan_del(NULL, inode);
224 goto no_delete; 230 goto no_delete;
225 } 231 }
226 } 232 }
@@ -245,13 +251,13 @@ void ext4_delete_inode(struct inode *inode)
245 */ 251 */
246 if (ext4_mark_inode_dirty(handle, inode)) 252 if (ext4_mark_inode_dirty(handle, inode))
247 /* If that failed, just do the required in-core inode clear. */ 253 /* If that failed, just do the required in-core inode clear. */
248 clear_inode(inode); 254 ext4_clear_inode(inode);
249 else 255 else
250 ext4_free_inode(handle, inode); 256 ext4_free_inode(handle, inode);
251 ext4_journal_stop(handle); 257 ext4_journal_stop(handle);
252 return; 258 return;
253no_delete: 259no_delete:
254 clear_inode(inode); /* We must guarantee clearing of inode... */ 260 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
255} 261}
256 262
257typedef struct { 263typedef struct {
@@ -337,9 +343,11 @@ static int ext4_block_to_path(struct inode *inode,
337 return n; 343 return n;
338} 344}
339 345
340static int __ext4_check_blockref(const char *function, struct inode *inode, 346static int __ext4_check_blockref(const char *function, unsigned int line,
347 struct inode *inode,
341 __le32 *p, unsigned int max) 348 __le32 *p, unsigned int max)
342{ 349{
350 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
343 __le32 *bref = p; 351 __le32 *bref = p;
344 unsigned int blk; 352 unsigned int blk;
345 353
@@ -348,9 +356,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 356 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 357 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 358 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 359 es->s_last_error_block = cpu_to_le64(blk);
352 "invalid block reference %u " 360 ext4_error_inode(inode, function, line, blk,
353 "in inode #%lu", blk, inode->i_ino); 361 "invalid block");
354 return -EIO; 362 return -EIO;
355 } 363 }
356 } 364 }
@@ -359,11 +367,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
359 367
360 368
361#define ext4_check_indirect_blockref(inode, bh) \ 369#define ext4_check_indirect_blockref(inode, bh) \
362 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 370 __ext4_check_blockref(__func__, __LINE__, inode, \
371 (__le32 *)(bh)->b_data, \
363 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 372 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
364 373
365#define ext4_check_inode_blockref(inode) \ 374#define ext4_check_inode_blockref(inode) \
366 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 375 __ext4_check_blockref(__func__, __LINE__, inode, \
376 EXT4_I(inode)->i_data, \
367 EXT4_NDIR_BLOCKS) 377 EXT4_NDIR_BLOCKS)
368 378
369/** 379/**
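Because __func__ and __LINE__ are evaluated where the macro is expanded, every message in the hunk above names the caller rather than the shared checking helper. A small sketch of the pattern:

    #include <stdio.h>

    static void report(const char *function, unsigned int line,
                       unsigned int blk)
    {
            fprintf(stderr, "%s:%u: invalid block %u\n", function, line, blk);
    }

    /* Expanded at the call site, so __func__/__LINE__ identify the caller. */
    #define check_block(blk) report(__func__, __LINE__, (blk))

    int main(void)
    {
            check_block(0); /* prints "main:<line>: invalid block 0" */
            return 0;
    }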
@@ -785,7 +795,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 795 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 796 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 797 for (i = 1; i <= n ; i++) {
788 /* 798 /*
789 * branch[i].bh is newly allocated, so there is no 799 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 800 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 801 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +885,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 885
876err_out: 886err_out:
877 for (i = 1; i <= num; i++) { 887 for (i = 1; i <= num; i++) {
878 /* 888 /*
879 * branch[i].bh is newly allocated, so there is no 889 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 890 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 891 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +900,9 @@ err_out:
890} 900}
891 901
892/* 902/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 903 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 904 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 905 * scheme) for ext4_map_blocks().
896 * 906 *
897 * Allocation strategy is simple: if we have to allocate something, we will 907 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 908 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +927,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 927 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 928 * blocks.
919 */ 929 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 930static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 931 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 932 int flags)
924{ 933{
925 int err = -EIO; 934 int err = -EIO;
@@ -933,9 +942,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 942 int count = 0;
934 ext4_fsblk_t first_block = 0; 943 ext4_fsblk_t first_block = 0;
935 944
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 945 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 946 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 947 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 948 &blocks_to_boundary);
940 949
941 if (depth == 0) 950 if (depth == 0)
@@ -946,10 +955,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 955 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 956 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 957 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 958 count++;
951 /*map more blocks*/ 959 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 960 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 961 ext4_fsblk_t blk;
954 962
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 963 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +977,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 977 /*
970 * Okay, we need to do block allocation. 978 * Okay, we need to do block allocation.
971 */ 979 */
972 goal = ext4_find_goal(inode, iblock, partial); 980 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 981
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 982 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 983 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +987,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 987 * direct blocks to allocate for this branch.
980 */ 988 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 989 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 990 map->m_len, blocks_to_boundary);
983 /* 991 /*
984 * Block out ext4_truncate while we alter the tree 992 * Block out ext4_truncate while we alter the tree
985 */ 993 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 994 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 995 &count, goal,
988 offsets + (partial - chain), partial); 996 offsets + (partial - chain), partial);
989 997
@@ -995,18 +1003,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 1003 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 1004 */
997 if (!err) 1005 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 1006 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 1007 partial, indirect_blks, count);
1000 if (err) 1008 if (err)
1001 goto cleanup; 1009 goto cleanup;
1002 1010
1003 set_buffer_new(bh_result); 1011 map->m_flags |= EXT4_MAP_NEW;
1004 1012
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1013 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1014got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1015 map->m_flags |= EXT4_MAP_MAPPED;
1016 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1017 map->m_len = count;
1008 if (count > blocks_to_boundary) 1018 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1019 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1020 err = count;
1011 /* Clean up and exit */ 1021 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1022 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1026,6 @@ cleanup:
1016 brelse(partial->bh); 1026 brelse(partial->bh);
1017 partial--; 1027 partial--;
1018 } 1028 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1029out:
1021 return err; 1030 return err;
1022} 1031}
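
This hunk finishes converting the indirect path from the (iblock, max_blocks, bh_result) triple to a single struct that carries the request in via m_lblk/m_len and the result back out in m_pblk, m_len and m_flags. A compilable sketch of that calling convention, with made-up MAP_* values standing in for the real EXT4_MAP_* flags:

    #include <stdint.h>
    #include <stdio.h>

    #define MAP_NEW      0x01 /* blocks were freshly allocated */
    #define MAP_MAPPED   0x02 /* m_pblk and m_len are valid */
    #define MAP_BOUNDARY 0x04 /* mapping ends at a metadata boundary */

    struct block_map {
            uint64_t m_lblk;  /* in:  first logical block */
            unsigned m_len;   /* in:  blocks wanted; out: blocks mapped */
            uint64_t m_pblk;  /* out: first physical block */
            unsigned m_flags; /* out: MAP_* result bits */
    };

    /* Toy lookup: pretend logical blocks sit at physical + 1000. */
    static int map_blocks(struct block_map *map)
    {
            map->m_pblk = map->m_lblk + 1000;
            map->m_flags = MAP_MAPPED;
            return (int)map->m_len; /* count of mapped blocks */
    }

    int main(void)
    {
            struct block_map map = { .m_lblk = 42, .m_len = 8 };
            int n = map_blocks(&map);

            if (n > 0 && (map.m_flags & MAP_MAPPED))
                    printf("mapped %d blocks at pblk %llu\n", n,
                           (unsigned long long)map.m_pblk);
            return 0;
    }

Keeping the return value for counts and errors while the struct carries the flag bits is what lets the got_it path above set EXT4_MAP_MAPPED, m_pblk and m_len in one place.
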
@@ -1061,7 +1070,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1070 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1071static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1072{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1073 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1074 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1075
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1076 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1085,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1085{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1086 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1087 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1088
1081 spin_lock(&ei->i_block_reservation_lock); 1089 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1090 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1099,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1099
1092 /* Update per-inode reservations */ 1100 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1101 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1102 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1103 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1104 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1105 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1106
1100 if (ei->i_reserved_data_blocks == 0) { 1107 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1108 /*
@@ -1103,30 +1110,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1110 * only when we have written all of the delayed
1104 * allocation blocks. 1111 * allocation blocks.
1105 */ 1112 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1113 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1114 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1115 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1116 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1117 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1118 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1119
1113 /* Update quota subsystem */ 1120 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1121 if (quota_claim)
1115 dquot_claim_block(inode, used); 1122 dquot_claim_block(inode, used);
1116 if (mdb_free) 1123 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1124 /*
1120 * We did fallocate with an offset that is already delayed 1125 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1126 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1127 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1128 */
1127 if (allocated_meta_blocks) 1129 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1130 }
1131 1131
1132 /* 1132 /*
@@ -1139,20 +1139,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1139 ext4_discard_preallocations(inode);
1140} 1140}
1141 1141
1142static int check_block_validity(struct inode *inode, const char *msg, 1142static int __check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1143 unsigned int line,
1144 struct ext4_map_blocks *map)
1144{ 1145{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1146 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1147 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1148 ext4_error_inode(inode, func, line, map->m_pblk,
1148 "(size %d)", inode->i_ino, 1149 "lblock %lu mapped to illegal pblock "
1149 (unsigned long long) logical, 1150 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1151 map->m_len);
1151 return -EIO; 1152 return -EIO;
1152 } 1153 }
1153 return 0; 1154 return 0;
1154} 1155}
1155 1156
1157#define check_block_validity(inode, map) \
1158 __check_block_validity((inode), __func__, __LINE__, (map))
1159
1156/* 1160/*
1157 * Return the number of contiguous dirty pages in a given inode 1161 * Return the number of contiguous dirty pages in a given inode
1158 * starting at page frame idx. 1162 * starting at page frame idx.
@@ -1212,15 +1216,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1216}
1213 1217
1214/* 1218/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1219 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1220 * and returns if the blocks are already mapped.
1217 * 1221 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocates blocks 1222 * Otherwise it takes the write lock of the i_data_sem and allocates blocks
1219 * and stores the allocated blocks in the result buffer head and marks it 1223 * and stores the allocated blocks in the result buffer head and marks it
1220 * mapped. 1224 * mapped.
1221 * 1225 *
1222 * If file type is extents based, it will call ext4_ext_get_blocks(), 1226 * If file type is extents based, it will call ext4_ext_map_blocks(),
1223 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1227 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1228 * based files
1225 * 1229 *
1226 * On success, it returns the number of blocks being mapped or allocated. 1230 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1237,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1237 *
1234 * It returns the error in case of allocation failure. 1238 * It returns the error in case of allocation failure.
1235 */ 1239 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1240int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1241 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1242{
1240 int retval; 1243 int retval;
1241 1244
1242 clear_buffer_mapped(bh); 1245 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1246 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1247 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1248 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1249 /*
1249 * Try to see if we can get the block without requesting a new 1250 * Try to see if we can get the block without requesting a new
1250 * file system block. 1251 * file system block.
1251 */ 1252 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1253 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1254 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1255 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1256 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1257 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1258 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1259 up_read((&EXT4_I(inode)->i_data_sem));
1261 1260
1262 if (retval > 0 && buffer_mapped(bh)) { 1261 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1262 int ret = check_block_validity(inode, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1263 if (ret != 0)
1266 return ret; 1264 return ret;
1267 } 1265 }
@@ -1277,7 +1275,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * ext4_ext_get_block() returns, for create = 0, 1275 * ext4_ext_get_block() returns, for create = 0,
1278 * with buffer head unmapped. 1276 * with buffer head unmapped.
1279 */ 1277 */
1280 if (retval > 0 && buffer_mapped(bh)) 1278 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1279 return retval;
1282 1280
1283 /* 1281 /*
@@ -1290,7 +1288,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1288 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1289 * set on the buffer_head.
1292 */ 1290 */
1293 clear_buffer_unwritten(bh); 1291 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1292
1295 /* 1293 /*
1296 * New blocks allocate and/or writing to uninitialized extent 1294 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1310,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1310 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1311 * could have changed the inode type in between
1314 */ 1312 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1313 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1314 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1315 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1316 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1317
1322 if (retval > 0 && buffer_new(bh)) { 1318 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1319 /*
1324 * We allocated new blocks which will result in 1320 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1321 * i_data's format changing. Force the migrate
@@ -1342,10 +1338,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1338 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1339
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1340 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1341 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1342 int ret = check_block_validity(inode, map);
1347 "corruption after allocation",
1348 block, bh->b_blocknr, retval);
1349 if (ret != 0) 1343 if (ret != 0)
1350 return ret; 1344 return ret;
1351 } 1345 }
@@ -1355,109 +1349,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1349/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1350#define DIO_MAX_BLOCKS 4096
1357 1351
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1352static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1353 struct buffer_head *bh, int flags)
1360{ 1354{
1361 handle_t *handle = ext4_journal_current_handle(); 1355 handle_t *handle = ext4_journal_current_handle();
1356 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1357 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1358 int dio_credits;
1365 1359
1366 if (create && !handle) { 1360 map.m_lblk = iblock;
1361 map.m_len = bh->b_size >> inode->i_blkbits;
1362
1363 if (flags && !handle) {
1367 /* Direct IO write... */ 1364 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1365 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1366 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1367 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1368 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1369 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1370 ret = PTR_ERR(handle);
1374 goto out; 1371 return ret;
1375 } 1372 }
1376 started = 1; 1373 started = 1;
1377 } 1374 }
1378 1375
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1376 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1377 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1378 map_bh(bh, inode->i_sb, map.m_pblk);
1379 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1380 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1381 ret = 0;
1384 } 1382 }
1385 if (started) 1383 if (started)
1386 ext4_journal_stop(handle); 1384 ext4_journal_stop(handle);
1387out:
1388 return ret; 1385 return ret;
1389} 1386}
1390 1387
1388int ext4_get_block(struct inode *inode, sector_t iblock,
1389 struct buffer_head *bh, int create)
1390{
1391 return _ext4_get_block(inode, iblock, bh,
1392 create ? EXT4_GET_BLOCKS_CREATE : 0);
1393}
1394
1391/* 1395/*
1392 * `handle' can be NULL if create is zero 1396 * `handle' can be NULL if create is zero
1393 */ 1397 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1398struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1399 ext4_lblk_t block, int create, int *errp)
1396{ 1400{
1397 struct buffer_head dummy; 1401 struct ext4_map_blocks map;
1402 struct buffer_head *bh;
1398 int fatal = 0, err; 1403 int fatal = 0, err;
1399 int flags = 0;
1400 1404
1401 J_ASSERT(handle != NULL || create == 0); 1405 J_ASSERT(handle != NULL || create == 0);
1402 1406
1403 dummy.b_state = 0; 1407 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1408 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1409 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1410 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1411
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1412 if (err < 0)
1409 /* 1413 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1414 if (err <= 0)
1411 * case of a HOLE. 1415 return NULL;
1412 */ 1416 *errp = 0;
1413 if (err > 0) { 1417
1414 if (err > 1) 1418 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1419 if (!bh) {
1416 err = 0; 1420 *errp = -EIO;
1421 return NULL;
1417 } 1422 }
1418 *errp = err; 1423 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1424 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1425 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1426
1430 /* 1427 /*
1431 * Now that we do not always journal data, we should 1428 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1429 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1430 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1431 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1432 * problem.
1436 */ 1433 */
1437 lock_buffer(bh); 1434 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1435 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1436 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1437 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1438 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1439 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1440 }
1457 return bh; 1441 unlock_buffer(bh);
1442 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1443 err = ext4_handle_dirty_metadata(handle, inode, bh);
1444 if (!fatal)
1445 fatal = err;
1446 } else {
1447 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1448 }
1459err: 1449 if (fatal) {
1460 return NULL; 1450 *errp = fatal;
1451 brelse(bh);
1452 bh = NULL;
1453 }
1454 return bh;
1461} 1455}
1462 1456
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1457struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
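
_ext4_get_block() now translates the map result back into buffer-head terms; the line bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags overwrites only the mapper-owned bits and leaves the rest of b_state alone. The bit-merge idiom in isolation, with illustrative masks rather than the kernel's actual BH_* layout:

    #include <stdio.h>

    #define MAP_FLAGS  0x0fu /* low bits belong to the mapper */
    #define STATE_LOCK 0x10u /* unrelated bit that must survive */

    int main(void)
    {
            unsigned state = STATE_LOCK | 0x3u; /* lock + stale map bits */
            unsigned flags = 0x5u;              /* fresh mapper result */

            /* Clear only the mapper-owned bits, then OR in the new ones. */
            state = (state & ~MAP_FLAGS) | flags;

            printf("state = %#x\n", state); /* 0x15: lock bit survived */
            return 0;
    }
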
@@ -1538,9 +1532,25 @@ static int walk_page_buffers(handle_t *handle,
1538static int do_journal_get_write_access(handle_t *handle, 1532static int do_journal_get_write_access(handle_t *handle,
1539 struct buffer_head *bh) 1533 struct buffer_head *bh)
1540{ 1534{
1535 int dirty = buffer_dirty(bh);
1536 int ret;
1537
1541 if (!buffer_mapped(bh) || buffer_freed(bh)) 1538 if (!buffer_mapped(bh) || buffer_freed(bh))
1542 return 0; 1539 return 0;
1543 return ext4_journal_get_write_access(handle, bh); 1540 /*
1541 * __block_prepare_write() could have dirtied some buffers. Clean
1542 * the dirty bit as jbd2_journal_get_write_access() could complain
1543 * otherwise about fs integrity issues. Setting of the dirty bit
1544 * by __block_prepare_write() isn't a real problem here as we clear
1545 * the bit before releasing a page lock and thus writeback cannot
1546 * ever write the buffer.
1547 */
1548 if (dirty)
1549 clear_buffer_dirty(bh);
1550 ret = ext4_journal_get_write_access(handle, bh);
1551 if (!ret && dirty)
1552 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1553 return ret;
1544} 1554}
1545 1555
1546/* 1556/*
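
do_journal_get_write_access() now snapshots the dirty bit, clears it before jbd2 takes write access (which would otherwise complain, as the new comment explains), and re-dirties the buffer through the journal once access is granted. A schematic restatement with stubbed helpers; the stubs are assumptions for illustration, not jbd2 calls:

    #include <stdio.h>

    struct buf { int dirty; };

    static int journal_get_write_access(struct buf *b)
    {
            return b->dirty ? -1 : 0; /* jbd2 would warn on a dirty buffer */
    }

    static int journal_dirty_metadata(struct buf *b)
    {
            b->dirty = 1; /* redirtied under journal control */
            return 0;
    }

    static int get_write_access(struct buf *b)
    {
            int was_dirty = b->dirty;
            int ret;

            if (was_dirty)
                    b->dirty = 0;  /* pacify the journal's check */
            ret = journal_get_write_access(b);
            if (!ret && was_dirty) /* hand the dirty state back */
                    ret = journal_dirty_metadata(b);
            return ret;
    }

    int main(void)
    {
            struct buf b = { .dirty = 1 };
            printf("ret=%d dirty=%d\n", get_write_access(&b), b.dirty);
            return 0;
    }
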
@@ -1597,11 +1607,9 @@ retry:
1597 *pagep = page; 1607 *pagep = page;
1598 1608
1599 if (ext4_should_dioread_nolock(inode)) 1609 if (ext4_should_dioread_nolock(inode))
1600 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1610 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
1601 fsdata, ext4_get_block_write);
1602 else 1611 else
1603 ret = block_write_begin(file, mapping, pos, len, flags, pagep, 1612 ret = __block_write_begin(page, pos, len, ext4_get_block);
1604 fsdata, ext4_get_block);
1605 1613
1606 if (!ret && ext4_should_journal_data(inode)) { 1614 if (!ret && ext4_should_journal_data(inode)) {
1607 ret = walk_page_buffers(handle, page_buffers(page), 1615 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1612,7 +1620,7 @@ retry:
1612 unlock_page(page); 1620 unlock_page(page);
1613 page_cache_release(page); 1621 page_cache_release(page);
1614 /* 1622 /*
1615 * block_write_begin may have instantiated a few blocks 1623 * __block_write_begin may have instantiated a few blocks
1616 * outside i_size. Trim these off again. Don't need 1624 * outside i_size. Trim these off again. Don't need
1617 * i_size_read because we hold i_mutex. 1625 * i_size_read because we hold i_mutex.
1618 * 1626 *
@@ -1860,7 +1868,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1868 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1869 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1870 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1871 unsigned long md_needed;
1864 int ret; 1872 int ret;
1865 1873
1866 /* 1874 /*
@@ -1870,22 +1878,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1878 */
1871repeat: 1879repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1880 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1881 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1882 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1883 spin_unlock(&ei->i_block_reservation_lock);
1877 1884
1878 /* 1885 /*
1879 * Make quota reservation here to prevent quota overflow 1886 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1887 * us from metadata over-estimation, though we may go over by
1881 * time. 1888 * a small amount in the end. Here we just reserve for data.
1882 */ 1889 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1890 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1891 if (ret)
1885 return ret; 1892 return ret;
1886 1893 /*
1894 * We do still charge estimated metadata to the sb though;
1895 * we cannot afford to run out of free blocks.
1896 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1897 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1898 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1899 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1900 yield();
1891 goto repeat; 1901 goto repeat;
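
The reworked reservation charges quota for the single data block only, while the metadata estimate is still claimed against the filesystem's free-block count; when that claim fails, the quota reservation is rolled back before retrying. A stub sketch of that ordering, assuming invented dquot_reserve()/claim_free() helpers in place of the real quota and superblock calls:

    #include <stdio.h>

    static long free_blocks = 2;  /* toy filesystem state */
    static long quota_left = 10;

    static int dquot_reserve(int n)
    {
            if (quota_left < n)
                    return -1;
            quota_left -= n;
            return 0;
    }

    static void dquot_release(int n) { quota_left += n; }

    static int claim_free(int n)
    {
            if (free_blocks < n)
                    return -1;
            free_blocks -= n;
            return 0;
    }

    /* One data block in quota; data plus metadata against free space. */
    static int reserve_space(int md_needed)
    {
            if (dquot_reserve(1))
                    return -1;
            if (claim_free(md_needed + 1)) {
                    dquot_release(1); /* undo quota before failing */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            printf("first: %d\n", reserve_space(1));  /* succeeds */
            printf("second: %d, quota=%ld\n",
                   reserve_space(1), quota_left);     /* fails, quota intact */
            return 0;
    }
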
@@ -1910,6 +1920,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1920
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1921 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1922
1923 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1924 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1925 /*
1915 * if there aren't enough reserved blocks, then the 1926 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1943,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1943 * only when we have written all of the delayed
1933 * allocation blocks. 1944 * allocation blocks.
1934 */ 1945 */
1935 to_free += ei->i_reserved_meta_blocks; 1946 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1947 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1948 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1949 ei->i_da_metadata_calc_len = 0;
1938 } 1950 }
1939 1951
1940 /* update fs dirty blocks counter */ 1952 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1953 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1954
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1955 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2054,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2054/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2055 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2056 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 the function goes through all passed space and puts actual disk 2057 the function goes through all passed space and puts actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2058 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2059 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2060static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2061 struct ext4_map_blocks *map)
2055{ 2062{
2056 struct inode *inode = mpd->inode; 2063 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2064 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2065 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2066 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2067 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2068 pgoff_t index, end;
2062 struct pagevec pvec; 2069 struct pagevec pvec;
2063 int nr_pages, i; 2070 int nr_pages, i;
2064 2071
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2072 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2073 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2074 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2075
2069 pagevec_init(&pvec, 0); 2076 pagevec_init(&pvec, 0);
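
The index arithmetic above converts logical block numbers into page-cache indices by shifting with (PAGE_CACHE_SHIFT - i_blkbits). A worked example with assumed values, 4 KiB pages and 1 KiB blocks:

    #include <stdio.h>

    int main(void)
    {
            unsigned page_shift = 12, blkbits = 10; /* 4K pages, 1K blocks */
            unsigned long lblk = 42, blocks = 8;

            unsigned long index = lblk >> (page_shift - blkbits);
            unsigned long end = (lblk + blocks - 1) >> (page_shift - blkbits);
            unsigned long first = index << (page_shift - blkbits);

            /* blocks 42..49 span pages 10..12; page 10 starts at block 40 */
            printf("pages %lu..%lu, page %lu starts at block %lu\n",
                   index, end, index, first);
            return 0;
    }
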
@@ -2090,17 +2097,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2097
2091 /* skip blocks out of the range */ 2098 /* skip blocks out of the range */
2092 do { 2099 do {
2093 if (cur_logical >= logical) 2100 if (cur_logical >= map->m_lblk)
2094 break; 2101 break;
2095 cur_logical++; 2102 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2103 } while ((bh = bh->b_this_page) != head);
2097 2104
2098 do { 2105 do {
2099 if (cur_logical >= logical + blocks) 2106 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2107 break;
2101 2108
2102 if (buffer_delay(bh) || 2109 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2110
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2111 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2112
@@ -2119,7 +2125,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2125 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2126 BUG_ON(bh->b_blocknr != pblock);
2121 2127
2122 if (buffer_uninit(exbh)) 2128 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2129 set_buffer_uninit(bh);
2124 cur_logical++; 2130 cur_logical++;
2125 pblock++; 2131 pblock++;
@@ -2130,21 +2136,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2136}
2131 2137
2132 2138
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2139static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2140 sector_t logical, long blk_cnt)
2150{ 2141{
@@ -2206,7 +2197,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2197static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2198{
2208 int err, blks, get_blocks_flags; 2199 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2200 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2201 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2202 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2203 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2230,7 +2221,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2230 BUG_ON(!handle); 2221 BUG_ON(!handle);
2231 2222
2232 /* 2223 /*
2233 * Call ext4_get_blocks() to allocate any delayed allocation 2224 * Call ext4_map_blocks() to allocate any delayed allocation
2234 * blocks, or to convert an uninitialized extent to be 2225 * blocks, or to convert an uninitialized extent to be
2235 * initialized (in the case where we have written into 2226 * initialized (in the case where we have written into
2236 * one or more preallocated blocks). 2227 * one or more preallocated blocks).
@@ -2239,7 +2230,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2239 * indicate that we are on the delayed allocation path. This 2230 * indicate that we are on the delayed allocation path. This
2240 * affects functions in many different parts of the allocation 2231 * affects functions in many different parts of the allocation
2241 * call path. This flag exists primarily because we don't 2232 * call path. This flag exists primarily because we don't
2242 * want to change *many* call functions, so ext4_get_blocks() 2233 * want to change *many* call functions, so ext4_map_blocks()
2243 * will set the magic i_delalloc_reserved_flag once the 2234 * will set the magic i_delalloc_reserved_flag once the
2244 * inode's allocation semaphore is taken. 2235 * inode's allocation semaphore is taken.
2245 * 2236 *
@@ -2247,16 +2238,18 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2238 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2239 * variables are updated after the blocks have been allocated.
2249 */ 2240 */
2250 new.b_state = 0; 2241 map.m_lblk = next;
2242 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2243 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2244 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2245 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2246 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2247 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2248
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2249 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2250 if (blks < 0) {
2251 struct super_block *sb = mpd->inode->i_sb;
2252
2260 err = blks; 2253 err = blks;
2261 /* 2254 /*
2262 * If get block returns with error we simply 2255 * If get block returns with error we simply
@@ -2267,7 +2260,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2267 return 0; 2260 return 0;
2268 2261
2269 if (err == -ENOSPC && 2262 if (err == -ENOSPC &&
2270 ext4_count_free_blocks(mpd->inode->i_sb)) { 2263 ext4_count_free_blocks(sb)) {
2271 mpd->retval = err; 2264 mpd->retval = err;
2272 return 0; 2265 return 0;
2273 } 2266 }
@@ -2279,16 +2272,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2279 * writepage and writepages will again try to write 2272 * writepage and writepages will again try to write
2280 * the same. 2273 * the same.
2281 */ 2274 */
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2275 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2283 "delayed block allocation failed for inode %lu at " 2276 ext4_msg(sb, KERN_CRIT,
2284 "logical offset %llu with max blocks %zd with " 2277 "delayed block allocation failed for inode %lu "
2285 "error %d\n", mpd->inode->i_ino, 2278 "at logical offset %llu with max blocks %zd "
2286 (unsigned long long) next, 2279 "with error %d", mpd->inode->i_ino,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2280 (unsigned long long) next,
2288 printk(KERN_CRIT "This should not happen!! " 2281 mpd->b_size >> mpd->inode->i_blkbits, err);
2289 "Data will be lost\n"); 2282 ext4_msg(sb, KERN_CRIT,
2290 if (err == -ENOSPC) { 2283 "This should not happen!! Data will be lost\n");
2291 ext4_print_free_blocks(mpd->inode); 2284 if (err == -ENOSPC)
2285 ext4_print_free_blocks(mpd->inode);
2292 } 2286 }
2293 /* invalidate all the pages */ 2287 /* invalidate all the pages */
2294 ext4_da_block_invalidatepages(mpd, next, 2288 ext4_da_block_invalidatepages(mpd, next,
@@ -2297,10 +2291,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2291 }
2298 BUG_ON(blks == 0); 2292 BUG_ON(blks == 0);
2299 2293
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2294 if (map.m_flags & EXT4_MAP_NEW) {
2295 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2296 int i;
2301 2297
2302 if (buffer_new(&new)) 2298 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2299 unmap_underlying_metadata(bdev, map.m_pblk + i);
2300 }
2304 2301
2305 /* 2302 /*
2306 * If blocks are delayed marked, we need to 2303 * If blocks are delayed marked, we need to
@@ -2308,7 +2305,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2305 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2306 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2307 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2308 mpage_put_bnr_to_bhs(mpd, &map);
2312 2309
2313 if (ext4_should_order_data(mpd->inode)) { 2310 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2311 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2346,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2346 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2347 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2348
2349 /*
2350 * XXX Don't go larger than mballoc is willing to allocate
2351 * This is a stopgap solution. We eventually need to fold
2352 * mpage_da_submit_io() into this function and then call
2353 * ext4_map_blocks() multiple times in a loop
2354 */
2355 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2356 goto flush_it;
2357
2352 /* check if the reserved journal credits might overflow */ 2358 /* check if the reserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2359 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2360 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2361 /*
2356 * With non-extent format we are limited by the journal 2362 * With non-extent format we are limited by the journal
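
The new early flush caps an extent at 8 MiB worth of blocks, i.e. 8*1024*1024 divided by the block size. What the cap works out to for common block sizes, as a quick standalone check:

    #include <stdio.h>

    int main(void)
    {
            unsigned sizes[] = { 1024, 2048, 4096 };

            for (int i = 0; i < 3; i++)
                    printf("blocksize %u -> cap %u blocks\n",
                           sizes[i], 8 * 1024 * 1024 / sizes[i]);
            /* 1024 -> 8192, 2048 -> 4096, 4096 -> 2048 */
            return 0;
    }
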
@@ -2423,17 +2429,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2429 struct buffer_head *bh, *head;
2424 sector_t logical; 2430 sector_t logical;
2425 2431
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the pages in the page_vec
2429 * redirty them and skip them. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2432 /*
2438 * Can we merge this page to current extent? 2433 * Can we merge this page to current extent?
2439 */ 2434 */
@@ -2528,8 +2523,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2523 * initialized properly.
2529 */ 2524 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2525static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2526 struct buffer_head *bh, int create)
2532{ 2527{
2528 struct ext4_map_blocks map;
2533 int ret = 0; 2529 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2530 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2531
@@ -2537,16 +2533,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2533 invalid_block = ~0;
2538 2534
2539 BUG_ON(create == 0); 2535 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2536 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2537
2538 map.m_lblk = iblock;
2539 map.m_len = 1;
2541 2540
2542 /* 2541 /*
2543 * first, we need to know whether the block is allocated already 2542 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should be treated 2543 * preallocated blocks are unmapped but should be treated
2545 * the same as allocated blocks. 2544 * the same as allocated blocks.
2546 */ 2545 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2546 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2547 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2548 return ret;
2549 if (ret == 0) {
2550 if (buffer_delay(bh))
2551 return 0; /* Not sure this could or should happen */
2550 /* 2552 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2553 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2554 * is it OK?
@@ -2556,62 +2558,47 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2558 /* not enough space to reserve */
2557 return ret; 2559 return ret;
2558 2560
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2561 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2562 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2563 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2564 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2565 }
2577 2566
2578 return ret; 2567 map_bh(bh, inode->i_sb, map.m_pblk);
2568 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2569
2570 if (buffer_unwritten(bh)) {
2571 /* A delayed write to unwritten bh should be marked
2572 * new and mapped. Mapped ensures that we don't do
2573 * get_block multiple times when we write to the same
2574 * offset and new ensures that we do proper zero out
2575 * for partial write.
2576 */
2577 set_buffer_new(bh);
2578 set_buffer_mapped(bh);
2579 }
2580 return 0;
2579} 2581}
2580 2582
2581/* 2583/*
2582 * This function is used as a standard get_block_t callback function 2584 * This function is used as a standard get_block_t callback function
2583 * when there is no desire to allocate any blocks. It is used as a 2585 * when there is no desire to allocate any blocks. It is used as a
2584 * callback function for block_prepare_write(), nobh_writepage(), and 2586 * callback function for block_prepare_write() and block_write_full_page().
2585 * block_write_full_page(). These functions should only try to map a 2587 * These functions should only try to map a single block at a time.
2586 * single block at a time.
2587 * 2588 *
2588 * Since this function doesn't do block allocations even if the caller 2589 * Since this function doesn't do block allocations even if the caller
2589 * requests it by passing in create=1, it is critically important that 2590 * requests it by passing in create=1, it is critically important that
2590 * any caller checks to make sure that any buffer heads returned 2591 * any caller checks to make sure that any buffer heads returned
2591 * by this function are either all already mapped or marked for 2592 * by this function are either all already mapped or marked for
2592 * delayed allocation before calling nobh_writepage() or 2593 * delayed allocation before calling block_write_full_page(). Otherwise,
2593 * block_write_full_page(). Otherwise, b_blocknr could be left 2594 * b_blocknr could be left uninitialized, and the page write functions will
2594 * uninitialized, and the page write functions will be taken by 2595 * be taken by surprise.
2595 * surprise.
2596 */ 2596 */
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2598 struct buffer_head *bh_result, int create)
2599{ 2599{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2600 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2601 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2602}
2616 2603
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2604static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2790,9 +2777,7 @@ static int ext4_writepage(struct page *page,
2790 return __ext4_journalled_writepage(page, len); 2777 return __ext4_journalled_writepage(page, len);
2791 } 2778 }
2792 2779
2793 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2780 if (page_bufs && buffer_uninit(page_bufs)) {
2794 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2795 else if (page_bufs && buffer_uninit(page_bufs)) {
2796 ext4_set_bh_endio(page_bufs, inode); 2781 ext4_set_bh_endio(page_bufs, inode);
2797 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2782 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2798 wbc, ext4_end_io_buffer_write); 2783 wbc, ext4_end_io_buffer_write);
@@ -2821,13 +2806,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2806 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2807 * number of contiguous block to a sane value
2823 */ 2808 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2809 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2810 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2811 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2812
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2813 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2814}
2830 2815
2816/*
2817 * write_cache_pages_da - walk the list of dirty pages of the given
2818 * address space and call the callback function (which usually writes
2819 * the pages).
2820 *
2821 * This is a forked version of write_cache_pages(). Differences:
2822 * Range cyclic is ignored.
2823 * no_nrwrite_index_update is always presumed true
2824 */
2825static int write_cache_pages_da(struct address_space *mapping,
2826 struct writeback_control *wbc,
2827 struct mpage_da_data *mpd)
2828{
2829 int ret = 0;
2830 int done = 0;
2831 struct pagevec pvec;
2832 int nr_pages;
2833 pgoff_t index;
2834 pgoff_t end; /* Inclusive */
2835 long nr_to_write = wbc->nr_to_write;
2836
2837 pagevec_init(&pvec, 0);
2838 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2839 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2840
2841 while (!done && (index <= end)) {
2842 int i;
2843
2844 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2845 PAGECACHE_TAG_DIRTY,
2846 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2847 if (nr_pages == 0)
2848 break;
2849
2850 for (i = 0; i < nr_pages; i++) {
2851 struct page *page = pvec.pages[i];
2852
2853 /*
2854 * At this point, the page may be truncated or
2855 * invalidated (changing page->mapping to NULL), or
2856 * even swizzled back from swapper_space to tmpfs file
2857 * mapping. However, page->index will not change
2858 * because we have a reference on the page.
2859 */
2860 if (page->index > end) {
2861 done = 1;
2862 break;
2863 }
2864
2865 lock_page(page);
2866
2867 /*
2868 * Page truncated or invalidated. We can freely skip it
2869 * then, even for data integrity operations: the page
2870 * has disappeared concurrently, so there could be no
2871 * real expectation of this data integrity operation
2872 * even if there is now a new, dirty page at the same
2873 * pagecache address.
2874 */
2875 if (unlikely(page->mapping != mapping)) {
2876continue_unlock:
2877 unlock_page(page);
2878 continue;
2879 }
2880
2881 if (!PageDirty(page)) {
2882 /* someone wrote it for us */
2883 goto continue_unlock;
2884 }
2885
2886 if (PageWriteback(page)) {
2887 if (wbc->sync_mode != WB_SYNC_NONE)
2888 wait_on_page_writeback(page);
2889 else
2890 goto continue_unlock;
2891 }
2892
2893 BUG_ON(PageWriteback(page));
2894 if (!clear_page_dirty_for_io(page))
2895 goto continue_unlock;
2896
2897 ret = __mpage_da_writepage(page, wbc, mpd);
2898 if (unlikely(ret)) {
2899 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2900 unlock_page(page);
2901 ret = 0;
2902 } else {
2903 done = 1;
2904 break;
2905 }
2906 }
2907
2908 if (nr_to_write > 0) {
2909 nr_to_write--;
2910 if (nr_to_write == 0 &&
2911 wbc->sync_mode == WB_SYNC_NONE) {
2912 /*
2913 * We stop writing back only if we are
2914 * not doing integrity sync. In case of
2915 * integrity sync we have to keep going
2916 * because someone may be concurrently
2917 * dirtying pages, and we might have
2918 * synced a lot of newly appeared dirty
2919 * pages, but have not synced all of the
2920 * old dirty pages.
2921 */
2922 done = 1;
2923 break;
2924 }
2925 }
2926 }
2927 pagevec_release(&pvec);
2928 cond_resched();
2929 }
2930 return ret;
2931}
2932
2933
2831static int ext4_da_writepages(struct address_space *mapping, 2934static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2935 struct writeback_control *wbc)
2833{ 2936{
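
write_cache_pages_da() keeps the shape of generic write_cache_pages(): grab a batch of dirty pages by tag, process each under its page lock, release the batch, reschedule, and repeat until the range or the nr_to_write budget is exhausted. That loop skeleton, reduced to plain C over an integer range; lookup_batch() is a made-up stand-in for pagevec_lookup_tag():

    #include <stdio.h>

    #define BATCH 14 /* PAGEVEC_SIZE in the kernel */

    /* Pretend every index in [*start, end] is a dirty page. */
    static int lookup_batch(unsigned long *start, unsigned long end,
                            unsigned long *out, int max)
    {
            int n = 0;

            while (n < max && *start <= end)
                    out[n++] = (*start)++;
            return n;
    }

    int main(void)
    {
            unsigned long index = 0, end = 40, pages[BATCH];
            long nr_to_write = 25;
            int done = 0;

            while (!done && index <= end) {
                    int n = lookup_batch(&index, end, pages, BATCH);

                    if (n == 0)
                            break;
                    for (int i = 0; i < n; i++) {
                            printf("write page %lu\n", pages[i]);
                            if (--nr_to_write == 0) { /* budget spent */
                                    done = 1;
                                    break;
                            }
                    }
                    /* pagevec_release() and cond_resched() go here */
            }
            return 0;
    }
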
@@ -2836,7 +2939,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2939 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2940 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2941 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2942 int pages_written = 0;
2841 long pages_skipped; 2943 long pages_skipped;
2842 unsigned int max_pages; 2944 unsigned int max_pages;
@@ -2916,12 +3018,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 3018 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 3019 mpd.inode = mapping->host;
2918 3020
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 3021 pages_skipped = wbc->pages_skipped;
2926 3022
2927retry: 3023retry:
@@ -2941,7 +3037,7 @@ retry:
2941 if (IS_ERR(handle)) { 3037 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3038 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3039 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3040 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3041 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3042 goto out_writepages;
2947 } 3043 }
@@ -2963,8 +3059,7 @@ retry:
2963 mpd.io_done = 0; 3059 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3060 mpd.pages_written = 0;
2965 mpd.retval = 0; 3061 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3062 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3063 /*
2969 * If we have a contiguous extent of pages and we 3064 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3065 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3111,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3111 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3112 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3113 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3114 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3115 __func__, wbc->nr_to_write, ret);
3021 3116
3022 /* Update index */ 3117 /* Update index */
@@ -3030,8 +3125,6 @@ retry:
3030 mapping->writeback_index = index; 3125 mapping->writeback_index = index;
3031 3126
3032out_writepages: 3127out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3128 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3129 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3130 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,16 +3169,13 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3169 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3170 struct page **pagep, void **fsdata)
3078{ 3171{
3079 int ret, retries = 0, quota_retries = 0; 3172 int ret, retries = 0;
3080 struct page *page; 3173 struct page *page;
3081 pgoff_t index; 3174 pgoff_t index;
3082 unsigned from, to;
3083 struct inode *inode = mapping->host; 3175 struct inode *inode = mapping->host;
3084 handle_t *handle; 3176 handle_t *handle;
3085 3177
3086 index = pos >> PAGE_CACHE_SHIFT; 3178 index = pos >> PAGE_CACHE_SHIFT;
3087 from = pos & (PAGE_CACHE_SIZE - 1);
3088 to = from + len;
3089 3179
3090 if (ext4_nonda_switch(inode->i_sb)) { 3180 if (ext4_nonda_switch(inode->i_sb)) {
3091 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3181 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3118,8 +3208,7 @@ retry:
3118 } 3208 }
3119 *pagep = page; 3209 *pagep = page;
3120 3210
3121 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 3211 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
3122 ext4_da_get_block_prep);
3123 if (ret < 0) { 3212 if (ret < 0) {
3124 unlock_page(page); 3213 unlock_page(page);
3125 ext4_journal_stop(handle); 3214 ext4_journal_stop(handle);
@@ -3135,22 +3224,6 @@ retry:
3135 3224
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3225 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3226 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3227out:
3155 return ret; 3228 return ret;
3156} 3229}
@@ -3494,15 +3567,24 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3494 3567
3495retry: 3568retry:
3496 if (rw == READ && ext4_should_dioread_nolock(inode)) 3569 if (rw == READ && ext4_should_dioread_nolock(inode))
3497 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 3570 ret = __blockdev_direct_IO(rw, iocb, inode,
3498 inode->i_sb->s_bdev, iov, 3571 inode->i_sb->s_bdev, iov,
3499 offset, nr_segs, 3572 offset, nr_segs,
3500 ext4_get_block, NULL); 3573 ext4_get_block, NULL, NULL, 0);
3501 else 3574 else {
3502 ret = blockdev_direct_IO(rw, iocb, inode, 3575 ret = blockdev_direct_IO(rw, iocb, inode,
3503 inode->i_sb->s_bdev, iov, 3576 inode->i_sb->s_bdev, iov,
3504 offset, nr_segs, 3577 offset, nr_segs,
3505 ext4_get_block, NULL); 3578 ext4_get_block, NULL);
3579
3580 if (unlikely((rw & WRITE) && ret < 0)) {
3581 loff_t isize = i_size_read(inode);
3582 loff_t end = offset + iov_length(iov, nr_segs);
3583
3584 if (end > isize)
3585 vmtruncate(inode, isize);
3586 }
3587 }
3506 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3588 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3507 goto retry; 3589 goto retry;
3508 3590
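For context, the dioread_nolock branch above switches from the blockdev_direct_IO_no_locking() convenience wrapper to the generic __blockdev_direct_IO() entry point. Reading the two calls side by side (argument lists as in the hunk; the meaning of the trailing parameters is inferred from the new call and should be treated as an assumption):

        /* old: wrapper that implied no DIO locking */
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs, get_block, NULL);

        /* new: same effect via the generic helper; the trailing NULL, NULL, 0
         * are the end_io callback, the submit_io hook, and a flags word of 0
         * (i.e. no DIO_LOCKING). */
        ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
                                   nr_segs, get_block, NULL, NULL, 0);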
@@ -3546,46 +3628,18 @@ out:
3546 return ret; 3628 return ret;
3547} 3629}
3548 3630
3631/*
3632 * ext4_get_block used when preparing for a DIO write or buffer write.
3633 * We allocate an uninitialized extent if blocks haven't been allocated.
3634 * The extent will be converted to initialized after the IO is complete.
3635 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3636static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3637 struct buffer_head *bh_result, int create)
3551{ 3638{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3639 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3640 inode->i_ino, create);
3560 /* 3641 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3642 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3643}
3590 3644
3591static void dump_completed_IO(struct inode * inode) 3645static void dump_completed_IO(struct inode * inode)
@@ -3645,6 +3699,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
3645 return ret; 3699 return ret;
3646 } 3700 }
3647 3701
3702 if (io->iocb)
3703 aio_complete(io->iocb, io->result, 0);
3648 /* clear the DIO AIO unwritten flag */ 3704 /* clear the DIO AIO unwritten flag */
3649 io->flag = 0; 3705 io->flag = 0;
3650 return ret; 3706 return ret;
@@ -3744,6 +3800,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3744 io->offset = 0; 3800 io->offset = 0;
3745 io->size = 0; 3801 io->size = 0;
3746 io->page = NULL; 3802 io->page = NULL;
3803 io->iocb = NULL;
3804 io->result = 0;
3747 INIT_WORK(&io->work, ext4_end_io_work); 3805 INIT_WORK(&io->work, ext4_end_io_work);
3748 INIT_LIST_HEAD(&io->list); 3806 INIT_LIST_HEAD(&io->list);
3749 } 3807 }
@@ -3752,7 +3810,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3752} 3810}
3753 3811
3754static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3812static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3755 ssize_t size, void *private) 3813 ssize_t size, void *private, int ret,
3814 bool is_async)
3756{ 3815{
3757 ext4_io_end_t *io_end = iocb->private; 3816 ext4_io_end_t *io_end = iocb->private;
3758 struct workqueue_struct *wq; 3817 struct workqueue_struct *wq;
@@ -3761,7 +3820,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3761 3820
3762 /* if not async direct IO or a dio write of 0 bytes, just return */ 3821
3763 if (!io_end || !size) 3822 if (!io_end || !size)
3764 return; 3823 goto out;
3765 3824
3766 ext_debug("ext4_end_io_dio(): io_end 0x%p" 3825 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3767 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 3826 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
@@ -3772,12 +3831,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3772 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3831 if (io_end->flag != EXT4_IO_UNWRITTEN){
3773 ext4_free_io_end(io_end); 3832 ext4_free_io_end(io_end);
3774 iocb->private = NULL; 3833 iocb->private = NULL;
3834out:
3835 if (is_async)
3836 aio_complete(iocb, ret, 0);
3775 return; 3837 return;
3776 } 3838 }
3777 3839
3778 io_end->offset = offset; 3840 io_end->offset = offset;
3779 io_end->size = size; 3841 io_end->size = size;
3780 io_end->flag = EXT4_IO_UNWRITTEN; 3842 if (is_async) {
3843 io_end->iocb = iocb;
3844 io_end->result = ret;
3845 }
3781 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3846 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3782 3847
3783 /* queue the work to convert unwritten extents to written */ 3848 /* queue the work to convert unwritten extents to written */
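With the dio_iodone_t signature extended as above, an end_io handler now learns whether it was invoked from an AIO submission and, when it finishes the work itself, must complete the iocb. A minimal conforming callback might look like this (sketch; only the signature and the aio_complete() call are taken from this patch, the body is hypothetical):

        static void example_end_io(struct kiocb *iocb, loff_t offset,
                                   ssize_t size, void *private, int ret,
                                   bool is_async)
        {
                /* ...filesystem-specific completion work... */

                /* for async submissions the callback owns iocb completion */
                if (is_async)
                        aio_complete(iocb, ret, 0);
        }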
@@ -3914,7 +3979,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3914 return -ENOMEM; 3979 return -ENOMEM;
3915 /* 3980 /*
3916 * we save the io structure for current async 3981 * we save the io structure for current async
3917 * direct IO, so that later ext4_get_blocks() 3982 * direct IO, so that later ext4_map_blocks()
3918 * could flag the io structure to indicate whether 3983
3919 * there are unwritten extents that need to be converted 3984
3920 * when IO is completed. 3985 * when IO is completed.
@@ -3973,7 +4038,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 4038 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 4039 struct inode *inode = file->f_mapping->host;
3975 4040
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 4041 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4042 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4043
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4044 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4105,17 +4170,6 @@ int ext4_block_truncate_page(handle_t *handle,
4105 length = blocksize - (offset & (blocksize - 1)); 4170 length = blocksize - (offset & (blocksize - 1));
4106 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 4171 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4107 4172
4108 /*
4109 * For "nobh" option, we can only work if we don't need to
4110 * read-in the page - otherwise we create buffers to do the IO.
4111 */
4112 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
4113 ext4_should_writeback_data(inode) && PageUptodate(page)) {
4114 zero_user(page, offset, length);
4115 set_page_dirty(page);
4116 goto unlock;
4117 }
4118
4119 if (!page_has_buffers(page)) 4173 if (!page_has_buffers(page))
4120 create_empty_buffers(page, blocksize, 0); 4174 create_empty_buffers(page, blocksize, 0);
4121 4175
@@ -4302,10 +4356,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4356
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4357 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4358 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4359 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4360 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4361 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4362 return 1;
4310 } 4363 }
4311 4364
@@ -4410,11 +4463,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4463 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4464 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4465 else
4413 ext4_error(inode->i_sb, 4466 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4467 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4468 "block %llu",
4416 inode->i_ino, 4469 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4470 }
4419} 4471}
4420 4472
@@ -4452,11 +4504,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4504
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4505 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4506 nr, 1)) {
4455 ext4_error(inode->i_sb, 4507 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4508 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4509 "block %lu (level %d)",
4458 inode->i_ino, depth, 4510 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4511 break;
4461 } 4512 }
4462 4513
@@ -4468,9 +4519,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4519 * (should be rare).
4469 */ 4520 */
4470 if (!bh) { 4521 if (!bh) {
4471 ext4_error(inode->i_sb, 4522 EXT4_ERROR_INODE_BLOCK(inode, nr,
4472 "Read failure, inode=%lu, block=%llu", 4523 "Read failure");
4473 inode->i_ino, nr);
4474 continue; 4524 continue;
4475 } 4525 }
4476 4526
@@ -4482,27 +4532,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4482 depth); 4532 depth);
4483 4533
4484 /* 4534 /*
4485 * We've probably journalled the indirect block several
4486 * times during the truncate. But it's no longer
4487 * needed and we now drop it from the transaction via
4488 * jbd2_journal_revoke().
4489 *
4490 * That's easy if it's exclusively part of this
4491 * transaction. But if it's part of the committing
4492 * transaction then jbd2_journal_forget() will simply
4493 * brelse() it. That means that if the underlying
4494 * block is reallocated in ext4_get_block(),
4495 * unmap_underlying_metadata() will find this block
4496 * and will try to get rid of it. damn, damn.
4497 *
4498 * If this block has already been committed to the
4499 * journal, a revoke record will be written. And
4500 * revoke records must be emitted *before* clearing
4501 * this block's bit in the bitmaps.
4502 */
4503 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
4504
4505 /*
4506 * Everything below this pointer has been 4535
4507 * released. Now let this top-of-subtree go. 4536 * released. Now let this top-of-subtree go.
4508 * 4537 *
@@ -4526,8 +4555,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4526 blocks_for_truncate(inode)); 4555 blocks_for_truncate(inode));
4527 } 4556 }
4528 4557
4558 /*
4559 * The forget flag here is critical because if
4560 * we are journaling (and not doing data
4561 * journaling), we have to make sure a revoke
4562 * record is written to prevent the journal
4563 * replay from overwriting the (former)
4564 * indirect block if it gets reallocated as a
4565 * data block. This must happen in the same
4566 * transaction where the data blocks are
4567 * actually freed.
4568 */
4529 ext4_free_blocks(handle, inode, 0, nr, 1, 4569 ext4_free_blocks(handle, inode, 0, nr, 1,
4530 EXT4_FREE_BLOCKS_METADATA); 4570 EXT4_FREE_BLOCKS_METADATA|
4571 EXT4_FREE_BLOCKS_FORGET);
4531 4572
4532 if (parent_bh) { 4573 if (parent_bh) {
4533 /* 4574 /*
@@ -4612,12 +4653,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4653 if (!ext4_can_truncate(inode))
4613 return; 4654 return;
4614 4655
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4656 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4657
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4658 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4659 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4660
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4661 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4662 ext4_ext_truncate(inode);
4622 return; 4663 return;
4623 } 4664 }
@@ -4785,8 +4826,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4826
4786 bh = sb_getblk(sb, block); 4827 bh = sb_getblk(sb, block);
4787 if (!bh) { 4828 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4829 EXT4_ERROR_INODE_BLOCK(inode, block,
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4830 "unable to read itable block");
4790 return -EIO; 4831 return -EIO;
4791 } 4832 }
4792 if (!buffer_uptodate(bh)) { 4833 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4925,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4925 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4926 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4927 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4928 EXT4_ERROR_INODE_BLOCK(inode, block,
4888 " block=%llu", inode->i_ino, block); 4929 "unable to read itable block");
4889 brelse(bh); 4930 brelse(bh);
4890 return -EIO; 4931 return -EIO;
4891 } 4932 }
@@ -4922,20 +4963,26 @@ void ext4_set_inode_flags(struct inode *inode)
4922/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4963/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4923void ext4_get_inode_flags(struct ext4_inode_info *ei) 4964void ext4_get_inode_flags(struct ext4_inode_info *ei)
4924{ 4965{
4925 unsigned int flags = ei->vfs_inode.i_flags; 4966 unsigned int vfs_fl;
4926 4967 unsigned long old_fl, new_fl;
4927 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4968
4928 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4969 do {
4929 if (flags & S_SYNC) 4970 vfs_fl = ei->vfs_inode.i_flags;
4930 ei->i_flags |= EXT4_SYNC_FL; 4971 old_fl = ei->i_flags;
4931 if (flags & S_APPEND) 4972 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4932 ei->i_flags |= EXT4_APPEND_FL; 4973 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4933 if (flags & S_IMMUTABLE) 4974 EXT4_DIRSYNC_FL);
4934 ei->i_flags |= EXT4_IMMUTABLE_FL; 4975 if (vfs_fl & S_SYNC)
4935 if (flags & S_NOATIME) 4976 new_fl |= EXT4_SYNC_FL;
4936 ei->i_flags |= EXT4_NOATIME_FL; 4977 if (vfs_fl & S_APPEND)
4937 if (flags & S_DIRSYNC) 4978 new_fl |= EXT4_APPEND_FL;
4938 ei->i_flags |= EXT4_DIRSYNC_FL; 4979 if (vfs_fl & S_IMMUTABLE)
4980 new_fl |= EXT4_IMMUTABLE_FL;
4981 if (vfs_fl & S_NOATIME)
4982 new_fl |= EXT4_NOATIME_FL;
4983 if (vfs_fl & S_DIRSYNC)
4984 new_fl |= EXT4_DIRSYNC_FL;
4985 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4939} 4986}
4940 4987
4941static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4988static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
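The ext4_get_inode_flags() rewrite above replaces a non-atomic read-modify-write of ei->i_flags with a lock-free cmpxchg() retry loop. The underlying pattern, reduced to its essentials (compute_new() is a hypothetical stand-in for the flag translation):

        unsigned long old_fl, new_fl;

        do {
                old_fl = ei->i_flags;          /* snapshot current value */
                new_fl = compute_new(old_fl);  /* derive update from snapshot */
        } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);

cmpxchg() stores new_fl only if i_flags still equals old_fl; if another CPU changed the flags in the meantime, the loop re-reads and recomputes, so no concurrent update is lost without taking a lock.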
@@ -4950,7 +4997,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4950 /* we are using combined 48 bit field */ 4997 /* we are using combined 48 bit field */
4951 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4998 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4952 le32_to_cpu(raw_inode->i_blocks_lo); 4999 le32_to_cpu(raw_inode->i_blocks_lo);
4953 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 5000 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4954 /* i_blocks is in units of the file system block size */ 5001
4955 return i_blocks << (inode->i_blkbits - 9); 5002 return i_blocks << (inode->i_blkbits - 9);
4956 } else { 5003 } else {
@@ -5046,7 +5093,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5046 transaction_t *transaction; 5093 transaction_t *transaction;
5047 tid_t tid; 5094 tid_t tid;
5048 5095
5049 spin_lock(&journal->j_state_lock); 5096 read_lock(&journal->j_state_lock);
5050 if (journal->j_running_transaction) 5097 if (journal->j_running_transaction)
5051 transaction = journal->j_running_transaction; 5098 transaction = journal->j_running_transaction;
5052 else 5099 else
@@ -5055,7 +5102,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5055 tid = transaction->t_tid; 5102 tid = transaction->t_tid;
5056 else 5103 else
5057 tid = journal->j_commit_sequence; 5104 tid = journal->j_commit_sequence;
5058 spin_unlock(&journal->j_state_lock); 5105 read_unlock(&journal->j_state_lock);
5059 ei->i_sync_tid = tid; 5106 ei->i_sync_tid = tid;
5060 ei->i_datasync_tid = tid; 5107 ei->i_datasync_tid = tid;
5061 } 5108 }
@@ -5096,11 +5143,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5143 ret = 0;
5097 if (ei->i_file_acl && 5144 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5145 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5146 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5147 ei->i_file_acl);
5101 ret = -EIO; 5148 ret = -EIO;
5102 goto bad_inode; 5149 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5150 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5104 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5151 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5105 (S_ISLNK(inode->i_mode) && 5152 (S_ISLNK(inode->i_mode) &&
5106 !ext4_inode_is_fast_symlink(inode))) 5153 !ext4_inode_is_fast_symlink(inode)))
@@ -5142,8 +5189,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5189 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5190 } else {
5144 ret = -EIO; 5191 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5192 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5193 goto bad_inode;
5148 } 5194 }
5149 brelse(iloc.bh); 5195 brelse(iloc.bh);
@@ -5172,7 +5218,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
5172 */ 5218 */
5173 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5219 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5174 raw_inode->i_blocks_high = 0; 5220 raw_inode->i_blocks_high = 0;
5175 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5221 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5176 return 0; 5222 return 0;
5177 } 5223 }
5178 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5224 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5231,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
5185 */ 5231 */
5186 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5232 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5187 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5233 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5188 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5234 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5189 } else { 5235 } else {
5190 ei->i_flags |= EXT4_HUGE_FILE_FL; 5236 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5191 /* i_blocks is stored in units of the file system block size */ 5237
5192 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5238 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5193 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5239 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5427,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5427 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5428 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5429 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5430 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5385 "inode=%lu, block=%llu", inode->i_ino, 5431 "IO error syncing inode");
5386 (unsigned long long)iloc.bh->b_blocknr);
5387 err = -EIO; 5432 err = -EIO;
5388 } 5433 }
5389 brelse(iloc.bh); 5434 brelse(iloc.bh);
@@ -5425,7 +5470,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5425 if (error) 5470 if (error)
5426 return error; 5471 return error;
5427 5472
5428 if (ia_valid & ATTR_SIZE) 5473 if (is_quota_modification(inode, attr))
5429 dquot_initialize(inode); 5474 dquot_initialize(inode);
5430 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 5475 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
5431 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 5476 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
@@ -5455,20 +5500,18 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5500 }
5456 5501
5457 if (attr->ia_valid & ATTR_SIZE) { 5502 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5503 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5504 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5505
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5506 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5462 error = -EFBIG; 5507 return -EFBIG;
5463 goto err_out;
5464 }
5465 } 5508 }
5466 } 5509 }
5467 5510
5468 if (S_ISREG(inode->i_mode) && 5511 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5512 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5513 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5514 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5515 handle_t *handle;
5473 5516
5474 handle = ext4_journal_start(inode, 3); 5517 handle = ext4_journal_start(inode, 3);
@@ -5500,15 +5543,23 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5543 }
5501 } 5544 }
5502 /* ext4_truncate will clear the flag */ 5545 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5546 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5547 ext4_truncate(inode);
5505 } 5548 }
5506 5549
5507 rc = inode_setattr(inode, attr); 5550 if ((attr->ia_valid & ATTR_SIZE) &&
5551 attr->ia_size != i_size_read(inode))
5552 rc = vmtruncate(inode, attr->ia_size);
5553
5554 if (!rc) {
5555 setattr_copy(inode, attr);
5556 mark_inode_dirty(inode);
5557 }
5508 5558
5509 /* If inode_setattr's call to ext4_truncate failed to get a 5559 /*
5510 * transaction handle at all, we need to clean up the in-core 5560 * If the call to ext4_truncate failed to get a transaction handle at
5511 * orphan list manually. */ 5561 * all, we need to clean up the in-core orphan list manually.
5562 */
5512 if (inode->i_nlink) 5563 if (inode->i_nlink)
5513 ext4_orphan_del(NULL, inode); 5564 ext4_orphan_del(NULL, inode);
5514 5565
@@ -5576,7 +5627,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5627
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5628static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5629{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5630 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5631 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5632 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5633}
@@ -5663,7 +5714,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5663 * Calculate the journal credits for a chunk of data modification. 5714 * Calculate the journal credits for a chunk of data modification.
5664 * 5715 *
5665 * This is called from DIO, fallocate or whatever else calls 5716
5666 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. 5717 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5667 * 5718 *
5668 * journal buffers for data blocks are not included here, as DIO 5719 * journal buffers for data blocks are not included here, as DIO
5669 * and fallocate do no need to journal data buffers. 5720 * and fallocate do no need to journal data buffers.
@@ -5729,7 +5780,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5729{ 5780{
5730 struct ext4_inode *raw_inode; 5781 struct ext4_inode *raw_inode;
5731 struct ext4_xattr_ibody_header *header; 5782 struct ext4_xattr_ibody_header *header;
5732 struct ext4_xattr_entry *entry;
5733 5783
5734 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5784 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5735 return 0; 5785 return 0;
@@ -5737,7 +5787,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5737 raw_inode = ext4_raw_inode(&iloc); 5787 raw_inode = ext4_raw_inode(&iloc);
5738 5788
5739 header = IHDR(inode, raw_inode); 5789 header = IHDR(inode, raw_inode);
5740 entry = IFIRST(header);
5741 5790
5742 /* No extended attributes present */ 5791 /* No extended attributes present */
5743 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5792 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
@@ -5911,9 +5960,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5960 */
5912 5961
5913 if (val) 5962 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5963 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5964 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5965 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5966 ext4_set_aops(inode);
5918 5967
5919 jbd2_journal_unlock_updates(journal); 5968 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
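The new EXT4_IOC32_GROUP_ADD case follows the usual compat-ioctl thunk pattern: each 32-bit field is fetched with get_user(), a native ext4_new_group_input is assembled, and the address-limit check is temporarily widened so the native handler will accept a pointer into kernel memory. The core of the pattern, condensed from the hunk above:

        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);      /* let ext4_ioctl() take a kernel pointer */
        err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, (unsigned long) &input);
        set_fs(old_fs);         /* always restore the old limit */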
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..4b4ad4b7ce57 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
447 blocknr += first + i; 447 blocknr += first + i;
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 inode ? inode->i_ino : 0,
450 " %lu's block %llu(bit %u in group %u)", 450 blocknr,
451 inode ? inode->i_ino : 0, blocknr, 451 "freeing block already freed "
452 first + i, e4b->bd_group); 452 "(bit %u)",
453 first + i);
453 } 454 }
454 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 455 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
455 } 456 }
@@ -658,6 +659,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 659 }
659} 660}
660 661
662/*
663 * Cache the order of the largest free extent we have available in this block
664 * group.
665 */
666static void
667mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
668{
669 int i;
670 int bits;
671
672 grp->bb_largest_free_order = -1; /* uninit */
673
674 bits = sb->s_blocksize_bits + 1;
675 for (i = bits; i >= 0; i--) {
676 if (grp->bb_counters[i] > 0) {
677 grp->bb_largest_free_order = i;
678 break;
679 }
680 }
681}
682
661static noinline_for_stack 683static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 684void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 685 void *buddy, void *bitmap, ext4_group_t group)
@@ -691,15 +713,16 @@ void ext4_mb_generate_buddy(struct super_block *sb,
691 grp->bb_fragments = fragments; 713 grp->bb_fragments = fragments;
692 714
693 if (free != grp->bb_free) { 715 if (free != grp->bb_free) {
694 ext4_grp_locked_error(sb, group, __func__, 716 ext4_grp_locked_error(sb, group, 0, 0,
695 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", 717 "%u blocks in bitmap, %u in gd",
696 group, free, grp->bb_free); 718 free, grp->bb_free);
697 /* 719 /*
698 * If we intent to continue, we consider group descritor 720 * If we intent to continue, we consider group descritor
699 * corrupt and update bb_free using bitmap value 721 * corrupt and update bb_free using bitmap value
700 */ 722 */
701 grp->bb_free = free; 723 grp->bb_free = free;
702 } 724 }
725 mb_set_largest_free_order(sb, grp);
703 726
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 727 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 728
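mb_set_largest_free_order() is called here and again from mb_free_blocks() and mb_mark_used() below, so grp->bb_largest_free_order tracks bb_counters[] wherever the set of free extents changes. The payoff comes in ext4_mb_good_group(), which can then reject a group with a single comparison instead of scanning the per-order counters (as shown later in this patch):

        /* cr == 0: we need a free extent of order >= ac->ac_2order */
        if (grp->bb_largest_free_order < ac->ac_2order)
                return 0;       /* group cannot satisfy the request */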
@@ -725,6 +748,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
725 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 748 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
726 * So it can have information regarding groups_per_page which 749 * So it can have information regarding groups_per_page which
727 * is blocks_per_page/2 750 * is blocks_per_page/2
751 *
752 * Locking note: This routine takes the block group lock of all groups
753 * for this page; do not hold this lock when calling this routine!
728 */ 754 */
729 755
730static int ext4_mb_init_cache(struct page *page, char *incore) 756static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +891,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
865 BUG_ON(incore == NULL); 891 BUG_ON(incore == NULL);
866 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 892 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
867 group, page->index, i * blocksize); 893 group, page->index, i * blocksize);
894 trace_ext4_mb_buddy_bitmap_load(sb, group);
868 grinfo = ext4_get_group_info(sb, group); 895 grinfo = ext4_get_group_info(sb, group);
869 grinfo->bb_fragments = 0; 896 grinfo->bb_fragments = 0;
870 memset(grinfo->bb_counters, 0, 897 memset(grinfo->bb_counters, 0,
@@ -882,6 +909,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
882 BUG_ON(incore != NULL); 909 BUG_ON(incore != NULL);
883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 910 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
884 group, page->index, i * blocksize); 911 group, page->index, i * blocksize);
912 trace_ext4_mb_bitmap_load(sb, group);
885 913
886 /* see comments in ext4_mb_put_pa() */ 914 /* see comments in ext4_mb_put_pa() */
887 ext4_lock_group(sb, group); 915 ext4_lock_group(sb, group);
@@ -910,6 +938,11 @@ out:
910 return err; 938 return err;
911} 939}
912 940
941/*
942 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
943 * block group lock of all groups for this page; do not hold the BG lock when
944 * calling this routine!
945 */
913static noinline_for_stack 946static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 947int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{ 948{
@@ -1004,6 +1037,11 @@ err:
1004 return ret; 1037 return ret;
1005} 1038}
1006 1039
1040/*
1041 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1042 * block group lock of all groups for this page; do not hold the BG lock when
1043 * calling this routine!
1044 */
1007static noinline_for_stack int 1045static noinline_for_stack int
1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1046ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1009 struct ext4_buddy *e4b) 1047 struct ext4_buddy *e4b)
@@ -1150,7 +1188,7 @@ err:
1150 return ret; 1188 return ret;
1151} 1189}
1152 1190
1153static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1191static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1154{ 1192{
1155 if (e4b->bd_bitmap_page) 1193 if (e4b->bd_bitmap_page)
1156 page_cache_release(e4b->bd_bitmap_page); 1194 page_cache_release(e4b->bd_bitmap_page);
@@ -1259,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1259 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1297 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1260 blocknr += block; 1298 blocknr += block;
1261 ext4_grp_locked_error(sb, e4b->bd_group, 1299 ext4_grp_locked_error(sb, e4b->bd_group,
1262 __func__, "double-free of inode" 1300 inode ? inode->i_ino : 0,
1263 " %lu's block %llu(bit %u in group %u)", 1301 blocknr,
1264 inode ? inode->i_ino : 0, blocknr, block, 1302 "freeing already freed block "
1265 e4b->bd_group); 1303 "(bit %u)", block);
1266 } 1304 }
1267 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1305 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1268 e4b->bd_info->bb_counters[order]++; 1306 e4b->bd_info->bb_counters[order]++;
@@ -1299,6 +1337,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1299 buddy = buddy2; 1337 buddy = buddy2;
1300 } while (1); 1338 } while (1);
1301 } 1339 }
1340 mb_set_largest_free_order(sb, e4b->bd_info);
1302 mb_check_buddy(e4b); 1341 mb_check_buddy(e4b);
1303} 1342}
1304 1343
@@ -1427,6 +1466,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1427 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1428 e4b->bd_info->bb_counters[ord]++; 1467 e4b->bd_info->bb_counters[ord]++;
1429 } 1468 }
1469 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1430 1470
1431 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1471 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1432 mb_check_buddy(e4b); 1472 mb_check_buddy(e4b);
@@ -1617,7 +1657,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1617 } 1657 }
1618 1658
1619 ext4_unlock_group(ac->ac_sb, group); 1659 ext4_unlock_group(ac->ac_sb, group);
1620 ext4_mb_release_desc(e4b); 1660 ext4_mb_unload_buddy(e4b);
1621 1661
1622 return 0; 1662 return 0;
1623} 1663}
@@ -1672,7 +1712,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1672 ext4_mb_use_best_found(ac, e4b); 1712 ext4_mb_use_best_found(ac, e4b);
1673 } 1713 }
1674 ext4_unlock_group(ac->ac_sb, group); 1714 ext4_unlock_group(ac->ac_sb, group);
1675 ext4_mb_release_desc(e4b); 1715 ext4_mb_unload_buddy(e4b);
1676 1716
1677 return 0; 1717 return 0;
1678} 1718}
@@ -1749,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1749 * free blocks even though group info says we 1789
1750 * have free blocks 1790
1751 */ 1791 */
1752 ext4_grp_locked_error(sb, e4b->bd_group, 1792 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1753 __func__, "%d free blocks as per " 1793 "%d free blocks as per "
1754 "group info. But bitmap says 0", 1794 "group info. But bitmap says 0",
1755 free); 1795 free);
1756 break; 1796 break;
@@ -1759,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1759 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1799 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1760 BUG_ON(ex.fe_len <= 0); 1800 BUG_ON(ex.fe_len <= 0);
1761 if (free < ex.fe_len) { 1801 if (free < ex.fe_len) {
1762 ext4_grp_locked_error(sb, e4b->bd_group, 1802 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1763 __func__, "%d free blocks as per " 1803 "%d free blocks as per "
1764 "group info. But got %d blocks", 1804 "group info. But got %d blocks",
1765 free, ex.fe_len); 1805 free, ex.fe_len);
1766 /* 1806 /*
@@ -1782,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1782 1822
1783/* 1823/*
1784 * This is a special case for storage like raid5 1824
1785 * we try to find stripe-aligned chunks for stripe-size requests 1825 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1786 * XXX should do so at least for multiples of stripe size as well
1787 */ 1826 */
1788static noinline_for_stack 1827static noinline_for_stack
1789void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1828void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1821 } 1860 }
1822} 1861}
1823 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1824static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1825 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1826{ 1866{
1827 unsigned free, fragments; 1867 unsigned free, fragments;
1828 unsigned i, bits;
1829 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1830 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1831 1870
1832 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1833 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1834 1879
1835 free = grp->bb_free; 1880 free = grp->bb_free;
1836 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1843 case 0: 1888 case 0:
1844 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1845 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1846 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1847 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1848 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1849 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1850 return 0; 1898 return 0;
1851 1899
1852 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1853 for (i = ac->ac_2order; i <= bits; i++)
1854 if (grp->bb_counters[i] > 0)
1855 return 1;
1856 break;
1857 case 1: 1901 case 1:
1858 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1859 return 1; 1903 return 1;
@@ -1955,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1955 ext4_group_t ngroups, group, i; 1999 ext4_group_t ngroups, group, i;
1956 int cr; 2000 int cr;
1957 int err = 0; 2001 int err = 0;
1958 int bsbits;
1959 struct ext4_sb_info *sbi; 2002 struct ext4_sb_info *sbi;
1960 struct super_block *sb; 2003 struct super_block *sb;
1961 struct ext4_buddy e4b; 2004 struct ext4_buddy e4b;
@@ -1964,7 +2007,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1964 sbi = EXT4_SB(sb); 2007 sbi = EXT4_SB(sb);
1965 ngroups = ext4_get_groups_count(sb); 2008 ngroups = ext4_get_groups_count(sb);
1966 /* non-extent files are limited to low blocks/groups */ 2009 /* non-extent files are limited to low blocks/groups */
1967 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2010 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1968 ngroups = sbi->s_blockfile_groups; 2011 ngroups = sbi->s_blockfile_groups;
1969 2012
1970 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2013 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -1997,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1997 ac->ac_2order = i - 1; 2040 ac->ac_2order = i - 1;
1998 } 2041 }
1999 2042
2000 bsbits = ac->ac_sb->s_blocksize_bits;
2001
2002 /* if stream allocation is enabled, use global goal */ 2043 /* if stream allocation is enabled, use global goal */
2003 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2044 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2004 /* TBD: may be hot point */ 2045 /* TBD: may be hot point */
@@ -2024,15 +2065,11 @@ repeat:
2024 group = ac->ac_g_ex.fe_group; 2065 group = ac->ac_g_ex.fe_group;
2025 2066
2026 for (i = 0; i < ngroups; group++, i++) { 2067 for (i = 0; i < ngroups; group++, i++) {
2027 struct ext4_group_info *grp;
2028 struct ext4_group_desc *desc;
2029
2030 if (group == ngroups) 2068 if (group == ngroups)
2031 group = 0; 2069 group = 0;
2032 2070
2033 /* quick check to skip empty groups */ 2071 /* This now checks without needing the buddy page */
2034 grp = ext4_get_group_info(sb, group); 2072 if (!ext4_mb_good_group(ac, group, cr))
2035 if (grp->bb_free == 0)
2036 continue; 2073 continue;
2037 2074
2038 err = ext4_mb_load_buddy(sb, group, &e4b); 2075 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,25 +2077,28 @@ repeat:
2040 goto out; 2077 goto out;
2041 2078
2042 ext4_lock_group(sb, group); 2079 ext4_lock_group(sb, group);
2080
2081 /*
2082 * We need to check again after locking the
2083 * block group
2084 */
2043 if (!ext4_mb_good_group(ac, group, cr)) { 2085 if (!ext4_mb_good_group(ac, group, cr)) {
2044 /* someone did allocation from this group */
2045 ext4_unlock_group(sb, group); 2086 ext4_unlock_group(sb, group);
2046 ext4_mb_release_desc(&e4b); 2087 ext4_mb_unload_buddy(&e4b);
2047 continue; 2088 continue;
2048 } 2089 }
2049 2090
2050 ac->ac_groups_scanned++; 2091 ac->ac_groups_scanned++;
2051 desc = ext4_get_group_desc(sb, group, NULL);
2052 if (cr == 0) 2092 if (cr == 0)
2053 ext4_mb_simple_scan_group(ac, &e4b); 2093 ext4_mb_simple_scan_group(ac, &e4b);
2054 else if (cr == 1 && 2094 else if (cr == 1 && sbi->s_stripe &&
2055 ac->ac_g_ex.fe_len == sbi->s_stripe) 2095 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2056 ext4_mb_scan_aligned(ac, &e4b); 2096 ext4_mb_scan_aligned(ac, &e4b);
2057 else 2097 else
2058 ext4_mb_complex_scan_group(ac, &e4b); 2098 ext4_mb_complex_scan_group(ac, &e4b);
2059 2099
2060 ext4_unlock_group(sb, group); 2100 ext4_unlock_group(sb, group);
2061 ext4_mb_release_desc(&e4b); 2101 ext4_mb_unload_buddy(&e4b);
2062 2102
2063 if (ac->ac_status != AC_STATUS_CONTINUE) 2103 if (ac->ac_status != AC_STATUS_CONTINUE)
2064 break; 2104 break;
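The allocator loop above now follows a double-checked pattern: ext4_mb_good_group() is first called without the group lock (a cheap rejection that also initializes the group info on demand), then re-checked under ext4_lock_group() before scanning, since another CPU may have allocated from the group in between. Schematically:

        if (!ext4_mb_good_group(ac, group, cr))   /* lockless pre-check */
                continue;
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err)
                goto out;
        ext4_lock_group(sb, group);
        if (!ext4_mb_good_group(ac, group, cr)) { /* re-check under the lock */
                ext4_unlock_group(sb, group);
                ext4_mb_unload_buddy(&e4b);
                continue;
        }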
@@ -2148,7 +2188,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2148 ext4_lock_group(sb, group); 2188 ext4_lock_group(sb, group);
2149 memcpy(&sg, ext4_get_group_info(sb, group), i); 2189 memcpy(&sg, ext4_get_group_info(sb, group), i);
2150 ext4_unlock_group(sb, group); 2190 ext4_unlock_group(sb, group);
2151 ext4_mb_release_desc(&e4b); 2191 ext4_mb_unload_buddy(&e4b);
2152 2192
2153 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2193 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2154 sg.info.bb_fragments, sg.info.bb_first_free); 2194 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2178,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2178 2218
2179 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2219 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2180 if (rc == 0) { 2220 if (rc == 0) {
2181 struct seq_file *m = (struct seq_file *)file->private_data; 2221 struct seq_file *m = file->private_data;
2182 m->private = sb; 2222 m->private = sb;
2183 } 2223 }
2184 return rc; 2224 return rc;
@@ -2255,6 +2295,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2295 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2256 init_rwsem(&meta_group_info[i]->alloc_sem); 2296 init_rwsem(&meta_group_info[i]->alloc_sem);
2257 meta_group_info[i]->bb_free_root = RB_ROOT; 2297 meta_group_info[i]->bb_free_root = RB_ROOT;
2298 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2258 2299
2259#ifdef DOUBLE_CHECK 2300#ifdef DOUBLE_CHECK
2260 { 2301 {
@@ -2516,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb)
2516 return 0; 2557 return 0;
2517} 2558}
2518 2559
2560static inline void ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{
2563 int ret;
2564 ext4_fsblk_t discard_block;
2565
2566 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574}
2575
2519/* 2576/*
2520 * This function is called by the jbd2 layer once the commit has finished, 2577 * This function is called by the jbd2 layer once the commit has finished,
2521 * so we know we can free the blocks that were released with that commit. 2578 * so we know we can free the blocks that were released with that commit.
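The new ext4_issue_discard() helper centralizes the discard path and, unlike the open-coded version it replaces in the next hunk, reacts to an unsupported-discard result by clearing the DISCARD mount option so no further discard requests are sent to the device. Its call site then shrinks to:

        if (test_opt(sb, DISCARD))
                ext4_issue_discard(sb, entry->group,
                                   entry->start_blk, entry->count);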
@@ -2535,16 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2592 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2536 entry->count, entry->group, entry); 2593 entry->count, entry->group, entry);
2537 2594
2538 if (test_opt(sb, DISCARD)) { 2595 if (test_opt(sb, DISCARD))
2539 ext4_fsblk_t discard_block; 2596 ext4_issue_discard(sb, entry->group,
2540 2597 entry->start_blk, entry->count);
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548 2598
2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2550 /* we expect to find existing buddy because it's pinned */ 2600 /* we expect to find existing buddy because it's pinned */
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2568 } 2618 }
2569 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2570 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2571 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2572 } 2622 }
2573 2623
2574 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2641 2691
2642void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2643{ 2693{
2644 /* 2694 /*
2645 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2646 * before destroying the slab cache. 2696 * before destroying the slab cache.
2647 */ 2697 */
@@ -2654,7 +2704,7 @@ void exit_ext4_mballoc(void)
2654 2704
2655 2705
2656/* 2706/*
2657 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps 2707 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2658 * Returns 0 if success or error code 2708 * Returns 0 if success or error code
2659 */ 2709 */
2660static noinline_for_stack int 2710static noinline_for_stack int
@@ -2662,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2662 handle_t *handle, unsigned int reserv_blks) 2712 handle_t *handle, unsigned int reserv_blks)
2663{ 2713{
2664 struct buffer_head *bitmap_bh = NULL; 2714 struct buffer_head *bitmap_bh = NULL;
2665 struct ext4_super_block *es;
2666 struct ext4_group_desc *gdp; 2715 struct ext4_group_desc *gdp;
2667 struct buffer_head *gdp_bh; 2716 struct buffer_head *gdp_bh;
2668 struct ext4_sb_info *sbi; 2717 struct ext4_sb_info *sbi;
@@ -2675,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2675 2724
2676 sb = ac->ac_sb; 2725 sb = ac->ac_sb;
2677 sbi = EXT4_SB(sb); 2726 sbi = EXT4_SB(sb);
2678 es = sbi->s_es;
2679
2680 2727
2681 err = -EIO; 2728 err = -EIO;
2682 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2729 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2762,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2762 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2809 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2763 2810
2764out_err: 2811out_err:
2765 sb->s_dirt = 1; 2812 ext4_mark_super_dirty(sb);
2766 brelse(bitmap_bh); 2813 brelse(bitmap_bh);
2767 return err; 2814 return err;
2768} 2815}
@@ -2800,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2800 int bsbits, max; 2847 int bsbits, max;
2801 ext4_lblk_t end; 2848 ext4_lblk_t end;
2802 loff_t size, orig_size, start_off; 2849 loff_t size, orig_size, start_off;
2803 ext4_lblk_t start, orig_start; 2850 ext4_lblk_t start;
2804 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2851 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2805 struct ext4_prealloc_space *pa; 2852 struct ext4_prealloc_space *pa;
2806 2853
@@ -2831,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2831 size = size << bsbits; 2878 size = size << bsbits;
2832 if (size < i_size_read(ac->ac_inode)) 2879 if (size < i_size_read(ac->ac_inode))
2833 size = i_size_read(ac->ac_inode); 2880 size = i_size_read(ac->ac_inode);
2881 orig_size = size;
2834 2882
2835 /* max size of free chunks */ 2883 /* max size of free chunks */
2836 max = 2 << bsbits; 2884 max = 2 << bsbits;
@@ -2872,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2872 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 2920 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2873 size = ac->ac_o_ex.fe_len << bsbits; 2921 size = ac->ac_o_ex.fe_len << bsbits;
2874 } 2922 }
2875 orig_size = size = size >> bsbits; 2923 size = size >> bsbits;
2876 orig_start = start = start_off >> bsbits; 2924 start = start_off >> bsbits;
2877 2925
2878 /* don't cover already allocated blocks in selected range */ 2926 /* don't cover already allocated blocks in selected range */
2879 if (ar->pleft && start <= ar->lleft) { 2927 if (ar->pleft && start <= ar->lleft) {
@@ -2981,7 +3029,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2981 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3029 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2982 atomic_inc(&sbi->s_bal_reqs); 3030 atomic_inc(&sbi->s_bal_reqs);
2983 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3031 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2984 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3032 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2985 atomic_inc(&sbi->s_bal_success); 3033 atomic_inc(&sbi->s_bal_success);
2986 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3034 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2987 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3035 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3171,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3123 continue; 3171 continue;
3124 3172
3125 /* non-extent files can't have physical blocks past 2^32 */ 3173 /* non-extent files can't have physical blocks past 2^32 */
3126 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3174 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3127 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3175 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3128 continue; 3176 continue;
3129 3177
@@ -3280,7 +3328,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3280 spin_unlock(&pa->pa_lock); 3328 spin_unlock(&pa->pa_lock);
3281 3329
3282 grp_blk = pa->pa_pstart; 3330 grp_blk = pa->pa_pstart;
3283 /* 3331 /*
3284 * If doing group-based preallocation, pa_pstart may be in the 3332 * If doing group-based preallocation, pa_pstart may be in the
3285 * next group when pa is used up 3333 * next group when pa is used up
3286 */ 3334 */
@@ -3497,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3497 ext4_group_t group; 3545 ext4_group_t group;
3498 ext4_grpblk_t bit; 3546 ext4_grpblk_t bit;
3499 unsigned long long grp_blk_start; 3547 unsigned long long grp_blk_start;
3500 sector_t start;
3501 int err = 0; 3548 int err = 0;
3502 int free = 0; 3549 int free = 0;
3503 3550
@@ -3517,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3517 if (bit >= end) 3564 if (bit >= end)
3518 break; 3565 break;
3519 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3566 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3520 start = ext4_group_first_block_no(sb, group) + bit;
3521 mb_debug(1, " free preallocated %u/%u in group %u\n", 3567 mb_debug(1, " free preallocated %u/%u in group %u\n",
3522 (unsigned) start, (unsigned) next - bit, 3568 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3523 (unsigned) group); 3569 (unsigned) next - bit, (unsigned) group);
3524 free += next - bit; 3570 free += next - bit;
3525 3571
3526 if (ac) { 3572 if (ac) {
@@ -3531,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3531 trace_ext4_mballoc_discard(ac); 3577 trace_ext4_mballoc_discard(ac);
3532 } 3578 }
3533 3579
3534 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3535 next - bit); 3581 next - bit);
3536 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3537 bit = next + 1; 3583 bit = next + 1;
@@ -3541,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3541 pa, (unsigned long) pa->pa_lstart, 3587 pa, (unsigned long) pa->pa_lstart,
3542 (unsigned long) pa->pa_pstart, 3588 (unsigned long) pa->pa_pstart,
3543 (unsigned long) pa->pa_len); 3589 (unsigned long) pa->pa_len);
3544 ext4_grp_locked_error(sb, group, 3590 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3545 __func__, "free %u, pa_free %u",
3546 free, pa->pa_free); 3591 free, pa->pa_free);
3547 /* 3592 /*
3548 * pa is already deleted so we use the value obtained 3593 * pa is already deleted so we use the value obtained
@@ -3563,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3563 ext4_group_t group; 3608 ext4_group_t group;
3564 ext4_grpblk_t bit; 3609 ext4_grpblk_t bit;
3565 3610
3566 trace_ext4_mb_release_group_pa(ac, pa); 3611 trace_ext4_mb_release_group_pa(sb, ac, pa);
3567 BUG_ON(pa->pa_deleted == 0); 3612 BUG_ON(pa->pa_deleted == 0);
3568 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3569 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3697,7 +3742,7 @@ out:
3697 ext4_unlock_group(sb, group); 3742 ext4_unlock_group(sb, group);
3698 if (ac) 3743 if (ac)
3699 kmem_cache_free(ext4_ac_cachep, ac); 3744 kmem_cache_free(ext4_ac_cachep, ac);
3700 ext4_mb_release_desc(&e4b); 3745 ext4_mb_unload_buddy(&e4b);
3701 put_bh(bitmap_bh); 3746 put_bh(bitmap_bh);
3702 return free; 3747 return free;
3703} 3748}
@@ -3801,7 +3846,7 @@ repeat:
3801 if (bitmap_bh == NULL) { 3846 if (bitmap_bh == NULL) {
3802 ext4_error(sb, "Error reading block bitmap for %u", 3847 ext4_error(sb, "Error reading block bitmap for %u",
3803 group); 3848 group);
3804 ext4_mb_release_desc(&e4b); 3849 ext4_mb_unload_buddy(&e4b);
3805 continue; 3850 continue;
3806 } 3851 }
3807 3852
@@ -3810,7 +3855,7 @@ repeat:
3810 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3855 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3811 ext4_unlock_group(sb, group); 3856 ext4_unlock_group(sb, group);
3812 3857
3813 ext4_mb_release_desc(&e4b); 3858 ext4_mb_unload_buddy(&e4b);
3814 put_bh(bitmap_bh); 3859 put_bh(bitmap_bh);
3815 3860
3816 list_del(&pa->u.pa_tmp_list); 3861 list_del(&pa->u.pa_tmp_list);
@@ -3839,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3839 struct super_block *sb = ac->ac_sb; 3884 struct super_block *sb = ac->ac_sb;
3840 ext4_group_t ngroups, i; 3885 ext4_group_t ngroups, i;
3841 3886
3887 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
3888 return;
3889
3842 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3890 printk(KERN_ERR "EXT4-fs: Can't allocate:"
3843 " Allocation context details:\n"); 3891 " Allocation context details:\n");
3844 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3892 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4074,7 +4122,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4074 ext4_mb_release_group_pa(&e4b, pa, ac); 4122 ext4_mb_release_group_pa(&e4b, pa, ac);
4075 ext4_unlock_group(sb, group); 4123 ext4_unlock_group(sb, group);
4076 4124
4077 ext4_mb_release_desc(&e4b); 4125 ext4_mb_unload_buddy(&e4b);
4078 list_del(&pa->u.pa_tmp_list); 4126 list_del(&pa->u.pa_tmp_list);
4079 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4127 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4080 } 4128 }
@@ -4205,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4205 * to usual allocation 4253 * to usual allocation
4206 */ 4254 */
4207ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4255ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4208 struct ext4_allocation_request *ar, int *errp) 4256 struct ext4_allocation_request *ar, int *errp)
4209{ 4257{
4210 int freed; 4258 int freed;
4211 struct ext4_allocation_context *ac = NULL; 4259 struct ext4_allocation_context *ac = NULL;
@@ -4249,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4249 inquota = ar->len; 4297 inquota = ar->len;
4250 if (ar->len == 0) { 4298 if (ar->len == 0) {
4251 *errp = -EDQUOT; 4299 *errp = -EDQUOT;
4252 goto out3; 4300 goto out;
4253 } 4301 }
4254 } 4302 }
4255 4303
@@ -4257,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4257 if (!ac) { 4305 if (!ac) {
4258 ar->len = 0; 4306 ar->len = 0;
4259 *errp = -ENOMEM; 4307 *errp = -ENOMEM;
4260 goto out1; 4308 goto out;
4261 } 4309 }
4262 4310
4263 *errp = ext4_mb_initialize_context(ac, ar); 4311 *errp = ext4_mb_initialize_context(ac, ar);
4264 if (*errp) { 4312 if (*errp) {
4265 ar->len = 0; 4313 ar->len = 0;
4266 goto out2; 4314 goto out;
4267 } 4315 }
4268 4316
4269 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4317 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4272,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4272 ext4_mb_normalize_request(ac, ar); 4320 ext4_mb_normalize_request(ac, ar);
4273repeat: 4321repeat:
4274 /* allocate space in core */ 4322 /* allocate space in core */
4275 ext4_mb_regular_allocator(ac); 4323 *errp = ext4_mb_regular_allocator(ac);
4324 if (*errp)
4325 goto errout;
4276 4326
4277 /* as we've just preallocated more space than 4327 /* as we've just preallocated more space than
4278 * user requested originally, we store allocated 4328 * user requested originally, we store allocated
@@ -4283,7 +4333,7 @@ repeat:
4283 } 4333 }
4284 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4334 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4285 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4335 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4286 if (*errp == -EAGAIN) { 4336 if (*errp == -EAGAIN) {
4287 /* 4337 /*
4288 * drop the reference that we took 4338 * drop the reference that we took
4289 * in ext4_mb_use_best_found 4339 * in ext4_mb_use_best_found
@@ -4294,12 +4344,10 @@ repeat:
4294 ac->ac_b_ex.fe_len = 0; 4344 ac->ac_b_ex.fe_len = 0;
4295 ac->ac_status = AC_STATUS_CONTINUE; 4345 ac->ac_status = AC_STATUS_CONTINUE;
4296 goto repeat; 4346 goto repeat;
4297 } else if (*errp) { 4347 } else if (*errp)
4348 errout:
4298 ext4_discard_allocated_blocks(ac); 4349 ext4_discard_allocated_blocks(ac);
4299 ac->ac_b_ex.fe_len = 0; 4350 else {
4300 ar->len = 0;
4301 ext4_mb_show_ac(ac);
4302 } else {
4303 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4351 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4304 ar->len = ac->ac_b_ex.fe_len; 4352 ar->len = ac->ac_b_ex.fe_len;
4305 } 4353 }
@@ -4308,19 +4356,19 @@ repeat:
4308 if (freed) 4356 if (freed)
4309 goto repeat; 4357 goto repeat;
4310 *errp = -ENOSPC; 4358 *errp = -ENOSPC;
4359 }
4360
4361 if (*errp) {
4311 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4312 ar->len = 0; 4363 ar->len = 0;
4313 ext4_mb_show_ac(ac); 4364 ext4_mb_show_ac(ac);
4314 } 4365 }
4315
4316 ext4_mb_release_context(ac); 4366 ext4_mb_release_context(ac);
4317 4367out:
4318out2: 4368 if (ac)
4319 kmem_cache_free(ext4_ac_cachep, ac); 4369 kmem_cache_free(ext4_ac_cachep, ac);
4320out1:
4321 if (inquota && ar->len < inquota) 4370 if (inquota && ar->len < inquota)
4322 dquot_free_block(ar->inode, inquota - ar->len); 4371 dquot_free_block(ar->inode, inquota - ar->len);
4323out3:
4324 if (!ar->len) { 4372 if (!ar->len) {
4325 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4326 /* release all the reserved blocks if non delalloc */ 4374 /* release all the reserved blocks if non delalloc */
@@ -4352,6 +4400,7 @@ static noinline_for_stack int
4352ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4400ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4353 struct ext4_free_data *new_entry) 4401 struct ext4_free_data *new_entry)
4354{ 4402{
4403 ext4_group_t group = e4b->bd_group;
4355 ext4_grpblk_t block; 4404 ext4_grpblk_t block;
4356 struct ext4_free_data *entry; 4405 struct ext4_free_data *entry;
4357 struct ext4_group_info *db = e4b->bd_info; 4406 struct ext4_group_info *db = e4b->bd_info;
@@ -4384,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4384 else if (block >= (entry->start_blk + entry->count)) 4433 else if (block >= (entry->start_blk + entry->count))
4385 n = &(*n)->rb_right; 4434 n = &(*n)->rb_right;
4386 else { 4435 else {
4387 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4436 ext4_grp_locked_error(sb, group, 0,
4388 "Double free of blocks %d (%d %d)", 4437 ext4_group_first_block_no(sb, group) + block,
4389 block, entry->start_blk, entry->count); 4438 "Block already on to-be-freed list");
4390 return 0; 4439 return 0;
4391 } 4440 }
4392 } 4441 }
@@ -4444,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4444 struct super_block *sb = inode->i_sb; 4493 struct super_block *sb = inode->i_sb;
4445 struct ext4_allocation_context *ac = NULL; 4494 struct ext4_allocation_context *ac = NULL;
4446 struct ext4_group_desc *gdp; 4495 struct ext4_group_desc *gdp;
4447 struct ext4_super_block *es;
4448 unsigned long freed = 0; 4496 unsigned long freed = 0;
4449 unsigned int overflow; 4497 unsigned int overflow;
4450 ext4_grpblk_t bit; 4498 ext4_grpblk_t bit;
@@ -4463,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4463 } 4511 }
4464 4512
4465 sbi = EXT4_SB(sb); 4513 sbi = EXT4_SB(sb);
4466 es = EXT4_SB(sb)->s_es;
4467 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4514 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4468 !ext4_data_block_valid(sbi, block, count)) { 4515 !ext4_data_block_valid(sbi, block, count)) {
4469 ext4_error(sb, "Freeing blocks not in datazone - " 4516 ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4484,12 +4531,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4484 if (!bh) 4531 if (!bh)
4485 tbh = sb_find_get_block(inode->i_sb, 4532 tbh = sb_find_get_block(inode->i_sb,
4486 block + i); 4533 block + i);
4487 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4534 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4488 inode, tbh, block + i); 4535 inode, tbh, block + i);
4489 } 4536 }
4490 } 4537 }
4491 4538
4492 /* 4539 /*
4493 * We need to make sure we don't reuse the freed block until 4540 * We need to make sure we don't reuse the freed block until
4494 * after the transaction is committed, which we can do by 4541 * after the transaction is committed, which we can do by
4495 * treating the block as metadata, below. We make an 4542 * treating the block as metadata, below. We make an
@@ -4597,6 +4644,8 @@ do_more:
4597 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4598 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
4599 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4600 } 4649 }
4601 4650
4602 ret = ext4_free_blks_count(sb, gdp) + count; 4651 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4610,7 +4659,7 @@ do_more:
4610 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4659 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4611 } 4660 }
4612 4661
4613 ext4_mb_release_desc(&e4b); 4662 ext4_mb_unload_buddy(&e4b);
4614 4663
4615 freed += count; 4664 freed += count;
4616 4665
@@ -4630,7 +4679,7 @@ do_more:
4630 put_bh(bitmap_bh); 4679 put_bh(bitmap_bh);
4631 goto do_more; 4680 goto do_more;
4632 } 4681 }
4633 sb->s_dirt = 1; 4682 ext4_mark_super_dirty(sb);
4634error_return: 4683error_return:
4635 if (freed) 4684 if (freed)
4636 dquot_free_block(inode, freed); 4685 dquot_free_block(inode, freed);
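
The ext4_mb_new_blocks() hunks above replace the out1/out2/out3 exit ladder with a single out: label (plus an errout: path for regular-allocator failures), relying on a NULL check before kmem_cache_free(). Below is a minimal userspace sketch of that single-exit cleanup idiom; the names (do_alloc, setup_ctx, struct ctx) are illustrative, not ext4's.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx { int initialized; };

static int setup_ctx(struct ctx *c)
{
	c->initialized = 1;
	return 0;
}

static int do_alloc(int *errp)
{
	struct ctx *c;

	c = malloc(sizeof(*c));
	if (!c) {
		*errp = -ENOMEM;
		goto out;	/* one exit label instead of out1/out2/out3 */
	}
	*errp = setup_ctx(c);
	if (*errp)
		goto out;

	/* ... the actual allocation work would run here ... */
out:
	/* free(NULL) is a no-op; the patch keeps an explicit "if (ac)"
	 * because kmem_cache_free() makes no such guarantee. */
	free(c);
	return *errp;
}

int main(void)
{
	int err = 0;

	printf("do_alloc -> %d (err=%d)\n", do_alloc(&err), err);
	return 0;
}

The payoff is that every early return funnels through one cleanup site, so adding a new failure case cannot leak the context or the quota reservation.
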
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..1765c2c50a9b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ei->i_flags |= EXT4_EXTENTS_FL; 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
475 */ 475 */
476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
477 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
478 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
479 return -EINVAL; 479 return -EINVAL;
480 480
481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
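
The migrate.c hunks belong to the tree-wide switch from open-coded i_flags masking to the ext4_{test,set,clear}_inode_flag() helpers, which use atomic bit operations so concurrent read-modify-write updates cannot lose flags. A rough userspace analogue using GCC/Clang atomic builtins follows; the helper names and bit numbers mirror ext4's flag layout, but this is a sketch, not the kernel's code.

#include <stdio.h>

enum { INODE_INDEX = 12, INODE_EXTENTS = 19 };	/* same bit positions as EXT4_INDEX_FL, EXT4_EXTENTS_FL */

struct inode { unsigned long flags; };

/* Atomic read-modify-write, like the kernel's set_bit()/clear_bit()/test_bit(). */
static void set_inode_flag(struct inode *i, int bit)
{
	__atomic_fetch_or(&i->flags, 1UL << bit, __ATOMIC_RELAXED);
}

static void clear_inode_flag(struct inode *i, int bit)
{
	__atomic_fetch_and(&i->flags, ~(1UL << bit), __ATOMIC_RELAXED);
}

static int test_inode_flag(const struct inode *i, int bit)
{
	return (__atomic_load_n(&i->flags, __ATOMIC_RELAXED) >> bit) & 1;
}

int main(void)
{
	struct inode ino = { 0 };

	/* The non-atomic form, ino.flags |= 1UL << bit, can lose a concurrent
	 * update between its load and its store; the atomic form cannot. */
	set_inode_flag(&ino, INODE_EXTENTS);
	printf("extents=%d index=%d\n",
	       test_inode_flag(&ino, INODE_EXTENTS),
	       test_inode_flag(&ino, INODE_INDEX));
	clear_inode_flag(&ino, INODE_EXTENTS);
	return 0;
}
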
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..5f1ed9fc913c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
148 */ 148 */
149static int 149static int
150mext_check_null_inode(struct inode *inode1, struct inode *inode2, 150mext_check_null_inode(struct inode *inode1, struct inode *inode2,
151 const char *function) 151 const char *function, unsigned int line)
152{ 152{
153 int ret = 0; 153 int ret = 0;
154 154
155 if (inode1 == NULL) { 155 if (inode1 == NULL) {
156 __ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function, line,
157 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
158 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
159 ret = -EIO; 159 ret = -EIO;
160 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
161 __ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function, line,
162 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
163 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
164 ret = -EIO; 164 ret = -EIO;
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
482 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
483 int ret; 483 int ret;
484 484
485 start_ext.ee_block = end_ext.ee_block = 0;
485 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
486 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
487 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
529 * new_ext |-------| 530 * new_ext |-------|
530 */ 531 */
531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
532 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
533 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
534 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
535 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 while (1) { 693 while (1) {
693 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
694 if (!dext) { 695 if (!dext) {
695 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
696 "The extent for donor must be found"); 697 "The extent for donor must be found");
697 *err = -EIO; 698 *err = -EIO;
698 goto out; 699 goto out;
699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
700 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
701 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
702 "extent(%u) should be equal", 703 "extent(%u) should be equal",
703 donor_off, 704 donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
959 return -EINVAL; 960 return -EINVAL;
960 } 961 }
961 962
963 if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
964 return -EPERM;
965
962 /* Ext4 move extent does not support swapfile */ 966 /* Ext4 move extent does not support swapfile */
963 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 967 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
964 ext4_debug("ext4 move extent: The argument files should " 968 ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
976 } 980 }
977 981
978 /* Ext4 move extent supports only extent based file */ 982 /* Ext4 move extent supports only extent based file */
979 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 983 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
980 ext4_debug("ext4 move extent: orig file is not extents " 984 ext4_debug("ext4 move extent: orig file is not extents "
981 "based file [ino:orig %lu]\n", orig_inode->i_ino); 985 "based file [ino:orig %lu]\n", orig_inode->i_ino);
982 return -EOPNOTSUPP; 986 return -EOPNOTSUPP;
983 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 987 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
984 ext4_debug("ext4 move extent: donor file is not extents " 988 ext4_debug("ext4 move extent: donor file is not extents "
985 "based file [ino:donor %lu]\n", donor_inode->i_ino); 989 "based file [ino:donor %lu]\n", donor_inode->i_ino);
986 return -EOPNOTSUPP; 990 return -EOPNOTSUPP;
@@ -1080,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1080 1084
1081 BUG_ON(inode1 == NULL && inode2 == NULL); 1085 BUG_ON(inode1 == NULL && inode2 == NULL);
1082 1086
1083 ret = mext_check_null_inode(inode1, inode2, __func__); 1087 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1084 if (ret < 0) 1088 if (ret < 0)
1085 goto out; 1089 goto out;
1086 1090
@@ -1117,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1117 1121
1118 BUG_ON(inode1 == NULL && inode2 == NULL); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1119 1123
1120 ret = mext_check_null_inode(inode1, inode2, __func__); 1124 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1121 if (ret < 0) 1125 if (ret < 0)
1122 goto out; 1126 goto out;
1123 1127
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1354 if (ret1 < 0) 1358 if (ret1 < 0)
1355 break; 1359 break;
1356 if (*moved_len > len) { 1360 if (*moved_len > len) {
1357 ext4_error(orig_inode->i_sb, 1361 EXT4_ERROR_INODE(orig_inode,
1358 "We replaced blocks too much! " 1362 "We replaced blocks too much! "
1359 "sum of replaced: %llu requested: %llu", 1363 "sum of replaced: %llu requested: %llu",
1360 *moved_len, len); 1364 *moved_len, len);
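
The recurring pattern in the move_extent.c hunks is threading __func__ and __LINE__ into the error helpers so each report names its exact call site; EXT4_ERROR_INODE and mext_check_null_inode() now both carry a line argument. A compilable sketch of the macro technique, with made-up names (report_error, report_error_at):

#include <stdarg.h>
#include <stdio.h>

static void report_error_at(const char *function, unsigned int line,
			    const char *fmt, ...)
{
	va_list args;

	fprintf(stderr, "fs error: %s:%u: ", function, line);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputc('\n', stderr);
}

/* The call site is captured automatically at expansion time,
 * as EXT4_ERROR_INODE does with __func__ and __LINE__. */
#define report_error(...) report_error_at(__func__, __LINE__, __VA_ARGS__)

int main(void)
{
	report_error("donor offset %u and extent block %u should be equal", 7, 9);
	return 0;
}

Because the macro expands at the caller, the function and line in the message identify the failing check rather than the shared helper.
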
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..314c0d3b3fa9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
180 struct inode *inode); 180 struct inode *inode);
181 181
182unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
183{
184 unsigned len = le16_to_cpu(dlen);
185
186 if (len == EXT4_MAX_REC_LEN || len == 0)
187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16);
189}
190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
194 BUG();
195 if (len < 65536)
196 return cpu_to_le16(len);
197 if (len == blocksize) {
198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else
201 return cpu_to_le16(0);
202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
204}
205
206/* 182/*
207 * p is at least 6 bytes before the end of page 183 * p is at least 6 bytes before the end of page
208 */ 184 */
@@ -349,7 +325,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 325 brelse(bh);
350 } 326 }
351 if (bcount) 327 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 328 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 329 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 330 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 331 return (struct stats) { names, space, bcount};
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
605 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
606 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
607 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
608 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 584 if (!ext4_check_dir_entry(dir, de, bh,
609 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
610 +((char *)de - bh->b_data))) { 586 +((char *)de - bh->b_data))) {
611 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
@@ -653,10 +629,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 629 int ret, err;
654 __u32 hashval; 630 __u32 hashval;
655 631
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 632 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 633 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 635 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 637 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 638 hinfo.hash_version +=
@@ -801,7 +777,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 777{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 778 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 779 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 780 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 781}
806 782
807/* 783/*
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
844 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
845 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
846 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
847 if (!ext4_check_dir_entry("ext4_find_entry", 823 if (!ext4_check_dir_entry(dir, de, bh, offset))
848 dir, de, bh, offset))
849 return -1; 824 return -1;
850 *res_dir = de; 825 *res_dir = de;
851 return 1; 826 return 1;
@@ -943,8 +918,8 @@ restart:
943 wait_on_buffer(bh); 918 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 919 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 920 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 921 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 922 (unsigned long) block);
948 brelse(bh); 923 brelse(bh);
949 goto next; 924 goto next;
950 } 925 }
@@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1019 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1020 + ((char *) de - bh->b_data); 995 + ((char *) de - bh->b_data);
1021 996
1022 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { 997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
1023 brelse(bh); 998 brelse(bh);
1024 *err = ERR_BAD_DX_DIR; 999 *err = ERR_BAD_DX_DIR;
1025 goto errout; 1000 goto errout;
@@ -1066,15 +1041,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1041 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1042 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1043 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1044 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1045 return ERR_PTR(-EIO);
1071 } 1046 }
1072 inode = ext4_iget(dir->i_sb, ino); 1047 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1048 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1049 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1050 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1051 "deleted inode referenced: %u",
1077 ino); 1052 ino);
1078 return ERR_PTR(-EIO); 1053 return ERR_PTR(-EIO);
1079 } else { 1054 } else {
1080 return ERR_CAST(inode); 1055 return ERR_CAST(inode);
@@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1088struct dentry *ext4_get_parent(struct dentry *child) 1063struct dentry *ext4_get_parent(struct dentry *child)
1089{ 1064{
1090 __u32 ino; 1065 __u32 ino;
1091 struct inode *inode;
1092 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1093 .name = "..", 1067 .name = "..",
1094 .len = 2, 1068 .len = 2,
@@ -1097,15 +1071,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1097 struct buffer_head *bh; 1071 struct buffer_head *bh;
1098 1072
1099 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1100 inode = NULL;
1101 if (!bh) 1074 if (!bh)
1102 return ERR_PTR(-ENOENT); 1075 return ERR_PTR(-ENOENT);
1103 ino = le32_to_cpu(de->inode); 1076 ino = le32_to_cpu(de->inode);
1104 brelse(bh); 1077 brelse(bh);
1105 1078
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1080 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1081 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1082 return ERR_PTR(-EIO);
1110 } 1083 }
1111 1084
@@ -1141,7 +1114,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1114 unsigned rec_len = 0;
1142 1115
1143 while (count--) { 1116 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1117 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1118 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1119 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1120 memcpy (to, de, rec_len);
@@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1305 de = (struct ext4_dir_entry_2 *)bh->b_data; 1278 de = (struct ext4_dir_entry_2 *)bh->b_data;
1306 top = bh->b_data + blocksize - reclen; 1279 top = bh->b_data + blocksize - reclen;
1307 while ((char *) de <= top) { 1280 while ((char *) de <= top) {
1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1281 if (!ext4_check_dir_entry(dir, de, bh, offset))
1309 bh, offset))
1310 return -EIO; 1282 return -EIO;
1311 if (ext4_match(namelen, name, de)) 1283 if (ext4_match(namelen, name, de))
1312 return -EEXIST; 1284 return -EEXIST;
@@ -1404,9 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1376 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1377 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1378 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1379 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1380 brelse(bh);
1411 return -EIO; 1381 return -EIO;
1412 } 1382 }
@@ -1418,7 +1388,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1388 brelse(bh);
1419 return retval; 1389 return retval;
1420 } 1390 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1391 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1392 data1 = bh2->b_data;
1423 1393
1424 memcpy (data1, de, len); 1394 memcpy (data1, de, len);
@@ -1491,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1461 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1462 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1463 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1464 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1465 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1466 ext4_mark_inode_dirty(handle, dir);
1497 } 1467 }
@@ -1519,6 +1489,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1489 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1490 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1491 brelse(bh);
1492 if (retval == 0)
1493 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1494 return retval;
1523} 1495}
1524 1496
@@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle,
1673 pde = NULL; 1645 pde = NULL;
1674 de = (struct ext4_dir_entry_2 *) bh->b_data; 1646 de = (struct ext4_dir_entry_2 *) bh->b_data;
1675 while (i < bh->b_size) { 1647 while (i < bh->b_size) {
1676 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) 1648 if (!ext4_check_dir_entry(dir, de, bh, i))
1677 return -EIO; 1649 return -EIO;
1678 if (de == de_del) { 1650 if (de == de_del) {
1679 BUFFER_TRACE(bh, "get_write_access"); 1651 BUFFER_TRACE(bh, "get_write_access");
@@ -1915,9 +1887,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1887 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1888 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1889 if (err)
1918 ext4_error(inode->i_sb, 1890 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1891 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1892 else
1922 ext4_warning(inode->i_sb, 1893 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1894 "bad directory (dir #%lu) - no data block",
@@ -1941,23 +1912,23 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1912 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1913 while (offset < inode->i_size) {
1943 if (!bh || 1914 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1915 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1916 unsigned int lblock;
1945 err = 0; 1917 err = 0;
1946 brelse(bh); 1918 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1919 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1920 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1921 if (!bh) {
1950 if (err) 1922 if (err)
1951 ext4_error(sb, 1923 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1924 "error %d reading directory "
1953 " #%lu offset %u", 1925 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1926 offset += sb->s_blocksize;
1956 continue; 1927 continue;
1957 } 1928 }
1958 de = (struct ext4_dir_entry_2 *) bh->b_data; 1929 de = (struct ext4_dir_entry_2 *) bh->b_data;
1959 } 1930 }
1960 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { 1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) {
1961 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1932 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1962 sb->s_blocksize); 1933 sb->s_blocksize);
1963 offset = (offset | (sb->s_blocksize - 1)) + 1; 1934 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2297,7 +2268,7 @@ retry:
2297 } 2268 }
2298 } else { 2269 } else {
2299 /* clear the extent format for fast symlink */ 2270 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2271 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2272 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2273 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2274 inode->i_size = l-1;
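
The block deleted at the top of namei.c is the directory rec_len codec; the deletion only relocates it (the merged tree keeps these helpers as inlines in a header). The trick: directory entries are 4-byte aligned, so the low two bits of a record length are always zero on disk and can hold bits 16-17 instead, letting a 16-bit field describe blocks up to 256 KiB. A standalone version of the same arithmetic, where MAX_REC_LEN stands in for EXT4_MAX_REC_LEN and the little-endian conversion is omitted:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* 65535, the on-disk "whole block" marker */

/* Same arithmetic as the removed ext4_rec_len_to_disk(): rec_len is a
 * multiple of 4, so bits 0-1 are free to carry bits 16-17 of large lengths. */
static uint16_t rec_len_to_disk(unsigned len, unsigned blocksize)
{
	assert(len <= blocksize && blocksize <= (1 << 18) && !(len & 3));
	if (len < 65536)
		return (uint16_t)len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	return (uint16_t)((len & 65532) | ((len >> 16) & 3));
}

static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
{
	unsigned len = dlen;

	if (len == MAX_REC_LEN || len == 0)
		return blocksize;
	return (len & 65532) | ((len & 3) << 16);
}

int main(void)
{
	unsigned blocksize = 1 << 17;	/* 128 KiB block, needs the folded bits */
	unsigned len = 70000;		/* already 4-byte aligned */

	printf("%u -> 0x%04x -> %u\n", len,
	       rec_len_to_disk(len, blocksize),
	       rec_len_from_disk(rec_len_to_disk(len, blocksize), blocksize));
	return 0;
}

Running this prints "70000 -> 0x1171 -> 70000": the length survives the round trip even though it does not fit in 16 bits directly.
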
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..ca5c8aa00a2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
@@ -920,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 &sbi->s_flex_groups[flex_group].free_inodes); 921 &sbi->s_flex_groups[flex_group].free_inodes);
921 } 922 }
922 923
923 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 924 ext4_handle_dirty_super(handle, sb);
924 sb->s_dirt = 1;
925 925
926exit_journal: 926exit_journal:
927 mutex_unlock(&sbi->s_resize_lock); 927 mutex_unlock(&sbi->s_resize_lock);
@@ -952,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
952 ext4_fsblk_t n_blocks_count) 952 ext4_fsblk_t n_blocks_count)
953{ 953{
954 ext4_fsblk_t o_blocks_count; 954 ext4_fsblk_t o_blocks_count;
955 ext4_group_t o_groups_count;
956 ext4_grpblk_t last; 955 ext4_grpblk_t last;
957 ext4_grpblk_t add; 956 ext4_grpblk_t add;
958 struct buffer_head *bh; 957 struct buffer_head *bh;
@@ -964,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
964 * yet: we're going to revalidate es->s_blocks_count after 963 * yet: we're going to revalidate es->s_blocks_count after
965 * taking the s_resize_lock below. */ 964 * taking the s_resize_lock below. */
966 o_blocks_count = ext4_blocks_count(es); 965 o_blocks_count = ext4_blocks_count(es);
967 o_groups_count = EXT4_SB(sb)->s_groups_count;
968 966
969 if (test_opt(sb, DEBUG)) 967 if (test_opt(sb, DEBUG))
970 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 968 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
@@ -1044,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1044 goto exit_put; 1042 goto exit_put;
1045 } 1043 }
1046 ext4_blocks_count_set(es, o_blocks_count + add); 1044 ext4_blocks_count_set(es, o_blocks_count + add);
1047 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1048 sb->s_dirt = 1;
1049 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1045 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1050 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1046 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1051 o_blocks_count + add); 1047 o_blocks_count + add);
1052 /* We add the blocks to the bitmap and set the group need init bit */ 1048 /* We add the blocks to the bitmap and set the group need init bit */
1053 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1049 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1050 ext4_handle_dirty_super(handle, sb);
1054 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1051 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1055 o_blocks_count + add); 1052 o_blocks_count + add);
1056 if ((err = ext4_journal_stop(handle))) 1053 if ((err = ext4_journal_stop(handle)))
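
The ext4_group_add() hunk above adds an s_log_groups_per_flex guard: a filesystem can advertise the FLEX_BG feature while grouping is effectively disabled (a log of zero), in which case the per-flex counters were never set up and must not be touched. ext4_flex_group() itself reduces to a shift by that log. A small illustration of the mapping, with made-up values:

#include <stdio.h>

/* A flex group is 2^log consecutive block groups whose metadata is
 * clustered together; mapping a group to its flex group is one shift. */
static unsigned flex_group(unsigned group, unsigned log_groups_per_flex)
{
	return group >> log_groups_per_flex;
}

int main(void)
{
	unsigned log = 4;	/* 16 block groups per flex group, illustrative */

	for (unsigned group = 0; group < 48; group += 16)
		printf("block group %2u -> flex group %u\n",
		       group, flex_group(group, log));

	/* With log == 0 every group would be its own flex group; the hunk
	 * above skips the per-flex accounting entirely in that case. */
	return 0;
}
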
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..26147746c272 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,13 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_TRANS);
244 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
245 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
246 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
247 journal = EXT4_SB(sb)->s_journal; 248 journal = EXT4_SB(sb)->s_journal;
248 if (journal) { 249 if (journal) {
249 if (is_journal_aborted(journal)) { 250 if (is_journal_aborted(journal)) {
250 ext4_abort(sb, __func__, "Detected aborted journal"); 251 ext4_abort(sb, "Detected aborted journal");
251 return ERR_PTR(-EROFS); 252 return ERR_PTR(-EROFS);
252 } 253 }
253 return jbd2_journal_start(journal, nblocks); 254 return jbd2_journal_start(journal, nblocks);
@@ -261,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
261 * that sync() will call the filesystem's write_super callback if 262 * that sync() will call the filesystem's write_super callback if
262 * appropriate. 263 * appropriate.
263 */ 264 */
264int __ext4_journal_stop(const char *where, handle_t *handle) 265int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
265{ 266{
266 struct super_block *sb; 267 struct super_block *sb;
267 int err; 268 int err;
@@ -278,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
278 if (!err) 279 if (!err)
279 err = rc; 280 err = rc;
280 if (err) 281 if (err)
281 __ext4_std_error(sb, where, err); 282 __ext4_std_error(sb, where, line, err);
282 return err; 283 return err;
283} 284}
284 285
285void ext4_journal_abort_handle(const char *caller, const char *err_fn, 286void ext4_journal_abort_handle(const char *caller, unsigned int line,
286 struct buffer_head *bh, handle_t *handle, int err) 287 const char *err_fn, struct buffer_head *bh,
288 handle_t *handle, int err)
287{ 289{
288 char nbuf[16]; 290 char nbuf[16];
289 const char *errstr = ext4_decode_error(NULL, err, nbuf); 291 const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -299,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
299 if (is_handle_aborted(handle)) 301 if (is_handle_aborted(handle))
300 return; 302 return;
301 303
302 printk(KERN_ERR "%s: aborting transaction: %s in %s\n", 304 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
303 caller, errstr, err_fn); 305 caller, line, errstr, err_fn);
304 306
305 jbd2_journal_abort_handle(handle); 307 jbd2_journal_abort_handle(handle);
306} 308}
307 309
310static void __save_error_info(struct super_block *sb, const char *func,
311 unsigned int line)
312{
313 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
314
315 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
316 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
317 es->s_last_error_time = cpu_to_le32(get_seconds());
318 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
319 es->s_last_error_line = cpu_to_le32(line);
320 if (!es->s_first_error_time) {
321 es->s_first_error_time = es->s_last_error_time;
322 strncpy(es->s_first_error_func, func,
323 sizeof(es->s_first_error_func));
324 es->s_first_error_line = cpu_to_le32(line);
325 es->s_first_error_ino = es->s_last_error_ino;
326 es->s_first_error_block = es->s_last_error_block;
327 }
328 /*
329 * Start the daily error reporting function if it hasn't been
330 * started already
331 */
332 if (!es->s_error_count)
333 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
334 es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
335}
336
337static void save_error_info(struct super_block *sb, const char *func,
338 unsigned int line)
339{
340 __save_error_info(sb, func, line);
341 ext4_commit_super(sb, 1);
342}
343
344
308/* Deal with the reporting of failure conditions on a filesystem such as 345/* Deal with the reporting of failure conditions on a filesystem such as
309 * inconsistencies detected or read IO failures. 346 * inconsistencies detected or read IO failures.
310 * 347 *
@@ -322,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
322 359
323static void ext4_handle_error(struct super_block *sb) 360static void ext4_handle_error(struct super_block *sb)
324{ 361{
325 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
326
327 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
328 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
329
330 if (sb->s_flags & MS_RDONLY) 362 if (sb->s_flags & MS_RDONLY)
331 return; 363 return;
332 364
@@ -341,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb)
341 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 373 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
342 sb->s_flags |= MS_RDONLY; 374 sb->s_flags |= MS_RDONLY;
343 } 375 }
344 ext4_commit_super(sb, 1);
345 if (test_opt(sb, ERRORS_PANIC)) 376 if (test_opt(sb, ERRORS_PANIC))
346 panic("EXT4-fs (device %s): panic forced after error\n", 377 panic("EXT4-fs (device %s): panic forced after error\n",
347 sb->s_id); 378 sb->s_id);
348} 379}
349 380
350void __ext4_error(struct super_block *sb, const char *function, 381void __ext4_error(struct super_block *sb, const char *function,
351 const char *fmt, ...) 382 unsigned int line, const char *fmt, ...)
352{ 383{
353 va_list args; 384 va_list args;
354 385
355 va_start(args, fmt); 386 va_start(args, fmt);
356 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
388 sb->s_id, function, line, current->comm);
357 vprintk(fmt, args); 389 vprintk(fmt, args);
358 printk("\n"); 390 printk("\n");
359 va_end(args); 391 va_end(args);
@@ -361,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function,
361 ext4_handle_error(sb); 393 ext4_handle_error(sb);
362} 394}
363 395
364void ext4_error_inode(const char *function, struct inode *inode, 396void ext4_error_inode(struct inode *inode, const char *function,
397 unsigned int line, ext4_fsblk_t block,
365 const char *fmt, ...) 398 const char *fmt, ...)
366{ 399{
367 va_list args; 400 va_list args;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 402
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line);
369 va_start(args, fmt); 406 va_start(args, fmt);
370 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", 407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
371 inode->i_sb->s_id, function, inode->i_ino, current->comm); 408 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block)
410 printk("block %llu: ", block);
411 printk("comm %s: ", current->comm);
372 vprintk(fmt, args); 412 vprintk(fmt, args);
373 printk("\n"); 413 printk("\n");
374 va_end(args); 414 va_end(args);
@@ -376,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
376 ext4_handle_error(inode->i_sb); 416 ext4_handle_error(inode->i_sb);
377} 417}
378 418
379void ext4_error_file(const char *function, struct file *file, 419void ext4_error_file(struct file *file, const char *function,
380 const char *fmt, ...) 420 unsigned int line, const char *fmt, ...)
381{ 421{
382 va_list args; 422 va_list args;
423 struct ext4_super_block *es;
383 struct inode *inode = file->f_dentry->d_inode; 424 struct inode *inode = file->f_dentry->d_inode;
384 char pathname[80], *path; 425 char pathname[80], *path;
385 426
427 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line);
386 va_start(args, fmt); 430 va_start(args, fmt);
387 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 431 path = d_path(&(file->f_path), pathname, sizeof(pathname));
388 if (!path) 432 if (!path)
389 path = "(unknown)"; 433 path = "(unknown)";
390 printk(KERN_CRIT 434 printk(KERN_CRIT
391 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", 435 "EXT4-fs error (device %s): %s:%d: inode #%lu "
392 inode->i_sb->s_id, function, inode->i_ino, current->comm, path); 436 "(comm %s path %s): ",
437 inode->i_sb->s_id, function, line, inode->i_ino,
438 current->comm, path);
393 vprintk(fmt, args); 439 vprintk(fmt, args);
394 printk("\n"); 440 printk("\n");
395 va_end(args); 441 va_end(args);
@@ -434,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
434/* __ext4_std_error decodes expected errors from journaling functions 480/* __ext4_std_error decodes expected errors from journaling functions
435 * automatically and invokes the appropriate error response. */ 481 * automatically and invokes the appropriate error response. */
436 482
437void __ext4_std_error(struct super_block *sb, const char *function, int errno) 483void __ext4_std_error(struct super_block *sb, const char *function,
484 unsigned int line, int errno)
438{ 485{
439 char nbuf[16]; 486 char nbuf[16];
440 const char *errstr; 487 const char *errstr;
@@ -447,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
447 return; 494 return;
448 495
449 errstr = ext4_decode_error(sb, errno, nbuf); 496 errstr = ext4_decode_error(sb, errno, nbuf);
450 printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", 497 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
451 sb->s_id, function, errstr); 498 sb->s_id, function, line, errstr);
499 save_error_info(sb, function, line);
452 500
453 ext4_handle_error(sb); 501 ext4_handle_error(sb);
454} 502}
@@ -463,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
463 * case we take the easy way out and panic immediately. 511 * case we take the easy way out and panic immediately.
464 */ 512 */
465 513
466void ext4_abort(struct super_block *sb, const char *function, 514void __ext4_abort(struct super_block *sb, const char *function,
467 const char *fmt, ...) 515 unsigned int line, const char *fmt, ...)
468{ 516{
469 va_list args; 517 va_list args;
470 518
519 save_error_info(sb, function, line);
471 va_start(args, fmt); 520 va_start(args, fmt);
472 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 521 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
522 function, line);
473 vprintk(fmt, args); 523 vprintk(fmt, args);
474 printk("\n"); 524 printk("\n");
475 va_end(args); 525 va_end(args);
476 526
527 if ((sb->s_flags & MS_RDONLY) == 0) {
528 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
529 sb->s_flags |= MS_RDONLY;
530 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
531 if (EXT4_SB(sb)->s_journal)
532 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
533 save_error_info(sb, function, line);
534 }
477 if (test_opt(sb, ERRORS_PANIC)) 535 if (test_opt(sb, ERRORS_PANIC))
478 panic("EXT4-fs panic from previous error\n"); 536 panic("EXT4-fs panic from previous error\n");
479
480 if (sb->s_flags & MS_RDONLY)
481 return;
482
483 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
484 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
485 sb->s_flags |= MS_RDONLY;
486 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
487 if (EXT4_SB(sb)->s_journal)
488 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
489} 537}
490 538
491void ext4_msg (struct super_block * sb, const char *prefix, 539void ext4_msg (struct super_block * sb, const char *prefix,
@@ -501,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
501} 549}
502 550
503void __ext4_warning(struct super_block *sb, const char *function, 551void __ext4_warning(struct super_block *sb, const char *function,
504 const char *fmt, ...) 552 unsigned int line, const char *fmt, ...)
505{ 553{
506 va_list args; 554 va_list args;
507 555
508 va_start(args, fmt); 556 va_start(args, fmt);
509 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", 557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
510 sb->s_id, function); 558 sb->s_id, function, line);
511 vprintk(fmt, args); 559 vprintk(fmt, args);
512 printk("\n"); 560 printk("\n");
513 va_end(args); 561 va_end(args);
514} 562}
515 563
516void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, 564void __ext4_grp_locked_error(const char *function, unsigned int line,
517 const char *function, const char *fmt, ...) 565 struct super_block *sb, ext4_group_t grp,
566 unsigned long ino, ext4_fsblk_t block,
567 const char *fmt, ...)
518__releases(bitlock) 568__releases(bitlock)
519__acquires(bitlock) 569__acquires(bitlock)
520{ 570{
521 va_list args; 571 va_list args;
522 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 572 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
523 573
574 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line);
524 va_start(args, fmt); 577 va_start(args, fmt);
525 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
579 sb->s_id, function, line, grp);
580 if (ino)
581 printk("inode %lu: ", ino);
582 if (block)
583 printk("block %llu:", (unsigned long long) block);
526 vprintk(fmt, args); 584 vprintk(fmt, args);
527 printk("\n"); 585 printk("\n");
528 va_end(args); 586 va_end(args);
529 587
530 if (test_opt(sb, ERRORS_CONT)) { 588 if (test_opt(sb, ERRORS_CONT)) {
531 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
532 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
533 ext4_commit_super(sb, 0); 589 ext4_commit_super(sb, 0);
534 return; 590 return;
535 } 591 }
592
536 ext4_unlock_group(sb, grp); 593 ext4_unlock_group(sb, grp);
537 ext4_handle_error(sb); 594 ext4_handle_error(sb);
538 /* 595 /*
@@ -645,6 +702,8 @@ static void ext4_put_super(struct super_block *sb)
645 struct ext4_super_block *es = sbi->s_es; 702 struct ext4_super_block *es = sbi->s_es;
646 int i, err; 703 int i, err;
647 704
705 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
706
648 flush_workqueue(sbi->dio_unwritten_wq); 707 flush_workqueue(sbi->dio_unwritten_wq);
649 destroy_workqueue(sbi->dio_unwritten_wq); 708 destroy_workqueue(sbi->dio_unwritten_wq);
650 709
@@ -657,8 +716,7 @@ static void ext4_put_super(struct super_block *sb)
657 err = jbd2_journal_destroy(sbi->s_journal); 716 err = jbd2_journal_destroy(sbi->s_journal);
658 sbi->s_journal = NULL; 717 sbi->s_journal = NULL;
659 if (err < 0) 718 if (err < 0)
660 ext4_abort(sb, __func__, 719 ext4_abort(sb, "Couldn't clean up the journal");
661 "Couldn't clean up the journal");
662 } 720 }
663 721
664 ext4_release_system_zone(sb); 722 ext4_release_system_zone(sb);
@@ -810,8 +868,10 @@ static void destroy_inodecache(void)
810 kmem_cache_destroy(ext4_inode_cachep); 868 kmem_cache_destroy(ext4_inode_cachep);
811} 869}
812 870
813static void ext4_clear_inode(struct inode *inode) 871void ext4_clear_inode(struct inode *inode)
814{ 872{
873 invalidate_inode_buffers(inode);
874 end_writeback(inode);
815 dquot_drop(inode); 875 dquot_drop(inode);
816 ext4_discard_preallocations(inode); 876 ext4_discard_preallocations(inode);
817 if (EXT4_JOURNAL(inode)) 877 if (EXT4_JOURNAL(inode))
@@ -941,14 +1001,14 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
941 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 1001 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
942 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 1002 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
943 seq_puts(seq, ",journal_async_commit"); 1003 seq_puts(seq, ",journal_async_commit");
944 if (test_opt(sb, NOBH)) 1004 else if (test_opt(sb, JOURNAL_CHECKSUM))
945 seq_puts(seq, ",nobh"); 1005 seq_puts(seq, ",journal_checksum");
946 if (test_opt(sb, I_VERSION)) 1006 if (test_opt(sb, I_VERSION))
947 seq_puts(seq, ",i_version"); 1007 seq_puts(seq, ",i_version");
948 if (!test_opt(sb, DELALLOC)) 1008 if (!test_opt(sb, DELALLOC) &&
1009 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
949 seq_puts(seq, ",nodelalloc"); 1010 seq_puts(seq, ",nodelalloc");
950 1011
951
952 if (sbi->s_stripe) 1012 if (sbi->s_stripe)
953 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1013 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
954 /* 1014 /*
@@ -972,7 +1032,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
972 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 1032 if (test_opt(sb, NO_AUTO_DA_ALLOC))
973 seq_puts(seq, ",noauto_da_alloc"); 1033 seq_puts(seq, ",noauto_da_alloc");
974 1034
975 if (test_opt(sb, DISCARD)) 1035 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
976 seq_puts(seq, ",discard"); 1036 seq_puts(seq, ",discard");
977 1037
978 if (test_opt(sb, NOLOAD)) 1038 if (test_opt(sb, NOLOAD))
@@ -981,6 +1041,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
981 if (test_opt(sb, DIOREAD_NOLOCK)) 1041 if (test_opt(sb, DIOREAD_NOLOCK))
982 seq_puts(seq, ",dioread_nolock"); 1042 seq_puts(seq, ",dioread_nolock");
983 1043
1044 if (test_opt(sb, BLOCK_VALIDITY) &&
1045 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1046 seq_puts(seq, ",block_validity");
1047
984 ext4_show_quota_options(seq, sb); 1048 ext4_show_quota_options(seq, sb);
985 1049
986 return 0; 1050 return 0;
@@ -1059,7 +1123,8 @@ static int ext4_release_dquot(struct dquot *dquot);
1059static int ext4_mark_dquot_dirty(struct dquot *dquot); 1123static int ext4_mark_dquot_dirty(struct dquot *dquot);
1060static int ext4_write_info(struct super_block *sb, int type); 1124static int ext4_write_info(struct super_block *sb, int type);
1061static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1125static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1062 char *path, int remount); 1126 char *path);
1127static int ext4_quota_off(struct super_block *sb, int type);
1063static int ext4_quota_on_mount(struct super_block *sb, int type); 1128static int ext4_quota_on_mount(struct super_block *sb, int type);
1064static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1129static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1065 size_t len, loff_t off); 1130 size_t len, loff_t off);
@@ -1081,12 +1146,12 @@ static const struct dquot_operations ext4_quota_operations = {
1081 1146
1082static const struct quotactl_ops ext4_qctl_operations = { 1147static const struct quotactl_ops ext4_qctl_operations = {
1083 .quota_on = ext4_quota_on, 1148 .quota_on = ext4_quota_on,
1084 .quota_off = vfs_quota_off, 1149 .quota_off = ext4_quota_off,
1085 .quota_sync = vfs_quota_sync, 1150 .quota_sync = dquot_quota_sync,
1086 .get_info = vfs_get_dqinfo, 1151 .get_info = dquot_get_dqinfo,
1087 .set_info = vfs_set_dqinfo, 1152 .set_info = dquot_set_dqinfo,
1088 .get_dqblk = vfs_get_dqblk, 1153 .get_dqblk = dquot_get_dqblk,
1089 .set_dqblk = vfs_set_dqblk 1154 .set_dqblk = dquot_set_dqblk
1090}; 1155};
1091#endif 1156#endif
1092 1157
@@ -1095,14 +1160,13 @@ static const struct super_operations ext4_sops = {
1095 .destroy_inode = ext4_destroy_inode, 1160 .destroy_inode = ext4_destroy_inode,
1096 .write_inode = ext4_write_inode, 1161 .write_inode = ext4_write_inode,
1097 .dirty_inode = ext4_dirty_inode, 1162 .dirty_inode = ext4_dirty_inode,
1098 .delete_inode = ext4_delete_inode, 1163 .evict_inode = ext4_evict_inode,
1099 .put_super = ext4_put_super, 1164 .put_super = ext4_put_super,
1100 .sync_fs = ext4_sync_fs, 1165 .sync_fs = ext4_sync_fs,
1101 .freeze_fs = ext4_freeze, 1166 .freeze_fs = ext4_freeze,
1102 .unfreeze_fs = ext4_unfreeze, 1167 .unfreeze_fs = ext4_unfreeze,
1103 .statfs = ext4_statfs, 1168 .statfs = ext4_statfs,
1104 .remount_fs = ext4_remount, 1169 .remount_fs = ext4_remount,
1105 .clear_inode = ext4_clear_inode,
1106 .show_options = ext4_show_options, 1170 .show_options = ext4_show_options,
1107#ifdef CONFIG_QUOTA 1171#ifdef CONFIG_QUOTA
1108 .quota_read = ext4_quota_read, 1172 .quota_read = ext4_quota_read,
@@ -1116,12 +1180,11 @@ static const struct super_operations ext4_nojournal_sops = {
1116 .destroy_inode = ext4_destroy_inode, 1180 .destroy_inode = ext4_destroy_inode,
1117 .write_inode = ext4_write_inode, 1181 .write_inode = ext4_write_inode,
1118 .dirty_inode = ext4_dirty_inode, 1182 .dirty_inode = ext4_dirty_inode,
1119 .delete_inode = ext4_delete_inode, 1183 .evict_inode = ext4_evict_inode,
1120 .write_super = ext4_write_super, 1184 .write_super = ext4_write_super,
1121 .put_super = ext4_put_super, 1185 .put_super = ext4_put_super,
1122 .statfs = ext4_statfs, 1186 .statfs = ext4_statfs,
1123 .remount_fs = ext4_remount, 1187 .remount_fs = ext4_remount,
1124 .clear_inode = ext4_clear_inode,
1125 .show_options = ext4_show_options, 1188 .show_options = ext4_show_options,
1126#ifdef CONFIG_QUOTA 1189#ifdef CONFIG_QUOTA
1127 .quota_read = ext4_quota_read, 1190 .quota_read = ext4_quota_read,
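The two super_operations hunks above track the 2.6.36 VFS change that folds ->delete_inode and ->clear_inode into a single ->evict_inode, called for every inode leaving memory; i_nlink decides whether the old delete path (freeing data) or the old clear path (just dropping state) applies. A hedged skeleton of the new callback shape, not ext4's actual body:

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hedged sketch of a 2.6.36-era ->evict_inode: one callback now
     * covers both unlinked inodes and cached inodes being dropped. */
    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);  /* drop page cache */
        if (!inode->i_nlink && !is_bad_inode(inode)) {
            /* old delete_inode work: truncate and free on-disk blocks */
        }
        end_writeback(inode);  /* detach from the writeback lists */
        /* old clear_inode work: release fs-private state */
    }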
@@ -1619,10 +1682,12 @@ set_qf_format:
1619 *n_blocks_count = option; 1682 *n_blocks_count = option;
1620 break; 1683 break;
1621 case Opt_nobh: 1684 case Opt_nobh:
1622 set_opt(sbi->s_mount_opt, NOBH); 1685 ext4_msg(sb, KERN_WARNING,
1686 "Ignoring deprecated nobh option");
1623 break; 1687 break;
1624 case Opt_bh: 1688 case Opt_bh:
1625 clear_opt(sbi->s_mount_opt, NOBH); 1689 ext4_msg(sb, KERN_WARNING,
1690 "Ignoring deprecated bh option");
1626 break; 1691 break;
1627 case Opt_i_version: 1692 case Opt_i_version:
1628 set_opt(sbi->s_mount_opt, I_VERSION); 1693 set_opt(sbi->s_mount_opt, I_VERSION);
@@ -2051,7 +2116,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2051 /* Turn quotas off */ 2116 /* Turn quotas off */
2052 for (i = 0; i < MAXQUOTAS; i++) { 2117 for (i = 0; i < MAXQUOTAS; i++) {
2053 if (sb_dqopt(sb)->files[i]) 2118 if (sb_dqopt(sb)->files[i])
2054 vfs_quota_off(sb, i, 0); 2119 dquot_quota_off(sb, i);
2055 } 2120 }
2056#endif 2121#endif
2057 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2122 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2278,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2213struct ext4_attr { 2278struct ext4_attr {
2214 struct attribute attr; 2279 struct attribute attr;
2215 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2280 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2216 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2281 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2217 const char *, size_t); 2282 const char *, size_t);
2218 int offset; 2283 int offset;
2219}; 2284};
@@ -2244,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2244{ 2309{
2245 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2310 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2246 2311
2312 if (!sb->s_bdev->bd_part)
2313 return snprintf(buf, PAGE_SIZE, "0\n");
2247 return snprintf(buf, PAGE_SIZE, "%lu\n", 2314 return snprintf(buf, PAGE_SIZE, "%lu\n",
2248 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2315 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2249 sbi->s_sectors_written_start) >> 1); 2316 sbi->s_sectors_written_start) >> 1);
@@ -2254,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2254{ 2321{
2255 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2322 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2256 2323
2324 if (!sb->s_bdev->bd_part)
2325 return snprintf(buf, PAGE_SIZE, "0\n");
2257 return snprintf(buf, PAGE_SIZE, "%llu\n", 2326 return snprintf(buf, PAGE_SIZE, "%llu\n",
2258 (unsigned long long)(sbi->s_kbytes_written + 2327 (unsigned long long)(sbi->s_kbytes_written +
2259 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2328 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2426,10 +2495,58 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2426 return 1; 2495 return 1;
2427} 2496}
2428 2497
2498/*
2499 * This function is called once a day if we have errors logged
2500 * on the file system
2501 */
2502static void print_daily_error_info(unsigned long arg)
2503{
2504 struct super_block *sb = (struct super_block *) arg;
2505 struct ext4_sb_info *sbi;
2506 struct ext4_super_block *es;
2507
2508 sbi = EXT4_SB(sb);
2509 es = sbi->s_es;
2510
2511 if (es->s_error_count)
2512 ext4_msg(sb, KERN_NOTICE, "error count: %u",
2513 le32_to_cpu(es->s_error_count));
2514 if (es->s_first_error_time) {
2515 printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
2516 sb->s_id, le32_to_cpu(es->s_first_error_time),
2517 (int) sizeof(es->s_first_error_func),
2518 es->s_first_error_func,
2519 le32_to_cpu(es->s_first_error_line));
2520 if (es->s_first_error_ino)
2521 printk(": inode %u",
2522 le32_to_cpu(es->s_first_error_ino));
2523 if (es->s_first_error_block)
2524 printk(": block %llu", (unsigned long long)
2525 le64_to_cpu(es->s_first_error_block));
2526 printk("\n");
2527 }
2528 if (es->s_last_error_time) {
2529 printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
2530 sb->s_id, le32_to_cpu(es->s_last_error_time),
2531 (int) sizeof(es->s_last_error_func),
2532 es->s_last_error_func,
2533 le32_to_cpu(es->s_last_error_line));
2534 if (es->s_last_error_ino)
2535 printk(": inode %u",
2536 le32_to_cpu(es->s_last_error_ino));
2537 if (es->s_last_error_block)
2538 printk(": block %llu", (unsigned long long)
2539 le64_to_cpu(es->s_last_error_block));
2540 printk("\n");
2541 }
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543}
2544
2429static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2545static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2430 __releases(kernel_lock) 2546 __releases(kernel_lock)
2431 __acquires(kernel_lock) 2547 __acquires(kernel_lock)
2432{ 2548{
2549 char *orig_data = kstrdup(data, GFP_KERNEL);
2433 struct buffer_head *bh; 2550 struct buffer_head *bh;
2434 struct ext4_super_block *es = NULL; 2551 struct ext4_super_block *es = NULL;
2435 struct ext4_sb_info *sbi; 2552 struct ext4_sb_info *sbi;
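print_daily_error_info() above re-arms itself with mod_timer(), and ext4_fill_super() later primes the first shot five minutes after mount. The self-rearming idiom, using the old timer API of this era (the callback receives the cookie stashed in timer->data):

    #include <linux/fs.h>
    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static struct timer_list err_report;

    static void report_fn(unsigned long data)
    {
        struct super_block *sb = (struct super_block *) data;

        /* ... log the accumulated error state for sb ... */
        mod_timer(&err_report, jiffies + 24*60*60*HZ);  /* re-arm, daily */
    }

    static void arm_error_report(struct super_block *sb)
    {
        init_timer(&err_report);
        err_report.function = report_fn;
        err_report.data = (unsigned long) sb;
        mod_timer(&err_report, jiffies + 300*HZ);  /* first run in 5 min */
    }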
@@ -2442,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2442 struct inode *root; 2559 struct inode *root;
2443 char *cp; 2560 char *cp;
2444 const char *descr; 2561 const char *descr;
2445 int ret = -EINVAL; 2562 int ret = -ENOMEM;
2446 int blocksize; 2563 int blocksize;
2447 unsigned int db_count; 2564 unsigned int db_count;
2448 unsigned int i; 2565 unsigned int i;
@@ -2453,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2453 2570
2454 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2455 if (!sbi) 2572 if (!sbi)
2456 return -ENOMEM; 2573 goto out_free_orig;
2457 2574
2458 sbi->s_blockgroup_lock = 2575 sbi->s_blockgroup_lock =
2459 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 2576 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2460 if (!sbi->s_blockgroup_lock) { 2577 if (!sbi->s_blockgroup_lock) {
2461 kfree(sbi); 2578 kfree(sbi);
2462 return -ENOMEM; 2579 goto out_free_orig;
2463 } 2580 }
2464 sb->s_fs_info = sbi; 2581 sb->s_fs_info = sbi;
2465 sbi->s_mount_opt = 0; 2582 sbi->s_mount_opt = 0;
@@ -2467,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2467 sbi->s_resgid = EXT4_DEF_RESGID; 2584 sbi->s_resgid = EXT4_DEF_RESGID;
2468 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2585 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2469 sbi->s_sb_block = sb_block; 2586 sbi->s_sb_block = sb_block;
2470 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, 2587 if (sb->s_bdev->bd_part)
2471 sectors[1]); 2588 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2472 2590
2473 unlock_kernel(); 2591 unlock_kernel();
2474 2592
@@ -2476,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2476 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2477 *cp = '!'; 2595 *cp = '!';
2478 2596
2597 ret = -EINVAL;
2479 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 2598 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
2480 if (!blocksize) { 2599 if (!blocksize) {
2481 ext4_msg(sb, KERN_ERR, "unable to set blocksize"); 2600 ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2540,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2540 set_opt(sbi->s_mount_opt, ERRORS_CONT); 2659 set_opt(sbi->s_mount_opt, ERRORS_CONT);
2541 else 2660 else
2542 set_opt(sbi->s_mount_opt, ERRORS_RO); 2661 set_opt(sbi->s_mount_opt, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD);
2543 2666
2544 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2545 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2547,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2547 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2670 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2548 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2549 2672
2550 set_opt(sbi->s_mount_opt, BARRIER); 2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER);
2551 2675
2552 /* 2676 /*
2553 * enable delayed allocation by default 2677 * enable delayed allocation by default
2554 * Use -o nodelalloc to turn it off 2678 * Use -o nodelalloc to turn it off
2555 */ 2679 */
2556 if (!IS_EXT3_SB(sb)) 2680 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2557 set_opt(sbi->s_mount_opt, DELALLOC); 2682 set_opt(sbi->s_mount_opt, DELALLOC);
2558 2683
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) {
2686 ext4_msg(sb, KERN_WARNING,
2687 "failed to parse options in superblock: %s",
2688 sbi->s_es->s_mount_opts);
2689 }
2559 if (!parse_options((char *) data, sb, &journal_devnum, 2690 if (!parse_options((char *) data, sb, &journal_devnum,
2560 &journal_ioprio, NULL, 0)) 2691 &journal_ioprio, NULL, 0))
2561 goto failed_mount; 2692 goto failed_mount;
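The hunk above parses the option string stored in the superblock (s_mount_opts) before the user's mount(2) data, so the administrator's options are applied last and win on any conflict; a bad superblock string only warns, while a bad user string still fails the mount. A self-contained sketch of that precedence, with parse() standing in for ext4's parse_options():

    #include <stdio.h>

    static int parse(const char *opts, unsigned int *flags)
    {
        /* real code tokenizes opts and sets bits in *flags */
        return opts != NULL;  /* stand-in for "parsed OK" */
    }

    static int apply_options(const char *sb_opts, const char *user_opts,
                             unsigned int *flags)
    {
        if (!parse(sb_opts, flags))
            fprintf(stderr, "warning: bad options in superblock\n");
        if (!parse(user_opts, flags))
            return -1;  /* bad user options still fail the mount */
        return 0;
    }

    int main(void)
    {
        unsigned int flags = 0;

        return apply_options("nodelalloc", "delalloc", &flags) ? 1 : 0;
    }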
@@ -2793,24 +2924,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2793 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2924 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2794 spin_lock_init(&sbi->s_next_gen_lock); 2925 spin_lock_init(&sbi->s_next_gen_lock);
2795 2926
2796 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2797 ext4_count_free_blocks(sb));
2798 if (!err) {
2799 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2800 ext4_count_free_inodes(sb));
2801 }
2802 if (!err) {
2803 err = percpu_counter_init(&sbi->s_dirs_counter,
2804 ext4_count_dirs(sb));
2805 }
2806 if (!err) {
2807 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2808 }
2809 if (err) {
2810 ext4_msg(sb, KERN_ERR, "insufficient memory");
2811 goto failed_mount3;
2812 }
2813
2814 sbi->s_stripe = ext4_get_stripe_size(sbi); 2927 sbi->s_stripe = ext4_get_stripe_size(sbi);
2815 sbi->s_max_writeback_mb_bump = 128; 2928 sbi->s_max_writeback_mb_bump = 128;
2816 2929
@@ -2910,18 +3023,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 3023 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2911 3024
2912no_journal: 3025no_journal:
2913 if (test_opt(sb, NOBH)) { 3026 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 3027 ext4_count_free_blocks(sb));
2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 3028 if (!err)
2916 "its supported only with writeback mode"); 3029 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2917 clear_opt(sbi->s_mount_opt, NOBH); 3030 ext4_count_free_inodes(sb));
2918 } 3031 if (!err)
2919 if (test_opt(sb, DIOREAD_NOLOCK)) { 3032 err = percpu_counter_init(&sbi->s_dirs_counter,
2920 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " 3033 ext4_count_dirs(sb));
2921 "not supported with nobh mode"); 3034 if (!err)
2922 goto failed_mount_wq; 3035 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2923 } 3036 if (err) {
3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
3038 goto failed_mount_wq;
2924 } 3039 }
3040
2925 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2926 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3042 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2927 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
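The hunk above moves the percpu counter setup to after the no_journal: label, i.e. after journal replay, so the free-block, free-inode and directory counts are sampled from recovered metadata rather than the pre-recovery bitmaps; the matching destroy calls move from failed_mount3 up to failed_mount_wq. The chained-init idiom in miniature:

    #include <linux/percpu_counter.h>

    /* Each percpu_counter_init() runs only while err is still zero, so
     * one check covers the whole group.  In this era
     * percpu_counter_destroy() on a never-initialized (zeroed) counter
     * is a no-op, which is what lets the failure path destroy all of
     * them unconditionally, as failed_mount_wq now does. */
    static int init_counter_group(struct percpu_counter *a,
                                  struct percpu_counter *b, s64 seed)
    {
        int err = percpu_counter_init(a, seed);

        if (!err)
            err = percpu_counter_init(b, 0);
        if (err) {
            percpu_counter_destroy(a);
            percpu_counter_destroy(b);
        }
        return err;
    }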
@@ -3001,14 +3117,14 @@ no_journal:
3001 err = ext4_setup_system_zone(sb); 3117 err = ext4_setup_system_zone(sb);
3002 if (err) { 3118 if (err) {
3003 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3119 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3004 "zone (%d)\n", err); 3120 "zone (%d)", err);
3005 goto failed_mount4; 3121 goto failed_mount4;
3006 } 3122 }
3007 3123
3008 ext4_ext_init(sb); 3124 ext4_ext_init(sb);
3009 err = ext4_mb_init(sb, needs_recovery); 3125 err = ext4_mb_init(sb, needs_recovery);
3010 if (err) { 3126 if (err) {
3011 ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", 3127 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3012 err); 3128 err);
3013 goto failed_mount4; 3129 goto failed_mount4;
3014 } 3130 }
@@ -3040,9 +3156,18 @@ no_journal:
3040 } else 3156 } else
3041 descr = "out journal"; 3157 descr = "out journal";
3042 3158
3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3159 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3160 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3161 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3162
3163 init_timer(&sbi->s_err_report);
3164 sbi->s_err_report.function = print_daily_error_info;
3165 sbi->s_err_report.data = (unsigned long) sb;
3166 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3044 3168
3045 lock_kernel(); 3169 lock_kernel();
3170 kfree(orig_data);
3046 return 0; 3171 return 0;
3047 3172
3048cantfind_ext4: 3173cantfind_ext4:
@@ -3059,6 +3184,10 @@ failed_mount_wq:
3059 jbd2_journal_destroy(sbi->s_journal); 3184 jbd2_journal_destroy(sbi->s_journal);
3060 sbi->s_journal = NULL; 3185 sbi->s_journal = NULL;
3061 } 3186 }
3187 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3188 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3189 percpu_counter_destroy(&sbi->s_dirs_counter);
3190 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3062failed_mount3: 3191failed_mount3:
3063 if (sbi->s_flex_groups) { 3192 if (sbi->s_flex_groups) {
3064 if (is_vmalloc_addr(sbi->s_flex_groups)) 3193 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3195,6 @@ failed_mount3:
3066 else 3195 else
3067 kfree(sbi->s_flex_groups); 3196 kfree(sbi->s_flex_groups);
3068 } 3197 }
3069 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3070 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3071 percpu_counter_destroy(&sbi->s_dirs_counter);
3072 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3073failed_mount2: 3198failed_mount2:
3074 for (i = 0; i < db_count; i++) 3199 for (i = 0; i < db_count; i++)
3075 brelse(sbi->s_group_desc[i]); 3200 brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3214,8 @@ out_fail:
3089 kfree(sbi->s_blockgroup_lock); 3214 kfree(sbi->s_blockgroup_lock);
3090 kfree(sbi); 3215 kfree(sbi);
3091 lock_kernel(); 3216 lock_kernel();
3217out_free_orig:
3218 kfree(orig_data);
3092 return ret; 3219 return ret;
3093} 3220}
3094 3221
@@ -3105,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3105 journal->j_min_batch_time = sbi->s_min_batch_time; 3232 journal->j_min_batch_time = sbi->s_min_batch_time;
3106 journal->j_max_batch_time = sbi->s_max_batch_time; 3233 journal->j_max_batch_time = sbi->s_max_batch_time;
3107 3234
3108 spin_lock(&journal->j_state_lock); 3235 write_lock(&journal->j_state_lock);
3109 if (test_opt(sb, BARRIER)) 3236 if (test_opt(sb, BARRIER))
3110 journal->j_flags |= JBD2_BARRIER; 3237 journal->j_flags |= JBD2_BARRIER;
3111 else 3238 else
@@ -3114,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3114 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 3241 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
3115 else 3242 else
3116 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 3243 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
3117 spin_unlock(&journal->j_state_lock); 3244 write_unlock(&journal->j_state_lock);
3118} 3245}
3119 3246
3120static journal_t *ext4_get_journal(struct super_block *sb, 3247static journal_t *ext4_get_journal(struct super_block *sb,
@@ -3322,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb,
3322 3449
3323 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3450 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
3324 err = jbd2_journal_wipe(journal, !really_read_only); 3451 err = jbd2_journal_wipe(journal, !really_read_only);
3325 if (!err) 3452 if (!err) {
3453 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
3454 if (save)
3455 memcpy(save, ((char *) es) +
3456 EXT4_S_ERR_START, EXT4_S_ERR_LEN);
3326 err = jbd2_journal_load(journal); 3457 err = jbd2_journal_load(journal);
3458 if (save)
3459 memcpy(((char *) es) + EXT4_S_ERR_START,
3460 save, EXT4_S_ERR_LEN);
3461 kfree(save);
3462 }
3327 3463
3328 if (err) { 3464 if (err) {
3329 ext4_msg(sb, KERN_ERR, "error loading journal"); 3465 ext4_msg(sb, KERN_ERR, "error loading journal");
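Journal replay in jbd2_journal_load() can write an older copy of the superblock over the current one, which would roll back error information recorded since that copy was journaled. The hunk above therefore snapshots the error region around the load; if kmalloc() fails the region is simply not preserved, which is safe. The save/restore idiom in isolation, with do_load() as a hypothetical stand-in for jbd2_journal_load():

    #include <linux/slab.h>
    #include <linux/string.h>

    static int load_preserving_errors(char *es_bytes, int (*do_load)(void))
    {
        char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
        int err;

        if (save)
            memcpy(save, es_bytes + EXT4_S_ERR_START, EXT4_S_ERR_LEN);
        err = do_load();  /* may replay an older superblock copy */
        if (save)
            memcpy(es_bytes + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN);
        kfree(save);      /* kfree(NULL) is a no-op */
        return err;
    }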
@@ -3379,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3379 */ 3515 */
3380 if (!(sb->s_flags & MS_RDONLY)) 3516 if (!(sb->s_flags & MS_RDONLY))
3381 es->s_wtime = cpu_to_le32(get_seconds()); 3517 es->s_wtime = cpu_to_le32(get_seconds());
3382 es->s_kbytes_written = 3518 if (sb->s_bdev->bd_part)
3383 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3519 es->s_kbytes_written =
3520 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3384 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3521 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3385 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3522 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3523 else
3524 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3386 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3387 &EXT4_SB(sb)->s_freeblocks_counter)); 3527 &EXT4_SB(sb)->s_freeblocks_counter));
3388 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
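The lifetime-write figure above is the KiB count carried in memory plus half the sector delta since mount, since a 512-byte sector is half a KiB; the bd_part check covers block devices that carry no partition statistics, where only the carried count can be written back. A worked example of the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t kbytes_written = 12345;  /* lifetime KiB carried in sbi */
        uint64_t start = 1000000;         /* sectors[1] sampled at mount */
        uint64_t now   = 1004096;         /* sectors[1] sampled at commit */

        /* 4096 sectors written = 2048 KiB, so the total is 14393 */
        printf("%llu\n", (unsigned long long)
               (kbytes_written + ((now - start) >> 1)));
        return 0;
    }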
@@ -3485,8 +3625,10 @@ int ext4_force_commit(struct super_block *sb)
3485 return 0; 3625 return 0;
3486 3626
3487 journal = EXT4_SB(sb)->s_journal; 3627 journal = EXT4_SB(sb)->s_journal;
3488 if (journal) 3628 if (journal) {
3629 vfs_check_frozen(sb, SB_FREEZE_TRANS);
3489 ret = ext4_journal_force_commit(journal); 3630 ret = ext4_journal_force_commit(journal);
3631 }
3490 3632
3491 return ret; 3633 return ret;
3492} 3634}
@@ -3535,18 +3677,16 @@ static int ext4_freeze(struct super_block *sb)
3535 * the journal. 3677 * the journal.
3536 */ 3678 */
3537 error = jbd2_journal_flush(journal); 3679 error = jbd2_journal_flush(journal);
3538 if (error < 0) { 3680 if (error < 0)
3539 out: 3681 goto out;
3540 jbd2_journal_unlock_updates(journal);
3541 return error;
3542 }
3543 3682
3544 /* Journal blocked and flushed, clear needs_recovery flag. */ 3683 /* Journal blocked and flushed, clear needs_recovery flag. */
3545 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3684 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3546 error = ext4_commit_super(sb, 1); 3685 error = ext4_commit_super(sb, 1);
3547 if (error) 3686out:
3548 goto out; 3687 /* we rely on s_frozen to stop further updates */
3549 return 0; 3688 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3689 return error;
3550} 3690}
3551 3691
3552/* 3692/*
@@ -3563,7 +3703,6 @@ static int ext4_unfreeze(struct super_block *sb)
3563 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3703 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3564 ext4_commit_super(sb, 1); 3704 ext4_commit_super(sb, 1);
3565 unlock_super(sb); 3705 unlock_super(sb);
3566 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3567 return 0; 3706 return 0;
3568} 3707}
3569 3708
@@ -3574,12 +3713,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3574 ext4_fsblk_t n_blocks_count = 0; 3713 ext4_fsblk_t n_blocks_count = 0;
3575 unsigned long old_sb_flags; 3714 unsigned long old_sb_flags;
3576 struct ext4_mount_options old_opts; 3715 struct ext4_mount_options old_opts;
3716 int enable_quota = 0;
3577 ext4_group_t g; 3717 ext4_group_t g;
3578 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3718 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3579 int err; 3719 int err;
3580#ifdef CONFIG_QUOTA 3720#ifdef CONFIG_QUOTA
3581 int i; 3721 int i;
3582#endif 3722#endif
3723 char *orig_data = kstrdup(data, GFP_KERNEL);
3583 3724
3584 lock_kernel(); 3725 lock_kernel();
3585 3726
@@ -3610,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3610 } 3751 }
3611 3752
3612 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 3753 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
3613 ext4_abort(sb, __func__, "Abort forced by user"); 3754 ext4_abort(sb, "Abort forced by user");
3614 3755
3615 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3756 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3616 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3757 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3630,6 +3771,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3630 } 3771 }
3631 3772
3632 if (*flags & MS_RDONLY) { 3773 if (*flags & MS_RDONLY) {
3774 err = dquot_suspend(sb, -1);
3775 if (err < 0)
3776 goto restore_opts;
3777
3633 /* 3778 /*
3634 * First of all, the unconditional stuff we have to do 3779 * First of all, the unconditional stuff we have to do
3635 * to disable replay of the journal when we next remount 3780 * to disable replay of the journal when we next remount
@@ -3698,6 +3843,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698 goto restore_opts; 3843 goto restore_opts;
3699 if (!ext4_setup_super(sb, es, 0)) 3844 if (!ext4_setup_super(sb, es, 0))
3700 sb->s_flags &= ~MS_RDONLY; 3845 sb->s_flags &= ~MS_RDONLY;
3846 enable_quota = 1;
3701 } 3847 }
3702 } 3848 }
3703 ext4_setup_system_zone(sb); 3849 ext4_setup_system_zone(sb);
@@ -3713,6 +3859,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3713#endif 3859#endif
3714 unlock_super(sb); 3860 unlock_super(sb);
3715 unlock_kernel(); 3861 unlock_kernel();
3862 if (enable_quota)
3863 dquot_resume(sb, -1);
3864
3865 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3866 kfree(orig_data);
3716 return 0; 3867 return 0;
3717 3868
3718restore_opts: 3869restore_opts:
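Across the three remount hunks above, quotas are suspended before the filesystem goes read-only (a failure aborts the remount) and resumed only after a successful read-write transition, once the locks are dropped. A distilled sketch of that shape, where the elided transitions stand in for the real remount work:

    #include <linux/fs.h>
    #include <linux/quotaops.h>

    static int remount_quota(struct super_block *sb, int *flags)
    {
        int enable_quota = 0;
        int err;

        if (*flags & MS_RDONLY) {
            err = dquot_suspend(sb, -1);  /* -1 = every quota type */
            if (err < 0)
                return err;               /* abort before going ro */
            /* ... transition to read-only ... */
        } else {
            /* ... transition to read-write ... */
            enable_quota = 1;
        }
        if (enable_quota)
            dquot_resume(sb, -1);         /* after the locks are dropped */
        return 0;
    }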
@@ -3734,6 +3885,7 @@ restore_opts:
3734#endif 3885#endif
3735 unlock_super(sb); 3886 unlock_super(sb);
3736 unlock_kernel(); 3887 unlock_kernel();
3888 kfree(orig_data);
3737 return err; 3889 return err;
3738} 3890}
3739 3891
@@ -3906,24 +4058,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3906 */ 4058 */
3907static int ext4_quota_on_mount(struct super_block *sb, int type) 4059static int ext4_quota_on_mount(struct super_block *sb, int type)
3908{ 4060{
3909 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 4061 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3910 EXT4_SB(sb)->s_jquota_fmt, type); 4062 EXT4_SB(sb)->s_jquota_fmt, type);
3911} 4063}
3912 4064
3913/* 4065/*
3914 * Standard function to be called on quota_on 4066 * Standard function to be called on quota_on
3915 */ 4067 */
3916static int ext4_quota_on(struct super_block *sb, int type, int format_id, 4068static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3917 char *name, int remount) 4069 char *name)
3918{ 4070{
3919 int err; 4071 int err;
3920 struct path path; 4072 struct path path;
3921 4073
3922 if (!test_opt(sb, QUOTA)) 4074 if (!test_opt(sb, QUOTA))
3923 return -EINVAL; 4075 return -EINVAL;
3924 /* When remounting, no checks are needed and in fact, name is NULL */
3925 if (remount)
3926 return vfs_quota_on(sb, type, format_id, name, remount);
3927 4076
3928 err = kern_path(name, LOOKUP_FOLLOW, &path); 4077 err = kern_path(name, LOOKUP_FOLLOW, &path);
3929 if (err) 4078 if (err)
@@ -3962,11 +4111,23 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3962 } 4111 }
3963 } 4112 }
3964 4113
3965 err = vfs_quota_on_path(sb, type, format_id, &path); 4114 err = dquot_quota_on_path(sb, type, format_id, &path);
3966 path_put(&path); 4115 path_put(&path);
3967 return err; 4116 return err;
3968} 4117}
3969 4118
4119static int ext4_quota_off(struct super_block *sb, int type)
4120{
4121 /* Force all delayed allocation blocks to be allocated */
4122 if (test_opt(sb, DELALLOC)) {
4123 down_read(&sb->s_umount);
4124 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127
4128 return dquot_quota_off(sb, type);
4129}
4130
3970/* Read data from quotafile - avoid pagecache and such because we cannot afford 4131/* Read data from quotafile - avoid pagecache and such because we cannot afford
3971 * acquiring the locks... As quota files are never truncated and quota code 4132 * acquiring the locks... As quota files are never truncated and quota code
 3972 * itself serializes the operations (and no one else should touch the files) 4133 * itself serializes the operations (and no one else should touch the files)
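The new ext4_quota_off() above forces delayed-allocation writeback before handing off to the generic code, so blocks that are still only reserved get allocated and charged while quota accounting is active; s_umount is taken shared around sync_filesystem(). The ordering in isolation, with delalloc_enabled() as a hypothetical stand-in for ext4's test_opt(sb, DELALLOC):

    #include <linux/fs.h>
    #include <linux/quotaops.h>

    static int delalloc_enabled(struct super_block *sb) { return 1; }

    static int example_quota_off(struct super_block *sb, int type)
    {
        if (delalloc_enabled(sb)) {
            down_read(&sb->s_umount);  /* sync_filesystem() expects it */
            sync_filesystem(sb);       /* allocate all delayed blocks */
            up_read(&sb->s_umount);
        }
        return dquot_quota_off(sb, type);
    }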
@@ -4016,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4016 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4177 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4017 int err = 0; 4178 int err = 0;
4018 int offset = off & (sb->s_blocksize - 1); 4179 int offset = off & (sb->s_blocksize - 1);
4019 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
4020 struct buffer_head *bh; 4180 struct buffer_head *bh;
4021 handle_t *handle = journal_current_handle(); 4181 handle_t *handle = journal_current_handle();
4022 4182
@@ -4041,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4041 bh = ext4_bread(handle, inode, blk, 1, &err); 4201 bh = ext4_bread(handle, inode, blk, 1, &err);
4042 if (!bh) 4202 if (!bh)
4043 goto out; 4203 goto out;
4044 if (journal_quota) { 4204 err = ext4_journal_get_write_access(handle, bh);
4045 err = ext4_journal_get_write_access(handle, bh); 4205 if (err) {
4046 if (err) { 4206 brelse(bh);
4047 brelse(bh); 4207 goto out;
4048 goto out;
4049 }
4050 } 4208 }
4051 lock_buffer(bh); 4209 lock_buffer(bh);
4052 memcpy(bh->b_data+offset, data, len); 4210 memcpy(bh->b_data+offset, data, len);
4053 flush_dcache_page(bh->b_page); 4211 flush_dcache_page(bh->b_page);
4054 unlock_buffer(bh); 4212 unlock_buffer(bh);
4055 if (journal_quota) 4213 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4056 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4057 else {
4058 /* Always do at least ordered writes for quotas */
4059 err = ext4_jbd2_file_inode(handle, inode);
4060 mark_buffer_dirty(bh);
4061 }
4062 brelse(bh); 4214 brelse(bh);
4063out: 4215out:
4064 if (err) { 4216 if (err) {
@@ -4141,6 +4293,7 @@ static int __init init_ext4_fs(void)
4141{ 4293{
4142 int err; 4294 int err;
4143 4295
4296 ext4_check_flag_values();
4144 err = init_ext4_system_zone(); 4297 err = init_ext4_system_zone();
4145 if (err) 4298 if (err)
4146 return err; 4299 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..3a8cd8dff1ad 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
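The constification above leaves the dispatch logic untouched: a const table indexed by the small on-disk name index, with a bounds check before the lookup (a hole in the table still yields NULL). The idiom, self-contained, with hypothetical index values:

    #include <stddef.h>

    struct handler { const char *prefix; };

    static const struct handler user_handler = { "user." };
    static const struct handler trusted_handler = { "trusted." };

    static const struct handler *handler_map[] = {
        [1] = &user_handler,     /* hypothetical index values */
        [2] = &trusted_handler,
    };

    static const struct handler *lookup(int idx)
    {
        if (idx > 0 &&
            idx < (int)(sizeof(handler_map) / sizeof(handler_map[0])))
            return handler_map[idx];  /* may still be NULL */
        return NULL;
    }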
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -332,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 331 size_t rest = buffer_size;
333 332
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 333 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 334 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 335 ext4_xattr_handler(entry->e_name_index);
337 336
338 if (handler) { 337 if (handler) {
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -460,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
460 458
461 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
462 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
463 sb->s_dirt = 1; 461 ext4_handle_dirty_super(handle, sb);
464 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
465 } 462 }
466} 463}
467 464
@@ -666,8 +663,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 663 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 664 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 665 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 666 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 667 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 668 error = -EIO;
672 goto cleanup; 669 goto cleanup;
673 } 670 }
@@ -820,7 +817,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 817 EXT4_I(inode)->i_block_group);
821 818
822 /* non-extent files can't have physical blocks past 2^32 */ 819 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 822
826 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +825,7 @@ inserted:
828 if (error) 825 if (error)
829 goto cleanup; 826 goto cleanup;
830 827
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 828 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 829 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 830
834 ea_idebug(inode, "creating block %d", block); 831 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +877,8 @@ cleanup_dquot:
880 goto cleanup; 877 goto cleanup;
881 878
882bad_block: 879bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 880 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 881 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 882 goto cleanup;
886 883
887#undef header 884#undef header
@@ -1194,8 +1191,8 @@ retry:
1194 if (!bh) 1191 if (!bh)
1195 goto cleanup; 1192 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1193 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1194 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1195 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1196 error = -EIO;
1200 goto cleanup; 1197 goto cleanup;
1201 } 1198 }
@@ -1372,14 +1369,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1369 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1371 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1372 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1373 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1374 goto cleanup;
1378 } 1375 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1376 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1377 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1378 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1379 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1380 goto cleanup;
1384 } 1381 }
1385 ext4_xattr_release_block(handle, inode, bh); 1382 ext4_xattr_release_block(handle, inode, bh);
@@ -1420,7 +1417,7 @@ ext4_xattr_cache_insert(struct buffer_head *bh)
1420 ea_bdebug(bh, "out of memory"); 1417 ea_bdebug(bh, "out of memory");
1421 return; 1418 return;
1422 } 1419 }
1423 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); 1420 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1424 if (error) { 1421 if (error) {
1425 mb_cache_entry_free(ce); 1422 mb_cache_entry_free(ce);
1426 if (error == -EBUSY) { 1423 if (error == -EBUSY) {
@@ -1492,8 +1489,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1492 return NULL; /* never share */ 1489 return NULL; /* never share */
1493 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1490 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1494again: 1491again:
1495 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0, 1492 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
1496 inode->i_sb->s_bdev, hash); 1493 hash);
1497 while (ce) { 1494 while (ce) {
1498 struct buffer_head *bh; 1495 struct buffer_head *bh;
1499 1496
@@ -1504,9 +1501,8 @@ again:
1504 } 1501 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1502 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1503 if (!bh) {
1507 ext4_error(inode->i_sb, 1504 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1505 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1506 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1507 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1508 ea_idebug(inode, "block %lu refcount %d>=%d",
@@ -1518,7 +1514,7 @@ again:
1518 return bh; 1514 return bh;
1519 } 1515 }
1520 brelse(bh); 1516 brelse(bh);
1521 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); 1517 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1522 } 1518 }
1523 return NULL; 1519 return NULL;
1524} 1520}
@@ -1594,9 +1590,7 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1594int __init 1590int __init
1595init_ext4_xattr(void) 1591init_ext4_xattr(void)
1596{ 1592{
1597 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL, 1593 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
1598 sizeof(struct mb_cache_entry) +
1599 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1600 if (!ext4_xattr_cache) 1594 if (!ext4_xattr_cache)
1601 return -ENOMEM; 1595 return -ENOMEM;
1602 return 0; 1596 return 0;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
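The directory ioctl above moves from the BKL-holding ->ioctl, which was handed the inode, to ->unlocked_ioctl, which gets only the file: the handler recovers the inode from filp->f_path.dentry->d_inode, the return type widens to long, and the compat handler now falls through to the native one instead of returning -ENOIOCTLCMD. A hedged sketch of the converted shape:

    #include <linux/fs.h>

    /* Any locking is now explicit, since the BKL is no longer taken. */
    static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                                       unsigned long arg)
    {
        struct inode *inode = filp->f_path.dentry->d_inode;

        /* ... per-command handling using inode, cmd and arg ... */
        return inode ? -ENOTTY : -EINVAL;
    }

    static const struct file_operations example_fops = {
        .unlocked_ioctl = example_unlocked_ioctl,
    };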
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..d75a77f85c28 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,15 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 310extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 311 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 312extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 313
312/* fat/inode.c */ 314/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 315extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +324,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 324extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 325 struct inode *i2);
324/* fat/misc.c */ 326/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 327extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 328__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
329 __attribute__ ((format (printf, 3, 4))) __cold;
330#define fat_fs_error(s, fmt, args...) \
331 __fat_fs_error(s, 1, fmt , ## args)
332#define fat_fs_error_ratelimit(s, fmt, args...) \
333 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 334extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 335extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 336extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
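Cluster-chain corruption can trigger an error message on every access, so the hunk above adds a ratelimited variant: __ratelimit() returns nonzero while the message budget lasts, and the macro passes that as the report flag to __fat_fs_error(). The per-superblock ratelimit_state added to msdos_sb_info is presumably initialized at mount time, outside this hunk. The idiom in isolation, using a static limiter:

    #include <linux/kernel.h>
    #include <linux/ratelimit.h>

    static DEFINE_RATELIMIT_STATE(example_rs, DEFAULT_RATELIMIT_INTERVAL,
                                  DEFAULT_RATELIMIT_BURST);

    static void noisy_event(void)
    {
        if (__ratelimit(&example_rs))  /* nonzero => within budget */
            printk(KERN_ERR "example: something went wrong\n");
    }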
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..7257752b6d5d 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -362,20 +375,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
362 unsigned int ia_valid; 375 unsigned int ia_valid;
363 int error; 376 int error;
364 377
365 /*
366 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it.
369 */
370 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) {
372 error = fat_cont_expand(inode, attr->ia_size);
373 if (error || attr->ia_valid == ATTR_SIZE)
374 goto out;
375 attr->ia_valid &= ~ATTR_SIZE;
376 }
377 }
378
379 /* Check for setting the inode time. */ 378 /* Check for setting the inode time. */
380 ia_valid = attr->ia_valid; 379 ia_valid = attr->ia_valid;
381 if (ia_valid & TIMES_SET_FLAGS) { 380 if (ia_valid & TIMES_SET_FLAGS) {
@@ -391,6 +390,21 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
391 goto out; 390 goto out;
392 } 391 }
393 392
393 /*
394 * Expand the file. Since inode_setattr() updates ->i_size
395 * before calling the ->truncate(), but FAT needs to fill the
396 * hole before it. XXX: this is no longer true with new truncate
397 * sequence.
398 */
399 if (attr->ia_valid & ATTR_SIZE) {
400 if (attr->ia_size > inode->i_size) {
401 error = fat_cont_expand(inode, attr->ia_size);
402 if (error || attr->ia_valid == ATTR_SIZE)
403 goto out;
404 attr->ia_valid &= ~ATTR_SIZE;
405 }
406 }
407
394 if (((attr->ia_valid & ATTR_UID) && 408 if (((attr->ia_valid & ATTR_UID) &&
395 (attr->ia_uid != sbi->options.fs_uid)) || 409 (attr->ia_uid != sbi->options.fs_uid)) ||
396 ((attr->ia_valid & ATTR_GID) && 410 ((attr->ia_valid & ATTR_GID) &&
@@ -414,15 +428,19 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 428 attr->ia_valid &= ~ATTR_MODE;
415 } 429 }
416 430
417 if (attr->ia_valid) 431 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 432 truncate_setsize(inode, attr->ia_size);
433 fat_truncate_blocks(inode, attr->ia_size);
434 }
435
436 setattr_copy(inode, attr);
437 mark_inode_dirty(inode);
419out: 438out:
420 return error; 439 return error;
421} 440}
422EXPORT_SYMBOL_GPL(fat_setattr); 441EXPORT_SYMBOL_GPL(fat_setattr);
423 442
424const struct inode_operations fat_file_inode_operations = { 443const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 444 .setattr = fat_setattr,
427 .getattr = fat_getattr, 445 .getattr = fat_getattr,
428}; 446};
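
For orientation, this is the shape a ->setattr() takes after the 2.6.36 truncate-sequence conversion. A sketch, not the patch itself: example_truncate_blocks() stands in for the filesystem's block-freeing helper (fat_truncate_blocks() here), and error handling is trimmed:

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);	/* validate the request */
		if (error)
			return error;

		if (attr->ia_valid & ATTR_SIZE) {
			/* update i_size and trim the pagecache in one step... */
			truncate_setsize(inode, attr->ia_size);
			/* ...then release on-disk blocks past the new size */
			example_truncate_blocks(inode, attr->ia_size);
		}

		setattr_copy(inode, attr);	/* uid/gid/mode/timestamps */
		mark_inode_dirty(inode);
		return 0;
	}

With this sequence in place, inode_setattr() and the ->truncate method are no longer needed, which is why .truncate is dropped from fat_file_inode_operations above.
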
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..830058057d33 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
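
The new fat_write_failed() gives FAT a single rollback point for buffered writes that fail or come up short past i_size. The same pattern, sketched for a hypothetical filesystem; note the 2.6.36 truncate_pagecache() signature is (inode, old, new):

	static void example_write_failed(struct address_space *mapping, loff_t to)
	{
		struct inode *inode = mapping->host;

		if (to > inode->i_size) {
			/* drop pages instantiated beyond i_size... */
			truncate_pagecache(inode, to, inode->i_size);
			/* ...and free any blocks allocated for them */
			example_truncate_blocks(inode, inode->i_size);
		}
	}

All three write entry points funnel into it: write_begin on error, write_end on a short copy (err < len), and the direct-IO write path below on a negative return.
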
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT needs to use DIO_LOCKING to avoid the race 212 * FAT needs to use DIO_LOCKING to avoid the race
194 * condition between fat_get_block() and ->truncate(). 213 * condition between fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -240,7 +263,7 @@ static const struct address_space_operations fat_aops = {
240 * check if the location is still valid and retry if it 263 * check if the location is still valid and retry if it
241 * isn't. Otherwise we do changes. 264 * isn't. Otherwise we do changes.
242 * 5. Spinlock is used to protect hash/unhash/location check/lookup 265 * 5. Spinlock is used to protect hash/unhash/location check/lookup
243 * 6. fat_clear_inode() unhashes the F-d-c entry. 266 * 6. fat_evict_inode() unhashes the F-d-c entry.
244 * 7. lookup() and readdir() do igrab() if they find a F-d-c entry 267 * 7. lookup() and readdir() do igrab() if they find a F-d-c entry
245 * and consider negative result as cache miss. 268 * and consider negative result as cache miss.
246 */ 269 */
@@ -425,16 +448,15 @@ out:
425 448
426EXPORT_SYMBOL_GPL(fat_build_inode); 449EXPORT_SYMBOL_GPL(fat_build_inode);
427 450
428static void fat_delete_inode(struct inode *inode) 451static void fat_evict_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 if (!inode->i_nlink) {
432 fat_truncate(inode); 455 inode->i_size = 0;
433 clear_inode(inode); 456 fat_truncate_blocks(inode, 0);
434} 457 }
435 458 invalidate_inode_buffers(inode);
436static void fat_clear_inode(struct inode *inode) 459 end_writeback(inode);
437{
438 fat_cache_inval_inode(inode); 460 fat_cache_inval_inode(inode);
439 fat_detach(inode); 461 fat_detach(inode);
440} 462}
@@ -651,12 +673,11 @@ static const struct super_operations fat_sops = {
651 .alloc_inode = fat_alloc_inode, 673 .alloc_inode = fat_alloc_inode,
652 .destroy_inode = fat_destroy_inode, 674 .destroy_inode = fat_destroy_inode,
653 .write_inode = fat_write_inode, 675 .write_inode = fat_write_inode,
654 .delete_inode = fat_delete_inode, 676 .evict_inode = fat_evict_inode,
655 .put_super = fat_put_super, 677 .put_super = fat_put_super,
656 .write_super = fat_write_super, 678 .write_super = fat_write_super,
657 .sync_fs = fat_sync_fs, 679 .sync_fs = fat_sync_fs,
658 .statfs = fat_statfs, 680 .statfs = fat_statfs,
659 .clear_inode = fat_clear_inode,
660 .remount_fs = fat_remount, 681 .remount_fs = fat_remount,
661 682
662 .show_options = fat_show_options, 683 .show_options = fat_show_options,
@@ -1250,6 +1271,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1271 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1272 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1273 sbi->dir_ops = fs_dir_inode_ops;
1274 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1275 DEFAULT_RATELIMIT_BURST);
1253 1276
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1277 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1278 if (error)
@@ -1497,10 +1520,8 @@ out_fail:
1497 iput(fat_inode); 1520 iput(fat_inode);
1498 if (root_inode) 1521 if (root_inode)
1499 iput(root_inode); 1522 iput(root_inode);
1500 if (sbi->nls_io) 1523 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1524 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1525 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1526 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1527 sb->s_fs_info = NULL;
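
The out_fail cleanup can drop its NULL checks because unload_nls() tolerates a NULL table; as of this kernel generation it is essentially:

	void unload_nls(struct nls_table *nls)
	{
		if (nls)
			module_put(nls->owner);
	}

Making teardown helpers NULL-safe like this lets error paths call them unconditionally instead of mirroring every allocation with an if.
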
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1736f2356388 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
@@ -248,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
248{ 250{
249 int i, err = 0; 251 int i, err = 0;
250 252
251 ll_rw_block(SWRITE, nr_bhs, bhs); 253 for (i = 0; i < nr_bhs; i++)
254 write_dirty_buffer(bhs[i], WRITE);
255
252 for (i = 0; i < nr_bhs; i++) { 256 for (i = 0; i < nr_bhs; i++) {
253 wait_on_buffer(bhs[i]); 257 wait_on_buffer(bhs[i]);
254 if (buffer_eopnotsupp(bhs[i])) { 258 if (buffer_eopnotsupp(bhs[i])) {
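
SWRITE is on its way out in this release, so syncing a batch of buffer heads becomes an explicit submit-then-wait loop. A sketch of the resulting idiom, assuming the 2.6.36 write_dirty_buffer(), which locks the bh, clears its dirty bit and submits it:

	static int example_sync_bhs(struct buffer_head **bhs, int nr_bhs)
	{
		int i, err = 0;

		for (i = 0; i < nr_bhs; i++)
			write_dirty_buffer(bhs[i], WRITE);	/* submit all */

		for (i = 0; i < nr_bhs; i++) {			/* then wait */
			wait_on_buffer(bhs[i]);
			if (!err && !buffer_uptodate(bhs[i]))
				err = -EIO;
		}
		return err;
	}
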
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f8cc34f542c3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -273,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
273 274
274 ret = copy_from_user(&owner, owner_p, sizeof(owner)); 275 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret) 276 if (ret)
276 return ret; 277 return -EFAULT;
277 278
278 switch (owner.type) { 279 switch (owner.type) {
279 case F_OWNER_TID: 280 case F_OWNER_TID:
@@ -331,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
331 } 332 }
332 read_unlock(&filp->f_owner.lock); 333 read_unlock(&filp->f_owner.lock);
333 334
334 if (!ret) 335 if (!ret) {
335 ret = copy_to_user(owner_p, &owner, sizeof(owner)); 336 ret = copy_to_user(owner_p, &owner, sizeof(owner));
337 if (ret)
338 ret = -EFAULT;
339 }
336 return ret; 340 return ret;
337} 341}
338 342
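
Both fixes above rest on the same contract: copy_from_user()/copy_to_user() return the number of bytes left uncopied, never an errno, so the caller must translate a nonzero result itself. A minimal sketch:

	static int example_get_owner(struct f_owner_ex __user *uarg)
	{
		struct f_owner_ex owner = { .type = F_OWNER_PID };

		if (copy_to_user(uarg, &owner, sizeof(owner)))
			return -EFAULT;	/* nonzero = bytes not copied, not -Exxx */
		return 0;
	}
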
@@ -412,6 +416,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 416 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 417 err = fcntl_dirnotify(fd, filp, arg);
414 break; 418 break;
419 case F_SETPIPE_SZ:
420 case F_GETPIPE_SZ:
421 err = pipe_fcntl(filp, cmd, arg);
422 break;
415 default: 423 default:
416 break; 424 break;
417 } 425 }
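
F_SETPIPE_SZ and F_GETPIPE_SZ are new fcntl commands in this release. From userspace they look like any other fcntl call; a sketch, assuming a glibc new enough to expose the constants under _GNU_SOURCE (the kernel rounds the request up to a power-of-two number of pages, and unprivileged callers are capped by /proc/sys/fs/pipe-max-size):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];

		if (pipe(fds) < 0)
			return 1;
		if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
			perror("F_SETPIPE_SZ");
		printf("pipe capacity: %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
		return 0;
	}
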
@@ -614,9 +622,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 622 return ret;
615} 623}
616 624
617static DEFINE_RWLOCK(fasync_lock); 625static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 626static struct kmem_cache *fasync_cache __read_mostly;
619 627
628static void fasync_free_rcu(struct rcu_head *head)
629{
630 kmem_cache_free(fasync_cache,
631 container_of(head, struct fasync_struct, fa_rcu));
632}
633
620/* 634/*
621 * Remove a fasync entry. If successfully removed, return 635 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 636 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +639,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
625 * NOTE! It is very important that the FASYNC flag always 639 * NOTE! It is very important that the FASYNC flag always
626 * match the state "is the filp on a fasync list". 640 * match the state "is the filp on a fasync list".
627 * 641 *
628 * We always take the 'filp->f_lock', in since fasync_lock
629 * needs to be irq-safe.
630 */ 642 */
631static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) 643static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
632{ 644{
@@ -634,17 +646,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
634 int result = 0; 646 int result = 0;
635 647
636 spin_lock(&filp->f_lock); 648 spin_lock(&filp->f_lock);
637 write_lock_irq(&fasync_lock); 649 spin_lock(&fasync_lock);
638 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 650 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
639 if (fa->fa_file != filp) 651 if (fa->fa_file != filp)
640 continue; 652 continue;
653
654 spin_lock_irq(&fa->fa_lock);
655 fa->fa_file = NULL;
656 spin_unlock_irq(&fa->fa_lock);
657
641 *fp = fa->fa_next; 658 *fp = fa->fa_next;
642 kmem_cache_free(fasync_cache, fa); 659 call_rcu(&fa->fa_rcu, fasync_free_rcu);
643 filp->f_flags &= ~FASYNC; 660 filp->f_flags &= ~FASYNC;
644 result = 1; 661 result = 1;
645 break; 662 break;
646 } 663 }
647 write_unlock_irq(&fasync_lock); 664 spin_unlock(&fasync_lock);
648 spin_unlock(&filp->f_lock); 665 spin_unlock(&filp->f_lock);
649 return result; 666 return result;
650} 667}
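
The fasync list is being converted from an rwlock to RCU here. The deferred-free half of that idiom, sketched with a hypothetical entry type: unlink under the writer-side lock, then let call_rcu() free the node only after every pre-existing reader has finished.

	struct example_entry {
		struct example_entry	*next;
		struct rcu_head		rcu;	/* embedded, like fa_rcu above */
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct example_entry, rcu));
	}

	/* caller holds the writer-side lock protecting the list */
	static void example_del(struct example_entry **pprev,
				struct example_entry *e)
	{
		*pprev = e->next;		/* readers may still see e */
		call_rcu(&e->rcu, example_free_rcu); /* freed after grace period */
	}
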
@@ -666,25 +683,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
666 return -ENOMEM; 683 return -ENOMEM;
667 684
668 spin_lock(&filp->f_lock); 685 spin_lock(&filp->f_lock);
669 write_lock_irq(&fasync_lock); 686 spin_lock(&fasync_lock);
670 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { 687 for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
671 if (fa->fa_file != filp) 688 if (fa->fa_file != filp)
672 continue; 689 continue;
690
691 spin_lock_irq(&fa->fa_lock);
673 fa->fa_fd = fd; 692 fa->fa_fd = fd;
693 spin_unlock_irq(&fa->fa_lock);
694
674 kmem_cache_free(fasync_cache, new); 695 kmem_cache_free(fasync_cache, new);
675 goto out; 696 goto out;
676 } 697 }
677 698
699 spin_lock_init(&new->fa_lock);
678 new->magic = FASYNC_MAGIC; 700 new->magic = FASYNC_MAGIC;
679 new->fa_file = filp; 701 new->fa_file = filp;
680 new->fa_fd = fd; 702 new->fa_fd = fd;
681 new->fa_next = *fapp; 703 new->fa_next = *fapp;
682 *fapp = new; 704 rcu_assign_pointer(*fapp, new);
683 result = 1; 705 result = 1;
684 filp->f_flags |= FASYNC; 706 filp->f_flags |= FASYNC;
685 707
686out: 708out:
687 write_unlock_irq(&fasync_lock); 709 spin_unlock(&fasync_lock);
688 spin_unlock(&filp->f_lock); 710 spin_unlock(&filp->f_lock);
689 return result; 711 return result;
690} 712}
@@ -704,46 +726,67 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
704 726
705EXPORT_SYMBOL(fasync_helper); 727EXPORT_SYMBOL(fasync_helper);
706 728
707void __kill_fasync(struct fasync_struct *fa, int sig, int band) 729/*
730 * rcu_read_lock() is held
731 */
732static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
708{ 733{
709 while (fa) { 734 while (fa) {
710 struct fown_struct * fown; 735 struct fown_struct *fown;
736 unsigned long flags;
737
711 if (fa->magic != FASYNC_MAGIC) { 738 if (fa->magic != FASYNC_MAGIC) {
712 printk(KERN_ERR "kill_fasync: bad magic number in " 739 printk(KERN_ERR "kill_fasync: bad magic number in "
713 "fasync_struct!\n"); 740 "fasync_struct!\n");
714 return; 741 return;
715 } 742 }
716 fown = &fa->fa_file->f_owner; 743 spin_lock_irqsave(&fa->fa_lock, flags);
717 /* Don't send SIGURG to processes which have not set a 744 if (fa->fa_file) {
718 queued signum: SIGURG has its own default signalling 745 fown = &fa->fa_file->f_owner;
719 mechanism. */ 746 /* Don't send SIGURG to processes which have not set a
720 if (!(sig == SIGURG && fown->signum == 0)) 747 queued signum: SIGURG has its own default signalling
721 send_sigio(fown, fa->fa_fd, band); 748 mechanism. */
722 fa = fa->fa_next; 749 if (!(sig == SIGURG && fown->signum == 0))
750 send_sigio(fown, fa->fa_fd, band);
751 }
752 spin_unlock_irqrestore(&fa->fa_lock, flags);
753 fa = rcu_dereference(fa->fa_next);
723 } 754 }
724} 755}
725 756
726EXPORT_SYMBOL(__kill_fasync);
727
728void kill_fasync(struct fasync_struct **fp, int sig, int band) 757void kill_fasync(struct fasync_struct **fp, int sig, int band)
729{ 758{
730 /* First a quick test without locking: usually 759 /* First a quick test without locking: usually
731 * the list is empty. 760 * the list is empty.
732 */ 761 */
733 if (*fp) { 762 if (*fp) {
734 read_lock(&fasync_lock); 763 rcu_read_lock();
735 /* reread *fp after obtaining the lock */ 764 kill_fasync_rcu(rcu_dereference(*fp), sig, band);
736 __kill_fasync(*fp, sig, band); 765 rcu_read_unlock();
737 read_unlock(&fasync_lock);
738 } 766 }
739} 767}
740EXPORT_SYMBOL(kill_fasync); 768EXPORT_SYMBOL(kill_fasync);
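
The reader side mirrors kill_fasync() above: a sketch of walking an RCU-protected singly linked list, reusing the hypothetical example_entry from earlier:

	static void example_walk(struct example_entry **headp)
	{
		struct example_entry *e;

		rcu_read_lock();		/* pins a grace period */
		for (e = rcu_dereference(*headp); e;
		     e = rcu_dereference(e->next)) {
			/* fields that writers mutate in place still need
			 * their own spinlock, as fa_lock shows above */
		}
		rcu_read_unlock();
	}
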
741 769
742static int __init fasync_init(void) 770static int __init fcntl_init(void)
743{ 771{
772 /*
773 * Please add new bits here to ensure allocation uniqueness.
774 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
775 * is defined as O_NONBLOCK on some platforms and not on others.
776 */
777 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
778 O_RDONLY | O_WRONLY | O_RDWR |
779 O_CREAT | O_EXCL | O_NOCTTY |
780 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
781 __O_SYNC | O_DSYNC | FASYNC |
782 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
783 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
784 FMODE_EXEC
785 ));
786
744 fasync_cache = kmem_cache_create("fasync_cache", 787 fasync_cache = kmem_cache_create("fasync_cache",
745 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); 788 sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
746 return 0; 789 return 0;
747} 790}
748 791
749module_init(fasync_init) 792module_init(fcntl_init)
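
The renamed fcntl_init() also gains a compile-time check: OR all the O_* flags together and compare the popcount with the flag count, so two flags sharing a bit break the build instead of silently aliasing. The same trick on hypothetical flags:

	#define EX_FLAG_A	0x01
	#define EX_FLAG_B	0x02
	#define EX_FLAG_C	0x04

	static int __init example_init(void)
	{
		/* 3 flags must occupy 3 distinct bits; redefining EX_FLAG_C
		 * as 0x02 would make HWEIGHT32() return 2 and fail the build */
		BUILD_BUG_ON(3 != HWEIGHT32(EX_FLAG_A | EX_FLAG_B | EX_FLAG_C));
		return 0;
	}
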
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d994..0be344755c02 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
39 */ 39 */
40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); 40static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
41 41
42static inline void * alloc_fdmem(unsigned int size) 42static inline void *alloc_fdmem(unsigned int size)
43{ 43{
44 if (size <= PAGE_SIZE) 44 void *data;
45 return kmalloc(size, GFP_KERNEL); 45
46 else 46 data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
47 return vmalloc(size); 47 if (data != NULL)
48 return data;
49
50 return vmalloc(size);
48} 51}
49 52
50static inline void free_fdarr(struct fdtable *fdt) 53static void free_fdmem(void *ptr)
51{ 54{
52 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) 55 is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
53 kfree(fdt->fd);
54 else
55 vfree(fdt->fd);
56} 56}
57 57
58static inline void free_fdset(struct fdtable *fdt) 58static void __free_fdtable(struct fdtable *fdt)
59{ 59{
60 if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) 60 free_fdmem(fdt->fd);
61 kfree(fdt->open_fds); 61 free_fdmem(fdt->open_fds);
62 else 62 kfree(fdt);
63 vfree(fdt->open_fds);
64} 63}
65 64
66static void free_fdtable_work(struct work_struct *work) 65static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
75 spin_unlock_bh(&f->lock); 74 spin_unlock_bh(&f->lock);
76 while(fdt) { 75 while(fdt) {
77 struct fdtable *next = fdt->next; 76 struct fdtable *next = fdt->next;
78 vfree(fdt->fd); 77
79 free_fdset(fdt); 78 __free_fdtable(fdt);
80 kfree(fdt);
81 fdt = next; 79 fdt = next;
82 } 80 }
83} 81}
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
98 container_of(fdt, struct files_struct, fdtab)); 96 container_of(fdt, struct files_struct, fdtab));
99 return; 97 return;
100 } 98 }
101 if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { 99 if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
102 kfree(fdt->fd); 100 kfree(fdt->fd);
103 kfree(fdt->open_fds); 101 kfree(fdt->open_fds);
104 kfree(fdt); 102 kfree(fdt);
@@ -178,13 +176,12 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
178 fdt->open_fds = (fd_set *)data; 176 fdt->open_fds = (fd_set *)data;
179 data += nr / BITS_PER_BYTE; 177 data += nr / BITS_PER_BYTE;
180 fdt->close_on_exec = (fd_set *)data; 178 fdt->close_on_exec = (fd_set *)data;
181 INIT_RCU_HEAD(&fdt->rcu);
182 fdt->next = NULL; 179 fdt->next = NULL;
183 180
184 return fdt; 181 return fdt;
185 182
186out_arr: 183out_arr:
187 free_fdarr(fdt); 184 free_fdmem(fdt->fd);
188out_fdt: 185out_fdt:
189 kfree(fdt); 186 kfree(fdt);
190out: 187out:
@@ -214,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
214 * caller and alloc_fdtable(). Cheaper to catch it here... 211 * caller and alloc_fdtable(). Cheaper to catch it here...
215 */ 212 */
216 if (unlikely(new_fdt->max_fds <= nr)) { 213 if (unlikely(new_fdt->max_fds <= nr)) {
217 free_fdarr(new_fdt); 214 __free_fdtable(new_fdt);
218 free_fdset(new_fdt);
219 kfree(new_fdt);
220 return -EMFILE; 215 return -EMFILE;
221 } 216 }
222 /* 217 /*
@@ -232,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
232 free_fdtable(cur_fdt); 227 free_fdtable(cur_fdt);
233 } else { 228 } else {
234 /* Somebody else expanded, so undo our attempt */ 229 /* Somebody else expanded, so undo our attempt */
235 free_fdarr(new_fdt); 230 __free_fdtable(new_fdt);
236 free_fdset(new_fdt);
237 kfree(new_fdt);
238 } 231 }
239 return 1; 232 return 1;
240} 233}
@@ -312,7 +305,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 305 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init; 306 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
314 new_fdt->fd = &newf->fd_array[0]; 307 new_fdt->fd = &newf->fd_array[0];
315 INIT_RCU_HEAD(&new_fdt->rcu);
316 new_fdt->next = NULL; 308 new_fdt->next = NULL;
317 309
318 spin_lock(&oldf->file_lock); 310 spin_lock(&oldf->file_lock);
@@ -325,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
325 while (unlikely(open_files > new_fdt->max_fds)) { 317 while (unlikely(open_files > new_fdt->max_fds)) {
326 spin_unlock(&oldf->file_lock); 318 spin_unlock(&oldf->file_lock);
327 319
328 if (new_fdt != &newf->fdtab) { 320 if (new_fdt != &newf->fdtab)
329 free_fdarr(new_fdt); 321 __free_fdtable(new_fdt);
330 free_fdset(new_fdt);
331 kfree(new_fdt);
332 }
333 322
334 new_fdt = alloc_fdtable(open_files - 1); 323 new_fdt = alloc_fdtable(open_files - 1);
335 if (!new_fdt) { 324 if (!new_fdt) {
@@ -339,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
339 328
340 /* beyond sysctl_nr_open; nothing to do */ 329 /* beyond sysctl_nr_open; nothing to do */
341 if (unlikely(new_fdt->max_fds < open_files)) { 330 if (unlikely(new_fdt->max_fds < open_files)) {
342 free_fdarr(new_fdt); 331 __free_fdtable(new_fdt);
343 free_fdset(new_fdt);
344 kfree(new_fdt);
345 *errorp = -EMFILE; 332 *errorp = -EMFILE;
346 goto out_release; 333 goto out_release;
347 } 334 }
@@ -430,7 +417,6 @@ struct files_struct init_files = {
430 .fd = &init_files.fd_array[0], 417 .fd = &init_files.fd_array[0],
431 .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 418 .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
432 .open_fds = (fd_set *)&init_files.open_fds_init, 419 .open_fds = (fd_set *)&init_files.open_fds_init,
433 .rcu = RCU_HEAD_INIT,
434 }, 420 },
435 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 421 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
436}; 422};
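
alloc_fdmem()/free_fdmem() above replace a size-threshold policy with an attempt-based one. The generic shape of that pattern, as a sketch; __GFP_NOWARN suppresses the allocation-failure backtrace since vmalloc() is an expected fallback, not an error:

	static void *example_alloc(size_t size)
	{
		void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

		return p ? p : vmalloc(size);	/* large or fragmented sizes */
	}

	static void example_free(void *p)
	{
		if (is_vmalloc_addr(p))		/* pick the matching free */
			vfree(p);
		else
			kfree(p);
	}

This is also why free_fdtable_rcu() can simply test is_vmalloc_addr() instead of re-deriving the original size threshold.
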
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..a04bdd81c11c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/fsnotify.h> 21#include <linux/fsnotify.h>
22#include <linux/sysctl.h> 22#include <linux/sysctl.h>
23#include <linux/lglock.h>
23#include <linux/percpu_counter.h> 24#include <linux/percpu_counter.h>
25#include <linux/percpu.h>
24#include <linux/ima.h> 26#include <linux/ima.h>
25 27
26#include <asm/atomic.h> 28#include <asm/atomic.h>
@@ -32,8 +34,8 @@ struct files_stat_struct files_stat = {
32 .max_files = NR_FILE 34 .max_files = NR_FILE
33}; 35};
34 36
35/* public. Not pretty! */ 37DECLARE_LGLOCK(files_lglock);
36__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 38DEFINE_LGLOCK(files_lglock);
37 39
38/* SLAB cache for file structures */ 40/* SLAB cache for file structures */
39static struct kmem_cache *filp_cachep __read_mostly; 41static struct kmem_cache *filp_cachep __read_mostly;
@@ -194,14 +196,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 196}
195EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
196 198
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 199/**
206 * drop_file_write_access - give up ability to write to a file 200 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 201 * @file: the file to which we will stop writing
@@ -227,10 +221,9 @@ void drop_file_write_access(struct file *file)
227} 221}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 222EXPORT_SYMBOL_GPL(drop_file_write_access);
229 223
230/* __fput is called from task context when aio completion releases the last 224/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 225 */
233void __fput(struct file *file) 226static void __fput(struct file *file)
234{ 227{
235 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
@@ -258,7 +251,7 @@ void __fput(struct file *file)
258 cdev_put(inode->i_cdev); 251 cdev_put(inode->i_cdev);
259 fops_put(file->f_op); 252 fops_put(file->f_op);
260 put_pid(file->f_owner.pid); 253 put_pid(file->f_owner.pid);
261 file_kill(file); 254 file_sb_list_del(file);
262 if (file->f_mode & FMODE_WRITE) 255 if (file->f_mode & FMODE_WRITE)
263 drop_file_write_access(file); 256 drop_file_write_access(file);
264 file->f_path.dentry = NULL; 257 file->f_path.dentry = NULL;
@@ -268,6 +261,14 @@ void __fput(struct file *file)
268 mntput(mnt); 261 mntput(mnt);
269} 262}
270 263
264void fput(struct file *file)
265{
266 if (atomic_long_dec_and_test(&file->f_count))
267 __fput(file);
268}
269
270EXPORT_SYMBOL(fput);
271
271struct file *fget(unsigned int fd) 272struct file *fget(unsigned int fd)
272{ 273{
273 struct file *file; 274 struct file *file;
@@ -290,11 +291,20 @@ struct file *fget(unsigned int fd)
290EXPORT_SYMBOL(fget); 291EXPORT_SYMBOL(fget);
291 292
292/* 293/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 294 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * You can use this only if it is guranteed that the current task already 295 *
295 * holds a refcnt to that file. That check has to be done at fget() only 296 * You can use this instead of fget if you satisfy all of the following
296 * and a flag is returned to be passed to the corresponding fput_light(). 297 * conditions:
297 * There must not be a cloning between an fget_light/fput_light pair. 298 * 1) You must call fput_light before exiting the syscall and returning control
299 * to userspace (i.e. you cannot remember the returned struct file * after
300 * returning to userspace).
301 * 2) You must not call filp_close on the returned struct file * in between
302 * calls to fget_light and fput_light.
303 * 3) You must not clone the current task in between the calls to fget_light
304 * and fput_light.
305 *
306 * The fput_needed flag returned by fget_light should be passed to the
307 * corresponding fput_light.
298 */ 308 */
299struct file *fget_light(unsigned int fd, int *fput_needed) 309struct file *fget_light(unsigned int fd, int *fput_needed)
300{ 310{
@@ -320,41 +330,107 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
320 return file; 330 return file;
321} 331}
322 332
323
324void put_filp(struct file *file) 333void put_filp(struct file *file)
325{ 334{
326 if (atomic_long_dec_and_test(&file->f_count)) { 335 if (atomic_long_dec_and_test(&file->f_count)) {
327 security_file_free(file); 336 security_file_free(file);
328 file_kill(file); 337 file_sb_list_del(file);
329 file_free(file); 338 file_free(file);
330 } 339 }
331} 340}
332 341
333void file_move(struct file *file, struct list_head *list) 342static inline int file_list_cpu(struct file *file)
334{ 343{
335 if (!list) 344#ifdef CONFIG_SMP
336 return; 345 return file->f_sb_list_cpu;
337 file_list_lock(); 346#else
338 list_move(&file->f_u.fu_list, list); 347 return smp_processor_id();
339 file_list_unlock(); 348#endif
340} 349}
341 350
342void file_kill(struct file *file) 351/* helper for file_sb_list_add to reduce ifdefs */
352static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
353{
354 struct list_head *list;
355#ifdef CONFIG_SMP
356 int cpu;
357 cpu = smp_processor_id();
358 file->f_sb_list_cpu = cpu;
359 list = per_cpu_ptr(sb->s_files, cpu);
360#else
361 list = &sb->s_files;
362#endif
363 list_add(&file->f_u.fu_list, list);
364}
365
366/**
367 * file_sb_list_add - add a file to the sb's file list
368 * @file: file to add
369 * @sb: sb to add it to
370 *
371 * Use this function to associate a file with the superblock of the inode it
372 * refers to.
373 */
374void file_sb_list_add(struct file *file, struct super_block *sb)
375{
376 lg_local_lock(files_lglock);
377 __file_sb_list_add(file, sb);
378 lg_local_unlock(files_lglock);
379}
380
381/**
382 * file_sb_list_del - remove a file from the sb's file list
383 * @file: file to remove
384 * @sb: sb to remove it from
385 *
386 * Use this function to remove a file from its superblock.
387 */
388void file_sb_list_del(struct file *file)
343{ 389{
344 if (!list_empty(&file->f_u.fu_list)) { 390 if (!list_empty(&file->f_u.fu_list)) {
345 file_list_lock(); 391 lg_local_lock_cpu(files_lglock, file_list_cpu(file));
346 list_del_init(&file->f_u.fu_list); 392 list_del_init(&file->f_u.fu_list);
347 file_list_unlock(); 393 lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
348 } 394 }
349} 395}
350 396
397#ifdef CONFIG_SMP
398
399/*
400 * These macros iterate all files on all CPUs for a given superblock.
401 * files_lglock must be held globally.
402 */
403#define do_file_list_for_each_entry(__sb, __file) \
404{ \
405 int i; \
406 for_each_possible_cpu(i) { \
407 struct list_head *list; \
408 list = per_cpu_ptr((__sb)->s_files, i); \
409 list_for_each_entry((__file), list, f_u.fu_list)
410
411#define while_file_list_for_each_entry \
412 } \
413}
414
415#else
416
417#define do_file_list_for_each_entry(__sb, __file) \
418{ \
419 struct list_head *list; \
420 list = &(__sb)->s_files; \
421 list_for_each_entry((__file), list, f_u.fu_list)
422
423#define while_file_list_for_each_entry \
424}
425
426#endif
427
351int fs_may_remount_ro(struct super_block *sb) 428int fs_may_remount_ro(struct super_block *sb)
352{ 429{
353 struct file *file; 430 struct file *file;
354
355 /* Check that no files are currently opened for writing. */ 431 /* Check that no files are currently opened for writing. */
356 file_list_lock(); 432 lg_global_lock(files_lglock);
357 list_for_each_entry(file, &sb->s_files, f_u.fu_list) { 433 do_file_list_for_each_entry(sb, file) {
358 struct inode *inode = file->f_path.dentry->d_inode; 434 struct inode *inode = file->f_path.dentry->d_inode;
359 435
360 /* File with pending delete? */ 436 /* File with pending delete? */
@@ -364,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
364 /* Writeable file? */ 440 /* Writeable file? */
365 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) 441 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
366 goto too_bad; 442 goto too_bad;
367 } 443 } while_file_list_for_each_entry;
368 file_list_unlock(); 444 lg_global_unlock(files_lglock);
369 return 1; /* Tis' cool bro. */ 445 return 1; /* Tis' cool bro. */
370too_bad: 446too_bad:
371 file_list_unlock(); 447 lg_global_unlock(files_lglock);
372 return 0; 448 return 0;
373} 449}
374 450
@@ -384,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
384 struct file *f; 460 struct file *f;
385 461
386retry: 462retry:
387 file_list_lock(); 463 lg_global_lock(files_lglock);
388 list_for_each_entry(f, &sb->s_files, f_u.fu_list) { 464 do_file_list_for_each_entry(sb, f) {
389 struct vfsmount *mnt; 465 struct vfsmount *mnt;
390 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) 466 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
391 continue; 467 continue;
@@ -400,16 +476,13 @@ retry:
400 continue; 476 continue;
401 file_release_write(f); 477 file_release_write(f);
402 mnt = mntget(f->f_path.mnt); 478 mnt = mntget(f->f_path.mnt);
403 file_list_unlock(); 479 /* This can sleep, so we can't hold the spinlock. */
404 /* 480 lg_global_unlock(files_lglock);
405 * This can sleep, so we can't hold
406 * the file_list_lock() spinlock.
407 */
408 mnt_drop_write(mnt); 481 mnt_drop_write(mnt);
409 mntput(mnt); 482 mntput(mnt);
410 goto retry; 483 goto retry;
411 } 484 } while_file_list_for_each_entry;
412 file_list_unlock(); 485 lg_global_unlock(files_lglock);
413} 486}
414 487
415void __init files_init(unsigned long mempages) 488void __init files_init(unsigned long mempages)
@@ -429,5 +502,6 @@ void __init files_init(unsigned long mempages)
429 if (files_stat.max_files < NR_FILE) 502 if (files_stat.max_files < NR_FILE)
430 files_stat.max_files = NR_FILE; 503 files_stat.max_files = NR_FILE;
431 files_defer_init(); 504 files_defer_init();
505 lg_lock_init(files_lglock);
432 percpu_counter_init(&nr_files, 0); 506 percpu_counter_init(&nr_files, 0);
433} 507}
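
files_lock becomes a "local/global" lock here. A sketch of the lglock API as introduced in this release; the macros take the lock name rather than a pointer, and lg_lock_init() must run once at startup, as files_init() does above:

	#include <linux/lglock.h>

	DECLARE_LGLOCK(example_lglock);
	DEFINE_LGLOCK(example_lglock);

	static void example_fast_path(void)
	{
		lg_local_lock(example_lglock);	/* this CPU's spinlock only */
		/* ... touch only this CPU's portion of the data ... */
		lg_local_unlock(example_lglock);
	}

	static void example_slow_path(void)
	{
		lg_global_lock(example_lglock);	/* take every CPU's lock */
		/* ... walk all per-CPU lists, as fs_may_remount_ro() does ... */
		lg_global_unlock(example_lglock);
	}

The fast path stays cache-local and scales with CPU count; rare global walks pay the full cost of acquiring every per-CPU lock.
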
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 50ab5eecb99b..881aa3d217f0 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void vxfs_put_fake_inode(struct inode *);
63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); 63extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t);
64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); 64extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t);
65extern struct inode * vxfs_iget(struct super_block *, ino_t); 65extern struct inode * vxfs_iget(struct super_block *, ino_t);
66extern void vxfs_clear_inode(struct inode *); 66extern void vxfs_evict_inode(struct inode *);
67 67
68/* vxfs_lookup.c */ 68/* vxfs_lookup.c */
69extern const struct inode_operations vxfs_dir_inode_ops; 69extern const struct inode_operations vxfs_dir_inode_ops;
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 03a6ea5e99f7..79d1b4ea13e7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -337,15 +337,17 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
337} 337}
338 338
339/** 339/**
340 * vxfs_clear_inode - remove inode from main memory 340 * vxfs_evict_inode - remove inode from main memory
341 * @ip: inode to discard. 341 * @ip: inode to discard.
342 * 342 *
343 * Description: 343 * Description:
344 * vxfs_clear_inode() is called on the final iput and frees the private 344 * vxfs_evict_inode() is called on the final iput and frees the private
345 * inode area. 345 * inode area.
346 */ 346 */
347void 347void
348vxfs_clear_inode(struct inode *ip) 348vxfs_evict_inode(struct inode *ip)
349{ 349{
350 truncate_inode_pages(&ip->i_data, 0);
351 end_writeback(ip);
350 kmem_cache_free(vxfs_inode_cachep, ip->i_private); 352 kmem_cache_free(vxfs_inode_cachep, ip->i_private);
351} 353}
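
vxfs follows the same clear_inode/delete_inode to evict_inode unification seen for FAT earlier in this diff. The contract a minimal ->evict_inode() must honour in 2.6.36, sketched for a hypothetical filesystem (example_truncate_blocks() is a stand-in):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* always */
		if (!inode->i_nlink) {
			/* last link gone: the old delete_inode path */
			example_truncate_blocks(inode, 0);
		}
		invalidate_inode_buffers(inode);
		end_writeback(inode);		/* marks the inode I_CLEAR */
		/* old clear_inode work (freeing private state) goes here */
	}
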
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1e8af939b3e4..dc0c041e85cb 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -61,7 +61,7 @@ static int vxfs_statfs(struct dentry *, struct kstatfs *);
61static int vxfs_remount(struct super_block *, int *, char *); 61static int vxfs_remount(struct super_block *, int *, char *);
62 62
63static const struct super_operations vxfs_super_ops = { 63static const struct super_operations vxfs_super_ops = {
64 .clear_inode = vxfs_clear_inode, 64 .evict_inode = vxfs_evict_inode,
65 .put_super = vxfs_put_super, 65 .put_super = vxfs_put_super,
66 .statfs = vxfs_statfs, 66 .statfs = vxfs_statfs,
67 .remount_fs = vxfs_remount, 67 .remount_fs = vxfs_remount,
@@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
135} 135}
136 136
137/** 137/**
138 * vxfs_read_super - read superblock into memory and initalize filesystem 138 * vxfs_read_super - read superblock into memory and initialize filesystem
139 * @sbp: VFS superblock (to fill) 139 * @sbp: VFS superblock (to fill)
140 * @dp: fs private mount data 140 * @dp: fs private mount data
141 * @silent: do not complain loudly when sth is wrong 141 * @silent: do not complain loudly when sth is wrong
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,62 +26,36 @@
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/tracepoint.h>
29#include "internal.h" 30#include "internal.h"
30 31
31#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
32
33/*
34 * We don't actually have pdflush, but this one is exported though /proc...
35 */
36int nr_pdflush_threads;
37
38/* 32/*
39 * Passed into wb_writeback(), essentially a subset of writeback_control 33 * Passed into wb_writeback(), essentially a subset of writeback_control
40 */ 34 */
41struct wb_writeback_args { 35struct wb_writeback_work {
42 long nr_pages; 36 long nr_pages;
43 struct super_block *sb; 37 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 38 enum writeback_sync_modes sync_mode;
45 int for_kupdate:1; 39 unsigned int for_kupdate:1;
46 int range_cyclic:1; 40 unsigned int range_cyclic:1;
47 int for_background:1; 41 unsigned int for_background:1;
48};
49 42
50/*
51 * Work items for the bdi_writeback threads
52 */
53struct bdi_work {
54 struct list_head list; /* pending work list */ 43 struct list_head list; /* pending work list */
55 struct rcu_head rcu_head; /* for RCU free/clear of work */ 44 struct completion *done; /* set if the caller waits */
56
57 unsigned long seen; /* threads that have seen this work */
58 atomic_t pending; /* number of threads still to do work */
59
60 struct wb_writeback_args args; /* writeback arguments */
61
62 unsigned long state; /* flag bits, see WS_* */
63}; 45};
64 46
65enum { 47/*
66 WS_USED_B = 0, 48 * Include the creation of the trace points after defining the
67 WS_ONSTACK_B, 49 * wb_writeback_work structure so that the definition remains local to this
68}; 50 * file.
69 51 */
70#define WS_USED (1 << WS_USED_B) 52#define CREATE_TRACE_POINTS
71#define WS_ONSTACK (1 << WS_ONSTACK_B) 53#include <trace/events/writeback.h>
72
73static inline bool bdi_work_on_stack(struct bdi_work *work)
74{
75 return test_bit(WS_ONSTACK_B, &work->state);
76}
77 54
78static inline void bdi_work_init(struct bdi_work *work, 55/*
79 struct wb_writeback_args *args) 56 * We don't actually have pdflush, but this one is exported though /proc...
80{ 57 */
81 INIT_RCU_HEAD(&work->rcu_head); 58int nr_pdflush_threads;
82 work->args = *args;
83 work->state = WS_USED;
84}
85 59
86/** 60/**
87 * writeback_in_progress - determine whether there is writeback in progress 61 * writeback_in_progress - determine whether there is writeback in progress
@@ -92,186 +66,94 @@ static inline void bdi_work_init(struct bdi_work *work,
92 */ 66 */
93int writeback_in_progress(struct backing_dev_info *bdi) 67int writeback_in_progress(struct backing_dev_info *bdi)
94{ 68{
95 return !list_empty(&bdi->work_list); 69 return test_bit(BDI_writeback_running, &bdi->state);
96}
97
98static void bdi_work_clear(struct bdi_work *work)
99{
100 clear_bit(WS_USED_B, &work->state);
101 smp_mb__after_clear_bit();
102 /*
103 * work can have disappeared at this point. bit waitq functions
104 * should be able to tolerate this, provided bdi_sched_wait does
105 * not dereference it's pointer argument.
106 */
107 wake_up_bit(&work->state, WS_USED_B);
108}
109
110static void bdi_work_free(struct rcu_head *head)
111{
112 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
113
114 if (!bdi_work_on_stack(work))
115 kfree(work);
116 else
117 bdi_work_clear(work);
118} 70}
119 71
120static void wb_work_complete(struct bdi_work *work) 72static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
121{ 73{
122 const enum writeback_sync_modes sync_mode = work->args.sync_mode; 74 struct super_block *sb = inode->i_sb;
123 int onstack = bdi_work_on_stack(work);
124
125 /*
126 * For allocated work, we can clear the done/seen bit right here.
127 * For on-stack work, we need to postpone both the clear and free
128 * to after the RCU grace period, since the stack could be invalidated
129 * as soon as bdi_work_clear() has done the wakeup.
130 */
131 if (!onstack)
132 bdi_work_clear(work);
133 if (sync_mode == WB_SYNC_NONE || onstack)
134 call_rcu(&work->rcu_head, bdi_work_free);
135}
136
137static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
138{
139 /*
140 * The caller has retrieved the work arguments from this work,
141 * drop our reference. If this is the last ref, delete and free it
142 */
143 if (atomic_dec_and_test(&work->pending)) {
144 struct backing_dev_info *bdi = wb->bdi;
145 75
146 spin_lock(&bdi->wb_lock); 76 if (strcmp(sb->s_type->name, "bdev") == 0)
147 list_del_rcu(&work->list); 77 return inode->i_mapping->backing_dev_info;
148 spin_unlock(&bdi->wb_lock);
149 78
150 wb_work_complete(work); 79 return sb->s_bdi;
151 }
152} 80}
153 81
154static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) 82static void bdi_queue_work(struct backing_dev_info *bdi,
83 struct wb_writeback_work *work)
155{ 84{
156 work->seen = bdi->wb_mask; 85 trace_writeback_queue(bdi, work);
157 BUG_ON(!work->seen);
158 atomic_set(&work->pending, bdi->wb_cnt);
159 BUG_ON(!bdi->wb_cnt);
160 86
161 /* 87 spin_lock_bh(&bdi->wb_lock);
162 * list_add_tail_rcu() contains the necessary barriers to 88 list_add_tail(&work->list, &bdi->work_list);
163 * make sure the above stores are seen before the item is 89 if (bdi->wb.task) {
164 * noticed on the list 90 wake_up_process(bdi->wb.task);
165 */ 91 } else {
166 spin_lock(&bdi->wb_lock); 92 /*
167 list_add_tail_rcu(&work->list, &bdi->work_list); 93 * The bdi thread isn't there, wake up the forker thread which
168 spin_unlock(&bdi->wb_lock); 94 * will create and run it.
169 95 */
170 /* 96 trace_writeback_nothread(bdi, work);
171 * If the default thread isn't there, make sure we add it. When
172 * it gets created and wakes up, we'll run this work.
173 */
174 if (unlikely(list_empty_careful(&bdi->wb_list)))
175 wake_up_process(default_backing_dev_info.wb.task); 97 wake_up_process(default_backing_dev_info.wb.task);
176 else {
177 struct bdi_writeback *wb = &bdi->wb;
178
179 if (wb->task)
180 wake_up_process(wb->task);
181 } 98 }
99 spin_unlock_bh(&bdi->wb_lock);
182} 100}
183 101
184/* 102static void
185 * Used for on-stack allocated work items. The caller needs to wait until 103__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
186 * the wb threads have acked the work before it's safe to continue. 104 bool range_cyclic, bool for_background)
187 */
188static void bdi_wait_on_work_clear(struct bdi_work *work)
189{
190 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
191 TASK_UNINTERRUPTIBLE);
192}
193
194static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
195 struct wb_writeback_args *args)
196{ 105{
197 struct bdi_work *work; 106 struct wb_writeback_work *work;
198 107
199 /* 108 /*
200 * This is WB_SYNC_NONE writeback, so if allocation fails just 109 * This is WB_SYNC_NONE writeback, so if allocation fails just
201 * wakeup the thread for old dirty data writeback 110 * wakeup the thread for old dirty data writeback
202 */ 111 */
203 work = kmalloc(sizeof(*work), GFP_ATOMIC); 112 work = kzalloc(sizeof(*work), GFP_ATOMIC);
204 if (work) { 113 if (!work) {
205 bdi_work_init(work, args); 114 if (bdi->wb.task) {
206 bdi_queue_work(bdi, work); 115 trace_writeback_nowork(bdi);
207 } else { 116 wake_up_process(bdi->wb.task);
208 struct bdi_writeback *wb = &bdi->wb; 117 }
209 118 return;
210 if (wb->task)
211 wake_up_process(wb->task);
212 } 119 }
120
121 work->sync_mode = WB_SYNC_NONE;
122 work->nr_pages = nr_pages;
123 work->range_cyclic = range_cyclic;
124 work->for_background = for_background;
125
126 bdi_queue_work(bdi, work);
213} 127}
214 128
215/** 129/**
216 * bdi_sync_writeback - start and wait for writeback 130 * bdi_start_writeback - start writeback
217 * @bdi: the backing device to write from 131 * @bdi: the backing device to write from
218 * @sb: write inodes from this super_block 132 * @nr_pages: the number of pages to write
219 * 133 *
220 * Description: 134 * Description:
221 * This does WB_SYNC_ALL data integrity writeback and waits for the 135 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
222 * IO to complete. Callers must hold the sb s_umount semaphore for 136 * started when this function returns, we make no guarantees on
223 * reading, to avoid having the super disappear before we are done. 137 * completion. Caller need not hold sb s_umount semaphore.
138 *
224 */ 139 */
225static void bdi_sync_writeback(struct backing_dev_info *bdi, 140void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
226 struct super_block *sb)
227{ 141{
228 struct wb_writeback_args args = { 142 __bdi_start_writeback(bdi, nr_pages, true, false);
229 .sb = sb,
230 .sync_mode = WB_SYNC_ALL,
231 .nr_pages = LONG_MAX,
232 .range_cyclic = 0,
233 };
234 struct bdi_work work;
235
236 bdi_work_init(&work, &args);
237 work.state |= WS_ONSTACK;
238
239 bdi_queue_work(bdi, &work);
240 bdi_wait_on_work_clear(&work);
241} 143}
242 144
243/** 145/**
244 * bdi_start_writeback - start writeback 146 * bdi_start_background_writeback - start background writeback
245 * @bdi: the backing device to write from 147 * @bdi: the backing device to write from
246 * @sb: write inodes from this super_block
247 * @nr_pages: the number of pages to write
248 * 148 *
249 * Description: 149 * Description:
250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 150 * This does WB_SYNC_NONE background writeback. The IO is only
251 * started when this function returns, we make no guarantees on 151 * started when this function returns, we make no guarantees on
252 * completion. Caller need not hold sb s_umount semaphore. 152 * completion. Caller need not hold sb s_umount semaphore.
253 *
254 */ 153 */
255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 154void bdi_start_background_writeback(struct backing_dev_info *bdi)
256 long nr_pages)
257{ 155{
258 struct wb_writeback_args args = { 156 __bdi_start_writeback(bdi, LONG_MAX, true, true);
259 .sb = sb,
260 .sync_mode = WB_SYNC_NONE,
261 .nr_pages = nr_pages,
262 .range_cyclic = 1,
263 };
264
265 /*
266 * We treat @nr_pages=0 as the special case to do background writeback,
267 * ie. to sync pages until the background dirty threshold is reached.
268 */
269 if (!nr_pages) {
270 args.nr_pages = LONG_MAX;
271 args.for_background = 1;
272 }
273
274 bdi_alloc_queue_work(bdi, &args);
275} 157}
276 158
277/* 159/*
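
Callers change accordingly: the old nr_pages == 0 special case becomes an explicitly named entry point. A sketch of a hypothetical call site spanning both forms:

	static void example_kick_flusher(struct backing_dev_info *bdi,
					 long nr_pages)
	{
		/* old: bdi_start_writeback(bdi, NULL, 0) meant "background" */
		if (nr_pages)
			bdi_start_writeback(bdi, nr_pages);	/* new 2-arg form */
		else
			bdi_start_background_writeback(bdi);	/* explicit intent */
	}
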
@@ -375,10 +257,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
375 257
376/* 258/*
377 * Queue all expired dirty inodes for io, eldest first. 259 * Queue all expired dirty inodes for io, eldest first.
260 * Before
261 * newly dirtied b_dirty b_io b_more_io
262 * =============> gf edc BA
263 * After
264 * newly dirtied b_dirty b_io b_more_io
265 * =============> g fBAedc
266 * |
267 * +--> dequeue for IO
378 */ 268 */
379static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 269static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
380{ 270{
381 list_splice_init(&wb->b_more_io, wb->b_io.prev); 271 list_splice_init(&wb->b_more_io, &wb->b_io);
382 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 272 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
383} 273}
384 274
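
The small-looking change in queue_io() is a head-versus-tail splice; since b_io is consumed from its tail (b_io.prev), where b_more_io lands decides when those inodes run. A sketch of the two forms on a generic list_head, not the patch itself:

	static void example_splice_head(struct list_head *src,
					struct list_head *dst)
	{
		list_splice_init(src, dst);		/* src joins dst's head */
	}

	static void example_splice_tail(struct list_head *src,
					struct list_head *dst)
	{
		list_splice_init(src, dst->prev);	/* src joins dst's tail */
	}

With consumers dequeuing from dst->prev, the new head splice makes the b_more_io inodes run after everything already in b_io, exactly as the diagram above shows.
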
@@ -398,11 +288,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 288 wait_queue_head_t *wqh;
399 289
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 290 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 291 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 292 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 293 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 294 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 295 }
406} 296}
407 297
408/* 298/*
@@ -452,11 +342,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 342
453 BUG_ON(inode->i_state & I_SYNC); 343 BUG_ON(inode->i_state & I_SYNC);
454 344
455 /* Set I_SYNC, reset I_DIRTY */ 345 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 346 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 347 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 348 spin_unlock(&inode_lock);
461 349
462 ret = do_writepages(mapping, wbc); 350 ret = do_writepages(mapping, wbc);
@@ -472,6 +360,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 360 ret = err;
473 } 361 }
474 362
363 /*
364 * Some filesystems may redirty the inode during the writeback
365 * due to delalloc, clear dirty metadata flags right before
366 * write_inode()
367 */
368 spin_lock(&inode_lock);
369 dirty = inode->i_state & I_DIRTY;
370 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
371 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 372 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 373 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 374 int err = write_inode(inode, wbc);
@@ -481,63 +378,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 378
482 spin_lock(&inode_lock); 379 spin_lock(&inode_lock);
483 inode->i_state &= ~I_SYNC; 380 inode->i_state &= ~I_SYNC;
484 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 381 if (!(inode->i_state & I_FREEING)) {
485 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { 382 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
486 /*
487 * More pages get dirtied by a fast dirtier.
488 */
489 goto select_queue;
490 } else if (inode->i_state & I_DIRTY) {
491 /*
492 * At least XFS will redirty the inode during the
493 * writeback (delalloc) and on io completion (isize).
494 */
495 redirty_tail(inode);
496 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
497 /* 383 /*
498 * We didn't write back all the pages. nfs_writepages() 384 * We didn't write back all the pages. nfs_writepages()
499 * sometimes bales out without doing anything. Redirty 385 * sometimes bales out without doing anything.
500 * the inode; Move it from b_io onto b_more_io/b_dirty.
501 */ 386 */
502 /* 387 inode->i_state |= I_DIRTY_PAGES;
503 * akpm: if the caller was the kupdate function we put 388 if (wbc->nr_to_write <= 0) {
504 * this inode at the head of b_dirty so it gets first
505 * consideration. Otherwise, move it to the tail, for
506 * the reasons described there. I'm not really sure
507 * how much sense this makes. Presumably I had a good
508 * reasons for doing it this way, and I'd rather not
509 * muck with it at present.
510 */
511 if (wbc->for_kupdate) {
512 /* 389 /*
513 * For the kupdate function we move the inode 390 * slice used up: queue for next turn
514 * to b_more_io so it will get more writeout as
515 * soon as the queue becomes uncongested.
516 */ 391 */
517 inode->i_state |= I_DIRTY_PAGES; 392 requeue_io(inode);
518select_queue:
519 if (wbc->nr_to_write <= 0) {
520 /*
521 * slice used up: queue for next turn
522 */
523 requeue_io(inode);
524 } else {
525 /*
526 * somehow blocked: retry later
527 */
528 redirty_tail(inode);
529 }
530 } else { 393 } else {
531 /* 394 /*
532 * Otherwise fully redirty the inode so that 395 * Writeback blocked by something other than
533 * other inodes on this superblock will get some 396 * congestion. Delay the inode for some time to
534 * writeout. Otherwise heavy writing to one 397 * avoid spinning on the CPU (100% iowait)
535 * file would indefinitely suspend writeout of 398 * retrying writeback of the dirty page/inode
536 * all the other files. 399 * that cannot be performed immediately.
537 */ 400 */
538 inode->i_state |= I_DIRTY_PAGES;
539 redirty_tail(inode); 401 redirty_tail(inode);
540 } 402 }
403 } else if (inode->i_state & I_DIRTY) {
404 /*
405 * Filesystems can dirty the inode during writeback
406 * operations, such as delayed allocation during
407 * submission or metadata updates after data IO
408 * completion.
409 */
410 redirty_tail(inode);
541 } else if (atomic_read(&inode->i_count)) { 411 } else if (atomic_read(&inode->i_count)) {
542 /* 412 /*
543 * The inode is clean, inuse 413 * The inode is clean, inuse
@@ -554,75 +424,69 @@ select_queue:
554 return ret; 424 return ret;
555} 425}
556 426
557static void unpin_sb_for_writeback(struct super_block *sb)
558{
559 up_read(&sb->s_umount);
560 put_super(sb);
561}
562
563enum sb_pin_state {
564 SB_PINNED,
565 SB_NOT_PINNED,
566 SB_PIN_FAILED
567};
568
569/* 427/*
570 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 428 * For background writeback the caller does not have the sb pinned
571 * before calling writeback. So make sure that we do pin it, so it doesn't 429 * before calling writeback. So make sure that we do pin it, so it doesn't
572 * go away while we are writing inodes from it. 430 * go away while we are writing inodes from it.
573 */ 431 */
574static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, 432static bool pin_sb_for_writeback(struct super_block *sb)
575 struct super_block *sb)
576{ 433{
577 /*
578 * Caller must already hold the ref for this
579 */
580 if (wbc->sync_mode == WB_SYNC_ALL) {
581 WARN_ON(!rwsem_is_locked(&sb->s_umount));
582 return SB_NOT_PINNED;
583 }
584 spin_lock(&sb_lock); 434 spin_lock(&sb_lock);
435 if (list_empty(&sb->s_instances)) {
436 spin_unlock(&sb_lock);
437 return false;
438 }
439
585 sb->s_count++; 440 sb->s_count++;
441 spin_unlock(&sb_lock);
442
586 if (down_read_trylock(&sb->s_umount)) { 443 if (down_read_trylock(&sb->s_umount)) {
587 if (sb->s_root) { 444 if (sb->s_root)
588 spin_unlock(&sb_lock); 445 return true;
589 return SB_PINNED;
590 }
591 /*
592 * umounted, drop rwsem again and fall through to failure
593 */
594 up_read(&sb->s_umount); 446 up_read(&sb->s_umount);
595 } 447 }
596 sb->s_count--; 448
597 spin_unlock(&sb_lock); 449 put_super(sb);
598 return SB_PIN_FAILED; 450 return false;
599} 451}
 
 /*
  * Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
+ *
  * Return 1, if the caller writeback routine should be
  * interrupted. Otherwise return 0.
  */
-static int writeback_sb_inodes(struct super_block *sb,
-			       struct bdi_writeback *wb,
-			       struct writeback_control *wbc)
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+		struct writeback_control *wbc, bool only_this_sb)
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
-		if (wbc->sb && sb != inode->i_sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		if (sb != inode->i_sb)
-			/* finish with this superblock */
+
+		if (inode->i_sb != sb) {
+			if (only_this_sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
 			return 0;
+		}
+
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
@@ -634,7 +498,7 @@ static int writeback_sb_inodes(struct super_block *sb,
 		if (inode_dirtied_after(inode, wbc->wb_start))
 			return 1;
 
-		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
+		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -660,12 +524,13 @@ static int writeback_sb_inodes(struct super_block *sb,
 		return 1;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+			 struct writeback_control *wbc)
 {
 	int ret = 0;
 
-	wbc->wb_start = jiffies; /* livelock avoidance */
+	if (!wbc->wb_start)
+		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -674,24 +539,14 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
-		enum sb_pin_state state;
 
-		if (wbc->sb && sb != wbc->sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		state = pin_sb_for_writeback(wbc, sb);
-
-		if (state == SB_PIN_FAILED) {
+		if (!pin_sb_for_writeback(sb)) {
 			requeue_io(inode);
 			continue;
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc);
+		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		drop_super(sb);
 
-		if (state == SB_PINNED)
-			unpin_sb_for_writeback(sb);
 		if (ret)
 			break;
 	}
@@ -699,11 +554,16 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 	/* Leave any unwritten inodes on b_io */
 }
 
-void writeback_inodes_wbc(struct writeback_control *wbc)
+static void __writeback_inodes_sb(struct super_block *sb,
+		struct bdi_writeback *wb, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = wbc->bdi;
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	writeback_inodes_wb(&bdi->wb, wbc);
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+	writeback_sb_inodes(sb, wb, wbc, true);
+	spin_unlock(&inode_lock);
 }
 
 /*
@@ -719,7 +579,7 @@ static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
 
-	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+	global_dirty_limits(&background_thresh, &dirty_thresh);
 
 	return (global_page_state(NR_FILE_DIRTY) +
 		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
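
global_dirty_limits() only changes where the two thresholds come from; the stopping rule is unchanged: background writeback keeps going while dirty plus unstable pages sit at or above the background threshold. The same rule in plain C, with made-up counters and a percentage-based threshold for illustration:

/* Background writeback stops once dirty state drops below this fraction. */
static unsigned long background_thresh(unsigned long total_pages,
				       unsigned background_ratio)
{
	return total_pages * background_ratio / 100;
}

static int over_background(unsigned long nr_dirty, unsigned long nr_unstable,
			   unsigned long total_pages, unsigned ratio)
{
	return nr_dirty + nr_unstable >= background_thresh(total_pages, ratio);
}
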
@@ -741,16 +601,14 @@ static inline bool over_bground_thresh(void)
  * all dirty pages if they are all attached to "old" mappings.
  */
 static long wb_writeback(struct bdi_writeback *wb,
-			 struct wb_writeback_args *args)
+			 struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
-		.bdi			= wb->bdi,
-		.sb			= args->sb,
-		.sync_mode		= args->sync_mode,
+		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= args->for_kupdate,
-		.for_background		= args->for_background,
-		.range_cyclic		= args->range_cyclic,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -766,25 +624,33 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.range_end = LLONG_MAX;
 	}
 
+	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (args->nr_pages <= 0)
+		if (work->nr_pages <= 0)
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (args->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		writeback_inodes_wb(wb, &wbc);
-		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+		trace_wbc_writeback_start(&wbc, wb->bdi);
+		if (work->sb)
+			__writeback_inodes_sb(work->sb, wb, &wbc);
+		else
+			writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc, wb->bdi);
+
+		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -811,6 +677,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io)) {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
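
Each loop iteration above hands writeback a fixed budget (MAX_WRITEBACK_PAGES) and measures progress as budget minus whatever is left in wbc.nr_to_write. A compact sketch of that accounting in C, with a stand-in write_some() instead of the real inode walk; names and the chunk size are illustrative:

#define CHUNK 1024			/* stand-in for MAX_WRITEBACK_PAGES */

/* Assumed elsewhere: returns how many pages it managed to push this round. */
extern long write_some(long budget);

static long flush_until_done(long nr_pages)
{
	long wrote = 0;

	while (nr_pages > 0) {
		long nr_to_write = CHUNK;	/* fresh slice each round */
		long done = write_some(nr_to_write);

		nr_to_write -= done;
		nr_pages -= CHUNK - nr_to_write;	/* consumed this round */
		wrote += CHUNK - nr_to_write;

		if (done == 0)		/* no progress: stop instead of spinning */
			break;
	}
	return wrote;
}
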
@@ -820,31 +687,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 
 /*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
-					   struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi)
 {
-	struct bdi_work *work, *ret = NULL;
-
-	rcu_read_lock();
+	struct wb_writeback_work *work = NULL;
 
-	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_bit(wb->nr, &work->seen))
-			continue;
-		clear_bit(wb->nr, &work->seen);
-
-		ret = work;
-		break;
+	spin_lock_bh(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
 	}
-
-	rcu_read_unlock();
-	return ret;
+	spin_unlock_bh(&bdi->wb_lock);
+	return work;
 }
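
The new get_next_work_item() drops the per-thread `seen' bitmask in favour of the simplest possible scheme: one FIFO list, one lock, pop from the head. Roughly this shape in userspace C (illustrative names; a pthread mutex stands in for the bh-safe spinlock):

#include <pthread.h>
#include <stddef.h>

struct work {
	struct work *next;
	void (*fn)(struct work *);
};

static struct work *head;
static struct work **tail = &head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pop the oldest queued work item, or NULL if the list is empty. */
static struct work *get_next_work_item(void)
{
	struct work *w;

	pthread_mutex_lock(&list_lock);
	w = head;
	if (w) {
		head = w->next;
		if (!head)
			tail = &head;
		w->next = NULL;		/* detach, like list_del_init() */
	}
	pthread_mutex_unlock(&list_lock);
	return w;
}
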
 
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -852,6 +709,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -863,14 +726,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	if (nr_pages) {
-		struct wb_writeback_args args = {
+		struct wb_writeback_work work = {
 			.nr_pages	= nr_pages,
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
 		};
 
-		return wb_writeback(wb, &args);
+		return wb_writeback(wb, &work);
 	}
 
 	return 0;
@@ -882,39 +745,37 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		struct wb_writeback_args args = work->args;
-
+	set_bit(BDI_writeback_running, &wb->bdi->state);
+	while ((work = get_next_work_item(bdi)) != NULL) {
 		/*
 		 * Override sync mode, in case we must wait for completion
+		 * because this thread is exiting now.
 		 */
 		if (force_wait)
-			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+			work->sync_mode = WB_SYNC_ALL;
 
-		/*
-		 * If this isn't a data integrity operation, just notify
-		 * that we have seen this work and we are now starting it.
-		 */
-		if (args.sync_mode == WB_SYNC_NONE)
-			wb_clear_pending(wb, work);
+		trace_writeback_exec(bdi, work);
 
-		wrote += wb_writeback(wb, &args);
+		wrote += wb_writeback(wb, work);
 
 		/*
-		 * This is a data integrity writeback, so only do the
-		 * notification when we have completed the work.
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
-			wb_clear_pending(wb, work);
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
 	}
 
 	/*
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
 }
@@ -923,75 +784,88 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * wakes up periodically and does kupdated style flushing.
  */
-int bdi_writeback_task(struct bdi_writeback *wb)
+int bdi_writeback_thread(void *data)
 {
-	unsigned long last_active = jiffies;
-	unsigned long wait_jiffies = -1UL;
+	struct bdi_writeback *wb = data;
+	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+	wb->last_active = jiffies;
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(current, 0);
+
+	trace_writeback_thread_start(bdi);
+
 	while (!kthread_should_stop()) {
+		/*
+		 * Remove own delayed wake-up timer, since we are already awake
+		 * and we'll take care of the periodic write-back.
+		 */
+		del_timer(&wb->wakeup_timer);
+
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
-			last_active = jiffies;
-		else if (wait_jiffies != -1UL) {
-			unsigned long max_idle;
+			wb->last_active = jiffies;
 
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			continue;
+		}
+
+		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		else {
 			/*
-			 * Longest period of inactivity that we tolerate. If we
-			 * see dirty data again later, the task will get
-			 * recreated automatically.
+			 * We have nothing to do, so can go sleep without any
+			 * timeout and save power. When a work is queued or
+			 * something is made dirty - we will be woken up.
 			 */
-			max_idle = max(5UL * 60 * HZ, wait_jiffies);
-			if (time_after(jiffies, max_idle + last_active))
-				break;
+			schedule();
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
 		try_to_freeze();
 	}
 
+	/* Flush any work that raced with us exiting */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	trace_writeback_thread_stop(bdi);
 	return 0;
 }
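
The thread body above is a careful idle loop: drain the work list, then re-check it with the task state already set to sleeping so a concurrent queue-plus-wakeup cannot be lost, then pick a timed sleep (dirty IO pending and a periodic interval configured) or an untimed, power-friendly one. A userspace analogue using a condition variable; all names are invented for the example:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool have_work, should_stop, have_dirty;
static unsigned interval_ms = 5000;

static void *flusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!should_stop) {
		while (have_work) {		/* drain before deciding how to sleep */
			have_work = false;
			pthread_mutex_unlock(&lock);
			/* ... do the writeback ... */
			pthread_mutex_lock(&lock);
		}
		/* holding the lock here is the "set state, then re-check" step:
		 * a producer cannot signal between our test and our wait */
		if (have_dirty && interval_ms) {
			struct timespec ts;
			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += interval_ms / 1000;
			pthread_cond_timedwait(&wake, &lock, &ts);
		} else {
			pthread_cond_wait(&wake, &lock);	/* sleep indefinitely */
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}
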
 
+
 /*
- * Schedule writeback for all backing devices. This does WB_SYNC_NONE
- * writeback, for integrity writeback see bdi_sync_writeback().
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
  */
-static void bdi_writeback_all(struct super_block *sb, long nr_pages)
+void wakeup_flusher_threads(long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sb		= sb,
-		.nr_pages	= nr_pages,
-		.sync_mode	= WB_SYNC_NONE,
-	};
 	struct backing_dev_info *bdi;
 
-	rcu_read_lock();
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
+				global_page_state(NR_UNSTABLE_NFS);
+	}
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-
-		bdi_alloc_queue_work(bdi, &args);
+		__bdi_start_writeback(bdi, nr_pages, false, false);
 	}
-
 	rcu_read_unlock();
 }
 
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
- */
-void wakeup_flusher_threads(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	bdi_writeback_all(NULL, nr_pages);
-}
-
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1044,6 +918,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
+	struct backing_dev_info *bdi = NULL;
+	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1089,7 +965,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			if (hlist_unhashed(&inode->i_hash))
 				goto out;
 		}
-		if (inode->i_state & (I_FREEING|I_CLEAR))
+		if (inode->i_state & I_FREEING)
 			goto out;
 
 		/*
@@ -1097,22 +973,31 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
-			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-			struct backing_dev_info *bdi = wb->bdi;
-
-			if (bdi_cap_writeback_dirty(bdi) &&
-			    !test_bit(BDI_registered, &bdi->state)) {
-				WARN_ON(1);
-				printk(KERN_ERR "bdi-%s not registered\n",
-								bdi->name);
+			bdi = inode_to_bdi(inode);
+
+			if (bdi_cap_writeback_dirty(bdi)) {
+				WARN(!test_bit(BDI_registered, &bdi->state),
+				     "bdi-%s not registered\n", bdi->name);
+
+				/*
+				 * If this is the first dirty inode for this
+				 * bdi, we have to wake-up the corresponding
+				 * bdi thread to make sure background
+				 * write-back happens later.
+				 */
+				if (!wb_has_dirty_io(&bdi->wb))
+					wakeup_bdi = true;
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &wb->b_dirty);
+			list_move(&inode->i_list, &bdi->wb.b_dirty);
 		}
 	}
 out:
 	spin_unlock(&inode_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1155,7 +1040,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		struct address_space *mapping;
 
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		mapping = inode->i_mapping;
 		if (mapping->nrpages == 0)
@@ -1196,12 +1081,20 @@ void writeback_inodes_sb(struct super_block *sb)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_NONE,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	nr_to_write = nr_dirty + nr_unstable +
+	work.nr_pages = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1215,7 +1108,9 @@ EXPORT_SYMBOL(writeback_inodes_sb);
 int writeback_inodes_sb_if_idle(struct super_block *sb)
 {
 	if (!writeback_in_progress(sb->s_bdi)) {
+		down_read(&sb->s_umount);
 		writeback_inodes_sb(sb);
+		up_read(&sb->s_umount);
 		return 1;
 	} else
 		return 0;
@@ -1231,7 +1126,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	bdi_sync_writeback(sb->s_bdi, sb);
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
+		.sb		= sb,
+		.sync_mode	= WB_SYNC_ALL,
+		.nr_pages	= LONG_MAX,
+		.range_cyclic	= 0,
+		.done		= &done,
+	};
+
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
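
writeback_inodes_sb() and sync_inodes_sb() now share one handoff pattern: describe the work on the caller's stack, queue it, and block on an on-stack completion that the flusher fires (the `work->done ? complete : kfree' branch in wb_do_writeback() above). A sketch of that handoff with pthreads; the struct and function names are invented for the example:

#include <pthread.h>
#include <stdbool.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

struct wb_work {
	long nr_pages;
	struct completion *done;	/* NULL means "fire and forget" */
};

/* Assumed elsewhere: hands the item to the flusher thread. */
extern void queue_to_flusher(struct wb_work *w);

void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* Synchronous caller: the work struct lives on our stack, so we must
 * not return until the flusher has signalled completion. */
void writeback_and_wait(long nr_pages)
{
	struct completion done = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
	};
	struct wb_work work = { .nr_pages = nr_pages, .done = &done };

	queue_to_flusher(&work);
	wait_for_completion(&done);
}
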
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db5..ed45a9cf5f3d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
 	struct path old_root;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
 	struct path old_pwd;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 				fs->pwd = *new_root;
 				count++;
 			}
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -104,14 +104,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		rwlock_init(&fs->lock);
+		spin_lock_init(&fs->lock);
 		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
+		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
 	return fs;
 }
@@ -126,10 +121,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	task_unlock(current);
 
 	if (kill)
@@ -148,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
@@ -161,14 +156,14 @@ void daemonize_fs_struct(void)
 
 	task_lock(current);
 
-	write_lock(&init_fs.lock);
+	spin_lock(&init_fs.lock);
 	init_fs.users++;
-	write_unlock(&init_fs.lock);
+	spin_unlock(&init_fs.lock);
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	current->fs = &init_fs;
 	kill = !--fs->users;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	task_unlock(current);
 	if (kill)
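
The fs_struct conversion swaps the reader/writer lock for a plain spinlock: every critical section in this file is a few pointer copies, so the cheaper lock wins and the read/write distinction buys nothing. copy_fs_struct() also folds its open-coded snapshot into get_fs_root_and_pwd(); the underlying pattern is `copy and take references under the lock, use the copies after dropping it', as in this userspace sketch with illustrative types:

#include <pthread.h>

struct path { void *dentry, *mnt; };

struct fs_struct {
	pthread_spinlock_t lock;
	struct path root, pwd;
};

/* Assumed elsewhere: bumps the refcount behind a path. */
extern void path_get(struct path *p);

/* Snapshot both paths atomically; the caller owns the references. */
static void get_root_and_pwd(struct fs_struct *fs,
			     struct path *root, struct path *pwd)
{
	pthread_spin_lock(&fs->lock);
	*root = fs->root;
	path_get(root);
	*pwd = fs->pwd;
	path_get(pwd);
	pthread_spin_unlock(&fs->lock);
}
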
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f2..3f6dfa989881 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
-	select SLOW_WORK
 	help
 	  This option enables a generic filesystem caching manager that can be
 	  used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e5..f6aad48d38a8 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
 extern unsigned fscache_defer_create;
 extern unsigned fscache_debug;
 extern struct kobject *fscache_root;
+extern struct workqueue_struct *fscache_object_wq;
+extern struct workqueue_struct *fscache_op_wq;
+DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+static inline bool fscache_object_congested(void)
+{
+	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
+}
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
@@ -313,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
 #define dbgprintk(FMT, ...) \
 	printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
 
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 
 #ifdef __KDEBUG
 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -350,9 +352,9 @@ do { \
 } while (0)
 
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f04..f9d856773f79 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/completion.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
 	 "FS-Cache debugging mask");
 
 struct kobject *fscache_root;
+struct workqueue_struct *fscache_object_wq;
+struct workqueue_struct *fscache_op_wq;
+
+DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
+
+/* these values serve as lower bounds, will be adjusted in fscache_init() */
+static unsigned fscache_object_max_active = 4;
+static unsigned fscache_op_max_active = 2;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fscache_sysctl_header;
+
+static int fscache_max_active_sysctl(struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct workqueue_struct **wqp = table->extra1;
+	unsigned int *datap = table->data;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0)
+		workqueue_set_max_active(*wqp, *datap);
+	return ret;
+}
+
+ctl_table fscache_sysctls[] = {
+	{
+		.procname	= "object_max_active",
+		.data		= &fscache_object_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_object_wq,
+	},
+	{
+		.procname	= "operation_max_active",
+		.data		= &fscache_op_max_active,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= fscache_max_active_sysctl,
+		.extra1		= &fscache_op_wq,
+	},
+	{}
+};
+
+ctl_table fscache_sysctls_root[] = {
+	{
+		.procname	= "fscache",
+		.mode		= 0555,
+		.child		= fscache_sysctls,
+	},
+	{}
+};
+#endif
 
 /*
  * initialise the fs caching module
  */
 static int __init fscache_init(void)
 {
+	unsigned int nr_cpus = num_possible_cpus();
+	unsigned int cpu;
 	int ret;
 
-	ret = slow_work_register_user(THIS_MODULE);
-	if (ret < 0)
-		goto error_slow_work;
+	fscache_object_max_active =
+		clamp_val(nr_cpus,
+			  fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
+					    fscache_object_max_active);
+	if (!fscache_object_wq)
+		goto error_object_wq;
+
+	fscache_op_max_active =
+		clamp_val(fscache_object_max_active / 2,
+			  fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
+
+	ret = -ENOMEM;
+	fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
+					fscache_op_max_active);
+	if (!fscache_op_wq)
+		goto error_op_wq;
+
+	for_each_possible_cpu(cpu)
+		init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
 
 	ret = fscache_proc_init();
 	if (ret < 0)
 		goto error_proc;
 
+#ifdef CONFIG_SYSCTL
+	ret = -ENOMEM;
+	fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
+	if (!fscache_sysctl_header)
+		goto error_sysctl;
+#endif
+
 	fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
 					       sizeof(struct fscache_cookie),
 					       0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
 error_kobj:
 	kmem_cache_destroy(fscache_cookie_jar);
 error_cookie_jar:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+error_sysctl:
+#endif
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user(THIS_MODULE);
-error_slow_work:
+	destroy_workqueue(fscache_op_wq);
+error_op_wq:
+	destroy_workqueue(fscache_object_wq);
+error_object_wq:
 	return ret;
 }
 
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
 
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(fscache_sysctl_header);
+#endif
 	fscache_proc_cleanup();
-	slow_work_unregister_user(THIS_MODULE);
+	destroy_workqueue(fscache_op_wq);
+	destroy_workqueue(fscache_object_wq);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
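
fscache_init() now sizes its two unbound workqueues from the machine: object concurrency is at least 4 but scales with the possible-CPU count, operation concurrency is at least 2 but tracks half the object figure, both capped at WQ_UNBOUND_MAX_ACTIVE. The clamping arithmetic is trivial; a userspace rendering, with sysconf() standing in for num_possible_cpus() and an assumed cap:

#include <stdio.h>
#include <unistd.h>

static long clamp_val(long v, long lo, long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	long hard_cap = 512;	/* stand-in for WQ_UNBOUND_MAX_ACTIVE */
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	long object_max = clamp_val(ncpus, 4, hard_cap);
	long op_max = clamp_val(object_max / 2, 2, hard_cap);

	/* these two figures would size the object and operation pools */
	printf("object pool: %ld, operation pool: %ld\n", object_max, op_max);
	return 0;
}
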
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..ebe29c581380 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
 #define FSCACHE_OBJLIST_CONFIG_NOREADS	0x00000200	/* show objects without active reads */
 #define FSCACHE_OBJLIST_CONFIG_EVENTS	0x00000400	/* show objects with events */
 #define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800	/* show objects without no events */
-#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with slow work */
-#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without slow work */
+#define FSCACHE_OBJLIST_CONFIG_WORK	0x00001000	/* show objects with work */
+#define FSCACHE_OBJLIST_CONFIG_NOWORK	0x00002000	/* show objects without work */
 
 	u8		buf[512];	/* key and aux data buffer */
 };
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
 	/* banners (can't represent line 0 by pos 0 as that would involve
 	 * returning a NULL pointer) */
 	if (pos == 0)
-		return (struct fscache_object *) ++(*_pos);
+		return (struct fscache_object *)(long)++(*_pos);
 	if (pos < 3)
 		return (struct fscache_object *)pos;
 
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		       READS, NOREADS);
 		FILTER(obj->events & obj->event_mask,
 		       EVENTS, NOEVENTS);
-		FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
-		       WORK, NOWORK);
+		FILTER(work_busy(&obj->work), WORK, NOWORK);
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
 		   fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
 		   obj->events,
 		   obj->flags,
-		   obj->work.flags);
+		   work_busy(&obj->work));
 
 	no_cookie = true;
 	keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ffc..b6b897c550ac 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
 
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
-#include <linux/seq_file.h>
 #include "internal.h"
 
 const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 	[FSCACHE_OBJECT_DEAD]		= "DEAD",
 };
 
-static void fscache_object_slow_work_put_ref(struct slow_work *);
-static int fscache_object_slow_work_get_ref(struct slow_work *);
-static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
-#endif
+static int fscache_get_object(struct fscache_object *);
+static void fscache_put_object(struct fscache_object *);
 static void fscache_initialise_object(struct fscache_object *);
 static void fscache_lookup_object(struct fscache_object *);
 static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
 static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
-const struct slow_work_ops fscache_object_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_object_slow_work_get_ref,
-	.put_ref	= fscache_object_slow_work_put_ref,
-	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_object_slow_work_desc,
-#endif
-};
-EXPORT_SYMBOL(fscache_object_slow_work_ops);
-
 /*
  * we need to notify the parent when an op completes that we had outstanding
  * upon it
@@ -345,7 +329,7 @@ unsupported_event:
 /*
  * execute an object
  */
-static void fscache_object_slow_work_execute(struct slow_work *work)
+void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 	if (object->events & object->event_mask)
 		fscache_enqueue_object(object);
 	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+	fscache_put_object(object);
 }
-
-/*
- * describe an object for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_object_slow_work_desc(struct slow_work *work,
-					  struct seq_file *m)
-{
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
-	seq_printf(m, "FSC: OBJ%x: %s",
-		   object->debug_id,
-		   fscache_object_states_short[object->state]);
-}
-#endif
+EXPORT_SYMBOL(fscache_object_work_func);
 
 /*
  * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
 	_enter("");
 	ASSERT(object->cookie != NULL);
 	ASSERT(object->cookie->parent != NULL);
-	ASSERT(list_empty(&object->work.link));
 
 	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
 			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
 		object->parent = NULL;
 	}
 
-	/* this just shifts the object release to the slow work processor */
-	fscache_stat(&fscache_n_cop_put_object);
-	object->cache->ops->put_object(object);
-	fscache_stat_d(&fscache_n_cop_put_object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
 
 	_leave("");
 }
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
 }
 
 /*
- * allow the slow work item processor to get a ref on an object
+ * get a ref on an object
  */
-static int fscache_object_slow_work_get_ref(struct slow_work *work)
+static int fscache_get_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
 	int ret;
 
 	fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
 }
 
 /*
- * allow the slow work item processor to discard a ref on a work item
+ * discard a ref on a work item
  */
-static void fscache_object_slow_work_put_ref(struct slow_work *work)
+static void fscache_put_object(struct fscache_object *object)
 {
-	struct fscache_object *object =
-		container_of(work, struct fscache_object, work);
-
 	fscache_stat(&fscache_n_cop_put_object);
 	object->cache->ops->put_object(object);
 	fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
-	slow_work_enqueue(&object->work);
+	if (fscache_get_object(object) >= 0) {
+		wait_queue_head_t *cong_wq =
+			&get_cpu_var(fscache_object_cong_wait);
+
+		if (queue_work(fscache_object_wq, &object->work)) {
+			if (fscache_object_congested())
+				wake_up(cong_wq);
+		} else
+			fscache_put_object(object);
+
+		put_cpu_var(fscache_object_cong_wait);
+	}
+}
+
+/**
+ * fscache_object_sleep_till_congested - Sleep until object wq is congested
+ * @timeoutp: Scheduler sleep timeout
+ *
+ * Allow an object handler to sleep until the object workqueue is congested.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * %true is returned if the object wq is congested, %false otherwise.
+ */
+bool fscache_object_sleep_till_congested(signed long *timeoutp)
+{
+	wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+	DEFINE_WAIT(wait);
+
+	if (fscache_object_congested())
+		return true;
+
+	add_wait_queue_exclusive(cong_wq, &wait);
+	if (!fscache_object_congested())
+		*timeoutp = schedule_timeout(*timeoutp);
+	finish_wait(cong_wq, &wait);
+
+	return fscache_object_congested();
 }
+EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
  * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 
 		/* sort onto appropriate lists */
 		fscache_enqueue_object(dep);
-		fscache_stat(&fscache_n_cop_put_object);
-		dep->cache->ops->put_object(dep);
-		fscache_stat_d(&fscache_n_cop_put_object);
+		fscache_put_object(dep);
 
 		if (!list_empty(&object->dependents))
 			cond_resched_lock(&object->lock);
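
fscache_object_sleep_till_congested() follows the classic lost-wakeup discipline: test the condition, register as a waiter, test again, and only then sleep, re-testing on wakeup. Stripped of the kernel wait-queue machinery the shape is roughly this pthread version (illustrative; the in-tree helper also passes back the unused timeout, which is omitted here):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cong_wq = PTHREAD_COND_INITIALIZER;

/* Assumed elsewhere: the condition being waited on. */
extern bool congested(void);

/* Sleep up to timeout_s seconds or until congestion is reported;
 * returns the final state of the condition. */
bool sleep_till_congested(long timeout_s)
{
	struct timespec ts;
	bool ret;

	if (congested())		/* fast path: no wait needed */
		return true;

	pthread_mutex_lock(&lock);	/* "add ourselves to the wait queue" */
	if (!congested()) {		/* re-test: a wakeup may have raced */
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += timeout_s;
		pthread_cond_timedwait(&cong_wq, &lock, &ts);
	}
	ret = congested();		/* report the state we actually woke to */
	pthread_mutex_unlock(&lock);	/* "finish_wait" */
	return ret;
}
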
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae44..b9f34eaede09 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_op_enqueue);
 	switch (op->flags & FSCACHE_OP_TYPE) {
-	case FSCACHE_OP_FAST:
-		_debug("queue fast");
+	case FSCACHE_OP_ASYNC:
+		_debug("queue async");
 		atomic_inc(&op->usage);
-		if (!schedule_work(&op->fast_work))
+		if (!queue_work(fscache_op_wq, &op->work))
 			fscache_put_operation(op);
 		break;
-	case FSCACHE_OP_SLOW:
-		_debug("queue slow");
-		slow_work_enqueue(&op->slow_work);
-		break;
 	case FSCACHE_OP_MYTHREAD:
 		_debug("queue for caller's attention");
 		break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
 }
 
 /*
- * allow the slow work item processor to get a ref on an operation
- */
-static int fscache_op_get_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	atomic_inc(&op->usage);
-	return 0;
-}
-
-/*
- * allow the slow work item processor to discard a ref on an operation
- */
-static void fscache_op_put_ref(struct slow_work *work)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	fscache_put_operation(op);
-}
-
-/*
- * execute an operation using the slow thread pool to provide processing context
- * - the caller holds a ref to this object, so we don't need to hold one
+ * execute an operation using fs_op_wq to provide processing context -
+ * the caller holds a ref to this object, so we don't need to hold one
  */
-static void fscache_op_execute(struct slow_work *work)
+void fscache_op_work_func(struct work_struct *work)
 {
 	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
+		container_of(work, struct fscache_operation, work);
 	unsigned long start;
 
 	_enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
 	start = jiffies;
 	op->processor(op);
 	fscache_hist(fscache_ops_histogram, start);
+	fscache_put_operation(op);
 
 	_leave("");
 }
-
-/*
- * describe an operation for slow-work debugging
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
-{
-	struct fscache_operation *op =
-		container_of(work, struct fscache_operation, slow_work);
-
-	seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
-		   op->object->debug_id, op->debug_id,
-		   op->name, op->state, op->flags);
-}
-#endif
-
-const struct slow_work_ops fscache_op_slow_work_ops = {
-	.owner		= THIS_MODULE,
-	.get_ref	= fscache_op_get_ref,
-	.put_ref	= fscache_op_put_ref,
-	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
-	.desc		= fscache_op_desc,
-#endif
-};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..41c441c2058d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
 
 page_busy:
 	/* we might want to wait here, but that could deadlock the allocator as
-	 * the slow-work threads writing to the cache may all end up sleeping
+	 * the work threads writing to the cache may all end up sleeping
 	 * on memory allocation */
 	fscache_stat(&fscache_n_store_vmscan_busy);
 	return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, NULL);
-	fscache_operation_init_slow(op, fscache_attr_changed_op);
-	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
 	fscache_set_op_name(op, "Attr");
 
 	spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
 EXPORT_SYMBOL(__fscache_attr_changed);
 
 /*
- * handle secondary execution given to a retrieval op on behalf of the
- * cache
- */
-static void fscache_retrieval_work(struct work_struct *work)
-{
-	struct fscache_retrieval *op =
-		container_of(work, struct fscache_retrieval, op.fast_work);
-	unsigned long start;
-
-	_enter("{OP%x}", op->op.debug_id);
-
-	start = jiffies;
-	op->op.processor(&op->op);
-	fscache_hist(fscache_ops_histogram, start);
-	fscache_put_operation(&op->op);
-}
-
-/*
  * release a retrieval op reference
  */
 static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
 	op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
 	op->mapping = mapping;
 	op->end_io_func = end_io_func;
 	op->context = context;
 	op->start_time = jiffies;
-	INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
 	INIT_LIST_HEAD(&op->to_do);
 	fscache_set_op_name(&op->op, "Retr");
 	return op;
@@ -710,30 +690,26 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	if (page) {
-		radix_tree_tag_set(&cookie->stores, page->index,
-				   FSCACHE_COOKIE_STORING_TAG);
-		radix_tree_tag_clear(&cookie->stores, page->index,
-				     FSCACHE_COOKIE_PENDING_TAG);
-	}
+	radix_tree_tag_set(&cookie->stores, page->index,
+			   FSCACHE_COOKIE_STORING_TAG);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_PENDING_TAG);
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 
-	if (page) {
-		fscache_set_op_state(&op->op, "Store");
-		fscache_stat(&fscache_n_store_pages);
-		fscache_stat(&fscache_n_cop_write_page);
-		ret = object->cache->ops->write_page(op, page);
-		fscache_stat_d(&fscache_n_cop_write_page);
-		fscache_set_op_state(&op->op, "EndWrite");
-		fscache_end_page_write(object, page);
-		if (ret < 0) {
-			fscache_set_op_state(&op->op, "Abort");
-			fscache_abort_object(object);
-		} else {
-			fscache_enqueue_operation(&op->op);
-		}
+	fscache_set_op_state(&op->op, "Store");
+	fscache_stat(&fscache_n_store_pages);
+	fscache_stat(&fscache_n_cop_write_page);
+	ret = object->cache->ops->write_page(op, page);
+	fscache_stat_d(&fscache_n_cop_write_page);
+	fscache_set_op_state(&op->op, "EndWrite");
+	fscache_end_page_write(object, page);
+	if (ret < 0) {
+		fscache_set_op_state(&op->op, "Abort");
+		fscache_abort_object(object);
+	} else {
+		fscache_enqueue_operation(&op->op);
 	}
 
 	_leave("");
@@ -799,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_release_write_op);
-	fscache_operation_init_slow(&op->op, fscache_write_op);
-	op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+	fscache_operation_init(&op->op, fscache_write_op,
+			       fscache_release_write_op);
+	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
 	fscache_set_op_name(&op->op, "Write1");
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -856,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	fscache_stat(&fscache_n_store_ops);
 	fscache_stat(&fscache_n_stores_ok);
 
-	/* the slow work queue now carries its own ref on the object */
+	/* the work queue now carries its own ref on the object */
 	fscache_put_operation(&op->op);
 	_leave(" = 0");
 	return 0;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -235,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
 
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-        req->in.h.unique = fuse_get_unique(fc);
         req->in.h.len = sizeof(struct fuse_in_header) +
                 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
         list_add_tail(&req->list, &fc->pending);
@@ -257,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
                 req = list_entry(fc->bg_queue.next, struct fuse_req, list);
                 list_del(&req->list);
                 fc->active_background++;
+                req->in.h.unique = fuse_get_unique(fc);
                 queue_request(fc, req);
         }
 }
@@ -272,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
+__releases(fc->lock)
 {
         void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
         req->end = NULL;
@@ -302,8 +306,8 @@ __releases(&fc->lock)
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
                                       struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         if (signal_pending(current))
                 return;
@@ -321,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         if (!fc->no_interrupt) {
                 /* Any signal may interrupt this */
@@ -394,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
         else if (fc->conn_error)
                 req->out.h.error = -ECONNREFUSED;
         else {
+                req->in.h.unique = fuse_get_unique(fc);
                 queue_request(fc, req);
                 /* acquire extra reference, since request is still needed
                    after request_end() */
@@ -446,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 }
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+                                          struct fuse_req *req, u64 unique)
+{
+        int err = -ENODEV;
+
+        req->isreply = 0;
+        req->in.h.unique = unique;
+        spin_lock(&fc->lock);
+        if (fc->connected) {
+                queue_request(fc, req);
+                err = 0;
+        }
+        spin_unlock(&fc->lock);
+
+        return err;
+}
+
 /*
  * Called under fc->lock
  *
@@ -498,6 +520,9 @@ struct fuse_copy_state {
         int write;
         struct fuse_req *req;
         const struct iovec *iov;
+        struct pipe_buffer *pipebufs;
+        struct pipe_buffer *currbuf;
+        struct pipe_inode_info *pipe;
         unsigned long nr_segs;
         unsigned long seglen;
         unsigned long addr;
@@ -505,16 +530,16 @@ struct fuse_copy_state {
         void *mapaddr;
         void *buf;
         unsigned len;
+        unsigned move_pages:1;
 };
 
 static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
-                           int write, struct fuse_req *req,
+                           int write,
                            const struct iovec *iov, unsigned long nr_segs)
 {
         memset(cs, 0, sizeof(*cs));
         cs->fc = fc;
         cs->write = write;
-        cs->req = req;
         cs->iov = iov;
         cs->nr_segs = nr_segs;
 }
@@ -522,8 +547,19 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
 /* Unmap and put previous page of userspace buffer */
 static void fuse_copy_finish(struct fuse_copy_state *cs)
 {
-        if (cs->mapaddr) {
-                kunmap_atomic(cs->mapaddr, KM_USER0);
+        if (cs->currbuf) {
+                struct pipe_buffer *buf = cs->currbuf;
+
+                if (!cs->write) {
+                        buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+                } else {
+                        kunmap(buf->page);
+                        buf->len = PAGE_SIZE - cs->len;
+                }
+                cs->currbuf = NULL;
+                cs->mapaddr = NULL;
+        } else if (cs->mapaddr) {
+                kunmap(cs->pg);
                 if (cs->write) {
                         flush_dcache_page(cs->pg);
                         set_page_dirty_lock(cs->pg);
@@ -544,26 +580,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 
         unlock_request(cs->fc, cs->req);
         fuse_copy_finish(cs);
-        if (!cs->seglen) {
-                BUG_ON(!cs->nr_segs);
-                cs->seglen = cs->iov[0].iov_len;
-                cs->addr = (unsigned long) cs->iov[0].iov_base;
-                cs->iov++;
-                cs->nr_segs--;
+        if (cs->pipebufs) {
+                struct pipe_buffer *buf = cs->pipebufs;
+
+                if (!cs->write) {
+                        err = buf->ops->confirm(cs->pipe, buf);
+                        if (err)
+                                return err;
+
+                        BUG_ON(!cs->nr_segs);
+                        cs->currbuf = buf;
+                        cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
+                        cs->len = buf->len;
+                        cs->buf = cs->mapaddr + buf->offset;
+                        cs->pipebufs++;
+                        cs->nr_segs--;
+                } else {
+                        struct page *page;
+
+                        if (cs->nr_segs == cs->pipe->buffers)
+                                return -EIO;
+
+                        page = alloc_page(GFP_HIGHUSER);
+                        if (!page)
+                                return -ENOMEM;
+
+                        buf->page = page;
+                        buf->offset = 0;
+                        buf->len = 0;
+
+                        cs->currbuf = buf;
+                        cs->mapaddr = kmap(page);
+                        cs->buf = cs->mapaddr;
+                        cs->len = PAGE_SIZE;
+                        cs->pipebufs++;
+                        cs->nr_segs++;
+                }
+        } else {
+                if (!cs->seglen) {
+                        BUG_ON(!cs->nr_segs);
+                        cs->seglen = cs->iov[0].iov_len;
+                        cs->addr = (unsigned long) cs->iov[0].iov_base;
+                        cs->iov++;
+                        cs->nr_segs--;
+                }
+                err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+                if (err < 0)
+                        return err;
+                BUG_ON(err != 1);
+                offset = cs->addr % PAGE_SIZE;
+                cs->mapaddr = kmap(cs->pg);
+                cs->buf = cs->mapaddr + offset;
+                cs->len = min(PAGE_SIZE - offset, cs->seglen);
+                cs->seglen -= cs->len;
+                cs->addr += cs->len;
         }
-        down_read(&current->mm->mmap_sem);
-        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
-                             &cs->pg, NULL);
-        up_read(&current->mm->mmap_sem);
-        if (err < 0)
-                return err;
-        BUG_ON(err != 1);
-        offset = cs->addr % PAGE_SIZE;
-        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
-        cs->buf = cs->mapaddr + offset;
-        cs->len = min(PAGE_SIZE - offset, cs->seglen);
-        cs->seglen -= cs->len;
-        cs->addr += cs->len;
 
         return lock_request(cs->fc, cs->req);
 }
@@ -585,23 +656,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
         return ncpy;
 }
 
+static int fuse_check_page(struct page *page)
+{
+        if (page_mapcount(page) ||
+            page->mapping != NULL ||
+            page_count(page) != 1 ||
+            (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+             ~(1 << PG_locked |
+               1 << PG_referenced |
+               1 << PG_uptodate |
+               1 << PG_lru |
+               1 << PG_active |
+               1 << PG_reclaim))) {
+                printk(KERN_WARNING "fuse: trying to steal weird page\n");
+                printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+                return 1;
+        }
+        return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+        int err;
+        struct page *oldpage = *pagep;
+        struct page *newpage;
+        struct pipe_buffer *buf = cs->pipebufs;
+        struct address_space *mapping;
+        pgoff_t index;
+
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+
+        err = buf->ops->confirm(cs->pipe, buf);
+        if (err)
+                return err;
+
+        BUG_ON(!cs->nr_segs);
+        cs->currbuf = buf;
+        cs->len = buf->len;
+        cs->pipebufs++;
+        cs->nr_segs--;
+
+        if (cs->len != PAGE_SIZE)
+                goto out_fallback;
+
+        if (buf->ops->steal(cs->pipe, buf) != 0)
+                goto out_fallback;
+
+        newpage = buf->page;
+
+        if (WARN_ON(!PageUptodate(newpage)))
+                return -EIO;
+
+        ClearPageMappedToDisk(newpage);
+
+        if (fuse_check_page(newpage) != 0)
+                goto out_fallback_unlock;
+
+        mapping = oldpage->mapping;
+        index = oldpage->index;
+
+        /*
+         * This is a new and locked page, it shouldn't be mapped or
+         * have any special flags on it
+         */
+        if (WARN_ON(page_mapped(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(page_has_private(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageMlocked(oldpage)))
+                goto out_fallback_unlock;
+
+        remove_from_page_cache(oldpage);
+        page_cache_release(oldpage);
+
+        err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+        if (err) {
+                printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+                goto out_fallback_unlock;
+        }
+        page_cache_get(newpage);
+
+        if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+                lru_cache_add_file(newpage);
+
+        err = 0;
+        spin_lock(&cs->fc->lock);
+        if (cs->req->aborted)
+                err = -ENOENT;
+        else
+                *pagep = newpage;
+        spin_unlock(&cs->fc->lock);
+
+        if (err) {
+                unlock_page(newpage);
+                page_cache_release(newpage);
+                return err;
+        }
+
+        unlock_page(oldpage);
+        page_cache_release(oldpage);
+        cs->len = 0;
+
+        return 0;
+
+out_fallback_unlock:
+        unlock_page(newpage);
+out_fallback:
+        cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+        cs->buf = cs->mapaddr + buf->offset;
+
+        err = lock_request(cs->fc, cs->req);
+        if (err)
+                return err;
+
+        return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+                         unsigned offset, unsigned count)
+{
+        struct pipe_buffer *buf;
+
+        if (cs->nr_segs == cs->pipe->buffers)
+                return -EIO;
+
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+
+        buf = cs->pipebufs;
+        page_cache_get(page);
+        buf->page = page;
+        buf->offset = offset;
+        buf->len = count;
+
+        cs->pipebufs++;
+        cs->nr_segs++;
+        cs->len = 0;
+
+        return 0;
+}
+
 /*
  * Copy a page in the request to/from the userspace buffer. Must be
  * done atomically
  */
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                           unsigned offset, unsigned count, int zeroing)
 {
+        int err;
+        struct page *page = *pagep;
+
         if (page && zeroing && count < PAGE_SIZE) {
                 void *mapaddr = kmap_atomic(page, KM_USER1);
                 memset(mapaddr, 0, PAGE_SIZE);
                 kunmap_atomic(mapaddr, KM_USER1);
         }
         while (count) {
-                if (!cs->len) {
-                        int err = fuse_copy_fill(cs);
-                        if (err)
-                                return err;
+                if (cs->write && cs->pipebufs && page) {
+                        return fuse_ref_page(cs, page, offset, count);
+                } else if (!cs->len) {
+                        if (cs->move_pages && page &&
+                            offset == 0 && count == PAGE_SIZE) {
+                                err = fuse_try_move_page(cs, pagep);
+                                if (err <= 0)
+                                        return err;
+                        } else {
+                                err = fuse_copy_fill(cs);
+                                if (err)
+                                        return err;
+                        }
                 }
                 if (page) {
                         void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +852,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
         unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
 
         for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
-                struct page *page = req->pages[i];
-                int err = fuse_copy_page(cs, page, offset, count, zeroing);
+                int err;
+
+                err = fuse_copy_page(cs, &req->pages[i], offset, count,
+                                     zeroing);
                 if (err)
                         return err;
 
@@ -677,8 +905,8 @@ static int request_pending(struct fuse_conn *fc)
 
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         DECLARE_WAITQUEUE(wait, current);
 
@@ -704,11 +932,10 @@ __acquires(&fc->lock)
  *
  * Called with fc->lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
-                               const struct iovec *iov, unsigned long nr_segs)
-__releases(&fc->lock)
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+                               size_t nbytes, struct fuse_req *req)
+__releases(fc->lock)
 {
-        struct fuse_copy_state cs;
         struct fuse_in_header ih;
         struct fuse_interrupt_in arg;
         unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +951,13 @@ __releases(&fc->lock)
         arg.unique = req->in.h.unique;
 
         spin_unlock(&fc->lock);
-        if (iov_length(iov, nr_segs) < reqsize)
+        if (nbytes < reqsize)
                 return -EINVAL;
 
-        fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
-        err = fuse_copy_one(&cs, &ih, sizeof(ih));
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
         if (!err)
-                err = fuse_copy_one(&cs, &arg, sizeof(arg));
-        fuse_copy_finish(&cs);
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
+        fuse_copy_finish(cs);
 
         return err ? err : reqsize;
 }
@@ -745,18 +971,13 @@ __releases(&fc->lock)
  * request_end(). Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
-                             unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+                                struct fuse_copy_state *cs, size_t nbytes)
 {
         int err;
         struct fuse_req *req;
         struct fuse_in *in;
-        struct fuse_copy_state cs;
         unsigned reqsize;
-        struct file *file = iocb->ki_filp;
-        struct fuse_conn *fc = fuse_get_conn(file);
-        if (!fc)
-                return -EPERM;
 
  restart:
         spin_lock(&fc->lock);
@@ -776,7 +997,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         if (!list_empty(&fc->interrupts)) {
                 req = list_entry(fc->interrupts.next, struct fuse_req,
                                  intr_entry);
-                return fuse_read_interrupt(fc, req, iov, nr_segs);
+                return fuse_read_interrupt(fc, cs, nbytes, req);
         }
 
         req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +1007,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         in = &req->in;
         reqsize = in->h.len;
         /* If request is too large, reply with an error and restart the read */
-        if (iov_length(iov, nr_segs) < reqsize) {
+        if (nbytes < reqsize) {
                 req->out.h.error = -EIO;
                 /* SETXATTR is special, since it may contain too large data */
                 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +1016,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                 goto restart;
         }
         spin_unlock(&fc->lock);
-        fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
-        err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+        cs->req = req;
+        err = fuse_copy_one(cs, &in->h, sizeof(in->h));
         if (!err)
-                err = fuse_copy_args(&cs, in->numargs, in->argpages,
+                err = fuse_copy_args(cs, in->numargs, in->argpages,
                                      (struct fuse_arg *) in->args, 0);
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
         spin_lock(&fc->lock);
         req->locked = 0;
         if (req->aborted) {
@@ -828,6 +1049,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
         return err;
 }
 
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+                             unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct file *file = iocb->ki_filp;
+        struct fuse_conn *fc = fuse_get_conn(file);
+        if (!fc)
+                return -EPERM;
+
+        fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+
+        return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+}
+
+static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                   struct pipe_buffer *buf)
+{
+        return 1;
+}
+
+static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+        .can_merge = 0,
+        .map = generic_pipe_buf_map,
+        .unmap = generic_pipe_buf_unmap,
+        .confirm = generic_pipe_buf_confirm,
+        .release = generic_pipe_buf_release,
+        .steal = fuse_dev_pipe_buf_steal,
+        .get = generic_pipe_buf_get,
+};
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+                                    struct pipe_inode_info *pipe,
+                                    size_t len, unsigned int flags)
+{
+        int ret;
+        int page_nr = 0;
+        int do_wakeup = 0;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(in);
+        if (!fc)
+                return -EPERM;
+
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+
+        fuse_copy_init(&cs, fc, 1, NULL, 0);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+        ret = fuse_dev_do_read(fc, in, &cs, len);
+        if (ret < 0)
+                goto out;
+
+        ret = 0;
+        pipe_lock(pipe);
+
+        if (!pipe->readers) {
+                send_sig(SIGPIPE, current, 0);
+                if (!ret)
+                        ret = -EPIPE;
+                goto out_unlock;
+        }
+
+        if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+                ret = -EIO;
+                goto out_unlock;
+        }
+
+        while (page_nr < cs.nr_segs) {
+                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+                struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+                buf->page = bufs[page_nr].page;
+                buf->offset = bufs[page_nr].offset;
+                buf->len = bufs[page_nr].len;
+                buf->ops = &fuse_dev_pipe_buf_ops;
+
+                pipe->nrbufs++;
+                page_nr++;
+                ret += buf->len;
+
+                if (pipe->inode)
+                        do_wakeup = 1;
+        }
+
+out_unlock:
+        pipe_unlock(pipe);
+
+        if (do_wakeup) {
+                smp_mb();
+                if (waitqueue_active(&pipe->wait))
+                        wake_up_interruptible(&pipe->wait);
+                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+        }
+
+out:
+        for (; page_nr < cs.nr_segs; page_nr++)
+                page_cache_release(bufs[page_nr].page);
+
+        kfree(bufs);
+        return ret;
+}
+
 static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
                             struct fuse_copy_state *cs)
 {
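
With fuse_dev_do_read() factored out, the new fuse_dev_splice_read() above can hand request pages straight to a pipe. From userspace that means a server may drain /dev/fuse with splice(2) instead of read(2); a hedged sketch (descriptor setup is assumed elsewhere, and the fd names are illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move one request from the device into a pipe without bouncing it
     * through a userspace buffer; SPLICE_F_MOVE lets the kernel link
     * pages rather than copy them where it can. */
    static ssize_t read_request(int fuse_fd, int pipe_wr, size_t max_req)
    {
            return splice(fuse_fd, NULL, pipe_wr, NULL, max_req,
                          SPLICE_F_MOVE);
    }
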
@@ -924,6 +1249,199 @@ err:
         return err;
 }
 
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+                             struct fuse_copy_state *cs)
+{
+        struct fuse_notify_store_out outarg;
+        struct inode *inode;
+        struct address_space *mapping;
+        u64 nodeid;
+        int err;
+        pgoff_t index;
+        unsigned int offset;
+        unsigned int num;
+        loff_t file_size;
+        loff_t end;
+
+        err = -EINVAL;
+        if (size < sizeof(outarg))
+                goto out_finish;
+
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto out_finish;
+
+        err = -EINVAL;
+        if (size - sizeof(outarg) != outarg.size)
+                goto out_finish;
+
+        nodeid = outarg.nodeid;
+
+        down_read(&fc->killsb);
+
+        err = -ENOENT;
+        if (!fc->sb)
+                goto out_up_killsb;
+
+        inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+        if (!inode)
+                goto out_up_killsb;
+
+        mapping = inode->i_mapping;
+        index = outarg.offset >> PAGE_CACHE_SHIFT;
+        offset = outarg.offset & ~PAGE_CACHE_MASK;
+        file_size = i_size_read(inode);
+        end = outarg.offset + outarg.size;
+        if (end > file_size) {
+                file_size = end;
+                fuse_write_update_size(inode, file_size);
+        }
+
+        num = outarg.size;
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+
+                err = -ENOMEM;
+                page = find_or_create_page(mapping, index,
+                                           mapping_gfp_mask(mapping));
+                if (!page)
+                        goto out_iput;
+
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                err = fuse_copy_page(cs, &page, offset, this_num, 0);
+                if (!err && offset == 0 && (num != 0 || file_size == end))
+                        SetPageUptodate(page);
+                unlock_page(page);
+                page_cache_release(page);
+
+                if (err)
+                        goto out_iput;
+
+                num -= this_num;
+                offset = 0;
+                index++;
+        }
+
+        err = 0;
+
+out_iput:
+        iput(inode);
+out_up_killsb:
+        up_read(&fc->killsb);
+out_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+        int i;
+
+        for (i = 0; i < req->num_pages; i++) {
+                struct page *page = req->pages[i];
+                page_cache_release(page);
+        }
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+                         struct fuse_notify_retrieve_out *outarg)
+{
+        int err;
+        struct address_space *mapping = inode->i_mapping;
+        struct fuse_req *req;
+        pgoff_t index;
+        loff_t file_size;
+        unsigned int num;
+        unsigned int offset;
+        size_t total_len = 0;
+
+        req = fuse_get_req(fc);
+        if (IS_ERR(req))
+                return PTR_ERR(req);
+
+        offset = outarg->offset & ~PAGE_CACHE_MASK;
+
+        req->in.h.opcode = FUSE_NOTIFY_REPLY;
+        req->in.h.nodeid = outarg->nodeid;
+        req->in.numargs = 2;
+        req->in.argpages = 1;
+        req->page_offset = offset;
+        req->end = fuse_retrieve_end;
+
+        index = outarg->offset >> PAGE_CACHE_SHIFT;
+        file_size = i_size_read(inode);
+        num = outarg->size;
+        if (outarg->offset > file_size)
+                num = 0;
+        else if (outarg->offset + num > file_size)
+                num = file_size - outarg->offset;
+
+        while (num) {
+                struct page *page;
+                unsigned int this_num;
+
+                page = find_get_page(mapping, index);
+                if (!page)
+                        break;
+
+                this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+                req->pages[req->num_pages] = page;
+                req->num_pages++;
+
+                num -= this_num;
+                total_len += this_num;
+        }
+        req->misc.retrieve_in.offset = outarg->offset;
+        req->misc.retrieve_in.size = total_len;
+        req->in.args[0].size = sizeof(req->misc.retrieve_in);
+        req->in.args[0].value = &req->misc.retrieve_in;
+        req->in.args[1].size = total_len;
+
+        err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+        if (err)
+                fuse_retrieve_end(fc, req);
+
+        return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+                                struct fuse_copy_state *cs)
+{
+        struct fuse_notify_retrieve_out outarg;
+        struct inode *inode;
+        int err;
+
+        err = -EINVAL;
+        if (size != sizeof(outarg))
+                goto copy_finish;
+
+        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        if (err)
+                goto copy_finish;
+
+        fuse_copy_finish(cs);
+
+        down_read(&fc->killsb);
+        err = -ENOENT;
+        if (fc->sb) {
+                u64 nodeid = outarg.nodeid;
+
+                inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+                if (inode) {
+                        err = fuse_retrieve(fc, inode, &outarg);
+                        iput(inode);
+                }
+        }
+        up_read(&fc->killsb);
+
+        return err;
+
+copy_finish:
+        fuse_copy_finish(cs);
+        return err;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                        unsigned int size, struct fuse_copy_state *cs)
 {
@@ -937,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
         case FUSE_NOTIFY_INVAL_ENTRY:
                 return fuse_notify_inval_entry(fc, size, cs);
 
+        case FUSE_NOTIFY_STORE:
+                return fuse_notify_store(fc, size, cs);
+
+        case FUSE_NOTIFY_RETRIEVE:
+                return fuse_notify_retrieve(fc, size, cs);
+
         default:
                 fuse_copy_finish(cs);
                 return -EINVAL;
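
The two new cases dispatch notifications that userspace initiates by writing to /dev/fuse with unique set to 0 and the notification code carried in the error field, exactly as fuse_dev_do_write() below decodes it. A hedged userspace sketch of a FUSE_NOTIFY_STORE message (struct layout as in <linux/fuse.h>; the helper name is illustrative):

    #include <linux/fuse.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/uio.h>

    static int notify_store(int fuse_fd, uint64_t nodeid, uint64_t offset,
                            const void *data, uint32_t size)
    {
            struct fuse_out_header oh;
            struct fuse_notify_store_out arg;
            struct iovec iov[3];

            memset(&arg, 0, sizeof(arg));
            arg.nodeid = nodeid;
            arg.offset = offset;
            arg.size = size;

            oh.unique = 0;                  /* 0 marks a notification */
            oh.error = FUSE_NOTIFY_STORE;   /* code rides in 'error' */
            oh.len = sizeof(oh) + sizeof(arg) + size;

            iov[0].iov_base = &oh;          iov[0].iov_len = sizeof(oh);
            iov[1].iov_base = &arg;         iov[1].iov_len = sizeof(arg);
            iov[2].iov_base = (void *)data; iov[2].iov_len = size;
            return writev(fuse_fd, iov, 3) < 0 ? -1 : 0;
    }
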
@@ -987,23 +1511,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
-                              unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+                                 struct fuse_copy_state *cs, size_t nbytes)
 {
         int err;
-        size_t nbytes = iov_length(iov, nr_segs);
         struct fuse_req *req;
         struct fuse_out_header oh;
-        struct fuse_copy_state cs;
-        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-        if (!fc)
-                return -EPERM;
 
-        fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
         if (nbytes < sizeof(struct fuse_out_header))
                 return -EINVAL;
 
-        err = fuse_copy_one(&cs, &oh, sizeof(oh));
+        err = fuse_copy_one(cs, &oh, sizeof(oh));
         if (err)
                 goto err_finish;
 
@@ -1016,7 +1534,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
  * and error contains notification code.
  */
         if (!oh.unique) {
-                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
                 return err ? err : nbytes;
         }
 
@@ -1035,7 +1553,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 
         if (req->aborted) {
                 spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                 spin_lock(&fc->lock);
                 request_end(fc, req);
                 return -ENOENT;
@@ -1052,7 +1570,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
                         queue_interrupt(fc, req);
 
                 spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                 return nbytes;
         }
 
@@ -1060,11 +1578,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
         list_move(&req->list, &fc->io);
         req->out.h = oh;
         req->locked = 1;
-        cs.req = req;
+        cs->req = req;
+        if (!req->out.page_replace)
+                cs->move_pages = 0;
         spin_unlock(&fc->lock);
 
-        err = copy_out_args(&cs, &req->out, nbytes);
-        fuse_copy_finish(&cs);
+        err = copy_out_args(cs, &req->out, nbytes);
+        fuse_copy_finish(cs);
 
         spin_lock(&fc->lock);
         req->locked = 0;
@@ -1080,10 +1600,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
  err_unlock:
         spin_unlock(&fc->lock);
  err_finish:
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
         return err;
 }
 
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+        if (!fc)
+                return -EPERM;
+
+        fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+
+        return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+                                     struct file *out, loff_t *ppos,
+                                     size_t len, unsigned int flags)
+{
+        unsigned nbuf;
+        unsigned idx;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc;
+        size_t rem;
+        ssize_t ret;
+
+        fc = fuse_get_conn(out);
+        if (!fc)
+                return -EPERM;
+
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+
+        pipe_lock(pipe);
+        nbuf = 0;
+        rem = 0;
+        for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+                rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+        ret = -EINVAL;
+        if (rem < len) {
+                pipe_unlock(pipe);
+                goto out;
+        }
+
+        rem = len;
+        while (rem) {
+                struct pipe_buffer *ibuf;
+                struct pipe_buffer *obuf;
+
+                BUG_ON(nbuf >= pipe->buffers);
+                BUG_ON(!pipe->nrbufs);
+                ibuf = &pipe->bufs[pipe->curbuf];
+                obuf = &bufs[nbuf];
+
+                if (rem >= ibuf->len) {
+                        *obuf = *ibuf;
+                        ibuf->ops = NULL;
+                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+                        pipe->nrbufs--;
+                } else {
+                        ibuf->ops->get(pipe, ibuf);
+                        *obuf = *ibuf;
+                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+                        obuf->len = rem;
+                        ibuf->offset += obuf->len;
+                        ibuf->len -= obuf->len;
+                }
+                nbuf++;
+                rem -= obuf->len;
+        }
+        pipe_unlock(pipe);
+
+        fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+
+        if (flags & SPLICE_F_MOVE)
+                cs.move_pages = 1;
+
+        ret = fuse_dev_do_write(fc, &cs, len);
+
+        for (idx = 0; idx < nbuf; idx++) {
+                struct pipe_buffer *buf = &bufs[idx];
+                buf->ops->release(pipe, buf);
+        }
+out:
+        kfree(bufs);
+        return ret;
+}
+
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
         unsigned mask = POLLOUT | POLLWRNORM;
@@ -1109,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         while (!list_empty(head)) {
                 struct fuse_req *req;
@@ -1133,8 +1744,8 @@ __acquires(&fc->lock)
  * locked).
  */
 static void end_io_requests(struct fuse_conn *fc)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         while (!list_empty(&fc->io)) {
                 struct fuse_req *req =
@@ -1158,6 +1769,16 @@ __acquires(&fc->lock)
         }
 }
 
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+        fc->max_background = UINT_MAX;
+        flush_bg_queue(fc);
+        end_requests(fc, &fc->pending);
+        end_requests(fc, &fc->processing);
+}
+
 /*
  * Abort all requests.
  *
@@ -1184,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
                 fc->connected = 0;
                 fc->blocked = 0;
                 end_io_requests(fc);
-                end_requests(fc, &fc->pending);
-                end_requests(fc, &fc->processing);
+                end_queued_requests(fc);
                 wake_up_all(&fc->waitq);
                 wake_up_all(&fc->blocked_waitq);
                 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1200,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file)
         if (fc) {
                 spin_lock(&fc->lock);
                 fc->connected = 0;
-                end_requests(fc, &fc->pending);
-                end_requests(fc, &fc->processing);
+                fc->blocked = 0;
+                end_queued_requests(fc);
+                wake_up_all(&fc->blocked_waitq);
                 spin_unlock(&fc->lock);
                 fuse_conn_put(fc);
         }
@@ -1225,8 +1846,10 @@ const struct file_operations fuse_dev_operations = {
         .llseek         = no_llseek,
         .read           = do_sync_read,
         .aio_read       = fuse_dev_read,
+        .splice_read    = fuse_dev_splice_read,
         .write          = do_sync_write,
         .aio_write      = fuse_dev_write,
+        .splice_write   = fuse_dev_splice_write,
         .poll           = fuse_dev_poll,
         .release        = fuse_dev_release,
         .fasync         = fuse_dev_fasync,
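
Registering splice_write completes the zero-copy loop: a reply already sitting in a pipe can be spliced into /dev/fuse, and SPLICE_F_MOVE sets cs.move_pages so fuse_try_move_page() may steal page-aligned data straight into the page cache. A hedged sketch of the reply direction (fd names illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Push a fully assembled reply (header plus data) from a pipe into
     * the device; whole aligned pages may be moved instead of copied. */
    static ssize_t send_reply(int pipe_rd, int fuse_fd, size_t reply_len)
    {
            return splice(pipe_rd, NULL, fuse_fd, NULL, reply_len,
                          SPLICE_F_MOVE);
    }
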
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..c9627c95482d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask)
            exist. So if permissions are revoked this won't be
            noticed immediately, only after the attribute
            timeout has expired */
-        } else if (mask & MAY_ACCESS) {
+        } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
                 err = fuse_access(inode, mask);
         } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                 if (!(inode->i_mode & S_IXUGO)) {
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
         return 0;
 }
 
-static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_dir_fsync(struct file *file, int datasync)
 {
-        /* nfsd can call this with no file */
-        return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
+        return fuse_fsync_common(file, datasync, 1);
 }
 
 static bool update_mtime(unsigned ivalid)
@@ -1271,21 +1270,18 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
         if (!fuse_allow_task(fc, current))
                 return -EACCES;
 
-        if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
-                err = inode_change_ok(inode, attr);
-                if (err)
-                        return err;
-        }
+        if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+                attr->ia_valid |= ATTR_FORCE;
+
+        err = inode_change_ok(inode, attr);
+        if (err)
+                return err;
 
         if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
                 return 0;
 
-        if (attr->ia_valid & ATTR_SIZE) {
-                err = inode_newsize_ok(inode, attr->ia_size);
-                if (err)
-                        return err;
+        if (attr->ia_valid & ATTR_SIZE)
                 is_truncate = true;
-        }
 
         req = fuse_get_req(fc);
         if (IS_ERR(req))
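
fuse_dir_fsync() above follows the tree-wide 2.6.36 change that drops the dentry argument from ->fsync; the inode is reached through the struct file instead. A hedged sketch of the converted prototype for any filesystem (the myfs names are illustrative only):

    static int myfs_fsync(struct file *file, int datasync)
    {
            struct inode *inode = file->f_mapping->host;

            /* ... write back dirty myfs state for 'inode' ... */
            return 0;
    }
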
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..c8224587123f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
         fuse_release_nowrite(inode);
 }
 
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-                      int isdir)
+int fuse_fsync_common(struct file *file, int datasync, int isdir)
 {
-        struct inode *inode = de->d_inode;
+        struct inode *inode = file->f_mapping->host;
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_file *ff = file->private_data;
         struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
         return err;
 }
 
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_fsync(struct file *file, int datasync)
 {
-        return fuse_fsync_common(file, de, datasync, 0);
+        return fuse_fsync_common(file, datasync, 0);
 }
 
 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
         int i;
         size_t count = req->misc.read.in.size;
         size_t num_read = req->out.args[0].size;
-        struct inode *inode = req->pages[0]->mapping->host;
+        struct address_space *mapping = NULL;
 
-        /*
-         * Short read means EOF. If file size is larger, truncate it
-         */
-        if (!req->out.h.error && num_read < count) {
-                loff_t pos = page_offset(req->pages[0]) + num_read;
-                fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
-        }
+        for (i = 0; mapping == NULL && i < req->num_pages; i++)
+                mapping = req->pages[i]->mapping;
 
-        fuse_invalidate_attr(inode); /* atime changed */
+        if (mapping) {
+                struct inode *inode = mapping->host;
+
+                /*
+                 * Short read means EOF. If file size is larger, truncate it
+                 */
+                if (!req->out.h.error && num_read < count) {
+                        loff_t pos;
+
+                        pos = page_offset(req->pages[0]) + num_read;
+                        fuse_read_update_size(inode, pos,
+                                              req->misc.read.attr_ver);
+                }
+                fuse_invalidate_attr(inode); /* atime changed */
+        }
 
         for (i = 0; i < req->num_pages; i++) {
                 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
                 else
                         SetPageError(page);
                 unlock_page(page);
+                page_cache_release(page);
         }
         if (req->ff)
                 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 
         req->out.argpages = 1;
         req->out.page_zeroing = 1;
+        req->out.page_replace = 1;
         fuse_read_fill(req, file, pos, count, FUSE_READ);
         req->misc.read.attr_ver = fuse_get_attr_version(fc);
         if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
                         return PTR_ERR(req);
                 }
         }
+        page_cache_get(page);
         req->pages[req->num_pages] = page;
         req->num_pages++;
         return 0;
@@ -695,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
         return 0;
 }
 
-static void fuse_write_update_size(struct inode *inode, loff_t pos)
+void fuse_write_update_size(struct inode *inode, loff_t pos)
 {
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_inode *fi = get_fuse_inode(inode);
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
         nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
         npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
         npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-        down_read(&current->mm->mmap_sem);
-        npages = get_user_pages(current, current->mm, user_addr, npages, !write,
-                                0, req->pages, NULL);
-        up_read(&current->mm->mmap_sem);
+        npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
         if (npages < 0)
                 return npages;
 
@@ -1136,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         struct fuse_inode *fi = get_fuse_inode(req->inode);
         loff_t size = i_size_read(req->inode);
@@ -1175,8 +1183,8 @@ __acquires(&fc->lock)
  * Called with fc->lock
  */
 void fuse_flush_writepages(struct inode *inode)
-__releases(&fc->lock)
-__acquires(&fc->lock)
+__releases(fc->lock)
+__acquires(fc->lock)
 {
         struct fuse_conn *fc = get_fuse_conn(inode);
         struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
         while (iov_iter_count(&ii)) {
                 struct page *page = pages[page_idx++];
                 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
-                void *kaddr, *map;
+                void *kaddr;
 
-                kaddr = map = kmap(page);
+                kaddr = kmap(page);
 
                 while (todo) {
                         char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
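
fuse_get_user_pages() above is one of several conversions in this merge from open-coded mmap_sem handling to get_user_pages_fast(), which takes the lock itself only when the pages cannot be resolved locklessly. The pattern, side by side (a sketch of the same call site shown in the hunk):

    /* before: caller manages mmap_sem explicitly */
    down_read(&current->mm->mmap_sem);
    npages = get_user_pages(current, current->mm, user_addr, npages,
                            !write, 0, req->pages, NULL);
    up_read(&current->mm->mmap_sem);

    /* after: one call, same result vector */
    npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
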
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..57d4a3a0f102 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
         /** Zero partially or not copied pages */
         unsigned page_zeroing:1;
 
+        /** Pages may be replaced with new ones */
+        unsigned page_replace:1;
+
         /** Number or arguments */
         unsigned numargs;
 
@@ -269,6 +272,7 @@ struct fuse_req {
                         struct fuse_write_in in;
                         struct fuse_write_out out;
                 } write;
+                struct fuse_notify_retrieve_in retrieve_in;
                 struct fuse_lk_in lk_in;
         } misc;
 
@@ -568,8 +572,7 @@ void fuse_release_common(struct file *file, int opcode);
 /**
  * Send FSYNC or FSYNCDIR request
  */
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
-                      int isdir);
+int fuse_fsync_common(struct file *file, int datasync, int isdir);
 
 /**
  * Notify poll wakeup
@@ -746,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
+void fuse_write_update_size(struct inode *inode, loff_t pos);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ec14d19ce501..da9e6e11374c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -122,8 +122,10 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
         fuse_request_send_noreply(fc, req);
 }
 
-static void fuse_clear_inode(struct inode *inode)
+static void fuse_evict_inode(struct inode *inode)
 {
+        truncate_inode_pages(&inode->i_data, 0);
+        end_writeback(inode);
         if (inode->i_sb->s_flags & MS_ACTIVE) {
                 struct fuse_conn *fc = get_fuse_conn(inode);
                 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -736,7 +738,7 @@ static const struct export_operations fuse_export_operations = {
 static const struct super_operations fuse_super_operations = {
         .alloc_inode    = fuse_alloc_inode,
         .destroy_inode  = fuse_destroy_inode,
-        .clear_inode    = fuse_clear_inode,
+        .evict_inode    = fuse_evict_inode,
         .drop_inode     = generic_delete_inode,
         .remount_fs     = fuse_remount_fs,
         .put_super      = fuse_put_super,
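
fuse_evict_inode() above shows the standard ->clear_inode to ->evict_inode conversion in this merge: work the VFS used to do before calling into the filesystem now happens inside the method itself. A hedged template for the same conversion elsewhere (myfs is illustrative):

    static void myfs_evict_inode(struct inode *inode)
    {
            truncate_inode_pages(&inode->i_data, 0); /* VFS did this before */
            end_writeback(inode);                    /* replaces clear_inode() */
            /* ... filesystem-specific teardown, as in fuse_evict_inode() ... */
    }
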
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..6bc9e3a5a693 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
         if (error < 0)
                 goto failed;
         inode->i_mode = mode;
+        inode->i_ctime = CURRENT_TIME;
         if (error == 0) {
                 posix_acl_release(acl);
                 acl = NULL;
@@ -201,7 +202,7 @@ generic_check_acl(struct inode *inode, int mask)
         return -EAGAIN;
 }
 
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
         .prefix = POSIX_ACL_XATTR_ACCESS,
         .flags  = ACL_TYPE_ACCESS,
         .list   = generic_acl_list,
@@ -209,7 +210,7 @@ struct xattr_handler generic_acl_access_handler = {
         .set    = generic_acl_set,
 };
 
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
         .prefix = POSIX_ACL_XATTR_DEFAULT,
         .flags  = ACL_TYPE_DEFAULT,
         .list   = generic_acl_list,
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b43107112..cc9665522148 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,7 +7,6 @@ config GFS2_FS
         select IP_SCTP if DLM_SCTP
         select FS_POSIX_ACL
         select CRC32
-        select SLOW_WORK
         select QUOTACTL
         help
           A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
                                  void *buffer, size_t size, int xtype)
 {
         struct inode *inode = dentry->d_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
         struct posix_acl *acl;
         int type;
         int error;
 
+        if (!sdp->sd_args.ar_posix_acl)
+                return -EOPNOTSUPP;
+
         type = gfs2_acl_type(name);
         if (type < 0)
                 return type;
@@ -335,7 +339,7 @@ out:
         return error;
 }
 
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
         .prefix = XATTR_SYSTEM_PREFIX,
         .flags  = GFS2_EATYPE_SYS,
         .get    = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
 extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..194fe16d8418 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page,
         if (ret <= 0)
                 return ret;
 
-        ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc);
-        if (ret == -EAGAIN)
-                ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
-        return ret;
+        return nobh_writepage(page, gfs2_get_block_noalloc, wbc);
 }
 
 /**
@@ -418,6 +415,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
         struct buffer_head *dibh;
+        u64 dsize = i_size_read(&ip->i_inode);
         void *kaddr;
         int error;
 
@@ -437,9 +435,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
                 return error;
 
         kaddr = kmap_atomic(page, KM_USER0);
-        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-               ip->i_disksize);
-        memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+        if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+                dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+        memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
         kunmap_atomic(kaddr, KM_USER0);
         flush_dcache_page(page);
         brelse(dibh);
@@ -635,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                 }
         }
 
-        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
-        if (error)
-                goto out_unlock;
+        alloc_required = gfs2_write_alloc_required(ip, pos, len);
 
         if (alloc_required || gfs2_is_jdata(ip))
                 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
@@ -698,8 +695,14 @@ out:
                 return 0;
 
         page_cache_release(page);
+
+        /*
+         * XXX(truncate): the call below should probably be replaced with
+         * a call to the gfs2-specific truncate blocks helper to actually
+         * release disk blocks..
+         */
         if (pos + len > ip->i_inode.i_size)
-                vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+                truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
 out_endtrans:
         gfs2_trans_end(sdp);
 out_trans_fail:
@@ -1039,9 +1042,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
         if (rv != 1)
                 goto out; /* dio not valid, fall back to buffered i/o */
 
-        rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
-                                           iov, offset, nr_segs,
-                                           gfs2_get_block_direct, NULL);
+        rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+                                  offset, nr_segs, gfs2_get_block_direct,
+                                  NULL, NULL, 0);
 out:
         gfs2_glock_dq_m(1, &gh);
         gfs2_holder_uninit(&gh);
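
The stuffed_readpage() hunk here, like the gfs2_unstuffer_page() hunk in bmap.c below, stops trusting ip->i_disksize for the copy length: the size is re-read from the inode and clamped so a corrupt on-disk value can never copy past the dinode block. The bound itself, as a standalone sketch:

    #include <stdint.h>

    /* Copy length for stuffed data: limited both by the inode size and
     * by the space left in the block after the dinode header. */
    static uint64_t stuffed_copy_len(uint64_t isize, uint64_t bsize,
                                     uint64_t header)
    {
            uint64_t max = bsize - header;
            return isize > max ? max : isize;
    }
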
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e411d5f4697..6f482809d1a3 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
         if (!PageUptodate(page)) {
                 void *kaddr = kmap(page);
+                u64 dsize = i_size_read(inode);
+
+                if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+                        dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
-                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-                       ip->i_disksize);
-                memset(kaddr + ip->i_disksize, 0,
-                       PAGE_CACHE_SIZE - ip->i_disksize);
+                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+                memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
                 kunmap(page);
 
                 SetPageUptodate(page);
@@ -1038,13 +1040,15 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1038 goto out; 1040 goto out;
1039 1041
1040 if (gfs2_is_stuffed(ip)) { 1042 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode);
1041 ip->i_disksize = size; 1044 ip->i_disksize = size;
1042 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1043 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1044 gfs2_dinode_out(ip, dibh->b_data); 1047 gfs2_dinode_out(ip, dibh->b_data);
1045 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); 1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1046 error = 1; 1051 error = 1;
1047
1048 } else { 1052 } else {
1049 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
1050 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
@@ -1240,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1240 * @ip: the file being written to 1244 * @ip: the file being written to
1241 * @offset: the offset to write to 1245 * @offset: the offset to write to
1242 * @len: the number of bytes being written 1246 * @len: the number of bytes being written
1243 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1244 * 1247 *
1245 * Returns: errno 1248 * Returns: 1 if an alloc is required, 0 otherwise
1246 */ 1249 */
1247 1250
1248int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 1251int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1249 unsigned int len, int *alloc_required) 1252 unsigned int len)
1250{ 1253{
1251 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1254 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1252 struct buffer_head bh; 1255 struct buffer_head bh;
@@ -1254,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1254 u64 lblock, lblock_stop, size; 1257 u64 lblock, lblock_stop, size;
1255 u64 end_of_file; 1258 u64 end_of_file;
1256 1259
1257 *alloc_required = 0;
1258
1259 if (!len) 1260 if (!len)
1260 return 0; 1261 return 0;
1261 1262
1262 if (gfs2_is_stuffed(ip)) { 1263 if (gfs2_is_stuffed(ip)) {
1263 if (offset + len > 1264 if (offset + len >
1264 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) 1265 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1265 *alloc_required = 1; 1266 return 1;
1266 return 0; 1267 return 0;
1267 } 1268 }
1268 1269
1269 *alloc_required = 1;
1270 shift = sdp->sd_sb.sb_bsize_shift; 1270 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1271 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1273 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1275 if (lblock_stop > end_of_file)
1276 return 0; 1276 return 1;
1277 1277
1278 size = (lblock_stop - lblock) << shift; 1278 size = (lblock_stop - lblock) << shift;
1279 do { 1279 do {
@@ -1281,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1281 bh.b_size = size; 1281 bh.b_size = size;
1282 gfs2_block_map(&ip->i_inode, lblock, &bh, 0); 1282 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1283 if (!buffer_mapped(&bh)) 1283 if (!buffer_mapped(&bh))
1284 return 0; 1284 return 1;
1285 size -= bh.b_size; 1285 size -= bh.b_size;
1286 lblock += (bh.b_size >> ip->i_inode.i_blkbits); 1286 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1287 } while(size > 0); 1287 } while(size > 0);
1288 1288
1289 *alloc_required = 0;
1290 return 0; 1289 return 0;
1291} 1290}
1292 1291
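
With the out-parameter gone, gfs2_write_alloc_required() answers a yes/no question directly; since it has no failure mode left, every caller sheds an error branch (compare the gfs2_write_begin() and gfs2_page_mkwrite() hunks elsewhere in this patch). A compilable sketch of the caller's view, with invented stand-in functions:

    #include <stdbool.h>

    /* Invented stand-ins showing the caller's view of the API change. */
    static int  write_alloc_required_old(int *alloc_required)
    {
            *alloc_required = 1;
            return 0;               /* errno path that can never trigger */
    }
    static bool write_alloc_required_new(void)
    {
            return true;
    }

    static void caller(void)
    {
            int alloc_required;

            /* before: two results to check */
            if (write_alloc_required_old(&alloc_required) == 0 && alloc_required)
                    /* reserve blocks */;

            /* after: the answer is the return value */
            if (write_alloc_required_new())
                    /* reserve blocks */;
    }
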
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index c983177e05ac..a20a5213135a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
55 unsigned int len, int *alloc_required); 55 unsigned int len);
56 56
57#endif /* __BMAP_DOT_H__ */ 57#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..b9dd88a78dd4 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
392 unsigned totlen = be16_to_cpu(dent->de_rec_len); 392 unsigned totlen = be16_to_cpu(dent->de_rec_len);
393 393
394 if (gfs2_dirent_sentinel(dent)) 394 if (gfs2_dirent_sentinel(dent))
395 actual = GFS2_DIRENT_SIZE(0); 395 actual = 0;
396 if (totlen - actual >= required) 396 if (totlen - actual >= required)
397 return 1; 397 return 1;
398 return 0; 398 return 0;
@@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
955 /* Change the pointers. 955 /* Change the pointers.
956 Don't bother distinguishing stuffed from non-stuffed. 956 Don't bother distinguishing stuffed from non-stuffed.
957 This code is complicated enough already. */ 957 This code is complicated enough already. */
958 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); 958 lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
959 if (!lp) {
960 error = -ENOMEM;
961 goto fail_brelse;
962 }
963
959 /* Change the pointers */ 964 /* Change the pointers */
960 for (x = 0; x < half_len; x++) 965 for (x = 0; x < half_len; x++)
961 lp[x] = cpu_to_be64(bn); 966 lp[x] = cpu_to_be64(bn);
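
Dropping __GFP_NOFAIL means the allocation can now return NULL, so the hunk adds the standard check-and-unwind. A userspace analogue of the same discipline; the label name mirrors the fail_brelse unwind above, everything else is invented:

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Without a "cannot fail" allocation flag, NULL must be handled. */
    static int split_leaf(size_t half_len)
    {
            uint64_t *lp = malloc(half_len * sizeof(*lp));
            int error = 0;

            if (!lp) {
                    error = -ENOMEM;
                    goto fail;
            }
            /* ... fill and publish the pointer block ... */
            free(lp);
            return 0;
    fail:
            return error;
    }
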
@@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1063 1068
1064 /* Allocate both the "from" and "to" buffers in one big chunk */ 1069 /* Allocate both the "from" and "to" buffers in one big chunk */
1065 1070
1066 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1071 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
1072 if (!buf)
1073 return -ENOMEM;
1067 1074
1068 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1069 error = gfs2_dir_read_data(dip, (char *)buf, 1076 error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1231 return 0; 1238 return 0;
1232} 1239}
1233 1240
1241static void *gfs2_alloc_sort_buffer(unsigned size)
1242{
1243 void *ptr = NULL;
1244
1245 if (size < KMALLOC_MAX_SIZE)
1246 ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
1247 if (!ptr)
1248 ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
1249 return ptr;
1250}
1251
1252static void gfs2_free_sort_buffer(void *ptr)
1253{
1254 if (is_vmalloc_addr(ptr))
1255 vfree(ptr);
1256 else
1257 kfree(ptr);
1258}
1259
1234static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1260static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1235 filldir_t filldir, int *copied, unsigned *depth, 1261 filldir_t filldir, int *copied, unsigned *depth,
1236 u64 leaf_no) 1262 u64 leaf_no)
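
gfs2_alloc_sort_buffer() above prefers kmalloc for requests under KMALLOC_MAX_SIZE (with __GFP_NOWARN so an expected failure stays quiet) and falls back to __vmalloc; the paired free routine dispatches on is_vmalloc_addr(). When two allocators can own a pointer, the free path needs a reliable way to tell them apart. A hedged userspace sketch using an explicit ownership tag instead, all names invented and payload alignment glossed over:

    #include <stdlib.h>

    enum owner { OWNER_FAST, OWNER_FALLBACK };
    struct hdr { enum owner owner; };

    static void *sort_buf_alloc(size_t size)
    {
            struct hdr *h = NULL;

            if (size < 64 * 1024)           /* try the cheap allocator first */
                    h = malloc(sizeof(*h) + size);
            if (h) {
                    h->owner = OWNER_FAST;
            } else {                        /* quiet fallback for big buffers */
                    h = calloc(1, sizeof(*h) + size);
                    if (!h)
                            return NULL;
                    h->owner = OWNER_FALLBACK;
            }
            return h + 1;
    }

    static void sort_buf_free(void *ptr)
    {
            struct hdr *h = (struct hdr *)ptr - 1;

            /* the analogue of is_vmalloc_addr(): ask who owns it, then call
             * that owner's free routine (both happen to be free() here) */
            free(h);
    }
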
@@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1271 * 99 is the maximum number of entries that can fit in a single 1297 * 99 is the maximum number of entries that can fit in a single
1272 * leaf block. 1298 * leaf block.
1273 */ 1299 */
1274 larr = vmalloc((leaves + entries + 99) * sizeof(void *)); 1300 larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
1275 if (!larr) 1301 if (!larr)
1276 goto out; 1302 goto out;
1277 darr = (const struct gfs2_dirent **)(larr + leaves); 1303 darr = (const struct gfs2_dirent **)(larr + leaves);
@@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1282 do { 1308 do {
1283 error = get_leaf(ip, lfn, &bh); 1309 error = get_leaf(ip, lfn, &bh);
1284 if (error) 1310 if (error)
1285 goto out_kfree; 1311 goto out_free;
1286 lf = (struct gfs2_leaf *)bh->b_data; 1312 lf = (struct gfs2_leaf *)bh->b_data;
1287 lfn = be64_to_cpu(lf->lf_next); 1313 lfn = be64_to_cpu(lf->lf_next);
1288 if (lf->lf_entries) { 1314 if (lf->lf_entries) {
@@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1291 gfs2_dirent_gather, NULL, &g); 1317 gfs2_dirent_gather, NULL, &g);
1292 error = PTR_ERR(dent); 1318 error = PTR_ERR(dent);
1293 if (IS_ERR(dent)) 1319 if (IS_ERR(dent))
1294 goto out_kfree; 1320 goto out_free;
1295 if (entries2 != g.offset) { 1321 if (entries2 != g.offset) {
1296 fs_warn(sdp, "Number of entries corrupt in dir " 1322 fs_warn(sdp, "Number of entries corrupt in dir "
1297 "leaf %llu, entries2 (%u) != " 1323 "leaf %llu, entries2 (%u) != "
@@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1300 entries2, g.offset); 1326 entries2, g.offset);
1301 1327
1302 error = -EIO; 1328 error = -EIO;
1303 goto out_kfree; 1329 goto out_free;
1304 } 1330 }
1305 error = 0; 1331 error = 0;
1306 larr[leaf++] = bh; 1332 larr[leaf++] = bh;
@@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1312 BUG_ON(entries2 != entries); 1338 BUG_ON(entries2 != entries);
1313 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1339 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1314 entries, copied); 1340 entries, copied);
1315out_kfree: 1341out_free:
1316 for(i = 0; i < leaf; i++) 1342 for(i = 0; i < leaf; i++)
1317 brelse(larr[i]); 1343 brelse(larr[i]);
1318 vfree(larr); 1344 gfs2_free_sort_buffer(larr);
1319out: 1345out:
1320 return error; 1346 return error;
1321} 1347}
@@ -1475,7 +1501,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1475 inode = gfs2_inode_lookup(dir->i_sb, 1501 inode = gfs2_inode_lookup(dir->i_sb,
1476 be16_to_cpu(dent->de_type), 1502 be16_to_cpu(dent->de_type),
1477 be64_to_cpu(dent->de_inum.no_addr), 1503 be64_to_cpu(dent->de_inum.no_addr),
1478 be64_to_cpu(dent->de_inum.no_formal_ino), 0); 1504 be64_to_cpu(dent->de_inum.no_formal_ino));
1479 brelse(bh); 1505 brelse(bh);
1480 return inode; 1506 return inode;
1481 } 1507 }
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c22c21174833..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 168 if (error)
169 goto fail; 169 goto fail;
170 170
171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); 171 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
172 if (IS_ERR(inode)) { 172 if (IS_ERR(inode)) {
173 error = PTR_ERR(inode); 173 error = PTR_ERR(inode);
174 goto fail; 174 goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..4edd662c8232 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -344,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
344 unsigned long last_index; 351 unsigned long last_index;
345 u64 pos = page->index << PAGE_CACHE_SHIFT; 352 u64 pos = page->index << PAGE_CACHE_SHIFT;
346 unsigned int data_blocks, ind_blocks, rblocks; 353 unsigned int data_blocks, ind_blocks, rblocks;
347 int alloc_required = 0;
348 struct gfs2_holder gh; 354 struct gfs2_holder gh;
349 struct gfs2_alloc *al; 355 struct gfs2_alloc *al;
350 int ret; 356 int ret;
@@ -357,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
357 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); 363 set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
358 set_bit(GIF_SW_PAGED, &ip->i_flags); 364 set_bit(GIF_SW_PAGED, &ip->i_flags);
359 365
360 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 366 if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE))
361 if (ret || !alloc_required)
362 goto out_unlock; 367 goto out_unlock;
363 ret = -ENOMEM; 368 ret = -ENOMEM;
364 al = gfs2_alloc_get(ip); 369 al = gfs2_alloc_get(ip);
@@ -547,9 +552,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 552 * Returns: errno
548 */ 553 */
549 554
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 555static int gfs2_fsync(struct file *file, int datasync)
551{ 556{
552 struct inode *inode = dentry->d_inode; 557 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 558 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 559 int ret = 0;
555 560
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..9adf8f924e08 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
328} 328}
329 329
330/** 330/**
331 * do_error - Something unexpected has happened during a lock request
332 *
333 */
334
335static inline void do_error(struct gfs2_glock *gl, const int ret)
336{
337 struct gfs2_holder *gh, *tmp;
338
339 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
340 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
341 continue;
342 if (ret & LM_OUT_ERROR)
343 gh->gh_error = -EIO;
344 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
345 gh->gh_error = GLR_TRYFAILED;
346 else
347 continue;
348 list_del_init(&gh->gh_list);
349 trace_gfs2_glock_queue(gh, 0);
350 gfs2_holder_wake(gh);
351 }
352}
353
354/**
331 * do_promote - promote as many requests as possible on the current queue 355 * do_promote - promote as many requests as possible on the current queue
332 * @gl: The glock 356 * @gl: The glock
333 * 357 *
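
do_error() above walks gl_holders with list_for_each_entry_safe() because it unlinks entries (list_del_init) and wakes them mid-walk; the _safe variant caches the next node before the loop body runs. A runnable sketch of the same idiom with a hand-rolled singly linked list, names hypothetical:

    #include <stdlib.h>

    struct waiter { int err; struct waiter *next; };

    /* Unlink and free every failing entry without ever touching a node
     * after it is gone: grab 'next' before the body runs, which is what
     * list_for_each_entry_safe() automates. */
    static void fail_waiters(struct waiter **head)
    {
            struct waiter *n, *next, **prev = head;

            for (n = *head; n; n = next) {
                    next = n->next;         /* the "safe" part */
                    if (n->err) {
                            *prev = next;   /* unlink, cf. list_del_init() */
                            free(n);        /* cf. gfs2_holder_wake() */
                    } else {
                            prev = &n->next;
                    }
            }
    }
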
@@ -375,36 +399,13 @@ restart:
375 } 399 }
376 if (gh->gh_list.prev == &gl->gl_holders) 400 if (gh->gh_list.prev == &gl->gl_holders)
377 return 1; 401 return 1;
402 do_error(gl, 0);
378 break; 403 break;
379 } 404 }
380 return 0; 405 return 0;
381} 406}
382 407
383/** 408/**
384 * do_error - Something unexpected has happened during a lock request
385 *
386 */
387
388static inline void do_error(struct gfs2_glock *gl, const int ret)
389{
390 struct gfs2_holder *gh, *tmp;
391
392 list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
393 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
394 continue;
395 if (ret & LM_OUT_ERROR)
396 gh->gh_error = -EIO;
397 else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
398 gh->gh_error = GLR_TRYFAILED;
399 else
400 continue;
401 list_del_init(&gh->gh_list);
402 trace_gfs2_glock_queue(gh, 0);
403 gfs2_holder_wake(gh);
404 }
405}
406
407/**
408 * find_first_waiter - find the first gh that's waiting for the glock 409 * find_first_waiter - find the first gh that's waiting for the glock
409 * @gl: the glock 410 * @gl: the glock
410 */ 411 */
@@ -855,6 +856,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
855 gh->gh_flags = flags; 856 gh->gh_flags = flags;
856 gh->gh_iflags = 0; 857 gh->gh_iflags = 0;
857 gh->gh_ip = (unsigned long)__builtin_return_address(0); 858 gh->gh_ip = (unsigned long)__builtin_return_address(0);
859 if (gh->gh_owner_pid)
860 put_pid(gh->gh_owner_pid);
861 gh->gh_owner_pid = get_pid(task_pid(current));
858} 862}
859 863
860/** 864/**
@@ -1059,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
1059 1063
1060 spin_lock(&gl->gl_spin); 1064 spin_lock(&gl->gl_spin);
1061 add_to_queue(gh); 1065 add_to_queue(gh);
1066 if ((LM_FLAG_NOEXP & gh->gh_flags) &&
1067 test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
1068 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1062 run_queue(gl, 1); 1069 run_queue(gl, 1);
1063 spin_unlock(&gl->gl_spin); 1070 spin_unlock(&gl->gl_spin);
1064 1071
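
The check added to gfs2_glock_nq() uses test_and_clear_bit() so that exactly one caller both observes GLF_FROZEN and takes responsibility for re-driving the state machine by setting GLF_REPLY_PENDING. A C11 analogue of the same read-modify-write, bit values invented:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define GLF_FROZEN        (1u << 0)   /* bit positions invented */
    #define GLF_REPLY_PENDING (1u << 1)

    /* Atomically clear FROZEN and report whether this caller was the one
     * to do it, so exactly one thread requeues the work. */
    static bool thaw_if_frozen(_Atomic unsigned int *flags)
    {
            unsigned int old = atomic_fetch_and(flags, ~GLF_FROZEN);

            if (old & GLF_FROZEN) {
                    atomic_fetch_or(flags, GLF_REPLY_PENDING);
                    return true;
            }
            return false;
    }
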
@@ -1316,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1316} 1323}
1317 1324
1318/** 1325/**
1326 * gfs2_should_freeze - Figure out if glock should be frozen
1327 * @gl: The glock in question
1328 *
1329 * Glocks are not frozen if (a) the result of the dlm operation is
1330 * an error, (b) the locking operation was an unlock operation or
1331 * (c) there is a "noexp" flagged request anywhere in the queue
1332 *
1333 * Returns: 1 if freezing should occur, 0 otherwise
1334 */
1335
1336static int gfs2_should_freeze(const struct gfs2_glock *gl)
1337{
1338 const struct gfs2_holder *gh;
1339
1340 if (gl->gl_reply & ~LM_OUT_ST_MASK)
1341 return 0;
1342 if (gl->gl_target == LM_ST_UNLOCKED)
1343 return 0;
1344
1345 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
1346 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1347 continue;
1348 if (LM_FLAG_NOEXP & gh->gh_flags)
1349 return 0;
1350 }
1351
1352 return 1;
1353}
1354
1355/**
1319 * gfs2_glock_complete - Callback used by locking 1356 * gfs2_glock_complete - Callback used by locking
1320 * @gl: Pointer to the glock 1357 * @gl: Pointer to the glock
1321 * @ret: The return value from the dlm 1358 * @ret: The return value from the dlm
@@ -1325,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1325void gfs2_glock_complete(struct gfs2_glock *gl, int ret) 1362void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1326{ 1363{
1327 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; 1364 struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
1365
1328 gl->gl_reply = ret; 1366 gl->gl_reply = ret;
1367
1329 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1368 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
1330 struct gfs2_holder *gh;
1331 spin_lock(&gl->gl_spin); 1369 spin_lock(&gl->gl_spin);
1332 gh = find_first_waiter(gl); 1370 if (gfs2_should_freeze(gl)) {
1333 if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
1334 (gl->gl_target != LM_ST_UNLOCKED)) ||
1335 ((ret & ~LM_OUT_ST_MASK) != 0))
1336 set_bit(GLF_FROZEN, &gl->gl_flags); 1371 set_bit(GLF_FROZEN, &gl->gl_flags);
1337 spin_unlock(&gl->gl_spin); 1372 spin_unlock(&gl->gl_spin);
1338 if (test_bit(GLF_FROZEN, &gl->gl_flags))
1339 return; 1373 return;
1374 }
1375 spin_unlock(&gl->gl_spin);
1340 } 1376 }
1341 set_bit(GLF_REPLY_PENDING, &gl->gl_flags); 1377 set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
1342 gfs2_glock_hold(gl); 1378 gfs2_glock_hold(gl);
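
The rewrite replaces an inverted three-clause condition inlined in gfs2_glock_complete() with the named predicate gfs2_should_freeze(), and tightens the locking: the freeze decision and the early return now happen under gl_spin instead of re-reading the flag after the lock is dropped. Extracting a predicate like this is worth doing whenever a condition needs a comment; an illustrative sketch with invented names and bit values:

    #include <stdbool.h>

    enum { REPLY_ERR_MASK = 0xff00, TARGET_UNLOCKED = 0, FLAG_NOEXP = 1 };

    struct lockreq { int reply; int target; int flags; };

    /* One name, one comment, one place to test. */
    static bool should_freeze(const struct lockreq *r)
    {
            if (r->reply & REPLY_ERR_MASK)      /* (a) backend error */
                    return false;
            if (r->target == TARGET_UNLOCKED)   /* (b) an unlock */
                    return false;
            if (r->flags & FLAG_NOEXP)          /* (c) "noexp" queued */
                    return false;
            return true;
    }
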
@@ -1345,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1345} 1381}
1346 1382
1347 1383
1348static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) 1384static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
1349{ 1385{
1350 struct gfs2_glock *gl; 1386 struct gfs2_glock *gl;
1351 int may_demote; 1387 int may_demote;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..fdbf4b366fa5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
16#include <linux/dlm.h> 15#include <linux/dlm.h>
17#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
18 17
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
383struct gfs2_jdesc { 382struct gfs2_jdesc {
384 struct list_head jd_list; 383 struct list_head jd_list;
385 struct list_head extent_list; 384 struct list_head extent_list;
386 struct slow_work jd_work; 385 struct work_struct jd_work;
387 struct inode *jd_inode; 386 struct inode *jd_inode;
388 unsigned long jd_flags; 387 unsigned long jd_flags;
389#define JDF_RECOVERY 1 388#define JDF_RECOVERY 1
@@ -439,9 +438,6 @@ struct gfs2_args {
439struct gfs2_tune { 438struct gfs2_tune {
440 spinlock_t gt_spin; 439 spinlock_t gt_spin;
441 440
442 unsigned int gt_incore_log_blocks;
443 unsigned int gt_log_flush_secs;
444
445 unsigned int gt_logd_secs; 441 unsigned int gt_logd_secs;
446 442
447 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 443 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +458,8 @@ enum {
462 SDF_SHUTDOWN = 2, 458 SDF_SHUTDOWN = 2,
463 SDF_NOBARRIERS = 3, 459 SDF_NOBARRIERS = 3,
464 SDF_NORECOVERY = 4, 460 SDF_NORECOVERY = 4,
461 SDF_DEMOTE = 5,
462 SDF_NOJOURNALID = 6,
465}; 463};
466 464
467#define GFS2_FSNAME_LEN 256 465#define GFS2_FSNAME_LEN 256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
618 unsigned int sd_log_commited_databuf; 616 unsigned int sd_log_commited_databuf;
619 int sd_log_commited_revoke; 617 int sd_log_commited_revoke;
620 618
619 atomic_t sd_log_pinned;
621 unsigned int sd_log_num_buf; 620 unsigned int sd_log_num_buf;
622 unsigned int sd_log_num_revoke; 621 unsigned int sd_log_num_revoke;
623 unsigned int sd_log_num_rg; 622 unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
629 struct list_head sd_log_le_databuf; 628 struct list_head sd_log_le_databuf;
630 struct list_head sd_log_le_ordered; 629 struct list_head sd_log_le_ordered;
631 630
631 atomic_t sd_log_thresh1;
632 atomic_t sd_log_thresh2;
632 atomic_t sd_log_blks_free; 633 atomic_t sd_log_blks_free;
633 struct mutex sd_log_reserve_mutex; 634 wait_queue_head_t sd_log_waitq;
635 wait_queue_head_t sd_logd_waitq;
634 636
635 u64 sd_log_sequence; 637 u64 sd_log_sequence;
636 unsigned int sd_log_head; 638 unsigned int sd_log_head;
637 unsigned int sd_log_tail; 639 unsigned int sd_log_tail;
638 int sd_log_idle; 640 int sd_log_idle;
639 641
640 unsigned long sd_log_flush_time;
641 struct rw_semaphore sd_log_flush_lock; 642 struct rw_semaphore sd_log_flush_lock;
642 atomic_t sd_log_in_flight; 643 atomic_t sd_log_in_flight;
643 wait_queue_head_t sd_log_flush_wait; 644 wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..08140f185a37 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -84,7 +84,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
84 struct gfs2_skip_data *data = opaque; 84 struct gfs2_skip_data *data = opaque;
85 85
86 if (ip->i_no_addr == data->no_addr) { 86 if (ip->i_no_addr == data->no_addr) {
87 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){ 87 if (inode->i_state & (I_FREEING|I_WILL_FREE)){
88 data->skipped = 1; 88 data->skipped = 1;
89 return 0; 89 return 0;
90 } 90 }
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
158 * @sb: The super block 158 * @sb: The super block
159 * @no_addr: The inode number 159 * @no_addr: The inode number
160 * @type: The type of the inode 160 * @type: The type of the inode
161 * @skip_freeing: set this not return an inode if it is currently being freed.
162 * 161 *
163 * Returns: A VFS inode, or an error 162 * Returns: A VFS inode, or an error
164 */ 163 */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
166struct inode *gfs2_inode_lookup(struct super_block *sb, 165struct inode *gfs2_inode_lookup(struct super_block *sb,
167 unsigned int type, 166 unsigned int type,
168 u64 no_addr, 167 u64 no_addr,
169 u64 no_formal_ino, int skip_freeing) 168 u64 no_formal_ino)
170{ 169{
171 struct inode *inode; 170 struct inode *inode;
172 struct gfs2_inode *ip; 171 struct gfs2_inode *ip;
173 struct gfs2_glock *io_gl; 172 struct gfs2_glock *io_gl = NULL;
174 int error; 173 int error;
175 174
176 if (skip_freeing) 175 inode = gfs2_iget(sb, no_addr);
177 inode = gfs2_iget_skip(sb, no_addr);
178 else
179 inode = gfs2_iget(sb, no_addr);
180 ip = GFS2_I(inode); 176 ip = GFS2_I(inode);
181 177
182 if (!inode) 178 if (!inode)
@@ -202,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
202 ip->i_iopen_gh.gh_gl->gl_object = ip; 198 ip->i_iopen_gh.gh_gl->gl_object = ip;
203 199
204 gfs2_glock_put(io_gl); 200 gfs2_glock_put(io_gl);
201 io_gl = NULL;
205 202
206 if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) 203 if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
207 goto gfs2_nfsbypass; 204 goto gfs2_nfsbypass;
@@ -232,13 +229,107 @@ gfs2_nfsbypass:
232fail_glock: 229fail_glock:
233 gfs2_glock_dq(&ip->i_iopen_gh); 230 gfs2_glock_dq(&ip->i_iopen_gh);
234fail_iopen: 231fail_iopen:
232 if (io_gl)
233 gfs2_glock_put(io_gl);
234fail_put:
235 if (inode->i_state & I_NEW)
236 ip->i_gl->gl_object = NULL;
237 gfs2_glock_put(ip->i_gl);
238fail:
239 if (inode->i_state & I_NEW)
240 iget_failed(inode);
241 else
242 iput(inode);
243 return ERR_PTR(error);
244}
245
246/**
247 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
248 * and try to reclaim it by doing iput.
249 *
250 * This function assumes no rgrp locks are currently held.
251 *
252 * @sb: The super block
253 * no_addr: The inode number
254 *
255 */
256
257void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
258{
259 struct gfs2_sbd *sdp;
260 struct gfs2_inode *ip;
261 struct gfs2_glock *io_gl = NULL;
262 int error;
263 struct gfs2_holder gh;
264 struct inode *inode;
265
266 inode = gfs2_iget_skip(sb, no_addr);
267
268 if (!inode)
269 return;
270
271 /* If it's not a new inode, someone's using it, so leave it alone. */
272 if (!(inode->i_state & I_NEW)) {
273 iput(inode);
274 return;
275 }
276
277 ip = GFS2_I(inode);
278 sdp = GFS2_SB(inode);
279 ip->i_no_formal_ino = -1;
280
281 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
282 if (unlikely(error))
283 goto fail;
284 ip->i_gl->gl_object = ip;
285
286 error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
287 if (unlikely(error))
288 goto fail_put;
289
290 set_bit(GIF_INVALID, &ip->i_flags);
291 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
292 &ip->i_iopen_gh);
293 if (unlikely(error))
294 goto fail_iopen;
295
296 ip->i_iopen_gh.gh_gl->gl_object = ip;
235 gfs2_glock_put(io_gl); 297 gfs2_glock_put(io_gl);
298 io_gl = NULL;
299
300 inode->i_mode = DT2IF(DT_UNKNOWN);
301
302 /*
303 * We must read the inode in order to work out its type in
304 * this case. Note that this doesn't happen often as we normally
305 * know the type beforehand. This code path only occurs during
306 * unlinked inode recovery (where it is safe to take this glock,
307 * which is not true in the general case).
308 */
309 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
310 &gh);
311 if (unlikely(error))
312 goto fail_glock;
313
314 /* Inode is now uptodate */
315 gfs2_glock_dq_uninit(&gh);
316 gfs2_set_iop(inode);
317
318 /* The iput will cause it to be deleted. */
319 iput(inode);
320 return;
321
322fail_glock:
323 gfs2_glock_dq(&ip->i_iopen_gh);
324fail_iopen:
325 if (io_gl)
326 gfs2_glock_put(io_gl);
236fail_put: 327fail_put:
237 ip->i_gl->gl_object = NULL; 328 ip->i_gl->gl_object = NULL;
238 gfs2_glock_put(ip->i_gl); 329 gfs2_glock_put(ip->i_gl);
239fail: 330fail:
240 iget_failed(inode); 331 iget_failed(inode);
241 return ERR_PTR(error); 332 return;
242} 333}
243 334
244static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 335static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
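
Both lookup paths above unwind through a ladder of labels (fail_glock, fail_iopen, fail_put, fail), each undoing exactly one acquisition in reverse order; note how the patch NULLs io_gl after the early put so the ladder cannot drop the same reference twice. A runnable sketch of the idiom, resources and names hypothetical:

    #include <stdlib.h>

    struct ctx { char *meta; char *data; };

    /* Acquire in order, release in reverse on failure; NULLing a pointer
     * after an early release (as the patch does with io_gl) keeps a later
     * unwind from freeing the same resource twice. */
    static int ctx_init(struct ctx *c)
    {
            c->meta = malloc(64);
            if (!c->meta)
                    goto fail;
            c->data = malloc(4096);
            if (!c->data)
                    goto fail_meta;
            return 0;

    fail_meta:
            free(c->meta);
            c->meta = NULL;
    fail:
            return -1;
    }
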
@@ -862,7 +953,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
862 goto fail_gunlock2; 953 goto fail_gunlock2;
863 954
864 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, 955 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
865 inum.no_formal_ino, 0); 956 inum.no_formal_ino);
866 if (IS_ERR(inode)) 957 if (IS_ERR(inode))
867 goto fail_gunlock2; 958 goto fail_gunlock2;
868 959
@@ -900,18 +991,29 @@ fail:
900 991
901static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 992static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
902{ 993{
994 struct inode *inode = &ip->i_inode;
903 struct buffer_head *dibh; 995 struct buffer_head *dibh;
904 int error; 996 int error;
905 997
906 error = gfs2_meta_inode_buffer(ip, &dibh); 998 error = gfs2_meta_inode_buffer(ip, &dibh);
907 if (!error) { 999 if (error)
908 error = inode_setattr(&ip->i_inode, attr); 1000 return error;
909 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); 1001
910 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1002 if ((attr->ia_valid & ATTR_SIZE) &&
911 gfs2_dinode_out(ip, dibh->b_data); 1003 attr->ia_size != i_size_read(inode)) {
912 brelse(dibh); 1004 error = vmtruncate(inode, attr->ia_size);
1005 if (error)
1006 return error;
913 } 1007 }
914 return error; 1008
1009 setattr_copy(inode, attr);
1010 mark_inode_dirty(inode);
1011
1012 gfs2_assert_warn(GFS2_SB(inode), !error);
1013 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1014 gfs2_dinode_out(ip, dibh->b_data);
1015 brelse(dibh);
1016 return 0;
915} 1017}
916 1018
917/** 1019/**
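
The rewritten __gfs2_setattr_simple() follows the 2.6.36 pattern for attribute changes: bail out before touching anything if the buffer read fails, handle the one step that can still fail (a size change via vmtruncate), then copy the remaining attributes with setattr_copy() and mark the inode dirty. A hedged sketch of that ordering, types simplified and names invented:

    #include <stdbool.h>

    struct attrs  { bool set_size; long size; long mode; };
    struct mynode { long size; long mode; bool dirty; };

    /* Fallible work first, infallible bookkeeping last. */
    static int apply_attrs(struct mynode *n, const struct attrs *a)
    {
            if (a->set_size && a->size != n->size) {
                    if (a->size < 0)        /* stand-in for vmtruncate() */
                            return -1;
                    n->size = a->size;
            }
            n->mode  = a->mode;             /* stand-in for setattr_copy() */
            n->dirty = true;                /* stand-in for mark_inode_dirty() */
            return 0;
    }
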
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
83 83
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino, 86 u64 no_addr, u64 no_formal_ino);
87 int skip_freeing); 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
89 89
90extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..ac750bd31a6f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
168 return list_empty(&ai->ai_ail1_list); 168 return list_empty(&ai->ai_ail1_list);
169} 169}
170 170
171static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) 171static void gfs2_ail1_start(struct gfs2_sbd *sdp)
172{ 172{
173 struct list_head *head; 173 struct list_head *head;
174 u64 sync_gen; 174 u64 sync_gen;
175 struct list_head *first; 175 struct gfs2_ail *ai;
176 struct gfs2_ail *first_ai, *ai, *tmp;
177 int done = 0; 176 int done = 0;
178 177
179 gfs2_log_lock(sdp); 178 gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
184 } 183 }
185 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
186 185
187 first = head->prev;
188 first_ai = list_entry(first, struct gfs2_ail, ai_list);
189 first_ai->ai_sync_gen = sync_gen;
190 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
191
192 if (flags & DIO_ALL)
193 first = NULL;
194
195 while(!done) { 186 while(!done) {
196 if (first && (head->prev != first ||
197 gfs2_ail1_empty_one(sdp, first_ai, 0)))
198 break;
199
200 done = 1; 187 done = 1;
201 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { 188 list_for_each_entry_reverse(ai, head, ai_list) {
202 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
203 continue; 190 continue;
204 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
290 * flush time, so we ensure that we have just enough free blocks at all 277 * flush time, so we ensure that we have just enough free blocks at all
291 * times to avoid running out during a log flush. 278 * times to avoid running out during a log flush.
292 * 279 *
280 * We no longer flush the log here, instead we wake up logd to do that
281 * for us. To avoid the thundering herd and to ensure that we deal fairly
282 * with queued waiters, we use an exclusive wait. This means that when we
283 * get woken with enough journal space to get our reservation, we need to
284 * wake the next waiter on the list.
285 *
293 * Returns: errno 286 * Returns: errno
294 */ 287 */
295 288
296int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 289int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
297{ 290{
298 unsigned int try = 0;
299 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 291 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
292 unsigned wanted = blks + reserved_blks;
293 DEFINE_WAIT(wait);
294 int did_wait = 0;
295 unsigned int free_blocks;
300 296
301 if (gfs2_assert_warn(sdp, blks) || 297 if (gfs2_assert_warn(sdp, blks) ||
302 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) 298 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
303 return -EINVAL; 299 return -EINVAL;
304 300retry:
305 mutex_lock(&sdp->sd_log_reserve_mutex); 301 free_blocks = atomic_read(&sdp->sd_log_blks_free);
306 gfs2_log_lock(sdp); 302 if (unlikely(free_blocks <= wanted)) {
307 while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) { 303 do {
308 gfs2_log_unlock(sdp); 304 prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
309 gfs2_ail1_empty(sdp, 0); 305 TASK_UNINTERRUPTIBLE);
310 gfs2_log_flush(sdp, NULL); 306 wake_up(&sdp->sd_logd_waitq);
311 307 did_wait = 1;
312 if (try++) 308 if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
313 gfs2_ail1_start(sdp, 0); 309 io_schedule();
314 gfs2_log_lock(sdp); 310 free_blocks = atomic_read(&sdp->sd_log_blks_free);
311 } while(free_blocks <= wanted);
312 finish_wait(&sdp->sd_log_waitq, &wait);
315 } 313 }
316 atomic_sub(blks, &sdp->sd_log_blks_free); 314 if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
315 free_blocks - blks) != free_blocks)
316 goto retry;
317 trace_gfs2_log_blocks(sdp, -blks); 317 trace_gfs2_log_blocks(sdp, -blks);
318 gfs2_log_unlock(sdp); 318
319 mutex_unlock(&sdp->sd_log_reserve_mutex); 319 /*
320 * If we waited, then so might others, wake them up _after_ we get
321 * our share of the log.
322 */
323 if (unlikely(did_wait))
324 wake_up(&sdp->sd_log_waitq);
320 325
321 down_read(&sdp->sd_log_flush_lock); 326 down_read(&sdp->sd_log_flush_lock);
322 327
323 return 0; 328 return 0;
324} 329}
325 330
326/**
327 * gfs2_log_release - Release a given number of log blocks
328 * @sdp: The GFS2 superblock
329 * @blks: The number of blocks
330 *
331 */
332
333void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
334{
335
336 gfs2_log_lock(sdp);
337 atomic_add(blks, &sdp->sd_log_blks_free);
338 trace_gfs2_log_blocks(sdp, blks);
339 gfs2_assert_withdraw(sdp,
340 atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
341 gfs2_log_unlock(sdp);
342 up_read(&sdp->sd_log_flush_lock);
343}
344
345static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) 331static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
346{ 332{
347 struct gfs2_journal_extent *je; 333 struct gfs2_journal_extent *je;
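
The new gfs2_log_reserve() above replaces a mutex plus spinlock with a lock-free scheme: read the free-block count, commit the reservation with atomic_cmpxchg(), and retry from the top if another CPU moved the counter in between; waiters queue exclusively, and each successful reserver passes the baton by waking the next waiter. A runnable C11 sketch of the compare-and-swap retry loop, simplified to fail rather than sleep:

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic unsigned int log_blks_free;

    /* Commit a reservation with compare-and-swap, retrying if the counter
     * moved underneath us; the real code keeps a reserve margin and sleeps
     * on the exclusive waitqueue instead of returning false. */
    static bool log_reserve(unsigned int blks)
    {
            unsigned int free_blocks = atomic_load(&log_blks_free);

            for (;;) {
                    if (free_blocks <= blks)
                            return false;
                    /* on failure, free_blocks is refreshed automatically */
                    if (atomic_compare_exchange_weak(&log_blks_free,
                                                     &free_blocks,
                                                     free_blocks - blks))
                            return true;
            }
    }
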
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
559 545
560 ail2_empty(sdp, new_tail); 546 ail2_empty(sdp, new_tail);
561 547
562 gfs2_log_lock(sdp);
563 atomic_add(dist, &sdp->sd_log_blks_free); 548 atomic_add(dist, &sdp->sd_log_blks_free);
564 trace_gfs2_log_blocks(sdp, dist); 549 trace_gfs2_log_blocks(sdp, dist);
565 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); 550 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
566 gfs2_log_unlock(sdp); 551 sdp->sd_jdesc->jd_blocks);
567 552
568 sdp->sd_log_tail = new_tail; 553 sdp->sd_log_tail = new_tail;
569} 554}
@@ -610,16 +595,17 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
610 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 595 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
611 goto skip_barrier; 596 goto skip_barrier;
612 get_bh(bh); 597 get_bh(bh);
613 submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); 598 submit_bh(WRITE_BARRIER | REQ_META, bh);
614 wait_on_buffer(bh); 599 wait_on_buffer(bh);
615 if (buffer_eopnotsupp(bh)) { 600 if (buffer_eopnotsupp(bh)) {
616 clear_buffer_eopnotsupp(bh); 601 clear_buffer_eopnotsupp(bh);
617 set_buffer_uptodate(bh); 602 set_buffer_uptodate(bh);
603 fs_info(sdp, "barrier sync failed - disabling barriers\n");
618 set_bit(SDF_NOBARRIERS, &sdp->sd_flags); 604 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
619 lock_buffer(bh); 605 lock_buffer(bh);
620skip_barrier: 606skip_barrier:
621 get_bh(bh); 607 get_bh(bh);
622 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); 608 submit_bh(WRITE_SYNC | REQ_META, bh);
623 wait_on_buffer(bh); 609 wait_on_buffer(bh);
624 } 610 }
625 if (!buffer_uptodate(bh)) 611 if (!buffer_uptodate(bh))
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
710 * 696 *
711 */ 697 */
712 698
713void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
714{ 700{
715 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
716 702
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
822 * @sdp: the filesystem 808 * @sdp: the filesystem
823 * @tr: the transaction 809 * @tr: the transaction
824 * 810 *
811 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
812 * or the total number of used blocks (pinned blocks plus AIL blocks)
813 * is greater than thresh2.
814 *
815 * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
816 * journal size.
817 *
825 * Returns: errno 818 * Returns: errno
826 */ 819 */
827 820
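
The wake test in gfs2_log_commit() is driven by two per-journal thresholds, which the init_journal() hunk later in this patch initialises to 2/5 and 4/5 of jd_blocks (the comment above speaks of thirds; the code uses fifths). An illustrative sketch of the arithmetic:

    #include <stdbool.h>

    /* Wake policy from gfs2_log_commit(), with thresholds as set by
     * init_journal() below. */
    static bool logd_wake_needed(unsigned jd_blocks, unsigned pinned,
                                 unsigned blks_free)
    {
            unsigned thresh1 = 2 * jd_blocks / 5;   /* pinned-block trigger */
            unsigned thresh2 = 4 * jd_blocks / 5;   /* total-usage trigger */
            unsigned used = jd_blocks - blks_free;  /* pinned + AIL blocks */

            return pinned > thresh1 || used > thresh2;
    }
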
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
832 825
833 up_read(&sdp->sd_log_flush_lock); 826 up_read(&sdp->sd_log_flush_lock);
834 827
835 gfs2_log_lock(sdp); 828 if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
836 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) 829 ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
837 wake_up_process(sdp->sd_logd_process); 830 atomic_read(&sdp->sd_log_thresh2)))
838 gfs2_log_unlock(sdp); 831 wake_up(&sdp->sd_logd_waitq);
839} 832}
840 833
841/** 834/**
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
882{ 875{
883 gfs2_log_flush(sdp, NULL); 876 gfs2_log_flush(sdp, NULL);
884 for (;;) { 877 for (;;) {
885 gfs2_ail1_start(sdp, DIO_ALL); 878 gfs2_ail1_start(sdp);
886 if (gfs2_ail1_empty(sdp, DIO_ALL)) 879 if (gfs2_ail1_empty(sdp, DIO_ALL))
887 break; 880 break;
888 msleep(10); 881 msleep(10);
889 } 882 }
890} 883}
891 884
885static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
886{
887 return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
888}
889
890static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
891{
892 unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
893 return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
894}
892 895
893/** 896/**
894 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks 897 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
901int gfs2_logd(void *data) 904int gfs2_logd(void *data)
902{ 905{
903 struct gfs2_sbd *sdp = data; 906 struct gfs2_sbd *sdp = data;
904 unsigned long t; 907 unsigned long t = 1;
905 int need_flush; 908 DEFINE_WAIT(wait);
909 unsigned preflush;
906 910
907 while (!kthread_should_stop()) { 911 while (!kthread_should_stop()) {
908 /* Advance the log tail */
909 912
910 t = sdp->sd_log_flush_time + 913 preflush = atomic_read(&sdp->sd_log_pinned);
911 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; 914 if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
915 gfs2_ail1_empty(sdp, DIO_ALL);
916 gfs2_log_flush(sdp, NULL);
917 gfs2_ail1_empty(sdp, DIO_ALL);
918 }
912 919
913 gfs2_ail1_empty(sdp, DIO_ALL); 920 if (gfs2_ail_flush_reqd(sdp)) {
914 gfs2_log_lock(sdp); 921 gfs2_ail1_start(sdp);
915 need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks); 922 io_schedule();
916 gfs2_log_unlock(sdp); 923 gfs2_ail1_empty(sdp, 0);
917 if (need_flush || time_after_eq(jiffies, t)) {
918 gfs2_log_flush(sdp, NULL); 924 gfs2_log_flush(sdp, NULL);
919 sdp->sd_log_flush_time = jiffies; 925 gfs2_ail1_empty(sdp, DIO_ALL);
920 } 926 }
921 927
928 wake_up(&sdp->sd_log_waitq);
922 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; 929 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
923 if (freezing(current)) 930 if (freezing(current))
924 refrigerator(); 931 refrigerator();
925 schedule_timeout_interruptible(t); 932
933 do {
934 prepare_to_wait(&sdp->sd_logd_waitq, &wait,
935 TASK_INTERRUPTIBLE);
936 if (!gfs2_ail_flush_reqd(sdp) &&
937 !gfs2_jrnl_flush_reqd(sdp) &&
938 !kthread_should_stop())
939 t = schedule_timeout(t);
940 } while(t && !gfs2_ail_flush_reqd(sdp) &&
941 !gfs2_jrnl_flush_reqd(sdp) &&
942 !kthread_should_stop());
943 finish_wait(&sdp->sd_logd_waitq, &wait);
926 } 944 }
927 945
928 return 0; 946 return 0;
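
The new gfs2_logd() main loop sleeps on sd_logd_waitq via prepare_to_wait()/finish_wait(), re-testing its two work predicates around the timed sleep so a wakeup racing with the check is never lost and a spurious wakeup is harmless. A hedged pthread analogue of the same check-sleep-recheck loop, all names invented:

    #include <pthread.h>
    #include <stdbool.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
    static bool work_pending;

    /* Sleep until woken or the interval expires; the predicate is always
     * re-tested under the same lock the waker uses. */
    static void daemon_wait(unsigned interval_sec)
    {
            struct timespec deadline;

            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_sec += interval_sec;

            pthread_mutex_lock(&lock);
            while (!work_pending) {
                    if (pthread_cond_timedwait(&wake, &lock, &deadline))
                            break;  /* timed out: do periodic work anyway */
            }
            pthread_mutex_unlock(&lock);
    }
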
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55void gfs2_log_incr_head(struct gfs2_sbd *sdp);
56 55
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real); 58 struct buffer_head *real);
60void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
61 62
62static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
63{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
64 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
65 __gfs2_log_flush(sbd, gl);
66}
67
68void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
69void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
70
71void gfs2_log_shutdown(struct gfs2_sbd *sdp);
72void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
73int gfs2_logd(void *data);
74 66
75#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
54 if (bd->bd_ail) 54 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
56 get_bh(bh); 56 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned);
57 trace_gfs2_pin(bd, 1); 58 trace_gfs2_pin(bd, 1);
58} 59}
59 60
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
94 trace_gfs2_pin(bd, 0); 95 trace_gfs2_pin(bd, 0);
95 gfs2_log_unlock(sdp); 96 gfs2_log_unlock(sdp);
96 unlock_buffer(bh); 97 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned);
97} 99}
98 100
99 101
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..b1e9630eb46a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -24,6 +23,7 @@
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
26#include "recovery.h"
27 27
28static struct shrinker qd_shrinker = { 28static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 29 .shrink = gfs2_shrink_qd_memory,
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
94 if (!gfs2_glock_cachep) 94 if (!gfs2_glock_cachep)
95 goto fail; 95 goto fail;
96 96
97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)", 97 gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
98 sizeof(struct gfs2_glock) + 98 sizeof(struct gfs2_glock) +
99 sizeof(struct address_space), 99 sizeof(struct address_space),
100 0, 0, gfs2_init_gl_aspace_once); 100 0, 0, gfs2_init_gl_aspace_once);
@@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void)
138 if (error) 138 if (error)
139 goto fail_unregister; 139 goto fail_unregister;
140 140
141 error = slow_work_register_user(THIS_MODULE); 141 error = -ENOMEM;
142 if (error) 142 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 goto fail_slow; 143 WQ_NON_REENTRANT | WQ_RESCUER, 0);
144 if (!gfs_recovery_wq)
145 goto fail_wq;
144 146
145 gfs2_register_debugfs(); 147 gfs2_register_debugfs();
146 148
@@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void)
148 150
149 return 0; 151 return 0;
150 152
151fail_slow: 153fail_wq:
152 unregister_filesystem(&gfs2meta_fs_type); 154 unregister_filesystem(&gfs2meta_fs_type);
153fail_unregister: 155fail_unregister:
154 unregister_filesystem(&gfs2_fs_type); 156 unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void)
190 gfs2_unregister_debugfs(); 192 gfs2_unregister_debugfs();
191 unregister_filesystem(&gfs2_fs_type); 193 unregister_filesystem(&gfs2_fs_type);
192 unregister_filesystem(&gfs2meta_fs_type); 194 unregister_filesystem(&gfs2meta_fs_type);
193 slow_work_unregister_user(THIS_MODULE); 195 destroy_workqueue(gfs_recovery_wq);
194 196
195 kmem_cache_destroy(gfs2_quotad_cachep); 197 kmem_cache_destroy(gfs2_quotad_cachep);
196 kmem_cache_destroy(gfs2_rgrpd_cachep); 198 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..f3b071f921aa 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,11 +34,10 @@
34 34
35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) 35static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
36{ 36{
37 int err;
38 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
39 int nr_underway = 0; 38 int nr_underway = 0;
40 int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? 39 int write_op = REQ_META |
41 WRITE_SYNC_PLUG : WRITE)); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
42 41
43 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
44 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
86 } while (bh != head); 85 } while (bh != head);
87 unlock_page(page); 86 unlock_page(page);
88 87
89 err = 0;
90 if (nr_underway == 0) 88 if (nr_underway == 0)
91 end_page_writeback(page); 89 end_page_writeback(page);
92 90
93 return err; 91 return 0;
94} 92}
95 93
96const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
@@ -227,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
227 } 225 }
228 bh->b_end_io = end_buffer_read_sync; 226 bh->b_end_io = end_buffer_read_sync;
229 get_bh(bh); 227 get_bh(bh);
230 submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); 228 submit_bh(READ_SYNC | REQ_META, bh);
231 if (!(flags & DIO_WAIT)) 229 if (!(flags & DIO_WAIT))
232 return 0; 230 return 0;
233 231
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
313 struct gfs2_bufdata *bd = bh->b_private; 311 struct gfs2_bufdata *bd = bh->b_private;
314 312
315 if (test_clear_buffer_pinned(bh)) { 313 if (test_clear_buffer_pinned(bh)) {
314 atomic_dec(&sdp->sd_log_pinned);
316 list_del_init(&bd->bd_le.le_list); 315 list_del_init(&bd->bd_le.le_list);
317 if (meta) { 316 if (meta) {
318 gfs2_assert_warn(sdp, sdp->sd_log_num_buf); 317 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
@@ -433,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
433 if (buffer_uptodate(first_bh)) 432 if (buffer_uptodate(first_bh))
434 goto out; 433 goto out;
435 if (!buffer_locked(first_bh)) 434 if (!buffer_locked(first_bh))
436 ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); 435 ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh);
437 436
438 dblock++; 437 dblock++;
439 extlen--; 438 extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..4d4b1e8ac64c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
21#include <linux/quotaops.h> 20#include <linux/quotaops.h>
22 21
23#include "gfs2.h" 22#include "gfs2.h"
@@ -57,8 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
57{ 56{
58 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
59 58
60 gt->gt_incore_log_blocks = 1024;
61 gt->gt_logd_secs = 1;
62 gt->gt_quota_simul_sync = 64; 59 gt->gt_quota_simul_sync = 64;
63 gt->gt_quota_warn_period = 10; 60 gt->gt_quota_warn_period = 10;
64 gt->gt_quota_scale_num = 1; 61 gt->gt_quota_scale_num = 1;
@@ -78,7 +75,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
78 75
79 sb->s_fs_info = sdp; 76 sb->s_fs_info = sdp;
80 sdp->sd_vfs = sb; 77 sdp->sd_vfs = sb;
81 78 set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
82 gfs2_tune_init(&sdp->sd_tune); 79 gfs2_tune_init(&sdp->sd_tune);
83 80
84 init_waitqueue_head(&sdp->sd_glock_wait); 81 init_waitqueue_head(&sdp->sd_glock_wait);
@@ -101,14 +98,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
101 spin_lock_init(&sdp->sd_trunc_lock); 98 spin_lock_init(&sdp->sd_trunc_lock);
102 99
103 spin_lock_init(&sdp->sd_log_lock); 100 spin_lock_init(&sdp->sd_log_lock);
104 101 atomic_set(&sdp->sd_log_pinned, 0);
105 INIT_LIST_HEAD(&sdp->sd_log_le_buf); 102 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
106 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 103 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
107 INIT_LIST_HEAD(&sdp->sd_log_le_rg); 104 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
108 INIT_LIST_HEAD(&sdp->sd_log_le_databuf); 105 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
109 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 106 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
110 107
111 mutex_init(&sdp->sd_log_reserve_mutex); 108 init_waitqueue_head(&sdp->sd_log_waitq);
109 init_waitqueue_head(&sdp->sd_logd_waitq);
112 INIT_LIST_HEAD(&sdp->sd_ail1_list); 110 INIT_LIST_HEAD(&sdp->sd_ail1_list);
113 INIT_LIST_HEAD(&sdp->sd_ail2_list); 111 INIT_LIST_HEAD(&sdp->sd_ail2_list);
114 112
@@ -276,7 +274,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
276 274
277 bio->bi_end_io = end_bio_io_page; 275 bio->bi_end_io = end_bio_io_page;
278 bio->bi_private = page; 276 bio->bi_private = page;
279 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); 277 submit_bio(READ_SYNC | REQ_META, bio);
280 wait_on_page_locked(page); 278 wait_on_page_locked(page);
281 bio_put(bio); 279 bio_put(bio);
282 if (!PageUptodate(page)) { 280 if (!PageUptodate(page)) {
@@ -487,7 +485,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
487 struct dentry *dentry; 485 struct dentry *dentry;
488 struct inode *inode; 486 struct inode *inode;
489 487
490 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 488 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
491 if (IS_ERR(inode)) { 489 if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
@@ -674,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
-		slow_work_init(&jd->jd_work, &gfs2_recover_ops);
+		INIT_WORK(&jd->jd_work, gfs2_recover_func);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
@@ -733,6 +731,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +770,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 
 		/* Map the extents for this journal's blocks */
 		map_journal_extents(sdp);
@@ -779,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_lockstruct.ls_first) {
 		unsigned int x;
 		for (x = 0; x < sdp->sd_journals; x++) {
-			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
+						     true);
 			if (error) {
 				fs_err(sdp, "error recovering journal %u: %d\n",
 				       x, error);
@@ -789,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 
 		gfs2_others_may_mount(sdp);
 	} else if (!sdp->sd_args.ar_spectator) {
-		error = gfs2_recover_journal(sdp->sd_jdesc);
+		error = gfs2_recover_journal(sdp->sd_jdesc, true);
 		if (error) {
 			fs_err(sdp, "error recovering my journal: %d\n", error);
 			goto fail_jinode_gh;
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_quotad;
 
-	sdp->sd_log_flush_time = jiffies;
-
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
 	if (error) {
@@ -1049,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 			ret = match_int(&tmp[0], &option);
 			if (ret || option < 0)
 				goto hostdata_error;
-			ls->ls_jid = option;
+			if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags))
+				ls->ls_jid = option;
 			break;
 		case Opt_id:
 			/* Obsolete, but left for backward compat purposes */
@@ -1101,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 		lm->lm_unmount(sdp);
 }
 
+static int gfs2_journalid_wait(void *word)
+{
+	if (signal_pending(current))
+		return -EINTR;
+	schedule();
+	return 0;
+}
+
+static int wait_on_journal(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_args.ar_spectator)
+		return 0;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		return 0;
+
+	return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE);
+}
+
 void gfs2_online_uevent(struct gfs2_sbd *sdp)
 {
 	struct super_block *sb = sdp->sd_vfs;
@@ -1160,7 +1180,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 		GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
-	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
@@ -1193,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail_locking;
 
+	error = wait_on_journal(sdp);
+	if (error)
+		goto fail_sb;
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
@@ -1323,7 +1347,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 60;
+	args.ar_commit = 30;
 	args.ar_statfs_quantum = 30;
 	args.ar_quota_quantum = 60;
 	args.ar_errors = GFS2_ERRORS_DEFAULT;
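The SDF_NOJOURNALID handshake threaded through these ops_fstype.c hunks (wait_on_journal() sleeping at mount time, jid_store() in fs/gfs2/sys.c further down clearing the bit and waking the mounter) is the stock kernel wait_on_bit()/wake_up_bit() idiom. A minimal sketch of just that idiom, with illustrative names (my_bit_wait, wait_for_flag and release_flag are not GFS2 symbols):

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>

static int my_bit_wait(void *word)
{
	if (signal_pending(current))
		return -EINTR;
	schedule();
	return 0;
}

/* waiter: blocks while bit nr is set in *flags */
static int wait_for_flag(unsigned long *flags, int nr)
{
	return wait_on_bit(flags, nr, my_bit_wait, TASK_INTERRUPTIBLE);
}

/* waker: clears the bit and wakes anyone sleeping in wait_for_flag() */
static void release_flag(unsigned long *flags, int nr)
{
	clear_bit(nr, flags);
	smp_mb__after_clear_bit();	/* order the clear before the wake */
	wake_up_bit(flags, nr);
}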
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..1009be2c9737 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
+/*
+ * XXX(truncate): the truncate_setsize calls should be moved to the end.
+ */
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,10 +1084,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
-		error = vmtruncate(inode, attr->ia_size);
+		truncate_setsize(inode, attr->ia_size);
 		gfs2_trans_end(sdp);
-		if (error)
-			return error;
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
@@ -1133,8 +1134,16 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	error = inode_setattr(inode, attr);
-	gfs2_assert_warn(sdp, !error);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(sdp, !error);
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
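For reference, these hunks follow the 2.6.36 VFS split of inode_setattr() into an explicit size change plus setattr_copy()/mark_inode_dirty(); the same shape appears verbatim as hfsplus_setattr() near the end of this diff, and the GFS2 variants above differ only in doing the size change inside a transaction. A generic sketch of the pattern (example_setattr is an illustrative name, not a symbol from this commit):

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);	/* permission/validity checks */
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);	/* copy uid/gid/mode/times into inode */
	mark_inode_dirty(inode);
	return 0;
}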
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..1bc6b5695e6d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned blocksize, iblock, pos;
 	struct buffer_head *bh, *dibh;
 	struct page *page;
-	void *kaddr;
-	struct gfs2_quota *qp;
-	s64 value;
-	int err = -EIO;
+	void *kaddr, *ptr;
+	struct gfs2_quota q, *qp;
+	int err, nbytes;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
 
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	qp = &q;
+	qp->qu_value = be64_to_cpu(qp->qu_value);
+	qp->qu_value += change;
+	qp->qu_value = cpu_to_be64(qp->qu_value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = qp;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -667,7 +692,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (!buffer_mapped(bh)) {
 		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
-			goto unlock;
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh))
+			zero_user(page, pos - blocksize, bh->b_size);
 	}
 
 	if (PageUptodate(page))
@@ -677,32 +705,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 		ll_rw_block(READ_META, 1, &bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
-			goto unlock;
+			goto unlock_out;
 	}
 
 	gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	qp = kaddr + offset;
-	value = (s64)be64_to_cpu(qp->qu_value) + change;
-	qp->qu_value = cpu_to_be64(value);
-	qd->qd_qb.qb_value = qp->qu_value;
-	if (fdq) {
-		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
-			qd->qd_qb.qb_warn = qp->qu_warn;
-		}
-		if (fdq->d_fieldmask & FS_DQ_BHARD) {
-			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
-			qd->qd_qb.qb_limit = qp->qu_limit;
-		}
-	}
+	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
+		nbytes = PAGE_CACHE_SIZE - offset;
+	memcpy(kaddr + offset, ptr, nbytes);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
+	unlock_page(page);
+	page_cache_release(page);
+
+	/* If quota straddles page boundary, we need to update the rest of the
+	 * quota at the beginning of the next page */
+	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
+		ptr = ptr + nbytes;
+		nbytes = sizeof(struct gfs2_quota) - nbytes;
+		offset = 0;
+		index++;
+		goto get_a_page;
+	}
 
+	/* Update the disk inode timestamp and size (if extended) */
 	err = gfs2_meta_inode_buffer(ip, &dibh);
 	if (err)
-		goto unlock;
+		goto out;
 
 	size = loc + sizeof(struct gfs2_quota);
 	if (size > inode->i_size) {
@@ -715,7 +745,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	brelse(dibh);
 	mark_inode_dirty(inode);
 
-unlock:
+out:
+	return err;
+unlock_out:
 	unlock_page(page);
 	page_cache_release(page);
 	return err;
@@ -755,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		goto out;
 
 	for (x = 0; x < num_qd; x++) {
-		int alloc_required;
-
 		offset = qd2offset(qda[x]);
-		error = gfs2_write_alloc_required(ip, offset,
-						  sizeof(struct gfs2_quota),
-						  &alloc_required);
-		if (error)
-			goto out_gunlock;
-		if (alloc_required)
+		if (gfs2_write_alloc_required(ip, offset,
+					      sizeof(struct gfs2_quota)))
 			nalloc++;
 	}
 
@@ -779,8 +805,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	 * rgrp since it won't be allocated during the transaction
 	 */
 	al->al_requested = 1;
-	/* +1 in the end for block requested above for unstuffing */
-	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
+	/* +3 in the end for unstuffing block, inode size update block
+	 * and another block in case quota straddles page boundary and
+	 * two blocks need to be updated instead of 1 */
+	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	if (nalloc)
 		al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1446,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 
 	memset(fqs, 0, sizeof(struct fs_quota_stat));
 	fqs->qs_version = FS_QSTAT_VERSION;
-	if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
-	else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
-		fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
+
+	switch (sdp->sd_args.ar_quota) {
+	case GFS2_QUOTA_ON:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
+		/*FALLTHRU*/
+	case GFS2_QUOTA_ACCOUNT:
+		fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
+		break;
+	case GFS2_QUOTA_OFF:
+		break;
+	}
+
 	if (sdp->sd_quota_inode) {
 		fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
 		fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1468,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 	return 0;
 }
 
-static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_quota_lvb *qlvb;
@@ -1462,7 +1498,7 @@ static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id,
 
 	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 	fdq->d_version = FS_DQUOT_VERSION;
-	fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
 	fdq->d_id = id;
 	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
 	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1477,8 +1513,8 @@ out:
 /* GFS2 only supports a subset of the XFS fields */
 #define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
 
-static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
-			   struct fs_disk_quota *fdq)
+static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
+			  struct fs_disk_quota *fdq)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1497,12 +1533,12 @@ static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
 	switch(type) {
 	case USRQUOTA:
 		type = QUOTA_USER;
-		if (fdq->d_flags != XFS_USER_QUOTA)
+		if (fdq->d_flags != FS_USER_QUOTA)
 			return -EINVAL;
 		break;
 	case GRPQUOTA:
 		type = QUOTA_GROUP;
-		if (fdq->d_flags != XFS_GROUP_QUOTA)
+		if (fdq->d_flags != FS_GROUP_QUOTA)
 			return -EINVAL;
 		break;
 	default:
@@ -1542,10 +1578,7 @@ static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id,
 		goto out_i;
 
 	offset = qd2offset(qd);
-	error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota),
-					  &alloc_required);
-	if (error)
-		goto out_i;
+	alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (al == NULL)
@@ -1585,7 +1618,7 @@ out_put:
 const struct quotactl_ops gfs2_quotactl_ops = {
 	.quota_sync	= gfs2_quota_sync,
 	.get_xstate	= gfs2_quota_get_xstate,
-	.get_xquota	= gfs2_xquota_get,
-	.set_xquota	= gfs2_xquota_set,
+	.get_dqblk	= gfs2_get_dqblk,
+	.set_dqblk	= gfs2_set_dqblk,
 };
 
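The rewritten gfs2_adjust_quota() now handles an on-disk quota record that may cross a PAGE_CACHE_SIZE boundary: it copies nbytes into the first page, then loops back to get_a_page with offset 0 on the next index, which is also why do_sync() above reserves +3 blocks instead of +1. A standalone sketch of just that split-copy arithmetic (illustrative userspace C, not GFS2 code):

#include <string.h>

/* Copy rec_len bytes into fixed-size pages starting at (index, offset),
 * splitting the copy when the record straddles a page boundary -- the
 * same arithmetic as the get_a_page loop above. */
static void copy_across_pages(char **pages, size_t page_size,
			      const char *rec, size_t rec_len,
			      size_t index, size_t offset)
{
	size_t nbytes;

	while (rec_len) {
		nbytes = rec_len;
		if (offset + nbytes > page_size)
			nbytes = page_size - offset;	/* only what fits */
		memcpy(pages[index] + offset, rec, nbytes);
		rec += nbytes;
		rec_len -= nbytes;
		offset = 0;	/* remainder starts at the top of... */
		index++;	/* ...the next page */
	}
}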
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
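The gfs2_shrink_qd_memory() signature change here and in quota.c tracks the 2.6.35 shrinker API, which added a struct shrinker * context argument to the callback. A minimal registration sketch under that API (the example_* names are illustrative, not from this commit):

#include <linux/mm.h>

static int example_cached;	/* objects currently on our LRU */

static int example_shrink(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
	if (nr) {
		/* try to free up to nr cached objects here */
	}
	return example_cached;	/* report how many remain */
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) at exit. */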
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d437..f7f89a94a5a4 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -28,6 +27,8 @@
 #include "util.h"
 #include "dir.h"
 
+struct workqueue_struct *gfs_recovery_wq;
+
 int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
 			   struct buffer_head **bh)
 {
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-static int gfs2_recover_get_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
-		return -EBUSY;
-	return 0;
-}
-
-static void gfs2_recover_put_ref(struct slow_work *work)
-{
-	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
-	clear_bit(JDF_RECOVERY, &jd->jd_flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
-}
-
-static void gfs2_recover_work(struct slow_work *work)
+void gfs2_recover_func(struct work_struct *work)
 {
 	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work)
 	gfs2_glock_dq_uninit(&j_gh);
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-	return;
+	goto done;
 
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
@@ -590,32 +575,35 @@ fail_gunlock_j:
 	}
 
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
-
 fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+done:
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
 }
 
-struct slow_work_ops gfs2_recover_ops = {
-	.owner	 = THIS_MODULE,
-	.get_ref = gfs2_recover_get_ref,
-	.put_ref = gfs2_recover_put_ref,
-	.execute = gfs2_recover_work,
-};
-
-
 static int gfs2_recovery_wait(void *word)
 {
 	schedule();
 	return 0;
 }
 
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 {
 	int rv;
-	rv = slow_work_enqueue(&jd->jd_work);
-	if (rv)
-		return rv;
-	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
+
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+
+	/* we have JDF_RECOVERY, queue should always succeed */
+	rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+	BUG_ON(!rv);
+
+	if (wait)
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
+			    TASK_UNINTERRUPTIBLE);
+
 	return 0;
 }
 
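With slow_work gone, recovery uses a plain workqueue plus the JDF_RECOVERY bit as a single-instance guard: whoever wins test_and_set_bit() owns the queueing, and the work function clears the bit and wakes synchronous callers. A sketch of that pattern in isolation (my_job and the other names are illustrative, not GFS2 symbols):

#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

struct my_job {
	unsigned long flags;
#define JOB_RUNNING 0
	struct work_struct work;	/* INIT_WORK(&job->work, my_work_func) */
};

static int my_wait(void *word)
{
	schedule();
	return 0;
}

static void my_work_func(struct work_struct *work)
{
	struct my_job *job = container_of(work, struct my_job, work);

	/* ... do the actual work ... */

	clear_bit(JOB_RUNNING, &job->flags);	/* done: allow requeueing */
	smp_mb__after_clear_bit();
	wake_up_bit(&job->flags, JOB_RUNNING);
}

static int my_job_kick(struct workqueue_struct *wq, struct my_job *job,
		       bool wait)
{
	if (test_and_set_bit(JOB_RUNNING, &job->flags))
		return -EBUSY;		/* already queued or running */

	queue_work(wq, &job->work);	/* cannot fail: we own the bit */

	if (wait)
		wait_on_bit(&job->flags, JOB_RUNNING, my_wait,
			    TASK_UNINTERRUPTIBLE);
	return 0;
}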
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569a..2226136c7647 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
 
 #include "incore.h"
 
+extern struct workqueue_struct *gfs_recovery_wq;
+
 static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 {
 	if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 			   struct gfs2_log_header_host *head);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern struct slow_work_ops gfs2_recover_ops;
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+extern void gfs2_recover_func(struct work_struct *work);
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	if ((start + nr_sects) != blk) {
 		rv = blkdev_issue_discard(bdev, start,
 					  nr_sects, GFP_NOFS,
-					  DISCARD_FL_BARRIER);
+					  BLKDEV_IFL_WAIT |
+					  BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 		nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					  DISCARD_FL_BARRIER);
+					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 	}
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
  * @rgd: The rgrp
  *
- * Returns: The inode, if one has been found
+ * Returns: 0 if no error
+ *          The inode, if one has been found, in inode.
  */
 
-static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
-				     u64 skip)
+static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
+			   u64 skip)
 {
-	struct inode *inode;
 	u32 goal = 0, block;
 	u64 no_addr;
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
 		if (no_addr == skip)
 			continue;
 		*last_unlinked = no_addr;
-		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					  no_addr, -1, 1);
-		if (!IS_ERR(inode))
-			return inode;
+		return no_addr;
 	}
 
 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
  * Try to acquire rgrp in way which avoids contending with others.
 *
 * Returns: errno
+ *          unlinked: the block address of an unlinked block to be reclaimed
 */
 
-static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
+			  u64 *last_unlinked)
 {
-	struct inode *inode = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd, *begin = NULL;
 	struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 	int loops = 0;
 	int error, rg_locked;
 
+	*unlinked = 0;
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
 
 	while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			/* If the rg came in already locked, there's no
+			   way we can recover from a failed try_rgrp_unlink
+			   because that would require an iput which can only
+			   happen after the rgrp is unlocked. */
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			/* fall through */
 		case GLR_TRYFAILED:
 			rgd = recent_rgrp_next(rgd);
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 	}
 
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 		case 0:
 			if (try_rgrp_fit(rgd, al))
 				goto out;
-			if (rgd->rd_flags & GFS2_RDF_CHECK)
-				inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
+			if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
+				*unlinked = try_rgrp_unlink(rgd, last_unlinked,
+							    ip->i_no_addr);
 			if (!rg_locked)
 				gfs2_glock_dq_uninit(&al->al_rgd_gh);
-			if (inode)
-				return inode;
+			if (*unlinked)
+				return -EAGAIN;
 			break;
 
 		case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 			break;
 
 		default:
-			return ERR_PTR(error);
+			return error;
 		}
 
 		rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
 
 		if (rgd == begin) {
 			if (++loops >= 3)
-				return ERR_PTR(-ENOSPC);
+				return -ENOSPC;
 			if (!skipped)
 				loops++;
 			flags = 0;
@@ -1172,7 +1178,7 @@ out:
 		forward_rgrp_set(sdp, rgd);
 	}
 
-	return NULL;
+	return 0;
 }
 
 /**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct inode *inode;
 	int error = 0;
-	u64 last_unlinked = NO_BLOCK;
+	u64 last_unlinked = NO_BLOCK, unlinked;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
 	if (error)
 		return error;
 
-	inode = get_local_rgrp(ip, &last_unlinked);
-	if (inode) {
+	/* Find an rgrp suitable for allocation.  If it encounters any unlinked
+	   dinodes along the way, error will equal -EAGAIN and unlinked will
+	   contain its block address. We then need to look up that inode and
+	   try to free it, and try the allocation again. */
+	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
+	if (error) {
 		if (ip != GFS2_I(sdp->sd_rindex))
 			gfs2_glock_dq_uninit(&al->al_ri_gh);
-		if (IS_ERR(inode))
-			return PTR_ERR(inode);
-		iput(inode);
+		if (error != -EAGAIN)
+			return error;
+
+		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
+		/* regardless of whether or not gfs2_process_unlinked_inode
+		   was successful, we don't want to repeat it again. */
+		last_unlinked = unlinked;
 		gfs2_log_flush(sdp, NULL);
+		error = 0;
+
 		goto try_again;
 	}
-
+	/* no error, so we have the rgrp set in the inode's allocation. */
 	al->al_file = file;
 	al->al_line = line;
 
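These rgrp hunks retire the ERR_PTR-encoded return in favour of a plain errno plus an out parameter, so no inode reference has to be dropped while the rgrp glock is still held; the caller now retries on -EAGAIN instead of iput()ing. For contrast, a sketch of the ERR_PTR convention the old code used (this is the stock <linux/err.h> API; old_style and caller are illustrative names):

#include <linux/err.h>
#include <linux/fs.h>

static struct inode *old_style(void)
{
	return ERR_PTR(-ENOSPC);	/* error encoded in the pointer */
}

static int caller(void)
{
	struct inode *inode = old_style();

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* decode back to an errno */
	/* ... use inode, then iput(inode) ... */
	return 0;
}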
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..77cb9f830ee4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-	int ar;
-	int error;
 
 	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
 	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
@@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	}
 	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
-	if (!error && ar) {
+	if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
 		gfs2_consist_inode(ip);
-		error = -EIO;
+		return -EIO;
 	}
 
-	return error;
+	return 0;
 }
 
 /**
@@ -1113,7 +1110,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	int error;
 
 	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_log_flush_secs;
+	args.ar_commit = gt->gt_logd_secs;
 	args.ar_quota_quantum = gt->gt_quota_quantum;
 	if (gt->gt_statfs_slow)
 		args.ar_statfs_quantum = 0;
@@ -1160,7 +1157,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 	else
 		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 	spin_lock(&gt->gt_spin);
-	gt->gt_log_flush_secs = args.ar_commit;
+	gt->gt_logd_secs = args.ar_commit;
 	gt->gt_quota_quantum = args.ar_quota_quantum;
 	if (args.ar_statfs_quantum) {
 		gt->gt_statfs_slow = 0;
@@ -1191,7 +1188,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 * node for later deallocation.
 */
 
-static void gfs2_drop_inode(struct inode *inode)
+static int gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
@@ -1200,26 +1197,7 @@ static void gfs2_drop_inode(struct inode *inode)
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
 	}
-	generic_drop_inode(inode);
-}
-
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-
-static void gfs2_clear_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	ip->i_gl->gl_object = NULL;
-	gfs2_glock_put(ip->i_gl);
-	ip->i_gl = NULL;
-	if (ip->i_iopen_gh.gh_gl) {
-		ip->i_iopen_gh.gh_gl->gl_object = NULL;
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-	}
+	return generic_drop_inode(inode);
 }
 
 static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
@@ -1305,8 +1283,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-	val = sdp->sd_tune.gt_log_flush_secs;
-	if (val != 60)
+	val = sdp->sd_tune.gt_logd_secs;
+	if (val != 30)
 		seq_printf(s, ",commit=%d", val);
 	val = sdp->sd_tune.gt_statfs_quantum;
 	if (val != 30)
@@ -1334,7 +1312,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		seq_printf(s, ",nobarrier");
-
+	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
+		seq_printf(s, ",demote_interface_used");
 	return 0;
 }
 
@@ -1346,13 +1325,16 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 * is safe, just less efficient.
 */
 
-static void gfs2_delete_inode(struct inode *inode)
+static void gfs2_evict_inode(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
 
+	if (inode->i_nlink)
+		goto out;
+
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
@@ -1406,10 +1388,18 @@ out_unlock:
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
-		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	}
 }
 
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
@@ -1433,14 +1423,13 @@ const struct super_operations gfs2_super_ops = {
 	.alloc_inode		= gfs2_alloc_inode,
 	.destroy_inode		= gfs2_destroy_inode,
 	.write_inode		= gfs2_write_inode,
-	.delete_inode		= gfs2_delete_inode,
+	.evict_inode		= gfs2_evict_inode,
 	.put_super		= gfs2_put_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
-	.clear_inode		= gfs2_clear_inode,
 	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
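This is the 2.6.36 VFS consolidation: the separate ->delete_inode (unlinked inodes) and ->clear_inode (in-memory teardown) methods merge into a single ->evict_inode, which checks i_nlink itself and calls end_writeback() where the old code called clear_inode(). A bare skeleton of the new contract (illustrative, not GFS2 code):

#include <linux/fs.h>
#include <linux/mm.h>

static void example_evict_inode(struct inode *inode)
{
	if (inode->i_nlink == 0) {
		/* last link gone: free on-disk resources here
		 * (the old ->delete_inode path) */
	}

	truncate_inode_pages(&inode->i_data, 0);
	end_writeback(inode);		/* replaces clear_inode() */

	/* release fs-private in-memory state here
	 * (the old ->clear_inode path) */
}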
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
 extern const struct dentry_operations gfs2_dops;
-extern struct xattr_handler *gfs2_xattr_handlers[];
+extern const struct xattr_handler *gfs2_xattr_handlers[];
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 54fd98425991..ccacffd2faaa 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
 #include "quota.h"
 #include "util.h"
 #include "glops.h"
+#include "recovery.h"
 
 struct gfs2_attr {
 	struct attribute attr;
@@ -232,6 +233,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
 		return -EINVAL;
+	if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
+		fs_info(sdp, "demote interface used\n");
 	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
 	if (rv)
 		return rv;
@@ -323,6 +326,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first);
 }
 
+static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned first;
+	int rv;
+
+	rv = sscanf(buf, "%u", &first);
+	if (rv != 1 || first > 1)
+		return -EINVAL;
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EBUSY;
+	if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	sdp->sd_lockstruct.ls_first = first;
+	rv = 0;
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
+}
+
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -350,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		rv = slow_work_enqueue(&jd->jd_work);
+		rv = gfs2_recover_journal(jd, false);
 		break;
 	}
 out:
@@ -375,14 +402,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
 }
 
+static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned jid;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	rv = -EINVAL;
+	if (sdp->sd_args.ar_spectator)
+		goto out;
+	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
+		goto out;
+	rv = -EBUSY;
+	if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+		goto out;
+	sdp->sd_lockstruct.ls_jid = jid;
+	smp_mb__after_clear_bit();
+	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+	rv = 0;
+out:
+	spin_unlock(&sdp->sd_jindex_spin);
+	return rv ? rv : len;
+}
+
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name,		0444, proto_name_show,	NULL);
 GDLM_ATTR(block,		0644, block_show,	block_store);
 GDLM_ATTR(withdraw,		0644, withdraw_show,	withdraw_store);
-GDLM_ATTR(jid,			0444, jid_show,		NULL);
-GDLM_ATTR(first,		0444, lkfirst_show,	NULL);
+GDLM_ATTR(jid,			0644, jid_show,		jid_store);
+GDLM_ATTR(first,		0644, lkfirst_show,	lkfirst_store);
 GDLM_ATTR(first_done,		0444, first_done_show,	NULL);
 GDLM_ATTR(recover,		0600, NULL,		recover_store);
 GDLM_ATTR(recover_done,		0444, recover_done_show, NULL);
@@ -468,8 +522,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 } \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(incore_log_blocks, 0);
-TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
@@ -481,8 +533,6 @@ TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_incore_log_blocks.attr,
-	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
 	&tune_attr_max_readahead.attr,
@@ -566,7 +616,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
-	if (!sdp->sd_args.ar_spectator)
+	if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
 		add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
 	if (gfs2_uuid_valid(uuid))
 		add_uevent_var(env, "UUID=%pUB", uuid);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
 #include "meta_io.h"
 #include "trans.h"
 #include "util.h"
+#include "trace_gfs2.h"
 
 int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 		     unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
 	return error;
 }
 
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	atomic_add(blks, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, blks);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+				  sdp->sd_jdesc->jd_blocks);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
 void gfs2_trans_end(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..776af6eb4bcb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,6 +1296,7 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
+	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_ea_location el;
 	struct buffer_head *dibh;
@@ -1321,14 +1322,25 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 		return error;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (!error) {
-		error = inode_setattr(&ip->i_inode, attr);
-		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-		gfs2_dinode_out(ip, dibh->b_data);
-		brelse(dibh);
+	if (error)
+		goto out_trans_end;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_assert_warn(GFS2_SB(inode), !error);
 	}
 
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+out_trans_end:
 	gfs2_trans_end(sdp);
 	return error;
 }
@@ -1535,21 +1547,21 @@ out_alloc:
 	return error;
 }
 
-static struct xattr_handler gfs2_xattr_user_handler = {
+static const struct xattr_handler gfs2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.flags  = GFS2_EATYPE_USR,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-static struct xattr_handler gfs2_xattr_security_handler = {
+static const struct xattr_handler gfs2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.flags  = GFS2_EATYPE_SECURITY,
 	.get    = gfs2_xattr_get,
 	.set    = gfs2_xattr_set,
 };
 
-struct xattr_handler *gfs2_xattr_handlers[] = {
+const struct xattr_handler *gfs2_xattr_handlers[] = {
 	&gfs2_xattr_user_handler,
 	&gfs2_xattr_security_handler,
 	&gfs2_xattr_system_handler,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index fe35e3b626c4..4f55651aaa51 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -193,7 +193,7 @@ extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 		__be32 log_size, __be32 phys_size, u32 clump_size);
 extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *);
-extern void hfs_clear_inode(struct inode *);
+extern void hfs_evict_inode(struct inode *);
 extern void hfs_delete_inode(struct inode *);
 
 /* attr.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 14f5cb1b9fdc..397b7adc7ce6 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -39,10 +39,19 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				hfs_get_block,
 				&HFS_I(mapping->host)->phys_size);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
@@ -112,9 +121,24 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	ssize_t ret;
 
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, hfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && ret < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
+	return ret;
 }
 
 static int hfs_writepages(struct address_space *mapping,
@@ -507,8 +531,10 @@ out:
 	return NULL;
 }
 
-void hfs_clear_inode(struct inode *inode)
+void hfs_evict_inode(struct inode *inode)
 {
+	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 	if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
 		HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
 		iput(HFS_I(inode)->rsrc_inode);
@@ -588,13 +614,43 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 		attr->ia_mode = inode->i_mode & ~S_IWUGO;
 		attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
 	}
-	error = inode_setattr(inode, attr);
-	if (error)
-		return error;
 
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
 	return 0;
 }
 
+static int hfs_file_fsync(struct file *filp, int datasync)
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	if (sb->s_dirt) {
+		lock_super(sb);
+		sb->s_dirt = 0;
+		if (!(sb->s_flags & MS_RDONLY))
+			hfs_mdb_commit(sb);
+		unlock_super(sb);
+	}
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
@@ -604,7 +660,7 @@ static const struct file_operations hfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.fsync		= file_fsync,
+	.fsync		= hfs_file_fsync,
 	.open		= hfs_file_open,
 	.release	= hfs_file_release,
 };
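hfs (and hfsplus below) grow a private ->fsync here because the method dropped its dentry argument in this kernel window and the old generic file_fsync() helper was being retired; each filesystem now syncs the inode, its superblock, and the block device itself. The minimal shape, stripped of the hfs-specific superblock commit (illustrative, assuming only core VFS helpers):

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/writeback.h>

static int example_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;	/* no dentry arg now */
	int ret, err;

	ret = write_inode_now(inode, 0);		/* inode -> buffers */
	err = sync_blockdev(inode->i_sb->s_bdev);	/* buffers -> disk */
	if (!ret)
		ret = err;	/* report the first failure */
	return ret;
}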
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0a81eb7111f3..34235d4bf08b 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -181,7 +181,7 @@ static const struct super_operations hfs_super_operations = {
 	.alloc_inode	= hfs_alloc_inode,
 	.destroy_inode	= hfs_destroy_inode,
 	.write_inode	= hfs_write_inode,
-	.clear_inode	= hfs_clear_inode,
+	.evict_inode	= hfs_evict_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
 	.sync_fs	= hfs_sync_fs,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
-	.ioctl		= hfsplus_ioctl,
+	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
 };
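This is the BKL-removal conversion: the locked ->ioctl method (inode plus file, returning int) becomes ->unlocked_ioctl (file only, returning long), which must take whatever locks it needs itself; the matching prototype change is in hfsplus_fs.h just below. A minimal sketch of the new shape (example_ioctl is an illustrative name; real handlers derive the inode from filp->f_path.dentry->d_inode):

#include <linux/fs.h>

static long example_ioctl(struct file *filp, unsigned int cmd,
			  unsigned long arg)
{
	/* no inode argument and no BKL: lock explicitly per command */
	switch (cmd) {
	default:
		return -ENOTTY;	/* not a command we handle */
	}
}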
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..dc856be3c2b0 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
 
 /* ioctl.c */
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		  unsigned long arg);
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		     const void *value, size_t size, int flags);
 ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
@@ -352,6 +351,7 @@ int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 
 /* super.c */
 struct inode *hfsplus_iget(struct super_block *, unsigned long);
+int hfsplus_sync_fs(struct super_block *sb, int wait);
 
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..c5a979d62c65 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -31,10 +31,19 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
31 loff_t pos, unsigned len, unsigned flags, 31 loff_t pos, unsigned len, unsigned flags,
32 struct page **pagep, void **fsdata) 32 struct page **pagep, void **fsdata)
33{ 33{
34 int ret;
35
34 *pagep = NULL; 36 *pagep = NULL;
35 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
36 hfsplus_get_block, 38 hfsplus_get_block,
37 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host).phys_size);
40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize)
43 vmtruncate(mapping->host, isize);
44 }
45
46 return ret;
38} 47}
39 48
40static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) 49static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
@@ -105,9 +114,24 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
105{ 114{
106 struct file *file = iocb->ki_filp; 115 struct file *file = iocb->ki_filp;
107 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 116 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
117 ssize_t ret;
108 118
109 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 119 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
110 offset, nr_segs, hfsplus_get_block, NULL); 120 offset, nr_segs, hfsplus_get_block, NULL);
121
122 /*
123 * In case of error extending write may have instantiated a few
124 * blocks outside i_size. Trim these off again.
125 */
126 if (unlikely((rw & WRITE) && ret < 0)) {
127 loff_t isize = i_size_read(inode);
128 loff_t end = offset + iov_length(iov, nr_segs);
129
130 if (end > isize)
131 vmtruncate(inode, isize);
132 }
133
134 return ret;
111} 135}
112 136
113static int hfsplus_writepages(struct address_space *mapping, 137static int hfsplus_writepages(struct address_space *mapping,
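
The two hunks above (and the matching hpfs one near the end of this diff)
apply the same recovery idiom: when an extending write fails partway,
cont_write_begin() or blockdev_direct_IO() may already have instantiated
blocks beyond the old i_size, and those must be trimmed away again or they
leak. The idiom condensed into one hypothetical helper:

/* Sketch of the trim-on-error idiom used above: if a write meant to
 * extend the file failed, cut the file back to the size the VFS
 * still believes in, dropping any blocks allocated past it. */
static void example_write_failed(struct address_space *mapping,
                                 loff_t pos, unsigned len)
{
        struct inode *inode = mapping->host;
        loff_t isize = i_size_read(inode);

        if (pos + len > isize)
                vmtruncate(inode, isize);
}
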
@@ -266,9 +290,56 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
266 return 0; 290 return 0;
267} 291}
268 292
293static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
294{
295 struct inode *inode = dentry->d_inode;
296 int error;
297
298 error = inode_change_ok(inode, attr);
299 if (error)
300 return error;
301
302 if ((attr->ia_valid & ATTR_SIZE) &&
303 attr->ia_size != i_size_read(inode)) {
304 error = vmtruncate(inode, attr->ia_size);
305 if (error)
306 return error;
307 }
308
309 setattr_copy(inode, attr);
310 mark_inode_dirty(inode);
311 return 0;
312}
313
314static int hfsplus_file_fsync(struct file *filp, int datasync)
315{
316 struct inode *inode = filp->f_mapping->host;
317 struct super_block * sb;
318 int ret, err;
319
320 /* sync the inode to buffers */
321 ret = write_inode_now(inode, 0);
322
323 /* sync the superblock to buffers */
324 sb = inode->i_sb;
325 if (sb->s_dirt) {
326 if (!(sb->s_flags & MS_RDONLY))
327 hfsplus_sync_fs(sb, 1);
328 else
329 sb->s_dirt = 0;
330 }
331
332 /* .. finally sync the buffers to disk */
333 err = sync_blockdev(sb->s_bdev);
334 if (!ret)
335 ret = err;
336 return ret;
337}
338
269static const struct inode_operations hfsplus_file_inode_operations = { 339static const struct inode_operations hfsplus_file_inode_operations = {
270 .lookup = hfsplus_file_lookup, 340 .lookup = hfsplus_file_lookup,
271 .truncate = hfsplus_file_truncate, 341 .truncate = hfsplus_file_truncate,
342 .setattr = hfsplus_setattr,
272 .setxattr = hfsplus_setxattr, 343 .setxattr = hfsplus_setxattr,
273 .getxattr = hfsplus_getxattr, 344 .getxattr = hfsplus_getxattr,
274 .listxattr = hfsplus_listxattr, 345 .listxattr = hfsplus_listxattr,
@@ -282,10 +353,10 @@ static const struct file_operations hfsplus_file_operations = {
282 .aio_write = generic_file_aio_write, 353 .aio_write = generic_file_aio_write,
283 .mmap = generic_file_mmap, 354 .mmap = generic_file_mmap,
284 .splice_read = generic_file_splice_read, 355 .splice_read = generic_file_splice_read,
285 .fsync = file_fsync, 356 .fsync = hfsplus_file_fsync,
286 .open = hfsplus_file_open, 357 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 358 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 359 .unlocked_ioctl = hfsplus_ioctl,
289}; 360};
290 361
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 362struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
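
hfsplus_setattr() above follows the conversion applied throughout this merge
(hostfs, hpfs and hugetlbfs below receive the same treatment): the one-shot
inode_setattr() is replaced by an explicit vmtruncate() for size changes,
then setattr_copy() plus mark_inode_dirty() for the remaining attributes.
The generic shape, stripped of per-filesystem locking:

/* Generic shape of the inode_setattr() replacement; real versions
 * differ only in locking and in how the truncate is carried out. */
static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);   /* permission checks */
        if (error)
                return error;

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                error = vmtruncate(inode, attr->ia_size);
                if (error)
                        return error;
        }

        setattr_copy(inode, attr);      /* uid/gid/mode/times -> inode */
        mark_inode_dirty(inode);
        return 0;
}
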
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
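
The ioctl.c hunk above is the standard .ioctl to .unlocked_ioctl migration:
the handler loses its inode argument (now recovered from the file), returns
long, and the big kernel lock, previously taken by the VFS around ->ioctl,
is pushed down into the handler, which is why every return path gains an
unlock_kernel(). In outline (do_something() is a placeholder, not a real
kernel function):

/* Outline of the BKL push-down; do_something() stands in for the
 * real command dispatch. */
long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                            unsigned long arg)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        long ret;

        lock_kernel();          /* the VFS used to take this for us */
        ret = do_something(inode, cmd, arg);
        unlock_kernel();        /* every exit path must drop it */
        return ret;
}
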
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 74b473a8ef92..3b55c050c742 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -145,16 +145,18 @@ static int hfsplus_write_inode(struct inode *inode,
145 return ret; 145 return ret;
146} 146}
147 147
148static void hfsplus_clear_inode(struct inode *inode) 148static void hfsplus_evict_inode(struct inode *inode)
149{ 149{
150 dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); 150 dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
151 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode);
151 if (HFSPLUS_IS_RSRC(inode)) { 153 if (HFSPLUS_IS_RSRC(inode)) {
152 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
153 iput(HFSPLUS_I(inode).rsrc_inode); 155 iput(HFSPLUS_I(inode).rsrc_inode);
154 } 156 }
155} 157}
156 158
157static int hfsplus_sync_fs(struct super_block *sb, int wait) 159int hfsplus_sync_fs(struct super_block *sb, int wait)
158{ 160{
159 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
160 162
@@ -293,7 +295,7 @@ static const struct super_operations hfsplus_sops = {
293 .alloc_inode = hfsplus_alloc_inode, 295 .alloc_inode = hfsplus_alloc_inode,
294 .destroy_inode = hfsplus_destroy_inode, 296 .destroy_inode = hfsplus_destroy_inode,
295 .write_inode = hfsplus_write_inode, 297 .write_inode = hfsplus_write_inode,
296 .clear_inode = hfsplus_clear_inode, 298 .evict_inode = hfsplus_evict_inode,
297 .put_super = hfsplus_put_super, 299 .put_super = hfsplus_put_super,
298 .write_super = hfsplus_write_super, 300 .write_super = hfsplus_write_super,
299 .sync_fs = hfsplus_sync_fs, 301 .sync_fs = hfsplus_sync_fs,
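
The super.c hunk above is the canonical ->clear_inode to ->evict_inode
conversion, repeated below for hostfs, hpfs, hppfs and hugetlbfs: the new
hook runs as the inode is torn down, so the filesystem itself must flush the
page cache and call end_writeback() before its private cleanup, work the
VFS used to do around ->clear_inode. The minimal shape:

/* Minimal ->evict_inode: the two leading calls are now the
 * filesystem's responsibility rather than the VFS's. */
static void example_evict_inode(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0); /* drop the page cache */
        end_writeback(inode);   /* leaves the inode I_FREEING | I_CLEAR */
        /* fs-specific teardown that used to live in ->clear_inode() */
}
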
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 2f34f8f2134b..6bbd75c5589b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -53,18 +53,28 @@ struct hostfs_iattr {
53 struct timespec ia_ctime; 53 struct timespec ia_ctime;
54}; 54};
55 55
56extern int stat_file(const char *path, unsigned long long *inode_out, 56struct hostfs_stat {
57 int *mode_out, int *nlink_out, int *uid_out, int *gid_out, 57 unsigned long long ino;
58 unsigned long long *size_out, struct timespec *atime_out, 58 unsigned int mode;
59 struct timespec *mtime_out, struct timespec *ctime_out, 59 unsigned int nlink;
60 int *blksize_out, unsigned long long *blocks_out, int fd); 60 unsigned int uid;
61 unsigned int gid;
62 unsigned long long size;
63 struct timespec atime, mtime, ctime;
64 unsigned int blksize;
65 unsigned long long blocks;
66 unsigned int maj;
67 unsigned int min;
68};
69
70extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
61extern int access_file(char *path, int r, int w, int x); 71extern int access_file(char *path, int r, int w, int x);
62extern int open_file(char *path, int r, int w, int append); 72extern int open_file(char *path, int r, int w, int append);
63extern int file_type(const char *path, int *maj, int *min);
64extern void *open_dir(char *path, int *err_out); 73extern void *open_dir(char *path, int *err_out);
65extern char *read_dir(void *stream, unsigned long long *pos, 74extern char *read_dir(void *stream, unsigned long long *pos,
66 unsigned long long *ino_out, int *len_out); 75 unsigned long long *ino_out, int *len_out);
67extern void close_file(void *stream); 76extern void close_file(void *stream);
77extern int replace_file(int oldfd, int fd);
68extern void close_dir(void *stream); 78extern void close_dir(void *stream);
69extern int read_file(int fd, unsigned long long *offset, char *buf, int len); 79extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
70extern int write_file(int fd, unsigned long long *offset, const char *buf, 80extern int write_file(int fd, unsigned long long *offset, const char *buf,
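
The header change above collapses stat_file()'s twelve output pointers,
most of which callers passed as NULL, into a single struct hostfs_stat,
which also carries the device numbers that previously required a separate
file_type() call. A hypothetical caller on the userspace side of hostfs,
just to show the narrowed interface:

#include <stdio.h>

/* Hypothetical caller; the field names match the struct above. */
static int print_host_size(const char *path)
{
        struct hostfs_stat st;
        int err = stat_file(path, &st, -1);     /* -1: stat by path */

        if (err)
                return err;
        printf("%s: %llu bytes, %u links\n", path, st.size, st.nlink);
        return 0;
}
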
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..f7dc9b5f9ef8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,12 +14,12 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/namei.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include "init.h" 19#include "init.h"
19#include "kern.h" 20#include "kern.h"
20 21
21struct hostfs_inode_info { 22struct hostfs_inode_info {
22 char *host_filename;
23 int fd; 23 int fd;
24 fmode_t mode; 24 fmode_t mode;
25 struct inode vfs_inode; 25 struct inode vfs_inode;
@@ -49,7 +49,7 @@ static int append = 0;
49 49
50static const struct inode_operations hostfs_iops; 50static const struct inode_operations hostfs_iops;
51static const struct inode_operations hostfs_dir_iops; 51static const struct inode_operations hostfs_dir_iops;
52static const struct address_space_operations hostfs_link_aops; 52static const struct inode_operations hostfs_link_iops;
53 53
54#ifndef MODULE 54#ifndef MODULE
55static int __init hostfs_args(char *options, int *add) 55static int __init hostfs_args(char *options, int *add)
@@ -90,71 +90,58 @@ __uml_setup("hostfs=", hostfs_args,
90); 90);
91#endif 91#endif
92 92
93static char *dentry_name(struct dentry *dentry, int extra) 93static char *__dentry_name(struct dentry *dentry, char *name)
94{ 94{
95 struct dentry *parent; 95 char *p = __dentry_path(dentry, name, PATH_MAX);
96 char *root, *name; 96 char *root;
97 int len; 97 size_t len;
98
99 len = 0;
100 parent = dentry;
101 while (parent->d_parent != parent) {
102 len += parent->d_name.len + 1;
103 parent = parent->d_parent;
104 }
105 98
106 root = HOSTFS_I(parent->d_inode)->host_filename; 99 spin_unlock(&dcache_lock);
107 len += strlen(root);
108 name = kmalloc(len + extra + 1, GFP_KERNEL);
109 if (name == NULL)
110 return NULL;
111 100
112 name[len] = '\0'; 101 root = dentry->d_sb->s_fs_info;
113 parent = dentry; 102 len = strlen(root);
114 while (parent->d_parent != parent) { 103 if (IS_ERR(p)) {
115 len -= parent->d_name.len + 1; 104 __putname(name);
116 name[len] = '/'; 105 return NULL;
117 strncpy(&name[len + 1], parent->d_name.name, 106 }
118 parent->d_name.len); 107 strlcpy(name, root, PATH_MAX);
119 parent = parent->d_parent; 108 if (len > p - name) {
109 __putname(name);
110 return NULL;
111 }
112 if (p > name + len) {
113 char *s = name + len;
114 while ((*s++ = *p++) != '\0')
115 ;
120 } 116 }
121 strncpy(name, root, strlen(root));
122 return name; 117 return name;
123} 118}
124 119
125static char *inode_name(struct inode *ino, int extra) 120static char *dentry_name(struct dentry *dentry)
126{ 121{
127 struct dentry *dentry; 122 char *name = __getname();
123 if (!name)
124 return NULL;
128 125
129 dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); 126 spin_lock(&dcache_lock);
130 return dentry_name(dentry, extra); 127 return __dentry_name(dentry, name); /* will unlock */
131} 128}
132 129
133static int read_name(struct inode *ino, char *name) 130static char *inode_name(struct inode *ino)
134{ 131{
135 /* 132 struct dentry *dentry;
136 * The non-int inode fields are copied into ints by stat_file and 133 char *name = __getname();
137 * then copied into the inode because passing the actual pointers 134 if (!name)
138 * in and having them treated as int * breaks on big-endian machines 135 return NULL;
139 */
140 int err;
141 int i_mode, i_nlink, i_blksize;
142 unsigned long long i_size;
143 unsigned long long i_ino;
144 unsigned long long i_blocks;
145
146 err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid,
147 &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime,
148 &ino->i_ctime, &i_blksize, &i_blocks, -1);
149 if (err)
150 return err;
151 136
152 ino->i_ino = i_ino; 137 spin_lock(&dcache_lock);
153 ino->i_mode = i_mode; 138 if (list_empty(&ino->i_dentry)) {
154 ino->i_nlink = i_nlink; 139 spin_unlock(&dcache_lock);
155 ino->i_size = i_size; 140 __putname(name);
156 ino->i_blocks = i_blocks; 141 return NULL;
157 return 0; 142 }
143 dentry = list_first_entry(&ino->i_dentry, struct dentry, d_alias);
144 return __dentry_name(dentry, name); /* will unlock */
158} 145}
159 146
160static char *follow_link(char *link) 147static char *follow_link(char *link)
@@ -205,53 +192,11 @@ static char *follow_link(char *link)
205 return ERR_PTR(n); 192 return ERR_PTR(n);
206} 193}
207 194
208static int hostfs_read_inode(struct inode *ino)
209{
210 char *name;
211 int err = 0;
212
213 /*
214 * Unfortunately, we are called from iget() when we don't have a dentry
215 * allocated yet.
216 */
217 if (list_empty(&ino->i_dentry))
218 goto out;
219
220 err = -ENOMEM;
221 name = inode_name(ino, 0);
222 if (name == NULL)
223 goto out;
224
225 if (file_type(name, NULL, NULL) == OS_TYPE_SYMLINK) {
226 name = follow_link(name);
227 if (IS_ERR(name)) {
228 err = PTR_ERR(name);
229 goto out;
230 }
231 }
232
233 err = read_name(ino, name);
234 kfree(name);
235 out:
236 return err;
237}
238
239static struct inode *hostfs_iget(struct super_block *sb) 195static struct inode *hostfs_iget(struct super_block *sb)
240{ 196{
241 struct inode *inode; 197 struct inode *inode = new_inode(sb);
242 long ret;
243
244 inode = iget_locked(sb, 0);
245 if (!inode) 198 if (!inode)
246 return ERR_PTR(-ENOMEM); 199 return ERR_PTR(-ENOMEM);
247 if (inode->i_state & I_NEW) {
248 ret = hostfs_read_inode(inode);
249 if (ret < 0) {
250 iget_failed(inode);
251 return ERR_PTR(ret);
252 }
253 unlock_new_inode(inode);
254 }
255 return inode; 200 return inode;
256} 201}
257 202
@@ -269,7 +214,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
269 long long f_files; 214 long long f_files;
270 long long f_ffree; 215 long long f_ffree;
271 216
272 err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename, 217 err = do_statfs(dentry->d_sb->s_fs_info,
273 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, 218 &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
274 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 219 &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
275 &sf->f_namelen, sf->f_spare); 220 &sf->f_namelen, sf->f_spare);
@@ -288,47 +233,32 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
288{ 233{
289 struct hostfs_inode_info *hi; 234 struct hostfs_inode_info *hi;
290 235
291 hi = kmalloc(sizeof(*hi), GFP_KERNEL); 236 hi = kzalloc(sizeof(*hi), GFP_KERNEL);
292 if (hi == NULL) 237 if (hi == NULL)
293 return NULL; 238 return NULL;
294 239 hi->fd = -1;
295 *hi = ((struct hostfs_inode_info) { .host_filename = NULL,
296 .fd = -1,
297 .mode = 0 });
298 inode_init_once(&hi->vfs_inode); 240 inode_init_once(&hi->vfs_inode);
299 return &hi->vfs_inode; 241 return &hi->vfs_inode;
300} 242}
301 243
302static void hostfs_delete_inode(struct inode *inode) 244static void hostfs_evict_inode(struct inode *inode)
303{ 245{
304 truncate_inode_pages(&inode->i_data, 0); 246 truncate_inode_pages(&inode->i_data, 0);
247 end_writeback(inode);
305 if (HOSTFS_I(inode)->fd != -1) { 248 if (HOSTFS_I(inode)->fd != -1) {
306 close_file(&HOSTFS_I(inode)->fd); 249 close_file(&HOSTFS_I(inode)->fd);
307 HOSTFS_I(inode)->fd = -1; 250 HOSTFS_I(inode)->fd = -1;
308 } 251 }
309 clear_inode(inode);
310} 252}
311 253
312static void hostfs_destroy_inode(struct inode *inode) 254static void hostfs_destroy_inode(struct inode *inode)
313{ 255{
314 kfree(HOSTFS_I(inode)->host_filename);
315
316 /*
317 * XXX: This should not happen, probably. The check is here for
318 * additional safety.
319 */
320 if (HOSTFS_I(inode)->fd != -1) {
321 close_file(&HOSTFS_I(inode)->fd);
322 printk(KERN_DEBUG "Closing host fd in .destroy_inode\n");
323 }
324
325 kfree(HOSTFS_I(inode)); 256 kfree(HOSTFS_I(inode));
326} 257}
327 258
328static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 259static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
329{ 260{
330 struct inode *root = vfs->mnt_sb->s_root->d_inode; 261 const char *root_path = vfs->mnt_sb->s_fs_info;
331 const char *root_path = HOSTFS_I(root)->host_filename;
332 size_t offset = strlen(root_ino) + 1; 262 size_t offset = strlen(root_ino) + 1;
333 263
334 if (strlen(root_path) > offset) 264 if (strlen(root_path) > offset)
@@ -339,9 +269,8 @@ static int hostfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
339 269
340static const struct super_operations hostfs_sbops = { 270static const struct super_operations hostfs_sbops = {
341 .alloc_inode = hostfs_alloc_inode, 271 .alloc_inode = hostfs_alloc_inode,
342 .drop_inode = generic_delete_inode,
343 .delete_inode = hostfs_delete_inode,
344 .destroy_inode = hostfs_destroy_inode, 272 .destroy_inode = hostfs_destroy_inode,
273 .evict_inode = hostfs_evict_inode,
345 .statfs = hostfs_statfs, 274 .statfs = hostfs_statfs,
346 .show_options = hostfs_show_options, 275 .show_options = hostfs_show_options,
347}; 276};
@@ -353,11 +282,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
353 unsigned long long next, ino; 282 unsigned long long next, ino;
354 int error, len; 283 int error, len;
355 284
356 name = dentry_name(file->f_path.dentry, 0); 285 name = dentry_name(file->f_path.dentry);
357 if (name == NULL) 286 if (name == NULL)
358 return -ENOMEM; 287 return -ENOMEM;
359 dir = open_dir(name, &error); 288 dir = open_dir(name, &error);
360 kfree(name); 289 __putname(name);
361 if (dir == NULL) 290 if (dir == NULL)
362 return -error; 291 return -error;
363 next = file->f_pos; 292 next = file->f_pos;
@@ -373,47 +302,66 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
373 302
374int hostfs_file_open(struct inode *ino, struct file *file) 303int hostfs_file_open(struct inode *ino, struct file *file)
375{ 304{
305 static DEFINE_MUTEX(open_mutex);
376 char *name; 306 char *name;
377 fmode_t mode = 0; 307 fmode_t mode = 0;
308 int err;
378 int r = 0, w = 0, fd; 309 int r = 0, w = 0, fd;
379 310
380 mode = file->f_mode & (FMODE_READ | FMODE_WRITE); 311 mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
381 if ((mode & HOSTFS_I(ino)->mode) == mode) 312 if ((mode & HOSTFS_I(ino)->mode) == mode)
382 return 0; 313 return 0;
383 314
384 /* 315 mode |= HOSTFS_I(ino)->mode;
385 * The file may already have been opened, but with the wrong access,
386 * so this resets things and reopens the file with the new access.
387 */
388 if (HOSTFS_I(ino)->fd != -1) {
389 close_file(&HOSTFS_I(ino)->fd);
390 HOSTFS_I(ino)->fd = -1;
391 }
392 316
393 HOSTFS_I(ino)->mode |= mode; 317retry:
394 if (HOSTFS_I(ino)->mode & FMODE_READ) 318 if (mode & FMODE_READ)
395 r = 1; 319 r = 1;
396 if (HOSTFS_I(ino)->mode & FMODE_WRITE) 320 if (mode & FMODE_WRITE)
397 w = 1; 321 w = 1;
398 if (w) 322 if (w)
399 r = 1; 323 r = 1;
400 324
401 name = dentry_name(file->f_path.dentry, 0); 325 name = dentry_name(file->f_path.dentry);
402 if (name == NULL) 326 if (name == NULL)
403 return -ENOMEM; 327 return -ENOMEM;
404 328
405 fd = open_file(name, r, w, append); 329 fd = open_file(name, r, w, append);
406 kfree(name); 330 __putname(name);
407 if (fd < 0) 331 if (fd < 0)
408 return fd; 332 return fd;
409 FILE_HOSTFS_I(file)->fd = fd; 333
334 mutex_lock(&open_mutex);
335 /* somebody else had handled it first? */
336 if ((mode & HOSTFS_I(ino)->mode) == mode) {
337 mutex_unlock(&open_mutex);
338 return 0;
339 }
340 if ((mode | HOSTFS_I(ino)->mode) != mode) {
341 mode |= HOSTFS_I(ino)->mode;
342 mutex_unlock(&open_mutex);
343 close_file(&fd);
344 goto retry;
345 }
346 if (HOSTFS_I(ino)->fd == -1) {
347 HOSTFS_I(ino)->fd = fd;
348 } else {
349 err = replace_file(fd, HOSTFS_I(ino)->fd);
350 close_file(&fd);
351 if (err < 0) {
352 mutex_unlock(&open_mutex);
353 return err;
354 }
355 }
356 HOSTFS_I(ino)->mode = mode;
357 mutex_unlock(&open_mutex);
410 358
411 return 0; 359 return 0;
412} 360}
413 361
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 362int hostfs_fsync(struct file *file, int datasync)
415{ 363{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 364 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 365}
418 366
419static const struct file_operations hostfs_file_fops = { 367static const struct file_operations hostfs_file_fops = {
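
The hostfs_file_open() rewrite above handles a file first opened read-only
and later opened for writing: rather than closing and reopening the shared
host fd (which would yank it out from under concurrent readers), it opens a
second fd with the widened mode and splices it in with replace_file(),
retrying under open_mutex if another opener widened the mode first.
replace_file() is simply dup2() on the host side, and dup2()'s atomic
replacement is the whole point; a userspace illustration:

#include <unistd.h>

/* dup2() atomically makes `fd` refer to the same open file
 * description as `wider_fd`, so code already holding `fd` switches
 * to the wider-mode descriptor without ever seeing it closed. */
static int replace_fd(int wider_fd, int fd)
{
        return dup2(wider_fd, fd);
}
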
@@ -544,54 +492,50 @@ static const struct address_space_operations hostfs_aops = {
544 .write_end = hostfs_write_end, 492 .write_end = hostfs_write_end,
545}; 493};
546 494
547static int init_inode(struct inode *inode, struct dentry *dentry) 495static int read_name(struct inode *ino, char *name)
548{ 496{
549 char *name; 497 dev_t rdev;
550 int type, err = -ENOMEM; 498 struct hostfs_stat st;
551 int maj, min; 499 int err = stat_file(name, &st, -1);
552 dev_t rdev = 0; 500 if (err)
501 return err;
553 502
554 if (dentry) { 503 /* Reencode maj and min with the kernel encoding.*/
555 name = dentry_name(dentry, 0); 504 rdev = MKDEV(st.maj, st.min);
556 if (name == NULL)
557 goto out;
558 type = file_type(name, &maj, &min);
559 /* Reencode maj and min with the kernel encoding.*/
560 rdev = MKDEV(maj, min);
561 kfree(name);
562 }
563 else type = OS_TYPE_DIR;
564 505
565 err = 0; 506 switch (st.mode & S_IFMT) {
566 if (type == OS_TYPE_SYMLINK) 507 case S_IFLNK:
567 inode->i_op = &page_symlink_inode_operations; 508 ino->i_op = &hostfs_link_iops;
568 else if (type == OS_TYPE_DIR)
569 inode->i_op = &hostfs_dir_iops;
570 else inode->i_op = &hostfs_iops;
571
572 if (type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops;
573 else inode->i_fop = &hostfs_file_fops;
574
575 if (type == OS_TYPE_SYMLINK)
576 inode->i_mapping->a_ops = &hostfs_link_aops;
577 else inode->i_mapping->a_ops = &hostfs_aops;
578
579 switch (type) {
580 case OS_TYPE_CHARDEV:
581 init_special_inode(inode, S_IFCHR, rdev);
582 break; 509 break;
583 case OS_TYPE_BLOCKDEV: 510 case S_IFDIR:
584 init_special_inode(inode, S_IFBLK, rdev); 511 ino->i_op = &hostfs_dir_iops;
512 ino->i_fop = &hostfs_dir_fops;
585 break; 513 break;
586 case OS_TYPE_FIFO: 514 case S_IFCHR:
587 init_special_inode(inode, S_IFIFO, 0); 515 case S_IFBLK:
516 case S_IFIFO:
517 case S_IFSOCK:
518 init_special_inode(ino, st.mode & S_IFMT, rdev);
519 ino->i_op = &hostfs_iops;
588 break; 520 break;
589 case OS_TYPE_SOCK: 521
590 init_special_inode(inode, S_IFSOCK, 0); 522 default:
591 break; 523 ino->i_op = &hostfs_iops;
592 } 524 ino->i_fop = &hostfs_file_fops;
593 out: 525 ino->i_mapping->a_ops = &hostfs_aops;
594 return err; 526 }
527
528 ino->i_ino = st.ino;
529 ino->i_mode = st.mode;
530 ino->i_nlink = st.nlink;
531 ino->i_uid = st.uid;
532 ino->i_gid = st.gid;
533 ino->i_atime = st.atime;
534 ino->i_mtime = st.mtime;
535 ino->i_ctime = st.ctime;
536 ino->i_size = st.size;
537 ino->i_blocks = st.blocks;
538 return 0;
595} 539}
596 540
597int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, 541int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
@@ -607,12 +551,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
607 goto out; 551 goto out;
608 } 552 }
609 553
610 error = init_inode(inode, dentry);
611 if (error)
612 goto out_put;
613
614 error = -ENOMEM; 554 error = -ENOMEM;
615 name = dentry_name(dentry, 0); 555 name = dentry_name(dentry);
616 if (name == NULL) 556 if (name == NULL)
617 goto out_put; 557 goto out_put;
618 558
@@ -622,9 +562,10 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, int mode,
622 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); 562 mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
623 if (fd < 0) 563 if (fd < 0)
624 error = fd; 564 error = fd;
625 else error = read_name(inode, name); 565 else
566 error = read_name(inode, name);
626 567
627 kfree(name); 568 __putname(name);
628 if (error) 569 if (error)
629 goto out_put; 570 goto out_put;
630 571
@@ -652,17 +593,14 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
652 goto out; 593 goto out;
653 } 594 }
654 595
655 err = init_inode(inode, dentry);
656 if (err)
657 goto out_put;
658
659 err = -ENOMEM; 596 err = -ENOMEM;
660 name = dentry_name(dentry, 0); 597 name = dentry_name(dentry);
661 if (name == NULL) 598 if (name == NULL)
662 goto out_put; 599 goto out_put;
663 600
664 err = read_name(inode, name); 601 err = read_name(inode, name);
665 kfree(name); 602
603 __putname(name);
666 if (err == -ENOENT) { 604 if (err == -ENOENT) {
667 iput(inode); 605 iput(inode);
668 inode = NULL; 606 inode = NULL;
@@ -680,36 +618,21 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
680 return ERR_PTR(err); 618 return ERR_PTR(err);
681} 619}
682 620
683static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
684{
685 char *file;
686 int len;
687
688 file = inode_name(ino, dentry->d_name.len + 1);
689 if (file == NULL)
690 return NULL;
691 strcat(file, "/");
692 len = strlen(file);
693 strncat(file, dentry->d_name.name, dentry->d_name.len);
694 file[len + dentry->d_name.len] = '\0';
695 return file;
696}
697
698int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) 621int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
699{ 622{
700 char *from_name, *to_name; 623 char *from_name, *to_name;
701 int err; 624 int err;
702 625
703 if ((from_name = inode_dentry_name(ino, from)) == NULL) 626 if ((from_name = dentry_name(from)) == NULL)
704 return -ENOMEM; 627 return -ENOMEM;
705 to_name = dentry_name(to, 0); 628 to_name = dentry_name(to);
706 if (to_name == NULL) { 629 if (to_name == NULL) {
707 kfree(from_name); 630 __putname(from_name);
708 return -ENOMEM; 631 return -ENOMEM;
709 } 632 }
710 err = link_file(to_name, from_name); 633 err = link_file(to_name, from_name);
711 kfree(from_name); 634 __putname(from_name);
712 kfree(to_name); 635 __putname(to_name);
713 return err; 636 return err;
714} 637}
715 638
@@ -718,13 +641,14 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
718 char *file; 641 char *file;
719 int err; 642 int err;
720 643
721 if ((file = inode_dentry_name(ino, dentry)) == NULL)
722 return -ENOMEM;
723 if (append) 644 if (append)
724 return -EPERM; 645 return -EPERM;
725 646
647 if ((file = dentry_name(dentry)) == NULL)
648 return -ENOMEM;
649
726 err = unlink_file(file); 650 err = unlink_file(file);
727 kfree(file); 651 __putname(file);
728 return err; 652 return err;
729} 653}
730 654
@@ -733,10 +657,10 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
733 char *file; 657 char *file;
734 int err; 658 int err;
735 659
736 if ((file = inode_dentry_name(ino, dentry)) == NULL) 660 if ((file = dentry_name(dentry)) == NULL)
737 return -ENOMEM; 661 return -ENOMEM;
738 err = make_symlink(file, to); 662 err = make_symlink(file, to);
739 kfree(file); 663 __putname(file);
740 return err; 664 return err;
741} 665}
742 666
@@ -745,10 +669,10 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
745 char *file; 669 char *file;
746 int err; 670 int err;
747 671
748 if ((file = inode_dentry_name(ino, dentry)) == NULL) 672 if ((file = dentry_name(dentry)) == NULL)
749 return -ENOMEM; 673 return -ENOMEM;
750 err = do_mkdir(file, mode); 674 err = do_mkdir(file, mode);
751 kfree(file); 675 __putname(file);
752 return err; 676 return err;
753} 677}
754 678
@@ -757,10 +681,10 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
757 char *file; 681 char *file;
758 int err; 682 int err;
759 683
760 if ((file = inode_dentry_name(ino, dentry)) == NULL) 684 if ((file = dentry_name(dentry)) == NULL)
761 return -ENOMEM; 685 return -ENOMEM;
762 err = do_rmdir(file); 686 err = do_rmdir(file);
763 kfree(file); 687 __putname(file);
764 return err; 688 return err;
765} 689}
766 690
@@ -776,22 +700,20 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
776 goto out; 700 goto out;
777 } 701 }
778 702
779 err = init_inode(inode, dentry);
780 if (err)
781 goto out_put;
782
783 err = -ENOMEM; 703 err = -ENOMEM;
784 name = dentry_name(dentry, 0); 704 name = dentry_name(dentry);
785 if (name == NULL) 705 if (name == NULL)
786 goto out_put; 706 goto out_put;
787 707
788 init_special_inode(inode, mode, dev); 708 init_special_inode(inode, mode, dev);
789 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); 709 err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
790 if (err) 710 if (err)
791 goto out_free; 711 goto out_free;
792 712
793 err = read_name(inode, name); 713 err = read_name(inode, name);
794 kfree(name); 714 __putname(name);
795 if (err) 717 if (err)
796 goto out_put; 718 goto out_put;
797 719
@@ -799,7 +721,7 @@ int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
799 return 0; 721 return 0;
800 722
801 out_free: 723 out_free:
802 kfree(name); 724 __putname(name);
803 out_put: 725 out_put:
804 iput(inode); 726 iput(inode);
805 out: 727 out:
@@ -812,15 +734,15 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
812 char *from_name, *to_name; 734 char *from_name, *to_name;
813 int err; 735 int err;
814 736
815 if ((from_name = inode_dentry_name(from_ino, from)) == NULL) 737 if ((from_name = dentry_name(from)) == NULL)
816 return -ENOMEM; 738 return -ENOMEM;
817 if ((to_name = inode_dentry_name(to_ino, to)) == NULL) { 739 if ((to_name = dentry_name(to)) == NULL) {
818 kfree(from_name); 740 __putname(from_name);
819 return -ENOMEM; 741 return -ENOMEM;
820 } 742 }
821 err = rename_file(from_name, to_name); 743 err = rename_file(from_name, to_name);
822 kfree(from_name); 744 __putname(from_name);
823 kfree(to_name); 745 __putname(to_name);
824 return err; 746 return err;
825} 747}
826 748
@@ -832,7 +754,7 @@ int hostfs_permission(struct inode *ino, int desired)
832 if (desired & MAY_READ) r = 1; 754 if (desired & MAY_READ) r = 1;
833 if (desired & MAY_WRITE) w = 1; 755 if (desired & MAY_WRITE) w = 1;
834 if (desired & MAY_EXEC) x = 1; 756 if (desired & MAY_EXEC) x = 1;
835 name = inode_name(ino, 0); 757 name = inode_name(ino);
836 if (name == NULL) 758 if (name == NULL)
837 return -ENOMEM; 759 return -ENOMEM;
838 760
@@ -841,7 +763,7 @@ int hostfs_permission(struct inode *ino, int desired)
841 err = 0; 763 err = 0;
842 else 764 else
843 err = access_file(name, r, w, x); 765 err = access_file(name, r, w, x);
844 kfree(name); 766 __putname(name);
845 if (!err) 767 if (!err)
846 err = generic_permission(ino, desired, NULL); 768 err = generic_permission(ino, desired, NULL);
847 return err; 769 return err;
@@ -849,13 +771,14 @@ int hostfs_permission(struct inode *ino, int desired)
849 771
850int hostfs_setattr(struct dentry *dentry, struct iattr *attr) 772int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
851{ 773{
774 struct inode *inode = dentry->d_inode;
852 struct hostfs_iattr attrs; 775 struct hostfs_iattr attrs;
853 char *name; 776 char *name;
854 int err; 777 int err;
855 778
856 int fd = HOSTFS_I(dentry->d_inode)->fd; 779 int fd = HOSTFS_I(inode)->fd;
857 780
858 err = inode_change_ok(dentry->d_inode, attr); 781 err = inode_change_ok(inode, attr);
859 if (err) 782 if (err)
860 return err; 783 return err;
861 784
@@ -897,15 +820,26 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
897 if (attr->ia_valid & ATTR_MTIME_SET) { 820 if (attr->ia_valid & ATTR_MTIME_SET) {
898 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; 821 attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
899 } 822 }
900 name = dentry_name(dentry, 0); 823 name = dentry_name(dentry);
901 if (name == NULL) 824 if (name == NULL)
902 return -ENOMEM; 825 return -ENOMEM;
903 err = set_attr(name, &attrs, fd); 826 err = set_attr(name, &attrs, fd);
904 kfree(name); 827 __putname(name);
905 if (err) 828 if (err)
906 return err; 829 return err;
907 830
908 return inode_setattr(dentry->d_inode, attr); 831 if ((attr->ia_valid & ATTR_SIZE) &&
832 attr->ia_size != i_size_read(inode)) {
833 int error;
834
835 error = vmtruncate(inode, attr->ia_size);
836 if (error)
837 return error;
838 }
839
840 setattr_copy(inode, attr);
841 mark_inode_dirty(inode);
842 return 0;
909} 843}
910 844
911static const struct inode_operations hostfs_iops = { 845static const struct inode_operations hostfs_iops = {
@@ -935,32 +869,41 @@ static const struct inode_operations hostfs_dir_iops = {
935 .setattr = hostfs_setattr, 869 .setattr = hostfs_setattr,
936}; 870};
937 871
938int hostfs_link_readpage(struct file *file, struct page *page) 872static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
939{ 873{
940 char *buffer, *name; 874 char *link = __getname();
941 int err; 875 if (link) {
942 876 char *path = dentry_name(dentry);
943 buffer = kmap(page); 877 int err = -ENOMEM;
944 name = inode_name(page->mapping->host, 0); 878 if (path) {
945 if (name == NULL) 879 err = hostfs_do_readlink(path, link, PATH_MAX);
946 return -ENOMEM; 880 if (err == PATH_MAX)
947 err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE); 881 err = -E2BIG;
948 kfree(name); 882 __putname(path);
949 if (err == PAGE_CACHE_SIZE) 883 }
950 err = -E2BIG; 884 if (err < 0) {
951 else if (err > 0) { 885 __putname(link);
952 flush_dcache_page(page); 886 link = ERR_PTR(err);
953 SetPageUptodate(page); 887 }
954 if (PageError(page)) ClearPageError(page); 888 } else {
955 err = 0; 889 link = ERR_PTR(-ENOMEM);
956 } 890 }
957 kunmap(page); 891
958 unlock_page(page); 892 nd_set_link(nd, link);
959 return err; 893 return NULL;
960} 894}
961 895
962static const struct address_space_operations hostfs_link_aops = { 896static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
963 .readpage = hostfs_link_readpage, 897{
898 char *s = nd_get_link(nd);
899 if (!IS_ERR(s))
900 __putname(s);
901}
902
903static const struct inode_operations hostfs_link_iops = {
904 .readlink = generic_readlink,
905 .follow_link = hostfs_follow_link,
906 .put_link = hostfs_put_link,
964}; 907};
965 908
966static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) 909static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
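
The hunk above retires hostfs's page-cache-based readlink in favour of the
->follow_link/->put_link pair: follow_link allocates a PATH_MAX buffer,
fills it (or turns it into an ERR_PTR), and publishes it to the VFS with
nd_set_link(); put_link then frees whatever follow_link left behind. The
contract reduced to its skeleton:

/* Skeleton of the follow_link/put_link contract used above. */
static void *example_follow_link(struct dentry *dentry,
                                 struct nameidata *nd)
{
        char *link = __getname();

        if (!link)
                link = ERR_PTR(-ENOMEM);
        /* ... otherwise fill link, or replace it with ERR_PTR(err) ... */
        nd_set_link(nd, link);  /* the VFS walks this string next */
        return NULL;            /* cookie handed back to put_link */
}

static void example_put_link(struct dentry *dentry, struct nameidata *nd,
                             void *cookie)
{
        char *s = nd_get_link(nd);

        if (!IS_ERR(s))
                __putname(s);   /* release what follow_link allocated */
}
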
@@ -980,49 +923,41 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
980 req_root = ""; 923 req_root = "";
981 924
982 err = -ENOMEM; 925 err = -ENOMEM;
983 host_root_path = kmalloc(strlen(root_ino) + 1 926 sb->s_fs_info = host_root_path =
984 + strlen(req_root) + 1, GFP_KERNEL); 927 kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
985 if (host_root_path == NULL) 928 if (host_root_path == NULL)
986 goto out; 929 goto out;
987 930
988 sprintf(host_root_path, "%s/%s", root_ino, req_root); 931 sprintf(host_root_path, "%s/%s", root_ino, req_root);
989 932
990 root_inode = hostfs_iget(sb); 933 root_inode = new_inode(sb);
991 if (IS_ERR(root_inode)) { 934 if (!root_inode)
992 err = PTR_ERR(root_inode); 935 goto out;
993 goto out_free;
994 }
995 936
996 err = init_inode(root_inode, NULL); 937 err = read_name(root_inode, host_root_path);
997 if (err) 938 if (err)
998 goto out_put; 939 goto out_put;
999 940
1000 HOSTFS_I(root_inode)->host_filename = host_root_path; 941 if (S_ISLNK(root_inode->i_mode)) {
1001 /* 942 char *name = follow_link(host_root_path);
1002 * Avoid that in the error path, iput(root_inode) frees again 943 if (IS_ERR(name))
1003 * host_root_path through hostfs_destroy_inode! 944 err = PTR_ERR(name);
1004 */ 945 else
1005 host_root_path = NULL; 946 err = read_name(root_inode, name);
947 kfree(name);
948 if (err)
949 goto out_put;
950 }
1006 951
1007 err = -ENOMEM; 952 err = -ENOMEM;
1008 sb->s_root = d_alloc_root(root_inode); 953 sb->s_root = d_alloc_root(root_inode);
1009 if (sb->s_root == NULL) 954 if (sb->s_root == NULL)
1010 goto out_put; 955 goto out_put;
1011 956
1012 err = hostfs_read_inode(root_inode);
1013 if (err) {
1014 /* No iput in this case because the dput does that for us */
1015 dput(sb->s_root);
1016 sb->s_root = NULL;
1017 goto out;
1018 }
1019
1020 return 0; 957 return 0;
1021 958
1022out_put: 959out_put:
1023 iput(root_inode); 960 iput(root_inode);
1024out_free:
1025 kfree(host_root_path);
1026out: 961out:
1027 return err; 962 return err;
1028} 963}
@@ -1034,11 +969,17 @@ static int hostfs_read_sb(struct file_system_type *type,
1034 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); 969 return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
1035} 970}
1036 971
972static void hostfs_kill_sb(struct super_block *s)
973{
974 kill_anon_super(s);
975 kfree(s->s_fs_info);
976}
977
1037static struct file_system_type hostfs_type = { 978static struct file_system_type hostfs_type = {
1038 .owner = THIS_MODULE, 979 .owner = THIS_MODULE,
1039 .name = "hostfs", 980 .name = "hostfs",
1040 .get_sb = hostfs_read_sb, 981 .get_sb = hostfs_read_sb,
1041 .kill_sb = kill_anon_super, 982 .kill_sb = hostfs_kill_sb,
1042 .fs_flags = 0, 983 .fs_flags = 0,
1043}; 984};
1044 985
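
With the host root path now hung off sb->s_fs_info instead of the root
inode, nothing in the generic teardown frees it, so the final hunk above
adds a ->kill_sb that releases it only after kill_anon_super() has run;
freeing it earlier would leave ->show_options and statfs reading freed
memory during unmount. The pattern for any per-superblock allocation:

/* Per-sb private data must outlive the generic teardown: free it
 * after kill_anon_super() has disposed of everything that might
 * still reference it. */
static void example_kill_sb(struct super_block *s)
{
        kill_anon_super(s);
        kfree(s->s_fs_info);
}
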
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index b79424f93282..6777aa06ce2c 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -19,11 +19,27 @@
19#include "user.h" 19#include "user.h"
20#include <utime.h> 20#include <utime.h>
21 21
22int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, 22static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
23 int *nlink_out, int *uid_out, int *gid_out, 23{
24 unsigned long long *size_out, struct timespec *atime_out, 24 p->ino = buf->st_ino;
25 struct timespec *mtime_out, struct timespec *ctime_out, 25 p->mode = buf->st_mode;
26 int *blksize_out, unsigned long long *blocks_out, int fd) 26 p->nlink = buf->st_nlink;
27 p->uid = buf->st_uid;
28 p->gid = buf->st_gid;
29 p->size = buf->st_size;
30 p->atime.tv_sec = buf->st_atime;
31 p->atime.tv_nsec = 0;
32 p->ctime.tv_sec = buf->st_ctime;
33 p->ctime.tv_nsec = 0;
34 p->mtime.tv_sec = buf->st_mtime;
35 p->mtime.tv_nsec = 0;
36 p->blksize = buf->st_blksize;
37 p->blocks = buf->st_blocks;
38 p->maj = os_major(buf->st_rdev);
39 p->min = os_minor(buf->st_rdev);
40}
41
42int stat_file(const char *path, struct hostfs_stat *p, int fd)
27{ 43{
28 struct stat64 buf; 44 struct stat64 buf;
29 45
@@ -33,68 +49,10 @@ int stat_file(const char *path, unsigned long long *inode_out, int *mode_out,
33 } else if (lstat64(path, &buf) < 0) { 49 } else if (lstat64(path, &buf) < 0) {
34 return -errno; 50 return -errno;
35 } 51 }
36 52 stat64_to_hostfs(&buf, p);
37 if (inode_out != NULL)
38 *inode_out = buf.st_ino;
39 if (mode_out != NULL)
40 *mode_out = buf.st_mode;
41 if (nlink_out != NULL)
42 *nlink_out = buf.st_nlink;
43 if (uid_out != NULL)
44 *uid_out = buf.st_uid;
45 if (gid_out != NULL)
46 *gid_out = buf.st_gid;
47 if (size_out != NULL)
48 *size_out = buf.st_size;
49 if (atime_out != NULL) {
50 atime_out->tv_sec = buf.st_atime;
51 atime_out->tv_nsec = 0;
52 }
53 if (mtime_out != NULL) {
54 mtime_out->tv_sec = buf.st_mtime;
55 mtime_out->tv_nsec = 0;
56 }
57 if (ctime_out != NULL) {
58 ctime_out->tv_sec = buf.st_ctime;
59 ctime_out->tv_nsec = 0;
60 }
61 if (blksize_out != NULL)
62 *blksize_out = buf.st_blksize;
63 if (blocks_out != NULL)
64 *blocks_out = buf.st_blocks;
65 return 0; 53 return 0;
66} 54}
67 55
68int file_type(const char *path, int *maj, int *min)
69{
70 struct stat64 buf;
71
72 if (lstat64(path, &buf) < 0)
73 return -errno;
74 /*
75 * We cannot pass rdev as is because glibc and the kernel disagree
76 * about its definition.
77 */
78 if (maj != NULL)
79 *maj = major(buf.st_rdev);
80 if (min != NULL)
81 *min = minor(buf.st_rdev);
82
83 if (S_ISDIR(buf.st_mode))
84 return OS_TYPE_DIR;
85 else if (S_ISLNK(buf.st_mode))
86 return OS_TYPE_SYMLINK;
87 else if (S_ISCHR(buf.st_mode))
88 return OS_TYPE_CHARDEV;
89 else if (S_ISBLK(buf.st_mode))
90 return OS_TYPE_BLOCKDEV;
91 else if (S_ISFIFO(buf.st_mode))
92 return OS_TYPE_FIFO;
93 else if (S_ISSOCK(buf.st_mode))
94 return OS_TYPE_SOCK;
95 else return OS_TYPE_FILE;
96}
97
98int access_file(char *path, int r, int w, int x) 56int access_file(char *path, int r, int w, int x)
99{ 57{
100 int mode = 0; 58 int mode = 0;
@@ -202,6 +160,11 @@ int fsync_file(int fd, int datasync)
202 return 0; 160 return 0;
203} 161}
204 162
163int replace_file(int oldfd, int fd)
164{
165 return dup2(oldfd, fd);
166}
167
205void close_file(void *stream) 168void close_file(void *stream)
206{ 169{
207 close(*((int *) stream)); 170 close(*((int *) stream));
@@ -235,8 +198,8 @@ int file_create(char *name, int ur, int uw, int ux, int gr,
235 198
236int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) 199int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
237{ 200{
201 struct hostfs_stat st;
238 struct timeval times[2]; 202 struct timeval times[2];
239 struct timespec atime_ts, mtime_ts;
240 int err, ma; 203 int err, ma;
241 204
242 if (attrs->ia_valid & HOSTFS_ATTR_MODE) { 205 if (attrs->ia_valid & HOSTFS_ATTR_MODE) {
@@ -279,15 +242,14 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
279 */ 242 */
280 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET); 243 ma = (HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET);
281 if (attrs->ia_valid & ma) { 244 if (attrs->ia_valid & ma) {
282 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 245 err = stat_file(file, &st, fd);
283 &atime_ts, &mtime_ts, NULL, NULL, NULL, fd);
284 if (err != 0) 246 if (err != 0)
285 return err; 247 return err;
286 248
287 times[0].tv_sec = atime_ts.tv_sec; 249 times[0].tv_sec = st.atime.tv_sec;
288 times[0].tv_usec = atime_ts.tv_nsec / 1000; 250 times[0].tv_usec = st.atime.tv_nsec / 1000;
289 times[1].tv_sec = mtime_ts.tv_sec; 251 times[1].tv_sec = st.mtime.tv_sec;
290 times[1].tv_usec = mtime_ts.tv_nsec / 1000; 252 times[1].tv_usec = st.mtime.tv_nsec / 1000;
291 253
292 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { 254 if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) {
293 times[0].tv_sec = attrs->ia_atime.tv_sec; 255 times[0].tv_sec = attrs->ia_atime.tv_sec;
@@ -308,9 +270,9 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd)
308 270
309 /* Note: ctime is not handled */ 271 /* Note: ctime is not handled */
310 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) { 272 if (attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)) {
311 err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, 273 err = stat_file(file, &st, fd);
312 &attrs->ia_atime, &attrs->ia_mtime, NULL, 274 attrs->ia_atime = st.atime;
313 NULL, NULL, fd); 275 attrs->ia_mtime = st.mtime;
314 if (err != 0) 276 if (err != 0)
315 return err; 277 return err;
316 } 278 }
@@ -361,7 +323,7 @@ int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor)
361{ 323{
362 int err; 324 int err;
363 325
364 err = mknod(file, mode, makedev(major, minor)); 326 err = mknod(file, mode, os_makedev(major, minor));
365 if (err) 327 if (err)
366 return -errno; 328 return -errno;
367 return 0; 329 return 0;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..c0340887c7ea 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
@@ -97,10 +97,19 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
97 loff_t pos, unsigned len, unsigned flags, 97 loff_t pos, unsigned len, unsigned flags,
98 struct page **pagep, void **fsdata) 98 struct page **pagep, void **fsdata)
99{ 99{
100 int ret;
101
100 *pagep = NULL; 102 *pagep = NULL;
101 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 103 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
102 hpfs_get_block, 104 hpfs_get_block,
103 &hpfs_i(mapping->host)->mmu_private); 105 &hpfs_i(mapping->host)->mmu_private);
106 if (unlikely(ret)) {
107 loff_t isize = mapping->host->i_size;
108 if (pos + len > isize)
109 vmtruncate(mapping->host, isize);
110 }
111
112 return ret;
104} 113}
105 114
106static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) 115static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..b59eac0232a0 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
@@ -281,7 +281,7 @@ void hpfs_write_inode(struct inode *);
281void hpfs_write_inode_nolock(struct inode *); 281void hpfs_write_inode_nolock(struct inode *);
282int hpfs_setattr(struct dentry *, struct iattr *); 282int hpfs_setattr(struct dentry *, struct iattr *);
283void hpfs_write_if_changed(struct inode *); 283void hpfs_write_if_changed(struct inode *);
284void hpfs_delete_inode(struct inode *); 284void hpfs_evict_inode(struct inode *);
285 285
286/* map.c */ 286/* map.c */
287 287
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1042a9bc97f3..56f0da1cfd10 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,15 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
277 if (error) 277 if (error)
278 goto out_unlock; 278 goto out_unlock;
279 279
280 error = inode_setattr(inode, attr); 280 if ((attr->ia_valid & ATTR_SIZE) &&
281 if (error) 281 attr->ia_size != i_size_read(inode)) {
282 goto out_unlock; 282 error = vmtruncate(inode, attr->ia_size);
283 if (error)
284 goto out_unlock;
285 }
286
287 setattr_copy(inode, attr);
288 mark_inode_dirty(inode);
283 289
284 hpfs_write_inode(inode); 290 hpfs_write_inode(inode);
285 291
@@ -296,11 +302,13 @@ void hpfs_write_if_changed(struct inode *inode)
296 hpfs_write_inode(inode); 302 hpfs_write_inode(inode);
297} 303}
298 304
299void hpfs_delete_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
300{ 306{
301 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages(&inode->i_data, 0);
302 lock_kernel(); 308 end_writeback(inode);
303 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 309 if (!inode->i_nlink) {
304 unlock_kernel(); 310 lock_kernel();
305 clear_inode(inode); 311 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel();
313 }
306} 314}
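
hpfs_evict_inode() above shows the second half of the conversion:
->evict_inode also absorbs ->delete_inode, so it now runs for every inode
being dropped, not only unlinked ones, and the filesystem must check
i_nlink itself before destroying on-disk state:

/* ->evict_inode runs on every final iput(); only free on-disk
 * metadata once the last link is actually gone. */
static void example_evict_and_delete(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
        if (!inode->i_nlink) {
                /* unlinked: release the on-disk inode here */
        }
}
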
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index aa53842c599c..2607010be2fe 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -450,7 +450,7 @@ static const struct super_operations hpfs_sops =
450{ 450{
451 .alloc_inode = hpfs_alloc_inode, 451 .alloc_inode = hpfs_alloc_inode,
452 .destroy_inode = hpfs_destroy_inode, 452 .destroy_inode = hpfs_destroy_inode,
453 .delete_inode = hpfs_delete_inode, 453 .evict_inode = hpfs_evict_inode,
454 .put_super = hpfs_put_super, 454 .put_super = hpfs_put_super,
455 .statfs = hpfs_statfs, 455 .statfs = hpfs_statfs,
456 .remount_fs = hpfs_remount_fs, 456 .remount_fs = hpfs_remount_fs,
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..7b027720d820 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include "os.h" 20#include "os.h"
20 21
@@ -587,7 +588,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 588 return err;
588} 589}
589 590
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 591static int hppfs_fsync(struct file *file, int datasync)
591{ 592{
592 return 0; 593 return 0;
593} 594}
@@ -623,12 +624,11 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
623 return &hi->vfs_inode; 624 return &hi->vfs_inode;
624} 625}
625 626
626void hppfs_delete_inode(struct inode *ino) 627void hppfs_evict_inode(struct inode *ino)
627{ 628{
629 end_writeback(ino);
628 dput(HPPFS_I(ino)->proc_dentry); 630 dput(HPPFS_I(ino)->proc_dentry);
629 mntput(ino->i_sb->s_fs_info); 631 mntput(ino->i_sb->s_fs_info);
630
631 clear_inode(ino);
632} 632}
633 633
634static void hppfs_destroy_inode(struct inode *inode) 634static void hppfs_destroy_inode(struct inode *inode)
@@ -639,7 +639,7 @@ static void hppfs_destroy_inode(struct inode *inode)
639static const struct super_operations hppfs_sbops = { 639static const struct super_operations hppfs_sbops = {
640 .alloc_inode = hppfs_alloc_inode, 640 .alloc_inode = hppfs_alloc_inode,
641 .destroy_inode = hppfs_destroy_inode, 641 .destroy_inode = hppfs_destroy_inode,
642 .delete_inode = hppfs_delete_inode, 642 .evict_inode = hppfs_evict_inode,
643 .statfs = hppfs_statfs, 643 .statfs = hppfs_statfs,
644}; 644};
645 645
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..6e5bd42f3860 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -371,27 +371,10 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
371 hugetlb_unreserve_pages(inode, start, freed); 371 hugetlb_unreserve_pages(inode, start, freed);
372} 372}
373 373
374static void hugetlbfs_delete_inode(struct inode *inode) 374static void hugetlbfs_evict_inode(struct inode *inode)
375{ 375{
376 truncate_hugepages(inode, 0); 376 truncate_hugepages(inode, 0);
377 clear_inode(inode); 377 end_writeback(inode);
378}
379
380static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
381{
382 if (generic_detach_inode(inode)) {
383 truncate_hugepages(inode, 0);
384 clear_inode(inode);
385 destroy_inode(inode);
386 }
387}
388
389static void hugetlbfs_drop_inode(struct inode *inode)
390{
391 if (!inode->i_nlink)
392 generic_delete_inode(inode);
393 else
394 hugetlbfs_forget_inode(inode);
395} 378}
396 379
397static inline void 380static inline void
@@ -448,19 +431,20 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
448 431
449 error = inode_change_ok(inode, attr); 432 error = inode_change_ok(inode, attr);
450 if (error) 433 if (error)
451 goto out; 434 return error;
452 435
453 if (ia_valid & ATTR_SIZE) { 436 if (ia_valid & ATTR_SIZE) {
454 error = -EINVAL; 437 error = -EINVAL;
455 if (!(attr->ia_size & ~huge_page_mask(h))) 438 if (attr->ia_size & ~huge_page_mask(h))
456 error = hugetlb_vmtruncate(inode, attr->ia_size); 439 return -EINVAL;
440 error = hugetlb_vmtruncate(inode, attr->ia_size);
457 if (error) 441 if (error)
458 goto out; 442 return error;
459 attr->ia_valid &= ~ATTR_SIZE;
460 } 443 }
461 error = inode_setattr(inode, attr); 444
462out: 445 setattr_copy(inode, attr);
463 return error; 446 mark_inode_dirty(inode);
447 return 0;
464} 448}
465 449
466static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 450static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
@@ -688,7 +672,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 672const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 673 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 674 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 675 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 676 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 677};
694 678
@@ -712,9 +696,8 @@ static const struct inode_operations hugetlbfs_inode_operations = {
712static const struct super_operations hugetlbfs_ops = { 696static const struct super_operations hugetlbfs_ops = {
713 .alloc_inode = hugetlbfs_alloc_inode, 697 .alloc_inode = hugetlbfs_alloc_inode,
714 .destroy_inode = hugetlbfs_destroy_inode, 698 .destroy_inode = hugetlbfs_destroy_inode,
699 .evict_inode = hugetlbfs_evict_inode,
715 .statfs = hugetlbfs_statfs, 700 .statfs = hugetlbfs_statfs,
716 .delete_inode = hugetlbfs_delete_inode,
717 .drop_inode = hugetlbfs_drop_inode,
718 .put_super = hugetlbfs_put_super, 701 .put_super = hugetlbfs_put_super,
719 .show_options = generic_show_options, 702 .show_options = generic_show_options,
720}; 703};
diff --git a/fs/inode.c b/fs/inode.c
index aaaaf096aa8e..d4fe9c031864 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,7 +20,6 @@
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/cdev.h> 21#include <linux/cdev.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h>
24#include <linux/fsnotify.h> 23#include <linux/fsnotify.h>
25#include <linux/mount.h> 24#include <linux/mount.h>
26#include <linux/async.h> 25#include <linux/async.h>
@@ -264,12 +263,8 @@ void inode_init_once(struct inode *inode)
264 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); 263 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
265 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); 264 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
266 i_size_ordered_init(inode); 265 i_size_ordered_init(inode);
267#ifdef CONFIG_INOTIFY
268 INIT_LIST_HEAD(&inode->inotify_watches);
269 mutex_init(&inode->inotify_mutex);
270#endif
271#ifdef CONFIG_FSNOTIFY 266#ifdef CONFIG_FSNOTIFY
272 INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); 267 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
273#endif 268#endif
274 INIT_LIST_HEAD(&inode->i_obj_list); 269 INIT_LIST_HEAD(&inode->i_obj_list);
275 mutex_init(&inode->i_obj_mutex); 270 mutex_init(&inode->i_obj_mutex);
@@ -288,42 +283,42 @@ static void init_once(void *foo)
288 */ 283 */
289void __iget(struct inode *inode) 284void __iget(struct inode *inode)
290{ 285{
291 if (atomic_read(&inode->i_count)) { 286 if (atomic_inc_return(&inode->i_count) != 1)
292 atomic_inc(&inode->i_count);
293 return; 287 return;
294 } 288
295 atomic_inc(&inode->i_count);
296 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 289 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
297 list_move(&inode->i_list, &inode_in_use); 290 list_move(&inode->i_list, &inode_in_use);
298 inodes_stat.nr_unused--; 291 inodes_stat.nr_unused--;
299} 292}
300 293
301/** 294void end_writeback(struct inode *inode)
302 * clear_inode - clear an inode
303 * @inode: inode to clear
304 *
305 * This is called by the filesystem to tell us
306 * that the inode is no longer useful. We just
307 * terminate it with extreme prejudice.
308 */
309void clear_inode(struct inode *inode)
310{ 295{
311 might_sleep(); 296 might_sleep();
312 invalidate_inode_buffers(inode);
313
314 BUG_ON(inode->i_data.nrpages); 297 BUG_ON(inode->i_data.nrpages);
298 BUG_ON(!list_empty(&inode->i_data.private_list));
315 BUG_ON(!(inode->i_state & I_FREEING)); 299 BUG_ON(!(inode->i_state & I_FREEING));
316 BUG_ON(inode->i_state & I_CLEAR); 300 BUG_ON(inode->i_state & I_CLEAR);
317 inode_sync_wait(inode); 301 inode_sync_wait(inode);
318 if (inode->i_sb->s_op->clear_inode) 302 inode->i_state = I_FREEING | I_CLEAR;
319 inode->i_sb->s_op->clear_inode(inode); 303}
304EXPORT_SYMBOL(end_writeback);
305
306static void evict(struct inode *inode)
307{
308 const struct super_operations *op = inode->i_sb->s_op;
309
310 if (op->evict_inode) {
311 op->evict_inode(inode);
312 } else {
313 if (inode->i_data.nrpages)
314 truncate_inode_pages(&inode->i_data, 0);
315 end_writeback(inode);
316 }
320 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 317 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
321 bd_forget(inode); 318 bd_forget(inode);
322 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 319 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
323 cd_forget(inode); 320 cd_forget(inode);
324 inode->i_state = I_CLEAR;
325} 321}
326EXPORT_SYMBOL(clear_inode);
327 322
328/* 323/*
329 * dispose_list - dispose of the contents of a local list 324 * dispose_list - dispose of the contents of a local list
@@ -342,9 +337,7 @@ static void dispose_list(struct list_head *head)
342 inode = list_first_entry(head, struct inode, i_list); 337 inode = list_first_entry(head, struct inode, i_list);
343 list_del(&inode->i_list); 338 list_del(&inode->i_list);
344 339
345 if (inode->i_data.nrpages) 340 evict(inode);
346 truncate_inode_pages(&inode->i_data, 0);
347 clear_inode(inode);
348 341
349 spin_lock(&inode_lock); 342 spin_lock(&inode_lock);
350 hlist_del_init(&inode->i_hash); 343 hlist_del_init(&inode->i_hash);
@@ -417,7 +410,6 @@ int invalidate_inodes(struct super_block *sb)
417 410
418 down_write(&iprune_sem); 411 down_write(&iprune_sem);
419 spin_lock(&inode_lock); 412 spin_lock(&inode_lock);
420 inotify_unmount_inodes(&sb->s_inodes);
421 fsnotify_unmount_inodes(&sb->s_inodes); 413 fsnotify_unmount_inodes(&sb->s_inodes);
422 busy = invalidate_list(&sb->s_inodes, &throw_away); 414 busy = invalidate_list(&sb->s_inodes, &throw_away);
423 spin_unlock(&inode_lock); 415 spin_unlock(&inode_lock);
@@ -516,7 +508,7 @@ static void prune_icache(int nr_to_scan)
516 * This function is passed the number of inodes to scan, and it returns the 508 * This function is passed the number of inodes to scan, and it returns the
517 * total number of remaining possibly-reclaimable inodes. 509 * total number of remaining possibly-reclaimable inodes.
518 */ 510 */
519static int shrink_icache_memory(int nr, gfp_t gfp_mask) 511static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
520{ 512{
521 if (nr) { 513 if (nr) {
522 /* 514 /*
@@ -557,7 +549,7 @@ repeat:
557 continue; 549 continue;
558 if (!test(inode, data)) 550 if (!test(inode, data))
559 continue; 551 continue;
560 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 552 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
561 __wait_on_freeing_inode(inode); 553 __wait_on_freeing_inode(inode);
562 goto repeat; 554 goto repeat;
563 } 555 }
@@ -582,7 +574,7 @@ repeat:
582 continue; 574 continue;
583 if (inode->i_sb != sb) 575 if (inode->i_sb != sb)
584 continue; 576 continue;
585 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { 577 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
586 __wait_on_freeing_inode(inode); 578 __wait_on_freeing_inode(inode);
587 goto repeat; 579 goto repeat;
588 } 580 }
@@ -844,7 +836,7 @@ EXPORT_SYMBOL(iunique);
844struct inode *igrab(struct inode *inode) 836struct inode *igrab(struct inode *inode)
845{ 837{
846 spin_lock(&inode_lock); 838 spin_lock(&inode_lock);
847 if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) 839 if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
848 __iget(inode); 840 __iget(inode);
849 else 841 else
850 /* 842 /*
@@ -1093,7 +1085,7 @@ int insert_inode_locked(struct inode *inode)
1093 continue; 1085 continue;
1094 if (old->i_sb != sb) 1086 if (old->i_sb != sb)
1095 continue; 1087 continue;
1096 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1088 if (old->i_state & (I_FREEING|I_WILL_FREE))
1097 continue; 1089 continue;
1098 break; 1090 break;
1099 } 1091 }
@@ -1132,7 +1124,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1132 continue; 1124 continue;
1133 if (!test(old, data)) 1125 if (!test(old, data))
1134 continue; 1126 continue;
1135 if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) 1127 if (old->i_state & (I_FREEING|I_WILL_FREE))
1136 continue; 1128 continue;
1137 break; 1129 break;
1138 } 1130 }
@@ -1184,71 +1176,51 @@ void remove_inode_hash(struct inode *inode)
1184} 1176}
1185EXPORT_SYMBOL(remove_inode_hash); 1177EXPORT_SYMBOL(remove_inode_hash);
1186 1178
1179int generic_delete_inode(struct inode *inode)
1180{
1181 return 1;
1182}
1183EXPORT_SYMBOL(generic_delete_inode);
1184
1187/* 1185/*
1188 * Tell the filesystem that this inode is no longer of any interest and should 1186 * Normal UNIX filesystem behaviour: delete the
1189 * be completely destroyed. 1187 * inode when the usage count drops to zero, and
1190 * 1188 * i_nlink is zero.
1191 * We leave the inode in the inode hash table until *after* the filesystem's
1192 * ->delete_inode completes. This ensures that an iget (such as nfsd might
1193 * instigate) will always find up-to-date information either in the hash or on
1194 * disk.
1195 *
1196 * I_FREEING is set so that no-one will take a new reference to the inode while
1197 * it is being deleted.
1198 */ 1189 */
1199void generic_delete_inode(struct inode *inode) 1190int generic_drop_inode(struct inode *inode)
1200{ 1191{
1201 const struct super_operations *op = inode->i_sb->s_op; 1192 return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
1202
1203 list_del_init(&inode->i_list);
1204 list_del_init(&inode->i_sb_list);
1205 WARN_ON(inode->i_state & I_NEW);
1206 inode->i_state |= I_FREEING;
1207 inodes_stat.nr_inodes--;
1208 spin_unlock(&inode_lock);
1209
1210 security_inode_delete(inode);
1211
1212 if (op->delete_inode) {
1213 void (*delete)(struct inode *) = op->delete_inode;
1214 /* Filesystems implementing their own
1215 * s_op->delete_inode are required to call
1216 * truncate_inode_pages and clear_inode()
1217 * internally */
1218 delete(inode);
1219 } else {
1220 truncate_inode_pages(&inode->i_data, 0);
1221 clear_inode(inode);
1222 }
1223 spin_lock(&inode_lock);
1224 hlist_del_init(&inode->i_hash);
1225 spin_unlock(&inode_lock);
1226 wake_up_inode(inode);
1227 BUG_ON(inode->i_state != I_CLEAR);
1228 destroy_inode(inode);
1229} 1193}
1230EXPORT_SYMBOL(generic_delete_inode); 1194EXPORT_SYMBOL_GPL(generic_drop_inode);
1231 1195
1232/** 1196/*
1233 * generic_detach_inode - remove inode from inode lists 1197 * Called when we're dropping the last reference
1234 * @inode: inode to remove 1198 * to an inode.
1235 *
1236 * Remove inode from inode lists, write it if it's dirty. This is just an
1237 * internal VFS helper exported for hugetlbfs. Do not use!
1238 * 1199 *
1239 * Returns 1 if inode should be completely destroyed. 1200 * Call the FS "drop_inode()" function, defaulting to
1201 * the legacy UNIX filesystem behaviour. If it tells
1202 * us to evict inode, do so. Otherwise, retain inode
1203 * in cache if fs is alive, sync and evict if fs is
1204 * shutting down.
1240 */ 1205 */
1241int generic_detach_inode(struct inode *inode) 1206static void iput_final(struct inode *inode)
1242{ 1207{
1243 struct super_block *sb = inode->i_sb; 1208 struct super_block *sb = inode->i_sb;
1209 const struct super_operations *op = inode->i_sb->s_op;
1210 int drop;
1244 1211
1245 if (!hlist_unhashed(&inode->i_hash)) { 1212 if (op && op->drop_inode)
1213 drop = op->drop_inode(inode);
1214 else
1215 drop = generic_drop_inode(inode);
1216
1217 if (!drop) {
1246 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 1218 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1247 list_move(&inode->i_list, &inode_unused); 1219 list_move(&inode->i_list, &inode_unused);
1248 inodes_stat.nr_unused++; 1220 inodes_stat.nr_unused++;
1249 if (sb->s_flags & MS_ACTIVE) { 1221 if (sb->s_flags & MS_ACTIVE) {
1250 spin_unlock(&inode_lock); 1222 spin_unlock(&inode_lock);
1251 return 0; 1223 return;
1252 } 1224 }
1253 WARN_ON(inode->i_state & I_NEW); 1225 WARN_ON(inode->i_state & I_NEW);
1254 inode->i_state |= I_WILL_FREE; 1226 inode->i_state |= I_WILL_FREE;
@@ -1266,56 +1238,15 @@ int generic_detach_inode(struct inode *inode)
1266 inode->i_state |= I_FREEING; 1238 inode->i_state |= I_FREEING;
1267 inodes_stat.nr_inodes--; 1239 inodes_stat.nr_inodes--;
1268 spin_unlock(&inode_lock); 1240 spin_unlock(&inode_lock);
1269 return 1; 1241 evict(inode);
1270} 1242 spin_lock(&inode_lock);
1271EXPORT_SYMBOL_GPL(generic_detach_inode); 1243 hlist_del_init(&inode->i_hash);
1272 1244 spin_unlock(&inode_lock);
1273static void generic_forget_inode(struct inode *inode)
1274{
1275 if (!generic_detach_inode(inode))
1276 return;
1277 if (inode->i_data.nrpages)
1278 truncate_inode_pages(&inode->i_data, 0);
1279 clear_inode(inode);
1280 wake_up_inode(inode); 1245 wake_up_inode(inode);
1246 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1281 destroy_inode(inode); 1247 destroy_inode(inode);
1282} 1248}
1283 1249
1284/*
1285 * Normal UNIX filesystem behaviour: delete the
1286 * inode when the usage count drops to zero, and
1287 * i_nlink is zero.
1288 */
1289void generic_drop_inode(struct inode *inode)
1290{
1291 if (!inode->i_nlink)
1292 generic_delete_inode(inode);
1293 else
1294 generic_forget_inode(inode);
1295}
1296EXPORT_SYMBOL_GPL(generic_drop_inode);
1297
1298/*
1299 * Called when we're dropping the last reference
1300 * to an inode.
1301 *
1302 * Call the FS "drop()" function, defaulting to
1303 * the legacy UNIX filesystem behaviour..
1304 *
1305 * NOTE! NOTE! NOTE! We're called with the inode lock
1306 * held, and the drop function is supposed to release
1307 * the lock!
1308 */
1309static inline void iput_final(struct inode *inode)
1310{
1311 const struct super_operations *op = inode->i_sb->s_op;
1312 void (*drop)(struct inode *) = generic_drop_inode;
1313
1314 if (op && op->drop_inode)
1315 drop = op->drop_inode;
1316 drop(inode);
1317}
1318
1319/** 1250/**
1320 * iput - put an inode 1251 * iput - put an inode
1321 * @inode: inode to put 1252 * @inode: inode to put
@@ -1328,7 +1259,7 @@ static inline void iput_final(struct inode *inode)
1328void iput(struct inode *inode) 1259void iput(struct inode *inode)
1329{ 1260{
1330 if (inode) { 1261 if (inode) {
1331 BUG_ON(inode->i_state == I_CLEAR); 1262 BUG_ON(inode->i_state & I_CLEAR);
1332 1263
1333 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1264 if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
1334 iput_final(inode); 1265 iput_final(inode);
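
The restructuring above changes what ->drop_inode means: it no longer performs the teardown, it only decides, returning non-zero to evict immediately and zero to keep the inode cached, while iput_final() owns the common list handling, the evict() call, and unhashing. generic_delete_inode() accordingly shrinks to a "return 1" stub, and because end_writeback() now sets I_FREEING and I_CLEAR together, the various I_FREEING|I_CLEAR|I_WILL_FREE tests drop the I_CLEAR bit and iput()'s BUG_ON becomes a bitwise check. A hypothetical filesystem that never wants unreferenced inodes cached now needs only:

#include <linux/fs.h>

/* Hypothetical ->drop_inode: always evict on the last iput(), even if
 * i_nlink is non-zero; equivalent to pointing at generic_delete_inode. */
static int example_drop_inode(struct inode *inode)
{
	return 1;
}
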
@@ -1612,3 +1543,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1612 inode->i_ino); 1543 inode->i_ino);
1613} 1544}
1614EXPORT_SYMBOL(init_special_inode); 1545EXPORT_SYMBOL(init_special_inode);
1546
1547/**
1548 * Init uid,gid,mode for new inode according to posix standards
1549 * @inode: New inode
1550 * @dir: Directory inode
1551 * @mode: mode of the new inode
1552 */
1553void inode_init_owner(struct inode *inode, const struct inode *dir,
1554 mode_t mode)
1555{
1556 inode->i_uid = current_fsuid();
1557 if (dir && dir->i_mode & S_ISGID) {
1558 inode->i_gid = dir->i_gid;
1559 if (S_ISDIR(mode))
1560 mode |= S_ISGID;
1561 } else
1562 inode->i_gid = current_fsgid();
1563 inode->i_mode = mode;
1564}
1565EXPORT_SYMBOL(inode_init_owner);
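
The new inode_init_owner() helper hoists the POSIX ownership rules out of individual filesystems: the inode takes the creating task's fsuid, and a setgid parent directory propagates its gid (plus S_ISGID for new subdirectories). A converted ->create path shrinks to roughly the following (names hypothetical, error handling elided):

#include <linux/fs.h>

/* Sketch: ->create using the new helper (2.6.36-era method signature). */
static int example_create(struct inode *dir, struct dentry *dentry,
			  int mode, struct nameidata *nd)
{
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode)
		return -ENOMEM;
	inode_init_owner(inode, dir, mode);	/* uid, gid, mode in one call */
	/* ... fs-specific setup, then d_instantiate(dentry, inode) ... */
	return 0;
}
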
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/lglock.h>
13
12struct super_block; 14struct super_block;
13struct linux_binprm; 15struct linux_binprm;
14struct path; 16struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 72
71extern void __init mnt_init(void); 73extern void __init mnt_init(void);
72 74
73extern spinlock_t vfsmount_lock; 75DECLARE_BRLOCK(vfsmount_lock);
76
74 77
75/* 78/*
76 * fs_struct.c 79 * fs_struct.c
@@ -80,6 +83,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
80/* 83/*
81 * file_table.c 84 * file_table.c
82 */ 85 */
86extern void file_sb_list_add(struct file *f, struct super_block *sb);
87extern void file_sb_list_del(struct file *f);
83extern void mark_files_ro(struct super_block *); 88extern void mark_files_ro(struct super_block *);
84extern struct file *get_empty_filp(void); 89extern struct file *get_empty_filp(void);
85 90
@@ -87,6 +92,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 92 * super.c
88 */ 93 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 94extern int do_remount_sb(struct super_block *, int, void *, int);
95extern void __put_super(struct super_block *sb);
96extern void put_super(struct super_block *sb);
90 97
91/* 98/*
92 * open.c 99 * open.c
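
vfsmount_lock changes character here: DECLARE_BRLOCK makes it a "big reader" lock from the new <linux/lglock.h>, so hot path-walking readers touch only a per-CPU lock while mount/umount pays the cost of locking every CPU. The lglock macros take the lock name as a token rather than a pointer; usage is roughly:

#include <linux/lglock.h>

DEFINE_BRLOCK(vfsmount_lock);		/* the definition lives in fs/namespace.c */

static void example_reader(void)
{
	br_read_lock(vfsmount_lock);	/* cheap: this CPU's slot only */
	/* ... follow mount points ... */
	br_read_unlock(vfsmount_lock);
}

static void example_writer(void)
{
	br_write_lock(vfsmount_lock);	/* slow: takes every CPU's slot */
	/* ... hash or unhash a vfsmount ... */
	br_write_unlock(vfsmount_lock);
}
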
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..f855ea4fc888 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -29,7 +29,6 @@
29 * @arg: command-specific argument for ioctl 29 * @arg: command-specific argument for ioctl
30 * 30 *
31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise 31 * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise
32 * invokes filesystem specific ->ioctl method. If neither method exists,
33 * returns -ENOTTY. 32 * returns -ENOTTY.
34 * 33 *
35 * Returns 0 on success, -errno on error. 34 * Returns 0 on success, -errno on error.
@@ -39,21 +38,12 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
39{ 38{
40 int error = -ENOTTY; 39 int error = -ENOTTY;
41 40
42 if (!filp->f_op) 41 if (!filp->f_op || !filp->f_op->unlocked_ioctl)
43 goto out; 42 goto out;
44 43
45 if (filp->f_op->unlocked_ioctl) { 44 error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
46 error = filp->f_op->unlocked_ioctl(filp, cmd, arg); 45 if (error == -ENOIOCTLCMD)
47 if (error == -ENOIOCTLCMD) 46 error = -EINVAL;
48 error = -EINVAL;
49 goto out;
50 } else if (filp->f_op->ioctl) {
51 lock_kernel();
52 error = filp->f_op->ioctl(filp->f_path.dentry->d_inode,
53 filp, cmd, arg);
54 unlock_kernel();
55 }
56
57 out: 47 out:
58 return error; 48 return error;
59} 49}
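
With the BKL-protected ->ioctl path removed, vfs_ioctl() is a straight call into ->unlocked_ioctl: drivers do their own locking, and -ENOIOCTLCMD from the handler is translated to -EINVAL for userspace. A minimal handler now looks like this (the command number is hypothetical):

#include <linux/fs.h>
#include <linux/ioctl.h>

#define EXAMPLE_IOC_RESET	_IO('x', 1)	/* hypothetical command */

/* Minimal ->unlocked_ioctl: the VFS no longer takes the BKL for us. */
static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	switch (cmd) {
	case EXAMPLE_IOC_RESET:
		/* ... take a driver-private lock and do the work ... */
		return 0;
	default:
		return -ENOIOCTLCMD;	/* vfs_ioctl() maps this to -EINVAL */
	}
}
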
@@ -525,15 +515,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 515 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 516 return -EOPNOTSUPP;
527 517
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 518 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 519 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 520}
538 521
539static int ioctl_fsthaw(struct file *filp) 522static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +526,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 526 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 527 return -EPERM;
545 528
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 529 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 530 return thaw_super(sb);
552} 531}
553 532
554/* 533/*
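
ioctl_fsfreeze()/ioctl_fsthaw() now call the new superblock-level freeze_super()/thaw_super() instead of going through the block device, which is why the s_bdev == NULL checks can go: filesystems without a backing block device become freezable too. The userspace interface is unchanged; a tool in the style of fsfreeze(8) still does roughly:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]), 1;
	fd = open(argv[1], O_RDONLY);	/* any object on the filesystem */
	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0)
		return perror("freeze"), 1;
	/* ... take the snapshot here ... */
	if (ioctl(fd, FITHAW, 0) < 0)
		return perror("thaw"), 1;
	return close(fd), 0;
}
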
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
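
The explicit .llseek here is part of the tree-wide BKL retirement in 2.6.36: file_operations that used to fall back to default_llseek (which took the BKL) now name a lock-free implementation explicitly. For a directory whose position is a plain byte offset, generic_file_llseek fits; fops that must not seek would pick no_llseek instead. The resulting shape, with a stand-in readdir:

#include <linux/fs.h>

static int example_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	return 0;	/* stand-in; a real fs emits entries via filldir() */
}

/* Sketch of the pattern applied across the tree. */
static const struct file_operations example_dir_operations = {
	.llseek		= generic_file_llseek,	/* explicit, BKL-free */
	.read		= generic_read_dir,
	.readdir	= example_readdir,
};
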
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f2943..5a44811b5027 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -722,7 +722,12 @@ root_found:
722 } 722 }
723 723
724 s->s_magic = ISOFS_SUPER_MAGIC; 724 s->s_magic = ISOFS_SUPER_MAGIC;
725 s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */ 725
726 /*
727 * With multi-extent files, file size is only limited by the maximum
728 * size of a file system, which is 8 TB.
729 */
730 s->s_maxbytes = 0x80000000000LL;
726 731
727 /* 732 /*
728 * The CDROM is read-only, has no nodes (devices) on it, and since 733 * The CDROM is read-only, has no nodes (devices) on it, and since
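
The new limit is worth decoding: 0x80000000000 = 2^43 bytes = 8 TiB, the ISO 9660 file-system size ceiling (2^32 addressable logical blocks of 2^11 = 2048 bytes each), which a multi-extent file can in principle fill. The old cap of 0xffffffff (just under 4 GiB) was the single-extent limit imposed by the 32-bit data-length field in a directory record.
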
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654d..05a38b9c4c0e 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
254{ 254{
255 int i; 255 int i;
256 256
257 ll_rw_block(SWRITE, *batch_count, bhs); 257 for (i = 0; i < *batch_count; i++)
258 write_dirty_buffer(bhs[i], WRITE);
259
258 for (i = 0; i < *batch_count; i++) { 260 for (i = 0; i < *batch_count; i++) {
259 struct buffer_head *bh = bhs[i]; 261 struct buffer_head *bh = bhs[i];
260 clear_buffer_jwrite(bh); 262 clear_buffer_jwrite(bh);
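
ll_rw_block(SWRITE, ...) disappears in 2.6.36: the SWRITE variants ("lock the buffer unconditionally, then write it") are folded into the new write_dirty_buffer() helper, and array callers such as __flush_batch() now loop explicitly. The helper's logic is approximately the following (abridged sketch of the fs/buffer.c code, renamed to mark it as such):

#include <linux/buffer_head.h>

/* Approximate logic of write_dirty_buffer() (abridged sketch). */
static void example_write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);			/* may sleep */
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);		/* someone else wrote it already */
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);				/* dropped on I/O completion */
	submit_bh(rw, bh);
}
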
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ecb44c94ba8d..95d8c11c929e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
119 struct buffer_head *bh; 119 struct buffer_head *bh;
120 journal_header_t *header; 120 journal_header_t *header;
121 int ret; 121 int ret;
122 int barrier_done = 0;
123 122
124 if (is_journal_aborted(journal)) 123 if (is_journal_aborted(journal))
125 return 0; 124 return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
137 136
138 JBUFFER_TRACE(descriptor, "write commit block"); 137 JBUFFER_TRACE(descriptor, "write commit block");
139 set_buffer_dirty(bh); 138 set_buffer_dirty(bh);
139
140 if (journal->j_flags & JFS_BARRIER) { 140 if (journal->j_flags & JFS_BARRIER) {
141 set_buffer_ordered(bh); 141 ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
142 barrier_done = 1;
143 }
144 ret = sync_dirty_buffer(bh);
145 if (barrier_done)
146 clear_buffer_ordered(bh);
147 /* is it possible for another commit to fail at roughly
148 * the same time as this one? If so, we don't want to
149 * trust the barrier flag in the super, but instead want
150 * to remember if we sent a barrier request
151 */
152 if (ret == -EOPNOTSUPP && barrier_done) {
153 char b[BDEVNAME_SIZE];
154 142
155 printk(KERN_WARNING 143 /*
156 "JBD: barrier-based sync failed on %s - " 144 * Is it possible for another commit to fail at roughly
157 "disabling barriers\n", 145 * the same time as this one? If so, we don't want to
158 bdevname(journal->j_dev, b)); 146 * trust the barrier flag in the super, but instead want
159 spin_lock(&journal->j_state_lock); 147 * to remember if we sent a barrier request
160 journal->j_flags &= ~JFS_BARRIER; 148 */
161 spin_unlock(&journal->j_state_lock); 149 if (ret == -EOPNOTSUPP) {
150 char b[BDEVNAME_SIZE];
162 151
163 /* And try again, without the barrier */ 152 printk(KERN_WARNING
164 set_buffer_uptodate(bh); 153 "JBD: barrier-based sync failed on %s - "
165 set_buffer_dirty(bh); 154 "disabling barriers\n",
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JFS_BARRIER;
158 spin_unlock(&journal->j_state_lock);
159
160 /* And try again, without the barrier */
161 set_buffer_uptodate(bh);
162 set_buffer_dirty(bh);
163 ret = sync_dirty_buffer(bh);
164 }
165 } else {
166 ret = sync_dirty_buffer(bh); 166 ret = sync_dirty_buffer(bh);
167 } 167 }
168
168 put_bh(bh); /* One for getblk() */ 169 put_bh(bh); /* One for getblk() */
169 journal_put_journal_head(descriptor); 170 journal_put_journal_head(descriptor);
170 171
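
set_buffer_ordered()/clear_buffer_ordered() are gone; the barrier request is now expressed at submission time through the new __sync_dirty_buffer(), which is sync_dirty_buffer() with caller-supplied write flags. Roughly:

#include <linux/buffer_head.h>

/* Sketch: writing the commit block, with or without a barrier. */
static int example_write_commit_block(struct buffer_head *bh, int use_barrier)
{
	if (use_barrier)
		/* -EOPNOTSUPP here means the device lacks barriers; the
		 * caller clears JFS_BARRIER and retries with a plain write. */
		return __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
	return sync_dirty_buffer(bh);
}
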
@@ -786,6 +787,12 @@ wait_for_iobuf:
786 787
787 jbd_debug(3, "JBD: commit phase 6\n"); 788 jbd_debug(3, "JBD: commit phase 6\n");
788 789
790 /* All metadata is written, now write commit record and do cleanup */
791 spin_lock(&journal->j_state_lock);
792 J_ASSERT(commit_transaction->t_state == T_COMMIT);
793 commit_transaction->t_state = T_COMMIT_RECORD;
794 spin_unlock(&journal->j_state_lock);
795
789 if (journal_write_commit_record(journal, commit_transaction)) 796 if (journal_write_commit_record(journal, commit_transaction))
790 err = -EIO; 797 err = -EIO;
791 798
@@ -923,7 +930,7 @@ restart_loop:
923 930
924 jbd_debug(3, "JBD: commit phase 8\n"); 931 jbd_debug(3, "JBD: commit phase 8\n");
925 932
926 J_ASSERT(commit_transaction->t_state == T_COMMIT); 933 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
927 934
928 commit_transaction->t_state = T_FINISHED; 935 commit_transaction->t_state = T_FINISHED;
929 J_ASSERT(commit_transaction == journal->j_committing_transaction); 936 J_ASSERT(commit_transaction == journal->j_committing_transaction);
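
jbd gains a transaction state here: once all metadata is logged, the commit thread moves the transaction from T_COMMIT to the new T_COMMIT_RECORD before writing the commit block, so other code (notably journal_trans_will_send_data_barrier(), added below in fs/jbd/journal.c) can tell whether the commit barrier is still pending. A simplified view of the sequence (the full enum in include/linux/jbd.h carries a couple of additional historical states):

/*
 * T_RUNNING -> T_LOCKED -> T_FLUSH -> T_COMMIT
 *           -> T_COMMIT_RECORD   (new: metadata on disk, commit record
 *                                 and its barrier not yet issued)
 *           -> T_FINISHED
 */
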
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..2c4b1f109da9 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent barrier request
569 * connected with a transaction commit. If 0 is returned, transaction
570 * may or may not have sent the barrier. Used to avoid sending barrier
571 * twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
@@ -992,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
992 if (wait) 1024 if (wait)
993 sync_dirty_buffer(bh); 1025 sync_dirty_buffer(bh);
994 else 1026 else
995 ll_rw_block(SWRITE, 1, &bh); 1027 write_dirty_buffer(bh, WRITE);
996 1028
997out: 1029out:
998 /* If we have just flushed the log (by marking s_start==0), then 1030 /* If we have just flushed the log (by marking s_start==0), then
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
@@ -1248,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
1248int journal_check_available_features (journal_t *journal, unsigned long compat, 1281int journal_check_available_features (journal_t *journal, unsigned long compat,
1249 unsigned long ro, unsigned long incompat) 1282 unsigned long ro, unsigned long incompat)
1250{ 1283{
1251 journal_superblock_t *sb;
1252
1253 if (!compat && !ro && !incompat) 1284 if (!compat && !ro && !incompat)
1254 return 1; 1285 return 1;
1255 1286
1256 sb = journal->j_superblock;
1257
1258 /* We can support any known requested features iff the 1287 /* We can support any known requested features iff the
1259 * superblock is in version 2. Otherwise we fail to support any 1288 * superblock is in version 2. Otherwise we fail to support any
1260 * extended sb features. */ 1289 * extended sb features. */
@@ -1448,7 +1477,6 @@ int journal_flush(journal_t *journal)
1448 1477
1449int journal_wipe(journal_t *journal, int write) 1478int journal_wipe(journal_t *journal, int write)
1450{ 1479{
1451 journal_superblock_t *sb;
1452 int err = 0; 1480 int err = 0;
1453 1481
1454 J_ASSERT (!(journal->j_flags & JFS_LOADED)); 1482 J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1457,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write)
1457 if (err) 1485 if (err)
1458 return err; 1486 return err;
1459 1487
1460 sb = journal->j_superblock;
1461
1462 if (!journal->j_tail) 1488 if (!journal->j_tail)
1463 goto no_recovery; 1489 goto no_recovery;
1464 1490
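
The other addition to this file, journal_trans_will_send_data_barrier(), lets a caller (ext3's fsync path is the intended user) skip a redundant cache flush: a return of 1 guarantees the commit code will still issue the barrier for that transaction, while 0 means it may already have been sent, so the caller must flush itself. A hypothetical caller, using only helpers visible in this diff:

#include <linux/jbd.h>
#include <linux/blkdev.h>

/* Hypothetical fsync-style path avoiding a duplicate barrier. */
static int example_fsync_commit(journal_t *journal, tid_t commit_tid,
				struct block_device *bdev)
{
	int needs_flush = !journal_trans_will_send_data_barrier(journal,
								commit_tid);

	log_wait_commit(journal, commit_tid);	/* wait for the commit block */
	if (needs_flush)
		blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
	return 0;
}
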
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b17..81051dafebf5 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
283int journal_skip_recovery(journal_t *journal) 283int journal_skip_recovery(journal_t *journal)
284{ 284{
285 int err; 285 int err;
286 journal_superblock_t * sb;
287
288 struct recovery_info info; 286 struct recovery_info info;
289 287
290 memset (&info, 0, sizeof(info)); 288 memset (&info, 0, sizeof(info));
291 sb = journal->j_superblock;
292 289
293 err = do_one_pass(journal, &info, PASS_SCAN); 290 err = do_one_pass(journal, &info, PASS_SCAN);
294 291
@@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal)
297 ++journal->j_transaction_sequence; 294 ++journal->j_transaction_sequence;
298 } else { 295 } else {
299#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
300 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence);
301#endif 299#endif
302 jbd_debug(1, 300 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 301 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
321 unsigned int sequence; 319 unsigned int sequence;
322 int blocktype; 320 int blocktype;
323 321
324 /* Precompute the maximum metadata descriptors in a descriptor block */
325 int MAX_BLOCKS_PER_DESC;
326 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
327 / sizeof(journal_block_tag_t));
328
329 /* 322 /*
330 * First thing is to establish what we expect to find in the log 323 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log 324 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343a..d29018307e2e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
617 set_buffer_jwrite(bh); 617 set_buffer_jwrite(bh);
618 BUFFER_TRACE(bh, "write"); 618 BUFFER_TRACE(bh, "write");
619 set_buffer_dirty(bh); 619 set_buffer_dirty(bh);
620 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 620 write_dirty_buffer(bh, write_op);
621} 621}
622#endif 622#endif
623 623
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..5247e7ffdcb4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
118void __jbd2_log_wait_for_space(journal_t *journal) 118void __jbd2_log_wait_for_space(journal_t *journal)
119{ 119{
120 int nblocks, space_left; 120 int nblocks, space_left;
121 assert_spin_locked(&journal->j_state_lock); 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (__jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 spin_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
128 mutex_lock(&journal->j_checkpoint_mutex); 128 mutex_lock(&journal->j_checkpoint_mutex);
129 129
130 /* 130 /*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
138 * filesystem, so abort the journal and leave a stack 138 * filesystem, so abort the journal and leave a stack
139 * trace for forensic evidence. 139 * trace for forensic evidence.
140 */ 140 */
141 spin_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
149 if (journal->j_committing_transaction) 149 if (journal->j_committing_transaction)
150 tid = journal->j_committing_transaction->t_tid; 150 tid = journal->j_committing_transaction->t_tid;
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 spin_unlock(&journal->j_state_lock); 152 write_unlock(&journal->j_state_lock);
153 if (chkpt) { 153 if (chkpt) {
154 jbd2_log_do_checkpoint(journal); 154 jbd2_log_do_checkpoint(journal);
155 } else if (jbd2_cleanup_journal_tail(journal) == 0) { 155 } else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
167 WARN_ON(1); 167 WARN_ON(1);
168 jbd2_journal_abort(journal, 0); 168 jbd2_journal_abort(journal, 0);
169 } 169 }
170 spin_lock(&journal->j_state_lock); 170 write_lock(&journal->j_state_lock);
171 } else { 171 } else {
172 spin_unlock(&journal->j_list_lock); 172 spin_unlock(&journal->j_list_lock);
173 } 173 }
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
255{ 255{
256 int i; 256 int i;
257 257
258 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); 258 for (i = 0; i < *batch_count; i++)
259 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
260
259 for (i = 0; i < *batch_count; i++) { 261 for (i = 0; i < *batch_count; i++) {
260 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 262 struct buffer_head *bh = journal->j_chkpt_bhs[i];
261 clear_buffer_jwrite(bh); 263 clear_buffer_jwrite(bh);
@@ -474,7 +476,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
474 * next transaction ID we will write, and where it will 476 * next transaction ID we will write, and where it will
475 * start. */ 477 * start. */
476 478
477 spin_lock(&journal->j_state_lock); 479 write_lock(&journal->j_state_lock);
478 spin_lock(&journal->j_list_lock); 480 spin_lock(&journal->j_list_lock);
479 transaction = journal->j_checkpoint_transactions; 481 transaction = journal->j_checkpoint_transactions;
480 if (transaction) { 482 if (transaction) {
@@ -496,7 +498,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
496 /* If the oldest pinned transaction is at the tail of the log 498 /* If the oldest pinned transaction is at the tail of the log
497 already then there's not much we can do right now. */ 499 already then there's not much we can do right now. */
498 if (journal->j_tail_sequence == first_tid) { 500 if (journal->j_tail_sequence == first_tid) {
499 spin_unlock(&journal->j_state_lock); 501 write_unlock(&journal->j_state_lock);
500 return 1; 502 return 1;
501 } 503 }
502 504
@@ -516,7 +518,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
516 journal->j_free += freed; 518 journal->j_free += freed;
517 journal->j_tail_sequence = first_tid; 519 journal->j_tail_sequence = first_tid;
518 journal->j_tail = blocknr; 520 journal->j_tail = blocknr;
519 spin_unlock(&journal->j_state_lock); 521 write_unlock(&journal->j_state_lock);
520 522
521 /* 523 /*
522 * If there is an external journal, we need to make sure that 524 * If there is an external journal, we need to make sure that
@@ -530,7 +532,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 532 */
531 if ((journal->j_fs_dev != journal->j_dev) && 533 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 534 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 535 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
536 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 537 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 538 jbd2_journal_update_superblock(journal, 1);
536 return 0; 539 return 0;
@@ -774,7 +777,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
774 J_ASSERT(transaction->t_log_list == NULL); 777 J_ASSERT(transaction->t_log_list == NULL);
775 J_ASSERT(transaction->t_checkpoint_list == NULL); 778 J_ASSERT(transaction->t_checkpoint_list == NULL);
776 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 779 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
777 J_ASSERT(transaction->t_updates == 0); 780 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
778 J_ASSERT(journal->j_committing_transaction != transaction); 781 J_ASSERT(journal->j_committing_transaction != transaction);
779 J_ASSERT(journal->j_running_transaction != transaction); 782 J_ASSERT(journal->j_running_transaction != transaction);
780 783
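
The J_ASSERT change is the visible edge of a wider jbd2 conversion: t_updates, t_outstanding_credits and t_handle_count become atomic_t so that handle start/stop can account under a shared read_lock of j_state_lock rather than the old exclusive spinlock (the rest of the conversion appears in the commit.c and journal.c diffs below). The handle-side pattern becomes roughly:

#include <linux/jbd2.h>

/* Sketch of handle accounting once the counters are atomic. */
static void example_account_handle(journal_t *journal,
				   transaction_t *transaction, int nblocks)
{
	read_lock(&journal->j_state_lock);	/* shared; many CPUs at once */
	atomic_inc(&transaction->t_updates);
	atomic_add(nblocks, &transaction->t_outstanding_credits);
	atomic_inc(&transaction->t_handle_count);
	read_unlock(&journal->j_state_lock);
}
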
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..7c068c189d80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
101 struct commit_header *tmp; 101 struct commit_header *tmp;
102 struct buffer_head *bh; 102 struct buffer_head *bh;
103 int ret; 103 int ret;
104 int barrier_done = 0;
105 struct timespec now = current_kernel_time(); 104 struct timespec now = current_kernel_time();
106 105
107 if (is_journal_aborted(journal)) 106 if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
136 if (journal->j_flags & JBD2_BARRIER && 135 if (journal->j_flags & JBD2_BARRIER &&
137 !JBD2_HAS_INCOMPAT_FEATURE(journal, 136 !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
139 set_buffer_ordered(bh); 138 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
140 barrier_done = 1; 139 if (ret == -EOPNOTSUPP) {
141 } 140 printk(KERN_WARNING
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 141 "JBD2: Disabling barriers on %s, "
143 if (barrier_done) 142 "not supported by device\n", journal->j_devname);
144 clear_buffer_ordered(bh); 143 write_lock(&journal->j_state_lock);
145 144 journal->j_flags &= ~JBD2_BARRIER;
146 /* is it possible for another commit to fail at roughly 145 write_unlock(&journal->j_state_lock);
147 * the same time as this one? If so, we don't want to 146
148 * trust the barrier flag in the super, but instead want 147 /* And try again, without the barrier */
149 * to remember if we sent a barrier request 148 lock_buffer(bh);
150 */ 149 set_buffer_uptodate(bh);
151 if (ret == -EOPNOTSUPP && barrier_done) { 150 clear_buffer_dirty(bh);
152 printk(KERN_WARNING 151 ret = submit_bh(WRITE_SYNC_PLUG, bh);
153 "JBD: barrier-based sync failed on %s - " 152 }
154 "disabling barriers\n", journal->j_devname); 153 } else {
155 spin_lock(&journal->j_state_lock);
156 journal->j_flags &= ~JBD2_BARRIER;
157 spin_unlock(&journal->j_state_lock);
158
159 /* And try again, without the barrier */
160 lock_buffer(bh);
161 set_buffer_uptodate(bh);
162 clear_buffer_dirty(bh);
163 ret = submit_bh(WRITE_SYNC_PLUG, bh); 154 ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 } 155 }
165 *cbh = bh; 156 *cbh = bh;
@@ -180,11 +171,11 @@ retry:
180 wait_on_buffer(bh); 171 wait_on_buffer(bh);
181 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { 172 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 printk(KERN_WARNING 173 printk(KERN_WARNING
183 "JBD2: wait_on_commit_record: sync failed on %s - " 174 "JBD2: %s: disabling barries on %s - not supported "
184 "disabling barriers\n", journal->j_devname); 175 "by device\n", __func__, journal->j_devname);
185 spin_lock(&journal->j_state_lock); 176 write_lock(&journal->j_state_lock);
186 journal->j_flags &= ~JBD2_BARRIER; 177 journal->j_flags &= ~JBD2_BARRIER;
187 spin_unlock(&journal->j_state_lock); 178 write_unlock(&journal->j_state_lock);
188 179
189 lock_buffer(bh); 180 lock_buffer(bh);
190 clear_buffer_dirty(bh); 181 clear_buffer_dirty(bh);
@@ -400,7 +391,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
400 jbd_debug(1, "JBD: starting commit of transaction %d\n", 391 jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 commit_transaction->t_tid); 392 commit_transaction->t_tid);
402 393
403 spin_lock(&journal->j_state_lock); 394 write_lock(&journal->j_state_lock);
404 commit_transaction->t_state = T_LOCKED; 395 commit_transaction->t_state = T_LOCKED;
405 396
406 /* 397 /*
@@ -417,23 +408,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
417 stats.run.rs_locked); 408 stats.run.rs_locked);
418 409
419 spin_lock(&commit_transaction->t_handle_lock); 410 spin_lock(&commit_transaction->t_handle_lock);
420 while (commit_transaction->t_updates) { 411 while (atomic_read(&commit_transaction->t_updates)) {
421 DEFINE_WAIT(wait); 412 DEFINE_WAIT(wait);
422 413
423 prepare_to_wait(&journal->j_wait_updates, &wait, 414 prepare_to_wait(&journal->j_wait_updates, &wait,
424 TASK_UNINTERRUPTIBLE); 415 TASK_UNINTERRUPTIBLE);
425 if (commit_transaction->t_updates) { 416 if (atomic_read(&commit_transaction->t_updates)) {
426 spin_unlock(&commit_transaction->t_handle_lock); 417 spin_unlock(&commit_transaction->t_handle_lock);
427 spin_unlock(&journal->j_state_lock); 418 write_unlock(&journal->j_state_lock);
428 schedule(); 419 schedule();
429 spin_lock(&journal->j_state_lock); 420 write_lock(&journal->j_state_lock);
430 spin_lock(&commit_transaction->t_handle_lock); 421 spin_lock(&commit_transaction->t_handle_lock);
431 } 422 }
432 finish_wait(&journal->j_wait_updates, &wait); 423 finish_wait(&journal->j_wait_updates, &wait);
433 } 424 }
434 spin_unlock(&commit_transaction->t_handle_lock); 425 spin_unlock(&commit_transaction->t_handle_lock);
435 426
436 J_ASSERT (commit_transaction->t_outstanding_credits <= 427 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
437 journal->j_max_transaction_buffers); 428 journal->j_max_transaction_buffers);
438 429
439 /* 430 /*
@@ -497,7 +488,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 start_time = ktime_get(); 488 start_time = ktime_get();
498 commit_transaction->t_log_start = journal->j_head; 489 commit_transaction->t_log_start = journal->j_head;
499 wake_up(&journal->j_wait_transaction_locked); 490 wake_up(&journal->j_wait_transaction_locked);
500 spin_unlock(&journal->j_state_lock); 491 write_unlock(&journal->j_state_lock);
501 492
502 jbd_debug (3, "JBD: commit phase 2\n"); 493 jbd_debug (3, "JBD: commit phase 2\n");
503 494
@@ -519,19 +510,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
519 * transaction! Now comes the tricky part: we need to write out 510 * transaction! Now comes the tricky part: we need to write out
520 * metadata. Loop over the transaction's entire buffer list: 511 * metadata. Loop over the transaction's entire buffer list:
521 */ 512 */
522 spin_lock(&journal->j_state_lock); 513 write_lock(&journal->j_state_lock);
523 commit_transaction->t_state = T_COMMIT; 514 commit_transaction->t_state = T_COMMIT;
524 spin_unlock(&journal->j_state_lock); 515 write_unlock(&journal->j_state_lock);
525 516
526 trace_jbd2_commit_logging(journal, commit_transaction); 517 trace_jbd2_commit_logging(journal, commit_transaction);
527 stats.run.rs_logging = jiffies; 518 stats.run.rs_logging = jiffies;
528 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 519 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 stats.run.rs_logging); 520 stats.run.rs_logging);
530 stats.run.rs_blocks = commit_transaction->t_outstanding_credits; 521 stats.run.rs_blocks =
522 atomic_read(&commit_transaction->t_outstanding_credits);
531 stats.run.rs_blocks_logged = 0; 523 stats.run.rs_blocks_logged = 0;
532 524
533 J_ASSERT(commit_transaction->t_nr_buffers <= 525 J_ASSERT(commit_transaction->t_nr_buffers <=
534 commit_transaction->t_outstanding_credits); 526 atomic_read(&commit_transaction->t_outstanding_credits));
535 527
536 err = 0; 528 err = 0;
537 descriptor = NULL; 529 descriptor = NULL;
@@ -616,7 +608,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 * the free space in the log, but this counter is changed 608 * the free space in the log, but this counter is changed
617 * by jbd2_journal_next_log_block() also. 609 * by jbd2_journal_next_log_block() also.
618 */ 610 */
619 commit_transaction->t_outstanding_credits--; 611 atomic_dec(&commit_transaction->t_outstanding_credits);
620 612
621 /* Bump b_count to prevent truncate from stumbling over 613 /* Bump b_count to prevent truncate from stumbling over
622 the shadowed buffer! @@@ This can go if we ever get 614 the shadowed buffer! @@@ This can go if we ever get
@@ -717,7 +709,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 709 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 710 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 711 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 712 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
713 BLKDEV_IFL_WAIT);
721 714
722 /* Done it all: now write the commit record asynchronously. */ 715 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 716 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +720,8 @@ start_journal_io:
727 if (err) 720 if (err)
728 __jbd2_journal_abort_hard(journal); 721 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 722 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 723 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
724 BLKDEV_IFL_WAIT);
731 } 725 }
732 726
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 727 err = journal_finish_inode_data_buffers(journal, commit_transaction);
@@ -975,7 +969,7 @@ restart_loop:
975 * __jbd2_journal_drop_transaction(). Otherwise we could race with 969 * __jbd2_journal_drop_transaction(). Otherwise we could race with
976 * other checkpointing code processing the transaction... 970 * other checkpointing code processing the transaction...
977 */ 971 */
978 spin_lock(&journal->j_state_lock); 972 write_lock(&journal->j_state_lock);
979 spin_lock(&journal->j_list_lock); 973 spin_lock(&journal->j_list_lock);
980 /* 974 /*
981 * Now recheck if some buffers did not get attached to the transaction 975 * Now recheck if some buffers did not get attached to the transaction
@@ -983,7 +977,7 @@ restart_loop:
983 */ 977 */
984 if (commit_transaction->t_forget) { 978 if (commit_transaction->t_forget) {
985 spin_unlock(&journal->j_list_lock); 979 spin_unlock(&journal->j_list_lock);
986 spin_unlock(&journal->j_state_lock); 980 write_unlock(&journal->j_state_lock);
987 goto restart_loop; 981 goto restart_loop;
988 } 982 }
989 983
@@ -1001,7 +995,8 @@ restart_loop:
1001 * File the transaction statistics 995 * File the transaction statistics
1002 */ 996 */
1003 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
1004 stats.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count =
999 atomic_read(&commit_transaction->t_handle_count);
1005 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1000 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1006 commit_transaction->t_tid, &stats.run); 1001 commit_transaction->t_tid, &stats.run);
1007 1002
@@ -1035,7 +1030,7 @@ restart_loop:
1035 journal->j_average_commit_time*3) / 4; 1030 journal->j_average_commit_time*3) / 4;
1036 else 1031 else
1037 journal->j_average_commit_time = commit_time; 1032 journal->j_average_commit_time = commit_time;
1038 spin_unlock(&journal->j_state_lock); 1033 write_unlock(&journal->j_state_lock);
1039 1034
1040 if (commit_transaction->t_checkpoint_list == NULL && 1035 if (commit_transaction->t_checkpoint_list == NULL &&
1041 commit_transaction->t_checkpoint_io_list == NULL) { 1036 commit_transaction->t_checkpoint_io_list == NULL) {
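
All of the j_state_lock churn in this file follows one rule: j_state_lock is now an rwlock_t, so pure readers of journal state (jbd2_journal_force_commit_nested, jbd2_log_wait_commit, the superblock snapshot) take read_lock and may run in parallel, while anything mutating state keeps an exclusive write_lock. Note that rwlock_t has no assert_spin_locked() equivalent, which is why the assertions in __jbd2_log_wait_for_space() and __jbd2_log_space_left() are commented out rather than converted. In sketch form:

#include <linux/jbd2.h>

/* Reader: several CPUs may hold this at once. */
static tid_t example_commit_sequence(journal_t *journal)
{
	tid_t tid;

	read_lock(&journal->j_state_lock);
	tid = journal->j_commit_sequence;
	read_unlock(&journal->j_state_lock);
	return tid;
}

/* Writer: exclusive, same semantics as the old spinlock. */
static void example_set_flags(journal_t *journal, unsigned long flags)
{
	write_lock(&journal->j_state_lock);
	journal->j_flags |= flags;
	write_unlock(&journal->j_state_lock);
}
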
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c03d4dce4d76..0e8014ea6b94 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include <trace/events/jbd2.h> 47#include <trace/events/jbd2.h>
@@ -48,8 +49,6 @@
48#include <asm/uaccess.h> 49#include <asm/uaccess.h>
49#include <asm/page.h> 50#include <asm/page.h>
50 51
51EXPORT_SYMBOL(jbd2_journal_start);
52EXPORT_SYMBOL(jbd2_journal_restart);
53EXPORT_SYMBOL(jbd2_journal_extend); 52EXPORT_SYMBOL(jbd2_journal_extend);
54EXPORT_SYMBOL(jbd2_journal_stop); 53EXPORT_SYMBOL(jbd2_journal_stop);
55EXPORT_SYMBOL(jbd2_journal_lock_updates); 54EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +142,7 @@ static int kjournald2(void *arg)
143 /* 142 /*
144 * And now, wait forever for commit wakeup events. 143 * And now, wait forever for commit wakeup events.
145 */ 144 */
146 spin_lock(&journal->j_state_lock); 145 write_lock(&journal->j_state_lock);
147 146
148loop: 147loop:
149 if (journal->j_flags & JBD2_UNMOUNT) 148 if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +153,10 @@ loop:
154 153
155 if (journal->j_commit_sequence != journal->j_commit_request) { 154 if (journal->j_commit_sequence != journal->j_commit_request) {
156 jbd_debug(1, "OK, requests differ\n"); 155 jbd_debug(1, "OK, requests differ\n");
157 spin_unlock(&journal->j_state_lock); 156 write_unlock(&journal->j_state_lock);
158 del_timer_sync(&journal->j_commit_timer); 157 del_timer_sync(&journal->j_commit_timer);
159 jbd2_journal_commit_transaction(journal); 158 jbd2_journal_commit_transaction(journal);
160 spin_lock(&journal->j_state_lock); 159 write_lock(&journal->j_state_lock);
161 goto loop; 160 goto loop;
162 } 161 }
163 162
@@ -169,9 +168,9 @@ loop:
169 * be already stopped. 168 * be already stopped.
170 */ 169 */
171 jbd_debug(1, "Now suspending kjournald2\n"); 170 jbd_debug(1, "Now suspending kjournald2\n");
172 spin_unlock(&journal->j_state_lock); 171 write_unlock(&journal->j_state_lock);
173 refrigerator(); 172 refrigerator();
174 spin_lock(&journal->j_state_lock); 173 write_lock(&journal->j_state_lock);
175 } else { 174 } else {
176 /* 175 /*
177 * We assume on resume that commits are already there, 176 * We assume on resume that commits are already there,
@@ -191,9 +190,9 @@ loop:
191 if (journal->j_flags & JBD2_UNMOUNT) 190 if (journal->j_flags & JBD2_UNMOUNT)
192 should_sleep = 0; 191 should_sleep = 0;
193 if (should_sleep) { 192 if (should_sleep) {
194 spin_unlock(&journal->j_state_lock); 193 write_unlock(&journal->j_state_lock);
195 schedule(); 194 schedule();
196 spin_lock(&journal->j_state_lock); 195 write_lock(&journal->j_state_lock);
197 } 196 }
198 finish_wait(&journal->j_wait_commit, &wait); 197 finish_wait(&journal->j_wait_commit, &wait);
199 } 198 }
@@ -211,7 +210,7 @@ loop:
211 goto loop; 210 goto loop;
212 211
213end_loop: 212end_loop:
214 spin_unlock(&journal->j_state_lock); 213 write_unlock(&journal->j_state_lock);
215 del_timer_sync(&journal->j_commit_timer); 214 del_timer_sync(&journal->j_commit_timer);
216 journal->j_task = NULL; 215 journal->j_task = NULL;
217 wake_up(&journal->j_wait_done_commit); 216 wake_up(&journal->j_wait_done_commit);
@@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
234 233
235static void journal_kill_thread(journal_t *journal) 234static void journal_kill_thread(journal_t *journal)
236{ 235{
237 spin_lock(&journal->j_state_lock); 236 write_lock(&journal->j_state_lock);
238 journal->j_flags |= JBD2_UNMOUNT; 237 journal->j_flags |= JBD2_UNMOUNT;
239 238
240 while (journal->j_task) { 239 while (journal->j_task) {
241 wake_up(&journal->j_wait_commit); 240 wake_up(&journal->j_wait_commit);
242 spin_unlock(&journal->j_state_lock); 241 write_unlock(&journal->j_state_lock);
243 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 242 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
244 spin_lock(&journal->j_state_lock); 243 write_lock(&journal->j_state_lock);
245 } 244 }
246 spin_unlock(&journal->j_state_lock); 245 write_unlock(&journal->j_state_lock);
247} 246}
248 247
249/* 248/*
@@ -297,7 +296,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
297 struct page *new_page; 296 struct page *new_page;
298 unsigned int new_offset; 297 unsigned int new_offset;
299 struct buffer_head *bh_in = jh2bh(jh_in); 298 struct buffer_head *bh_in = jh2bh(jh_in);
300 struct jbd2_buffer_trigger_type *triggers;
301 journal_t *journal = transaction->t_journal; 299 journal_t *journal = transaction->t_journal;
302 300
303 /* 301 /*
@@ -311,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
311 */ 309 */
312 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 310 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
313 311
314 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 312retry_alloc:
313 new_bh = alloc_buffer_head(GFP_NOFS);
314 if (!new_bh) {
315 /*
316 * Failure is not an option, but __GFP_NOFAIL is going
317 * away; so we retry ourselves here.
318 */
319 congestion_wait(BLK_RW_ASYNC, HZ/50);
320 goto retry_alloc;
321 }
322
315 /* keep subsequent assertions sane */ 323 /* keep subsequent assertions sane */
316 new_bh->b_state = 0; 324 new_bh->b_state = 0;
317 init_buffer(new_bh, NULL, NULL); 325 init_buffer(new_bh, NULL, NULL);
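
This hunk open-codes what __GFP_NOFAIL used to provide: with the flag on its way out, the caller retries the allocation itself, sleeping in congestion_wait(BLK_RW_ASYNC, HZ/50) (about 20 ms) between attempts so reclaim can make progress; the new #include <linux/backing-dev.h> at the top of this file is there for congestion_wait(). The generic form of the idiom:

#include <linux/buffer_head.h>
#include <linux/backing-dev.h>

/* Sketch: loop until the allocation succeeds, backing off ~20 ms per
 * attempt. GFP_NOFS avoids re-entering the filesystem from reclaim. */
static struct buffer_head *example_alloc_bh_nofail(void)
{
	struct buffer_head *bh;

	while (!(bh = alloc_buffer_head(GFP_NOFS)))
		congestion_wait(BLK_RW_ASYNC, HZ / 50);
	return bh;
}
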
@@ -328,21 +336,21 @@ repeat:
328 done_copy_out = 1; 336 done_copy_out = 1;
329 new_page = virt_to_page(jh_in->b_frozen_data); 337 new_page = virt_to_page(jh_in->b_frozen_data);
330 new_offset = offset_in_page(jh_in->b_frozen_data); 338 new_offset = offset_in_page(jh_in->b_frozen_data);
331 triggers = jh_in->b_frozen_triggers;
332 } else { 339 } else {
333 new_page = jh2bh(jh_in)->b_page; 340 new_page = jh2bh(jh_in)->b_page;
334 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 341 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
335 triggers = jh_in->b_triggers;
336 } 342 }
337 343
338 mapped_data = kmap_atomic(new_page, KM_USER0); 344 mapped_data = kmap_atomic(new_page, KM_USER0);
339 /* 345 /*
340 * Fire any commit trigger. Do this before checking for escaping, 346 * Fire data frozen trigger if data already wasn't frozen. Do this
341 * as the trigger may modify the magic offset. If a copy-out 347 * before checking for escaping, as the trigger may modify the magic
342 * happens afterwards, it will have the correct data in the buffer. 348 * offset. If a copy-out happens afterwards, it will have the correct
349 * data in the buffer.
343 */ 350 */
344 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, 351 if (!done_copy_out)
345 triggers); 352 jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
353 jh_in->b_triggers);
346 354
347 /* 355 /*
348 * Check for escaping 356 * Check for escaping
@@ -443,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal)
443{ 451{
444 int left = journal->j_free; 452 int left = journal->j_free;
445 453
446 assert_spin_locked(&journal->j_state_lock); 454 /* assert_spin_locked(&journal->j_state_lock); */
447 455
448 /* 456 /*
449 * Be pessimistic here about the number of those free blocks which 457 * Be pessimistic here about the number of those free blocks which
@@ -488,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
488{ 496{
489 int ret; 497 int ret;
490 498
491 spin_lock(&journal->j_state_lock); 499 write_lock(&journal->j_state_lock);
492 ret = __jbd2_log_start_commit(journal, tid); 500 ret = __jbd2_log_start_commit(journal, tid);
493 spin_unlock(&journal->j_state_lock); 501 write_unlock(&journal->j_state_lock);
494 return ret; 502 return ret;
495} 503}
496 504
@@ -509,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
509 transaction_t *transaction = NULL; 517 transaction_t *transaction = NULL;
510 tid_t tid; 518 tid_t tid;
511 519
512 spin_lock(&journal->j_state_lock); 520 read_lock(&journal->j_state_lock);
513 if (journal->j_running_transaction && !current->journal_info) { 521 if (journal->j_running_transaction && !current->journal_info) {
514 transaction = journal->j_running_transaction; 522 transaction = journal->j_running_transaction;
515 __jbd2_log_start_commit(journal, transaction->t_tid); 523 __jbd2_log_start_commit(journal, transaction->t_tid);
@@ -517,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
517 transaction = journal->j_committing_transaction; 525 transaction = journal->j_committing_transaction;
518 526
519 if (!transaction) { 527 if (!transaction) {
520 spin_unlock(&journal->j_state_lock); 528 read_unlock(&journal->j_state_lock);
521 return 0; /* Nothing to retry */ 529 return 0; /* Nothing to retry */
522 } 530 }
523 531
524 tid = transaction->t_tid; 532 tid = transaction->t_tid;
525 spin_unlock(&journal->j_state_lock); 533 read_unlock(&journal->j_state_lock);
526 jbd2_log_wait_commit(journal, tid); 534 jbd2_log_wait_commit(journal, tid);
527 return 1; 535 return 1;
528} 536}
@@ -536,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
536{ 544{
537 int ret = 0; 545 int ret = 0;
538 546
539 spin_lock(&journal->j_state_lock); 547 write_lock(&journal->j_state_lock);
540 if (journal->j_running_transaction) { 548 if (journal->j_running_transaction) {
541 tid_t tid = journal->j_running_transaction->t_tid; 549 tid_t tid = journal->j_running_transaction->t_tid;
542 550
@@ -555,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
555 *ptid = journal->j_committing_transaction->t_tid; 563 *ptid = journal->j_committing_transaction->t_tid;
556 ret = 1; 564 ret = 1;
557 } 565 }
558 spin_unlock(&journal->j_state_lock); 566 write_unlock(&journal->j_state_lock);
559 return ret; 567 return ret;
560} 568}
561 569
@@ -567,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
567{ 575{
568 int err = 0; 576 int err = 0;
569 577
578 read_lock(&journal->j_state_lock);
570#ifdef CONFIG_JBD2_DEBUG 579#ifdef CONFIG_JBD2_DEBUG
571 spin_lock(&journal->j_state_lock);
572 if (!tid_geq(journal->j_commit_request, tid)) { 580 if (!tid_geq(journal->j_commit_request, tid)) {
573 printk(KERN_EMERG 581 printk(KERN_EMERG
574 "%s: error: j_commit_request=%d, tid=%d\n", 582 "%s: error: j_commit_request=%d, tid=%d\n",
575 __func__, journal->j_commit_request, tid); 583 __func__, journal->j_commit_request, tid);
576 } 584 }
577 spin_unlock(&journal->j_state_lock);
578#endif 585#endif
579 spin_lock(&journal->j_state_lock);
580 while (tid_gt(tid, journal->j_commit_sequence)) { 586 while (tid_gt(tid, journal->j_commit_sequence)) {
581 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 587 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
582 tid, journal->j_commit_sequence); 588 tid, journal->j_commit_sequence);
583 wake_up(&journal->j_wait_commit); 589 wake_up(&journal->j_wait_commit);
584 spin_unlock(&journal->j_state_lock); 590 read_unlock(&journal->j_state_lock);
585 wait_event(journal->j_wait_done_commit, 591 wait_event(journal->j_wait_done_commit,
586 !tid_gt(tid, journal->j_commit_sequence)); 592 !tid_gt(tid, journal->j_commit_sequence));
587 spin_lock(&journal->j_state_lock); 593 read_lock(&journal->j_state_lock);
588 } 594 }
589 spin_unlock(&journal->j_state_lock); 595 read_unlock(&journal->j_state_lock);
590 596
591 if (unlikely(is_journal_aborted(journal))) { 597 if (unlikely(is_journal_aborted(journal))) {
592 printk(KERN_EMERG "journal commit I/O error\n"); 598 printk(KERN_EMERG "journal commit I/O error\n");
@@ -603,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
603{ 609{
604 unsigned long blocknr; 610 unsigned long blocknr;
605 611
606 spin_lock(&journal->j_state_lock); 612 write_lock(&journal->j_state_lock);
607 J_ASSERT(journal->j_free > 1); 613 J_ASSERT(journal->j_free > 1);
608 614
609 blocknr = journal->j_head; 615 blocknr = journal->j_head;
@@ -611,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
611 journal->j_free--; 617 journal->j_free--;
612 if (journal->j_head == journal->j_last) 618 if (journal->j_head == journal->j_last)
613 journal->j_head = journal->j_first; 619 journal->j_head = journal->j_first;
614 spin_unlock(&journal->j_state_lock); 620 write_unlock(&journal->j_state_lock);
615 return jbd2_journal_bmap(journal, blocknr, retp); 621 return jbd2_journal_bmap(journal, blocknr, retp);
616} 622}
617 623
@@ -831,7 +837,7 @@ static journal_t * journal_init_common (void)
831 mutex_init(&journal->j_checkpoint_mutex); 837 mutex_init(&journal->j_checkpoint_mutex);
832 spin_lock_init(&journal->j_revoke_lock); 838 spin_lock_init(&journal->j_revoke_lock);
833 spin_lock_init(&journal->j_list_lock); 839 spin_lock_init(&journal->j_list_lock);
834 spin_lock_init(&journal->j_state_lock); 840 rwlock_init(&journal->j_state_lock);
835 841
836 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 842 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
837 journal->j_min_batch_time = 0; 843 journal->j_min_batch_time = 0;
@@ -1097,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1097 set_buffer_uptodate(bh); 1103 set_buffer_uptodate(bh);
1098 } 1104 }
1099 1105
1100 spin_lock(&journal->j_state_lock); 1106 read_lock(&journal->j_state_lock);
1101 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1107 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1102 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1108 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
1103 1109
1104 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1110 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1105 sb->s_start = cpu_to_be32(journal->j_tail); 1111 sb->s_start = cpu_to_be32(journal->j_tail);
1106 sb->s_errno = cpu_to_be32(journal->j_errno); 1112 sb->s_errno = cpu_to_be32(journal->j_errno);
1107 spin_unlock(&journal->j_state_lock); 1113 read_unlock(&journal->j_state_lock);
1108 1114
1109 BUFFER_TRACE(bh, "marking dirty"); 1115 BUFFER_TRACE(bh, "marking dirty");
1110 mark_buffer_dirty(bh); 1116 mark_buffer_dirty(bh);
@@ -1118,19 +1124,19 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1118 set_buffer_uptodate(bh); 1124 set_buffer_uptodate(bh);
1119 } 1125 }
1120 } else 1126 } else
1121 ll_rw_block(SWRITE, 1, &bh); 1127 write_dirty_buffer(bh, WRITE);
1122 1128
1123out: 1129out:
1124 /* If we have just flushed the log (by marking s_start==0), then 1130 /* If we have just flushed the log (by marking s_start==0), then
1125 * any future commit will have to be careful to update the 1131 * any future commit will have to be careful to update the
1126 * superblock again to re-record the true start of the log. */ 1132 * superblock again to re-record the true start of the log. */
1127 1133
1128 spin_lock(&journal->j_state_lock); 1134 write_lock(&journal->j_state_lock);
1129 if (sb->s_start) 1135 if (sb->s_start)
1130 journal->j_flags &= ~JBD2_FLUSHED; 1136 journal->j_flags &= ~JBD2_FLUSHED;
1131 else 1137 else
1132 journal->j_flags |= JBD2_FLUSHED; 1138 journal->j_flags |= JBD2_FLUSHED;
1133 spin_unlock(&journal->j_state_lock); 1139 write_unlock(&journal->j_state_lock);
1134} 1140}
1135 1141
1136/* 1142/*
@@ -1392,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1392int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, 1398int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1393 unsigned long ro, unsigned long incompat) 1399 unsigned long ro, unsigned long incompat)
1394{ 1400{
1395 journal_superblock_t *sb;
1396
1397 if (!compat && !ro && !incompat) 1401 if (!compat && !ro && !incompat)
1398 return 1; 1402 return 1;
1399 1403
1400 sb = journal->j_superblock;
1401
1402 /* We can support any known requested features iff the 1404 /* We can support any known requested features iff the
1403 * superblock is in version 2. Otherwise we fail to support any 1405 * superblock is in version 2. Otherwise we fail to support any
1404 * extended sb features. */ 1406 * extended sb features. */
@@ -1546,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal)
1546 transaction_t *transaction = NULL; 1548 transaction_t *transaction = NULL;
1547 unsigned long old_tail; 1549 unsigned long old_tail;
1548 1550
1549 spin_lock(&journal->j_state_lock); 1551 write_lock(&journal->j_state_lock);
1550 1552
1551 /* Force everything buffered to the log... */ 1553 /* Force everything buffered to the log... */
1552 if (journal->j_running_transaction) { 1554 if (journal->j_running_transaction) {
@@ -1559,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal)
1559 if (transaction) { 1561 if (transaction) {
1560 tid_t tid = transaction->t_tid; 1562 tid_t tid = transaction->t_tid;
1561 1563
1562 spin_unlock(&journal->j_state_lock); 1564 write_unlock(&journal->j_state_lock);
1563 jbd2_log_wait_commit(journal, tid); 1565 jbd2_log_wait_commit(journal, tid);
1564 } else { 1566 } else {
1565 spin_unlock(&journal->j_state_lock); 1567 write_unlock(&journal->j_state_lock);
1566 } 1568 }
1567 1569
1568 /* ...and flush everything in the log out to disk. */ 1570 /* ...and flush everything in the log out to disk. */
@@ -1586,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal)
1586 * the magic code for a fully-recovered superblock. Any future 1588 * the magic code for a fully-recovered superblock. Any future
1587 * commits of data to the journal will restore the current 1589 * commits of data to the journal will restore the current
1588 * s_start value. */ 1590 * s_start value. */
1589 spin_lock(&journal->j_state_lock); 1591 write_lock(&journal->j_state_lock);
1590 old_tail = journal->j_tail; 1592 old_tail = journal->j_tail;
1591 journal->j_tail = 0; 1593 journal->j_tail = 0;
1592 spin_unlock(&journal->j_state_lock); 1594 write_unlock(&journal->j_state_lock);
1593 jbd2_journal_update_superblock(journal, 1); 1595 jbd2_journal_update_superblock(journal, 1);
1594 spin_lock(&journal->j_state_lock); 1596 write_lock(&journal->j_state_lock);
1595 journal->j_tail = old_tail; 1597 journal->j_tail = old_tail;
1596 1598
1597 J_ASSERT(!journal->j_running_transaction); 1599 J_ASSERT(!journal->j_running_transaction);
@@ -1599,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal)
1599 J_ASSERT(!journal->j_checkpoint_transactions); 1601 J_ASSERT(!journal->j_checkpoint_transactions);
1600 J_ASSERT(journal->j_head == journal->j_tail); 1602 J_ASSERT(journal->j_head == journal->j_tail);
1601 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1603 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1602 spin_unlock(&journal->j_state_lock); 1604 write_unlock(&journal->j_state_lock);
1603 return 0; 1605 return 0;
1604} 1606}
1605 1607
@@ -1618,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal)
1618 1620
1619int jbd2_journal_wipe(journal_t *journal, int write) 1621int jbd2_journal_wipe(journal_t *journal, int write)
1620{ 1622{
1621 journal_superblock_t *sb;
1622 int err = 0; 1623 int err = 0;
1623 1624
1624 J_ASSERT (!(journal->j_flags & JBD2_LOADED)); 1625 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1627,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1627 if (err) 1628 if (err)
1628 return err; 1629 return err;
1629 1630
1630 sb = journal->j_superblock;
1631
1632 if (!journal->j_tail) 1631 if (!journal->j_tail)
1633 goto no_recovery; 1632 goto no_recovery;
1634 1633
@@ -1666,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
1666 printk(KERN_ERR "Aborting journal on device %s.\n", 1665 printk(KERN_ERR "Aborting journal on device %s.\n",
1667 journal->j_devname); 1666 journal->j_devname);
1668 1667
1669 spin_lock(&journal->j_state_lock); 1668 write_lock(&journal->j_state_lock);
1670 journal->j_flags |= JBD2_ABORT; 1669 journal->j_flags |= JBD2_ABORT;
1671 transaction = journal->j_running_transaction; 1670 transaction = journal->j_running_transaction;
1672 if (transaction) 1671 if (transaction)
1673 __jbd2_log_start_commit(journal, transaction->t_tid); 1672 __jbd2_log_start_commit(journal, transaction->t_tid);
1674 spin_unlock(&journal->j_state_lock); 1673 write_unlock(&journal->j_state_lock);
1675} 1674}
1676 1675
1677/* Soft abort: record the abort error status in the journal superblock, 1676/* Soft abort: record the abort error status in the journal superblock,
@@ -1756,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal)
1756{ 1755{
1757 int err; 1756 int err;
1758 1757
1759 spin_lock(&journal->j_state_lock); 1758 read_lock(&journal->j_state_lock);
1760 if (journal->j_flags & JBD2_ABORT) 1759 if (journal->j_flags & JBD2_ABORT)
1761 err = -EROFS; 1760 err = -EROFS;
1762 else 1761 else
1763 err = journal->j_errno; 1762 err = journal->j_errno;
1764 spin_unlock(&journal->j_state_lock); 1763 read_unlock(&journal->j_state_lock);
1765 return err; 1764 return err;
1766} 1765}
1767 1766
@@ -1776,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal)
1776{ 1775{
1777 int err = 0; 1776 int err = 0;
1778 1777
1779 spin_lock(&journal->j_state_lock); 1778 write_lock(&journal->j_state_lock);
1780 if (journal->j_flags & JBD2_ABORT) 1779 if (journal->j_flags & JBD2_ABORT)
1781 err = -EROFS; 1780 err = -EROFS;
1782 else 1781 else
1783 journal->j_errno = 0; 1782 journal->j_errno = 0;
1784 spin_unlock(&journal->j_state_lock); 1783 write_unlock(&journal->j_state_lock);
1785 return err; 1784 return err;
1786} 1785}
1787 1786
@@ -1794,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal)
1794 */ 1793 */
1795void jbd2_journal_ack_err(journal_t *journal) 1794void jbd2_journal_ack_err(journal_t *journal)
1796{ 1795{
1797 spin_lock(&journal->j_state_lock); 1796 write_lock(&journal->j_state_lock);
1798 if (journal->j_errno) 1797 if (journal->j_errno)
1799 journal->j_flags |= JBD2_ACK_ERR; 1798 journal->j_flags |= JBD2_ACK_ERR;
1800 spin_unlock(&journal->j_state_lock); 1799 write_unlock(&journal->j_state_lock);
1801} 1800}
1802 1801
1803int jbd2_journal_blocks_per_page(struct inode *inode) 1802int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -1889,7 +1888,7 @@ static struct kmem_cache *get_slab(size_t size)
1889 BUG_ON(i >= JBD2_MAX_SLABS); 1888 BUG_ON(i >= JBD2_MAX_SLABS);
1890 if (unlikely(i < 0)) 1889 if (unlikely(i < 0))
1891 i = 0; 1890 i = 0;
1892 BUG_ON(jbd2_slab[i] == 0); 1891 BUG_ON(jbd2_slab[i] == NULL);
1893 return jbd2_slab[i]; 1892 return jbd2_slab[i];
1894} 1893}
1895 1894
@@ -2202,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2202void jbd2_journal_release_jbd_inode(journal_t *journal, 2201void jbd2_journal_release_jbd_inode(journal_t *journal,
2203 struct jbd2_inode *jinode) 2202 struct jbd2_inode *jinode)
2204{ 2203{
2205 int writeout = 0;
2206
2207 if (!journal) 2204 if (!journal)
2208 return; 2205 return;
2209restart: 2206restart:
@@ -2220,9 +2217,6 @@ restart:
2220 goto restart; 2217 goto restart;
2221 } 2218 }
2222 2219
2223 /* Do we need to wait for data writeback? */
2224 if (journal->j_committing_transaction == jinode->i_transaction)
2225 writeout = 1;
2226 if (jinode->i_transaction) { 2220 if (jinode->i_transaction) {
2227 list_del(&jinode->i_list); 2221 list_del(&jinode->i_list);
2228 jinode->i_transaction = NULL; 2222 jinode->i_transaction = NULL;
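
The fs/jbd2/journal.c hunks above convert j_state_lock from a spinlock_t to an rwlock_t: paths that only sample journal state (jbd2_journal_errno(), the commit-wait loop in jbd2_log_wait_commit(), the superblock snapshot in jbd2_journal_update_superblock()) now take the lock shared with read_lock(), while paths that mutate state (jbd2_journal_next_log_block(), jbd2_journal_flush(), __jbd2_journal_abort_hard()) take it exclusive with write_lock(). Below is a minimal, runnable userspace sketch of the same reader/writer split, using a POSIX rwlock as a stand-in for the kernel's rwlock_t; the struct and field names are illustrative, not the kernel's.

#include <pthread.h>
#include <stdio.h>

struct journal_state {
	pthread_rwlock_t state_lock;	/* stands in for j_state_lock */
	int errno_;			/* stands in for j_errno */
	int flags;			/* stands in for j_flags */
};

#define ABORT_FLAG 0x1

/* Reader: many may run concurrently (cf. jbd2_journal_errno). */
static int journal_errno(struct journal_state *j)
{
	int err;

	pthread_rwlock_rdlock(&j->state_lock);
	err = (j->flags & ABORT_FLAG) ? -1 : j->errno_;
	pthread_rwlock_unlock(&j->state_lock);
	return err;
}

/* Writer: excludes readers and other writers (cf. jbd2_journal_clear_err). */
static void journal_clear_err(struct journal_state *j)
{
	pthread_rwlock_wrlock(&j->state_lock);
	if (!(j->flags & ABORT_FLAG))
		j->errno_ = 0;
	pthread_rwlock_unlock(&j->state_lock);
}

int main(void)
{
	struct journal_state j = { .errno_ = 5 };

	pthread_rwlock_init(&j.state_lock, NULL);
	journal_clear_err(&j);
	printf("errno after clear: %d\n", journal_errno(&j));
	pthread_rwlock_destroy(&j.state_lock);
	return 0;
}
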
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb89..2bc4d5f116f1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
285int jbd2_journal_skip_recovery(journal_t *journal) 285int jbd2_journal_skip_recovery(journal_t *journal)
286{ 286{
287 int err; 287 int err;
288 journal_superblock_t * sb;
289 288
290 struct recovery_info info; 289 struct recovery_info info;
291 290
292 memset (&info, 0, sizeof(info)); 291 memset (&info, 0, sizeof(info));
293 sb = journal->j_superblock;
294 292
295 err = do_one_pass(journal, &info, PASS_SCAN); 293 err = do_one_pass(journal, &info, PASS_SCAN);
296 294
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
299 ++journal->j_transaction_sequence; 297 ++journal->j_transaction_sequence;
300 } else { 298 } else {
301#ifdef CONFIG_JBD2_DEBUG 299#ifdef CONFIG_JBD2_DEBUG
302 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 300 int dropped = info.end_transaction -
301 be32_to_cpu(journal->j_superblock->s_sequence);
303#endif 302#endif
304 jbd_debug(1, 303 jbd_debug(1,
305 "JBD: ignoring %d transaction%s from the journal.\n", 304 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
365 int tag_bytes = journal_tag_bytes(journal); 364 int tag_bytes = journal_tag_bytes(journal);
366 __u32 crc32_sum = ~0; /* Transactional Checksums */ 365 __u32 crc32_sum = ~0; /* Transactional Checksums */
367 366
368 /* Precompute the maximum metadata descriptors in a descriptor block */
369 int MAX_BLOCKS_PER_DESC;
370 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
371 / tag_bytes);
372
373 /* 367 /*
374 * First thing is to establish what we expect to find in the log 368 * First thing is to establish what we expect to find in the log
375 * (in terms of transaction IDs), and where (in terms of log 369 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e3..9ad321fd63fd 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
625 set_buffer_jwrite(bh); 625 set_buffer_jwrite(bh);
626 BUFFER_TRACE(bh, "write"); 626 BUFFER_TRACE(bh, "write");
627 set_buffer_dirty(bh); 627 set_buffer_dirty(bh);
628 ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); 628 write_dirty_buffer(bh, write_op);
629} 629}
630#endif 630#endif
631 631
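
The revoke.c change above is part of the 2.6.36 removal of ll_rw_block()'s SWRITE/SWRITE_SYNC_PLUG special cases: flush_descriptor() now calls write_dirty_buffer(bh, write_op) directly. The sketch below paraphrases what the 2.6.36-era helper does (based on fs/buffer.c of that release; treat it as an approximation, not the authoritative source): take the buffer lock, atomically test-and-clear the dirty bit so a buffer someone else already wrote back is not re-submitted, then submit the I/O.

/* Approximate body of the 2.6.36 helper; kernel-internal code,
 * shown for illustration only. */
void write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		/* Already written back by someone else: nothing to submit. */
		unlock_buffer(bh);
		return;
	}
	/* end_buffer_write_sync() unlocks the buffer and drops this ref. */
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(rw, bh);
}
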
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..f3479d6e0a83 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h> 28#include <linux/hrtimer.h>
29#include <linux/backing-dev.h>
30#include <linux/module.h>
29 31
30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
31 33
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
53 transaction->t_tid = journal->j_transaction_sequence++; 55 transaction->t_tid = journal->j_transaction_sequence++;
54 transaction->t_expires = jiffies + journal->j_commit_interval; 56 transaction->t_expires = jiffies + journal->j_commit_interval;
55 spin_lock_init(&transaction->t_handle_lock); 57 spin_lock_init(&transaction->t_handle_lock);
58 atomic_set(&transaction->t_updates, 0);
59 atomic_set(&transaction->t_outstanding_credits, 0);
60 atomic_set(&transaction->t_handle_count, 0);
56 INIT_LIST_HEAD(&transaction->t_inode_list); 61 INIT_LIST_HEAD(&transaction->t_inode_list);
57 INIT_LIST_HEAD(&transaction->t_private_list); 62 INIT_LIST_HEAD(&transaction->t_private_list);
58 63
@@ -77,71 +82,106 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
77 */ 82 */
78 83
79/* 84/*
85 * Update transaction's maximum wait time, if debugging is enabled.
86 *
87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be
89 * run in parallel on SMP systems, which limits our scalability. So
90 * unless debugging is enabled, we no longer update t_max_wait, which
91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero.
93 */
94static inline void update_t_max_wait(transaction_t *transaction)
95{
96#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start);
102 spin_lock(&transaction->t_handle_lock);
103 if (ts > transaction->t_max_wait)
104 transaction->t_max_wait = ts;
105 spin_unlock(&transaction->t_handle_lock);
106 }
107#endif
108}
109
110/*
80 * start_this_handle: Given a handle, deal with any locking or stalling 111 * start_this_handle: Given a handle, deal with any locking or stalling
81 * needed to make sure that there is enough journal space for the handle 112 * needed to make sure that there is enough journal space for the handle
82 * to begin. Attach the handle to a transaction and set up the 113 * to begin. Attach the handle to a transaction and set up the
83 * transaction's buffer credits. 114 * transaction's buffer credits.
84 */ 115 */
85 116
86static int start_this_handle(journal_t *journal, handle_t *handle) 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask)
87{ 119{
88 transaction_t *transaction; 120 transaction_t *transaction;
89 int needed; 121 int needed;
90 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
91 transaction_t *new_transaction = NULL; 123 transaction_t *new_transaction = NULL;
92 int ret = 0;
93 unsigned long ts = jiffies;
94 124
95 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
96 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
97 current->comm, nblocks, 127 current->comm, nblocks,
98 journal->j_max_transaction_buffers); 128 journal->j_max_transaction_buffers);
99 ret = -ENOSPC; 129 return -ENOSPC;
100 goto out;
101 } 130 }
102 131
103alloc_transaction: 132alloc_transaction:
104 if (!journal->j_running_transaction) { 133 if (!journal->j_running_transaction) {
105 new_transaction = kzalloc(sizeof(*new_transaction), 134 new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
106 GFP_NOFS|__GFP_NOFAIL);
107 if (!new_transaction) { 135 if (!new_transaction) {
108 ret = -ENOMEM; 136 /*
109 goto out; 137 * If __GFP_FS is not present, then we may be
138 * being called from inside the fs writeback
139 * layer, so we MUST NOT fail. Since
140 * __GFP_NOFAIL is going away, we will arrange
141 * to retry the allocation ourselves.
142 */
143 if ((gfp_mask & __GFP_FS) == 0) {
144 congestion_wait(BLK_RW_ASYNC, HZ/50);
145 goto alloc_transaction;
146 }
147 return -ENOMEM;
110 } 148 }
111 } 149 }
112 150
113 jbd_debug(3, "New handle %p going live.\n", handle); 151 jbd_debug(3, "New handle %p going live.\n", handle);
114 152
115repeat:
116
117 /* 153 /*
118 * We need to hold j_state_lock until t_updates has been incremented, 154 * We need to hold j_state_lock until t_updates has been incremented,
119 * for proper journal barrier handling 155 * for proper journal barrier handling
120 */ 156 */
121 spin_lock(&journal->j_state_lock); 157repeat:
122repeat_locked: 158 read_lock(&journal->j_state_lock);
123 if (is_journal_aborted(journal) || 159 if (is_journal_aborted(journal) ||
124 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { 160 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
125 spin_unlock(&journal->j_state_lock); 161 read_unlock(&journal->j_state_lock);
126 ret = -EROFS; 162 kfree(new_transaction);
127 goto out; 163 return -EROFS;
128 } 164 }
129 165
130 /* Wait on the journal's transaction barrier if necessary */ 166 /* Wait on the journal's transaction barrier if necessary */
131 if (journal->j_barrier_count) { 167 if (journal->j_barrier_count) {
132 spin_unlock(&journal->j_state_lock); 168 read_unlock(&journal->j_state_lock);
133 wait_event(journal->j_wait_transaction_locked, 169 wait_event(journal->j_wait_transaction_locked,
134 journal->j_barrier_count == 0); 170 journal->j_barrier_count == 0);
135 goto repeat; 171 goto repeat;
136 } 172 }
137 173
138 if (!journal->j_running_transaction) { 174 if (!journal->j_running_transaction) {
139 if (!new_transaction) { 175 read_unlock(&journal->j_state_lock);
140 spin_unlock(&journal->j_state_lock); 176 if (!new_transaction)
141 goto alloc_transaction; 177 goto alloc_transaction;
178 write_lock(&journal->j_state_lock);
179 if (!journal->j_running_transaction) {
180 jbd2_get_transaction(journal, new_transaction);
181 new_transaction = NULL;
142 } 182 }
143 jbd2_get_transaction(journal, new_transaction); 183 write_unlock(&journal->j_state_lock);
144 new_transaction = NULL; 184 goto repeat;
145 } 185 }
146 186
147 transaction = journal->j_running_transaction; 187 transaction = journal->j_running_transaction;
@@ -155,7 +195,7 @@ repeat_locked:
155 195
156 prepare_to_wait(&journal->j_wait_transaction_locked, 196 prepare_to_wait(&journal->j_wait_transaction_locked,
157 &wait, TASK_UNINTERRUPTIBLE); 197 &wait, TASK_UNINTERRUPTIBLE);
158 spin_unlock(&journal->j_state_lock); 198 read_unlock(&journal->j_state_lock);
159 schedule(); 199 schedule();
160 finish_wait(&journal->j_wait_transaction_locked, &wait); 200 finish_wait(&journal->j_wait_transaction_locked, &wait);
161 goto repeat; 201 goto repeat;
@@ -166,8 +206,8 @@ repeat_locked:
166 * buffers requested by this operation, we need to stall pending a log 206 * buffers requested by this operation, we need to stall pending a log
167 * checkpoint to free some more log space. 207 * checkpoint to free some more log space.
168 */ 208 */
169 spin_lock(&transaction->t_handle_lock); 209 needed = atomic_add_return(nblocks,
170 needed = transaction->t_outstanding_credits + nblocks; 210 &transaction->t_outstanding_credits);
171 211
172 if (needed > journal->j_max_transaction_buffers) { 212 if (needed > journal->j_max_transaction_buffers) {
173 /* 213 /*
@@ -178,11 +218,11 @@ repeat_locked:
178 DEFINE_WAIT(wait); 218 DEFINE_WAIT(wait);
179 219
180 jbd_debug(2, "Handle %p starting new commit...\n", handle); 220 jbd_debug(2, "Handle %p starting new commit...\n", handle);
181 spin_unlock(&transaction->t_handle_lock); 221 atomic_sub(nblocks, &transaction->t_outstanding_credits);
182 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 222 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
183 TASK_UNINTERRUPTIBLE); 223 TASK_UNINTERRUPTIBLE);
184 __jbd2_log_start_commit(journal, transaction->t_tid); 224 __jbd2_log_start_commit(journal, transaction->t_tid);
185 spin_unlock(&journal->j_state_lock); 225 read_unlock(&journal->j_state_lock);
186 schedule(); 226 schedule();
187 finish_wait(&journal->j_wait_transaction_locked, &wait); 227 finish_wait(&journal->j_wait_transaction_locked, &wait);
188 goto repeat; 228 goto repeat;
@@ -215,35 +255,31 @@ repeat_locked:
215 */ 255 */
216 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { 256 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
217 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); 257 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
218 spin_unlock(&transaction->t_handle_lock); 258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
219 __jbd2_log_wait_for_space(journal); 259 read_unlock(&journal->j_state_lock);
220 goto repeat_locked; 260 write_lock(&journal->j_state_lock);
261 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
262 __jbd2_log_wait_for_space(journal);
263 write_unlock(&journal->j_state_lock);
264 goto repeat;
221 } 265 }
222 266
223 /* OK, account for the buffers that this operation expects to 267 /* OK, account for the buffers that this operation expects to
224 * use and add the handle to the running transaction. */ 268 * use and add the handle to the running transaction.
225 269 */
226 if (time_after(transaction->t_start, ts)) { 270 update_t_max_wait(transaction);
227 ts = jbd2_time_diff(ts, transaction->t_start);
228 if (ts > transaction->t_max_wait)
229 transaction->t_max_wait = ts;
230 }
231
232 handle->h_transaction = transaction; 271 handle->h_transaction = transaction;
233 transaction->t_outstanding_credits += nblocks; 272 atomic_inc(&transaction->t_updates);
234 transaction->t_updates++; 273 atomic_inc(&transaction->t_handle_count);
235 transaction->t_handle_count++;
236 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 274 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
237 handle, nblocks, transaction->t_outstanding_credits, 275 handle, nblocks,
276 atomic_read(&transaction->t_outstanding_credits),
238 __jbd2_log_space_left(journal)); 277 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 278 read_unlock(&journal->j_state_lock);
240 spin_unlock(&journal->j_state_lock);
241 279
242 lock_map_acquire(&handle->h_lockdep_map); 280 lock_map_acquire(&handle->h_lockdep_map);
243out: 281 kfree(new_transaction);
244 if (unlikely(new_transaction)) /* It's usually NULL */ 282 return 0;
245 kfree(new_transaction);
246 return ret;
247} 283}
248 284
249static struct lock_class_key jbd2_handle_key; 285static struct lock_class_key jbd2_handle_key;
@@ -278,7 +314,7 @@ static handle_t *new_handle(int nblocks)
278 * 314 *
279 * Return a pointer to a newly allocated handle, or NULL on failure 315 * Return a pointer to a newly allocated handle, or NULL on failure
280 */ 316 */
281handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 317handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
282{ 318{
283 handle_t *handle = journal_current_handle(); 319 handle_t *handle = journal_current_handle();
284 int err; 320 int err;
@@ -298,7 +334,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
298 334
299 current->journal_info = handle; 335 current->journal_info = handle;
300 336
301 err = start_this_handle(journal, handle); 337 err = start_this_handle(journal, handle, gfp_mask);
302 if (err < 0) { 338 if (err < 0) {
303 jbd2_free_handle(handle); 339 jbd2_free_handle(handle);
304 current->journal_info = NULL; 340 current->journal_info = NULL;
@@ -308,6 +344,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
308out: 344out:
309 return handle; 345 return handle;
310} 346}
347EXPORT_SYMBOL(jbd2__journal_start);
348
349
350handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
351{
352 return jbd2__journal_start(journal, nblocks, GFP_NOFS);
353}
354EXPORT_SYMBOL(jbd2_journal_start);
355
311 356
312/** 357/**
313 * int jbd2_journal_extend() - extend buffer credits. 358 * int jbd2_journal_extend() - extend buffer credits.
@@ -342,7 +387,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
342 387
343 result = 1; 388 result = 1;
344 389
345 spin_lock(&journal->j_state_lock); 390 read_lock(&journal->j_state_lock);
346 391
347 /* Don't extend a locked-down transaction! */ 392 /* Don't extend a locked-down transaction! */
348 if (handle->h_transaction->t_state != T_RUNNING) { 393 if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +397,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
352 } 397 }
353 398
354 spin_lock(&transaction->t_handle_lock); 399 spin_lock(&transaction->t_handle_lock);
355 wanted = transaction->t_outstanding_credits + nblocks; 400 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
356 401
357 if (wanted > journal->j_max_transaction_buffers) { 402 if (wanted > journal->j_max_transaction_buffers) {
358 jbd_debug(3, "denied handle %p %d blocks: " 403 jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +412,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
367 } 412 }
368 413
369 handle->h_buffer_credits += nblocks; 414 handle->h_buffer_credits += nblocks;
370 transaction->t_outstanding_credits += nblocks; 415 atomic_add(nblocks, &transaction->t_outstanding_credits);
371 result = 0; 416 result = 0;
372 417
373 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 418 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
374unlock: 419unlock:
375 spin_unlock(&transaction->t_handle_lock); 420 spin_unlock(&transaction->t_handle_lock);
376error_out: 421error_out:
377 spin_unlock(&journal->j_state_lock); 422 read_unlock(&journal->j_state_lock);
378out: 423out:
379 return result; 424 return result;
380} 425}
@@ -394,8 +439,7 @@ out:
394 * transaction capable of guaranteeing the requested number of 439 * transaction capable of guaranteeing the requested number of
395 * credits. 440 * credits.
396 */ 441 */
397 442int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
398int jbd2_journal_restart(handle_t *handle, int nblocks)
399{ 443{
400 transaction_t *transaction = handle->h_transaction; 444 transaction_t *transaction = handle->h_transaction;
401 journal_t *journal = transaction->t_journal; 445 journal_t *journal = transaction->t_journal;
@@ -410,28 +454,34 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
410 * First unlink the handle from its current transaction, and start the 454 * First unlink the handle from its current transaction, and start the
411 * commit on that. 455 * commit on that.
412 */ 456 */
413 J_ASSERT(transaction->t_updates > 0); 457 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
414 J_ASSERT(journal_current_handle() == handle); 458 J_ASSERT(journal_current_handle() == handle);
415 459
416 spin_lock(&journal->j_state_lock); 460 read_lock(&journal->j_state_lock);
417 spin_lock(&transaction->t_handle_lock); 461 spin_lock(&transaction->t_handle_lock);
418 transaction->t_outstanding_credits -= handle->h_buffer_credits; 462 atomic_sub(handle->h_buffer_credits,
419 transaction->t_updates--; 463 &transaction->t_outstanding_credits);
420 464 if (atomic_dec_and_test(&transaction->t_updates))
421 if (!transaction->t_updates)
422 wake_up(&journal->j_wait_updates); 465 wake_up(&journal->j_wait_updates);
423 spin_unlock(&transaction->t_handle_lock); 466 spin_unlock(&transaction->t_handle_lock);
424 467
425 jbd_debug(2, "restarting handle %p\n", handle); 468 jbd_debug(2, "restarting handle %p\n", handle);
426 __jbd2_log_start_commit(journal, transaction->t_tid); 469 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 470 read_unlock(&journal->j_state_lock);
428 471
429 lock_map_release(&handle->h_lockdep_map); 472 lock_map_release(&handle->h_lockdep_map);
430 handle->h_buffer_credits = nblocks; 473 handle->h_buffer_credits = nblocks;
431 ret = start_this_handle(journal, handle); 474 ret = start_this_handle(journal, handle, gfp_mask);
432 return ret; 475 return ret;
433} 476}
477EXPORT_SYMBOL(jbd2__journal_restart);
478
434 479
480int jbd2_journal_restart(handle_t *handle, int nblocks)
481{
482 return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
483}
484EXPORT_SYMBOL(jbd2_journal_restart);
435 485
436/** 486/**
437 * void jbd2_journal_lock_updates () - establish a transaction barrier. 487 * void jbd2_journal_lock_updates () - establish a transaction barrier.
@@ -447,7 +497,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
447{ 497{
448 DEFINE_WAIT(wait); 498 DEFINE_WAIT(wait);
449 499
450 spin_lock(&journal->j_state_lock); 500 write_lock(&journal->j_state_lock);
451 ++journal->j_barrier_count; 501 ++journal->j_barrier_count;
452 502
453 /* Wait until there are no running updates */ 503 /* Wait until there are no running updates */
@@ -458,19 +508,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
458 break; 508 break;
459 509
460 spin_lock(&transaction->t_handle_lock); 510 spin_lock(&transaction->t_handle_lock);
461 if (!transaction->t_updates) { 511 if (!atomic_read(&transaction->t_updates)) {
462 spin_unlock(&transaction->t_handle_lock); 512 spin_unlock(&transaction->t_handle_lock);
463 break; 513 break;
464 } 514 }
465 prepare_to_wait(&journal->j_wait_updates, &wait, 515 prepare_to_wait(&journal->j_wait_updates, &wait,
466 TASK_UNINTERRUPTIBLE); 516 TASK_UNINTERRUPTIBLE);
467 spin_unlock(&transaction->t_handle_lock); 517 spin_unlock(&transaction->t_handle_lock);
468 spin_unlock(&journal->j_state_lock); 518 write_unlock(&journal->j_state_lock);
469 schedule(); 519 schedule();
470 finish_wait(&journal->j_wait_updates, &wait); 520 finish_wait(&journal->j_wait_updates, &wait);
471 spin_lock(&journal->j_state_lock); 521 write_lock(&journal->j_state_lock);
472 } 522 }
473 spin_unlock(&journal->j_state_lock); 523 write_unlock(&journal->j_state_lock);
474 524
475 /* 525 /*
476 * We have now established a barrier against other normal updates, but 526 * We have now established a barrier against other normal updates, but
@@ -494,9 +544,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
494 J_ASSERT(journal->j_barrier_count != 0); 544 J_ASSERT(journal->j_barrier_count != 0);
495 545
496 mutex_unlock(&journal->j_barrier); 546 mutex_unlock(&journal->j_barrier);
497 spin_lock(&journal->j_state_lock); 547 write_lock(&journal->j_state_lock);
498 --journal->j_barrier_count; 548 --journal->j_barrier_count;
499 spin_unlock(&journal->j_state_lock); 549 write_unlock(&journal->j_state_lock);
500 wake_up(&journal->j_wait_transaction_locked); 550 wake_up(&journal->j_wait_transaction_locked);
501} 551}
502 552
@@ -725,6 +775,9 @@ done:
725 page = jh2bh(jh)->b_page; 775 page = jh2bh(jh)->b_page;
726 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; 776 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
727 source = kmap_atomic(page, KM_USER0); 777 source = kmap_atomic(page, KM_USER0);
778 /* Fire data frozen trigger just before we copy the data */
779 jbd2_buffer_frozen_trigger(jh, source + offset,
780 jh->b_triggers);
728 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 781 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
729 kunmap_atomic(source, KM_USER0); 782 kunmap_atomic(source, KM_USER0);
730 783
@@ -963,15 +1016,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
963 jh->b_triggers = type; 1016 jh->b_triggers = type;
964} 1017}
965 1018
966void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, 1019void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
967 struct jbd2_buffer_trigger_type *triggers) 1020 struct jbd2_buffer_trigger_type *triggers)
968{ 1021{
969 struct buffer_head *bh = jh2bh(jh); 1022 struct buffer_head *bh = jh2bh(jh);
970 1023
971 if (!triggers || !triggers->t_commit) 1024 if (!triggers || !triggers->t_frozen)
972 return; 1025 return;
973 1026
974 triggers->t_commit(triggers, bh, mapped_data, bh->b_size); 1027 triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
975} 1028}
976 1029
977void jbd2_buffer_abort_trigger(struct journal_head *jh, 1030void jbd2_buffer_abort_trigger(struct journal_head *jh,
@@ -1235,7 +1288,8 @@ int jbd2_journal_stop(handle_t *handle)
1235{ 1288{
1236 transaction_t *transaction = handle->h_transaction; 1289 transaction_t *transaction = handle->h_transaction;
1237 journal_t *journal = transaction->t_journal; 1290 journal_t *journal = transaction->t_journal;
1238 int err; 1291 int err, wait_for_commit = 0;
1292 tid_t tid;
1239 pid_t pid; 1293 pid_t pid;
1240 1294
1241 J_ASSERT(journal_current_handle() == handle); 1295 J_ASSERT(journal_current_handle() == handle);
@@ -1243,7 +1297,7 @@ int jbd2_journal_stop(handle_t *handle)
1243 if (is_handle_aborted(handle)) 1297 if (is_handle_aborted(handle))
1244 err = -EIO; 1298 err = -EIO;
1245 else { 1299 else {
1246 J_ASSERT(transaction->t_updates > 0); 1300 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1247 err = 0; 1301 err = 0;
1248 } 1302 }
1249 1303
@@ -1288,9 +1342,9 @@ int jbd2_journal_stop(handle_t *handle)
1288 1342
1289 journal->j_last_sync_writer = pid; 1343 journal->j_last_sync_writer = pid;
1290 1344
1291 spin_lock(&journal->j_state_lock); 1345 read_lock(&journal->j_state_lock);
1292 commit_time = journal->j_average_commit_time; 1346 commit_time = journal->j_average_commit_time;
1293 spin_unlock(&journal->j_state_lock); 1347 read_unlock(&journal->j_state_lock);
1294 1348
1295 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1349 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1296 transaction->t_start_time)); 1350 transaction->t_start_time));
@@ -1311,15 +1365,8 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1365 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1366 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1367 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock); 1368 atomic_sub(handle->h_buffer_credits,
1315 spin_lock(&transaction->t_handle_lock); 1369 &transaction->t_outstanding_credits);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--;
1318 if (!transaction->t_updates) {
1319 wake_up(&journal->j_wait_updates);
1320 if (journal->j_barrier_count)
1321 wake_up(&journal->j_wait_transaction_locked);
1322 }
1323 1370
1324 /* 1371 /*
1325 * If the handle is marked SYNC, we need to set another commit 1372 * If the handle is marked SYNC, we need to set another commit
@@ -1328,32 +1375,42 @@ int jbd2_journal_stop(handle_t *handle)
1328 * transaction is too old now. 1375 * transaction is too old now.
1329 */ 1376 */
1330 if (handle->h_sync || 1377 if (handle->h_sync ||
1331 transaction->t_outstanding_credits > 1378 (atomic_read(&transaction->t_outstanding_credits) >
1332 journal->j_max_transaction_buffers || 1379 journal->j_max_transaction_buffers) ||
1333 time_after_eq(jiffies, transaction->t_expires)) { 1380 time_after_eq(jiffies, transaction->t_expires)) {
1334 /* Do this even for aborted journals: an abort still 1381 /* Do this even for aborted journals: an abort still
1335 * completes the commit thread, it just doesn't write 1382 * completes the commit thread, it just doesn't write
1336 * anything to disk. */ 1383 * anything to disk. */
1337 tid_t tid = transaction->t_tid;
1338 1384
1339 spin_unlock(&transaction->t_handle_lock);
1340 jbd_debug(2, "transaction too old, requesting commit for " 1385 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1386 "handle %p\n", handle);
1342 /* This is non-blocking */ 1387 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1388 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1389
1346 /* 1390 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1391 * Special case: JBD2_SYNC synchronous updates require us
1348 * to wait for the commit to complete. 1392 * to wait for the commit to complete.
1349 */ 1393 */
1350 if (handle->h_sync && !(current->flags & PF_MEMALLOC)) 1394 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1351 err = jbd2_log_wait_commit(journal, tid); 1395 wait_for_commit = 1;
1352 } else {
1353 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1396 }
1356 1397
1398 /*
1399 * Once we drop t_updates, if it goes to zero the transaction
1400 * could start committing on us and eventually disappear. So
1401 * once we do this, we must not dereference transaction
1402 * pointer again.
1403 */
1404 tid = transaction->t_tid;
1405 if (atomic_dec_and_test(&transaction->t_updates)) {
1406 wake_up(&journal->j_wait_updates);
1407 if (journal->j_barrier_count)
1408 wake_up(&journal->j_wait_transaction_locked);
1409 }
1410
1411 if (wait_for_commit)
1412 err = jbd2_log_wait_commit(journal, tid);
1413
1357 lock_map_release(&handle->h_lockdep_map); 1414 lock_map_release(&handle->h_lockdep_map);
1358 1415
1359 jbd2_free_handle(handle); 1416 jbd2_free_handle(handle);
@@ -1719,7 +1776,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1719 goto zap_buffer_unlocked; 1776 goto zap_buffer_unlocked;
1720 1777
1721 /* OK, we have data buffer in journaled mode */ 1778 /* OK, we have data buffer in journaled mode */
1722 spin_lock(&journal->j_state_lock); 1779 write_lock(&journal->j_state_lock);
1723 jbd_lock_bh_state(bh); 1780 jbd_lock_bh_state(bh);
1724 spin_lock(&journal->j_list_lock); 1781 spin_lock(&journal->j_list_lock);
1725 1782
@@ -1772,7 +1829,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1772 jbd2_journal_put_journal_head(jh); 1829 jbd2_journal_put_journal_head(jh);
1773 spin_unlock(&journal->j_list_lock); 1830 spin_unlock(&journal->j_list_lock);
1774 jbd_unlock_bh_state(bh); 1831 jbd_unlock_bh_state(bh);
1775 spin_unlock(&journal->j_state_lock); 1832 write_unlock(&journal->j_state_lock);
1776 return ret; 1833 return ret;
1777 } else { 1834 } else {
1778 /* There is no currently-running transaction. So the 1835 /* There is no currently-running transaction. So the
@@ -1786,7 +1843,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1786 jbd2_journal_put_journal_head(jh); 1843 jbd2_journal_put_journal_head(jh);
1787 spin_unlock(&journal->j_list_lock); 1844 spin_unlock(&journal->j_list_lock);
1788 jbd_unlock_bh_state(bh); 1845 jbd_unlock_bh_state(bh);
1789 spin_unlock(&journal->j_state_lock); 1846 write_unlock(&journal->j_state_lock);
1790 return ret; 1847 return ret;
1791 } else { 1848 } else {
1792 /* The orphan record's transaction has 1849 /* The orphan record's transaction has
@@ -1810,7 +1867,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1810 jbd2_journal_put_journal_head(jh); 1867 jbd2_journal_put_journal_head(jh);
1811 spin_unlock(&journal->j_list_lock); 1868 spin_unlock(&journal->j_list_lock);
1812 jbd_unlock_bh_state(bh); 1869 jbd_unlock_bh_state(bh);
1813 spin_unlock(&journal->j_state_lock); 1870 write_unlock(&journal->j_state_lock);
1814 return 0; 1871 return 0;
1815 } else { 1872 } else {
1816 /* Good, the buffer belongs to the running transaction. 1873 /* Good, the buffer belongs to the running transaction.
@@ -1829,7 +1886,7 @@ zap_buffer:
1829zap_buffer_no_jh: 1886zap_buffer_no_jh:
1830 spin_unlock(&journal->j_list_lock); 1887 spin_unlock(&journal->j_list_lock);
1831 jbd_unlock_bh_state(bh); 1888 jbd_unlock_bh_state(bh);
1832 spin_unlock(&journal->j_state_lock); 1889 write_unlock(&journal->j_state_lock);
1833zap_buffer_unlocked: 1890zap_buffer_unlocked:
1834 clear_buffer_dirty(bh); 1891 clear_buffer_dirty(bh);
1835 J_ASSERT_BH(bh, !buffer_jbddirty(bh)); 1892 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2136,9 +2193,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2136 /* Locks are here just to force reading of recent values, it is 2193 /* Locks are here just to force reading of recent values, it is
2137 * enough that the transaction was not committing before we started 2194 * enough that the transaction was not committing before we started
2138 * a transaction adding the inode to orphan list */ 2195 * a transaction adding the inode to orphan list */
2139 spin_lock(&journal->j_state_lock); 2196 read_lock(&journal->j_state_lock);
2140 commit_trans = journal->j_committing_transaction; 2197 commit_trans = journal->j_committing_transaction;
2141 spin_unlock(&journal->j_state_lock); 2198 read_unlock(&journal->j_state_lock);
2142 spin_lock(&journal->j_list_lock); 2199 spin_lock(&journal->j_list_lock);
2143 inode_trans = jinode->i_transaction; 2200 inode_trans = jinode->i_transaction;
2144 spin_unlock(&journal->j_list_lock); 2201 spin_unlock(&journal->j_list_lock);
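
The transaction.c hunks above convert t_updates, t_outstanding_credits and t_handle_count to atomic_t so that start_this_handle() can run under read_lock(&journal->j_state_lock) instead of the old exclusive spinlock: credits are reserved optimistically with atomic_add_return(), and the reservation is undone with atomic_sub() whenever the handle must back off and wait (commit in progress, checkpoint needed). Here is a minimal, runnable userspace sketch of that reserve-then-undo pattern using C11 atomics; the names and the credit limit are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

#define MAX_CREDITS 32

static atomic_int outstanding_credits;

/* Returns 0 on success, -1 if the caller should wait and retry. */
static int reserve_credits(int nblocks)
{
	/* atomic_fetch_add returns the old value, so adding nblocks
	 * gives the new total, like the kernel's atomic_add_return(). */
	int needed = atomic_fetch_add(&outstanding_credits, nblocks) + nblocks;

	if (needed > MAX_CREDITS) {
		/* Overshot: undo our reservation before backing off. */
		atomic_fetch_sub(&outstanding_credits, nblocks);
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("reserve 20: %d\n", reserve_credits(20)); /* fits */
	printf("reserve 20: %d\n", reserve_credits(20)); /* overshoots, backed out */
	printf("outstanding: %d\n", atomic_load(&outstanding_credits));
	return 0;
}
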
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
@@ -419,7 +420,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 420 return rc;
420} 421}
421 422
422struct xattr_handler jffs2_acl_access_xattr_handler = { 423const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 425 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 426 .list = jffs2_acl_access_listxattr,
@@ -427,7 +428,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 428 .set = jffs2_acl_setxattr,
428}; 429};
429 430
430struct xattr_handler jffs2_acl_default_xattr_handler = { 431const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 434 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..404111b016c9 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -23,10 +24,9 @@ static int jffs2_garbage_collect_thread(void *);
23 24
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 25void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 26{
26 spin_lock(&c->erase_completion_lock); 27 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 28 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 29 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 30}
31 31
32/* This must only ever be called when no GC thread is currently running */ 32/* This must only ever be called when no GC thread is currently running */
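
The background.c hunk above changes the locking contract of jffs2_garbage_collect_trigger(): instead of taking erase_completion_lock itself, the function now requires the caller to hold it and documents that with assert_spin_locked(). Below is a small, runnable userspace sketch of the same caller-holds-the-lock convention, using a tracked-owner mutex in place of the kernel's spinlock assertion; all names are illustrative.

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t lock_owner;
static int lock_held;	/* both fields protected by completion_lock itself */

static void completion_lock_acquire(void)
{
	pthread_mutex_lock(&completion_lock);
	lock_held = 1;
	lock_owner = pthread_self();
}

static void completion_lock_release(void)
{
	lock_held = 0;
	pthread_mutex_unlock(&completion_lock);
}

/* Analogue of assert_spin_locked(): caller must already hold the lock. */
static void assert_completion_lock_held(void)
{
	assert(lock_held && pthread_equal(lock_owner, pthread_self()));
}

/* Caller must hold completion_lock, like the reworked trigger function. */
static void gc_trigger(void)
{
	assert_completion_lock_held();
	/* ...decide here whether to wake the GC thread... */
}

int main(void)
{
	completion_lock_acquire();
	gc_trigger();
	completion_lock_release();
	return 0;
}
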
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c5e1450d79f9..a906f538d11c 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f0294410868d..617a1e5694c1 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -2,11 +2,12 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Created by Arjan van de Ven <arjanv@redhat.com> 5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 *
7 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 6 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
8 * University of Szeged, Hungary 7 * University of Szeged, Hungary
9 * 8 *
9 * Created by Arjan van de Ven <arjan@infradead.org>
10 *
10 * For licensing information, see the file 'LICENCE' in this directory. 11 * For licensing information, see the file 'LICENCE' in this directory.
11 * 12 *
12 */ 13 */
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 7d1d72faa774..e471a9106fd9 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -3,6 +3,7 @@
3 * 3 *
4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>, 4 * Copyright © 2004 Ferenc Havasi <havasi@inf.u-szeged.hu>,
5 * University of Szeged, Hungary 5 * University of Szeged, Hungary
6 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
6 * 7 *
7 * For licensing information, see the file 'LICENCE' in this directory. 8 * For licensing information, see the file 'LICENCE' in this directory.
8 * 9 *
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index cd02acafde8a..ed25ae7c98eb 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2007 Nokia Corporation. All rights reserved. 4 * Copyright © 2007 Nokia Corporation. All rights reserved.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Richard Purdie <rpurdie@openedhand.com> 7 * Created by Richard Purdie <rpurdie@openedhand.com>
7 * 8 *
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 546d1538d076..9696ad9ef5f7 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 170d289ac785..a12b4f763373 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by Arjan van de Ven <arjanv@redhat.com> 7 * Created by Arjan van de Ven <arjanv@redhat.com>
7 * 8 *
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index b46661a42758..97fc45de6f81 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index ec3538413926..e0b76c87a91a 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index a113ecc3bafe..c4f8eef5ca68 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..ed78a3cf3cb0 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -222,16 +223,17 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 223 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 224
224 jffs2_free_raw_inode(ri); 225 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 226
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 228 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 229 f->inocache->pino_nlink, inode->i_mapping->nrpages));
230
231 d_instantiate(dentry, inode);
232 unlock_new_inode(inode);
230 return 0; 233 return 0;
231 234
232 fail: 235 fail:
233 make_bad_inode(inode); 236 iget_failed(inode);
234 iput(inode);
235 jffs2_free_raw_inode(ri); 237 jffs2_free_raw_inode(ri);
236 return ret; 238 return ret;
237} 239}
@@ -360,8 +362,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 362 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 363 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 364 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 365 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 366 goto fail;
365 } 367 }
366 368
367 /* We use f->target field to store the target path. */ 369 /* We use f->target field to store the target path. */
@@ -370,8 +372,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 372 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 373 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 374 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 375 ret = -ENOMEM;
374 return -ENOMEM; 376 goto fail;
375 } 377 }
376 378
377 memcpy(f->target, target, targetlen + 1); 379 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +388,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 388 jffs2_complete_reservation(c);
387 389
388 ret = jffs2_init_security(inode, dir_i); 390 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 391 if (ret)
390 jffs2_clear_inode(inode); 392 goto fail;
391 return ret; 393
392 }
393 ret = jffs2_init_acl_post(inode); 394 ret = jffs2_init_acl_post(inode);
394 if (ret) { 395 if (ret)
395 jffs2_clear_inode(inode); 396 goto fail;
396 return ret;
397 }
398 397
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 398 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 399 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 400 if (ret)
402 /* Eep. */ 401 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 402
407 rd = jffs2_alloc_raw_dirent(); 403 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 404 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 405 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 406 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 407 ret = -ENOMEM;
412 return -ENOMEM; 408 goto fail;
413 } 409 }
414 410
415 dir_f = JFFS2_INODE_INFO(dir_i); 411 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +433,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 433 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 434 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 435 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 436 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 437 goto fail;
442 } 438 }
443 439
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 440 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +449,12 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 449 jffs2_complete_reservation(c);
454 450
455 d_instantiate(dentry, inode); 451 d_instantiate(dentry, inode);
452 unlock_new_inode(inode);
456 return 0; 453 return 0;
454
455 fail:
456 iget_failed(inode);
457 return ret;
457} 458}
458 459
459 460
@@ -519,8 +520,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 520 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 521 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 522 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 523 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 524 goto fail;
524 } 525 }
525 /* No data here. Only a metadata node, which will be 526 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 527 obsoleted by the first data write
@@ -531,30 +532,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 532 jffs2_complete_reservation(c);
532 533
533 ret = jffs2_init_security(inode, dir_i); 534 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 535 if (ret)
535 jffs2_clear_inode(inode); 536 goto fail;
536 return ret; 537
537 }
538 ret = jffs2_init_acl_post(inode); 538 ret = jffs2_init_acl_post(inode);
539 if (ret) { 539 if (ret)
540 jffs2_clear_inode(inode); 540 goto fail;
541 return ret;
542 }
543 541
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 542 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 543 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 544 if (ret)
547 /* Eep. */ 545 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 546
552 rd = jffs2_alloc_raw_dirent(); 547 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 548 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 549 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 550 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 551 ret = -ENOMEM;
557 return -ENOMEM; 552 goto fail;
558 } 553 }
559 554
560 dir_f = JFFS2_INODE_INFO(dir_i); 555 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +577,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 577 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 578 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 579 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 580 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 581 goto fail;
587 } 582 }
588 583
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 584 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +594,12 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 594 jffs2_complete_reservation(c);
600 595
601 d_instantiate(dentry, inode); 596 d_instantiate(dentry, inode);
597 unlock_new_inode(inode);
602 return 0; 598 return 0;
599
600 fail:
601 iget_failed(inode);
602 return ret;
603} 603}
604 604
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +693,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 693 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 694 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 695 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 696 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 697 goto fail;
698 } 698 }
699 /* No data here. Only a metadata node, which will be 699 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 700 obsoleted by the first data write
@@ -705,30 +705,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 705 jffs2_complete_reservation(c);
706 706
707 ret = jffs2_init_security(inode, dir_i); 707 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 708 if (ret)
709 jffs2_clear_inode(inode); 709 goto fail;
710 return ret; 710
711 }
712 ret = jffs2_init_acl_post(inode); 711 ret = jffs2_init_acl_post(inode);
713 if (ret) { 712 if (ret)
714 jffs2_clear_inode(inode); 713 goto fail;
715 return ret;
716 }
717 714
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 715 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 716 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 717 if (ret)
721 /* Eep. */ 718 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 719
726 rd = jffs2_alloc_raw_dirent(); 720 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 721 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 722 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 723 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 724 ret = -ENOMEM;
731 return -ENOMEM; 725 goto fail;
732 } 726 }
733 727
734 dir_f = JFFS2_INODE_INFO(dir_i); 728 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +753,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 753 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 754 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 755 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 756 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 757 goto fail;
764 } 758 }
765 759
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 760 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +769,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 769 jffs2_complete_reservation(c);
776 770
777 d_instantiate(dentry, inode); 771 d_instantiate(dentry, inode);
778 772 unlock_new_inode(inode);
779 return 0; 773 return 0;
774
775 fail:
776 iget_failed(inode);
777 return ret;
780} 778}
781 779
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 780static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
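
The three creation paths above — jffs2_symlink(), jffs2_mkdir() and jffs2_mknod() — all get the same rework: every error branch now funnels into a single fail: label instead of duplicating the cleanup. The driver is the jffs2_new_inode() change in fs/jffs2/fs.c below: it switches to insert_inode_locked(), so the new inode arrives hashed and locked (I_NEW), and abandoning it must go through iget_failed() so concurrent lookups waiting on I_NEW are woken. A condensed sketch of the resulting shape, assuming that reading; jffs2_create_like, do_the_writes, mode and ri are illustrative placeholders, not names from the tree:

    static int jffs2_create_like(struct inode *dir_i, struct dentry *dentry,
                                 int mode, struct jffs2_raw_inode *ri)
    {
        struct inode *inode;
        int ret;

        /* comes back hashed and locked (I_NEW) after this series */
        inode = jffs2_new_inode(dir_i, mode, ri);
        if (IS_ERR(inode))
            return PTR_ERR(inode);

        ret = do_the_writes(inode);     /* any of the steps in the hunks above */
        if (ret)
            goto fail;

        d_instantiate(dentry, inode);
        unlock_new_inode(inode);        /* clear I_NEW, wake waiters */
        return 0;

    fail:
        iget_failed(inode);             /* marks bad, unlocks, drops the ref */
        return ret;
    }
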
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..abac961f617b 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -103,9 +104,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 104 jffs2_erase_failed(c, jeb, bad_offset);
104} 105}
105 106
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 107int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 108{
108 struct jffs2_eraseblock *jeb; 109 struct jffs2_eraseblock *jeb;
110 int work_done = 0;
109 111
110 mutex_lock(&c->erase_free_sem); 112 mutex_lock(&c->erase_free_sem);
111 113
@@ -121,6 +123,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 123 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 124 jffs2_mark_erased_block(c, jeb);
123 125
126 work_done++;
124 if (!--count) { 127 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 128 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 129 goto done;
@@ -157,6 +160,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 160 mutex_unlock(&c->erase_free_sem);
158 done: 161 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 162 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
163 return work_done;
160} 164}
161 165
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 166static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +169,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 169 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 170 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 171 list_move_tail(&jeb->list, &c->erase_complete_list);
172 /* Wake the GC thread to mark them clean */
173 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 174 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 175 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 176 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 177}
173 178
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 179static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +492,9 @@ filebad:
487 492
488refile: 493refile:
489 /* Stick it back on the list from whence it came and come back later */ 494 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 495 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 496 spin_lock(&c->erase_completion_lock);
497 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 498 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 499 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 500 mutex_unlock(&c->erase_free_sem);
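
jffs2_erase_pending_blocks() now reports whether it erased anything, so callers can treat that as forward progress — the gc.c hunk below uses exactly this. A sketch of the new contract, taken from that hunk:

    /* returns the number of blocks erased; nonzero means progress */
    if (jffs2_erase_pending_blocks(c, 1)) {
        mutex_unlock(&c->alloc_sem);
        return 0;       /* erasing counts as this GC pass's work */
    }
    /* nothing erased: fall through and garbage-collect for real */

The companion change in jffs2_erase_succeeded() stops re-triggering kupdated and instead pokes the GC thread and wakes erase_wait sleepers directly.
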
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..1c0a08d711aa 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -26,9 +27,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 27 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 28static int jffs2_readpage (struct file *filp, struct page *pg);
28 29
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 30int jffs2_fsync(struct file *filp, int datasync)
30{ 31{
31 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 33 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 34
34 /* Trigger GC to flush any pending writes for this inode */ 35 /* Trigger GC to flush any pending writes for this inode */
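
This is the 2.6.35 ->fsync() prototype change: the struct dentry * argument is gone, and implementations recover the inode from the file itself. The shape every converted filesystem in this merge follows (my_fsync and my_flush are illustrative placeholders):

    int my_fsync(struct file *filp, int datasync)
    {
        struct inode *inode = filp->f_mapping->host;   /* was dentry->d_inode */

        return my_flush(inode, datasync);
    }
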
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..6b2964a19850 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -169,13 +170,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 170 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 171 jffs2_complete_reservation(c);
171 172
172 /* We have to do the vmtruncate() without f->sem held, since 173 /* We have to do the truncate_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 174 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 175 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 176 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 177 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 178 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 179 truncate_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 180 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 181 }
181 182
@@ -225,7 +226,7 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
225} 226}
226 227
227 228
228void jffs2_clear_inode (struct inode *inode) 229void jffs2_evict_inode (struct inode *inode)
229{ 230{
230 /* We can forget about this inode for now - drop all 231 /* We can forget about this inode for now - drop all
231 * the nodelists associated with it, etc. 232 * the nodelists associated with it, etc.
@@ -233,7 +234,9 @@ void jffs2_clear_inode (struct inode *inode)
233 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 234 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
234 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); 235 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
235 236
236 D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); 237 D1(printk(KERN_DEBUG "jffs2_evict_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
238 truncate_inode_pages(&inode->i_data, 0);
239 end_writeback(inode);
237 jffs2_do_clear_inode(c, f); 240 jffs2_do_clear_inode(c, f);
238} 241}
239 242
@@ -313,8 +316,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 316 case S_IFBLK:
314 case S_IFCHR: 317 case S_IFCHR:
315 /* Read the device numbers from the media */ 318 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 319 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 320 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 321 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 322 goto error_io;
320 } 323 }
@@ -325,10 +328,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 328 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 329 goto error;
327 } 330 }
328 if (f->metadata->size == sizeof(jdev.old)) 331 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 332 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 333 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 334 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 335
333 case S_IFSOCK: 336 case S_IFSOCK:
334 case S_IFIFO: 337 case S_IFIFO:
@@ -465,7 +468,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 468 inode->i_blocks = 0;
466 inode->i_size = 0; 469 inode->i_size = 0;
467 470
468 insert_inode_hash(inode); 471 if (insert_inode_locked(inode) < 0) {
472 make_bad_inode(inode);
473 unlock_new_inode(inode);
474 iput(inode);
475 return ERR_PTR(-EINVAL);
476 }
469 477
470 return inode; 478 return inode;
471} 479}
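
Two VFS API moves land in this file. First, .clear_inode becomes .evict_inode (2.6.36): the filesystem is now responsible for dropping the page cache and ending inode writeback itself. The generic skeleton, sketched with placeholder names (my_evict_inode, fs_teardown):

    void my_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);  /* drop cached pages */
        end_writeback(inode);                     /* replaces clear_inode() */
        fs_teardown(inode);                       /* fs-specific, e.g. jffs2_do_clear_inode() */
    }

Second, jffs2_new_inode() switches from insert_inode_hash() to insert_inode_locked(), which fails if an inode with the same number is already hashed; that is what makes the iget_failed()-based error paths in dir.c above both necessary and sufficient.
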
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..846a79452497 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
@@ -214,6 +215,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 215 return ret;
215 } 216 }
216 217
218 /* If there are any blocks which need erasing, erase them now */
219 if (!list_empty(&c->erase_complete_list) ||
220 !list_empty(&c->erase_pending_list)) {
221 spin_unlock(&c->erase_completion_lock);
222 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
223 if (jffs2_erase_pending_blocks(c, 1)) {
224 mutex_unlock(&c->alloc_sem);
225 return 0;
226 }
227 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
228 spin_lock(&c->erase_completion_lock);
229 }
230
217 /* First, work out which block we're garbage-collecting */ 231 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 232 jeb = c->gcblock;
219 233
@@ -222,7 +236,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 236
223 if (!jeb) { 237 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 238 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 239 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 240 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 241 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 242 return -EAGAIN;
@@ -435,7 +449,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 449 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 450 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 451 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 452 jffs2_garbage_collect_trigger(c);
439 } 453 }
440 spin_unlock(&c->erase_completion_lock); 454 spin_unlock(&c->erase_completion_lock);
441 455
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index 9d41f43e47bb..859a598af020 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index c6923da98263..2e4a86763c07 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 85ef6dbb1be7..6784bc89add1 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -2,6 +2,7 @@
2 * JFFS2 -- Journalling Flash File System, Version 2. 2 * JFFS2 -- Journalling Flash File System, Version 2.
3 * 3 *
4 * Copyright © 2001-2007 Red Hat, Inc. 4 * Copyright © 2001-2007 Red Hat, Inc.
5 * Copyright © 2004-2010 David Woodhouse <dwmw2@infradead.org>
5 * 6 *
6 * Created by David Woodhouse <dwmw2@infradead.org> 7 * Created by David Woodhouse <dwmw2@infradead.org>
7 * 8 *
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..523a91691052 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -24,7 +24,6 @@
24#ifdef __ECOS 24#ifdef __ECOS
25#include "os-ecos.h" 25#include "os-ecos.h"
26#else 26#else
27#include <linux/mtd/compatmac.h> /* For compatibility with older kernels */
28#include "os-linux.h" 27#include "os-linux.h"
29#endif 28#endif
30 29
@@ -312,11 +311,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 311static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 312{
314 if (old_valid_dev(rdev)) { 313 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 314 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 315 return sizeof(jdev->old_id);
317 } else { 316 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 317 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 318 return sizeof(jdev->new_id);
320 } 319 }
321} 320}
322 321
@@ -464,7 +463,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 463int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 464
466/* erase.c */ 465/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 466int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 467void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 468
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 469#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 191359dde4e1..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
116 116
117 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
118 118
119 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
120 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
121 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
122 return ret; 134 return ret;
123 135
124 cond_resched(); 136 cond_resched();
@@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
217 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
218 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
219 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
220 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
221 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
222 ejeb->offset)); 234 ejeb->offset));
223 } 235 }
@@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
469void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
470{ 482{
471 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
472 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
473 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
474} 488}
475 489
@@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
611 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
612 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
613 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
614 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
615 } else { 629 } else {
616 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
617 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
732 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
733 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
734 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
735 if (c->unchecked_size) { 753 if (c->unchecked_size) {
736 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
737 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
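
The -EAGAIN branch of jffs2_reserve_space() now sleeps until an in-flight erase finishes instead of erasing synchronously. It is the classic open-coded wait: queue the current task, mark it unready, drop the lock, schedule. The waker is the wake_up(&c->erase_wait) added in erase.c above. A minimal sketch of the pattern; note the hunk shown cuts off at schedule(), and the canonical form pairs it with remove_wait_queue() afterwards:

    DECLARE_WAITQUEUE(wait, current);           /* on-stack wait entry */

    set_current_state(TASK_UNINTERRUPTIBLE);
    add_wait_queue(&c->erase_wait, &wait);
    spin_unlock(&c->erase_completion_lock);     /* never sleep under a spinlock */

    schedule();                                 /* until wake_up(&c->erase_wait) */
    remove_wait_queue(&c->erase_wait, &wait);
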
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..00bae7cc2e48 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
@@ -159,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
159extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
160extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
161extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
162int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
163int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
164 163
165/* ioctl.c */ 164/* ioctl.c */
@@ -172,7 +171,7 @@ extern const struct inode_operations jffs2_symlink_inode_operations;
172int jffs2_setattr (struct dentry *, struct iattr *); 171int jffs2_setattr (struct dentry *, struct iattr *);
173int jffs2_do_setattr (struct inode *, struct iattr *); 172int jffs2_do_setattr (struct inode *, struct iattr *);
174struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
175void jffs2_clear_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
176void jffs2_dirty_inode(struct inode *inode); 175void jffs2_dirty_inode(struct inode *inode);
177struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
178 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..662bba099501 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
@@ -137,7 +135,7 @@ static const struct super_operations jffs2_super_operations =
137 .write_super = jffs2_write_super, 135 .write_super = jffs2_write_super,
138 .statfs = jffs2_statfs, 136 .statfs = jffs2_statfs,
139 .remount_fs = jffs2_remount_fs, 137 .remount_fs = jffs2_remount_fs,
140 .clear_inode = jffs2_clear_inode, 138 .evict_inode = jffs2_evict_inode,
141 .dirty_inode = jffs2_dirty_inode, 139 .dirty_inode = jffs2_dirty_inode,
142 .sync_fs = jffs2_sync_fs, 140 .sync_fs = jffs2_sync_fs,
143}; 141};
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..9b572ca40a49 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -588,7 +588,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
588 588
589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 589void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
590{ 590{
591 /* It's called from jffs2_clear_inode() on inode removing. 591 /* It's called from jffs2_evict_inode() on inode removing.
592 When an inode with XATTR is removed, those XATTRs must be removed. */ 592 When an inode with XATTR is removed, those XATTRs must be removed. */
593 struct jffs2_xattr_ref *ref, *_ref; 593 struct jffs2_xattr_ref *ref, *_ref;
594 594
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
626 626
627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) 627static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
628{ 628{
 629 /* success of check_xattr_ref_inode() means taht inode (ic) dose not have 629 /* success of check_xattr_ref_inode() means that inode (ic) does not have
630 * duplicate name/value pairs. If duplicate name/value pair would be found, 630 * duplicate name/value pairs. If duplicate name/value pair would be found,
631 * one will be removed. 631 * one will be removed.
632 */ 632 */
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
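
All of the jffs2 xattr handlers above pick up const, matching the 2.6.35 constification of struct xattr_handler throughout the VFS. After the change, a handler and its table are declared as read-only data, e.g. (my_* names are placeholders):

    static const struct xattr_handler my_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .list   = my_listxattr,
        .get    = my_getxattr,
        .set    = my_setxattr,
    };

    const struct xattr_handler *my_xattr_handlers[] = {   /* hung off sb->s_xattr */
        &my_user_xattr_handler,
        NULL,                                              /* terminator */
    };
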
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..c5ce6c1d1ff4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -17,6 +17,7 @@
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20#include <linux/mm.h>
20#include <linux/fs.h> 21#include <linux/fs.h>
21#include <linux/quotaops.h> 22#include <linux/quotaops.h>
22#include "jfs_incore.h" 23#include "jfs_incore.h"
@@ -27,9 +28,9 @@
27#include "jfs_acl.h" 28#include "jfs_acl.h"
28#include "jfs_debug.h" 29#include "jfs_debug.h"
29 30
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 31int jfs_fsync(struct file *file, int datasync)
31{ 32{
32 struct inode *inode = dentry->d_inode; 33 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 34 int rc = 0;
34 35
35 if (!(inode->i_state & I_DIRTY) || 36 if (!(inode->i_state & I_DIRTY) ||
@@ -98,7 +99,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
98 if (rc) 99 if (rc)
99 return rc; 100 return rc;
100 101
101 if (iattr->ia_valid & ATTR_SIZE) 102 if (is_quota_modification(inode, iattr))
102 dquot_initialize(inode); 103 dquot_initialize(inode);
103 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 104 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
104 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { 105 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
@@ -107,11 +108,18 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
107 return rc; 108 return rc;
108 } 109 }
109 110
110 rc = inode_setattr(inode, iattr); 111 if ((iattr->ia_valid & ATTR_SIZE) &&
112 iattr->ia_size != i_size_read(inode)) {
113 rc = vmtruncate(inode, iattr->ia_size);
114 if (rc)
115 return rc;
116 }
117
118 setattr_copy(inode, iattr);
119 mark_inode_dirty(inode);
111 120
112 if (!rc && (iattr->ia_valid & ATTR_MODE)) 121 if (iattr->ia_valid & ATTR_MODE)
113 rc = jfs_acl_chmod(inode); 122 rc = jfs_acl_chmod(inode);
114
115 return rc; 123 return rc;
116} 124}
117 125
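
jfs_setattr() is converted away from the removed inode_setattr() helper: truncation is done explicitly, then attributes are copied and the inode dirtied. Quota initialization is also keyed on is_quota_modification() (size, uid or gid changes) rather than ATTR_SIZE alone. The replacement sequence, isolated from the hunk above:

    if ((iattr->ia_valid & ATTR_SIZE) &&
        iattr->ia_size != i_size_read(inode)) {
        rc = vmtruncate(inode, iattr->ia_size);   /* explicit truncate */
        if (rc)
            return rc;
    }
    setattr_copy(inode, iattr);    /* copy uid/gid/times/mode into the inode */
    mark_inode_dirty(inode);
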
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index ed9ba6fe04f5..9978803ceedc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,31 +145,32 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
145 return 0; 145 return 0;
146} 146}
147 147
148void jfs_delete_inode(struct inode *inode) 148void jfs_evict_inode(struct inode *inode)
149{ 149{
150 jfs_info("In jfs_delete_inode, inode = 0x%p", inode); 150 jfs_info("In jfs_evict_inode, inode = 0x%p", inode);
151 151
152 if (!is_bad_inode(inode)) 152 if (!inode->i_nlink && !is_bad_inode(inode)) {
153 dquot_initialize(inode); 153 dquot_initialize(inode);
154 154
155 if (!is_bad_inode(inode) && 155 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
156 (JFS_IP(inode)->fileset == FILESYSTEM_I)) { 156 truncate_inode_pages(&inode->i_data, 0);
157 truncate_inode_pages(&inode->i_data, 0);
158 157
159 if (test_cflag(COMMIT_Freewmap, inode)) 158 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 159 jfs_free_zero_link(inode);
161 160
162 diFree(inode); 161 diFree(inode);
163 162
164 /* 163 /*
165 * Free the inode from the quota allocation. 164 * Free the inode from the quota allocation.
166 */ 165 */
167 dquot_initialize(inode); 166 dquot_initialize(inode);
168 dquot_free_inode(inode); 167 dquot_free_inode(inode);
169 dquot_drop(inode); 168 }
169 } else {
170 truncate_inode_pages(&inode->i_data, 0);
170 } 171 }
171 172 end_writeback(inode);
172 clear_inode(inode); 173 dquot_drop(inode);
173} 174}
174 175
175void jfs_dirty_inode(struct inode *inode) 176void jfs_dirty_inode(struct inode *inode)
@@ -303,8 +304,17 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
303 loff_t pos, unsigned len, unsigned flags, 304 loff_t pos, unsigned len, unsigned flags,
304 struct page **pagep, void **fsdata) 305 struct page **pagep, void **fsdata)
305{ 306{
306 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 307 int ret;
308
309 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
307 jfs_get_block); 310 jfs_get_block);
311 if (unlikely(ret)) {
312 loff_t isize = mapping->host->i_size;
313 if (pos + len > isize)
314 vmtruncate(mapping->host, isize);
315 }
316
317 return ret;
308} 318}
309 319
310static sector_t jfs_bmap(struct address_space *mapping, sector_t block) 320static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
@@ -317,9 +327,24 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
317{ 327{
318 struct file *file = iocb->ki_filp; 328 struct file *file = iocb->ki_filp;
319 struct inode *inode = file->f_mapping->host; 329 struct inode *inode = file->f_mapping->host;
330 ssize_t ret;
320 331
321 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 332 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
322 offset, nr_segs, jfs_get_block, NULL); 333 offset, nr_segs, jfs_get_block, NULL);
334
335 /*
336 * In case of error extending write may have instantiated a few
337 * blocks outside i_size. Trim these off again.
338 */
339 if (unlikely((rw & WRITE) && ret < 0)) {
340 loff_t isize = i_size_read(inode);
341 loff_t end = offset + iov_length(iov, nr_segs);
342
343 if (end > isize)
344 vmtruncate(inode, isize);
345 }
346
347 return ret;
323} 348}
324 349
325const struct address_space_operations jfs_aops = { 350const struct address_space_operations jfs_aops = {
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9e2f6a721668..c92ea3b3ea5e 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2438,7 +2438,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2438 2438
2439 /* check if this is a control page update for an allocation. 2439 /* check if this is a control page update for an allocation.
2440 * if so, update the leaf to reflect the new leaf value using 2440 * if so, update the leaf to reflect the new leaf value using
2441 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2441 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2442 * the leaf with the new value. in addition to updating the 2442 * the leaf with the new value. in addition to updating the
2443 * leaf, dbSplit() will also split the binary buddy system of 2443 * leaf, dbSplit() will also split the binary buddy system of
2444 * the leaves, if required, and bubble new values within the 2444 * the leaves, if required, and bubble new values within the
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
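
The open-coded owner setup in ialloc() collapses into inode_init_owner(), the helper added in 2.6.35. Roughly what the helper does internally (see fs/inode.c for the real thing) — owner from the caller's credentials, group and setgid bit inherited from a setgid parent directory:

    inode->i_uid = current_fsuid();
    if (dir && dir->i_mode & S_ISGID) {
        inode->i_gid = dir->i_gid;        /* inherit group from setgid dir */
        if (S_ISDIR(mode))
            mode |= S_ISGID;              /* and propagate setgid to subdirs */
    } else
        inode->i_gid = current_fsgid();
    inode->i_mode = mode;

This also explains the jfs_inode->mode2 |= inode->i_mode change: the possibly-modified mode now lives in the inode, not in the local variable.
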
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..155e91eff07d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,13 +21,13 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode *, struct writeback_control *); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_delete_inode(struct inode *); 30extern void jfs_evict_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
33extern void jfs_truncate_nolock(struct inode *, loff_t); 33extern void jfs_truncate_nolock(struct inode *, loff_t);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..ec8c3e4baca3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -132,11 +132,6 @@ static void jfs_destroy_inode(struct inode *inode)
132 kmem_cache_free(jfs_inode_cachep, ji); 132 kmem_cache_free(jfs_inode_cachep, ji);
133} 133}
134 134
135static void jfs_clear_inode(struct inode *inode)
136{
137 dquot_drop(inode);
138}
139
140static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) 135static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
141{ 136{
142 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); 137 struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
@@ -179,6 +174,8 @@ static void jfs_put_super(struct super_block *sb)
179 174
180 jfs_info("In jfs_put_super"); 175 jfs_info("In jfs_put_super");
181 176
177 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
178
182 lock_kernel(); 179 lock_kernel();
183 180
184 rc = jfs_umount(sb); 181 rc = jfs_umount(sb);
@@ -396,10 +393,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 393
397 JFS_SBI(sb)->flag = flag; 394 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 395 ret = jfs_mount_rw(sb, 1);
396
397 /* mark the fs r/w for quota activity */
398 sb->s_flags &= ~MS_RDONLY;
399
399 unlock_kernel(); 400 unlock_kernel();
401 dquot_resume(sb, -1);
400 return ret; 402 return ret;
401 } 403 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 404 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
405 rc = dquot_suspend(sb, -1);
406 if (rc < 0) {
407 unlock_kernel();
408 return rc;
409 }
403 rc = jfs_umount_rw(sb); 410 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 411 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 412 unlock_kernel();
@@ -469,6 +476,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 476 */
470 sb->s_op = &jfs_super_operations; 477 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 478 sb->s_export_op = &jfs_export_operations;
479#ifdef CONFIG_QUOTA
480 sb->dq_op = &dquot_operations;
481 sb->s_qcop = &dquot_quotactl_ops;
482#endif
472 483
473 /* 484 /*
474 * Initialize direct-mapping inode/address-space 485 * Initialize direct-mapping inode/address-space
@@ -749,8 +760,7 @@ static const struct super_operations jfs_super_operations = {
749 .destroy_inode = jfs_destroy_inode, 760 .destroy_inode = jfs_destroy_inode,
750 .dirty_inode = jfs_dirty_inode, 761 .dirty_inode = jfs_dirty_inode,
751 .write_inode = jfs_write_inode, 762 .write_inode = jfs_write_inode,
752 .delete_inode = jfs_delete_inode, 763 .evict_inode = jfs_evict_inode,
753 .clear_inode = jfs_clear_inode,
754 .put_super = jfs_put_super, 764 .put_super = jfs_put_super,
755 .sync_fs = jfs_sync_fs, 765 .sync_fs = jfs_sync_fs,
756 .freeze_fs = jfs_freeze, 766 .freeze_fs = jfs_freeze,
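
jfs moves to the generic quota machinery: fill_super wires up dquot_operations and dquot_quotactl_ops, put_super disables quotas, and remount brackets the r/w transition with the suspend/resume helpers. The remount pattern, reduced to its shape (remounting_rw, go_rw and go_ro stand in for the surrounding logic and jfs_mount_rw()/jfs_umount_rw()):

    if (remounting_rw) {
        ret = go_rw(sb);
        sb->s_flags &= ~MS_RDONLY;   /* quotas need a writable sb */
        dquot_resume(sb, -1);        /* -1 == all quota types */
    } else {
        rc = dquot_suspend(sb, -1);  /* must succeed before going r/o */
        if (rc < 0)
            return rc;
        rc = go_ro(sb);
    }
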
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fa96bbb26343..2d7f165d0f1d 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -86,46 +86,25 @@ struct ea_buffer {
86#define EA_MALLOC 0x0008 86#define EA_MALLOC 0x0008
87 87
88 88
89static int is_known_namespace(const char *name)
90{
91 if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
92 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
93 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
94 strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
95 return false;
96
97 return true;
98}
99
89/* 100/*
90 * These three routines are used to recognize on-disk extended attributes 101 * These three routines are used to recognize on-disk extended attributes
91 * that are in a recognized namespace. If the attribute is not recognized, 102 * that are in a recognized namespace. If the attribute is not recognized,
92 * "os2." is prepended to the name 103 * "os2." is prepended to the name
93 */ 104 */
94static inline int is_os2_xattr(struct jfs_ea *ea) 105static int is_os2_xattr(struct jfs_ea *ea)
95{ 106{
96 /* 107 return !is_known_namespace(ea->name);
97 * Check for "system."
98 */
99 if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
100 !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
101 return false;
102 /*
103 * Check for "user."
104 */
105 if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
106 !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
107 return false;
108 /*
109 * Check for "security."
110 */
111 if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
112 !strncmp(ea->name, XATTR_SECURITY_PREFIX,
113 XATTR_SECURITY_PREFIX_LEN))
114 return false;
115 /*
116 * Check for "trusted."
117 */
118 if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
119 !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
120 return false;
121 /*
122 * Add any other valid namespace prefixes here
123 */
124
125 /*
126 * We assume it's OS/2's flat namespace
127 */
128 return true;
129} 108}
130 109
131static inline int name_size(struct jfs_ea *ea) 110static inline int name_size(struct jfs_ea *ea)
@@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
764 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 743 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
765 return can_set_system_xattr(inode, name, value, value_len); 744 return can_set_system_xattr(inode, name, value, value_len);
766 745
746 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
747 /*
748 * This makes sure that we aren't trying to set an
749 * attribute in a different namespace by prefixing it
750 * with "os2."
751 */
752 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
753 return -EOPNOTSUPP;
754 return 0;
755 }
756
767 /* 757 /*
768 * Don't allow setting an attribute in an unknown namespace. 758 * Don't allow setting an attribute in an unknown namespace.
769 */ 759 */
770 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && 760 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
771 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && 761 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
772 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && 762 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
773 strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
774 return -EOPNOTSUPP; 763 return -EOPNOTSUPP;
775 764
776 return 0; 765 return 0;
@@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
952 int xattr_size; 941 int xattr_size;
953 ssize_t size; 942 ssize_t size;
954 int namelen = strlen(name); 943 int namelen = strlen(name);
955 char *os2name = NULL;
956 char *value; 944 char *value;
957 945
958 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
959 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
960 GFP_KERNEL);
961 if (!os2name)
962 return -ENOMEM;
963 strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
964 name = os2name;
965 namelen -= XATTR_OS2_PREFIX_LEN;
966 }
967
968 down_read(&JFS_IP(inode)->xattr_sem); 946 down_read(&JFS_IP(inode)->xattr_sem);
969 947
970 xattr_size = ea_get(inode, &ea_buf, 0); 948 xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
1002 out: 980 out:
1003 up_read(&JFS_IP(inode)->xattr_sem); 981 up_read(&JFS_IP(inode)->xattr_sem);
1004 982
1005 kfree(os2name);
1006
1007 return size; 983 return size;
1008} 984}
1009 985
@@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
1012{ 988{
1013 int err; 989 int err;
1014 990
991 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
992 /*
993 * skip past "os2." prefix
994 */
995 name += XATTR_OS2_PREFIX_LEN;
996 /*
997 * Don't allow retrieving properly prefixed attributes
998 * by prepending them with "os2."
999 */
1000 if (is_known_namespace(name))
1001 return -EOPNOTSUPP;
1002 }
1003
1015 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); 1004 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
1016 1005
1017 return err; 1006 return err;
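
Net effect of the jfs "os2." rework, seen from userspace: legacy flat OS/2 names are still reached through the prefix, but the prefix can no longer be used to smuggle a name from a real namespace past the namespace checks. An illustrative fragment (the attribute names are made up):

    #include <sys/xattr.h>

    /* legacy flat name: "os2." is stripped and "someattr" looked up */
    getxattr(path, "os2.someattr", buf, sizeof(buf));

    /* "user.foo" is a known namespace: now rejected with EOPNOTSUPP */
    getxattr(path, "os2.user.foo", buf, sizeof(buf));

The same rule is enforced symmetrically for setxattr() via can_set_xattr().
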
diff --git a/fs/libfs.c b/fs/libfs.c
index ea9a6cc9b35c..0a9da95317f7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
 	return NULL;
 }

-int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	return 0;
-}
-
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
 	static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 };

 const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,39 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return 0;
 }

+/**
+ * simple_setattr - setattr for simple filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr is a simple ->setattr implementation without a proper
+ * implementation of size changes.
+ *
+ * It can either be used for in-memory filesystems or special files
+ * on simple regular filesystems.  Anything that needs to change on-disk
+ * or wire state on size changes needs its own setattr method.
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	WARN_ON_ONCE(inode->i_op->truncate);
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	if (iattr->ia_valid & ATTR_SIZE)
+		truncate_setsize(inode, iattr->ia_size);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(simple_setattr);
+
 int simple_readpage(struct file *file, struct page *page)
 {
 	clear_highpage(page);
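
For a filesystem that keeps everything in the page cache, wiring the new helper up is the whole job. A minimal sketch (the ops table name is illustrative; ramfs-style filesystems are the intended users):

	/* Hypothetical user of the new helper. */
	static const struct inode_operations myfs_file_inode_operations = {
		.setattr	= simple_setattr,	/* shrinks/extends pagecache only */
		.getattr	= simple_getattr,
	};

The WARN_ON_ONCE(inode->i_op->truncate) documents the contract: a filesystem that still relies on the old ->truncate callback must not use simple_setattr, because truncate_setsize() already performs the pagecache truncation that ->truncate used to be called for.
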
@@ -418,7 +447,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
  * unique inode values later for this filesystem, then you must take care
  * to pass it an appropriate max_reserved value to avoid collisions.
  */
-int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
+int simple_fill_super(struct super_block *s, unsigned long magic,
+		      struct tree_descr *files)
 {
 	struct inode *inode;
 	struct dentry *root;
@@ -547,6 +577,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
 }

 /**
+ * simple_write_to_buffer - copy data from user space to the buffer
+ * @to: the buffer to write to
+ * @available: the size of the buffer
+ * @ppos: the current position in the buffer
+ * @from: the user space buffer to read from
+ * @count: the maximum number of bytes to read
+ *
+ * The simple_write_to_buffer() function reads up to @count bytes from the user
+ * space address starting at @from into the buffer @to at offset @ppos.
+ *
+ * On success, the number of bytes written is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
+ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
+		const void __user *from, size_t count)
+{
+	loff_t pos = *ppos;
+	size_t res;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= available || !count)
+		return 0;
+	if (count > available - pos)
+		count = available - pos;
+	res = copy_from_user(to + pos, from, count);
+	if (res == count)
+		return -EFAULT;
+	count -= res;
+	*ppos = pos + count;
+	return count;
+}
+
+/**
  * memory_read_from_buffer - copy data from the buffer
  * @to: the kernel space buffer to read to
  * @count: the maximum number of bytes to read
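
simple_write_to_buffer() is the write-side twin of simple_read_from_buffer() and is aimed at the same audience: small pseudo-files whose backing store is a plain kernel buffer. A hedged usage sketch (names invented for illustration):

	/* Hypothetical debugfs-style handler backed by a fixed buffer. */
	static char cmd_buf[64];

	static ssize_t cmd_write(struct file *file, const char __user *ubuf,
				 size_t count, loff_t *ppos)
	{
		/* Returns bytes copied and advances *ppos, or -EFAULT/-EINVAL. */
		return simple_write_to_buffer(cmd_buf, sizeof(cmd_buf), ppos,
					      ubuf, count);
	}

Note the partial-copy behaviour: if copy_from_user() faults part-way through, the bytes that did arrive are still counted and the position still advances; only a complete failure yields -EFAULT.
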
@@ -817,13 +881,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);

-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * @file: file to synchronize
+ * @datasync: only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int generic_file_fsync(struct file *file, int datasync)
 {
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0, /* metadata-only; caller takes care of data */
 	};
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 	int ret;

@@ -838,7 +911,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
 		ret = err;
 	return ret;
 }
-EXPORT_SYMBOL(simple_fsync);
+EXPORT_SYMBOL(generic_file_fsync);
+
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, int datasync)
+{
+	return 0;
+}

 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -861,9 +942,10 @@ EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
 EXPORT_SYMBOL(simple_rmdir);
 EXPORT_SYMBOL(simple_statfs);
-EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(noop_fsync);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(simple_write_to_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
 EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
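
Two renames land together here: simple_sync_file becomes noop_fsync (it truly does nothing) and simple_fsync becomes generic_file_fsync, at the same time as ->fsync loses its struct dentry * argument across the tree. A hedged before/after sketch of what an affected filesystem has to do (myfs is a stand-in name):

	/* old prototype: int (*fsync)(struct file *, struct dentry *, int); */
	/* new prototype: int (*fsync)(struct file *, int datasync);         */
	static int myfs_fsync(struct file *file, int datasync)
	{
		/* The inode now comes from the file, not from a dentry;
		 * filesystem-specific work would go here. */
		return generic_file_fsync(file, datasync);
	}

In-memory filesystems that used simple_sync_file just switch their file_operations entry to noop_fsync, as simple_dir_operations does above.
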
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 243c00071f76..9bd2ce2a3040 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -303,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
 	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
 }

+static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	return 0;
+}
+
 static const struct logfs_device_ops bd_devops = {
 	.find_first_sb	= bdev_find_first_sb,
 	.find_last_sb	= bdev_find_last_sb,
@@ -310,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
 	.readpage	= bdev_readpage,
 	.writeseg	= bdev_writeseg,
 	.erase		= bdev_erase,
+	.can_write_buf	= bdev_can_write_buf,
 	.sync		= bdev_sync,
 	.put_device	= bdev_put_device,
 };
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05b..a85d47d13e4b 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
 #include <linux/completion.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/slab.h>

 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))

@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)

 	err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
 			page_address(page));
-	if (err == -EUCLEAN) {
+	if (err == -EUCLEAN || err == -EBADMSG) {
+		/* -EBADMSG happens regularly on power failures */
 		err = 0;
 		/* FIXME: force GC this segment */
 	}
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
 	put_mtd_device(logfs_super(sb)->s_mtd);
 }

+static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *buf;
+	int err;
+
+	buf = kmalloc(super->s_writesize, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	err = mtd_read(sb, ofs, super->s_writesize, buf);
+	if (err)
+		goto out;
+	if (memchr_inv(buf, 0xff, super->s_writesize))
+		err = -EIO;
+	kfree(buf);
+out:
+	return err;
+}
+
 static const struct logfs_device_ops mtd_devops = {
 	.find_first_sb	= mtd_find_first_sb,
 	.find_last_sb	= mtd_find_last_sb,
 	.readpage	= mtd_readpage,
 	.writeseg	= mtd_writeseg,
 	.erase		= mtd_erase,
+	.can_write_buf	= mtd_can_write_buf,
 	.sync		= mtd_sync,
 	.put_device	= mtd_put_device,
 };
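
The MTD probe decides whether the pending write buffer may still be flushed at ofs: memchr_inv() returns a pointer to the first byte that differs from the given pattern, or NULL when the whole range matches, so a NULL result here means the region is still erased flash (all 0xff) and the wbuf can be replayed. A hedged standalone illustration of that idiom (assuming the same memchr_inv() helper that logfs uses here):

	/* Illustrative only: true iff the region is fully erased (0xff). */
	static bool region_is_erased(const u8 *region, size_t len)
	{
		return memchr_inv(region, 0xff, len) == NULL;
	}

The block-device implementation above returns 0 unconditionally because a block device can always overwrite in place; only flash with a write-once granularity of s_writesize needs the probe.
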
@@ -250,5 +272,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
 	const struct logfs_device_ops *devops = &mtd_devops;

 	mtd = get_mtd_device(NULL, mtdnr);
+	if (IS_ERR(mtd))
+		return PTR_ERR(mtd);
 	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
 }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 2396a85c0f55..9777eb5b5522 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -12,7 +12,7 @@
  * Atomic dir operations
  *
  * Directory operations are by default not atomic. Dentries and Inodes are
- * created/removed/altered in seperate operations. Therefore we need to do
+ * created/removed/altered in separate operations. Therefore we need to do
  * a small amount of journaling.
  *
  * Create, link, mkdir, mknod and symlink all share the same function to do
@@ -434,8 +434,11 @@ static int __logfs_create(struct inode *dir, struct dentry *dentry,
 	int ret;

 	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
-	if (!ta)
+	if (!ta) {
+		inode->i_nlink--;
+		iput(inode);
 		return -ENOMEM;
+	}

 	ta->state = CREATE_1;
 	ta->ino = inode->i_ino;
@@ -821,7 +824,7 @@ const struct inode_operations logfs_dir_iops = {
 };
 const struct file_operations logfs_dir_fops = {
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.readdir	= logfs_readdir,
 	.read		= generic_read_dir,
 };
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933e..e86376b87af1 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)

 static void logfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	move_page_to_btree(page);
+	struct logfs_block *block = logfs_block(page);
+
+	if (block->reserved_bytes) {
+		struct super_block *sb = page->mapping->host->i_sb;
+		struct logfs_super *super = logfs_super(sb);
+
+		super->s_dirty_pages -= block->reserved_bytes;
+		block->ops->free_block(sb, block);
+		BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
+	} else
+		move_page_to_btree(page);
 	BUG_ON(PagePrivate(page) || page->private);
 }

@@ -171,9 +181,9 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
 }


-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg)
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct logfs_inode *li = logfs_inode(inode);
 	unsigned int oldflags, flags;
 	int err;
@@ -209,13 +219,11 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 	}
 }

-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int logfs_fsync(struct file *file, int datasync)
 {
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct logfs_super *super = logfs_super(sb);
+	struct super_block *sb = file->f_mapping->host->i_sb;

-	/* FIXME: write anchor */
-	super->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }

@@ -224,15 +232,19 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int err = 0;

-	if (attr->ia_valid & ATTR_SIZE)
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (attr->ia_valid & ATTR_SIZE) {
 		err = logfs_truncate(inode, attr->ia_size);
-	attr->ia_valid &= ~ATTR_SIZE;
+		if (err)
+			return err;
+	}

-	if (!err)
-		err = inode_change_ok(inode, attr);
-	if (!err)
-		err = inode_setattr(inode, attr);
-	return err;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }

 const struct inode_operations logfs_reg_iops = {
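
The reshaped logfs_setattr() is the new-world ->setattr pattern introduced with this merge: validate first with inode_change_ok(), perform the filesystem's own size change, then copy the remaining attributes with setattr_copy() and dirty the inode; inode_setattr() is gone. A hedged template for any filesystem following suit (myfs and myfs_do_truncate are placeholders):

	static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int err;

		err = inode_change_ok(inode, attr);	/* permission/limit checks */
		if (err)
			return err;

		if (attr->ia_valid & ATTR_SIZE) {
			err = myfs_do_truncate(inode, attr->ia_size);
			if (err)
				return err;
		}

		setattr_copy(inode, attr);	/* uid/gid/mode/times into the inode */
		mark_inode_dirty(inode);
		return 0;
	}
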
@@ -243,7 +255,7 @@ const struct file_operations logfs_reg_fops = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.fsync		= logfs_fsync,
-	.ioctl		= logfs_ioctl,
+	.unlocked_ioctl	= logfs_ioctl,
 	.llseek		= generic_file_llseek,
 	.mmap		= generic_file_readonly_mmap,
 	.open		= generic_file_open,
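
Both logfs file_operations tables move from ->ioctl to ->unlocked_ioctl in this merge. The converted hook returns long, runs without the BKL, and no longer receives the inode, so the handler fetches it from the file itself, exactly as logfs_ioctl() now does. A generic sketch of the conversion for some other filesystem (names hypothetical):

	/* old: int  (*ioctl)(struct inode *, struct file *,
	 *                    unsigned int, unsigned long), under the BKL
	 * new: long (*unlocked_ioctl)(struct file *,
	 *                    unsigned int, unsigned long), no BKL        */
	static long myfs_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
	{
		/* recover the inode from the file, as logfs_ioctl() now does */
		struct inode *inode = file->f_path.dentry->d_inode;

		switch (cmd) {
		case FS_IOC_GETVERSION:
			return put_user(inode->i_generation, (int __user *)arg);
		default:
			return -ENOTTY;
		}
	}

Any serialization the old BKL provided has to be taken explicitly inside the handler now.
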
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 76c242fbe1b0..caa4419285dc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -122,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
 	logfs_safe_iput(inode, cookie);
 }

-static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_segment_header sh;
@@ -401,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
 			segno, (u64)segno << super->s_segshift,
 			dist, no_free_segments(sb), valid,
 			super->s_free_bytes);
-	cleaned = logfs_gc_segment(sb, segno, dist);
+	cleaned = logfs_gc_segment(sb, segno);
 	log_gc("GC segment #%02x complete - now %x valid\n", segno,
 			valid - cleaned);
 	BUG_ON(cleaned != valid);
@@ -632,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_area *area = super->s_area[i];
-	struct logfs_object_header oh;
+	gc_level_t gc_level;
+	u32 cleaned, valid, ec;
 	u32 segno = area->a_segno;
-	u32 ofs = area->a_used_bytes;
-	__be32 crc;
-	int err;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

 	if (!area->a_is_open)
 		return 0;

-	for (ofs = area->a_used_bytes;
-			ofs <= super->s_segsize - sizeof(oh);
-			ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
-		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
-		if (err)
-			return err;
-
-		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
-			break;
+	if (super->s_devops->can_write_buf(sb, ofs) == 0)
+		return 0;

-		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
-		if (crc != oh.crc) {
-			printk(KERN_INFO "interrupted header at %llx\n",
-					dev_ofs(sb, segno, ofs));
-			return 0;
-		}
-	}
-	if (ofs != area->a_used_bytes) {
-		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
-				ofs - area->a_used_bytes,
-				dev_ofs(sb, segno, area->a_used_bytes));
-		area->a_used_bytes = ofs;
-	}
+	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
+	/*
+	 * The device cannot write back the write buffer. Most likely the
+	 * wbuf was already written out and the system crashed at some point
+	 * before the journal commit happened. In that case we wouldn't have
+	 * to do anything. But if the crash happened before the wbuf was
+	 * written out correctly, we must GC this segment. So assume the
+	 * worst and always do the GC run.
+	 */
+	area->a_is_open = 0;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	cleaned = logfs_gc_segment(sb, segno);
+	if (cleaned != valid)
+		return -EIO;
 	return 0;
 }

diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 14ed27274da2..d8c71ece098f 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -193,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
 	inode->i_ctime	 = CURRENT_TIME;
 	inode->i_mtime	 = CURRENT_TIME;
 	inode->i_nlink	 = 1;
+	li->li_refcount  = 1;
 	INIT_LIST_HEAD(&li->li_freeing_list);

 	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
@@ -234,33 +235,21 @@ static struct inode *logfs_alloc_inode(struct super_block *sb)
 * purpose is to create a new inode that will not trigger the warning if such
 * an inode is still in use. An ugly hack, no doubt. Suggestions for
 * improvement are welcome.
+ *
+ * AV: that's what ->put_super() is for...
 */
struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;

-	inode = logfs_alloc_inode(sb);
+	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_mode = S_IFREG;
	inode->i_ino = ino;
-	inode->i_sb = sb;
-
-	/* This is a blatant copy of alloc_inode code. We'd need alloc_inode
-	 * to be nonstatic, alas. */
-	{
-		struct address_space * const mapping = &inode->i_data;
-
-		mapping->a_ops = &logfs_reg_aops;
-		mapping->host = inode;
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = &default_backing_dev_info;
-		inode->i_mapping = mapping;
-		inode->i_nlink = 1;
-	}
+	inode->i_data.a_ops = &logfs_reg_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);

	return inode;
}
@@ -276,7 +265,7 @@ struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)

 	err = logfs_read_inode(inode);
 	if (err) {
-		destroy_meta_inode(inode);
+		iput(inode);
 		return ERR_PTR(err);
 	}
 	logfs_inode_setops(inode);
@@ -297,18 +286,8 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }

-void destroy_meta_inode(struct inode *inode)
-{
-	if (inode) {
-		if (inode->i_data.nrpages)
-			truncate_inode_pages(&inode->i_data, 0);
-		logfs_clear_inode(inode);
-		kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
-	}
-}
-
 /* called with inode_lock held */
-static void logfs_drop_inode(struct inode *inode)
+static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
 	struct logfs_inode *li = logfs_inode(inode);
@@ -316,7 +295,7 @@ static void logfs_drop_inode(struct inode *inode)
 	spin_lock(&logfs_inode_lock);
 	list_move(&li->li_freeing_list, &super->s_freeing_list);
 	spin_unlock(&logfs_inode_lock);
-	generic_drop_inode(inode);
+	return generic_drop_inode(inode);
 }

 static void logfs_set_ino_generation(struct super_block *sb,
@@ -326,7 +305,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
 	u64 ino;

 	mutex_lock(&super->s_journal_mutex);
-	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
+	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
 	super->s_last_ino = ino;
 	super->s_inos_till_wrap--;
 	if (super->s_inos_till_wrap < 0) {
@@ -357,14 +336,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	logfs_set_ino_generation(sb, inode);

-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	}
-
+	inode_init_owner(inode, dir, mode);
 	logfs_inode_setops(inode);
 	insert_inode_hash(inode);

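
inode_init_owner() is one of the new VFS helpers this merge pulls in: it encapsulates exactly the uid/gid/setgid-directory logic deleted above. A hedged sketch of its use in some other filesystem's create path (myfs_make_inode is invented):

	/* Hypothetical create-path helper using the new API. */
	static struct inode *myfs_make_inode(struct super_block *sb,
					     struct inode *dir, int mode)
	{
		struct inode *inode = new_inode(sb);

		if (inode)
			/* fsuid/fsgid, plus S_ISGID inheritance from @dir */
			inode_init_owner(inode, dir, mode);
		return inode;
	}
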
@@ -386,17 +358,25 @@ static void logfs_init_once(void *_li)

 static int logfs_sync_fs(struct super_block *sb, int wait)
 {
-	/* FIXME: write anchor */
-	logfs_super(sb)->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }

+static void logfs_put_super(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	/* kill the meta-inodes */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
+}
+
 const struct super_operations logfs_super_operations = {
 	.alloc_inode	= logfs_alloc_inode,
-	.clear_inode	= logfs_clear_inode,
-	.delete_inode	= logfs_delete_inode,
 	.destroy_inode	= logfs_destroy_inode,
+	.evict_inode	= logfs_evict_inode,
 	.drop_inode	= logfs_drop_inode,
+	.put_super	= logfs_put_super,
 	.write_inode	= logfs_write_inode,
 	.statfs		= logfs_statfs,
 	.sync_fs	= logfs_sync_fs,
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index fb0a613f885b..f46ee8b0e135 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -132,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)

 	ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
 	if (super->s_writesize > 1)
-		logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+		return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
 	else
-		logfs_buf_recover(area, ofs, NULL, 0);
-	return 0;
+		return logfs_buf_recover(area, ofs, NULL, 0);
 }

 static void *unpack(void *from, void *to)
@@ -245,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
 		read_erasecount(sb, unpack(jh, scratch));
 		break;
 	case JE_AREA:
-		read_area(sb, unpack(jh, scratch));
+		err = read_area(sb, unpack(jh, scratch));
 		break;
 	case JE_OBJ_ALIAS:
 		err = logfs_load_object_aliases(sb, unpack(jh, scratch),
@@ -890,8 +889,6 @@ void logfs_cleanup_journal(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);

 	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
-	destroy_meta_inode(super->s_master_inode);
-	super->s_master_inode = NULL;

 	kfree(super->s_compressed_je);
 	kfree(super->s_je);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 0a3df1a0c936..b8786264d243 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
 * @erase:			erase one segment
 * @read:			read from the device
 * @erase:			erase part of the device
+ * @can_write_buf:		decide whether wbuf can be written to ofs
 */
struct logfs_device_ops {
	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
 	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
 	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
 			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
 	void (*put_device)(struct super_block *sb);
 };
@@ -394,6 +396,7 @@ struct logfs_super {
 	int s_lock_count;
 	mempool_t *s_block_pool;	/* struct logfs_block pool */
 	mempool_t *s_shadow_pool;	/* struct logfs_shadow pool */
+	struct list_head s_writeback_list; /* writeback pages */
 	/*
 	 * Space accounting:
 	 * - s_used_bytes specifies space used to store valid data objects.
@@ -501,9 +504,8 @@ extern const struct inode_operations logfs_reg_iops;
 extern const struct file_operations logfs_reg_fops;
 extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
-int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		unsigned long arg);
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int logfs_fsync(struct file *file, int datasync);

 /* gc.c */
 u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
@@ -522,13 +524,11 @@ struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
 struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
 int logfs_init_inode_cache(void);
 void logfs_destroy_inode_cache(void);
-void destroy_meta_inode(struct inode *inode);
 void logfs_set_blocks(struct inode *inode, u64 no);
 /* these logically belong into inode.c but actually reside in readwrite.c */
 int logfs_read_inode(struct inode *inode);
 int __logfs_write_inode(struct inode *inode, long flags);
-void logfs_delete_inode(struct inode *inode);
-void logfs_clear_inode(struct inode *inode);
+void logfs_evict_inode(struct inode *inode);

 /* journal.c */
 void logfs_write_anchor(struct super_block *sb);
@@ -598,19 +598,19 @@ void freeseg(struct super_block *sb, u32 segno);
 int logfs_init_areas(struct super_block *sb);
 void logfs_cleanup_areas(struct super_block *sb);
 int logfs_open_area(struct logfs_area *area, size_t bytes);
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler);

-static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 0);
+	return __logfs_buf_write(area, ofs, buf, len, 0);
 }

-static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 1);
+	return __logfs_buf_write(area, ofs, buf, len, 1);
 }

 /* super.c */
@@ -704,7 +704,7 @@ static inline gc_level_t expand_level(u64 ino, level_t __level)
 	u8 level = (__force u8)__level;

 	if (ino == LOGFS_INO_MASTER) {
-		/* ifile has seperate areas */
+		/* ifile has separate areas */
 		level += LOGFS_MAX_LEVELS;
 	}
 	return (__force gc_level_t)level;
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
index f674725663fe..ae960519c54a 100644
--- a/fs/logfs/logfs_abi.h
+++ b/fs/logfs/logfs_abi.h
@@ -50,9 +50,9 @@ static inline void check_##type(void) \
 *	12	- gc recycled blocks, long-lived data
 *	13	- replacement blocks, short-lived data
 *
- * Levels 1-11 are necessary for robust gc operations and help seperate
+ * Levels 1-11 are necessary for robust gc operations and help separate
 * short-lived metadata from longer-lived file data. In the future,
- * file data should get seperated into several segments based on simple
+ * file data should get separated into several segments based on simple
 * heuristics. Old data recycled during gc operation is expected to be
 * long-lived. New data is of uncertain life expectancy. New data
 * used to replace older blocks in existing files is expected to be
@@ -117,7 +117,7 @@ static inline void check_##type(void) \
 #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)

 /*
- * LogFS needs to seperate data into levels. Each level is defined as the
+ * LogFS needs to separate data into levels. Each level is defined as the
 * maximal possible distance from the master inode (inode of the inode file).
 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
@@ -204,7 +204,7 @@ SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
- * @ds_data_levels:		number of seperate levels for data
+ * @ds_data_levels:		number of separate levels for data
 * @pad0:			reserved, must be 0
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
@@ -456,7 +456,7 @@ enum logfs_vim {
 * @vim:			life expectancy of data
 *
 * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to seperate long-living from
+ * one area per GC level. Several may be used to separate long-living from
 * short-living data. If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 3159db6958e5..6127baf0e188 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -892,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
 		return bix;
 	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
 		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
 	else {
 		bix = seek_holedata_loop(inode, bix, 0);
 		if (bix < maxbix(li->li_height))
@@ -1093,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
 int get_page_reserve(struct inode *inode, struct page *page)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
 	int ret;

-	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+	if (block && block->reserved_bytes)
 		return 0;

 	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
-	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
 	if (!ret) {
 		alloc_data_block(inode, page);
-		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
 		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
 	}
 	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
 	return ret;
@@ -1861,7 +1871,7 @@ int logfs_truncate(struct inode *inode, u64 target)
 		size = target;

 	logfs_get_wblocks(sb, NULL, 1);
-	err = __logfs_truncate(inode, target);
+	err = __logfs_truncate(inode, size);
 	if (!err)
 		err = __logfs_write_inode(inode, 0);
 	logfs_put_wblocks(sb, NULL, 1);
@@ -1962,31 +1972,6 @@ static struct page *inode_to_page(struct inode *inode)
 	return page;
 }

-/* Cheaper version of write_inode. All changes are concealed in
- * aliases, which are moved back. No write to the medium happens.
- */
-void logfs_clear_inode(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-	struct logfs_inode *li = logfs_inode(inode);
-	struct logfs_block *block = li->li_block;
-	struct page *page;
-
-	/* Only deleted files may be dirty at this point */
-	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
-	if (!block)
-		return;
-	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
-		block->ops->free_block(inode->i_sb, block);
-		return;
-	}
-
-	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
-	page = inode_to_page(inode);
-	BUG_ON(!page); /* FIXME: Use emergency page */
-	logfs_put_write_page(page);
-}
-
 static int do_write_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -2154,18 +2139,40 @@ static int do_delete_inode(struct inode *inode)
 * ZOMBIE inodes have already been deleted before and should remain dead,
 * if it weren't for valid checking. No need to kill them again here.
 */
-void logfs_delete_inode(struct inode *inode)
+void logfs_evict_inode(struct inode *inode)
 {
+	struct super_block *sb = inode->i_sb;
 	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block = li->li_block;
+	struct page *page;

-	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
-		li->li_flags |= LOGFS_IF_ZOMBIE;
-		if (i_size_read(inode) > 0)
-			logfs_truncate(inode, 0);
-		do_delete_inode(inode);
+	if (!inode->i_nlink) {
+		if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+			li->li_flags |= LOGFS_IF_ZOMBIE;
+			if (i_size_read(inode) > 0)
+				logfs_truncate(inode, 0);
+			do_delete_inode(inode);
+		}
 	}
 	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
+	end_writeback(inode);
+
+	/* Cheaper version of write_inode. All changes are concealed in
+	 * aliases, which are moved back. No write to the medium happens.
+	 */
+	/* Only deleted files may be dirty at this point */
+	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
+	if (!block)
+		return;
+	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
+		block->ops->free_block(inode->i_sb, block);
+		return;
+	}
+
+	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
+	page = inode_to_page(inode);
+	BUG_ON(!page); /* FIXME: Use emergency page */
+	logfs_put_write_page(page);
 }

@@ -2249,6 +2256,7 @@ int logfs_init_rw(struct super_block *sb)
 	int min_fill = 3 * super->s_no_blocks;

 	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
 	mutex_init(&super->s_write_mutex);
 	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
 			sizeof(struct logfs_block));
@@ -2261,7 +2269,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);

-	destroy_meta_inode(super->s_segfile_inode);
 	logfs_mempool_destroy(super->s_block_pool);
 	logfs_mempool_destroy(super->s_shadow_pool);
 }
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index f77ce2b470ba..9d5187353255 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -67,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	return page;
 }

-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler)
 {
 	pgoff_t index = ofs >> PAGE_SHIFT;
@@ -81,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		copylen = min((ulong)len, PAGE_SIZE - offset);

 		page = get_mapping_page(area->a_sb, index, use_filler);
-		SetPageUptodate(page);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
 		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
 		SetPagePrivate(page);
 		page_cache_release(page);
@@ -92,6 +94,7 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		offset = 0;
 		index++;
 	} while (len);
+	return 0;
 }

 static void pad_partial_page(struct logfs_area *area)
@@ -926,5 +929,4 @@ void logfs_cleanup_areas(struct super_block *sb)
 	for_each_area(i)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	destroy_meta_inode(super->s_mapping_inode);
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d7c23ed8349a..5336155c5d81 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -138,10 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
 	if (sb->s_bdev)
 		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
 	if (sb->s_mtd)
 		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
 	return 0;
 }

@@ -338,24 +342,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;
 	}

+	/* at that point we know that ->put_super() will be called */
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail;
+		return -ENOMEM;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);

 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
-	if (err)
-		goto fail1;
+	if (err) {
+		__free_page(super->s_erase_page);
+		return err;
+	}

 	log_super("LogFS: Finished mounting\n");
 	simple_set_mnt(mnt, sb);
 	return 0;

-fail1:
-	__free_page(super->s_erase_page);
 fail:
-	iput(logfs_super(sb)->s_master_inode);
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	return -EIO;
 }

@@ -382,7 +389,7 @@ static struct page *find_super_block(struct super_block *sb)
 	if (!first || IS_ERR(first))
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
-	if (!last || IS_ERR(first)) {
+	if (!last || IS_ERR(last)) {
 		page_cache_release(first);
 		return NULL;
 	}
@@ -413,7 +420,7 @@ static int __logfs_read_sb(struct super_block *sb)

 	page = find_super_block(sb);
 	if (!page)
-		return -EIO;
+		return -EINVAL;

 	ds = page_address(page);
 	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
@@ -576,10 +583,14 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 	sb->s_flags |= MS_ACTIVE;
 	err = logfs_get_sb_final(sb, mnt);
 	if (err)
-		goto err1;
-	return 0;
+		deactivate_locked_super(sb);
+	return err;

 err1:
+	/* no ->s_root, no ->put_super() */
+	iput(super->s_master_inode);
+	iput(super->s_segfile_inode);
+	iput(super->s_mapping_inode);
 	deactivate_locked_super(sb);
 	return err;
 err0:
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..93444747237b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -79,15 +79,12 @@ EXPORT_SYMBOL(mb_cache_entry_find_next);
 struct mb_cache {
 	struct list_head		c_cache_list;
 	const char			*c_name;
-	struct mb_cache_op		c_op;
 	atomic_t			c_entry_count;
+	int				c_max_entries;
 	int				c_bucket_bits;
-#ifndef MB_CACHE_INDEXES_COUNT
-	int				c_indexes_count;
-#endif
-	struct kmem_cache		*c_entry_cache;
+	struct kmem_cache		*c_entry_cache;
 	struct list_head		*c_block_hash;
-	struct list_head		*c_indexes_hash[0];
+	struct list_head		*c_index_hash;
 };


@@ -101,21 +98,11 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);

-static inline int
-mb_cache_indexes(struct mb_cache *cache)
-{
-#ifdef MB_CACHE_INDEXES_COUNT
-	return MB_CACHE_INDEXES_COUNT;
-#else
-	return cache->c_indexes_count;
-#endif
-}
-
 /*
 * What the mbcache registers as to get shrunk dynamically.
 */

-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);

 static struct shrinker mb_cache_shrinker = {
 	.shrink = mb_cache_shrink_fn,
@@ -132,12 +119,9 @@ __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 static void
 __mb_cache_entry_unhash(struct mb_cache_entry *ce)
 {
-	int n;
-
 	if (__mb_cache_entry_is_hashed(ce)) {
 		list_del_init(&ce->e_block_list);
-		for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
-			list_del(&ce->e_indexes[n].o_list);
+		list_del(&ce->e_index.o_list);
 	}
 }

@@ -148,16 +132,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 	struct mb_cache *cache = ce->e_cache;

 	mb_assert(!(ce->e_used || ce->e_queued));
-	if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
-		/* free failed -- put back on the lru list
-		   for freeing later. */
-		spin_lock(&mb_cache_spinlock);
-		list_add(&ce->e_lru_list, &mb_cache_lru_list);
-		spin_unlock(&mb_cache_spinlock);
-	} else {
-		kmem_cache_free(cache->c_entry_cache, ce);
-		atomic_dec(&cache->c_entry_count);
-	}
+	kmem_cache_free(cache->c_entry_cache, ce);
+	atomic_dec(&cache->c_entry_count);
 }


@@ -191,31 +167,22 @@ forget:
 * This function is called by the kernel memory management when memory
 * gets low.
 *
+ * @shrink: (ignored)
 * @nr_to_scan: Number of objects to scan
 * @gfp_mask: (ignored)
 *
 * Returns the number of objects which are present in the cache.
 */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
-	struct list_head *l, *ltmp;
+	struct mb_cache *cache;
+	struct mb_cache_entry *entry, *tmp;
 	int count = 0;

-	spin_lock(&mb_cache_spinlock);
-	list_for_each(l, &mb_cache_list) {
-		struct mb_cache *cache =
-			list_entry(l, struct mb_cache, c_cache_list);
-		mb_debug("cache %s (%d)", cache->c_name,
-			 atomic_read(&cache->c_entry_count));
-		count += atomic_read(&cache->c_entry_count);
-	}
 	mb_debug("trying to free %d entries", nr_to_scan);
-	if (nr_to_scan == 0) {
-		spin_unlock(&mb_cache_spinlock);
-		goto out;
-	}
+	spin_lock(&mb_cache_spinlock);
 	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
 		struct mb_cache_entry *ce =
 			list_entry(mb_cache_lru_list.next,
@@ -223,12 +190,15 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
 	}
+	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
+		mb_debug("cache %s (%d)", cache->c_name,
+			 atomic_read(&cache->c_entry_count));
+		count += atomic_read(&cache->c_entry_count);
+	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_safe(l, ltmp, &free_list) {
-		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-						   e_lru_list), gfp_mask);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
-out:
 	return (count / 100) * sysctl_vfs_cache_pressure;
 }

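
The shrinker callback gains a struct shrinker * as its first argument in 2.6.36. mbcache ignores it, but the point of the change is that a callback embedded in a per-cache object can now recover its context instead of relying on globals. A hedged sketch (my_cache and its helpers are invented):

	struct my_cache {
		struct shrinker shrinker;
		atomic_t nr_objects;
	};

	static int my_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
	{
		/* Recover the owning cache from the embedded shrinker. */
		struct my_cache *c = container_of(shrink, struct my_cache, shrinker);

		while (nr_to_scan--)
			my_cache_evict_one(c);	/* hypothetical eviction helper */
		return atomic_read(&c->nr_objects);	/* count, scaled by the VM */
	}

Registration is unchanged: fill in .shrink and .seeks and call register_shrinker().
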
@@ -242,72 +212,55 @@ out:
242 * memory was available. 212 * memory was available.
243 * 213 *
244 * @name: name of the cache (informal) 214 * @name: name of the cache (informal)
245 * @cache_op: contains the callback called when freeing a cache entry
246 * @entry_size: The size of a cache entry, including
247 * struct mb_cache_entry
248 * @indexes_count: number of additional indexes in the cache. Must equal
249 * MB_CACHE_INDEXES_COUNT if the number of indexes is
250 * hardwired.
251 * @bucket_bits: log2(number of hash buckets) 215 * @bucket_bits: log2(number of hash buckets)
252 */ 216 */
253struct mb_cache * 217struct mb_cache *
254mb_cache_create(const char *name, struct mb_cache_op *cache_op, 218mb_cache_create(const char *name, int bucket_bits)
255 size_t entry_size, int indexes_count, int bucket_bits)
256{ 219{
257 int m=0, n, bucket_count = 1 << bucket_bits; 220 int n, bucket_count = 1 << bucket_bits;
258 struct mb_cache *cache = NULL; 221 struct mb_cache *cache = NULL;
259 222
260 if(entry_size < sizeof(struct mb_cache_entry) + 223 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
261 indexes_count * sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]))
262 return NULL;
263
264 cache = kmalloc(sizeof(struct mb_cache) +
265 indexes_count * sizeof(struct list_head), GFP_KERNEL);
266 if (!cache) 224 if (!cache)
267 goto fail; 225 return NULL;
268 cache->c_name = name; 226 cache->c_name = name;
269 cache->c_op.free = NULL;
270 if (cache_op)
271 cache->c_op.free = cache_op->free;
272 atomic_set(&cache->c_entry_count, 0); 227 atomic_set(&cache->c_entry_count, 0);
273 cache->c_bucket_bits = bucket_bits; 228 cache->c_bucket_bits = bucket_bits;
274#ifdef MB_CACHE_INDEXES_COUNT
275 mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
276#else
277 cache->c_indexes_count = indexes_count;
278#endif
279 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 229 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
280 GFP_KERNEL); 230 GFP_KERNEL);
281 if (!cache->c_block_hash) 231 if (!cache->c_block_hash)
282 goto fail; 232 goto fail;
283 for (n=0; n<bucket_count; n++) 233 for (n=0; n<bucket_count; n++)
284 INIT_LIST_HEAD(&cache->c_block_hash[n]); 234 INIT_LIST_HEAD(&cache->c_block_hash[n]);
285 for (m=0; m<indexes_count; m++) { 235 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
286 cache->c_indexes_hash[m] = kmalloc(bucket_count * 236 GFP_KERNEL);
287 sizeof(struct list_head), 237 if (!cache->c_index_hash)
288 GFP_KERNEL); 238 goto fail;
289 if (!cache->c_indexes_hash[m]) 239 for (n=0; n<bucket_count; n++)
290 goto fail; 240 INIT_LIST_HEAD(&cache->c_index_hash[n]);
291 for (n=0; n<bucket_count; n++) 241 cache->c_entry_cache = kmem_cache_create(name,
292 INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]); 242 sizeof(struct mb_cache_entry), 0,
293 }
294 cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
295 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 243 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
296 if (!cache->c_entry_cache) 244 if (!cache->c_entry_cache)
297 goto fail; 245 goto fail2;
246
247 /*
248 * Set an upper limit on the number of cache entries so that the hash
249 * chains won't grow too long.
250 */
251 cache->c_max_entries = bucket_count << 4;
298 252
299 spin_lock(&mb_cache_spinlock); 253 spin_lock(&mb_cache_spinlock);
300 list_add(&cache->c_cache_list, &mb_cache_list); 254 list_add(&cache->c_cache_list, &mb_cache_list);
301 spin_unlock(&mb_cache_spinlock); 255 spin_unlock(&mb_cache_spinlock);
302 return cache; 256 return cache;
303 257
258fail2:
259 kfree(cache->c_index_hash);
260
304fail: 261fail:
305 if (cache) { 262 kfree(cache->c_block_hash);
306 while (--m >= 0) 263 kfree(cache);
307 kfree(cache->c_indexes_hash[m]);
308 kfree(cache->c_block_hash);
309 kfree(cache);
310 }
311 return NULL; 264 return NULL;
312} 265}
313 266
@@ -356,7 +309,6 @@ mb_cache_destroy(struct mb_cache *cache)
356{ 309{
357 LIST_HEAD(free_list); 310 LIST_HEAD(free_list);
358 struct list_head *l, *ltmp; 311 struct list_head *l, *ltmp;
359 int n;
360 312
361 spin_lock(&mb_cache_spinlock); 313 spin_lock(&mb_cache_spinlock);
362 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 314 list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
@@ -383,13 +335,11 @@ mb_cache_destroy(struct mb_cache *cache)
383 335
384 kmem_cache_destroy(cache->c_entry_cache); 336 kmem_cache_destroy(cache->c_entry_cache);
385 337
386 for (n=0; n < mb_cache_indexes(cache); n++) 338 kfree(cache->c_index_hash);
387 kfree(cache->c_indexes_hash[n]);
388 kfree(cache->c_block_hash); 339 kfree(cache->c_block_hash);
389 kfree(cache); 340 kfree(cache);
390} 341}
391 342
392
393/* 343/*
394 * mb_cache_entry_alloc() 344 * mb_cache_entry_alloc()
395 * 345 *
@@ -401,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
401struct mb_cache_entry * 351struct mb_cache_entry *
402mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 352mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
403{ 353{
404 struct mb_cache_entry *ce; 354 struct mb_cache_entry *ce = NULL;
405 355
406 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 356 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
407 if (ce) { 357 spin_lock(&mb_cache_spinlock);
358 if (!list_empty(&mb_cache_lru_list)) {
359 ce = list_entry(mb_cache_lru_list.next,
360 struct mb_cache_entry, e_lru_list);
361 list_del_init(&ce->e_lru_list);
362 __mb_cache_entry_unhash(ce);
363 }
364 spin_unlock(&mb_cache_spinlock);
365 }
366 if (!ce) {
367 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
368 if (!ce)
369 return NULL;
408 atomic_inc(&cache->c_entry_count); 370 atomic_inc(&cache->c_entry_count);
409 INIT_LIST_HEAD(&ce->e_lru_list); 371 INIT_LIST_HEAD(&ce->e_lru_list);
410 INIT_LIST_HEAD(&ce->e_block_list); 372 INIT_LIST_HEAD(&ce->e_block_list);
411 ce->e_cache = cache; 373 ce->e_cache = cache;
412 ce->e_used = 1 + MB_CACHE_WRITER;
413 ce->e_queued = 0; 374 ce->e_queued = 0;
414 } 375 }
376 ce->e_used = 1 + MB_CACHE_WRITER;
415 return ce; 377 return ce;
416} 378}
417 379
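
Together with the single-key mb_cache_entry_insert() below, a caller now allocates and publishes an entry roughly as follows. This is a sketch modelled on ext2's xattr cache user; mb_cache_entry_release() and mb_cache_entry_free() are existing mbcache functions not shown in this hunk:

    struct mb_cache_entry *ce = mb_cache_entry_alloc(cache, GFP_NOFS);
    if (!ce)
        return -ENOMEM;
    error = mb_cache_entry_insert(ce, bdev, block, hash);
    if (error)
        mb_cache_entry_free(ce);     /* e.g. -EBUSY: block already cached */
    else
        mb_cache_entry_release(ce);  /* published; drop the writer ref    */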
@@ -428,17 +390,16 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
428 * 390 *
429 * @bdev: device the cache entry belongs to 391 * @bdev: device the cache entry belongs to
430 * @block: block number 392 * @block: block number
431 * @keys: array of additional keys. There must be indexes_count entries 393 * @key: lookup key
432 * in the array (as specified when creating the cache).
433 */ 394 */
434int 395int
435mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, 396mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
436 sector_t block, unsigned int keys[]) 397 sector_t block, unsigned int key)
437{ 398{
438 struct mb_cache *cache = ce->e_cache; 399 struct mb_cache *cache = ce->e_cache;
439 unsigned int bucket; 400 unsigned int bucket;
440 struct list_head *l; 401 struct list_head *l;
441 int error = -EBUSY, n; 402 int error = -EBUSY;
442 403
443 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 404 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
444 cache->c_bucket_bits); 405 cache->c_bucket_bits);
@@ -453,12 +414,9 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
453 ce->e_bdev = bdev; 414 ce->e_bdev = bdev;
454 ce->e_block = block; 415 ce->e_block = block;
455 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 416 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
456 for (n=0; n<mb_cache_indexes(cache); n++) { 417 ce->e_index.o_key = key;
457 ce->e_indexes[n].o_key = keys[n]; 418 bucket = hash_long(key, cache->c_bucket_bits);
458 bucket = hash_long(keys[n], cache->c_bucket_bits); 419 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
459 list_add(&ce->e_indexes[n].o_list,
460 &cache->c_indexes_hash[n][bucket]);
461 }
462 error = 0; 420 error = 0;
463out: 421out:
464 spin_unlock(&mb_cache_spinlock); 422 spin_unlock(&mb_cache_spinlock);
@@ -554,13 +512,12 @@ cleanup:
554 512
555static struct mb_cache_entry * 513static struct mb_cache_entry *
556__mb_cache_entry_find(struct list_head *l, struct list_head *head, 514__mb_cache_entry_find(struct list_head *l, struct list_head *head,
557 int index, struct block_device *bdev, unsigned int key) 515 struct block_device *bdev, unsigned int key)
558{ 516{
559 while (l != head) { 517 while (l != head) {
560 struct mb_cache_entry *ce = 518 struct mb_cache_entry *ce =
561 list_entry(l, struct mb_cache_entry, 519 list_entry(l, struct mb_cache_entry, e_index.o_list);
562 e_indexes[index].o_list); 520 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
563 if (ce->e_bdev == bdev && ce->e_indexes[index].o_key == key) {
564 DEFINE_WAIT(wait); 521 DEFINE_WAIT(wait);
565 522
566 if (!list_empty(&ce->e_lru_list)) 523 if (!list_empty(&ce->e_lru_list))
@@ -602,23 +559,20 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
602 * returned cache entry is locked for shared access ("multiple readers"). 559 * returned cache entry is locked for shared access ("multiple readers").
603 * 560 *
604 * @cache: the cache to search 561 * @cache: the cache to search
605 * @index: the number of the additional index to search (0<=index<indexes_count)
606 * @bdev: the device the cache entry should belong to 562 * @bdev: the device the cache entry should belong to
607 * @key: the key in the index 563 * @key: the key in the index
608 */ 564 */
609struct mb_cache_entry * 565struct mb_cache_entry *
610mb_cache_entry_find_first(struct mb_cache *cache, int index, 566mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
611 struct block_device *bdev, unsigned int key) 567 unsigned int key)
612{ 568{
613 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 569 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
614 struct list_head *l; 570 struct list_head *l;
615 struct mb_cache_entry *ce; 571 struct mb_cache_entry *ce;
616 572
617 mb_assert(index < mb_cache_indexes(cache));
618 spin_lock(&mb_cache_spinlock); 573 spin_lock(&mb_cache_spinlock);
619 l = cache->c_indexes_hash[index][bucket].next; 574 l = cache->c_index_hash[bucket].next;
620 ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], 575 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
621 index, bdev, key);
622 spin_unlock(&mb_cache_spinlock); 576 spin_unlock(&mb_cache_spinlock);
623 return ce; 577 return ce;
624} 578}
@@ -639,12 +593,11 @@ mb_cache_entry_find_first(struct mb_cache *cache, int index,
639 * } 593 * }
640 * 594 *
641 * @prev: The previous match 595 * @prev: The previous match
642 * @index: the number of the additional index to search (0<=index<indexes_count)
643 * @bdev: the device the cache entry should belong to 596 * @bdev: the device the cache entry should belong to
644 * @key: the key in the index 597 * @key: the key in the index
645 */ 598 */
646struct mb_cache_entry * 599struct mb_cache_entry *
647mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, 600mb_cache_entry_find_next(struct mb_cache_entry *prev,
648 struct block_device *bdev, unsigned int key) 601 struct block_device *bdev, unsigned int key)
649{ 602{
650 struct mb_cache *cache = prev->e_cache; 603 struct mb_cache *cache = prev->e_cache;
@@ -652,11 +605,9 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
652 struct list_head *l; 605 struct list_head *l;
653 struct mb_cache_entry *ce; 606 struct mb_cache_entry *ce;
654 607
655 mb_assert(index < mb_cache_indexes(cache));
656 spin_lock(&mb_cache_spinlock); 608 spin_lock(&mb_cache_spinlock);
657 l = prev->e_indexes[index].o_list.next; 609 l = prev->e_index.o_list.next;
658 ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], 610 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
659 index, bdev, key);
660 __mb_cache_entry_release_unlock(prev); 611 __mb_cache_entry_release_unlock(prev);
661 return ce; 612 return ce;
662} 613}
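
With the index argument gone, the lookup loop alluded to in the mb_cache_entry_find_next() comment reduces to the sketch below; entry_matches() stands in for a hypothetical caller-side predicate:

    struct mb_cache_entry *ce;

    ce = mb_cache_entry_find_first(cache, bdev, hash);
    while (ce && !entry_matches(ce))
        ce = mb_cache_entry_find_next(ce, bdev, hash);  /* releases prev */
    /* on a hit, ce is held locked for shared access;
       call mb_cache_entry_release(ce) when done */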
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..3f32bcb0d9bd 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -200,13 +200,13 @@ void minix_free_inode(struct inode * inode)
200 ino = inode->i_ino; 200 ino = inode->i_ino;
201 if (ino < 1 || ino > sbi->s_ninodes) { 201 if (ino < 1 || ino > sbi->s_ninodes) {
202 printk("minix_free_inode: inode 0 or nonexistent inode\n"); 202 printk("minix_free_inode: inode 0 or nonexistent inode\n");
203 goto out; 203 return;
204 } 204 }
205 bit = ino & ((1<<k) - 1); 205 bit = ino & ((1<<k) - 1);
206 ino >>= k; 206 ino >>= k;
207 if (ino >= sbi->s_imap_blocks) { 207 if (ino >= sbi->s_imap_blocks) {
208 printk("minix_free_inode: nonexistent imap in superblock\n"); 208 printk("minix_free_inode: nonexistent imap in superblock\n");
209 goto out; 209 return;
210 } 210 }
211 211
212 minix_clear_inode(inode); /* clear on-disk copy */ 212 minix_clear_inode(inode); /* clear on-disk copy */
@@ -217,11 +217,9 @@ void minix_free_inode(struct inode * inode)
217 printk("minix_free_inode: bit %lu already cleared\n", bit); 217 printk("minix_free_inode: bit %lu already cleared\n", bit);
218 spin_unlock(&bitmap_lock); 218 spin_unlock(&bitmap_lock);
219 mark_buffer_dirty(bh); 219 mark_buffer_dirty(bh);
220 out:
221 clear_inode(inode); /* clear in-memory copy */
222} 220}
223 221
224struct inode * minix_new_inode(const struct inode * dir, int * error) 222struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 223{
226 struct super_block *sb = dir->i_sb; 224 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 225 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +261,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 261 iput(inode);
264 return NULL; 262 return NULL;
265 } 263 }
266 inode->i_uid = current_fsuid(); 264 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 265 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 266 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 267 inode->i_blocks = 0;
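
inode_init_owner() centralizes the owner setup that minix (and several other filesystems in this merge) used to open-code, including setgid-directory inheritance. Condensed, and not quoted from this patch, its effect is approximately:

    inode->i_uid = current_fsuid();
    if (dir && (dir->i_mode & S_ISGID)) {
        inode->i_gid = dir->i_gid;
        if (S_ISDIR(mode))
            mode |= S_ISGID;          /* subdirectories inherit setgid */
    } else
        inode->i_gid = current_fsgid();
    inode->i_mode = mode;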
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..085a9262c692 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81
82fail:
83 dir_put_page(page);
84 return ERR_PTR(-EIO);
85} 78}
86 79
87static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
@@ -278,8 +271,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
278 271
279got_it: 272got_it:
280 pos = page_offset(page) + p - (char *)page_address(page); 273 pos = page_offset(page) + p - (char *)page_address(page);
281 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 274 err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
282 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
283 if (err) 275 if (err)
284 goto out_unlock; 276 goto out_unlock;
285 memcpy (namx, name, namelen); 277 memcpy (namx, name, namelen);
@@ -304,8 +296,7 @@ out_unlock:
304 296
305int minix_delete_entry(struct minix_dir_entry *de, struct page *page) 297int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
306{ 298{
307 struct address_space *mapping = page->mapping; 299 struct inode *inode = page->mapping->host;
308 struct inode *inode = (struct inode*)mapping->host;
309 char *kaddr = page_address(page); 300 char *kaddr = page_address(page);
310 loff_t pos = page_offset(page) + (char*)de - kaddr; 301 loff_t pos = page_offset(page) + (char*)de - kaddr;
311 struct minix_sb_info *sbi = minix_sb(inode->i_sb); 302 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
@@ -313,8 +304,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
313 int err; 304 int err;
314 305
315 lock_page(page); 306 lock_page(page);
316 err = __minix_write_begin(NULL, mapping, pos, len, 307 err = minix_prepare_chunk(page, pos, len);
317 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
318 if (err == 0) { 308 if (err == 0) {
319 if (sbi->s_version == MINIX_V3) 309 if (sbi->s_version == MINIX_V3)
320 ((minix3_dirent *) de)->inode = 0; 310 ((minix3_dirent *) de)->inode = 0;
@@ -332,16 +322,14 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
332 322
333int minix_make_empty(struct inode *inode, struct inode *dir) 323int minix_make_empty(struct inode *inode, struct inode *dir)
334{ 324{
335 struct address_space *mapping = inode->i_mapping; 325 struct page *page = grab_cache_page(inode->i_mapping, 0);
336 struct page *page = grab_cache_page(mapping, 0);
337 struct minix_sb_info *sbi = minix_sb(inode->i_sb); 326 struct minix_sb_info *sbi = minix_sb(inode->i_sb);
338 char *kaddr; 327 char *kaddr;
339 int err; 328 int err;
340 329
341 if (!page) 330 if (!page)
342 return -ENOMEM; 331 return -ENOMEM;
343 err = __minix_write_begin(NULL, mapping, 0, 2 * sbi->s_dirsize, 332 err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
344 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
345 if (err) { 333 if (err) {
346 unlock_page(page); 334 unlock_page(page);
347 goto fail; 335 goto fail;
@@ -432,8 +420,7 @@ not_empty:
432void minix_set_link(struct minix_dir_entry *de, struct page *page, 420void minix_set_link(struct minix_dir_entry *de, struct page *page,
433 struct inode *inode) 421 struct inode *inode)
434{ 422{
435 struct address_space *mapping = page->mapping; 423 struct inode *dir = page->mapping->host;
436 struct inode *dir = mapping->host;
437 struct minix_sb_info *sbi = minix_sb(dir->i_sb); 424 struct minix_sb_info *sbi = minix_sb(dir->i_sb);
438 loff_t pos = page_offset(page) + 425 loff_t pos = page_offset(page) +
439 (char *)de-(char*)page_address(page); 426 (char *)de-(char*)page_address(page);
@@ -441,8 +428,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
441 428
442 lock_page(page); 429 lock_page(page);
443 430
444 err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize, 431 err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
445 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
446 if (err == 0) { 432 if (err == 0) {
447 if (sbi->s_version == MINIX_V3) 433 if (sbi->s_version == MINIX_V3)
448 ((minix3_dirent *) de)->inode = inode->i_ino; 434 ((minix3_dirent *) de)->inode = inode->i_ino;
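
Every directory update above now funnels through the same page-level sequence; schematically (dir_commit_chunk() is the pre-existing minix helper that writes back, updates i_size, and unlocks the page):

    lock_page(page);
    err = minix_prepare_chunk(page, pos, len);  /* map blocks for the range */
    if (!err) {
        /* edit the entry bytes at page_address(page) */
        dir_commit_chunk(page, pos, len);
    } else
        unlock_page(page);
    dir_put_page(page);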
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..4493ce695ab8 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,11 +19,33 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
26static int minix_setattr(struct dentry *dentry, struct iattr *attr)
27{
28 struct inode *inode = dentry->d_inode;
29 int error;
30
31 error = inode_change_ok(inode, attr);
32 if (error)
33 return error;
34
35 if ((attr->ia_valid & ATTR_SIZE) &&
36 attr->ia_size != i_size_read(inode)) {
37 error = vmtruncate(inode, attr->ia_size);
38 if (error)
39 return error;
40 }
41
42 setattr_copy(inode, attr);
43 mark_inode_dirty(inode);
44 return 0;
45}
46
26const struct inode_operations minix_file_inode_operations = { 47const struct inode_operations minix_file_inode_operations = {
27 .truncate = minix_truncate, 48 .truncate = minix_truncate,
49 .setattr = minix_setattr,
28 .getattr = minix_getattr, 50 .getattr = minix_getattr,
29}; 51};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 756f8c93780c..e39d6bf2e8fb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -24,12 +24,17 @@ static int minix_write_inode(struct inode *inode,
24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); 24static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
25static int minix_remount (struct super_block * sb, int * flags, char * data); 25static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_delete_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages(&inode->i_data, 0);
30 inode->i_size = 0; 30 if (!inode->i_nlink) {
31 minix_truncate(inode); 31 inode->i_size = 0;
32 minix_free_inode(inode); 32 minix_truncate(inode);
33 }
34 invalidate_inode_buffers(inode);
35 end_writeback(inode);
36 if (!inode->i_nlink)
37 minix_free_inode(inode);
33} 38}
34 39
35static void minix_put_super(struct super_block *sb) 40static void minix_put_super(struct super_block *sb)
@@ -96,7 +101,7 @@ static const struct super_operations minix_sops = {
96 .alloc_inode = minix_alloc_inode, 101 .alloc_inode = minix_alloc_inode,
97 .destroy_inode = minix_destroy_inode, 102 .destroy_inode = minix_destroy_inode,
98 .write_inode = minix_write_inode, 103 .write_inode = minix_write_inode,
99 .delete_inode = minix_delete_inode, 104 .evict_inode = minix_evict_inode,
100 .put_super = minix_put_super, 105 .put_super = minix_put_super,
101 .statfs = minix_statfs, 106 .statfs = minix_statfs,
102 .remount_fs = minix_remount, 107 .remount_fs = minix_remount,
@@ -357,20 +362,26 @@ static int minix_readpage(struct file *file, struct page *page)
357 return block_read_full_page(page,minix_get_block); 362 return block_read_full_page(page,minix_get_block);
358} 363}
359 364
360int __minix_write_begin(struct file *file, struct address_space *mapping, 365int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
361 loff_t pos, unsigned len, unsigned flags,
362 struct page **pagep, void **fsdata)
363{ 366{
364 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 367 return __block_write_begin(page, pos, len, minix_get_block);
365 minix_get_block);
366} 368}
367 369
368static int minix_write_begin(struct file *file, struct address_space *mapping, 370static int minix_write_begin(struct file *file, struct address_space *mapping,
369 loff_t pos, unsigned len, unsigned flags, 371 loff_t pos, unsigned len, unsigned flags,
370 struct page **pagep, void **fsdata) 372 struct page **pagep, void **fsdata)
371{ 373{
372 *pagep = NULL; 374 int ret;
373 return __minix_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 375
376 ret = block_write_begin(mapping, pos, len, flags, pagep,
377 minix_get_block);
378 if (unlikely(ret)) {
379 loff_t isize = mapping->host->i_size;
380 if (pos + len > isize)
381 vmtruncate(mapping->host, isize);
382 }
383
384 return ret;
374} 385}
375 386
376static sector_t minix_bmap(struct address_space *mapping, sector_t block) 387static sector_t minix_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
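
With 1K blocks, INDIRCOUNT(sb) is 1 << (10 - 2) = 256 and the new code reproduces the old constants exactly; a worked example for file block 300:

    /* 300 >= DIRCOUNT:         skip 7 direct slots,    300 - 7   = 293    */
    /* 293 >= INDIRCOUNT (256): skip single-indirect,   293 - 256 = 37     */
    /* double-indirect: offsets = { 8, 37 / 256, 37 % 256 } = { 8, 0, 37 } */

At larger block sizes INDIRCOUNT(sb) scales (1024 entries per indirect block at 4K), which the old hard-coded 256s, shifts, and masks silently got wrong.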
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..407b1c84911e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,16 +46,14 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
53extern void minix_free_block(struct inode *inode, unsigned long block); 53extern void minix_free_block(struct inode *inode, unsigned long block);
54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); 54extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); 55extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
56extern int __minix_write_begin(struct file *file, struct address_space *mapping, 56extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
57 loff_t pos, unsigned len, unsigned flags,
58 struct page **pagep, void **fsdata);
59 57
60extern void V1_minix_truncate(struct inode *); 58extern void V1_minix_truncate(struct inode *);
61extern void V2_minix_truncate(struct inode *); 59extern void V2_minix_truncate(struct inode *);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..f3f3578393a4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, S_IFDIR | mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask)
282 if (retval) 282 if (retval)
283 return retval; 283 return retval;
284 284
285 return security_inode_permission(inode, 285 return security_inode_permission(inode, mask);
286 mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
287} 286}
288 287
289/** 288/**
@@ -484,13 +483,8 @@ ok:
484 483
485static __always_inline void set_root(struct nameidata *nd) 484static __always_inline void set_root(struct nameidata *nd)
486{ 485{
487 if (!nd->root.mnt) { 486 if (!nd->root.mnt)
488 struct fs_struct *fs = current->fs; 487 get_fs_root(current->fs, &nd->root);
489 read_lock(&fs->lock);
490 nd->root = fs->root;
491 path_get(&nd->root);
492 read_unlock(&fs->lock);
493 }
494} 488}
495 489
496static int link_path_walk(const char *, struct nameidata *); 490static int link_path_walk(const char *, struct nameidata *);
@@ -523,9 +517,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 517static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 518{
525 dput(nd->path.dentry); 519 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 520 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 521 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 522 nd->path.mnt = path->mnt;
523 }
529 nd->path.dentry = path->dentry; 524 nd->path.dentry = path->dentry;
530} 525}
531 526
@@ -600,15 +595,16 @@ int follow_up(struct path *path)
600{ 595{
601 struct vfsmount *parent; 596 struct vfsmount *parent;
602 struct dentry *mountpoint; 597 struct dentry *mountpoint;
603 spin_lock(&vfsmount_lock); 598
599 br_read_lock(vfsmount_lock);
604 parent = path->mnt->mnt_parent; 600 parent = path->mnt->mnt_parent;
605 if (parent == path->mnt) { 601 if (parent == path->mnt) {
606 spin_unlock(&vfsmount_lock); 602 br_read_unlock(vfsmount_lock);
607 return 0; 603 return 0;
608 } 604 }
609 mntget(parent); 605 mntget(parent);
610 mountpoint = dget(path->mnt->mnt_mountpoint); 606 mountpoint = dget(path->mnt->mnt_mountpoint);
611 spin_unlock(&vfsmount_lock); 607 br_read_unlock(vfsmount_lock);
612 dput(path->dentry); 608 dput(path->dentry);
613 path->dentry = mountpoint; 609 path->dentry = mountpoint;
614 mntput(path->mnt); 610 mntput(path->mnt);
@@ -691,6 +687,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
691} 687}
692 688
693/* 689/*
690 * Allocate a dentry with name and parent, and perform a parent
691 * directory ->lookup on it. Returns the new dentry, or ERR_PTR
692 * on error. parent->d_inode->i_mutex must be held. d_lookup must
693 * have verified that no child exists while under i_mutex.
694 */
695static struct dentry *d_alloc_and_lookup(struct dentry *parent,
696 struct qstr *name, struct nameidata *nd)
697{
698 struct inode *inode = parent->d_inode;
699 struct dentry *dentry;
700 struct dentry *old;
701
702 /* Don't create child dentry for a dead directory. */
703 if (unlikely(IS_DEADDIR(inode)))
704 return ERR_PTR(-ENOENT);
705
706 dentry = d_alloc(parent, name);
707 if (unlikely(!dentry))
708 return ERR_PTR(-ENOMEM);
709
710 old = inode->i_op->lookup(inode, dentry, nd);
711 if (unlikely(old)) {
712 dput(dentry);
713 dentry = old;
714 }
715 return dentry;
716}
717
718/*
694 * It's more convoluted than I'd like it to be, but... it's still fairly 719 * It's more convoluted than I'd like it to be, but... it's still fairly
695 * small and for now I'd prefer to have fast path as straight as possible. 720 * small and for now I'd prefer to have fast path as straight as possible.
696 * It _is_ time-critical. 721 * It _is_ time-critical.
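
Callers observe the contract spelled out in the comment; condensed from the do_lookup() rework below:

    mutex_lock(&dir->i_mutex);
    dentry = d_lookup(parent, name);  /* non-racy recheck under i_mutex */
    if (likely(!dentry))
        dentry = d_alloc_and_lookup(parent, name, nd);
    mutex_unlock(&dir->i_mutex);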
@@ -711,9 +736,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
711 return err; 736 return err;
712 } 737 }
713 738
739 /*
740 * Rename seqlock is not required here because in the off chance
741 * of a false negative due to a concurrent rename, we're going to
742 * do the non-racy lookup, below.
743 */
714 dentry = __d_lookup(nd->path.dentry, name); 744 dentry = __d_lookup(nd->path.dentry, name);
715 if (!dentry) 745 if (!dentry)
716 goto need_lookup; 746 goto need_lookup;
747found:
717 if (dentry->d_op && dentry->d_op->d_revalidate) 748 if (dentry->d_op && dentry->d_op->d_revalidate)
718 goto need_revalidate; 749 goto need_revalidate;
719done: 750done:
@@ -729,56 +760,28 @@ need_lookup:
729 mutex_lock(&dir->i_mutex); 760 mutex_lock(&dir->i_mutex);
730 /* 761 /*
731 * First re-do the cached lookup just in case it was created 762 * First re-do the cached lookup just in case it was created
732 * while we waited for the directory semaphore.. 763 * while we waited for the directory semaphore, or the first
733 * 764 * lookup failed due to an unrelated rename.
734 * FIXME! This could use version numbering or similar to
735 * avoid unnecessary cache lookups.
736 *
737 * The "dcache_lock" is purely to protect the RCU list walker
738 * from concurrent renames at this point (we mustn't get false
739 * negatives from the RCU list walk here, unlike the optimistic
740 * fast walk).
741 * 765 *
742 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup 766 * This could use version numbering or similar to avoid unnecessary
767 * cache lookups, but then we'd have to do the first lookup in the
768 * non-racy way. However in the common case here, everything should
769 * be hot in cache, so would it be a big win?
743 */ 770 */
744 dentry = d_lookup(parent, name); 771 dentry = d_lookup(parent, name);
745 if (!dentry) { 772 if (likely(!dentry)) {
746 struct dentry *new; 773 dentry = d_alloc_and_lookup(parent, name, nd);
747
748 /* Don't create child dentry for a dead directory. */
749 dentry = ERR_PTR(-ENOENT);
750 if (IS_DEADDIR(dir))
751 goto out_unlock;
752
753 new = d_alloc(parent, name);
754 dentry = ERR_PTR(-ENOMEM);
755 if (new) {
756 dentry = dir->i_op->lookup(dir, new, nd);
757 if (dentry)
758 dput(new);
759 else
760 dentry = new;
761 }
762out_unlock:
763 mutex_unlock(&dir->i_mutex); 774 mutex_unlock(&dir->i_mutex);
764 if (IS_ERR(dentry)) 775 if (IS_ERR(dentry))
765 goto fail; 776 goto fail;
766 goto done; 777 goto done;
767 } 778 }
768
769 /* 779 /*
770 * Uhhuh! Nasty case: the cache was re-populated while 780 * Uhhuh! Nasty case: the cache was re-populated while
771 * we waited on the semaphore. Need to revalidate. 781 * we waited on the semaphore. Need to revalidate.
772 */ 782 */
773 mutex_unlock(&dir->i_mutex); 783 mutex_unlock(&dir->i_mutex);
774 if (dentry->d_op && dentry->d_op->d_revalidate) { 784 goto found;
775 dentry = do_revalidate(dentry, nd);
776 if (!dentry)
777 dentry = ERR_PTR(-ENOENT);
778 }
779 if (IS_ERR(dentry))
780 goto fail;
781 goto done;
782 785
783need_revalidate: 786need_revalidate:
784 dentry = do_revalidate(dentry, nd); 787 dentry = do_revalidate(dentry, nd);
@@ -1015,11 +1018,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
1015 nd->path = nd->root; 1018 nd->path = nd->root;
1016 path_get(&nd->root); 1019 path_get(&nd->root);
1017 } else if (dfd == AT_FDCWD) { 1020 } else if (dfd == AT_FDCWD) {
1018 struct fs_struct *fs = current->fs; 1021 get_fs_pwd(current->fs, &nd->path);
1019 read_lock(&fs->lock);
1020 nd->path = fs->pwd;
1021 path_get(&fs->pwd);
1022 read_unlock(&fs->lock);
1023 } else { 1022 } else {
1024 struct dentry *dentry; 1023 struct dentry *dentry;
1025 1024
@@ -1139,35 +1138,18 @@ static struct dentry *__lookup_hash(struct qstr *name,
1139 goto out; 1138 goto out;
1140 } 1139 }
1141 1140
1142 dentry = __d_lookup(base, name); 1141 /*
1143 1142 * Don't bother with __d_lookup: callers are for creat as
1144 /* lockess __d_lookup may fail due to concurrent d_move() 1143 * well as unlink, so a lot of the time it would cost
1145 * in some unrelated directory, so try with d_lookup 1144 * a double lookup.
1146 */ 1145 */
1147 if (!dentry) 1146 dentry = d_lookup(base, name);
1148 dentry = d_lookup(base, name);
1149 1147
1150 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) 1148 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1151 dentry = do_revalidate(dentry, nd); 1149 dentry = do_revalidate(dentry, nd);
1152 1150
1153 if (!dentry) { 1151 if (!dentry)
1154 struct dentry *new; 1152 dentry = d_alloc_and_lookup(base, name, nd);
1155
1156 /* Don't create child dentry for a dead directory. */
1157 dentry = ERR_PTR(-ENOENT);
1158 if (IS_DEADDIR(inode))
1159 goto out;
1160
1161 new = d_alloc(base, name);
1162 dentry = ERR_PTR(-ENOMEM);
1163 if (!new)
1164 goto out;
1165 dentry = inode->i_op->lookup(inode, new, nd);
1166 if (!dentry)
1167 dentry = new;
1168 else
1169 dput(new);
1170 }
1171out: 1153out:
1172 return dentry; 1154 return dentry;
1173} 1155}
@@ -1483,8 +1465,7 @@ static int handle_truncate(struct path *path)
1483 */ 1465 */
1484 error = locks_verify_locked(inode); 1466 error = locks_verify_locked(inode);
1485 if (!error) 1467 if (!error)
1486 error = security_path_truncate(path, 0, 1468 error = security_path_truncate(path);
1487 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1488 if (!error) { 1469 if (!error) {
1489 error = do_truncate(path->dentry, 0, 1470 error = do_truncate(path->dentry, 0,
1490 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, 1471 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1620,6 +1601,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1620 case LAST_DOTDOT: 1601 case LAST_DOTDOT:
1621 follow_dotdot(nd); 1602 follow_dotdot(nd);
1622 dir = nd->path.dentry; 1603 dir = nd->path.dentry;
1604 case LAST_DOT:
1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1605 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1624 if (!dir->d_op->d_revalidate(dir, nd)) { 1606 if (!dir->d_op->d_revalidate(dir, nd)) {
1625 error = -ESTALE; 1607 error = -ESTALE;
@@ -1627,7 +1609,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1627 } 1609 }
1628 } 1610 }
1629 /* fallthrough */ 1611 /* fallthrough */
1630 case LAST_DOT:
1631 case LAST_ROOT: 1612 case LAST_ROOT:
1632 if (open_flag & O_CREAT) 1613 if (open_flag & O_CREAT)
1633 goto exit; 1614 goto exit;
@@ -2634,7 +2615,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2634{ 2615{
2635 int error; 2616 int error;
2636 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); 2617 int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2637 const char *old_name; 2618 const unsigned char *old_name;
2638 2619
2639 if (old_dentry->d_inode == new_dentry->d_inode) 2620 if (old_dentry->d_inode == new_dentry->d_inode)
2640 return 0; 2621 return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index f20cb57d1067..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
11#include <linux/syscalls.h> 11#include <linux/syscalls.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h>
15#include <linux/percpu.h>
14#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
15#include <linux/init.h> 17#include <linux/init.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -29,6 +31,7 @@
29#include <linux/log2.h> 31#include <linux/log2.h>
30#include <linux/idr.h> 32#include <linux/idr.h>
31#include <linux/fs_struct.h> 33#include <linux/fs_struct.h>
34#include <linux/fsnotify.h>
32#include <asm/uaccess.h> 35#include <asm/uaccess.h>
33#include <asm/unistd.h> 36#include <asm/unistd.h>
34#include "pnode.h" 37#include "pnode.h"
@@ -37,12 +40,10 @@
37#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) 40#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
38#define HASH_SIZE (1UL << HASH_SHIFT) 41#define HASH_SIZE (1UL << HASH_SHIFT)
39 42
40/* spinlock for vfsmount related operations, inplace of dcache_lock */
41__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
42
43static int event; 43static int event;
44static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
45static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static DEFINE_SPINLOCK(mnt_id_lock);
46static int mnt_id_start = 0; 47static int mnt_id_start = 0;
47static int mnt_group_start = 1; 48static int mnt_group_start = 1;
48 49
@@ -54,6 +55,16 @@ static struct rw_semaphore namespace_sem;
54struct kobject *fs_kobj; 55struct kobject *fs_kobj;
55EXPORT_SYMBOL_GPL(fs_kobj); 56EXPORT_SYMBOL_GPL(fs_kobj);
56 57
58/*
59 * vfsmount lock may be taken for read to prevent changes to the
60 * vfsmount hash, ie. during mountpoint lookups or walking back
61 * up the tree.
62 *
63 * It should be taken for write in all cases where the vfsmount
64 * tree or hash is modified or when a vfsmount structure is modified.
65 */
66DEFINE_BRLOCK(vfsmount_lock);
67
57static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 68static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
58{ 69{
59 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 70 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
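
DEFINE_BRLOCK() builds a big-reader lock out of per-CPU locks: br_read_lock() touches only the local CPU's lock, while br_write_lock() acquires every CPU's lock, so lookups stay cheap and mount-tree modification pays the cost. The pattern used throughout the rest of this file:

    br_read_lock(vfsmount_lock);    /* per-CPU, no cross-CPU traffic */
    /* ... walk the mount hash or tree ... */
    br_read_unlock(vfsmount_lock);

    br_write_lock(vfsmount_lock);   /* sweeps all CPUs' locks        */
    /* ... modify the tree or a vfsmount ... */
    br_write_unlock(vfsmount_lock);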
@@ -64,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
64 75
65#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) 76#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
66 77
67/* allocation is serialized by namespace_sem */ 78/*
79 * allocation is serialized by namespace_sem, but we need the spinlock to
80 * serialize with freeing.
81 */
68static int mnt_alloc_id(struct vfsmount *mnt) 82static int mnt_alloc_id(struct vfsmount *mnt)
69{ 83{
70 int res; 84 int res;
71 85
72retry: 86retry:
73 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 87 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
74 spin_lock(&vfsmount_lock); 88 spin_lock(&mnt_id_lock);
75 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 89 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
76 if (!res) 90 if (!res)
77 mnt_id_start = mnt->mnt_id + 1; 91 mnt_id_start = mnt->mnt_id + 1;
78 spin_unlock(&vfsmount_lock); 92 spin_unlock(&mnt_id_lock);
79 if (res == -EAGAIN) 93 if (res == -EAGAIN)
80 goto retry; 94 goto retry;
81 95
@@ -85,11 +99,11 @@ retry:
85static void mnt_free_id(struct vfsmount *mnt) 99static void mnt_free_id(struct vfsmount *mnt)
86{ 100{
87 int id = mnt->mnt_id; 101 int id = mnt->mnt_id;
88 spin_lock(&vfsmount_lock); 102 spin_lock(&mnt_id_lock);
89 ida_remove(&mnt_id_ida, id); 103 ida_remove(&mnt_id_ida, id);
90 if (mnt_id_start > id) 104 if (mnt_id_start > id)
91 mnt_id_start = id; 105 mnt_id_start = id;
92 spin_unlock(&vfsmount_lock); 106 spin_unlock(&mnt_id_lock);
93} 107}
94 108
95/* 109/*
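
mnt_id_lock takes over here because ida needs a plain spinlock around its updates and the new brlock's write side is too heavy for an ID allocator; only freeing can race allocation, hence the dedicated lock. The preload-and-retry idiom, which later kernels wrap as ida_simple_get(), has this shape:

    retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);  /* preload; may sleep */
        spin_lock(&mnt_id_lock);
        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &id);
        spin_unlock(&mnt_id_lock);
        if (res == -EAGAIN)
            goto retry;   /* another CPU consumed the preload; redo */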
@@ -150,6 +164,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
150 INIT_LIST_HEAD(&mnt->mnt_share); 164 INIT_LIST_HEAD(&mnt->mnt_share);
151 INIT_LIST_HEAD(&mnt->mnt_slave_list); 165 INIT_LIST_HEAD(&mnt->mnt_slave_list);
152 INIT_LIST_HEAD(&mnt->mnt_slave); 166 INIT_LIST_HEAD(&mnt->mnt_slave);
167#ifdef CONFIG_FSNOTIFY
168 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
169#endif
153#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
154 mnt->mnt_writers = alloc_percpu(int); 171 mnt->mnt_writers = alloc_percpu(int);
155 if (!mnt->mnt_writers) 172 if (!mnt->mnt_writers)
@@ -344,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
344{ 361{
345 int ret = 0; 362 int ret = 0;
346 363
347 spin_lock(&vfsmount_lock); 364 br_write_lock(vfsmount_lock);
348 mnt->mnt_flags |= MNT_WRITE_HOLD; 365 mnt->mnt_flags |= MNT_WRITE_HOLD;
349 /* 366 /*
350 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 367 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -378,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
378 */ 395 */
379 smp_wmb(); 396 smp_wmb();
380 mnt->mnt_flags &= ~MNT_WRITE_HOLD; 397 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
381 spin_unlock(&vfsmount_lock); 398 br_write_unlock(vfsmount_lock);
382 return ret; 399 return ret;
383} 400}
384 401
385static void __mnt_unmake_readonly(struct vfsmount *mnt) 402static void __mnt_unmake_readonly(struct vfsmount *mnt)
386{ 403{
387 spin_lock(&vfsmount_lock); 404 br_write_lock(vfsmount_lock);
388 mnt->mnt_flags &= ~MNT_READONLY; 405 mnt->mnt_flags &= ~MNT_READONLY;
389 spin_unlock(&vfsmount_lock); 406 br_write_unlock(vfsmount_lock);
390} 407}
391 408
392void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 409void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -410,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
410/* 427/*
411 * find the first or last mount at @dentry on vfsmount @mnt depending on 428 * find the first or last mount at @dentry on vfsmount @mnt depending on
412 * @dir. If @dir is set return the first mount else return the last mount. 429 * @dir. If @dir is set return the first mount else return the last mount.
430 * vfsmount_lock must be held for read or write.
413 */ 431 */
414struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 432struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
415 int dir) 433 int dir)
@@ -439,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
439struct vfsmount *lookup_mnt(struct path *path) 457struct vfsmount *lookup_mnt(struct path *path)
440{ 458{
441 struct vfsmount *child_mnt; 459 struct vfsmount *child_mnt;
442 spin_lock(&vfsmount_lock); 460
461 br_read_lock(vfsmount_lock);
443 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) 462 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
444 mntget(child_mnt); 463 mntget(child_mnt);
445 spin_unlock(&vfsmount_lock); 464 br_read_unlock(vfsmount_lock);
446 return child_mnt; 465 return child_mnt;
447} 466}
448 467
@@ -451,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
451 return mnt->mnt_ns == current->nsproxy->mnt_ns; 470 return mnt->mnt_ns == current->nsproxy->mnt_ns;
452} 471}
453 472
473/*
474 * vfsmount lock must be held for write
475 */
454static void touch_mnt_namespace(struct mnt_namespace *ns) 476static void touch_mnt_namespace(struct mnt_namespace *ns)
455{ 477{
456 if (ns) { 478 if (ns) {
@@ -459,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
459 } 481 }
460} 482}
461 483
484/*
485 * vfsmount lock must be held for write
486 */
462static void __touch_mnt_namespace(struct mnt_namespace *ns) 487static void __touch_mnt_namespace(struct mnt_namespace *ns)
463{ 488{
464 if (ns && ns->event != event) { 489 if (ns && ns->event != event) {
@@ -467,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
467 } 492 }
468} 493}
469 494
495/*
496 * vfsmount lock must be held for write
497 */
470static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 498static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
471{ 499{
472 old_path->dentry = mnt->mnt_mountpoint; 500 old_path->dentry = mnt->mnt_mountpoint;
@@ -478,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
478 old_path->dentry->d_mounted--; 506 old_path->dentry->d_mounted--;
479} 507}
480 508
509/*
510 * vfsmount lock must be held for write
511 */
481void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 512void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
482 struct vfsmount *child_mnt) 513 struct vfsmount *child_mnt)
483{ 514{
@@ -486,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
486 dentry->d_mounted++; 517 dentry->d_mounted++;
487} 518}
488 519
520/*
521 * vfsmount lock must be held for write
522 */
489static void attach_mnt(struct vfsmount *mnt, struct path *path) 523static void attach_mnt(struct vfsmount *mnt, struct path *path)
490{ 524{
491 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 525 mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -495,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
495} 529}
496 530
497/* 531/*
498 * the caller must hold vfsmount_lock 532 * vfsmount lock must be held for write
499 */ 533 */
500static void commit_tree(struct vfsmount *mnt) 534static void commit_tree(struct vfsmount *mnt)
501{ 535{
@@ -610,6 +644,7 @@ static inline void __mntput(struct vfsmount *mnt)
610 * provides barriers, so count_mnt_writers() below is safe. AV 644 * provides barriers, so count_mnt_writers() below is safe. AV
611 */ 645 */
612 WARN_ON(count_mnt_writers(mnt)); 646 WARN_ON(count_mnt_writers(mnt));
647 fsnotify_vfsmount_delete(mnt);
613 dput(mnt->mnt_root); 648 dput(mnt->mnt_root);
614 free_vfsmnt(mnt); 649 free_vfsmnt(mnt);
615 deactivate_super(sb); 650 deactivate_super(sb);
@@ -618,40 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
618void mntput_no_expire(struct vfsmount *mnt) 653void mntput_no_expire(struct vfsmount *mnt)
619{ 654{
620repeat: 655repeat:
621 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 656 if (atomic_add_unless(&mnt->mnt_count, -1, 1))
622 if (likely(!mnt->mnt_pinned)) { 657 return;
623 spin_unlock(&vfsmount_lock); 658 br_write_lock(vfsmount_lock);
624 __mntput(mnt); 659 if (!atomic_dec_and_test(&mnt->mnt_count)) {
625 return; 660 br_write_unlock(vfsmount_lock);
626 } 661 return;
627 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); 662 }
628 mnt->mnt_pinned = 0; 663 if (likely(!mnt->mnt_pinned)) {
629 spin_unlock(&vfsmount_lock); 664 br_write_unlock(vfsmount_lock);
630 acct_auto_close_mnt(mnt); 665 __mntput(mnt);
631 security_sb_umount_close(mnt); 666 return;
632 goto repeat;
633 } 667 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt);
672 goto repeat;
634} 673}
635
636EXPORT_SYMBOL(mntput_no_expire); 674EXPORT_SYMBOL(mntput_no_expire);
637 675
638void mnt_pin(struct vfsmount *mnt) 676void mnt_pin(struct vfsmount *mnt)
639{ 677{
640 spin_lock(&vfsmount_lock); 678 br_write_lock(vfsmount_lock);
641 mnt->mnt_pinned++; 679 mnt->mnt_pinned++;
642 spin_unlock(&vfsmount_lock); 680 br_write_unlock(vfsmount_lock);
643} 681}
644 682
645EXPORT_SYMBOL(mnt_pin); 683EXPORT_SYMBOL(mnt_pin);
646 684
647void mnt_unpin(struct vfsmount *mnt) 685void mnt_unpin(struct vfsmount *mnt)
648{ 686{
649 spin_lock(&vfsmount_lock); 687 br_write_lock(vfsmount_lock);
650 if (mnt->mnt_pinned) { 688 if (mnt->mnt_pinned) {
651 atomic_inc(&mnt->mnt_count); 689 atomic_inc(&mnt->mnt_count);
652 mnt->mnt_pinned--; 690 mnt->mnt_pinned--;
653 } 691 }
654 spin_unlock(&vfsmount_lock); 692 br_write_unlock(vfsmount_lock);
655} 693}
656 694
657EXPORT_SYMBOL(mnt_unpin); 695EXPORT_SYMBOL(mnt_unpin);
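
The mntput_no_expire() rewrite above keeps the common case lock-free: atomic_add_unless() declines to drop the final reference, so only a potential last put takes the brlock write side, and the count is re-checked under the lock:

    if (atomic_add_unless(&mnt->mnt_count, -1, 1))
        return;                          /* count was > 1: no lock taken */
    br_write_lock(vfsmount_lock);        /* possibly the last reference  */
    if (!atomic_dec_and_test(&mnt->mnt_count)) {
        br_write_unlock(vfsmount_lock);  /* raced; someone else holds it */
        return;
    }
    /* ... pinned handling and __mntput() tear-down as above ... */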
@@ -742,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
742 struct mnt_namespace *ns = p->ns; 780 struct mnt_namespace *ns = p->ns;
743 int res = 0; 781 int res = 0;
744 782
745 spin_lock(&vfsmount_lock); 783 br_read_lock(vfsmount_lock);
746 if (p->event != ns->event) { 784 if (p->event != ns->event) {
747 p->event = ns->event; 785 p->event = ns->event;
748 res = 1; 786 res = 1;
749 } 787 }
750 spin_unlock(&vfsmount_lock); 788 br_read_unlock(vfsmount_lock);
751 789
752 return res; 790 return res;
753} 791}
@@ -784,7 +822,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
784 { MNT_NOATIME, ",noatime" }, 822 { MNT_NOATIME, ",noatime" },
785 { MNT_NODIRATIME, ",nodiratime" }, 823 { MNT_NODIRATIME, ",nodiratime" },
786 { MNT_RELATIME, ",relatime" }, 824 { MNT_RELATIME, ",relatime" },
787 { MNT_STRICTATIME, ",strictatime" },
788 { 0, NULL } 825 { 0, NULL }
789 }; 826 };
790 const struct proc_fs_info *fs_infop; 827 const struct proc_fs_info *fs_infop;
@@ -949,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
949 int minimum_refs = 0; 986 int minimum_refs = 0;
950 struct vfsmount *p; 987 struct vfsmount *p;
951 988
952 spin_lock(&vfsmount_lock); 989 br_read_lock(vfsmount_lock);
953 for (p = mnt; p; p = next_mnt(p, mnt)) { 990 for (p = mnt; p; p = next_mnt(p, mnt)) {
954 actual_refs += atomic_read(&p->mnt_count); 991 actual_refs += atomic_read(&p->mnt_count);
955 minimum_refs += 2; 992 minimum_refs += 2;
956 } 993 }
957 spin_unlock(&vfsmount_lock); 994 br_read_unlock(vfsmount_lock);
958 995
959 if (actual_refs > minimum_refs) 996 if (actual_refs > minimum_refs)
960 return 0; 997 return 0;
@@ -981,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
981{ 1018{
982 int ret = 1; 1019 int ret = 1;
983 down_read(&namespace_sem); 1020 down_read(&namespace_sem);
984 spin_lock(&vfsmount_lock); 1021 br_read_lock(vfsmount_lock);
985 if (propagate_mount_busy(mnt, 2)) 1022 if (propagate_mount_busy(mnt, 2))
986 ret = 0; 1023 ret = 0;
987 spin_unlock(&vfsmount_lock); 1024 br_read_unlock(vfsmount_lock);
988 up_read(&namespace_sem); 1025 up_read(&namespace_sem);
989 return ret; 1026 return ret;
990} 1027}
@@ -1000,13 +1037,14 @@ void release_mounts(struct list_head *head)
1000 if (mnt->mnt_parent != mnt) { 1037 if (mnt->mnt_parent != mnt) {
1001 struct dentry *dentry; 1038 struct dentry *dentry;
1002 struct vfsmount *m; 1039 struct vfsmount *m;
1003 spin_lock(&vfsmount_lock); 1040
1041 br_write_lock(vfsmount_lock);
1004 dentry = mnt->mnt_mountpoint; 1042 dentry = mnt->mnt_mountpoint;
1005 m = mnt->mnt_parent; 1043 m = mnt->mnt_parent;
1006 mnt->mnt_mountpoint = mnt->mnt_root; 1044 mnt->mnt_mountpoint = mnt->mnt_root;
1007 mnt->mnt_parent = mnt; 1045 mnt->mnt_parent = mnt;
1008 m->mnt_ghosts--; 1046 m->mnt_ghosts--;
1009 spin_unlock(&vfsmount_lock); 1047 br_write_unlock(vfsmount_lock);
1010 dput(dentry); 1048 dput(dentry);
1011 mntput(m); 1049 mntput(m);
1012 } 1050 }
@@ -1014,6 +1052,10 @@ void release_mounts(struct list_head *head)
1014 } 1052 }
1015} 1053}
1016 1054
1055/*
1056 * vfsmount lock must be held for write
1057 * namespace_sem must be held for write
1058 */
1017void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1018{ 1060{
1019 struct vfsmount *p; 1061 struct vfsmount *p;
@@ -1104,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1104 } 1146 }
1105 1147
1106 down_write(&namespace_sem); 1148 down_write(&namespace_sem);
1107 spin_lock(&vfsmount_lock); 1149 br_write_lock(vfsmount_lock);
1108 event++; 1150 event++;
1109 1151
1110 if (!(flags & MNT_DETACH)) 1152 if (!(flags & MNT_DETACH))
@@ -1116,9 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1116 umount_tree(mnt, 1, &umount_list); 1158 umount_tree(mnt, 1, &umount_list);
1117 retval = 0; 1159 retval = 0;
1118 } 1160 }
1119 spin_unlock(&vfsmount_lock); 1161 br_write_unlock(vfsmount_lock);
1120 if (retval)
1121 security_sb_umount_busy(mnt);
1122 up_write(&namespace_sem); 1162 up_write(&namespace_sem);
1123 release_mounts(&umount_list); 1163 release_mounts(&umount_list);
1124 return retval; 1164 return retval;
@@ -1230,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
1230 q = clone_mnt(p, p->mnt_root, flag); 1270 q = clone_mnt(p, p->mnt_root, flag);
1231 if (!q) 1271 if (!q)
1232 goto Enomem; 1272 goto Enomem;
1233 spin_lock(&vfsmount_lock); 1273 br_write_lock(vfsmount_lock);
1234 list_add_tail(&q->mnt_list, &res->mnt_list); 1274 list_add_tail(&q->mnt_list, &res->mnt_list);
1235 attach_mnt(q, &path); 1275 attach_mnt(q, &path);
1236 spin_unlock(&vfsmount_lock); 1276 br_write_unlock(vfsmount_lock);
1237 } 1277 }
1238 } 1278 }
1239 return res; 1279 return res;
1240Enomem: 1280Enomem:
1241 if (res) { 1281 if (res) {
1242 LIST_HEAD(umount_list); 1282 LIST_HEAD(umount_list);
1243 spin_lock(&vfsmount_lock); 1283 br_write_lock(vfsmount_lock);
1244 umount_tree(res, 0, &umount_list); 1284 umount_tree(res, 0, &umount_list);
1245 spin_unlock(&vfsmount_lock); 1285 br_write_unlock(vfsmount_lock);
1246 release_mounts(&umount_list); 1286 release_mounts(&umount_list);
1247 } 1287 }
1248 return NULL; 1288 return NULL;
@@ -1261,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
1261{ 1301{
1262 LIST_HEAD(umount_list); 1302 LIST_HEAD(umount_list);
1263 down_write(&namespace_sem); 1303 down_write(&namespace_sem);
1264 spin_lock(&vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1265 umount_tree(mnt, 0, &umount_list); 1305 umount_tree(mnt, 0, &umount_list);
1266 spin_unlock(&vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1267 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1268 release_mounts(&umount_list); 1308 release_mounts(&umount_list);
1269} 1309}
@@ -1391,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1391 if (err) 1431 if (err)
1392 goto out_cleanup_ids; 1432 goto out_cleanup_ids;
1393 1433
1394 spin_lock(&vfsmount_lock); 1434 br_write_lock(vfsmount_lock);
1395 1435
1396 if (IS_MNT_SHARED(dest_mnt)) { 1436 if (IS_MNT_SHARED(dest_mnt)) {
1397 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1437 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1410,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1410 list_del_init(&child->mnt_hash); 1450 list_del_init(&child->mnt_hash);
1411 commit_tree(child); 1451 commit_tree(child);
1412 } 1452 }
1413 spin_unlock(&vfsmount_lock); 1453 br_write_unlock(vfsmount_lock);
1454
1414 return 0; 1455 return 0;
1415 1456
1416 out_cleanup_ids: 1457 out_cleanup_ids:
@@ -1435,28 +1476,38 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1435 if (cant_mount(path->dentry)) 1476 if (cant_mount(path->dentry))
1436 goto out_unlock; 1477 goto out_unlock;
1437 1478
1438 err = security_sb_check_sb(mnt, path);
1439 if (err)
1440 goto out_unlock;
1441
1442 err = -ENOENT;
1443 if (!d_unlinked(path->dentry)) 1479 if (!d_unlinked(path->dentry))
1444 err = attach_recursive_mnt(mnt, path, NULL); 1480 err = attach_recursive_mnt(mnt, path, NULL);
1445out_unlock: 1481out_unlock:
1446 mutex_unlock(&path->dentry->d_inode->i_mutex); 1482 mutex_unlock(&path->dentry->d_inode->i_mutex);
1447 if (!err)
1448 security_sb_post_addmount(mnt, path);
1449 return err; 1483 return err;
1450} 1484}
1451 1485
1452/* 1486/*
1487 * Sanity check the flags to change_mnt_propagation.
1488 */
1489
1490static int flags_to_propagation_type(int flags)
1491{
1492 int type = flags & ~MS_REC;
1493
1494 /* Fail if any non-propagation flags are set */
1495 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1496 return 0;
1497 /* Only one propagation flag should be set */
1498 if (!is_power_of_2(type))
1499 return 0;
1500 return type;
1501}
1502
1503/*
1453 * recursively change the type of the mountpoint. 1504 * recursively change the type of the mountpoint.
1454 */ 1505 */
1455static int do_change_type(struct path *path, int flag) 1506static int do_change_type(struct path *path, int flag)
1456{ 1507{
1457 struct vfsmount *m, *mnt = path->mnt; 1508 struct vfsmount *m, *mnt = path->mnt;
1458 int recurse = flag & MS_REC; 1509 int recurse = flag & MS_REC;
1459 int type = flag & ~MS_REC; 1510 int type;
1460 int err = 0; 1511 int err = 0;
1461 1512
1462 if (!capable(CAP_SYS_ADMIN)) 1513 if (!capable(CAP_SYS_ADMIN))
@@ -1465,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
1465 if (path->dentry != path->mnt->mnt_root) 1516 if (path->dentry != path->mnt->mnt_root)
1466 return -EINVAL; 1517 return -EINVAL;
1467 1518
1519 type = flags_to_propagation_type(flag);
1520 if (!type)
1521 return -EINVAL;
1522
1468 down_write(&namespace_sem); 1523 down_write(&namespace_sem);
1469 if (type == MS_SHARED) { 1524 if (type == MS_SHARED) {
1470 err = invent_group_ids(mnt, recurse); 1525 err = invent_group_ids(mnt, recurse);
@@ -1472,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
1472 goto out_unlock; 1527 goto out_unlock;
1473 } 1528 }
1474 1529
1475 spin_lock(&vfsmount_lock); 1530 br_write_lock(vfsmount_lock);
1476 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1531 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1477 change_mnt_propagation(m, type); 1532 change_mnt_propagation(m, type);
1478 spin_unlock(&vfsmount_lock); 1533 br_write_unlock(vfsmount_lock);
1479 1534
1480 out_unlock: 1535 out_unlock:
1481 up_write(&namespace_sem); 1536 up_write(&namespace_sem);
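
The new flags_to_propagation_type() helper above rejects any non-propagation flag and, via is_power_of_2(), insists that exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE and MS_UNBINDABLE is set once MS_REC is masked off. A standalone sketch of the same check; the flag values here are illustrative stand-ins, not the kernel's real MS_* constants:

#include <stdio.h>

/* Hypothetical stand-ins for the MS_* constants: one distinct bit each. */
#define MS_REC        (1u << 0)
#define MS_SHARED     (1u << 1)
#define MS_PRIVATE    (1u << 2)
#define MS_SLAVE      (1u << 3)
#define MS_UNBINDABLE (1u << 4)

/* Exactly one bit set, as the kernel's is_power_of_2() tests. */
static int is_power_of_2(unsigned int n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static unsigned int flags_to_propagation_type(unsigned int flags)
{
	unsigned int type = flags & ~MS_REC;

	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return 0;		/* stray non-propagation flag */
	if (!is_power_of_2(type))
		return 0;		/* zero flags, or more than one */
	return type;
}

int main(void)
{
	printf("%u\n", flags_to_propagation_type(MS_SHARED | MS_REC));   /* accepted */
	printf("%u\n", flags_to_propagation_type(MS_SHARED | MS_SLAVE)); /* 0: ambiguous */
	return 0;
}

Returning 0 for "invalid" works because no valid propagation type is zero, which is what lets do_change_type() map the failure straight to -EINVAL.
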
@@ -1519,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
1519 err = graft_tree(mnt, path); 1574 err = graft_tree(mnt, path);
1520 if (err) { 1575 if (err) {
1521 LIST_HEAD(umount_list); 1576 LIST_HEAD(umount_list);
1522 spin_lock(&vfsmount_lock); 1577
1578 br_write_lock(vfsmount_lock);
1523 umount_tree(mnt, 0, &umount_list); 1579 umount_tree(mnt, 0, &umount_list);
1524 spin_unlock(&vfsmount_lock); 1580 br_write_unlock(vfsmount_lock);
1525 release_mounts(&umount_list); 1581 release_mounts(&umount_list);
1526 } 1582 }
1527 1583
@@ -1574,18 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1574 else 1630 else
1575 err = do_remount_sb(sb, flags, data, 0); 1631 err = do_remount_sb(sb, flags, data, 0);
1576 if (!err) { 1632 if (!err) {
1577 spin_lock(&vfsmount_lock); 1633 br_write_lock(vfsmount_lock);
1578 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; 1634 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1579 path->mnt->mnt_flags = mnt_flags; 1635 path->mnt->mnt_flags = mnt_flags;
1580 spin_unlock(&vfsmount_lock); 1636 br_write_unlock(vfsmount_lock);
1581 } 1637 }
1582 up_write(&sb->s_umount); 1638 up_write(&sb->s_umount);
1583 if (!err) { 1639 if (!err) {
1584 security_sb_post_remount(path->mnt, flags, data); 1640 br_write_lock(vfsmount_lock);
1585
1586 spin_lock(&vfsmount_lock);
1587 touch_mnt_namespace(path->mnt->mnt_ns); 1641 touch_mnt_namespace(path->mnt->mnt_ns);
1588 spin_unlock(&vfsmount_lock); 1642 br_write_unlock(vfsmount_lock);
1589 } 1643 }
1590 return err; 1644 return err;
1591} 1645}
@@ -1762,7 +1816,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1762 return; 1816 return;
1763 1817
1764 down_write(&namespace_sem); 1818 down_write(&namespace_sem);
1765 spin_lock(&vfsmount_lock); 1819 br_write_lock(vfsmount_lock);
1766 1820
1767 /* extract from the expiration list every vfsmount that matches the 1821 /* extract from the expiration list every vfsmount that matches the
1768 * following criteria: 1822 * following criteria:
@@ -1781,7 +1835,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1781 touch_mnt_namespace(mnt->mnt_ns); 1835 touch_mnt_namespace(mnt->mnt_ns);
1782 umount_tree(mnt, 1, &umounts); 1836 umount_tree(mnt, 1, &umounts);
1783 } 1837 }
1784 spin_unlock(&vfsmount_lock); 1838 br_write_unlock(vfsmount_lock);
1785 up_write(&namespace_sem); 1839 up_write(&namespace_sem);
1786 1840
1787 release_mounts(&umounts); 1841 release_mounts(&umounts);
@@ -1838,6 +1892,8 @@ resume:
1838/* 1892/*
1839 * process a list of expirable mountpoints with the intent of discarding any 1893 * process a list of expirable mountpoints with the intent of discarding any
1840 * submounts of a specific parent mountpoint 1894 * submounts of a specific parent mountpoint
1895 *
1896 * vfsmount_lock must be held for write
1841 */ 1897 */
1842static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) 1898static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
1843{ 1899{
@@ -1996,7 +2052,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1996 if (flags & MS_RDONLY) 2052 if (flags & MS_RDONLY)
1997 mnt_flags |= MNT_READONLY; 2053 mnt_flags |= MNT_READONLY;
1998 2054
1999 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 2055 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2000 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2056 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2001 MS_STRICTATIME); 2057 MS_STRICTATIME);
2002 2058
@@ -2056,9 +2112,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2056 kfree(new_ns); 2112 kfree(new_ns);
2057 return ERR_PTR(-ENOMEM); 2113 return ERR_PTR(-ENOMEM);
2058 } 2114 }
2059 spin_lock(&vfsmount_lock); 2115 br_write_lock(vfsmount_lock);
2060 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 2116 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
2061 spin_unlock(&vfsmount_lock); 2117 br_write_unlock(vfsmount_lock);
2062 2118
2063 /* 2119 /*
2064 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2120 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2220,10 +2276,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2220 goto out1; 2276 goto out1;
2221 } 2277 }
2222 2278
2223 read_lock(&current->fs->lock); 2279 get_fs_root(current->fs, &root);
2224 root = current->fs->root;
2225 path_get(&current->fs->root);
2226 read_unlock(&current->fs->lock);
2227 down_write(&namespace_sem); 2280 down_write(&namespace_sem);
2228 mutex_lock(&old.dentry->d_inode->i_mutex); 2281 mutex_lock(&old.dentry->d_inode->i_mutex);
2229 error = -EINVAL; 2282 error = -EINVAL;
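
get_fs_root() collapses the open-coded take-lock/copy/path_get/drop-lock sequence that pivot_root() used to carry. A kernel-context sketch of roughly what the 2.6.36 helper amounts to, assuming fs->lock is still the rwlock_t inside struct fs_struct at this point:

void get_fs_root(struct fs_struct *fs, struct path *root)
{
	read_lock(&fs->lock);
	*root = fs->root;	/* copy the path under the lock ... */
	path_get(root);		/* ... and pin it before dropping it */
	read_unlock(&fs->lock);
}

The caller owns a reference afterwards and must balance it with path_put(&root), which pivot_root() does on its exit paths.
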
@@ -2255,7 +2308,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2255 goto out2; /* not attached */ 2308 goto out2; /* not attached */
2256 /* make sure we can reach put_old from new_root */ 2309 /* make sure we can reach put_old from new_root */
2257 tmp = old.mnt; 2310 tmp = old.mnt;
2258 spin_lock(&vfsmount_lock); 2311 br_write_lock(vfsmount_lock);
2259 if (tmp != new.mnt) { 2312 if (tmp != new.mnt) {
2260 for (;;) { 2313 for (;;) {
2261 if (tmp->mnt_parent == tmp) 2314 if (tmp->mnt_parent == tmp)
@@ -2275,9 +2328,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2275 /* mount new_root on / */ 2328 /* mount new_root on / */
2276 attach_mnt(new.mnt, &root_parent); 2329 attach_mnt(new.mnt, &root_parent);
2277 touch_mnt_namespace(current->nsproxy->mnt_ns); 2330 touch_mnt_namespace(current->nsproxy->mnt_ns);
2278 spin_unlock(&vfsmount_lock); 2331 br_write_unlock(vfsmount_lock);
2279 chroot_fs_refs(&root, &new); 2332 chroot_fs_refs(&root, &new);
2280 security_sb_post_pivotroot(&root, &new);
2281 error = 0; 2333 error = 0;
2282 path_put(&root_parent); 2334 path_put(&root_parent);
2283 path_put(&parent_path); 2335 path_put(&parent_path);
@@ -2291,7 +2343,7 @@ out1:
2291out0: 2343out0:
2292 return error; 2344 return error;
2293out3: 2345out3:
2294 spin_unlock(&vfsmount_lock); 2346 br_write_unlock(vfsmount_lock);
2295 goto out2; 2347 goto out2;
2296} 2348}
2297 2349
@@ -2338,6 +2390,8 @@ void __init mnt_init(void)
2338 for (u = 0; u < HASH_SIZE; u++) 2390 for (u = 0; u < HASH_SIZE; u++)
2339 INIT_LIST_HEAD(&mount_hashtable[u]); 2391 INIT_LIST_HEAD(&mount_hashtable[u]);
2340 2392
2393 br_lock_init(vfsmount_lock);
2394
2341 err = sysfs_init(); 2395 err = sysfs_init();
2342 if (err) 2396 if (err)
2343 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2397 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2356,9 +2410,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2356 if (!atomic_dec_and_test(&ns->count)) 2410 if (!atomic_dec_and_test(&ns->count))
2357 return; 2411 return;
2358 down_write(&namespace_sem); 2412 down_write(&namespace_sem);
2359 spin_lock(&vfsmount_lock); 2413 br_write_lock(vfsmount_lock);
2360 umount_tree(ns->root, 0, &umount_list); 2414 umount_tree(ns->root, 0, &umount_list);
2361 spin_unlock(&vfsmount_lock); 2415 br_write_unlock(vfsmount_lock);
2362 up_write(&namespace_sem); 2416 up_write(&namespace_sem);
2363 release_mounts(&umount_list); 2417 release_mounts(&umount_list);
2364 kfree(ns); 2418 kfree(ns);
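
Every spin_lock(&vfsmount_lock) pair in this file becomes br_write_lock()/br_write_unlock(): in 2.6.36 vfsmount_lock turns into a "big reader" lock (the lglock/brlock machinery behind the br_lock_init() call added above), where a reader takes only its own per-CPU lock while a writer must take all of them. A standalone sketch of the idea, substituting per-thread pthread mutexes for per-CPU spinlocks; NR_SLOTS and the names are illustrative:

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4	/* stand-in for the per-CPU dimension */

static pthread_mutex_t br_lock[NR_SLOTS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Readers touch one slot only: cheap, no cross-CPU cacheline traffic. */
static void br_read_lock(int slot)   { pthread_mutex_lock(&br_lock[slot]); }
static void br_read_unlock(int slot) { pthread_mutex_unlock(&br_lock[slot]); }

/* Writers take every slot, excluding all readers at once: expensive,
 * which is acceptable for rare events like mount and umount. */
static void br_write_lock(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		pthread_mutex_lock(&br_lock[i]);
}

static void br_write_unlock(void)
{
	for (int i = NR_SLOTS - 1; i >= 0; i--)
		pthread_mutex_unlock(&br_lock[i]);
}

int main(void)
{
	br_read_lock(0);	/* e.g. a path walk */
	br_read_unlock(0);
	br_write_lock();	/* e.g. umount_tree() */
	br_write_unlock();
	puts("ok");
	return 0;
}

Since this hunk series only shows write-side conversions, the payoff is on the read side (path lookup), which no longer bounces a global spinlock cacheline between CPUs.
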
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
55#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
56 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
57#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
295 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
296 .read = ncp_file_read, 296 .read = ncp_file_read,
297 .write = ncp_file_write, 297 .write = ncp_file_write,
298 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
299#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
300 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
301#endif 301#endif
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index fa3385154023..b4de38cf49f5 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -43,7 +43,7 @@
43#define NCP_DEFAULT_TIME_OUT 10 43#define NCP_DEFAULT_TIME_OUT 10
44#define NCP_DEFAULT_RETRY_COUNT 20 44#define NCP_DEFAULT_RETRY_COUNT 20
45 45
46static void ncp_delete_inode(struct inode *); 46static void ncp_evict_inode(struct inode *);
47static void ncp_put_super(struct super_block *); 47static void ncp_put_super(struct super_block *);
48static int ncp_statfs(struct dentry *, struct kstatfs *); 48static int ncp_statfs(struct dentry *, struct kstatfs *);
49static int ncp_show_options(struct seq_file *, struct vfsmount *); 49static int ncp_show_options(struct seq_file *, struct vfsmount *);
@@ -100,7 +100,7 @@ static const struct super_operations ncp_sops =
100 .alloc_inode = ncp_alloc_inode, 100 .alloc_inode = ncp_alloc_inode,
101 .destroy_inode = ncp_destroy_inode, 101 .destroy_inode = ncp_destroy_inode,
102 .drop_inode = generic_delete_inode, 102 .drop_inode = generic_delete_inode,
103 .delete_inode = ncp_delete_inode, 103 .evict_inode = ncp_evict_inode,
104 .put_super = ncp_put_super, 104 .put_super = ncp_put_super,
105 .statfs = ncp_statfs, 105 .statfs = ncp_statfs,
106 .remount_fs = ncp_remount, 106 .remount_fs = ncp_remount,
@@ -282,19 +282,19 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
282} 282}
283 283
284static void 284static void
285ncp_delete_inode(struct inode *inode) 285ncp_evict_inode(struct inode *inode)
286{ 286{
287 truncate_inode_pages(&inode->i_data, 0); 287 truncate_inode_pages(&inode->i_data, 0);
288 end_writeback(inode);
288 289
289 if (S_ISDIR(inode->i_mode)) { 290 if (S_ISDIR(inode->i_mode)) {
290 DDPRINTK("ncp_delete_inode: put directory %ld\n", inode->i_ino); 291 DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
291 } 292 }
292 293
293 if (ncp_make_closed(inode) != 0) { 294 if (ncp_make_closed(inode) != 0) {
294 /* We can't do anything but complain. */ 295 /* We can't do anything but complain. */
295 printk(KERN_ERR "ncp_delete_inode: could not close\n"); 296 printk(KERN_ERR "ncp_evict_inode: could not close\n");
296 } 297 }
297 clear_inode(inode);
298} 298}
299 299
300static void ncp_stop_tasks(struct ncp_server *server) { 300static void ncp_stop_tasks(struct ncp_server *server) {
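
The delete_inode/clear_inode pair is folded into a single ->evict_inode() method in 2.6.36: truncate the page cache, call end_writeback() to finish the VFS-side teardown (replacing the trailing clear_inode() call), then do filesystem-private cleanup. A hedged kernel-context skeleton of the converted shape, following ncp_evict_inode() above:

static void example_evict_inode(struct inode *inode)
{
	/* Drop any cached pages belonging to this inode. */
	truncate_inode_pages(&inode->i_data, 0);
	/* Tell the VFS the inode is done with writeback; this is what
	 * previously happened implicitly via clear_inode(). */
	end_writeback(inode);
	/* Filesystem-private teardown comes last, e.g. closing the
	 * remote file as ncp_make_closed() does above. */
}
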
@@ -728,8 +728,8 @@ out_fput:
728out_bdi: 728out_bdi:
729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
730 * 730 *
731 * The previously used put_filp(ncp_filp); was bogous, since 731 * The previously used put_filp(ncp_filp); was bogus, since
732 * it doesn't proper unlocking. 732 * it doesn't perform proper unlocking.
733 */ 733 */
734 fput(ncp_filp); 734 fput(ncp_filp);
735out: 735out:
@@ -924,9 +924,8 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
924 tmpattr.ia_valid = ATTR_MODE; 924 tmpattr.ia_valid = ATTR_MODE;
925 tmpattr.ia_mode = attr->ia_mode; 925 tmpattr.ia_mode = attr->ia_mode;
926 926
927 result = inode_setattr(inode, &tmpattr); 927 setattr_copy(inode, &tmpattr);
928 if (result) 928 mark_inode_dirty(inode);
929 goto out;
930 } 929 }
931 } 930 }
932#endif 931#endif
@@ -954,15 +953,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
954 result = ncp_make_closed(inode); 953 result = ncp_make_closed(inode);
955 if (result) 954 if (result)
956 goto out; 955 goto out;
957 { 956
958 struct iattr tmpattr; 957 if (attr->ia_size != i_size_read(inode)) {
959 958 result = vmtruncate(inode, attr->ia_size);
960 tmpattr.ia_valid = ATTR_SIZE;
961 tmpattr.ia_size = attr->ia_size;
962
963 result = inode_setattr(inode, &tmpattr);
964 if (result) 959 if (result)
965 goto out; 960 goto out;
961 mark_inode_dirty(inode);
966 } 962 }
967 } 963 }
968 if ((attr->ia_valid & ATTR_CTIME) != 0) { 964 if ((attr->ia_valid & ATTR_CTIME) != 0) {
@@ -1002,8 +998,12 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
1002 NCP_FINFO(inode)->nwattr = info.attributes; 998 NCP_FINFO(inode)->nwattr = info.attributes;
1003#endif 999#endif
1004 } 1000 }
1005 if (!result) 1001 if (result)
1006 result = inode_setattr(inode, attr); 1002 goto out;
1003
1004 setattr_copy(inode, attr);
1005 mark_inode_dirty(inode);
1006
1007out: 1007out:
1008 unlock_kernel(); 1008 unlock_kernel();
1009 return result; 1009 return result;
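
inode_setattr() is gone in 2.6.36, and each ->setattr instance now spells out its two halves: an explicit size change (vmtruncate() here) and setattr_copy(), which copies the remaining uid/gid/mode/timestamps into the inode and cannot fail, followed by mark_inode_dirty(). A sketch of the resulting tail of a ->setattr method, under those assumptions:

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}
	setattr_copy(inode, attr);	/* void, cannot fail */
	mark_inode_dirty(inode);
	return 0;

Splitting the call is what lets ncp_notify_change() interleave its own server round-trips between the size change and the attribute copy.
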
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..84a8cfc4e38e 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -261,9 +261,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 261}
262#endif /* CONFIG_NCPFS_NLS */ 262#endif /* CONFIG_NCPFS_NLS */
263 263
264static int __ncp_ioctl(struct inode *inode, struct file *filp, 264static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
265 unsigned int cmd, unsigned long arg)
266{ 265{
266 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 267 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 268 int result;
269 struct ncp_ioctl_request request; 269 struct ncp_ioctl_request request;
@@ -841,11 +841,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
841 } 841 }
842} 842}
843 843
844int ncp_ioctl(struct inode *inode, struct file *filp, 844long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845 unsigned int cmd, unsigned long arg)
846{ 845{
847 int ret; 846 long ret;
848 847
848 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) { 849 if (ncp_ioctl_need_write(cmd)) {
850 /* 850 /*
851 * inside the ioctl(), any failures which 851 * inside the ioctl(), any failures which
@@ -853,24 +853,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
853 * -EACCESS, so it seems consistent to keep 853 * -EACCESS, so it seems consistent to keep
854 * that here. 854 * that here.
855 */ 855 */
856 if (mnt_want_write(filp->f_path.mnt)) 856 if (mnt_want_write(filp->f_path.mnt)) {
857 return -EACCES; 857 ret = -EACCES;
858 goto out;
859 }
858 } 860 }
859 ret = __ncp_ioctl(inode, filp, cmd, arg); 861 ret = __ncp_ioctl(filp, cmd, arg);
860 if (ncp_ioctl_need_write(cmd)) 862 if (ncp_ioctl_need_write(cmd))
861 mnt_drop_write(filp->f_path.mnt); 863 mnt_drop_write(filp->f_path.mnt);
864
865out:
866 unlock_kernel();
862 return ret; 867 return ret;
863} 868}
864 869
865#ifdef CONFIG_COMPAT 870#ifdef CONFIG_COMPAT
866long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 871long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
867{ 872{
868 struct inode *inode = file->f_path.dentry->d_inode; 873 long ret;
869 int ret;
870 874
871 lock_kernel(); 875 lock_kernel();
872 arg = (unsigned long) compat_ptr(arg); 876 arg = (unsigned long) compat_ptr(arg);
873 ret = ncp_ioctl(inode, file, cmd, arg); 877 ret = ncp_ioctl(file, cmd, arg);
874 unlock_kernel(); 878 unlock_kernel();
875 return ret; 879 return ret;
876} 880}
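
This is the standard ->ioctl to ->unlocked_ioctl migration: the inode argument disappears from the prototype (recovered from the file when needed), the return type widens to long, and the Big Kernel Lock, which the VFS used to take around ->ioctl, must now be taken by the handler itself if it still depends on it. A minimal sketch of the converted entry point; do_example_ioctl() is a hypothetical worker, not an ncpfs function:

static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* the VFS no longer takes the BKL for us */
	ret = do_example_ioctl(inode, cmd, arg);
	unlock_kernel();
	return ret;
}

Keeping the BKL explicitly, as ncp_ioctl() does above, preserves the old serialization while making the dependency visible so it can later be removed driver by driver.
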
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b924..f7e13db613cb 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,9 +61,9 @@ config NFS_V3_ACL
61 If unsure, say N. 61 If unsure, say N.
62 62
63config NFS_V4 63config NFS_V4
64 bool "NFS client support for NFS version 4 (EXPERIMENTAL)" 64 bool "NFS client support for NFS version 4"
65 depends on NFS_FS && EXPERIMENTAL 65 depends on NFS_FS
66 select RPCSEC_GSS_KRB5 66 select SUNRPC_GSS
67 help 67 help
68 This option enables support for version 4 of the NFS protocol 68 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client. 69 (RFC 3530) in the kernel's NFS client.
@@ -72,16 +72,16 @@ config NFS_V4
72 space programs which can be found in the Linux nfs-utils package, 72 space programs which can be found in the Linux nfs-utils package,
73 available from http://linux-nfs.org/. 73 available from http://linux-nfs.org/.
74 74
75 If unsure, say N. 75 If unsure, say Y.
76 76
77config NFS_V4_1 77config NFS_V4_1
78 bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" 78 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
79 depends on NFS_V4 && EXPERIMENTAL 79 depends on NFS_V4 && EXPERIMENTAL
80 help 80 help
81 This option enables support for minor version 1 of the NFSv4 protocol 81 This option enables support for minor version 1 of the NFSv4 protocol
82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. 82 (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
83 83
84 Unless you're an NFS developer, say N. 84 If unsure, say N.
85 85
86config ROOT_NFS 86config ROOT_NFS
87 bool "Root file system on NFS" 87 bool "Root file system on NFS"
@@ -100,3 +100,20 @@ config NFS_FSCACHE
100 help 100 help
101 Say Y here if you want NFS data to be cached locally on disc through 101 Say Y here if you want NFS data to be cached locally on disc through
102 the general filesystem cache manager 102 the general filesystem cache manager
103
104config NFS_USE_LEGACY_DNS
105 bool "Use the legacy NFS DNS resolver"
106 depends on NFS_V4
107 help
108 The kernel now provides a method for translating a host name into an
109 IP address. Select Y here if you would rather use your own DNS
110 resolver script.
111
112 If unsure, say N
113
114config NFS_USE_KERNEL_DNS
115 bool
116 depends on NFS_V4 && !NFS_USE_LEGACY_DNS
117 select DNS_RESOLVER
118 select KEYS
119 default y
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae95123..e17b49e2eabd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46#define NFS_CALLBACK_MAXPORTNR (65535U) 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47 47
48static int param_set_portnr(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, const struct kernel_param *kp)
49{ 49{
50 unsigned long num; 50 unsigned long num;
51 int ret; 51 int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
58 *((unsigned int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
59 return 0; 59 return 0;
60} 60}
61 61static struct kernel_param_ops param_ops_portnr = {
62static int param_get_portnr(char *buffer, struct kernel_param *kp) 62 .set = param_set_portnr,
63{ 63 .get = param_get_uint,
64 return param_get_uint(buffer, kp); 64};
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int); 65#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67 66
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); 67module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
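
2.6.36 moves module parameters from raw get/set function pointers to a struct kernel_param_ops, and the setter now takes a const struct kernel_param *. The naming is load-bearing: module_param_named(..., portnr, 0644) expands to references to param_ops_portnr and param_check_portnr(), which is why the ops table and the check macro keep those exact names. A hedged sketch of a range-checked unsigned parameter in the new style (the "bounded" names are illustrative):

static int param_set_bounded(const char *val, const struct kernel_param *kp)
{
	unsigned long num;

	if (!val || strict_strtoul(val, 0, &num) || num > 65535)
		return -EINVAL;
	*(unsigned int *)kp->arg = num;
	return 0;
}

static struct kernel_param_ops param_ops_bounded = {
	.set = param_set_bounded,
	.get = param_get_uint,	/* the stock getter slots straight in */
};
#define param_check_bounded(name, p) __param_check(name, p, unsigned int)

This also removes the param_get_portnr() wrapper above: with an ops table, param_get_uint can be used directly as the .get member.
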
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e857..930d10fecdaf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
37 if (inode == NULL) 37 if (inode == NULL)
38 goto out_putclient; 38 goto out_putclient;
39 nfsi = NFS_I(inode); 39 nfsi = NFS_I(inode);
40 down_read(&nfsi->rwsem); 40 rcu_read_lock();
41 delegation = nfsi->delegation; 41 delegation = rcu_dereference(nfsi->delegation);
42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) 42 if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
43 goto out_iput; 43 goto out_iput;
44 res->size = i_size_read(inode); 44 res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
53 args->bitmap[1]; 53 args->bitmap[1];
54 res->status = 0; 54 res->status = 0;
55out_iput: 55out_iput:
56 up_read(&nfsi->rwsem); 56 rcu_read_unlock();
57 iput(inode); 57 iput(inode);
58out_putclient: 58out_putclient:
59 nfs_put_client(clp); 59 nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
62 return res->status; 62 return res->status;
63} 63}
64 64
65static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
66{
67#if defined(CONFIG_NFS_V4_1)
68 if (clp->cl_minorversion > 0)
69 return nfs41_validate_delegation_stateid;
70#endif
71 return nfs4_validate_delegation_stateid;
72}
73
74
75__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 65__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
76{ 66{
77 struct nfs_client *clp; 67 struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
92 inode = nfs_delegation_find_inode(clp, &args->fh); 82 inode = nfs_delegation_find_inode(clp, &args->fh);
93 if (inode != NULL) { 83 if (inode != NULL) {
94 /* Set up a helper thread to actually return the delegation */ 84 /* Set up a helper thread to actually return the delegation */
95 switch (nfs_async_inode_return_delegation(inode, &args->stateid, 85 switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
96 nfs_validate_delegation_stateid(clp))) {
97 case 0: 86 case 0:
98 res = 0; 87 res = 0;
99 break; 88 break;
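
The getattr callback drops nfsi->rwsem in favour of an RCU read section: nfsi->delegation is an RCU-managed pointer, so the reader brackets its access with rcu_read_lock()/rcu_read_unlock() and loads the pointer through rcu_dereference(). The shape of the pattern, as a sketch:

	rcu_read_lock();
	delegation = rcu_dereference(nfsi->delegation);
	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
		goto out_unlock;
	/* read delegation fields; must not sleep, and must not keep
	 * the pointer once the read section ends */
out_unlock:
	rcu_read_unlock();

A concurrent delegation return frees the structure only after a grace period, so everything read inside the bracket stays valid without taking the semaphore.
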
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index acc9c4943b84..e7340729af89 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
150 clp->cl_boot_time = CURRENT_TIME; 150 clp->cl_boot_time = CURRENT_TIME;
151 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; 151 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
152 clp->cl_minorversion = cl_init->minorversion; 152 clp->cl_minorversion = cl_init->minorversion;
153 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
153#endif 154#endif
154 cred = rpc_lookup_machine_cred(); 155 cred = rpc_lookup_machine_cred();
155 if (!IS_ERR(cred)) 156 if (!IS_ERR(cred))
@@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
178 clp->cl_session = NULL; 179 clp->cl_session = NULL;
179 } 180 }
180 181
181 clp->cl_call_sync = _nfs4_call_sync; 182 clp->cl_mvops = nfs_v4_minor_ops[0];
182#endif /* CONFIG_NFS_V4_1 */ 183#endif /* CONFIG_NFS_V4_1 */
183} 184}
184 185
@@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
188static void nfs4_destroy_callback(struct nfs_client *clp) 189static void nfs4_destroy_callback(struct nfs_client *clp)
189{ 190{
190 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) 191 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
191 nfs_callback_down(clp->cl_minorversion); 192 nfs_callback_down(clp->cl_mvops->minor_version);
192} 193}
193 194
194static void nfs4_shutdown_client(struct nfs_client *clp) 195static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -274,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
274 sin1->sin6_scope_id != sin2->sin6_scope_id) 275 sin1->sin6_scope_id != sin2->sin6_scope_id)
275 return 0; 276 return 0;
276 277
277 return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); 278 return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
278} 279}
279#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ 280#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
280static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, 281static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
@@ -934,7 +935,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
934 } 935 }
935 936
936 fsinfo.fattr = fattr; 937 fsinfo.fattr = fattr;
937 nfs_fattr_init(fattr);
938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); 938 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
939 if (error < 0) 939 if (error < 0)
940 goto out_error; 940 goto out_error;
@@ -1047,13 +1047,18 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1047 struct nfs_fh *mntfh) 1047 struct nfs_fh *mntfh)
1048{ 1048{
1049 struct nfs_server *server; 1049 struct nfs_server *server;
1050 struct nfs_fattr fattr; 1050 struct nfs_fattr *fattr;
1051 int error; 1051 int error;
1052 1052
1053 server = nfs_alloc_server(); 1053 server = nfs_alloc_server();
1054 if (!server) 1054 if (!server)
1055 return ERR_PTR(-ENOMEM); 1055 return ERR_PTR(-ENOMEM);
1056 1056
1057 error = -ENOMEM;
1058 fattr = nfs_alloc_fattr();
1059 if (fattr == NULL)
1060 goto error;
1061
1057 /* Get a client representation */ 1062 /* Get a client representation */
1058 error = nfs_init_server(server, data); 1063 error = nfs_init_server(server, data);
1059 if (error < 0) 1064 if (error < 0)
@@ -1064,7 +1069,7 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1064 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1069 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1065 1070
1066 /* Probe the root fh to retrieve its FSID */ 1071 /* Probe the root fh to retrieve its FSID */
1067 error = nfs_probe_fsinfo(server, mntfh, &fattr); 1072 error = nfs_probe_fsinfo(server, mntfh, fattr);
1068 if (error < 0) 1073 if (error < 0)
1069 goto error; 1074 goto error;
1070 if (server->nfs_client->rpc_ops->version == 3) { 1075 if (server->nfs_client->rpc_ops->version == 3) {
@@ -1077,14 +1082,14 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1077 server->namelen = NFS2_MAXNAMLEN; 1082 server->namelen = NFS2_MAXNAMLEN;
1078 } 1083 }
1079 1084
1080 if (!(fattr.valid & NFS_ATTR_FATTR)) { 1085 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1081 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 1086 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
1082 if (error < 0) { 1087 if (error < 0) {
1083 dprintk("nfs_create_server: getattr error = %d\n", -error); 1088 dprintk("nfs_create_server: getattr error = %d\n", -error);
1084 goto error; 1089 goto error;
1085 } 1090 }
1086 } 1091 }
1087 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); 1092 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
1088 1093
1089 dprintk("Server FSID: %llx:%llx\n", 1094 dprintk("Server FSID: %llx:%llx\n",
1090 (unsigned long long) server->fsid.major, 1095 (unsigned long long) server->fsid.major,
@@ -1096,9 +1101,11 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1096 spin_unlock(&nfs_client_lock); 1101 spin_unlock(&nfs_client_lock);
1097 1102
1098 server->mount_time = jiffies; 1103 server->mount_time = jiffies;
1104 nfs_free_fattr(fattr);
1099 return server; 1105 return server;
1100 1106
1101error: 1107error:
1108 nfs_free_fattr(fattr);
1102 nfs_free_server(server); 1109 nfs_free_server(server);
1103 return ERR_PTR(error); 1110 return ERR_PTR(error);
1104} 1111}
@@ -1120,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
1120 return error; 1127 return error;
1121 } 1128 }
1122 1129
1123 error = nfs_callback_up(clp->cl_minorversion, 1130 error = nfs_callback_up(clp->cl_mvops->minor_version,
1124 clp->cl_rpcclient->cl_xprt); 1131 clp->cl_rpcclient->cl_xprt);
1125 if (error < 0) { 1132 if (error < 0) {
1126 dprintk("%s: failed to start callback. Error = %d\n", 1133 dprintk("%s: failed to start callback. Error = %d\n",
@@ -1137,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
1137 */ 1144 */
1138static int nfs4_init_client_minor_version(struct nfs_client *clp) 1145static int nfs4_init_client_minor_version(struct nfs_client *clp)
1139{ 1146{
1140 clp->cl_call_sync = _nfs4_call_sync;
1141
1142#if defined(CONFIG_NFS_V4_1) 1147#if defined(CONFIG_NFS_V4_1)
1143 if (clp->cl_minorversion) { 1148 if (clp->cl_mvops->minor_version) {
1144 struct nfs4_session *session = NULL; 1149 struct nfs4_session *session = NULL;
1145 /* 1150 /*
1146 * Create the session and mark it expired. 1151 * Create the session and mark it expired.
@@ -1152,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1152 return -ENOMEM; 1157 return -ENOMEM;
1153 1158
1154 clp->cl_session = session; 1159 clp->cl_session = session;
1155 clp->cl_call_sync = _nfs4_call_sync_session; 1160 /*
1161 * The create session reply races with the server back
1162 * channel probe. Mark the client NFS_CS_SESSION_INITING
1163 * so that the client back channel can find the
1164 * nfs_client struct
1165 */
1166 clp->cl_cons_state = NFS_CS_SESSION_INITING;
1156 } 1167 }
1157#endif /* CONFIG_NFS_V4_1 */ 1168#endif /* CONFIG_NFS_V4_1 */
1158 1169
@@ -1280,6 +1291,55 @@ static void nfs4_session_set_rwsize(struct nfs_server *server)
1280#endif /* CONFIG_NFS_V4_1 */ 1291#endif /* CONFIG_NFS_V4_1 */
1281} 1292}
1282 1293
1294static int nfs4_server_common_setup(struct nfs_server *server,
1295 struct nfs_fh *mntfh)
1296{
1297 struct nfs_fattr *fattr;
1298 int error;
1299
1300 BUG_ON(!server->nfs_client);
1301 BUG_ON(!server->nfs_client->rpc_ops);
1302 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1303
1304 fattr = nfs_alloc_fattr();
1305 if (fattr == NULL)
1306 return -ENOMEM;
1307
1308 /* We must ensure the session is initialised first */
1309 error = nfs4_init_session(server);
1310 if (error < 0)
1311 goto out;
1312
1313 /* Probe the root fh to retrieve its FSID and filehandle */
1314 error = nfs4_get_rootfh(server, mntfh);
1315 if (error < 0)
1316 goto out;
1317
1318 dprintk("Server FSID: %llx:%llx\n",
1319 (unsigned long long) server->fsid.major,
1320 (unsigned long long) server->fsid.minor);
1321 dprintk("Mount FH: %d\n", mntfh->size);
1322
1323 nfs4_session_set_rwsize(server);
1324
1325 error = nfs_probe_fsinfo(server, mntfh, fattr);
1326 if (error < 0)
1327 goto out;
1328
1329 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1330 server->namelen = NFS4_MAXNAMLEN;
1331
1332 spin_lock(&nfs_client_lock);
1333 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1334 list_add_tail(&server->master_link, &nfs_volume_list);
1335 spin_unlock(&nfs_client_lock);
1336
1337 server->mount_time = jiffies;
1338out:
1339 nfs_free_fattr(fattr);
1340 return error;
1341}
1342
1283/* 1343/*
1284 * Create a version 4 volume record 1344 * Create a version 4 volume record
1285 */ 1345 */
@@ -1340,7 +1400,6 @@ error:
1340struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, 1400struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1341 struct nfs_fh *mntfh) 1401 struct nfs_fh *mntfh)
1342{ 1402{
1343 struct nfs_fattr fattr;
1344 struct nfs_server *server; 1403 struct nfs_server *server;
1345 int error; 1404 int error;
1346 1405
@@ -1355,39 +1414,10 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1355 if (error < 0) 1414 if (error < 0)
1356 goto error; 1415 goto error;
1357 1416
1358 BUG_ON(!server->nfs_client); 1417 error = nfs4_server_common_setup(server, mntfh);
1359 BUG_ON(!server->nfs_client->rpc_ops);
1360 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1361
1362 error = nfs4_init_session(server);
1363 if (error < 0)
1364 goto error;
1365
1366 /* Probe the root fh to retrieve its FSID */
1367 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
1368 if (error < 0)
1369 goto error;
1370
1371 dprintk("Server FSID: %llx:%llx\n",
1372 (unsigned long long) server->fsid.major,
1373 (unsigned long long) server->fsid.minor);
1374 dprintk("Mount FH: %d\n", mntfh->size);
1375
1376 nfs4_session_set_rwsize(server);
1377
1378 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1379 if (error < 0) 1418 if (error < 0)
1380 goto error; 1419 goto error;
1381 1420
1382 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1383 server->namelen = NFS4_MAXNAMLEN;
1384
1385 spin_lock(&nfs_client_lock);
1386 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1387 list_add_tail(&server->master_link, &nfs_volume_list);
1388 spin_unlock(&nfs_client_lock);
1389
1390 server->mount_time = jiffies;
1391 dprintk("<-- nfs4_create_server() = %p\n", server); 1421 dprintk("<-- nfs4_create_server() = %p\n", server);
1392 return server; 1422 return server;
1393 1423
@@ -1405,7 +1435,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1405{ 1435{
1406 struct nfs_client *parent_client; 1436 struct nfs_client *parent_client;
1407 struct nfs_server *server, *parent_server; 1437 struct nfs_server *server, *parent_server;
1408 struct nfs_fattr fattr;
1409 int error; 1438 int error;
1410 1439
1411 dprintk("--> nfs4_create_referral_server()\n"); 1440 dprintk("--> nfs4_create_referral_server()\n");
@@ -1430,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1430 data->authflavor, 1459 data->authflavor,
1431 parent_server->client->cl_xprt->prot, 1460 parent_server->client->cl_xprt->prot,
1432 parent_server->client->cl_timeout, 1461 parent_server->client->cl_timeout,
1433 parent_client->cl_minorversion); 1462 parent_client->cl_mvops->minor_version);
1434 if (error < 0) 1463 if (error < 0)
1435 goto error; 1464 goto error;
1436 1465
@@ -1438,34 +1467,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1438 if (error < 0) 1467 if (error < 0)
1439 goto error; 1468 goto error;
1440 1469
1441 BUG_ON(!server->nfs_client); 1470 error = nfs4_server_common_setup(server, mntfh);
1442 BUG_ON(!server->nfs_client->rpc_ops);
1443 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1444
1445 /* Probe the root fh to retrieve its FSID and filehandle */
1446 error = nfs4_path_walk(server, mntfh, data->mnt_path);
1447 if (error < 0) 1471 if (error < 0)
1448 goto error; 1472 goto error;
1449 1473
1450 /* probe the filesystem info for this server filesystem */
1451 error = nfs_probe_fsinfo(server, mntfh, &fattr);
1452 if (error < 0)
1453 goto error;
1454
1455 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1456 server->namelen = NFS4_MAXNAMLEN;
1457
1458 dprintk("Referral FSID: %llx:%llx\n",
1459 (unsigned long long) server->fsid.major,
1460 (unsigned long long) server->fsid.minor);
1461
1462 spin_lock(&nfs_client_lock);
1463 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1464 list_add_tail(&server->master_link, &nfs_volume_list);
1465 spin_unlock(&nfs_client_lock);
1466
1467 server->mount_time = jiffies;
1468
1469 dprintk("<-- nfs_create_referral_server() = %p\n", server); 1474 dprintk("<-- nfs_create_referral_server() = %p\n", server);
1470 return server; 1475 return server;
1471 1476
@@ -1485,7 +1490,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1485 struct nfs_fattr *fattr) 1490 struct nfs_fattr *fattr)
1486{ 1491{
1487 struct nfs_server *server; 1492 struct nfs_server *server;
1488 struct nfs_fattr fattr_fsinfo; 1493 struct nfs_fattr *fattr_fsinfo;
1489 int error; 1494 int error;
1490 1495
1491 dprintk("--> nfs_clone_server(,%llx:%llx,)\n", 1496 dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
@@ -1496,6 +1501,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1496 if (!server) 1501 if (!server)
1497 return ERR_PTR(-ENOMEM); 1502 return ERR_PTR(-ENOMEM);
1498 1503
1504 error = -ENOMEM;
1505 fattr_fsinfo = nfs_alloc_fattr();
1506 if (fattr_fsinfo == NULL)
1507 goto out_free_server;
1508
1499 /* Copy data from the source */ 1509 /* Copy data from the source */
1500 server->nfs_client = source->nfs_client; 1510 server->nfs_client = source->nfs_client;
1501 atomic_inc(&server->nfs_client->cl_count); 1511 atomic_inc(&server->nfs_client->cl_count);
@@ -1512,7 +1522,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1512 nfs_init_server_aclclient(server); 1522 nfs_init_server_aclclient(server);
1513 1523
1514 /* probe the filesystem info for this server filesystem */ 1524 /* probe the filesystem info for this server filesystem */
1515 error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); 1525 error = nfs_probe_fsinfo(server, fh, fattr_fsinfo);
1516 if (error < 0) 1526 if (error < 0)
1517 goto out_free_server; 1527 goto out_free_server;
1518 1528
@@ -1534,10 +1544,12 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1534 1544
1535 server->mount_time = jiffies; 1545 server->mount_time = jiffies;
1536 1546
1547 nfs_free_fattr(fattr_fsinfo);
1537 dprintk("<-- nfs_clone_server() = %p\n", server); 1548 dprintk("<-- nfs_clone_server() = %p\n", server);
1538 return server; 1549 return server;
1539 1550
1540out_free_server: 1551out_free_server:
1552 nfs_free_fattr(fattr_fsinfo);
1541 nfs_free_server(server); 1553 nfs_free_server(server);
1542 dprintk("<-- nfs_clone_server() = error %d\n", error); 1554 dprintk("<-- nfs_clone_server() = error %d\n", error);
1543 return ERR_PTR(error); 1555 return ERR_PTR(error);
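
The recurring change in this file replaces on-stack struct nfs_fattr variables with heap allocations; the structure has grown large enough that keeping it off the kernel stack is worthwhile. nfs_alloc_fattr() also initialises the structure, which is why the explicit nfs_fattr_init() call above could be deleted. The conversion pattern, sketched:

	struct nfs_fattr *fattr;
	int error = -ENOMEM;

	fattr = nfs_alloc_fattr();	/* allocates and runs nfs_fattr_init() */
	if (fattr == NULL)
		goto out;
	error = nfs_probe_fsinfo(server, mntfh, fattr);
	/* ... */
out:
	nfs_free_fattr(fattr);	/* kfree-backed, so passing NULL is harmless */

nfs4_server_common_setup() above is the same idea applied once, replacing two near-identical copies of the mount-time probing sequence in nfs4_create_server() and nfs4_create_referral_server().
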
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ea61d26e7871..b9c3c43cea1d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -213,7 +213,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
213 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
214 int status = 0; 214 int status = 0;
215 215
216 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); 216 delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
217 if (delegation == NULL) 217 if (delegation == NULL)
218 return -ENOMEM; 218 return -ENOMEM;
219 memcpy(delegation->stateid.data, res->delegation.data, 219 memcpy(delegation->stateid.data, res->delegation.data,
@@ -268,14 +268,6 @@ out:
268 return status; 268 return status;
269} 269}
270 270
271/* Sync all data to disk upon delegation return */
272static void nfs_msync_inode(struct inode *inode)
273{
274 filemap_fdatawrite(inode->i_mapping);
275 nfs_wb_all(inode);
276 filemap_fdatawait(inode->i_mapping);
277}
278
279/* 271/*
280 * Basic procedure for returning a delegation to the server 272 * Basic procedure for returning a delegation to the server
281 */ 273 */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); 359 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
368 spin_unlock(&clp->cl_lock); 360 spin_unlock(&clp->cl_lock);
369 if (delegation != NULL) { 361 if (delegation != NULL) {
370 nfs_msync_inode(inode); 362 nfs_wb_all(inode);
371 err = __nfs_inode_return_delegation(inode, delegation, 1); 363 err = __nfs_inode_return_delegation(inode, delegation, 1);
372 } 364 }
373 } 365 }
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
471/* 463/*
472 * Asynchronous delegation recall! 464 * Asynchronous delegation recall!
473 */ 465 */
474int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 466int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
475 int (*validate_stateid)(struct nfs_delegation *delegation,
476 const nfs4_stateid *stateid))
477{ 467{
478 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 468 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
479 struct nfs_delegation *delegation; 469 struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
481 rcu_read_lock(); 471 rcu_read_lock();
482 delegation = rcu_dereference(NFS_I(inode)->delegation); 472 delegation = rcu_dereference(NFS_I(inode)->delegation);
483 473
484 if (!validate_stateid(delegation, stateid)) { 474 if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
485 rcu_read_unlock(); 475 rcu_read_unlock();
486 return -ENOENT; 476 return -ENOENT;
487 } 477 }
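
Switching the delegation allocation from GFP_KERNEL to GFP_NOFS is about reclaim recursion: this code can run in paths that memory reclaim itself may enter, and GFP_NOFS tells the allocator it may sleep but must not recurse back into filesystem code to free memory. The idiom is just the flag choice:

	/* reachable from reclaim/writeback: no __GFP_FS work allowed */
	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
	if (delegation == NULL)
		return -ENOMEM;

The companion cleanup replaces the three-step nfs_msync_inode() with a single nfs_wb_all() call, which already writes back and waits on the inode's dirty pages.
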
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b8140122..2026304bda19 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
36int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, 37int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
38 int (*validate_stateid)(struct nfs_delegation *delegation,
39 const nfs4_stateid *stateid));
40void nfs_inode_return_delegation_noreclaim(struct inode *inode); 38void nfs_inode_return_delegation_noreclaim(struct inode *inode);
41 39
42struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a7bb5c694aa3..e257172d438c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
140 140
141 /* Call generic open code in order to cache credentials */ 141 /* Call generic open code in order to cache credentials */
142 res = nfs_open(inode, filp); 142 res = nfs_open(inode, filp);
143 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
144 /* This is a mountpoint, so d_revalidate will never
145 * have been called, so we need to refresh the
146 * inode (for close-open consistency) ourselves.
147 */
148 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
149 }
143 return res; 150 return res;
144} 151}
145 152
@@ -530,9 +537,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
530 nfs_readdir_descriptor_t my_desc, 537 nfs_readdir_descriptor_t my_desc,
531 *desc = &my_desc; 538 *desc = &my_desc;
532 struct nfs_entry my_entry; 539 struct nfs_entry my_entry;
533 struct nfs_fh fh; 540 int res = -ENOMEM;
534 struct nfs_fattr fattr;
535 long res;
536 541
537 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 542 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
538 dentry->d_parent->d_name.name, dentry->d_name.name, 543 dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -554,9 +559,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
554 559
555 my_entry.cookie = my_entry.prev_cookie = 0; 560 my_entry.cookie = my_entry.prev_cookie = 0;
556 my_entry.eof = 0; 561 my_entry.eof = 0;
557 my_entry.fh = &fh; 562 my_entry.fh = nfs_alloc_fhandle();
558 my_entry.fattr = &fattr; 563 my_entry.fattr = nfs_alloc_fattr();
559 nfs_fattr_init(&fattr); 564 if (my_entry.fh == NULL || my_entry.fattr == NULL)
565 goto out_alloc_failed;
566
560 desc->entry = &my_entry; 567 desc->entry = &my_entry;
561 568
562 nfs_block_sillyrename(dentry); 569 nfs_block_sillyrename(dentry);
@@ -598,7 +605,10 @@ out:
598 nfs_unblock_sillyrename(dentry); 605 nfs_unblock_sillyrename(dentry);
599 if (res > 0) 606 if (res > 0)
600 res = 0; 607 res = 0;
601 dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n", 608out_alloc_failed:
609 nfs_free_fattr(my_entry.fattr);
610 nfs_free_fhandle(my_entry.fh);
611 dfprintk(FILE, "NFS: readdir(%s/%s) returns %d\n",
602 dentry->d_parent->d_name.name, dentry->d_name.name, 612 dentry->d_parent->d_name.name, dentry->d_name.name,
603 res); 613 res);
604 return res; 614 return res;
@@ -638,8 +648,10 @@ out:
638 * All directory operations under NFS are synchronous, so fsync() 648 * All directory operations under NFS are synchronous, so fsync()
639 * is a dummy operation. 649 * is a dummy operation.
640 */ 650 */
641static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 651static int nfs_fsync_dir(struct file *filp, int datasync)
642{ 652{
653 struct dentry *dentry = filp->f_path.dentry;
654
643 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 655 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
644 dentry->d_parent->d_name.name, dentry->d_name.name, 656 dentry->d_parent->d_name.name, dentry->d_name.name,
645 datasync); 657 datasync);
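
The ->fsync prototype loses its dentry argument in 2.6.36; implementations that still want the dentry recover it from the file, exactly as nfs_fsync_dir() does above. A skeleton of the new shape, as a sketch:

static int example_fsync(struct file *filp, int datasync)
{
	struct dentry *dentry = filp->f_path.dentry;

	/* datasync still distinguishes data-only from data+metadata
	 * sync; the dentry is only needed here for logging. */
	return 0;
}
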
@@ -776,9 +788,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
776 struct inode *dir; 788 struct inode *dir;
777 struct inode *inode; 789 struct inode *inode;
778 struct dentry *parent; 790 struct dentry *parent;
791 struct nfs_fh *fhandle = NULL;
792 struct nfs_fattr *fattr = NULL;
779 int error; 793 int error;
780 struct nfs_fh fhandle;
781 struct nfs_fattr fattr;
782 794
783 parent = dget_parent(dentry); 795 parent = dget_parent(dentry);
784 dir = parent->d_inode; 796 dir = parent->d_inode;
@@ -811,14 +823,22 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
811 if (NFS_STALE(inode)) 823 if (NFS_STALE(inode))
812 goto out_bad; 824 goto out_bad;
813 825
814 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 826 error = -ENOMEM;
827 fhandle = nfs_alloc_fhandle();
828 fattr = nfs_alloc_fattr();
829 if (fhandle == NULL || fattr == NULL)
830 goto out_error;
831
832 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
815 if (error) 833 if (error)
816 goto out_bad; 834 goto out_bad;
817 if (nfs_compare_fh(NFS_FH(inode), &fhandle)) 835 if (nfs_compare_fh(NFS_FH(inode), fhandle))
818 goto out_bad; 836 goto out_bad;
819 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 837 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
820 goto out_bad; 838 goto out_bad;
821 839
840 nfs_free_fattr(fattr);
841 nfs_free_fhandle(fhandle);
822out_set_verifier: 842out_set_verifier:
823 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 843 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
824 out_valid: 844 out_valid:
@@ -842,11 +862,21 @@ out_zap_parent:
842 shrink_dcache_parent(dentry); 862 shrink_dcache_parent(dentry);
843 } 863 }
844 d_drop(dentry); 864 d_drop(dentry);
865 nfs_free_fattr(fattr);
866 nfs_free_fhandle(fhandle);
845 dput(parent); 867 dput(parent);
846 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", 868 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
847 __func__, dentry->d_parent->d_name.name, 869 __func__, dentry->d_parent->d_name.name,
848 dentry->d_name.name); 870 dentry->d_name.name);
849 return 0; 871 return 0;
872out_error:
873 nfs_free_fattr(fattr);
874 nfs_free_fhandle(fhandle);
875 dput(parent);
876 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
877 __func__, dentry->d_parent->d_name.name,
878 dentry->d_name.name, error);
879 return error;
850} 880}
851 881
852/* 882/*
@@ -911,9 +941,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
911 struct dentry *res; 941 struct dentry *res;
912 struct dentry *parent; 942 struct dentry *parent;
913 struct inode *inode = NULL; 943 struct inode *inode = NULL;
944 struct nfs_fh *fhandle = NULL;
945 struct nfs_fattr *fattr = NULL;
914 int error; 946 int error;
915 struct nfs_fh fhandle;
916 struct nfs_fattr fattr;
917 947
918 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 948 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
919 dentry->d_parent->d_name.name, dentry->d_name.name); 949 dentry->d_parent->d_name.name, dentry->d_name.name);
@@ -923,7 +953,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
923 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) 953 if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
924 goto out; 954 goto out;
925 955
926 res = ERR_PTR(-ENOMEM);
927 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 956 dentry->d_op = NFS_PROTO(dir)->dentry_ops;
928 957
929 /* 958 /*
@@ -936,17 +965,23 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
936 goto out; 965 goto out;
937 } 966 }
938 967
968 res = ERR_PTR(-ENOMEM);
969 fhandle = nfs_alloc_fhandle();
970 fattr = nfs_alloc_fattr();
971 if (fhandle == NULL || fattr == NULL)
972 goto out;
973
939 parent = dentry->d_parent; 974 parent = dentry->d_parent;
940 /* Protect against concurrent sillydeletes */ 975 /* Protect against concurrent sillydeletes */
941 nfs_block_sillyrename(parent); 976 nfs_block_sillyrename(parent);
942 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 977 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
943 if (error == -ENOENT) 978 if (error == -ENOENT)
944 goto no_entry; 979 goto no_entry;
945 if (error < 0) { 980 if (error < 0) {
946 res = ERR_PTR(error); 981 res = ERR_PTR(error);
947 goto out_unblock_sillyrename; 982 goto out_unblock_sillyrename;
948 } 983 }
949 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 984 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
950 res = (struct dentry *)inode; 985 res = (struct dentry *)inode;
951 if (IS_ERR(res)) 986 if (IS_ERR(res))
952 goto out_unblock_sillyrename; 987 goto out_unblock_sillyrename;
@@ -962,6 +997,8 @@ no_entry:
962out_unblock_sillyrename: 997out_unblock_sillyrename:
963 nfs_unblock_sillyrename(parent); 998 nfs_unblock_sillyrename(parent);
964out: 999out:
1000 nfs_free_fattr(fattr);
1001 nfs_free_fhandle(fhandle);
965 return res; 1002 return res;
966} 1003}
967 1004
@@ -1073,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1073 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1110 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1074 goto no_open_dput; 1111 goto no_open_dput;
1075 /* We can't create new files, or truncate existing ones here */ 1112 /* We can't create new files, or truncate existing ones here */
1076 openflags &= ~(O_CREAT|O_TRUNC); 1113 openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
1077 1114
1078 /* 1115 /*
1079 * Note: we're not holding inode->i_mutex and so may be racing with 1116 * Note: we're not holding inode->i_mutex and so may be racing with
@@ -1622,16 +1659,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1622 } 1659 }
1623 } 1660 }
1624 1661
1625 /*
1626 * ... prune child dentries and writebacks if needed.
1627 */
1628 if (atomic_read(&old_dentry->d_count) > 1) {
1629 if (S_ISREG(old_inode->i_mode))
1630 nfs_wb_all(old_inode);
1631 shrink_dcache_parent(old_dentry);
1632 }
1633 nfs_inode_return_delegation(old_inode); 1662 nfs_inode_return_delegation(old_inode);
1634
1635 if (new_inode != NULL) 1663 if (new_inode != NULL)
1636 nfs_inode_return_delegation(new_inode); 1664 nfs_inode_return_delegation(new_inode);
1637 1665
@@ -1669,28 +1697,33 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
1669 smp_mb__after_atomic_dec(); 1697 smp_mb__after_atomic_dec();
1670} 1698}
1671 1699
1672int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) 1700static void nfs_access_free_list(struct list_head *head)
1701{
1702 struct nfs_access_entry *cache;
1703
1704 while (!list_empty(head)) {
1705 cache = list_entry(head->next, struct nfs_access_entry, lru);
1706 list_del(&cache->lru);
1707 nfs_access_free_entry(cache);
1708 }
1709}
1710
1711int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
1673{ 1712{
1674 LIST_HEAD(head); 1713 LIST_HEAD(head);
1675 struct nfs_inode *nfsi; 1714 struct nfs_inode *nfsi;
1676 struct nfs_access_entry *cache; 1715 struct nfs_access_entry *cache;
1677 1716
1678restart: 1717 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
1718 return (nr_to_scan == 0) ? 0 : -1;
1719
1679 spin_lock(&nfs_access_lru_lock); 1720 spin_lock(&nfs_access_lru_lock);
1680 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { 1721 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1681 struct rw_semaphore *s_umount;
1682 struct inode *inode; 1722 struct inode *inode;
1683 1723
1684 if (nr_to_scan-- == 0) 1724 if (nr_to_scan-- == 0)
1685 break; 1725 break;
1686 s_umount = &nfsi->vfs_inode.i_sb->s_umount; 1726 inode = &nfsi->vfs_inode;
1687 if (!down_read_trylock(s_umount))
1688 continue;
1689 inode = igrab(&nfsi->vfs_inode);
1690 if (inode == NULL) {
1691 up_read(s_umount);
1692 continue;
1693 }
1694 spin_lock(&inode->i_lock); 1727 spin_lock(&inode->i_lock);
1695 if (list_empty(&nfsi->access_cache_entry_lru)) 1728 if (list_empty(&nfsi->access_cache_entry_lru))
1696 goto remove_lru_entry; 1729 goto remove_lru_entry;
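
Two things change in the shrinker: the callback gains a struct shrinker * argument, and the rewritten function answers constrained callers up front, returning -1 ("cannot shrink now") when the caller's gfp_mask does not allow full GFP_KERNEL work, unless this was only a count query (nr_to_scan == 0), for which it reports 0. It also drops the igrab/s_umount dance in favour of moving entries onto a private list that is freed after the locks are released. A hedged registration sketch; the example names and count helper are hypothetical:

static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
		return (nr_to_scan == 0) ? 0 : -1;
	/* ... detach up to nr_to_scan entries under the lock, free
	 * them after unlocking ... */
	return (example_object_count() / 100) * sysctl_vfs_cache_pressure;
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* register_shrinker(&example_shrinker) at init time,
 * unregister_shrinker(&example_shrinker) at teardown. */
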
@@ -1704,61 +1737,48 @@ restart:
1704 else { 1737 else {
1705remove_lru_entry: 1738remove_lru_entry:
1706 list_del_init(&nfsi->access_cache_inode_lru); 1739 list_del_init(&nfsi->access_cache_inode_lru);
1740 smp_mb__before_clear_bit();
1707 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit();
1708 } 1743 }
1709 spin_unlock(&inode->i_lock); 1744 spin_unlock(&inode->i_lock);
1710 spin_unlock(&nfs_access_lru_lock);
1711 iput(inode);
1712 up_read(s_umount);
1713 goto restart;
1714 } 1745 }
1715 spin_unlock(&nfs_access_lru_lock); 1746 spin_unlock(&nfs_access_lru_lock);
1716 while (!list_empty(&head)) { 1747 nfs_access_free_list(&head);
1717 cache = list_entry(head.next, struct nfs_access_entry, lru);
1718 list_del(&cache->lru);
1719 nfs_access_free_entry(cache);
1720 }
1721 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; 1748 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1722} 1749}
1723 1750
1724static void __nfs_access_zap_cache(struct inode *inode) 1751static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
1725{ 1752{
1726 struct nfs_inode *nfsi = NFS_I(inode);
1727 struct rb_root *root_node = &nfsi->access_cache; 1753 struct rb_root *root_node = &nfsi->access_cache;
1728 struct rb_node *n, *dispose = NULL; 1754 struct rb_node *n;
1729 struct nfs_access_entry *entry; 1755 struct nfs_access_entry *entry;
1730 1756
1731 /* Unhook entries from the cache */ 1757 /* Unhook entries from the cache */
1732 while ((n = rb_first(root_node)) != NULL) { 1758 while ((n = rb_first(root_node)) != NULL) {
1733 entry = rb_entry(n, struct nfs_access_entry, rb_node); 1759 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1734 rb_erase(n, root_node); 1760 rb_erase(n, root_node);
1735 list_del(&entry->lru); 1761 list_move(&entry->lru, head);
1736 n->rb_left = dispose;
1737 dispose = n;
1738 } 1762 }
1739 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1763 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1740 spin_unlock(&inode->i_lock);
1741
1742 /* Now kill them all! */
1743 while (dispose != NULL) {
1744 n = dispose;
1745 dispose = n->rb_left;
1746 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1747 }
1748} 1764}
1749 1765
1750void nfs_access_zap_cache(struct inode *inode) 1766void nfs_access_zap_cache(struct inode *inode)
1751{ 1767{
1768 LIST_HEAD(head);
1769
1770 if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0)
1771 return;
1752 /* Remove from global LRU init */ 1772 /* Remove from global LRU init */
1753 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1773 spin_lock(&nfs_access_lru_lock);
1754 spin_lock(&nfs_access_lru_lock); 1774 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1755 list_del_init(&NFS_I(inode)->access_cache_inode_lru); 1775 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1756 spin_unlock(&nfs_access_lru_lock);
1757 }
1758 1776
1759 spin_lock(&inode->i_lock); 1777 spin_lock(&inode->i_lock);
1760 /* This will release the spinlock */ 1778 __nfs_access_zap_cache(NFS_I(inode), &head);
1761 __nfs_access_zap_cache(inode); 1779 spin_unlock(&inode->i_lock);
1780 spin_unlock(&nfs_access_lru_lock);
1781 nfs_access_free_list(&head);
1762} 1782}
1763 1783
1764static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) 1784static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
@@ -1809,8 +1829,8 @@ out_stale:
1809 nfs_access_free_entry(cache); 1829 nfs_access_free_entry(cache);
1810 return -ENOENT; 1830 return -ENOENT;
1811out_zap: 1831out_zap:
1812 /* This will release the spinlock */ 1832 spin_unlock(&inode->i_lock);
1813 __nfs_access_zap_cache(inode); 1833 nfs_access_zap_cache(inode);
1814 return -ENOENT; 1834 return -ENOENT;
1815} 1835}
1816 1836
@@ -1865,9 +1885,11 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s
1865 smp_mb__after_atomic_inc(); 1885 smp_mb__after_atomic_inc();
1866 1886
1867 /* Add inode to global LRU list */ 1887 /* Add inode to global LRU list */
1868 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { 1888 if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
1869 spin_lock(&nfs_access_lru_lock); 1889 spin_lock(&nfs_access_lru_lock);
1870 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); 1890 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags))
1891 list_add_tail(&NFS_I(inode)->access_cache_inode_lru,
1892 &nfs_access_lru_list);
1871 spin_unlock(&nfs_access_lru_lock); 1893 spin_unlock(&nfs_access_lru_lock);
1872 } 1894 }
1873} 1895}
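
A note on the hunk above: the unlocked test_bit() in nfs_access_add_cache() is only a fast path; the authoritative test_and_set_bit() now happens under nfs_access_lru_lock, so the flag and the LRU insertion can no longer be observed out of step. A minimal sketch of the check-lock-recheck idiom (names hypothetical, not the NFS code itself):

#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lru_lock);
static LIST_HEAD(my_lru_list);

static void my_lru_insert(unsigned long *flags, struct list_head *entry)
{
        /* cheap unlocked probe: most callers see the bit already set */
        if (test_bit(0, flags))
                return;
        spin_lock(&my_lru_lock);
        /* recheck under the lock; exactly one caller wins the insert */
        if (!test_and_set_bit(0, flags))
                list_add_tail(entry, &my_lru_list);
        spin_unlock(&my_lru_lock);
}
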
@@ -1929,7 +1951,7 @@ int nfs_permission(struct inode *inode, int mask)
1929 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 1951 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
1930 goto out; 1952 goto out;
1931 /* Is this sys_access() ? */ 1953 /* Is this sys_access() ? */
1932 if (mask & MAY_ACCESS) 1954 if (mask & (MAY_ACCESS | MAY_CHDIR))
1933 goto force_lookup; 1955 goto force_lookup;
1934 1956
1935 switch (inode->i_mode & S_IFMT) { 1957 switch (inode->i_mode & S_IFMT) {
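
The nfs_access_cache_shrinker() hunks above track the 2.6.36 shrinker API: the callback gains a struct shrinker * argument, refuses to run in atomic reclaim contexts, and reports its cache size scaled by sysctl_vfs_cache_pressure. A sketch of the shape such a callback takes (hypothetical names, assuming the 2.6.36 struct shrinker):

#include <linux/dcache.h>       /* sysctl_vfs_cache_pressure */
#include <linux/mm.h>           /* struct shrinker, register_shrinker() */

static atomic_long_t my_nr_entries;     /* objects currently cached */

static int my_cache_shrink(struct shrinker *shrink, int nr_to_scan,
                           gfp_t gfp_mask)
{
        /* only reclaim when the caller tolerates GFP_KERNEL-style I/O */
        if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
                return (nr_to_scan == 0) ? 0 : -1;
        /* ... drop up to nr_to_scan cache entries here ... */
        return (atomic_long_read(&my_nr_entries) / 100) *
                sysctl_vfs_cache_pressure;
}

static struct shrinker my_shrinker = {
        .shrink = my_cache_shrink,
        .seeks  = DEFAULT_SEEKS,
};
/* paired with register_shrinker(&my_shrinker) / unregister_shrinker() */
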
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d6050..064a80961677 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
69 69
70 /* I/O parameters */ 70 /* I/O parameters */
71 struct nfs_open_context *ctx; /* file open context info */ 71 struct nfs_open_context *ctx; /* file open context info */
72 struct nfs_lock_context *l_ctx; /* Lock context info */
72 struct kiocb * iocb; /* controlling i/o request */ 73 struct kiocb * iocb; /* controlling i/o request */
73 struct inode * inode; /* target file of i/o */ 74 struct inode * inode; /* target file of i/o */
74 75
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
160 INIT_LIST_HEAD(&dreq->rewrite_list); 161 INIT_LIST_HEAD(&dreq->rewrite_list);
161 dreq->iocb = NULL; 162 dreq->iocb = NULL;
162 dreq->ctx = NULL; 163 dreq->ctx = NULL;
164 dreq->l_ctx = NULL;
163 spin_lock_init(&dreq->lock); 165 spin_lock_init(&dreq->lock);
164 atomic_set(&dreq->io_count, 0); 166 atomic_set(&dreq->io_count, 0);
165 dreq->count = 0; 167 dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
173{ 175{
174 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); 176 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
175 177
178 if (dreq->l_ctx != NULL)
179 nfs_put_lock_context(dreq->l_ctx);
176 if (dreq->ctx != NULL) 180 if (dreq->ctx != NULL)
177 put_nfs_open_context(dreq->ctx); 181 put_nfs_open_context(dreq->ctx);
178 kmem_cache_free(nfs_direct_cachep, dreq); 182 kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
336 data->cred = msg.rpc_cred; 340 data->cred = msg.rpc_cred;
337 data->args.fh = NFS_FH(inode); 341 data->args.fh = NFS_FH(inode);
338 data->args.context = ctx; 342 data->args.context = ctx;
343 data->args.lock_context = dreq->l_ctx;
339 data->args.offset = pos; 344 data->args.offset = pos;
340 data->args.pgbase = pgbase; 345 data->args.pgbase = pgbase;
341 data->args.pages = data->pagevec; 346 data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
416static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, 421static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
417 unsigned long nr_segs, loff_t pos) 422 unsigned long nr_segs, loff_t pos)
418{ 423{
419 ssize_t result = 0; 424 ssize_t result = -ENOMEM;
420 struct inode *inode = iocb->ki_filp->f_mapping->host; 425 struct inode *inode = iocb->ki_filp->f_mapping->host;
421 struct nfs_direct_req *dreq; 426 struct nfs_direct_req *dreq;
422 427
423 dreq = nfs_direct_req_alloc(); 428 dreq = nfs_direct_req_alloc();
424 if (!dreq) 429 if (dreq == NULL)
425 return -ENOMEM; 430 goto out;
426 431
427 dreq->inode = inode; 432 dreq->inode = inode;
428 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 433 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
434 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
435 if (dreq->l_ctx == NULL)
436 goto out_release;
429 if (!is_sync_kiocb(iocb)) 437 if (!is_sync_kiocb(iocb))
430 dreq->iocb = iocb; 438 dreq->iocb = iocb;
431 439
432 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); 440 result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
433 if (!result) 441 if (!result)
434 result = nfs_direct_wait(dreq); 442 result = nfs_direct_wait(dreq);
443out_release:
435 nfs_direct_req_release(dreq); 444 nfs_direct_req_release(dreq);
436 445out:
437 return result; 446 return result;
438} 447}
439 448
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
574 data->args.offset = 0; 583 data->args.offset = 0;
575 data->args.count = 0; 584 data->args.count = 0;
576 data->args.context = dreq->ctx; 585 data->args.context = dreq->ctx;
586 data->args.lock_context = dreq->l_ctx;
577 data->res.count = 0; 587 data->res.count = 0;
578 data->res.fattr = &data->fattr; 588 data->res.fattr = &data->fattr;
579 data->res.verf = &data->verf; 589 data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
761 data->cred = msg.rpc_cred; 771 data->cred = msg.rpc_cred;
762 data->args.fh = NFS_FH(inode); 772 data->args.fh = NFS_FH(inode);
763 data->args.context = ctx; 773 data->args.context = ctx;
774 data->args.lock_context = dreq->l_ctx;
764 data->args.offset = pos; 775 data->args.offset = pos;
765 data->args.pgbase = pgbase; 776 data->args.pgbase = pgbase;
766 data->args.pages = data->pagevec; 777 data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
845 unsigned long nr_segs, loff_t pos, 856 unsigned long nr_segs, loff_t pos,
846 size_t count) 857 size_t count)
847{ 858{
848 ssize_t result = 0; 859 ssize_t result = -ENOMEM;
849 struct inode *inode = iocb->ki_filp->f_mapping->host; 860 struct inode *inode = iocb->ki_filp->f_mapping->host;
850 struct nfs_direct_req *dreq; 861 struct nfs_direct_req *dreq;
851 size_t wsize = NFS_SERVER(inode)->wsize; 862 size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
853 864
854 dreq = nfs_direct_req_alloc(); 865 dreq = nfs_direct_req_alloc();
855 if (!dreq) 866 if (!dreq)
856 return -ENOMEM; 867 goto out;
857 nfs_alloc_commit_data(dreq); 868 nfs_alloc_commit_data(dreq);
858 869
859 if (dreq->commit_data == NULL || count < wsize) 870 if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
861 872
862 dreq->inode = inode; 873 dreq->inode = inode;
863 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 874 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
875 dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
876 if (dreq->l_ctx == NULL)
877 goto out_release;
864 if (!is_sync_kiocb(iocb)) 878 if (!is_sync_kiocb(iocb))
865 dreq->iocb = iocb; 879 dreq->iocb = iocb;
866 880
867 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); 881 result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
868 if (!result) 882 if (!result)
869 result = nfs_direct_wait(dreq); 883 result = nfs_direct_wait(dreq);
884out_release:
870 nfs_direct_req_release(dreq); 885 nfs_direct_req_release(dreq);
871 886out:
872 return result; 887 return result;
873} 888}
874 889
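
Condensed from the direct.c hunks above, the lifecycle of the new lock context: each nfs_direct_req pins one context for its whole lifetime, stamps it on every READ/WRITE/COMMIT argument block, and drops it from the kref release path. Sketch (fragments taken from the hunks, not a complete function):

dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
if (dreq->l_ctx == NULL)                /* allocation failed */
        goto out_release;               /* ends in -ENOMEM */

data->args.lock_context = dreq->l_ctx;  /* alongside data->args.context */

/* in nfs_direct_req_free(): */
if (dreq->l_ctx != NULL)
        nfs_put_lock_context(dreq->l_ctx);
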
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d0024..dba50a5625db 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
6 * Resolves DNS hostnames into valid ip addresses 6 * Resolves DNS hostnames into valid ip addresses
7 */ 7 */
8 8
9#ifdef CONFIG_NFS_USE_KERNEL_DNS
10
11#include <linux/sunrpc/clnt.h>
12#include <linux/dns_resolver.h>
13
14ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
15 struct sockaddr *sa, size_t salen)
16{
17 ssize_t ret;
18 char *ip_addr = NULL;
19 int ip_len;
20
21 ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
22 if (ip_len > 0)
23 ret = rpc_pton(ip_addr, ip_len, sa, salen);
24 else
25 ret = -ESRCH;
26 kfree(ip_addr);
27 return ret;
28}
29
30#else
31
9#include <linux/hash.h> 32#include <linux/hash.h>
10#include <linux/string.h> 33#include <linux/string.h>
11#include <linux/kmod.h> 34#include <linux/kmod.h>
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
346 nfs_cache_unregister(&nfs_dns_resolve); 369 nfs_cache_unregister(&nfs_dns_resolve);
347} 370}
348 371
372#endif
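
With CONFIG_NFS_USE_KERNEL_DNS the legacy upcall cache below this #ifdef is compiled out entirely and resolution goes through the generic dns_query()/rpc_pton() pair shown above. A hypothetical caller is backend-agnostic either way (fragment; the API takes a mutable char *):

char hostname[] = "server.example.com";
struct sockaddr_storage ss;
ssize_t salen;

salen = nfs_dns_resolve_name(hostname, strlen(hostname),
                             (struct sockaddr *)&ss, sizeof(ss));
if (salen < 0)
        return salen;   /* e.g. -ESRCH from the kernel-DNS path above */
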
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf7..199bb5543a91 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
6 6
7#define NFS_DNS_HOSTNAME_MAXLEN (128) 7#define NFS_DNS_HOSTNAME_MAXLEN (128)
8 8
9
10#ifdef CONFIG_NFS_USE_KERNEL_DNS
11static inline int nfs_dns_resolver_init(void)
12{
13 return 0;
14}
15
16static inline void nfs_dns_resolver_destroy(void)
17{}
18#else
9extern int nfs_dns_resolver_init(void); 19extern int nfs_dns_resolver_init(void);
10extern void nfs_dns_resolver_destroy(void); 20extern void nfs_dns_resolver_destroy(void);
21#endif
22
11extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, 23extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
12 struct sockaddr *sa, size_t salen); 24 struct sockaddr *sa, size_t salen);
13 25
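
The header change above is the usual config-stub idiom: under CONFIG_NFS_USE_KERNEL_DNS the init/destroy hooks collapse to inline no-ops, so call sites stay #ifdef-free. A hypothetical module-init caller:

static int __init my_nfs_init(void)
{
        int ret = nfs_dns_resolver_init();      /* inline no-op returning 0
                                                   on kernel-DNS builds */
        if (ret)
                return ret;
        /* ... */
        return 0;
}
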
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8d965bddb87e..05bf3c0dc751 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/gfp.h> 29#include <linux/gfp.h>
30#include <linux/swap.h>
30 31
31#include <asm/uaccess.h> 32#include <asm/uaccess.h>
32#include <asm/system.h> 33#include <asm/system.h>
@@ -53,7 +54,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 54static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 55 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 56static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 57static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 58static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 60static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -161,14 +162,17 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
161 struct nfs_server *server = NFS_SERVER(inode); 162 struct nfs_server *server = NFS_SERVER(inode);
162 struct nfs_inode *nfsi = NFS_I(inode); 163 struct nfs_inode *nfsi = NFS_I(inode);
163 164
164 if (server->flags & NFS_MOUNT_NOAC) 165 if (nfs_have_delegated_attributes(inode))
165 goto force_reval; 166 goto out_noreval;
167
166 if (filp->f_flags & O_DIRECT) 168 if (filp->f_flags & O_DIRECT)
167 goto force_reval; 169 goto force_reval;
168 if (nfsi->npages != 0) 170 if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
169 return 0; 171 goto force_reval;
170 if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) 172 if (nfs_attribute_timeout(inode))
171 return 0; 173 goto force_reval;
174out_noreval:
175 return 0;
172force_reval: 176force_reval:
173 return __nfs_revalidate_inode(server, inode); 177 return __nfs_revalidate_inode(server, inode);
174} 178}
@@ -199,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
199} 203}
200 204
201/* 205/*
202 * Helper for nfs_file_flush() and nfs_file_fsync()
203 *
204 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
205 * disk, but it retrieves and clears ctx->error after synching, despite
206 * the two being set at the same time in nfs_context_set_write_error().
207 * This is because the former is used to notify the _next_ call to
208 * nfs_file_write() that a write error occurred, and hence cause it to
209 * fall back to doing a synchronous write.
210 */
211static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
212{
213 int have_error, status;
214 int ret = 0;
215
216 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
217 status = nfs_wb_all(inode);
218 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
219 if (have_error)
220 ret = xchg(&ctx->error, 0);
221 if (!ret)
222 ret = status;
223 return ret;
224}
225
226/*
227 * Flush all dirty pages, and check for write errors. 206 * Flush all dirty pages, and check for write errors.
228 */ 207 */
229static int 208static int
230nfs_file_flush(struct file *file, fl_owner_t id) 209nfs_file_flush(struct file *file, fl_owner_t id)
231{ 210{
232 struct nfs_open_context *ctx = nfs_file_open_context(file);
233 struct dentry *dentry = file->f_path.dentry; 211 struct dentry *dentry = file->f_path.dentry;
234 struct inode *inode = dentry->d_inode; 212 struct inode *inode = dentry->d_inode;
235 213
@@ -242,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
242 return 0; 220 return 0;
243 221
244 /* Flush writes to the server and return any errors */ 222 /* Flush writes to the server and return any errors */
245 return nfs_do_fsync(ctx, inode); 223 return vfs_fsync(file, 0);
246} 224}
247 225
248static ssize_t 226static ssize_t
@@ -317,19 +295,37 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
317 * Flush any dirty pages for this process, and check for write errors. 295 * Flush any dirty pages for this process, and check for write errors.
318 * The return status from this call provides a reliable indication of 296 * The return status from this call provides a reliable indication of
319 * whether any write errors occurred for this process. 297 * whether any write errors occurred for this process.
298 *
299 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
300 * disk, but it retrieves and clears ctx->error after synching, despite
301 * the two being set at the same time in nfs_context_set_write_error().
302 * This is because the former is used to notify the _next_ call to
303 * nfs_file_write() that a write error occurred, and hence cause it to
304 * fall back to doing a synchronous write.
320 */ 305 */
321static int 306static int
322nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 307nfs_file_fsync(struct file *file, int datasync)
323{ 308{
309 struct dentry *dentry = file->f_path.dentry;
324 struct nfs_open_context *ctx = nfs_file_open_context(file); 310 struct nfs_open_context *ctx = nfs_file_open_context(file);
325 struct inode *inode = dentry->d_inode; 311 struct inode *inode = dentry->d_inode;
312 int have_error, status;
313 int ret = 0;
314
326 315
327 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 316 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
328 dentry->d_parent->d_name.name, dentry->d_name.name, 317 dentry->d_parent->d_name.name, dentry->d_name.name,
329 datasync); 318 datasync);
330 319
331 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 320 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
332 return nfs_do_fsync(ctx, inode); 321 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
322 status = nfs_commit_inode(inode, FLUSH_SYNC);
323 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
324 if (have_error)
325 ret = xchg(&ctx->error, 0);
326 if (!ret && status < 0)
327 ret = status;
328 return ret;
333} 329}
334 330
335/* 331/*
@@ -489,11 +485,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
489 */ 485 */
490static int nfs_release_page(struct page *page, gfp_t gfp) 486static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 487{
488 struct address_space *mapping = page->mapping;
489
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 490 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 491
494 /* Only do I/O if gfp is a superset of GFP_KERNEL */ 492 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL) 493 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) {
496 nfs_wb_page(page->mapping->host, page); 494 int how = FLUSH_SYNC;
495
496 /* Don't let kswapd deadlock waiting for OOM RPC calls */
497 if (current_is_kswapd())
498 how = 0;
499 nfs_commit_inode(mapping->host, how);
500 }
497 /* If PagePrivate() is set, then the page is not freeable */ 501 /* If PagePrivate() is set, then the page is not freeable */
498 if (PagePrivate(page)) 502 if (PagePrivate(page))
499 return 0; 503 return 0;
@@ -635,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
635 639
636 /* Return error values for O_DSYNC and IS_SYNC() */ 640 /* Return error values for O_DSYNC and IS_SYNC() */
637 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 641 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
638 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 642 int err = vfs_fsync(iocb->ki_filp, 0);
639 if (err < 0) 643 if (err < 0)
640 result = err; 644 result = err;
641 } 645 }
@@ -671,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
671 written = ret; 675 written = ret;
672 676
673 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 677 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
674 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 678 int err = vfs_fsync(filp, 0);
675 if (err < 0) 679 if (err < 0)
676 ret = err; 680 ret = err;
677 } 681 }
@@ -719,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
719 default: 723 default:
720 BUG(); 724 BUG();
721 } 725 }
722 if (res < 0)
723 dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
724 " - error %d!\n",
725 __func__, res);
726 return res; 726 return res;
727} 727}
728 728
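
Two threads run through the file.c hunks above: the 2.6.36 ->fsync() prototype drops its dentry argument, and the nfs_do_fsync() helper disappears, with the flush/write paths calling vfs_fsync() and the error-latching logic moving into nfs_file_fsync() itself. The resulting call shape, reduced to a sketch:

static int nfs_file_fsync(struct file *file, int datasync);    /* new prototype */

/* nfs_file_flush(), nfs_file_write() and nfs_file_splice_write() now do: */
err = vfs_fsync(file, 0);       /* reaches nfs_file_fsync() via f_op->fsync */
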
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a6b16ed93229..ce153a6b3aec 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -467,7 +467,8 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
467 struct list_head *pages, 467 struct list_head *pages,
468 unsigned *nr_pages) 468 unsigned *nr_pages)
469{ 469{
470 int ret, npages = *nr_pages; 470 unsigned npages = *nr_pages;
471 int ret;
471 472
472 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", 473 dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
473 NFS_I(inode)->fscache, npages, inode); 474 NFS_I(inode)->fscache, npages, inode);
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b35d2a616066..a70e446e1605 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -78,159 +78,94 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
78{ 78{
79 struct nfs_server *server = NFS_SB(sb); 79 struct nfs_server *server = NFS_SB(sb);
80 struct nfs_fsinfo fsinfo; 80 struct nfs_fsinfo fsinfo;
81 struct nfs_fattr fattr; 81 struct dentry *ret;
82 struct dentry *mntroot;
83 struct inode *inode; 82 struct inode *inode;
84 int error; 83 int error;
85 84
86 /* get the actual root for this mount */ 85 /* get the actual root for this mount */
87 fsinfo.fattr = &fattr; 86 fsinfo.fattr = nfs_alloc_fattr();
87 if (fsinfo.fattr == NULL)
88 return ERR_PTR(-ENOMEM);
88 89
89 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
90 if (error < 0) { 91 if (error < 0) {
91 dprintk("nfs_get_root: getattr error = %d\n", -error); 92 dprintk("nfs_get_root: getattr error = %d\n", -error);
92 return ERR_PTR(error); 93 ret = ERR_PTR(error);
94 goto out;
93 } 95 }
94 96
95 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 97 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
96 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
97 dprintk("nfs_get_root: get root inode failed\n"); 99 dprintk("nfs_get_root: get root inode failed\n");
98 return ERR_CAST(inode); 100 ret = ERR_CAST(inode);
101 goto out;
99 } 102 }
100 103
101 error = nfs_superblock_set_dummy_root(sb, inode); 104 error = nfs_superblock_set_dummy_root(sb, inode);
102 if (error != 0) 105 if (error != 0) {
103 return ERR_PTR(error); 106 ret = ERR_PTR(error);
107 goto out;
108 }
104 109
105 /* root dentries normally start off anonymous and get spliced in later 110 /* root dentries normally start off anonymous and get spliced in later
106 * if the dentry tree reaches them; however if the dentry already 111 * if the dentry tree reaches them; however if the dentry already
107 * exists, we'll pick it up at this point and use it as the root 112 * exists, we'll pick it up at this point and use it as the root
108 */ 113 */
109 mntroot = d_obtain_alias(inode); 114 ret = d_obtain_alias(inode);
110 if (IS_ERR(mntroot)) { 115 if (IS_ERR(ret)) {
111 dprintk("nfs_get_root: get root dentry failed\n"); 116 dprintk("nfs_get_root: get root dentry failed\n");
112 return mntroot; 117 goto out;
113 } 118 }
114 119
115 security_d_instantiate(mntroot, inode); 120 security_d_instantiate(ret, inode);
116
117 if (!mntroot->d_op)
118 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
119 121
120 return mntroot; 122 if (ret->d_op == NULL)
123 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
124out:
125 nfs_free_fattr(fsinfo.fattr);
126 return ret;
121} 127}
122 128
123#ifdef CONFIG_NFS_V4 129#ifdef CONFIG_NFS_V4
124 130
125/* 131int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh)
126 * Do a simple pathwalk from the root FH of the server to the nominated target
127 * of the mountpoint
128 * - give error on symlinks
129 * - give error on ".." occurring in the path
130 * - follow traversals
131 */
132int nfs4_path_walk(struct nfs_server *server,
133 struct nfs_fh *mntfh,
134 const char *path)
135{ 132{
136 struct nfs_fsinfo fsinfo; 133 struct nfs_fsinfo fsinfo;
137 struct nfs_fattr fattr; 134 int ret = -ENOMEM;
138 struct nfs_fh lastfh;
139 struct qstr name;
140 int ret;
141 135
142 dprintk("--> nfs4_path_walk(,,%s)\n", path); 136 dprintk("--> nfs4_get_rootfh()\n");
143 137
144 fsinfo.fattr = &fattr; 138 fsinfo.fattr = nfs_alloc_fattr();
145 nfs_fattr_init(&fattr); 139 if (fsinfo.fattr == NULL)
146 140 goto out;
147 /* Eat leading slashes */
148 while (*path == '/')
149 path++;
150 141
151 /* Start by getting the root filehandle from the server */ 142 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 143 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) { 144 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret); 145 dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
155 return ret; 146 goto out;
156 } 147 }
157 148
158 if (!S_ISDIR(fattr.mode)) { 149 if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
159 printk(KERN_ERR "nfs4_get_root:" 150 || !S_ISDIR(fsinfo.fattr->mode)) {
151 printk(KERN_ERR "nfs4_get_rootfh:"
160 " getroot encountered non-directory\n"); 152 " getroot encountered non-directory\n");
161 return -ENOTDIR; 153 ret = -ENOTDIR;
154 goto out;
162 } 155 }
163 156
164 /* FIXME: It is quite valid for the server to return a referral here */ 157 if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { 158 printk(KERN_ERR "nfs4_get_rootfh:"
166 printk(KERN_ERR "nfs4_get_root:"
167 " getroot obtained referral\n"); 159 " getroot obtained referral\n");
168 return -EREMOTE; 160 ret = -EREMOTE;
169 } 161 goto out;
170
171next_component:
172 dprintk("Next: %s\n", path);
173
174 /* extract the next bit of the path */
175 if (!*path)
176 goto path_walk_complete;
177
178 name.name = path;
179 while (*path && *path != '/')
180 path++;
181 name.len = path - (const char *) name.name;
182
183 if (name.len > NFS4_MAXNAMLEN)
184 return -ENAMETOOLONG;
185
186eat_dot_dir:
187 while (*path == '/')
188 path++;
189
190 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
191 path += 2;
192 goto eat_dot_dir;
193 }
194
195 /* FIXME: Why shouldn't the user be able to use ".." in the path? */
196 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
197 ) {
198 printk(KERN_ERR "nfs4_get_root:"
199 " Mount path contains reference to \"..\"\n");
200 return -EINVAL;
201 } 162 }
202 163
203 /* lookup the next FH in the sequence */ 164 memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
204 memcpy(&lastfh, mntfh, sizeof(lastfh)); 165out:
205 166 nfs_free_fattr(fsinfo.fattr);
206 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); 167 dprintk("<-- nfs4_get_rootfh() = %d\n", ret);
207 168 return ret;
208 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
209 mntfh, &fattr);
210 if (ret < 0) {
211 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
212 return ret;
213 }
214
215 if (!S_ISDIR(fattr.mode)) {
216 printk(KERN_ERR "nfs4_get_root:"
217 " lookupfh encountered non-directory\n");
218 return -ENOTDIR;
219 }
220
221 /* FIXME: Referrals are quite valid here too */
222 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
223 printk(KERN_ERR "nfs4_get_root:"
224 " lookupfh obtained referral\n");
225 return -EREMOTE;
226 }
227
228 goto next_component;
229
230path_walk_complete:
231 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
232 dprintk("<-- nfs4_path_walk() = 0\n");
233 return 0;
234} 169}
235 170
236/* 171/*
@@ -239,8 +174,8 @@ path_walk_complete:
239struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 174struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
240{ 175{
241 struct nfs_server *server = NFS_SB(sb); 176 struct nfs_server *server = NFS_SB(sb);
242 struct nfs_fattr fattr; 177 struct nfs_fattr *fattr = NULL;
243 struct dentry *mntroot; 178 struct dentry *ret;
244 struct inode *inode; 179 struct inode *inode;
245 int error; 180 int error;
246 181
@@ -254,40 +189,50 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
254 return ERR_PTR(error); 189 return ERR_PTR(error);
255 } 190 }
256 191
192 fattr = nfs_alloc_fattr();
193 if (fattr == NULL)
194 return ERR_PTR(-ENOMEM);
195
257 /* get the actual root for this mount */ 196 /* get the actual root for this mount */
258 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); 197 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
259 if (error < 0) { 198 if (error < 0) {
260 dprintk("nfs_get_root: getattr error = %d\n", -error); 199 dprintk("nfs_get_root: getattr error = %d\n", -error);
261 return ERR_PTR(error); 200 ret = ERR_PTR(error);
201 goto out;
262 } 202 }
263 203
264 inode = nfs_fhget(sb, mntfh, &fattr); 204 inode = nfs_fhget(sb, mntfh, fattr);
265 if (IS_ERR(inode)) { 205 if (IS_ERR(inode)) {
266 dprintk("nfs_get_root: get root inode failed\n"); 206 dprintk("nfs_get_root: get root inode failed\n");
267 return ERR_CAST(inode); 207 ret = ERR_CAST(inode);
208 goto out;
268 } 209 }
269 210
270 error = nfs_superblock_set_dummy_root(sb, inode); 211 error = nfs_superblock_set_dummy_root(sb, inode);
271 if (error != 0) 212 if (error != 0) {
272 return ERR_PTR(error); 213 ret = ERR_PTR(error);
214 goto out;
215 }
273 216
274 /* root dentries normally start off anonymous and get spliced in later 217 /* root dentries normally start off anonymous and get spliced in later
275 * if the dentry tree reaches them; however if the dentry already 218 * if the dentry tree reaches them; however if the dentry already
276 * exists, we'll pick it up at this point and use it as the root 219 * exists, we'll pick it up at this point and use it as the root
277 */ 220 */
278 mntroot = d_obtain_alias(inode); 221 ret = d_obtain_alias(inode);
279 if (IS_ERR(mntroot)) { 222 if (IS_ERR(ret)) {
280 dprintk("nfs_get_root: get root dentry failed\n"); 223 dprintk("nfs_get_root: get root dentry failed\n");
281 return mntroot; 224 goto out;
282 } 225 }
283 226
284 security_d_instantiate(mntroot, inode); 227 security_d_instantiate(ret, inode);
285 228
286 if (!mntroot->d_op) 229 if (ret->d_op == NULL)
287 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; 230 ret->d_op = server->nfs_client->rpc_ops->dentry_ops;
288 231
232out:
233 nfs_free_fattr(fattr);
289 dprintk("<-- nfs4_get_root()\n"); 234 dprintk("<-- nfs4_get_root()\n");
290 return mntroot; 235 return ret;
291} 236}
292 237
293#endif /* CONFIG_NFS_V4 */ 238#endif /* CONFIG_NFS_V4 */
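
getroot.c is one instance of a conversion applied throughout this series: on-stack struct nfs_fattr instances become nfs_alloc_fattr()/nfs_free_fattr() pairs with goto-based cleanup, presumably to trim kernel stack usage in these call chains. The skeleton, with a hypothetical RPC in the middle:

struct nfs_fattr *fattr;
int err = -ENOMEM;

fattr = nfs_alloc_fattr();      /* kmalloc(GFP_NOFS) + nfs_fattr_init() */
if (fattr == NULL)
        goto out;
err = do_getattr_rpc(fattr);    /* hypothetical call filling the buffer */
out:
        nfs_free_fattr(fattr);  /* kfree(); safe on NULL */
        return err;
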
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 50a56edca0b5..7d2d6c72aa78 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -98,7 +98,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
98 return ino; 98 return ino;
99} 99}
100 100
101void nfs_clear_inode(struct inode *inode) 101static void nfs_clear_inode(struct inode *inode)
102{ 102{
103 /* 103 /*
104 * The following should never happen... 104 * The following should never happen...
@@ -110,6 +110,13 @@ void nfs_clear_inode(struct inode *inode)
110 nfs_fscache_release_inode_cookie(inode); 110 nfs_fscache_release_inode_cookie(inode);
111} 111}
112 112
113void nfs_evict_inode(struct inode *inode)
114{
115 truncate_inode_pages(&inode->i_data, 0);
116 end_writeback(inode);
117 nfs_clear_inode(inode);
118}
119
113/** 120/**
114 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk 121 * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
115 */ 122 */
@@ -393,8 +400,8 @@ int
393nfs_setattr(struct dentry *dentry, struct iattr *attr) 400nfs_setattr(struct dentry *dentry, struct iattr *attr)
394{ 401{
395 struct inode *inode = dentry->d_inode; 402 struct inode *inode = dentry->d_inode;
396 struct nfs_fattr fattr; 403 struct nfs_fattr *fattr;
397 int error; 404 int error = -ENOMEM;
398 405
399 nfs_inc_stats(inode, NFSIOS_VFSSETATTR); 406 nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
400 407
@@ -413,18 +420,22 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
413 return 0; 420 return 0;
414 421
415 /* Write all dirty data */ 422 /* Write all dirty data */
416 if (S_ISREG(inode->i_mode)) { 423 if (S_ISREG(inode->i_mode))
417 filemap_write_and_wait(inode->i_mapping);
418 nfs_wb_all(inode); 424 nfs_wb_all(inode);
419 } 425
426 fattr = nfs_alloc_fattr();
427 if (fattr == NULL)
428 goto out;
420 /* 429 /*
421 * Return any delegations if we're going to change ACLs 430 * Return any delegations if we're going to change ACLs
422 */ 431 */
423 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) 432 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
424 nfs_inode_return_delegation(inode); 433 nfs_inode_return_delegation(inode);
425 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 434 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
426 if (error == 0) 435 if (error == 0)
427 nfs_refresh_inode(inode, &fattr); 436 nfs_refresh_inode(inode, fattr);
437 nfs_free_fattr(fattr);
438out:
428 return error; 439 return error;
429} 440}
430 441
@@ -524,6 +535,68 @@ out:
524 return err; 535 return err;
525} 536}
526 537
538static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
539{
540 atomic_set(&l_ctx->count, 1);
541 l_ctx->lockowner = current->files;
542 l_ctx->pid = current->tgid;
543 INIT_LIST_HEAD(&l_ctx->list);
544}
545
546static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
547{
548 struct nfs_lock_context *pos;
549
550 list_for_each_entry(pos, &ctx->lock_context.list, list) {
551 if (pos->lockowner != current->files)
552 continue;
553 if (pos->pid != current->tgid)
554 continue;
555 atomic_inc(&pos->count);
556 return pos;
557 }
558 return NULL;
559}
560
561struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
562{
563 struct nfs_lock_context *res, *new = NULL;
564 struct inode *inode = ctx->path.dentry->d_inode;
565
566 spin_lock(&inode->i_lock);
567 res = __nfs_find_lock_context(ctx);
568 if (res == NULL) {
569 spin_unlock(&inode->i_lock);
570 new = kmalloc(sizeof(*new), GFP_KERNEL);
571 if (new == NULL)
572 return NULL;
573 nfs_init_lock_context(new);
574 spin_lock(&inode->i_lock);
575 res = __nfs_find_lock_context(ctx);
576 if (res == NULL) {
577 list_add_tail(&new->list, &ctx->lock_context.list);
578 new->open_context = ctx;
579 res = new;
580 new = NULL;
581 }
582 }
583 spin_unlock(&inode->i_lock);
584 kfree(new);
585 return res;
586}
587
588void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
589{
590 struct nfs_open_context *ctx = l_ctx->open_context;
591 struct inode *inode = ctx->path.dentry->d_inode;
592
593 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
594 return;
595 list_del(&l_ctx->list);
596 spin_unlock(&inode->i_lock);
597 kfree(l_ctx);
598}
599
527/** 600/**
528 * nfs_close_context - Common close_context() routine NFSv2/v3 601 * nfs_close_context - Common close_context() routine NFSv2/v3
529 * @ctx: pointer to context 602 * @ctx: pointer to context
@@ -560,11 +633,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
560 path_get(&ctx->path); 633 path_get(&ctx->path);
561 ctx->cred = get_rpccred(cred); 634 ctx->cred = get_rpccred(cred);
562 ctx->state = NULL; 635 ctx->state = NULL;
563 ctx->lockowner = current->files;
564 ctx->flags = 0; 636 ctx->flags = 0;
565 ctx->error = 0; 637 ctx->error = 0;
566 ctx->dir_cookie = 0; 638 ctx->dir_cookie = 0;
567 atomic_set(&ctx->count, 1); 639 nfs_init_lock_context(&ctx->lock_context);
640 ctx->lock_context.open_context = ctx;
568 } 641 }
569 return ctx; 642 return ctx;
570} 643}
@@ -572,7 +645,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
572struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) 645struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
573{ 646{
574 if (ctx != NULL) 647 if (ctx != NULL)
575 atomic_inc(&ctx->count); 648 atomic_inc(&ctx->lock_context.count);
576 return ctx; 649 return ctx;
577} 650}
578 651
@@ -580,7 +653,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
580{ 653{
581 struct inode *inode = ctx->path.dentry->d_inode; 654 struct inode *inode = ctx->path.dentry->d_inode;
582 655
583 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 656 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
584 return; 657 return;
585 list_del(&ctx->list); 658 list_del(&ctx->list);
586 spin_unlock(&inode->i_lock); 659 spin_unlock(&inode->i_lock);
@@ -682,7 +755,7 @@ int
682__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 755__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
683{ 756{
684 int status = -ESTALE; 757 int status = -ESTALE;
685 struct nfs_fattr fattr; 758 struct nfs_fattr *fattr = NULL;
686 struct nfs_inode *nfsi = NFS_I(inode); 759 struct nfs_inode *nfsi = NFS_I(inode);
687 760
688 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 761 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
@@ -693,8 +766,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
693 if (NFS_STALE(inode)) 766 if (NFS_STALE(inode))
694 goto out; 767 goto out;
695 768
769 status = -ENOMEM;
770 fattr = nfs_alloc_fattr();
771 if (fattr == NULL)
772 goto out;
773
696 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 774 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
697 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 775 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
698 if (status != 0) { 776 if (status != 0) {
699 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 777 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
700 inode->i_sb->s_id, 778 inode->i_sb->s_id,
@@ -707,7 +785,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
707 goto out; 785 goto out;
708 } 786 }
709 787
710 status = nfs_refresh_inode(inode, &fattr); 788 status = nfs_refresh_inode(inode, fattr);
711 if (status) { 789 if (status) {
712 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 790 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
713 inode->i_sb->s_id, 791 inode->i_sb->s_id,
@@ -723,6 +801,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
723 (long long)NFS_FILEID(inode)); 801 (long long)NFS_FILEID(inode));
724 802
725 out: 803 out:
804 nfs_free_fattr(fattr);
726 return status; 805 return status;
727} 806}
728 807
@@ -730,9 +809,14 @@ int nfs_attribute_timeout(struct inode *inode)
730{ 809{
731 struct nfs_inode *nfsi = NFS_I(inode); 810 struct nfs_inode *nfsi = NFS_I(inode);
732 811
812 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
813}
814
815static int nfs_attribute_cache_expired(struct inode *inode)
816{
733 if (nfs_have_delegated_attributes(inode)) 817 if (nfs_have_delegated_attributes(inode))
734 return 0; 818 return 0;
735 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 819 return nfs_attribute_timeout(inode);
736} 820}
737 821
738/** 822/**
@@ -745,7 +829,7 @@ int nfs_attribute_timeout(struct inode *inode)
745int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 829int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
746{ 830{
747 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 831 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
748 && !nfs_attribute_timeout(inode)) 832 && !nfs_attribute_cache_expired(inode))
749 return NFS_STALE(inode) ? -ESTALE : 0; 833 return NFS_STALE(inode) ? -ESTALE : 0;
750 return __nfs_revalidate_inode(server, inode); 834 return __nfs_revalidate_inode(server, inode);
751} 835}
@@ -782,7 +866,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
782 int ret = 0; 866 int ret = 0;
783 867
784 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 868 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
785 || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { 869 || nfs_attribute_cache_expired(inode)
870 || NFS_STALE(inode)) {
786 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 871 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
787 if (ret < 0) 872 if (ret < 0)
788 goto out; 873 goto out;
@@ -916,6 +1001,26 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
916 fattr->gencount = nfs_inc_attr_generation_counter(); 1001 fattr->gencount = nfs_inc_attr_generation_counter();
917} 1002}
918 1003
1004struct nfs_fattr *nfs_alloc_fattr(void)
1005{
1006 struct nfs_fattr *fattr;
1007
1008 fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
1009 if (fattr != NULL)
1010 nfs_fattr_init(fattr);
1011 return fattr;
1012}
1013
1014struct nfs_fh *nfs_alloc_fhandle(void)
1015{
1016 struct nfs_fh *fh;
1017
1018 fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
1019 if (fh != NULL)
1020 fh->size = 0;
1021 return fh;
1022}
1023
919/** 1024/**
920 * nfs_inode_attrs_need_update - check if the inode attributes need updating 1025 * nfs_inode_attrs_need_update - check if the inode attributes need updating
921 * @inode - pointer to inode 1026 * @inode - pointer to inode
@@ -1300,8 +1405,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1300 * to open() calls that passed nfs_atomic_lookup, but failed to call 1405 * to open() calls that passed nfs_atomic_lookup, but failed to call
1301 * nfs_open(). 1406 * nfs_open().
1302 */ 1407 */
1303void nfs4_clear_inode(struct inode *inode) 1408void nfs4_evict_inode(struct inode *inode)
1304{ 1409{
1410 truncate_inode_pages(&inode->i_data, 0);
1411 end_writeback(inode);
1305 /* If we are holding a delegation, return it! */ 1412 /* If we are holding a delegation, return it! */
1306 nfs_inode_return_delegation_noreclaim(inode); 1413 nfs_inode_return_delegation_noreclaim(inode);
1307 /* First call standard NFS clear_inode() code */ 1414 /* First call standard NFS clear_inode() code */
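
nfs_evict_inode()/nfs4_evict_inode() above are the 2.6.36 ->evict_inode() conversion: the callback now performs its own page-cache truncation and writeback teardown (truncate_inode_pages() + end_writeback()) before the filesystem-specific cleanup that generic code used to drive through ->clear_inode(). The assumed wiring, from the matching super.c change elsewhere in this merge:

static const struct super_operations nfs_sops = {
        /* ... */
        .evict_inode    = nfs_evict_inode,  /* was .clear_inode = nfs_clear_inode */
};
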
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 11f82f03c5de..c961bc92c107 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,16 +205,17 @@ extern struct rpc_procinfo nfs4_procedures[];
205void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 205void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
206 206
207/* dir.c */ 207/* dir.c */
208extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); 208extern int nfs_access_cache_shrinker(struct shrinker *shrink,
209 int nr_to_scan, gfp_t gfp_mask);
209 210
210/* inode.c */ 211/* inode.c */
211extern struct workqueue_struct *nfsiod_workqueue; 212extern struct workqueue_struct *nfsiod_workqueue;
212extern struct inode *nfs_alloc_inode(struct super_block *sb); 213extern struct inode *nfs_alloc_inode(struct super_block *sb);
213extern void nfs_destroy_inode(struct inode *); 214extern void nfs_destroy_inode(struct inode *);
214extern int nfs_write_inode(struct inode *, struct writeback_control *); 215extern int nfs_write_inode(struct inode *, struct writeback_control *);
215extern void nfs_clear_inode(struct inode *); 216extern void nfs_evict_inode(struct inode *);
216#ifdef CONFIG_NFS_V4 217#ifdef CONFIG_NFS_V4
217extern void nfs4_clear_inode(struct inode *); 218extern void nfs4_evict_inode(struct inode *);
218#endif 219#endif
219void nfs_zap_acl_cache(struct inode *inode); 220void nfs_zap_acl_cache(struct inode *inode);
220extern int nfs_wait_bit_killable(void *word); 221extern int nfs_wait_bit_killable(void *word);
@@ -244,9 +245,7 @@ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
244#ifdef CONFIG_NFS_V4 245#ifdef CONFIG_NFS_V4
245extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 246extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
246 247
247extern int nfs4_path_walk(struct nfs_server *server, 248extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
248 struct nfs_fh *mntfh,
249 const char *path);
250#endif 249#endif
251 250
252/* read.c */ 251/* read.c */
@@ -371,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
371 * Helper for restarting RPC calls in the possible presence of NFSv4.1 370 * Helper for restarting RPC calls in the possible presence of NFSv4.1
372 * sessions. 371 * sessions.
373 */ 372 */
374static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) 373static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
375{ 374{
376 if (nfs4_has_session(clp)) 375 if (nfs4_has_session(clp))
377 rpc_restart_call_prepare(task); 376 return rpc_restart_call_prepare(task);
378 else 377 return rpc_restart_call(task);
379 rpc_restart_call(task);
380} 378}
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 1d8d5c813b01..c5832487c456 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -36,14 +36,14 @@ static inline void nfs_inc_stats(const struct inode *inode,
36 36
37static inline void nfs_add_server_stats(const struct nfs_server *server, 37static inline void nfs_add_server_stats(const struct nfs_server *server,
38 enum nfs_stat_bytecounters stat, 38 enum nfs_stat_bytecounters stat,
39 unsigned long addend) 39 long addend)
40{ 40{
41 this_cpu_add(server->io_stats->bytes[stat], addend); 41 this_cpu_add(server->io_stats->bytes[stat], addend);
42} 42}
43 43
44static inline void nfs_add_stats(const struct inode *inode, 44static inline void nfs_add_stats(const struct inode *inode,
45 enum nfs_stat_bytecounters stat, 45 enum nfs_stat_bytecounters stat,
46 unsigned long addend) 46 long addend)
47{ 47{
48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend); 48 nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
49} 49}
@@ -51,7 +51,7 @@ static inline void nfs_add_stats(const struct inode *inode,
51#ifdef CONFIG_NFS_FSCACHE 51#ifdef CONFIG_NFS_FSCACHE
52static inline void nfs_add_fscache_stats(struct inode *inode, 52static inline void nfs_add_fscache_stats(struct inode *inode,
53 enum nfs_stat_fscachecounters stat, 53 enum nfs_stat_fscachecounters stat,
54 unsigned long addend) 54 long addend)
55{ 55{
56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); 56 this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend);
57} 57}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 7888cf36022d..db6aa3673cf3 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,8 +105,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
105 struct vfsmount *mnt; 105 struct vfsmount *mnt;
106 struct nfs_server *server = NFS_SERVER(dentry->d_inode); 106 struct nfs_server *server = NFS_SERVER(dentry->d_inode);
107 struct dentry *parent; 107 struct dentry *parent;
108 struct nfs_fh fh; 108 struct nfs_fh *fh = NULL;
109 struct nfs_fattr fattr; 109 struct nfs_fattr *fattr = NULL;
110 int err; 110 int err;
111 111
112 dprintk("--> nfs_follow_mountpoint()\n"); 112 dprintk("--> nfs_follow_mountpoint()\n");
@@ -115,6 +115,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
115 if (IS_ROOT(dentry)) 115 if (IS_ROOT(dentry))
116 goto out_err; 116 goto out_err;
117 117
118 err = -ENOMEM;
119 fh = nfs_alloc_fhandle();
120 fattr = nfs_alloc_fattr();
121 if (fh == NULL || fattr == NULL)
122 goto out_err;
123
118 dprintk("%s: enter\n", __func__); 124 dprintk("%s: enter\n", __func__);
119 dput(nd->path.dentry); 125 dput(nd->path.dentry);
120 nd->path.dentry = dget(dentry); 126 nd->path.dentry = dget(dentry);
@@ -123,16 +129,16 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
123 parent = dget_parent(nd->path.dentry); 129 parent = dget_parent(nd->path.dentry);
124 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 130 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
125 &nd->path.dentry->d_name, 131 &nd->path.dentry->d_name,
126 &fh, &fattr); 132 fh, fattr);
127 dput(parent); 133 dput(parent);
128 if (err != 0) 134 if (err != 0)
129 goto out_err; 135 goto out_err;
130 136
131 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) 137 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
132 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry); 138 mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
133 else 139 else
134 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, &fh, 140 mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
135 &fattr); 141 fattr);
136 err = PTR_ERR(mnt); 142 err = PTR_ERR(mnt);
137 if (IS_ERR(mnt)) 143 if (IS_ERR(mnt))
138 goto out_err; 144 goto out_err;
@@ -151,6 +157,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
151 nd->path.dentry = dget(mnt->mnt_root); 157 nd->path.dentry = dget(mnt->mnt_root);
152 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 158 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
153out: 159out:
160 nfs_free_fattr(fattr);
161 nfs_free_fhandle(fh);
154 dprintk("%s: done, returned %d\n", __func__, err); 162 dprintk("%s: done, returned %d\n", __func__, err);
155 163
156 dprintk("<-- nfs_follow_mountpoint() = %d\n", err); 164 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf14257916..db8846a0e82e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
233static int 233static int
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 235{
236 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 236 struct rpc_auth *auth = req->rq_cred->cr_auth;
237 unsigned int replen; 237 unsigned int replen;
238 u32 offset = (u32)args->offset; 238 u32 offset = (u32)args->offset;
239 u32 count = args->count; 239 u32 count = args->count;
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
393static int 393static int
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
395{ 395{
396 struct rpc_task *task = req->rq_task; 396 struct rpc_auth *auth = req->rq_cred->cr_auth;
397 struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
398 unsigned int replen; 397 unsigned int replen;
399 u32 count = args->count; 398 u32 count = args->count;
400 399
@@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
575static int 574static int
576nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 575nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
577{ 576{
578 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 577 struct rpc_auth *auth = req->rq_cred->cr_auth;
579 unsigned int replen; 578 unsigned int replen;
580 579
581 p = xdr_encode_fhandle(p, args->fh); 580 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d150ae0c5ecd..9f88c5f4c7e2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -185,7 +185,6 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) 185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
186{ 186{
187 struct nfs_server *server = NFS_SERVER(inode); 187 struct nfs_server *server = NFS_SERVER(inode);
188 struct nfs_fattr fattr;
189 struct page *pages[NFSACL_MAXPAGES] = { }; 188 struct page *pages[NFSACL_MAXPAGES] = { };
190 struct nfs3_getaclargs args = { 189 struct nfs3_getaclargs args = {
191 .fh = NFS_FH(inode), 190 .fh = NFS_FH(inode),
@@ -193,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
193 .pages = pages, 192 .pages = pages,
194 }; 193 };
195 struct nfs3_getaclres res = { 194 struct nfs3_getaclres res = {
196 .fattr = &fattr, 195 0
197 }; 196 };
198 struct rpc_message msg = { 197 struct rpc_message msg = {
199 .rpc_argp = &args, 198 .rpc_argp = &args,
@@ -228,7 +227,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
228 227
229 dprintk("NFS call getacl\n"); 228 dprintk("NFS call getacl\n");
230 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 229 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
231 nfs_fattr_init(&fattr); 230 res.fattr = nfs_alloc_fattr();
231 if (res.fattr == NULL)
232 return ERR_PTR(-ENOMEM);
233
232 status = rpc_call_sync(server->client_acl, &msg, 0); 234 status = rpc_call_sync(server->client_acl, &msg, 0);
233 dprintk("NFS reply getacl: %d\n", status); 235 dprintk("NFS reply getacl: %d\n", status);
234 236
@@ -238,7 +240,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
238 240
239 switch (status) { 241 switch (status) {
240 case 0: 242 case 0:
241 status = nfs_refresh_inode(inode, &fattr); 243 status = nfs_refresh_inode(inode, res.fattr);
242 break; 244 break;
243 case -EPFNOSUPPORT: 245 case -EPFNOSUPPORT:
244 case -EPROTONOSUPPORT: 246 case -EPROTONOSUPPORT:
@@ -278,6 +280,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
278getout: 280getout:
279 posix_acl_release(res.acl_access); 281 posix_acl_release(res.acl_access);
280 posix_acl_release(res.acl_default); 282 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr);
281 284
282 if (status != 0) { 285 if (status != 0) {
283 posix_acl_release(acl); 286 posix_acl_release(acl);
@@ -290,7 +293,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
290 struct posix_acl *dfacl) 293 struct posix_acl *dfacl)
291{ 294{
292 struct nfs_server *server = NFS_SERVER(inode); 295 struct nfs_server *server = NFS_SERVER(inode);
293 struct nfs_fattr fattr; 296 struct nfs_fattr *fattr;
294 struct page *pages[NFSACL_MAXPAGES]; 297 struct page *pages[NFSACL_MAXPAGES];
295 struct nfs3_setaclargs args = { 298 struct nfs3_setaclargs args = {
296 .inode = inode, 299 .inode = inode,
@@ -335,8 +338,13 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
335 } 338 }
336 339
337 dprintk("NFS call setacl\n"); 340 dprintk("NFS call setacl\n");
341 status = -ENOMEM;
342 fattr = nfs_alloc_fattr();
343 if (fattr == NULL)
344 goto out_freepages;
345
338 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 346 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
339 nfs_fattr_init(&fattr); 347 msg.rpc_resp = fattr;
340 status = rpc_call_sync(server->client_acl, &msg, 0); 348 status = rpc_call_sync(server->client_acl, &msg, 0);
341 nfs_access_zap_cache(inode); 349 nfs_access_zap_cache(inode);
342 nfs_zap_acl_cache(inode); 350 nfs_zap_acl_cache(inode);
@@ -344,7 +352,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
344 352
345 switch (status) { 353 switch (status) {
346 case 0: 354 case 0:
347 status = nfs_refresh_inode(inode, &fattr); 355 status = nfs_refresh_inode(inode, fattr);
348 nfs3_cache_acls(inode, acl, dfacl); 356 nfs3_cache_acls(inode, acl, dfacl);
349 break; 357 break;
350 case -EPFNOSUPPORT: 358 case -EPFNOSUPPORT:
@@ -355,6 +363,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
355 case -ENOTSUPP: 363 case -ENOTSUPP:
356 status = -EOPNOTSUPP; 364 status = -EOPNOTSUPP;
357 } 365 }
366 nfs_free_fattr(fattr);
358out_freepages: 367out_freepages:
359 while (args.npages != 0) { 368 while (args.npages != 0) {
360 args.npages--; 369 args.npages--;
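
A note on the nfs3acl.c hunks above: getacl/setacl are converted from an on-stack struct nfs_fattr to one obtained from nfs_alloc_fattr() and released with nfs_free_fattr() on every exit path, presumably to shrink the kernel stack footprint of these RPC routines. A minimal user-space sketch of the same allocate/check/free discipline; the names below are illustrative stand-ins, not the kernel helpers:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct fattr { unsigned valid; long size; };

    /* Stand-in for nfs_alloc_fattr(): allocate and pre-initialize. */
    static struct fattr *fattr_alloc(void)
    {
        struct fattr *f = malloc(sizeof(*f));
        if (f != NULL)
            memset(f, 0, sizeof(*f));   /* kernel: nfs_fattr_init() */
        return f;
    }

    static void fattr_free(struct fattr *f)
    {
        free(f);                        /* stand-in for nfs_free_fattr() */
    }

    static int do_getacl(void)
    {
        struct fattr *f = fattr_alloc();
        int status;

        if (f == NULL)
            return -12;                 /* kernel returns -ENOMEM */
        status = 0;                     /* the RPC would run and fill *f */
        fattr_free(f);                  /* released on every exit path */
        return status;
    }

    int main(void)
    {
        printf("getacl: %d\n", do_getacl());
        return 0;
    }
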
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index e701002694e5..fabb4f2849a1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -144,14 +144,12 @@ static int
144nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct inode *dir, struct qstr *name,
145 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
146{ 146{
147 struct nfs_fattr dir_attr;
148 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 148 .fh = NFS_FH(dir),
150 .name = name->name, 149 .name = name->name,
151 .len = name->len 150 .len = name->len
152 }; 151 };
153 struct nfs3_diropres res = { 152 struct nfs3_diropres res = {
154 .dir_attr = &dir_attr,
155 .fh = fhandle, 153 .fh = fhandle,
156 .fattr = fattr 154 .fattr = fattr
157 }; 155 };
@@ -163,29 +161,30 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
163 int status; 161 int status;
164 162
165 dprintk("NFS call lookup %s\n", name->name); 163 dprintk("NFS call lookup %s\n", name->name);
166 nfs_fattr_init(&dir_attr); 164 res.dir_attr = nfs_alloc_fattr();
165 if (res.dir_attr == NULL)
166 return -ENOMEM;
167
167 nfs_fattr_init(fattr); 168 nfs_fattr_init(fattr);
168 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
169 nfs_refresh_inode(dir, &dir_attr); 170 nfs_refresh_inode(dir, res.dir_attr);
170 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 171 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
171 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 172 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
172 msg.rpc_argp = fhandle; 173 msg.rpc_argp = fhandle;
173 msg.rpc_resp = fattr; 174 msg.rpc_resp = fattr;
174 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
175 } 176 }
177 nfs_free_fattr(res.dir_attr);
176 dprintk("NFS reply lookup: %d\n", status); 178 dprintk("NFS reply lookup: %d\n", status);
177 return status; 179 return status;
178} 180}
179 181
180static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) 182static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
181{ 183{
182 struct nfs_fattr fattr;
183 struct nfs3_accessargs arg = { 184 struct nfs3_accessargs arg = {
184 .fh = NFS_FH(inode), 185 .fh = NFS_FH(inode),
185 }; 186 };
186 struct nfs3_accessres res = { 187 struct nfs3_accessres res;
187 .fattr = &fattr,
188 };
189 struct rpc_message msg = { 188 struct rpc_message msg = {
190 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], 189 .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
191 .rpc_argp = &arg, 190 .rpc_argp = &arg,
@@ -193,7 +192,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
193 .rpc_cred = entry->cred, 192 .rpc_cred = entry->cred,
194 }; 193 };
195 int mode = entry->mask; 194 int mode = entry->mask;
196 int status; 195 int status = -ENOMEM;
197 196
198 dprintk("NFS call access\n"); 197 dprintk("NFS call access\n");
199 198
@@ -210,9 +209,13 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
210 if (mode & MAY_EXEC) 209 if (mode & MAY_EXEC)
211 arg.access |= NFS3_ACCESS_EXECUTE; 210 arg.access |= NFS3_ACCESS_EXECUTE;
212 } 211 }
213 nfs_fattr_init(&fattr); 212
213 res.fattr = nfs_alloc_fattr();
214 if (res.fattr == NULL)
215 goto out;
216
214 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 217 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
215 nfs_refresh_inode(inode, &fattr); 218 nfs_refresh_inode(inode, res.fattr);
216 if (status == 0) { 219 if (status == 0) {
217 entry->mask = 0; 220 entry->mask = 0;
218 if (res.access & NFS3_ACCESS_READ) 221 if (res.access & NFS3_ACCESS_READ)
@@ -222,6 +225,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
222 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) 225 if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
223 entry->mask |= MAY_EXEC; 226 entry->mask |= MAY_EXEC;
224 } 227 }
228 nfs_free_fattr(res.fattr);
229out:
225 dprintk("NFS reply access: %d\n", status); 230 dprintk("NFS reply access: %d\n", status);
226 return status; 231 return status;
227} 232}
@@ -229,7 +234,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
229static int nfs3_proc_readlink(struct inode *inode, struct page *page, 234static int nfs3_proc_readlink(struct inode *inode, struct page *page,
230 unsigned int pgbase, unsigned int pglen) 235 unsigned int pgbase, unsigned int pglen)
231{ 236{
232 struct nfs_fattr fattr; 237 struct nfs_fattr *fattr;
233 struct nfs3_readlinkargs args = { 238 struct nfs3_readlinkargs args = {
234 .fh = NFS_FH(inode), 239 .fh = NFS_FH(inode),
235 .pgbase = pgbase, 240 .pgbase = pgbase,
@@ -239,14 +244,19 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
239 struct rpc_message msg = { 244 struct rpc_message msg = {
240 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK], 245 .rpc_proc = &nfs3_procedures[NFS3PROC_READLINK],
241 .rpc_argp = &args, 246 .rpc_argp = &args,
242 .rpc_resp = &fattr,
243 }; 247 };
244 int status; 248 int status = -ENOMEM;
245 249
246 dprintk("NFS call readlink\n"); 250 dprintk("NFS call readlink\n");
247 nfs_fattr_init(&fattr); 251 fattr = nfs_alloc_fattr();
252 if (fattr == NULL)
253 goto out;
254 msg.rpc_resp = fattr;
255
248 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 256 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
249 nfs_refresh_inode(inode, &fattr); 257 nfs_refresh_inode(inode, fattr);
258 nfs_free_fattr(fattr);
259out:
250 dprintk("NFS reply readlink: %d\n", status); 260 dprintk("NFS reply readlink: %d\n", status);
251 return status; 261 return status;
252} 262}
@@ -396,12 +406,17 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name)
396 .rpc_argp = &arg, 406 .rpc_argp = &arg,
397 .rpc_resp = &res, 407 .rpc_resp = &res,
398 }; 408 };
399 int status; 409 int status = -ENOMEM;
400 410
401 dprintk("NFS call remove %s\n", name->name); 411 dprintk("NFS call remove %s\n", name->name);
402 nfs_fattr_init(&res.dir_attr); 412 res.dir_attr = nfs_alloc_fattr();
413 if (res.dir_attr == NULL)
414 goto out;
415
403 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 416 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
404 nfs_post_op_update_inode(dir, &res.dir_attr); 417 nfs_post_op_update_inode(dir, res.dir_attr);
418 nfs_free_fattr(res.dir_attr);
419out:
405 dprintk("NFS reply remove: %d\n", status); 420 dprintk("NFS reply remove: %d\n", status);
406 return status; 421 return status;
407} 422}
@@ -419,7 +434,7 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)
419 if (nfs3_async_handle_jukebox(task, dir)) 434 if (nfs3_async_handle_jukebox(task, dir))
420 return 0; 435 return 0;
421 res = task->tk_msg.rpc_resp; 436 res = task->tk_msg.rpc_resp;
422 nfs_post_op_update_inode(dir, &res->dir_attr); 437 nfs_post_op_update_inode(dir, res->dir_attr);
423 return 1; 438 return 1;
424} 439}
425 440
@@ -427,7 +442,6 @@ static int
427nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, 442nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
428 struct inode *new_dir, struct qstr *new_name) 443 struct inode *new_dir, struct qstr *new_name)
429{ 444{
430 struct nfs_fattr old_dir_attr, new_dir_attr;
431 struct nfs3_renameargs arg = { 445 struct nfs3_renameargs arg = {
432 .fromfh = NFS_FH(old_dir), 446 .fromfh = NFS_FH(old_dir),
433 .fromname = old_name->name, 447 .fromname = old_name->name,
@@ -436,23 +450,27 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
436 .toname = new_name->name, 450 .toname = new_name->name,
437 .tolen = new_name->len 451 .tolen = new_name->len
438 }; 452 };
439 struct nfs3_renameres res = { 453 struct nfs3_renameres res;
440 .fromattr = &old_dir_attr,
441 .toattr = &new_dir_attr
442 };
443 struct rpc_message msg = { 454 struct rpc_message msg = {
444 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], 455 .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
445 .rpc_argp = &arg, 456 .rpc_argp = &arg,
446 .rpc_resp = &res, 457 .rpc_resp = &res,
447 }; 458 };
448 int status; 459 int status = -ENOMEM;
449 460
450 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); 461 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
451 nfs_fattr_init(&old_dir_attr); 462
452 nfs_fattr_init(&new_dir_attr); 463 res.fromattr = nfs_alloc_fattr();
464 res.toattr = nfs_alloc_fattr();
465 if (res.fromattr == NULL || res.toattr == NULL)
466 goto out;
467
453 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); 468 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
454 nfs_post_op_update_inode(old_dir, &old_dir_attr); 469 nfs_post_op_update_inode(old_dir, res.fromattr);
455 nfs_post_op_update_inode(new_dir, &new_dir_attr); 470 nfs_post_op_update_inode(new_dir, res.toattr);
471out:
472 nfs_free_fattr(res.toattr);
473 nfs_free_fattr(res.fromattr);
456 dprintk("NFS reply rename: %d\n", status); 474 dprintk("NFS reply rename: %d\n", status);
457 return status; 475 return status;
458} 476}
@@ -460,30 +478,32 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
460static int 478static int
461nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 479nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
462{ 480{
463 struct nfs_fattr dir_attr, fattr;
464 struct nfs3_linkargs arg = { 481 struct nfs3_linkargs arg = {
465 .fromfh = NFS_FH(inode), 482 .fromfh = NFS_FH(inode),
466 .tofh = NFS_FH(dir), 483 .tofh = NFS_FH(dir),
467 .toname = name->name, 484 .toname = name->name,
468 .tolen = name->len 485 .tolen = name->len
469 }; 486 };
470 struct nfs3_linkres res = { 487 struct nfs3_linkres res;
471 .dir_attr = &dir_attr,
472 .fattr = &fattr
473 };
474 struct rpc_message msg = { 488 struct rpc_message msg = {
475 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK], 489 .rpc_proc = &nfs3_procedures[NFS3PROC_LINK],
476 .rpc_argp = &arg, 490 .rpc_argp = &arg,
477 .rpc_resp = &res, 491 .rpc_resp = &res,
478 }; 492 };
479 int status; 493 int status = -ENOMEM;
480 494
481 dprintk("NFS call link %s\n", name->name); 495 dprintk("NFS call link %s\n", name->name);
482 nfs_fattr_init(&dir_attr); 496 res.fattr = nfs_alloc_fattr();
483 nfs_fattr_init(&fattr); 497 res.dir_attr = nfs_alloc_fattr();
498 if (res.fattr == NULL || res.dir_attr == NULL)
499 goto out;
500
484 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 501 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
485 nfs_post_op_update_inode(dir, &dir_attr); 502 nfs_post_op_update_inode(dir, res.dir_attr);
486 nfs_post_op_update_inode(inode, &fattr); 503 nfs_post_op_update_inode(inode, res.fattr);
504out:
505 nfs_free_fattr(res.dir_attr);
506 nfs_free_fattr(res.fattr);
487 dprintk("NFS reply link: %d\n", status); 507 dprintk("NFS reply link: %d\n", status);
488 return status; 508 return status;
489} 509}
@@ -554,7 +574,7 @@ out:
554static int 574static int
555nfs3_proc_rmdir(struct inode *dir, struct qstr *name) 575nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
556{ 576{
557 struct nfs_fattr dir_attr; 577 struct nfs_fattr *dir_attr;
558 struct nfs3_diropargs arg = { 578 struct nfs3_diropargs arg = {
559 .fh = NFS_FH(dir), 579 .fh = NFS_FH(dir),
560 .name = name->name, 580 .name = name->name,
@@ -563,14 +583,19 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
563 struct rpc_message msg = { 583 struct rpc_message msg = {
564 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR], 584 .rpc_proc = &nfs3_procedures[NFS3PROC_RMDIR],
565 .rpc_argp = &arg, 585 .rpc_argp = &arg,
566 .rpc_resp = &dir_attr,
567 }; 586 };
568 int status; 587 int status = -ENOMEM;
569 588
570 dprintk("NFS call rmdir %s\n", name->name); 589 dprintk("NFS call rmdir %s\n", name->name);
571 nfs_fattr_init(&dir_attr); 590 dir_attr = nfs_alloc_fattr();
591 if (dir_attr == NULL)
592 goto out;
593
594 msg.rpc_resp = dir_attr;
572 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 595 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
573 nfs_post_op_update_inode(dir, &dir_attr); 596 nfs_post_op_update_inode(dir, dir_attr);
597 nfs_free_fattr(dir_attr);
598out:
574 dprintk("NFS reply rmdir: %d\n", status); 599 dprintk("NFS reply rmdir: %d\n", status);
575 return status; 600 return status;
576} 601}
@@ -589,7 +614,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
589 u64 cookie, struct page *page, unsigned int count, int plus) 614 u64 cookie, struct page *page, unsigned int count, int plus)
590{ 615{
591 struct inode *dir = dentry->d_inode; 616 struct inode *dir = dentry->d_inode;
592 struct nfs_fattr dir_attr;
593 __be32 *verf = NFS_COOKIEVERF(dir); 617 __be32 *verf = NFS_COOKIEVERF(dir);
594 struct nfs3_readdirargs arg = { 618 struct nfs3_readdirargs arg = {
595 .fh = NFS_FH(dir), 619 .fh = NFS_FH(dir),
@@ -600,7 +624,6 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
600 .pages = &page 624 .pages = &page
601 }; 625 };
602 struct nfs3_readdirres res = { 626 struct nfs3_readdirres res = {
603 .dir_attr = &dir_attr,
604 .verf = verf, 627 .verf = verf,
605 .plus = plus 628 .plus = plus
606 }; 629 };
@@ -610,7 +633,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
610 .rpc_resp = &res, 633 .rpc_resp = &res,
611 .rpc_cred = cred 634 .rpc_cred = cred
612 }; 635 };
613 int status; 636 int status = -ENOMEM;
614 637
615 if (plus) 638 if (plus)
616 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; 639 msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
@@ -618,12 +641,17 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
618 dprintk("NFS call readdir%s %d\n", 641 dprintk("NFS call readdir%s %d\n",
619 plus? "plus" : "", (unsigned int) cookie); 642 plus? "plus" : "", (unsigned int) cookie);
620 643
621 nfs_fattr_init(&dir_attr); 644 res.dir_attr = nfs_alloc_fattr();
645 if (res.dir_attr == NULL)
646 goto out;
647
622 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 648 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
623 649
624 nfs_invalidate_atime(dir); 650 nfs_invalidate_atime(dir);
651 nfs_refresh_inode(dir, res.dir_attr);
625 652
626 nfs_refresh_inode(dir, &dir_attr); 653 nfs_free_fattr(res.dir_attr);
654out:
627 dprintk("NFS reply readdir: %d\n", status); 655 dprintk("NFS reply readdir: %d\n", status);
628 return status; 656 return status;
629} 657}
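
Several of the nfs3proc.c conversions above (rename and link in particular) allocate two fattrs, pre-set status to -ENOMEM, and jump to a shared label that frees both pointers unconditionally. That works because kfree(NULL) is a no-op, so a partially failed allocation needs no special-casing. Reduced to user-space C:

    #include <stdio.h>
    #include <stdlib.h>

    static int do_rename(void)
    {
        void *fromattr, *toattr;
        int status = -12;               /* -ENOMEM */

        fromattr = malloc(64);
        toattr = malloc(64);
        if (fromattr == NULL || toattr == NULL)
            goto out;                   /* partial failure falls through... */

        status = 0;                     /* ...after the RPC would run here */
    out:
        free(toattr);                   /* free(NULL) is a no-op */
        free(fromattr);
        return status;
    }

    int main(void)
    {
        printf("rename: %d\n", do_rename());
        return 0;
    }
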
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 56a86f6ac8b5..9769704f8ce6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
330static int 330static int
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 332{
333 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 333 struct rpc_auth *auth = req->rq_cred->cr_auth;
334 unsigned int replen; 334 unsigned int replen;
335 u32 count = args->count; 335 u32 count = args->count;
336 336
@@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
471static int 471static int
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 473{
474 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 474 struct rpc_auth *auth = req->rq_cred->cr_auth;
475 unsigned int replen; 475 unsigned int replen;
476 u32 count = args->count; 476 u32 count = args->count;
477 477
@@ -675,7 +675,7 @@ static int
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
676 struct nfs3_getaclargs *args) 676 struct nfs3_getaclargs *args)
677{ 677{
678 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 678 struct rpc_auth *auth = req->rq_cred->cr_auth;
679 unsigned int replen; 679 unsigned int replen;
680 680
681 p = xdr_encode_fhandle(p, args->fh); 681 p = xdr_encode_fhandle(p, args->fh);
@@ -762,7 +762,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
762static int 762static int
763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res) 763nfs3_xdr_removeres(struct rpc_rqst *req, __be32 *p, struct nfs_removeres *res)
764{ 764{
765 return nfs3_xdr_wccstat(req, p, &res->dir_attr); 765 return nfs3_xdr_wccstat(req, p, res->dir_attr);
766} 766}
767 767
768/* 768/*
@@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
802static int 802static int
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
804{ 804{
805 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 805 struct rpc_auth *auth = req->rq_cred->cr_auth;
806 unsigned int replen; 806 unsigned int replen;
807 807
808 p = xdr_encode_fhandle(p, args->fh); 808 p = xdr_encode_fhandle(p, args->fh);
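
The nfs3xdr.c hunks replace req->rq_task->tk_msg.rpc_cred with req->rq_cred, which appears to be a credential pointer cached directly on the rpc_rqst so the XDR encoders no longer have to reach through the task. A toy model of the shortened pointer chain (all types here are stand-ins):

    #include <stdio.h>

    struct auth { int flavor; };
    struct cred { struct auth *cr_auth; };
    struct msg  { struct cred *rpc_cred; };
    struct task { struct msg tk_msg; };

    /* The request now carries its own cred pointer, so the encoder no
     * longer depends on rq_task being reachable at encode time. */
    struct rqst {
        struct task *rq_task;
        struct cred *rq_cred;   /* cached rq_task->tk_msg.rpc_cred */
    };

    int main(void)
    {
        struct auth a = { 1 };
        struct cred c = { &a };
        struct task t = { { &c } };
        struct rqst r = { &t, t.tk_msg.rpc_cred };

        printf("%d %d\n",
               r.rq_task->tk_msg.rpc_cred->cr_auth->flavor, /* old chain */
               r.rq_cred->cr_auth->flavor);                 /* new chain */
        return 0;
    }
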
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a187200a7aac..311e15cc8af0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT, 48 NFS4CLNT_RECALL_SLOT,
50}; 49};
51 50
51enum nfs4_session_state {
52 NFS4_SESSION_INITING,
53 NFS4_SESSION_DRAINING,
54};
55
56struct nfs4_minor_version_ops {
57 u32 minor_version;
58
59 int (*call_sync)(struct nfs_server *server,
60 struct rpc_message *msg,
61 struct nfs4_sequence_args *args,
62 struct nfs4_sequence_res *res,
63 int cache_reply);
64 int (*validate_stateid)(struct nfs_delegation *,
65 const nfs4_stateid *);
66 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
67 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
68 const struct nfs4_state_maintenance_ops *state_renewal_ops;
69};
70
52/* 71/*
53 * struct rpc_sequence ensures that RPC calls are sent in the exact 72 * struct rpc_sequence ensures that RPC calls are sent in the exact
54 * order that they appear on the list. 73 * order that they appear on the list.
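
The new struct nfs4_minor_version_ops gathers per-minor-version behaviour behind function pointers (the sync-call path, stateid validation, recovery and renewal ops), replacing the per-version arrays such as nfs4_reboot_recovery_ops[] that this diff removes further down. The shape is the usual C ops table; a sketch with invented operations:

    #include <stdio.h>

    struct minor_version_ops {
        unsigned minor_version;
        int (*call_sync)(const char *proc);
    };

    static int v40_call_sync(const char *proc)
    { printf("v4.0 %s\n", proc); return 0; }

    static int v41_call_sync(const char *proc)
    { printf("v4.1 %s (sessions)\n", proc); return 0; }

    static const struct minor_version_ops v40 = { 0, v40_call_sync };
    static const struct minor_version_ops v41 = { 1, v41_call_sync };

    /* Indexed by mount-time minor version, as nfs_v4_minor_ops[]
     * appears to be. */
    static const struct minor_version_ops *minor_ops[] = { &v40, &v41 };

    int main(void)
    {
        minor_ops[0]->call_sync("GETATTR");
        minor_ops[1]->call_sync("GETATTR");
        return 0;
    }
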
@@ -89,7 +108,6 @@ struct nfs_unique_id {
89 */ 108 */
90struct nfs4_state_owner { 109struct nfs4_state_owner {
91 struct nfs_unique_id so_owner_id; 110 struct nfs_unique_id so_owner_id;
92 struct nfs_client *so_client;
93 struct nfs_server *so_server; 111 struct nfs_server *so_server;
94 struct rb_node so_client_node; 112 struct rb_node so_client_node;
95 113
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
99 atomic_t so_count; 117 atomic_t so_count;
100 unsigned long so_flags; 118 unsigned long so_flags;
101 struct list_head so_states; 119 struct list_head so_states;
102 struct list_head so_delegations;
103 struct nfs_seqid_counter so_seqid; 120 struct nfs_seqid_counter so_seqid;
104 struct rpc_sequence so_sequence; 121 struct rpc_sequence so_sequence;
105}; 122};
@@ -125,10 +142,20 @@ enum {
125 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 142 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
126 */ 143 */
127 144
145struct nfs4_lock_owner {
146 unsigned int lo_type;
147#define NFS4_ANY_LOCK_TYPE (0U)
148#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
149#define NFS4_POSIX_LOCK_TYPE (1U << 1)
150 union {
151 fl_owner_t posix_owner;
152 pid_t flock_owner;
153 } lo_u;
154};
155
128struct nfs4_lock_state { 156struct nfs4_lock_state {
129 struct list_head ls_locks; /* Other lock stateids */ 157 struct list_head ls_locks; /* Other lock stateids */
130 struct nfs4_state * ls_state; /* Pointer to open state */ 158 struct nfs4_state * ls_state; /* Pointer to open state */
131 fl_owner_t ls_owner; /* POSIX lock owner */
132#define NFS_LOCK_INITIALIZED 1 159#define NFS_LOCK_INITIALIZED 1
133 int ls_flags; 160 int ls_flags;
134 struct nfs_seqid_counter ls_seqid; 161 struct nfs_seqid_counter ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
136 struct nfs_unique_id ls_id; 163 struct nfs_unique_id ls_id;
137 nfs4_stateid ls_stateid; 164 nfs4_stateid ls_stateid;
138 atomic_t ls_count; 165 atomic_t ls_count;
166 struct nfs4_lock_owner ls_owner;
139}; 167};
140 168
141/* bits for nfs4_state->flags */ 169/* bits for nfs4_state->flags */
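
ls_owner changes from a bare fl_owner_t into the new struct nfs4_lock_owner, a small tagged union keyed by lo_type, so a lock state can belong to either a POSIX owner (an fl_owner_t) or a flock owner (a pid). The generic pattern:

    #include <stdio.h>
    #include <sys/types.h>

    #define ANY_LOCK_TYPE   (0U)
    #define FLOCK_LOCK_TYPE (1U << 0)
    #define POSIX_LOCK_TYPE (1U << 1)

    struct lock_owner {
        unsigned int lo_type;       /* says which union member is valid */
        union {
            void *posix_owner;      /* fl_owner_t in the kernel */
            pid_t flock_owner;
        } lo_u;
    };

    static int owners_match(const struct lock_owner *a,
                            const struct lock_owner *b)
    {
        if (a->lo_type != b->lo_type)
            return 0;
        if (a->lo_type == POSIX_LOCK_TYPE)
            return a->lo_u.posix_owner == b->lo_u.posix_owner;
        return a->lo_u.flock_owner == b->lo_u.flock_owner;
    }

    int main(void)
    {
        struct lock_owner x = { FLOCK_LOCK_TYPE, { .flock_owner = 42 } };
        struct lock_owner y = x;
        printf("match: %d\n", owners_match(&x, &y));
        return 0;
    }
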
@@ -206,24 +234,28 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
206 234
207 235
208/* nfs4proc.c */ 236/* nfs4proc.c */
209extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
210extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
211extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 239extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
212extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 240extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
213extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); 241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
214extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
215extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait); 244extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait);
217extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 245extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
218extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 246extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
220extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
221 struct nfs4_fs_locations *fs_locations, struct page *page); 249 struct nfs4_fs_locations *fs_locations, struct page *page);
250extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
222 251
223extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
224extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
225#if defined(CONFIG_NFS_V4_1) 252#if defined(CONFIG_NFS_V4_1)
226extern int nfs4_setup_sequence(struct nfs_client *clp, 253static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
254{
255 return server->nfs_client->cl_session;
256}
257
258extern int nfs4_setup_sequence(const struct nfs_server *server,
227 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 259 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
228 int cache_reply, struct rpc_task *task); 260 int cache_reply, struct rpc_task *task);
229extern void nfs4_destroy_session(struct nfs4_session *session); 261extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server);
234extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 266extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
235 struct nfs_fsinfo *fsinfo); 267 struct nfs_fsinfo *fsinfo);
236#else /* CONFIG_NFS_v4_1 */ 268#else /* CONFIG_NFS_v4_1 */
237static inline int nfs4_setup_sequence(struct nfs_client *clp, 269static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
270{
271 return NULL;
272}
273
274static inline int nfs4_setup_sequence(const struct nfs_server *server,
238 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 275 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
239 int cache_reply, struct rpc_task *task) 276 int cache_reply, struct rpc_task *task)
240{ 277{
@@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
247} 284}
248#endif /* CONFIG_NFS_V4_1 */ 285#endif /* CONFIG_NFS_V4_1 */
249 286
250extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 287extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
251 288
252extern const u32 nfs4_fattr_bitmap[2]; 289extern const u32 nfs4_fattr_bitmap[2];
253extern const u32 nfs4_statfs_bitmap[2]; 290extern const u32 nfs4_statfs_bitmap[2];
@@ -284,9 +321,9 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
284extern void nfs41_handle_recall_slot(struct nfs_client *clp); 321extern void nfs41_handle_recall_slot(struct nfs_client *clp);
285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 322extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 323extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 324extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
288 325
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter); 326extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 327extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
291extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); 328extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
292extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); 329extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index f071d12c613b..3c2a1724fbd2 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -115,6 +115,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
115 char *page, char *page2, 115 char *page, char *page2,
116 const struct nfs4_fs_location *location) 116 const struct nfs4_fs_location *location)
117{ 117{
118 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
118 struct vfsmount *mnt = ERR_PTR(-ENOENT); 119 struct vfsmount *mnt = ERR_PTR(-ENOENT);
119 char *mnt_path; 120 char *mnt_path;
120 unsigned int maxbuflen; 121 unsigned int maxbuflen;
@@ -126,9 +127,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
126 mountdata->mnt_path = mnt_path; 127 mountdata->mnt_path = mnt_path;
127 maxbuflen = mnt_path - 1 - page2; 128 maxbuflen = mnt_path - 1 - page2;
128 129
130 mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL);
131 if (mountdata->addr == NULL)
132 return ERR_PTR(-ENOMEM);
133
129 for (s = 0; s < location->nservers; s++) { 134 for (s = 0; s < location->nservers; s++) {
130 const struct nfs4_string *buf = &location->servers[s]; 135 const struct nfs4_string *buf = &location->servers[s];
131 struct sockaddr_storage addr;
132 136
133 if (buf->len <= 0 || buf->len >= maxbuflen) 137 if (buf->len <= 0 || buf->len >= maxbuflen)
134 continue; 138 continue;
@@ -137,11 +141,10 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
137 continue; 141 continue;
138 142
139 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 143 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
140 (struct sockaddr *)&addr, sizeof(addr)); 144 mountdata->addr, addr_bufsize);
141 if (mountdata->addrlen == 0) 145 if (mountdata->addrlen == 0)
142 continue; 146 continue;
143 147
144 mountdata->addr = (struct sockaddr *)&addr;
145 rpc_set_port(mountdata->addr, NFS_PORT); 148 rpc_set_port(mountdata->addr, NFS_PORT);
146 149
147 memcpy(page2, buf->data, buf->len); 150 memcpy(page2, buf->data, buf->len);
@@ -156,6 +159,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
156 if (!IS_ERR(mnt)) 159 if (!IS_ERR(mnt))
157 break; 160 break;
158 } 161 }
162 kfree(mountdata->addr);
159 return mnt; 163 return mnt;
160} 164}
161 165
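
In try_location() the old code pointed mountdata->addr at a struct sockaddr_storage that lived on the stack inside the loop; the replacement allocates one sockaddr-sized buffer up front, reuses it for every candidate server, and frees it once on the way out, so the pointer can no longer outlive its storage. A user-space reduction of the fix:

    #include <stdio.h>
    #include <stdlib.h>

    struct mountdata { char *addr; };

    static int try_location(struct mountdata *md)
    {
        const size_t addr_bufsize = 128;  /* sizeof(struct sockaddr_storage) */
        int s;

        /* One heap buffer reused for every candidate server, instead of
         * storing the address of a per-iteration stack variable. */
        md->addr = malloc(addr_bufsize);
        if (md->addr == NULL)
            return -12;                   /* -ENOMEM */

        for (s = 0; s < 3; s++)
            snprintf(md->addr, addr_bufsize, "server-%d", s);

        printf("last candidate: %s\n", md->addr);
        free(md->addr);
        return 0;
    }

    int main(void)
    {
        struct mountdata md;
        return try_location(&md) ? 1 : 0;
    }
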
@@ -221,8 +225,8 @@ out:
221 225
222/* 226/*
223 * nfs_do_refmount - handle crossing a referral on server 227 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
224 * @dentry - dentry of referral 229 * @dentry - dentry of referral
225 * @nd - nameidata info
226 * 230 *
227 */ 231 */
228struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 071fcedd517c..089da5b5d20a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -70,6 +70,9 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 70static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 72static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
73static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
74 struct nfs_fattr *fattr, struct iattr *sattr,
75 struct nfs4_state *state);
73 76
74/* Prevent leaks of NFSv4 errors into userland */ 77/* Prevent leaks of NFSv4 errors into userland */
75static int nfs4_map_errors(int err) 78static int nfs4_map_errors(int err)
@@ -300,15 +303,19 @@ do_state_recovery:
300} 303}
301 304
302 305
303static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 306static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
304{ 307{
305 struct nfs_client *clp = server->nfs_client;
306 spin_lock(&clp->cl_lock); 308 spin_lock(&clp->cl_lock);
307 if (time_before(clp->cl_last_renewal,timestamp)) 309 if (time_before(clp->cl_last_renewal,timestamp))
308 clp->cl_last_renewal = timestamp; 310 clp->cl_last_renewal = timestamp;
309 spin_unlock(&clp->cl_lock); 311 spin_unlock(&clp->cl_lock);
310} 312}
311 313
314static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
315{
316 do_renew_lease(server->nfs_client, timestamp);
317}
318
312#if defined(CONFIG_NFS_V4_1) 319#if defined(CONFIG_NFS_V4_1)
313 320
314/* 321/*
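
renew_lease() is split so its clp-based body, do_renew_lease(), can also be called from the v4.1 SEQUENCE completion path below, where only the session's nfs_client is at hand. The extraction, reduced (the kernel version takes clp->cl_lock and compares with time_before(), which is jiffies-wrap safe; the sketch does neither):

    #include <stdio.h>

    struct nfs_client { unsigned long cl_last_renewal; };
    struct nfs_server { struct nfs_client *nfs_client; };

    static void do_renew_lease(struct nfs_client *clp, unsigned long ts)
    {
        if (clp->cl_last_renewal < ts)
            clp->cl_last_renewal = ts;
    }

    static void renew_lease(const struct nfs_server *srv, unsigned long ts)
    {
        do_renew_lease(srv->nfs_client, ts);
    }

    int main(void)
    {
        struct nfs_client clp = { 0 };
        struct nfs_server srv = { &clp };

        renew_lease(&srv, 100);     /* per-server call sites */
        do_renew_lease(&clp, 50);   /* SEQUENCE path: older stamp, no-op */
        printf("%lu\n", clp.cl_last_renewal);
        return 0;
    }
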
@@ -353,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
353{ 360{
354 struct rpc_task *task; 361 struct rpc_task *task;
355 362
356 if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { 363 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
357 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); 364 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
358 if (task) 365 if (task)
359 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 366 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -367,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
367 complete(&ses->complete); 374 complete(&ses->complete);
368} 375}
369 376
370static void nfs41_sequence_free_slot(const struct nfs_client *clp, 377static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
371 struct nfs4_sequence_res *res)
372{ 378{
373 struct nfs4_slot_table *tbl; 379 struct nfs4_slot_table *tbl;
374 380
375 tbl = &clp->cl_session->fc_slot_table; 381 tbl = &res->sr_session->fc_slot_table;
376 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
377 /* just wake up the next guy waiting since 383 /* just wake up the next guy waiting since
378 * we may have not consumed a slot after all */ 384 * we may have not consumed a slot after all */
@@ -382,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
382 388
383 spin_lock(&tbl->slot_tbl_lock); 389 spin_lock(&tbl->slot_tbl_lock);
384 nfs4_free_slot(tbl, res->sr_slotid); 390 nfs4_free_slot(tbl, res->sr_slotid);
385 nfs41_check_drain_session_complete(clp->cl_session); 391 nfs41_check_drain_session_complete(res->sr_session);
386 spin_unlock(&tbl->slot_tbl_lock); 392 spin_unlock(&tbl->slot_tbl_lock);
387 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 393 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
388} 394}
389 395
390static void nfs41_sequence_done(struct nfs_client *clp, 396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
391 struct nfs4_sequence_res *res,
392 int rpc_status)
393{ 397{
394 unsigned long timestamp; 398 unsigned long timestamp;
395 struct nfs4_slot_table *tbl; 399 struct nfs4_slot_table *tbl;
396 struct nfs4_slot *slot; 400 struct nfs4_slot *slot;
401 struct nfs_client *clp;
397 402
398 /* 403 /*
399 * sr_status remains 1 if an RPC level error occurred. The server 404 * sr_status remains 1 if an RPC level error occurred. The server
@@ -408,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp,
408 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
409 goto out; 414 goto out;
410 415
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
411 /* Check the SEQUENCE operation status */ 419 /* Check the SEQUENCE operation status */
412 if (res->sr_status == 0) { 420 switch (res->sr_status) {
413 tbl = &clp->cl_session->fc_slot_table; 421 case 0:
414 slot = tbl->slots + res->sr_slotid;
415 /* Update the slot's sequence and clientid lease timer */ 422 /* Update the slot's sequence and clientid lease timer */
416 ++slot->seq_nr; 423 ++slot->seq_nr;
417 timestamp = res->sr_renewal_time; 424 timestamp = res->sr_renewal_time;
418 spin_lock(&clp->cl_lock); 425 clp = res->sr_session->clp;
419 if (time_before(clp->cl_last_renewal, timestamp)) 426 do_renew_lease(clp, timestamp);
420 clp->cl_last_renewal = timestamp;
421 spin_unlock(&clp->cl_lock);
422 /* Check sequence flags */ 427 /* Check sequence flags */
423 if (atomic_read(&clp->cl_count) > 1) 428 if (atomic_read(&clp->cl_count) > 1)
424 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 429 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
430 break;
431 case -NFS4ERR_DELAY:
432 /* The server detected a resend of the RPC call and
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661.
435 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr);
438 goto out_retry;
439 default:
440 /* Just update the slot sequence no. */
441 ++slot->seq_nr;
425 } 442 }
426out: 443out:
427 /* The session may be reset by one of the error handlers. */ 444 /* The session may be reset by one of the error handlers. */
428 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 445 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
429 nfs41_sequence_free_slot(clp, res); 446 nfs41_sequence_free_slot(res);
447 return 1;
448out_retry:
449 if (!rpc_restart_call(task))
450 goto out;
451 rpc_delay(task, NFS4_POLL_RETRY_MAX);
452 return 0;
453}
454
455static int nfs4_sequence_done(struct rpc_task *task,
456 struct nfs4_sequence_res *res)
457{
458 if (res->sr_session == NULL)
459 return 1;
460 return nfs41_sequence_done(task, res);
430} 461}
431 462
432/* 463/*
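
nfs41_sequence_done() now returns 1 when the caller may proceed and 0 when it has re-queued the task, retrying NFS4ERR_DELAY with rpc_restart_call() plus rpc_delay() per the RFC 5661 reference in the hunk; that return convention is why call sites later in this diff become "if (!nfs4_sequence_done(...)) return;". A compact sketch of the status switch (the restart machinery is simplified away):

    #include <stdio.h>

    #define NFS4ERR_DELAY 10008

    struct slot { unsigned seq_nr; };

    /* Returns 1 if the caller may continue, 0 if the call was re-queued. */
    static int sequence_done(struct slot *slot, int sr_status)
    {
        switch (sr_status) {
        case 0:
            ++slot->seq_nr;     /* success: advance the slot sequence */
            break;
        case -NFS4ERR_DELAY:
            printf("seq=%u: operation in progress, retrying\n",
                   slot->seq_nr);
            return 0;           /* slot kept; task restarted after a delay */
        default:
            ++slot->seq_nr;     /* other errors still consumed the seqid */
        }
        return 1;               /* free the slot, let the caller proceed */
    }

    int main(void)
    {
        struct slot s = { 1 };
        printf("%d\n", sequence_done(&s, 0));
        printf("%d\n", sequence_done(&s, -NFS4ERR_DELAY));
        return 0;
    }
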
@@ -477,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
477 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
478 return 0; 509 return 0;
479 510
480 memset(res, 0, sizeof(*res));
481 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
482 tbl = &session->fc_slot_table; 512 tbl = &session->fc_slot_table;
483 513
484 spin_lock(&tbl->slot_tbl_lock); 514 spin_lock(&tbl->slot_tbl_lock);
485 if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && 515 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
486 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 516 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
487 /* 517 /*
488 * The state manager will wait until the slot table is empty. 518 * The state manager will wait until the slot table is empty.
@@ -522,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
522 res->sr_session = session; 552 res->sr_session = session;
523 res->sr_slotid = slotid; 553 res->sr_slotid = slotid;
524 res->sr_renewal_time = jiffies; 554 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0;
525 /* 556 /*
526 * sr_status is only set in decode_sequence, and so will remain 557 * sr_status is only set in decode_sequence, and so will remain
527 * set to 1 if an rpc level failure occurs. 558 * set to 1 if an rpc level failure occurs.
@@ -530,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
530 return 0; 561 return 0;
531} 562}
532 563
533int nfs4_setup_sequence(struct nfs_client *clp, 564int nfs4_setup_sequence(const struct nfs_server *server,
534 struct nfs4_sequence_args *args, 565 struct nfs4_sequence_args *args,
535 struct nfs4_sequence_res *res, 566 struct nfs4_sequence_res *res,
536 int cache_reply, 567 int cache_reply,
537 struct rpc_task *task) 568 struct rpc_task *task)
538{ 569{
570 struct nfs4_session *session = nfs4_get_session(server);
539 int ret = 0; 571 int ret = 0;
540 572
573 if (session == NULL) {
574 args->sa_session = NULL;
575 res->sr_session = NULL;
576 goto out;
577 }
578
541 dprintk("--> %s clp %p session %p sr_slotid %d\n", 579 dprintk("--> %s clp %p session %p sr_slotid %d\n",
542 __func__, clp, clp->cl_session, res->sr_slotid); 580 __func__, session->clp, session, res->sr_slotid);
543 581
544 if (!nfs4_has_session(clp)) 582 ret = nfs41_setup_sequence(session, args, res, cache_reply,
545 goto out;
546 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
547 task); 583 task);
548 if (ret && ret != -EAGAIN) {
549 /* terminate rpc task */
550 task->tk_status = ret;
551 task->tk_action = NULL;
552 }
553out: 584out:
554 dprintk("<-- %s status=%d\n", __func__, ret); 585 dprintk("<-- %s status=%d\n", __func__, ret);
555 return ret; 586 return ret;
556} 587}
557 588
558struct nfs41_call_sync_data { 589struct nfs41_call_sync_data {
559 struct nfs_client *clp; 590 const struct nfs_server *seq_server;
560 struct nfs4_sequence_args *seq_args; 591 struct nfs4_sequence_args *seq_args;
561 struct nfs4_sequence_res *seq_res; 592 struct nfs4_sequence_res *seq_res;
562 int cache_reply; 593 int cache_reply;
@@ -566,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
566{ 597{
567 struct nfs41_call_sync_data *data = calldata; 598 struct nfs41_call_sync_data *data = calldata;
568 599
569 dprintk("--> %s data->clp->cl_session %p\n", __func__, 600 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
570 data->clp->cl_session); 601
571 if (nfs4_setup_sequence(data->clp, data->seq_args, 602 if (nfs4_setup_sequence(data->seq_server, data->seq_args,
572 data->seq_res, data->cache_reply, task)) 603 data->seq_res, data->cache_reply, task))
573 return; 604 return;
574 rpc_call_start(task); 605 rpc_call_start(task);
@@ -584,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
584{ 615{
585 struct nfs41_call_sync_data *data = calldata; 616 struct nfs41_call_sync_data *data = calldata;
586 617
587 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); 618 nfs41_sequence_done(task, data->seq_res);
588} 619}
589 620
590struct rpc_call_ops nfs41_call_sync_ops = { 621struct rpc_call_ops nfs41_call_sync_ops = {
@@ -597,8 +628,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
597 .rpc_call_done = nfs41_call_sync_done, 628 .rpc_call_done = nfs41_call_sync_done,
598}; 629};
599 630
600static int nfs4_call_sync_sequence(struct nfs_client *clp, 631static int nfs4_call_sync_sequence(struct nfs_server *server,
601 struct rpc_clnt *clnt,
602 struct rpc_message *msg, 632 struct rpc_message *msg,
603 struct nfs4_sequence_args *args, 633 struct nfs4_sequence_args *args,
604 struct nfs4_sequence_res *res, 634 struct nfs4_sequence_res *res,
@@ -608,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
608 int ret; 638 int ret;
609 struct rpc_task *task; 639 struct rpc_task *task;
610 struct nfs41_call_sync_data data = { 640 struct nfs41_call_sync_data data = {
611 .clp = clp, 641 .seq_server = server,
612 .seq_args = args, 642 .seq_args = args,
613 .seq_res = res, 643 .seq_res = res,
614 .cache_reply = cache_reply, 644 .cache_reply = cache_reply,
615 }; 645 };
616 struct rpc_task_setup task_setup = { 646 struct rpc_task_setup task_setup = {
617 .rpc_client = clnt, 647 .rpc_client = server->client,
618 .rpc_message = msg, 648 .rpc_message = msg,
619 .callback_ops = &nfs41_call_sync_ops, 649 .callback_ops = &nfs41_call_sync_ops,
620 .callback_data = &data 650 .callback_data = &data
@@ -639,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
639 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
640 int cache_reply) 670 int cache_reply)
641{ 671{
642 return nfs4_call_sync_sequence(server->nfs_client, server->client, 672 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
643 msg, args, res, cache_reply, 0);
644} 673}
645 674
675#else
676static int nfs4_sequence_done(struct rpc_task *task,
677 struct nfs4_sequence_res *res)
678{
679 return 1;
680}
646#endif /* CONFIG_NFS_V4_1 */ 681#endif /* CONFIG_NFS_V4_1 */
647 682
648int _nfs4_call_sync(struct nfs_server *server, 683int _nfs4_call_sync(struct nfs_server *server,
@@ -656,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server,
656} 691}
657 692
658#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 693#define nfs4_call_sync(server, msg, args, res, cache_reply) \
659 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ 694 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
660 &(res)->seq_res, (cache_reply)) 695 &(res)->seq_res, (cache_reply))
661 696
662static void nfs4_sequence_done(const struct nfs_server *server,
663 struct nfs4_sequence_res *res, int rpc_status)
664{
665#ifdef CONFIG_NFS_V4_1
666 if (nfs4_has_session(server->nfs_client))
667 nfs41_sequence_done(server->nfs_client, res, rpc_status);
668#endif /* CONFIG_NFS_V4_1 */
669}
670
671static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 697static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
672{ 698{
673 struct nfs_inode *nfsi = NFS_I(dir); 699 struct nfs_inode *nfsi = NFS_I(dir);
@@ -714,17 +740,18 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
714 740
715static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 741static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
716 struct nfs4_state_owner *sp, fmode_t fmode, int flags, 742 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
717 const struct iattr *attrs) 743 const struct iattr *attrs,
744 gfp_t gfp_mask)
718{ 745{
719 struct dentry *parent = dget_parent(path->dentry); 746 struct dentry *parent = dget_parent(path->dentry);
720 struct inode *dir = parent->d_inode; 747 struct inode *dir = parent->d_inode;
721 struct nfs_server *server = NFS_SERVER(dir); 748 struct nfs_server *server = NFS_SERVER(dir);
722 struct nfs4_opendata *p; 749 struct nfs4_opendata *p;
723 750
724 p = kzalloc(sizeof(*p), GFP_KERNEL); 751 p = kzalloc(sizeof(*p), gfp_mask);
725 if (p == NULL) 752 if (p == NULL)
726 goto err; 753 goto err;
727 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); 754 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
728 if (p->o_arg.seqid == NULL) 755 if (p->o_arg.seqid == NULL)
729 goto err_free; 756 goto err_free;
730 path_get(path); 757 path_get(path);
@@ -741,19 +768,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
741 p->o_arg.server = server; 768 p->o_arg.server = server;
742 p->o_arg.bitmask = server->attr_bitmask; 769 p->o_arg.bitmask = server->attr_bitmask;
743 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 770 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
744 if (flags & O_EXCL) { 771 if (flags & O_CREAT) {
745 if (nfs4_has_persistent_session(server->nfs_client)) { 772 u32 *s;
746 /* GUARDED */ 773
747 p->o_arg.u.attrs = &p->attrs;
748 memcpy(&p->attrs, attrs, sizeof(p->attrs));
749 } else { /* EXCLUSIVE4_1 */
750 u32 *s = (u32 *) p->o_arg.u.verifier.data;
751 s[0] = jiffies;
752 s[1] = current->pid;
753 }
754 } else if (flags & O_CREAT) {
755 p->o_arg.u.attrs = &p->attrs; 774 p->o_arg.u.attrs = &p->attrs;
756 memcpy(&p->attrs, attrs, sizeof(p->attrs)); 775 memcpy(&p->attrs, attrs, sizeof(p->attrs));
776 s = (u32 *) p->o_arg.u.verifier.data;
777 s[0] = jiffies;
778 s[1] = current->pid;
757 } 779 }
758 p->c_arg.fh = &p->o_res.fh; 780 p->c_arg.fh = &p->o_res.fh;
759 p->c_arg.stateid = &p->o_res.stateid; 781 p->c_arg.stateid = &p->o_res.stateid;
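
nfs4_opendata_alloc() now handles every O_CREAT open uniformly: copy the attributes and always fill the 8-byte create verifier from jiffies and current->pid, dropping the old persistent-session GUARDED special case. The attribute fixup for O_EXCL moves into _nfs4_do_open() later in this diff, which issues a SETATTR once the open succeeds. The verifier construction, as a user-space equivalent with hypothetical names:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    /* Builds the two 32-bit words of an EXCLUSIVE4 create verifier the
     * way the hunk above does; the kernel uses jiffies and current->pid. */
    static void fill_create_verifier(uint32_t verf[2])
    {
        verf[0] = (uint32_t)time(NULL);
        verf[1] = (uint32_t)getpid();
    }

    int main(void)
    {
        uint32_t verf[2];

        fill_create_verifier(verf);
        printf("verifier: %08x%08x\n", (unsigned)verf[0], (unsigned)verf[1]);
        return 0;
    }
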
@@ -1060,7 +1082,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
1060{ 1082{
1061 struct nfs4_opendata *opendata; 1083 struct nfs4_opendata *opendata;
1062 1084
1063 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); 1085 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS);
1064 if (opendata == NULL) 1086 if (opendata == NULL)
1065 return ERR_PTR(-ENOMEM); 1087 return ERR_PTR(-ENOMEM);
1066 opendata->state = state; 1088 opendata->state = state;
@@ -1251,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1251 struct nfs4_opendata *data = calldata; 1273 struct nfs4_opendata *data = calldata;
1252 1274
1253 data->rpc_status = task->tk_status; 1275 data->rpc_status = task->tk_status;
1254 if (RPC_ASSASSINATED(task))
1255 return;
1256 if (data->rpc_status == 0) { 1276 if (data->rpc_status == 0) {
1257 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 1277 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
1258 sizeof(data->o_res.stateid.data)); 1278 sizeof(data->o_res.stateid.data));
@@ -1352,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1352 } 1372 }
1353 /* Update sequence id. */ 1373 /* Update sequence id. */
1354 data->o_arg.id = sp->so_owner_id.id; 1374 data->o_arg.id = sp->so_owner_id.id;
1355 data->o_arg.clientid = sp->so_client->cl_clientid; 1375 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
1356 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 1376 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
1357 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 1377 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
1358 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1378 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
1359 } 1379 }
1360 data->timestamp = jiffies; 1380 data->timestamp = jiffies;
1361 if (nfs4_setup_sequence(data->o_arg.server->nfs_client, 1381 if (nfs4_setup_sequence(data->o_arg.server,
1362 &data->o_arg.seq_args, 1382 &data->o_arg.seq_args,
1363 &data->o_res.seq_res, 1, task)) 1383 &data->o_res.seq_res, 1, task))
1364 return; 1384 return;
@@ -1381,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1381 1401
1382 data->rpc_status = task->tk_status; 1402 data->rpc_status = task->tk_status;
1383 1403
1384 nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, 1404 if (!nfs4_sequence_done(task, &data->o_res.seq_res))
1385 task->tk_status);
1386
1387 if (RPC_ASSASSINATED(task))
1388 return; 1405 return;
1406
1389 if (task->tk_status == 0) { 1407 if (task->tk_status == 0) {
1390 switch (data->o_res.f_attr->mode & S_IFMT) { 1408 switch (data->o_res.f_attr->mode & S_IFMT) {
1391 case S_IFREG: 1409 case S_IFREG:
@@ -1648,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1648 if (path->dentry->d_inode != NULL) 1666 if (path->dentry->d_inode != NULL)
1649 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); 1667 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1650 status = -ENOMEM; 1668 status = -ENOMEM;
1651 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); 1669 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL);
1652 if (opendata == NULL) 1670 if (opendata == NULL)
1653 goto err_put_state_owner; 1671 goto err_put_state_owner;
1654 1672
@@ -1659,15 +1677,24 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1659 if (status != 0) 1677 if (status != 0)
1660 goto err_opendata_put; 1678 goto err_opendata_put;
1661 1679
1662 if (opendata->o_arg.open_flags & O_EXCL)
1663 nfs4_exclusive_attrset(opendata, sattr);
1664
1665 state = nfs4_opendata_to_nfs4_state(opendata); 1680 state = nfs4_opendata_to_nfs4_state(opendata);
1666 status = PTR_ERR(state); 1681 status = PTR_ERR(state);
1667 if (IS_ERR(state)) 1682 if (IS_ERR(state))
1668 goto err_opendata_put; 1683 goto err_opendata_put;
1669 if (server->caps & NFS_CAP_POSIX_LOCK) 1684 if (server->caps & NFS_CAP_POSIX_LOCK)
1670 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1685 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1686
1687 if (opendata->o_arg.open_flags & O_EXCL) {
1688 nfs4_exclusive_attrset(opendata, sattr);
1689
1690 nfs_fattr_init(opendata->o_res.f_attr);
1691 status = nfs4_do_setattr(state->inode, cred,
1692 opendata->o_res.f_attr, sattr,
1693 state);
1694 if (status == 0)
1695 nfs_setattr_update_inode(state->inode, sattr);
1696 nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
1697 }
1671 nfs4_opendata_put(opendata); 1698 nfs4_opendata_put(opendata);
1672 nfs4_put_state_owner(sp); 1699 nfs4_put_state_owner(sp);
1673 *res = state; 1700 *res = state;
@@ -1760,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1760 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1787 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1761 /* Use that stateid */ 1788 /* Use that stateid */
1762 } else if (state != NULL) { 1789 } else if (state != NULL) {
1763 nfs4_copy_stateid(&arg.stateid, state, current->files); 1790 nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
1764 } else 1791 } else
1765 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1792 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1766 1793
@@ -1825,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1825 struct nfs4_state *state = calldata->state; 1852 struct nfs4_state *state = calldata->state;
1826 struct nfs_server *server = NFS_SERVER(calldata->inode); 1853 struct nfs_server *server = NFS_SERVER(calldata->inode);
1827 1854
1828 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); 1855 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
1829 if (RPC_ASSASSINATED(task))
1830 return; 1856 return;
1831 /* hmm. we are done with the inode, and in the process of freeing 1857 /* hmm. we are done with the inode, and in the process of freeing
1832 * the state_owner. we keep this around to process errors 1858 * the state_owner. we keep this around to process errors
@@ -1890,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1890 1916
1891 nfs_fattr_init(calldata->res.fattr); 1917 nfs_fattr_init(calldata->res.fattr);
1892 calldata->timestamp = jiffies; 1918 calldata->timestamp = jiffies;
1893 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, 1919 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
1894 &calldata->arg.seq_args, &calldata->res.seq_res, 1920 &calldata->arg.seq_args, &calldata->res.seq_res,
1895 1, task)) 1921 1, task))
1896 return; 1922 return;
@@ -1914,7 +1940,7 @@ static const struct rpc_call_ops nfs4_close_ops = {
1914 * 1940 *
1915 * NOTE: Caller must be holding the sp->so_owner semaphore! 1941 * NOTE: Caller must be holding the sp->so_owner semaphore!
1916 */ 1942 */
1917int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) 1943int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait)
1918{ 1944{
1919 struct nfs_server *server = NFS_SERVER(state->inode); 1945 struct nfs_server *server = NFS_SERVER(state->inode);
1920 struct nfs4_closedata *calldata; 1946 struct nfs4_closedata *calldata;
@@ -1933,7 +1959,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1933 }; 1959 };
1934 int status = -ENOMEM; 1960 int status = -ENOMEM;
1935 1961
1936 calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); 1962 calldata = kzalloc(sizeof(*calldata), gfp_mask);
1937 if (calldata == NULL) 1963 if (calldata == NULL)
1938 goto out; 1964 goto out;
1939 calldata->inode = state->inode; 1965 calldata->inode = state->inode;
@@ -1941,7 +1967,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1941 calldata->arg.fh = NFS_FH(state->inode); 1967 calldata->arg.fh = NFS_FH(state->inode);
1942 calldata->arg.stateid = &state->open_stateid; 1968 calldata->arg.stateid = &state->open_stateid;
1943 /* Serialization for the sequence id */ 1969 /* Serialization for the sequence id */
1944 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1970 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
1945 if (calldata->arg.seqid == NULL) 1971 if (calldata->arg.seqid == NULL)
1946 goto out_free_calldata; 1972 goto out_free_calldata;
1947 calldata->arg.fmode = 0; 1973 calldata->arg.fmode = 0;
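
nfs4_do_close(), nfs4_opendata_alloc() and nfs_alloc_seqid() all grow a gfp_t parameter so the caller's context picks the allocation mode; going by the GFP_NOFS used in nfs4_open_recoverdata_alloc() above versus the GFP_KERNEL in _nfs4_do_open(), the likely intent is to keep state-recovery and close paths from recursing into filesystem writeback. Threading an allocation-flags parameter, in miniature (the gfp_t definitions below are stand-ins, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned gfp_t;                /* stand-in, not the kernel type */
    #define GFP_KERNEL 0x1
    #define GFP_NOFS   0x2

    /* Models nfs_alloc_seqid(counter, gfp_mask): the mask is supplied by
     * the caller rather than hard-coded at the allocation site. */
    static void *alloc_seqid(gfp_t gfp_mask)
    {
        (void)gfp_mask;                    /* kernel: kmalloc(..., gfp_mask) */
        return malloc(16);
    }

    static int do_close(gfp_t gfp_mask)
    {
        void *calldata = malloc(32);       /* kernel: kzalloc(..., gfp_mask) */
        void *seqid;

        if (calldata == NULL)
            return -12;                    /* -ENOMEM */
        seqid = alloc_seqid(gfp_mask);     /* same mask threaded down */
        if (seqid == NULL) {
            free(calldata);
            return -12;
        }
        free(seqid);
        free(calldata);
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n", do_close(GFP_NOFS), do_close(GFP_KERNEL));
        return 0;
    }
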
@@ -2010,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2010 struct rpc_cred *cred; 2036 struct rpc_cred *cred;
2011 struct nfs4_state *state; 2037 struct nfs4_state *state;
2012 struct dentry *res; 2038 struct dentry *res;
2013 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); 2039 int open_flags = nd->intent.open.flags;
2040 fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
2014 2041
2015 if (nd->flags & LOOKUP_CREATE) { 2042 if (nd->flags & LOOKUP_CREATE) {
2016 attr.ia_mode = nd->intent.open.create_mode; 2043 attr.ia_mode = nd->intent.open.create_mode;
@@ -2018,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2018 if (!IS_POSIXACL(dir)) 2045 if (!IS_POSIXACL(dir))
2019 attr.ia_mode &= ~current_umask(); 2046 attr.ia_mode &= ~current_umask();
2020 } else { 2047 } else {
2048 open_flags &= ~O_EXCL;
2021 attr.ia_valid = 0; 2049 attr.ia_valid = 0;
2022 BUG_ON(nd->intent.open.flags & O_CREAT); 2050 BUG_ON(open_flags & O_CREAT);
2023 } 2051 }
2024 2052
2025 cred = rpc_lookup_cred(); 2053 cred = rpc_lookup_cred();
@@ -2028,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2028 parent = dentry->d_parent; 2056 parent = dentry->d_parent;
2029 /* Protect against concurrent sillydeletes */ 2057 /* Protect against concurrent sillydeletes */
2030 nfs_block_sillyrename(parent); 2058 nfs_block_sillyrename(parent);
2031 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); 2059 state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
2032 put_rpccred(cred); 2060 put_rpccred(cred);
2033 if (IS_ERR(state)) { 2061 if (IS_ERR(state)) {
2034 if (PTR_ERR(state) == -ENOENT) { 2062 if (PTR_ERR(state) == -ENOENT) {
@@ -2247,8 +2275,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
2247out: 2275out:
2248 if (page) 2276 if (page)
2249 __free_page(page); 2277 __free_page(page);
2250 if (locations) 2278 kfree(locations);
2251 kfree(locations);
2252 return status; 2279 return status;
2253} 2280}
2254 2281
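
The hunk above drops an "if (locations)" guard before kfree(): kfree(NULL), like free(NULL), is defined to do nothing, so the test was redundant:

    #include <stdlib.h>

    int main(void)
    {
        char *locations = NULL;
        free(locations);   /* guaranteed no-op; kfree(NULL) behaves the same */
        return 0;
    }
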
@@ -2404,14 +2431,12 @@ static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_fattr fattr;
 	struct nfs4_accessargs args = {
 		.fh = NFS_FH(inode),
 		.bitmask = server->attr_bitmask,
 	};
 	struct nfs4_accessres res = {
 		.server = server,
-		.fattr = &fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
@@ -2438,7 +2463,11 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		if (mode & MAY_EXEC)
 			args.access |= NFS4_ACCESS_EXECUTE;
 	}
-	nfs_fattr_init(&fattr);
+
+	res.fattr = nfs_alloc_fattr();
+	if (res.fattr == NULL)
+		return -ENOMEM;
+
 	status = nfs4_call_sync(server, &msg, &args, &res, 0);
 	if (!status) {
 		entry->mask = 0;
@@ -2448,8 +2477,9 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 			entry->mask |= MAY_WRITE;
 		if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
 			entry->mask |= MAY_EXEC;
-		nfs_refresh_inode(inode, &fattr);
+		nfs_refresh_inode(inode, res.fattr);
 	}
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
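
struct nfs_fattr is a sizable structure to keep on the kernel stack, so this change (and the matching ones in _nfs4_proc_remove(), _nfs4_proc_rename() and _nfs4_proc_link() below) moves it to the heap with nfs_alloc_fattr() and frees it on every exit path. The resulting shape, sketched with a hypothetical demo_rpc() standing in for nfs4_call_sync():

	struct nfs_fattr *fattr;
	int status = -ENOMEM;

	fattr = nfs_alloc_fattr();	/* kmalloc-backed, may fail */
	if (fattr == NULL)
		goto out;
	status = demo_rpc(fattr);	/* placeholder for the real RPC */
	nfs_free_fattr(fattr);		/* NULL-safe kfree() wrapper */
out:
	return status;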
@@ -2562,13 +2592,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	}
 	d_add(dentry, igrab(state->inode));
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	if (flags & O_EXCL) {
-		struct nfs_fattr fattr;
-		status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
-		if (status == 0)
-			nfs_setattr_update_inode(state->inode, sattr);
-		nfs_post_op_update_inode(state->inode, &fattr);
-	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
 		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
@@ -2596,14 +2619,19 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
+
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(&res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &args, &res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(dir, &res.dir_attr);
+		nfs_post_op_update_inode(dir, res.dir_attr);
 	}
+	nfs_free_fattr(res.dir_attr);
+out:
 	return status;
 }
 
@@ -2634,11 +2662,12 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	struct nfs_removeres *res = task->tk_msg.rpc_resp;
 
-	nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &res->seq_res))
+		return 0;
 	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
-	nfs_post_op_update_inode(dir, &res->dir_attr);
+	nfs_post_op_update_inode(dir, res->dir_attr);
 	return 1;
 }
 
@@ -2653,29 +2682,31 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.new_name = new_name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr old_fattr, new_fattr;
 	struct nfs4_rename_res res = {
 		.server = server,
-		.old_fattr = &old_fattr,
-		.new_fattr = &new_fattr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
 
-	nfs_fattr_init(res.old_fattr);
-	nfs_fattr_init(res.new_fattr);
-	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
+	res.old_fattr = nfs_alloc_fattr();
+	res.new_fattr = nfs_alloc_fattr();
+	if (res.old_fattr == NULL || res.new_fattr == NULL)
+		goto out;
 
+	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
 		nfs_post_op_update_inode(old_dir, res.old_fattr);
 		update_changeattr(new_dir, &res.new_cinfo);
 		nfs_post_op_update_inode(new_dir, res.new_fattr);
 	}
+out:
+	nfs_free_fattr(res.new_fattr);
+	nfs_free_fattr(res.old_fattr);
 	return status;
 }
 
@@ -2702,28 +2733,30 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 		.name = name,
 		.bitmask = server->attr_bitmask,
 	};
-	struct nfs_fattr fattr, dir_attr;
 	struct nfs4_link_res res = {
 		.server = server,
-		.fattr = &fattr,
-		.dir_attr = &dir_attr,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
 		.rpc_argp = &arg,
 		.rpc_resp = &res,
 	};
-	int status;
+	int status = -ENOMEM;
+
+	res.fattr = nfs_alloc_fattr();
+	res.dir_attr = nfs_alloc_fattr();
+	if (res.fattr == NULL || res.dir_attr == NULL)
+		goto out;
 
-	nfs_fattr_init(res.fattr);
-	nfs_fattr_init(res.dir_attr);
 	status = nfs4_call_sync(server, &msg, &arg, &res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
 		nfs_post_op_update_inode(inode, res.fattr);
 	}
-
+out:
+	nfs_free_fattr(res.dir_attr);
+	nfs_free_fattr(res.fattr);
 	return status;
 }
 
@@ -3075,7 +3108,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	dprintk("--> %s\n", __func__);
 
-	nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, server->nfs_client);
@@ -3098,8 +3132,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
 
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3127,8 +3161,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
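
Across this file the pattern "nfs4_sequence_done(server, seq_res, status)" becomes "if (!nfs4_sequence_done(task, seq_res)) return". The helper now takes the rpc_task, and its boolean result says whether the session sequencing code fully consumed the reply; a false return means the call is being replayed on another slot, so the completion handler must not interpret the result. The calling convention, in outline:

	static int demo_done(struct rpc_task *task, struct nfs_read_data *data)
	{
		if (!nfs4_sequence_done(task, &data->res.seq_res))
			return -EAGAIN;	/* reply not consumed; RPC is resent */
		/* normal result processing runs only past this point */
		return 0;
	}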
@@ -3146,23 +3181,31 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
 
+struct nfs4_renewdata {
+	struct nfs_client *client;
+	unsigned long timestamp;
+};
+
 /*
  * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special
  * standalone procedure for queueing an asynchronous RENEW.
  */
-static void nfs4_renew_release(void *data)
+static void nfs4_renew_release(void *calldata)
 {
-	struct nfs_client *clp = data;
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
 
 	if (atomic_read(&clp->cl_count) > 1)
 		nfs4_schedule_state_renewal(clp);
 	nfs_put_client(clp);
+	kfree(data);
 }
 
-static void nfs4_renew_done(struct rpc_task *task, void *data)
+static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 {
-	struct nfs_client *clp = data;
-	unsigned long timestamp = task->tk_start;
+	struct nfs4_renewdata *data = calldata;
+	struct nfs_client *clp = data->client;
+	unsigned long timestamp = data->timestamp;
 
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
@@ -3170,10 +3213,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
 		nfs4_schedule_state_recovery(clp);
 		return;
 	}
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,timestamp))
-		clp->cl_last_renewal = timestamp;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, timestamp);
 }
 
 static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3188,11 +3228,17 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 		.rpc_argp = clp,
 		.rpc_cred = cred,
 	};
+	struct nfs4_renewdata *data;
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
 		return -EIO;
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+	data->client = clp;
+	data->timestamp = jiffies;
 	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			&nfs4_renew_ops, clp);
+			&nfs4_renew_ops, data);
 }
 
 int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
@@ -3208,10 +3254,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
 	if (status < 0)
 		return status;
-	spin_lock(&clp->cl_lock);
-	if (time_before(clp->cl_last_renewal,now))
-		clp->cl_last_renewal = now;
-	spin_unlock(&clp->cl_lock);
+	do_renew_lease(clp, now);
 	return 0;
 }
 
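
Both RENEW paths now funnel through do_renew_lease(), and the asynchronous one records the timestamp when the request is queued (in struct nfs4_renewdata) instead of reading task->tk_start afterwards, which could over-credit the lease if the task sat in the queue. The helper itself is outside this hunk; judging by the open-coded sequence it replaces, it is presumably equivalent to:

	static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
	{
		spin_lock(&clp->cl_lock);
		if (time_before(clp->cl_last_renewal, timestamp))
			clp->cl_last_renewal = timestamp;
		spin_unlock(&clp->cl_lock);
	}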
@@ -3432,9 +3475,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 
 static int
-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
-	if (!clp || task->tk_status >= 0)
+	struct nfs_client *clp = server->nfs_client;
+
+	if (task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
 		case -NFS4ERR_ADMIN_REVOKED:
@@ -3466,8 +3511,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
-			if (server)
-				nfs_inc_server_stats(server, NFSIOS_DELAY);
+			nfs_inc_server_stats(server, NFSIOS_DELAY);
 		case -NFS4ERR_GRACE:
 		case -EKEYEXPIRED:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3488,13 +3532,9 @@ do_state_recovery:
 	return -EAGAIN;
 }
 
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
-{
-	return _nfs4_async_handle_error(task, server, server->nfs_client, state);
-}
-
-int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+		unsigned short port, struct rpc_cred *cred,
+		struct nfs4_setclientid_res *res)
 {
 	nfs4_verifier sc_verifier;
 	struct nfs4_setclientid setclientid = {
@@ -3504,7 +3544,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
 		.rpc_argp = &setclientid,
-		.rpc_resp = clp,
+		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
 	__be32 *p;
@@ -3547,12 +3587,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	return status;
 }
 
-static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	struct nfs_fsinfo fsinfo;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
-		.rpc_argp = clp,
+		.rpc_argp = arg,
 		.rpc_resp = &fsinfo,
 		.rpc_cred = cred,
 	};
@@ -3570,12 +3612,14 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 	return status;
 }
 
-int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
+		struct nfs4_setclientid_res *arg,
+		struct rpc_cred *cred)
 {
 	long timeout = 0;
 	int err;
 	do {
-		err = _nfs4_proc_setclientid_confirm(clp, cred);
+		err = _nfs4_proc_setclientid_confirm(clp, arg, cred);
 		switch (err) {
 			case 0:
 				return err;
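
SETCLIENTID replies used to be parsed straight into struct nfs_client; they now go into a separate nfs4_setclientid_res that is passed on to SETCLIENTID_CONFIRM, and the clientid is only committed to clp->cl_clientid after the confirm succeeds (see the nfs4_init_clientid() hunk in fs/nfs/nfs4state.c below). The result struct is defined in the nfs_xdr headers, presumably along these lines:

	struct nfs4_setclientid_res {
		u64		clientid;	/* returned by SETCLIENTID */
		nfs4_verifier	confirm;	/* echoed in SETCLIENTID_CONFIRM */
	};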
@@ -3603,8 +3647,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_delegreturndata *data = calldata;
 
-	nfs4_sequence_done(data->res.server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	switch (task->tk_status) {
 	case -NFS4ERR_STALE_STATEID:
@@ -3634,7 +3678,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
 	d_data = (struct nfs4_delegreturndata *)data;
 
-	if (nfs4_setup_sequence(d_data->res.server->nfs_client,
+	if (nfs4_setup_sequence(d_data->res.server,
 			&d_data->args.seq_args,
 			&d_data->res.seq_res, 1, task))
 		return;
@@ -3667,7 +3711,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	};
 	int status = 0;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	data->args.fhandle = &data->fh;
@@ -3823,7 +3867,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	struct nfs4_unlockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_NOFS);
 	if (p == NULL)
 		return NULL;
 	p->arg.fh = NFS_FH(inode);
@@ -3854,9 +3898,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_unlockdata *calldata = data;
 
-	nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
-			task->tk_status);
-	if (RPC_ASSASSINATED(task))
+	if (!nfs4_sequence_done(task, &calldata->res.seq_res))
 		return;
 	switch (task->tk_status) {
 		case 0:
@@ -3889,7 +3931,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 		return;
 	}
 	calldata->timestamp = jiffies;
-	if (nfs4_setup_sequence(calldata->server->nfs_client,
+	if (nfs4_setup_sequence(calldata->server,
 				&calldata->arg.seq_args,
 				&calldata->res.seq_res, 1, task))
 		return;
@@ -3961,7 +4003,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
 		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
-	seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
 	if (seqid == NULL)
 		goto out;
@@ -3989,22 +4031,23 @@ struct nfs4_lockdata {
 };
 
 static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
-		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp)
+		struct nfs_open_context *ctx, struct nfs4_lock_state *lsp,
+		gfp_t gfp_mask)
 {
 	struct nfs4_lockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		return NULL;
 
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
-	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid);
+	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
 	if (p->arg.open_seqid == NULL)
 		goto out_free;
-	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid);
+	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
 	if (p->arg.lock_seqid == NULL)
 		goto out_free_seqid;
 	p->arg.lock_stateid = &lsp->ls_stateid;
@@ -4043,7 +4086,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	} else
 		data->arg.new_lock_owner = 0;
 	data->timestamp = jiffies;
-	if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
+	if (nfs4_setup_sequence(data->server,
+				&data->arg.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
@@ -4062,12 +4106,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 
 	dprintk("%s: begin!\n", __func__);
 
-	nfs4_sequence_done(data->server, &data->res.seq_res,
-			task->tk_status);
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return;
 
 	data->rpc_status = task->tk_status;
-	if (RPC_ASSASSINATED(task))
-		goto out;
 	if (data->arg.new_lock_owner != 0) {
 		if (data->rpc_status == 0)
 			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4158,7 +4200,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 
 	dprintk("%s: begin!\n", __func__);
 	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
-			fl->fl_u.nfs4_fl.owner);
+			fl->fl_u.nfs4_fl.owner,
+			recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
 	if (data == NULL)
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
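
The allocation mask for lock setup now depends on why the lock is being taken: a new request from userspace may block in the allocator with GFP_KERNEL, but lock reclaim after a server reboot runs inside state recovery, where re-entering the filesystem from the allocator could deadlock, hence GFP_NOFS. The decision in the ternary above, spelled out:

	gfp_t gfp_mask;

	if (recovery_type == NFS_LOCK_NEW)
		gfp_mask = GFP_KERNEL;	/* ordinary fcntl()/flock() caller */
	else
		gfp_mask = GFP_NOFS;	/* reclaim/expired-lock recovery path */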
@@ -4384,6 +4427,34 @@ out:
 	return err;
 }
 
+static void nfs4_release_lockowner_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+const struct rpc_call_ops nfs4_release_lockowner_ops = {
+	.rpc_release = nfs4_release_lockowner_release,
+};
+
+void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
+{
+	struct nfs_server *server = lsp->ls_state->owner->so_server;
+	struct nfs_release_lockowner_args *args;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
+	};
+
+	if (server->nfs_client->cl_mvops->minor_version != 0)
+		return;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (!args)
+		return;
+	args->lock_owner.clientid = server->nfs_client->cl_clientid;
+	args->lock_owner.id = lsp->ls_id.id;
+	msg.rpc_argp = args;
+	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
+}
+
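
nfs4_release_lockowner() is new: when the last reference to a lock state goes away (see the nfs4_put_lock_state() hunk in fs/nfs/nfs4state.c below), the client tells the server it may forget that lockowner. RELEASE_LOCKOWNER only exists in minor version 0, hence the cl_mvops check, and the call is fire-and-forget: nothing waits for the result, and the argument buffer doubles as callback data so the .rpc_release hook frees it however the task ends. The shape of such a one-way call, with demo_* names standing in for the real ones:

	static void demo_release(void *calldata)
	{
		kfree(calldata);	/* runs on success, failure or task kill */
	}

	static const struct rpc_call_ops demo_ops = {
		.rpc_release = demo_release,
	};

	static void demo_notify(struct rpc_clnt *client, struct rpc_message *msg,
				void *args)
	{
		msg->rpc_argp = args;
		/* status is ignored; cleanup rides on the release hook */
		rpc_call_async(client, msg, 0, &demo_ops, args);
	}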
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4571,7 +4642,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
 		(struct nfs4_get_lease_time_data *)calldata;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
+	if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
+		return;
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 	case -NFS4ERR_GRACE:
@@ -4647,7 +4719,7 @@ static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
 	if (max_reqs != tbl->max_slots) {
 		ret = -ENOMEM;
 		new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
-				GFP_KERNEL);
+				GFP_NOFS);
 		if (!new)
 			goto out;
 		ret = 0;
@@ -4712,7 +4784,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
 
 	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
 
-	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_KERNEL);
+	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
 	if (!slot)
 		goto out;
 	ret = 0;
@@ -4761,17 +4833,10 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	struct nfs4_session *session;
 	struct nfs4_slot_table *tbl;
 
-	session = kzalloc(sizeof(struct nfs4_session), GFP_KERNEL);
+	session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
 	if (!session)
 		return NULL;
 
-	/*
-	 * The create session reply races with the server back
-	 * channel probe. Mark the client NFS_CS_SESSION_INITING
-	 * so that the client back channel can find the
-	 * nfs_client struct
-	 */
-	clp->cl_cons_state = NFS_CS_SESSION_INITING;
 	init_completion(&session->complete);
 
 	tbl = &session->fc_slot_table;
@@ -4784,6 +4849,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	spin_lock_init(&tbl->slot_tbl_lock);
 	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
 
+	session->session_state = 1<<NFS4_SESSION_INITING;
+
 	session->clp = clp;
 	return session;
 }
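
Session-initialization tracking moves from the nfs_client (cl_cons_state) to the session itself: nfs4_alloc_session() seeds session->session_state with the NFS4_SESSION_INITING bit, and nfs4_init_session() in the next hunk consumes it with an atomic test_and_clear_bit(), so the slot-table sizing runs exactly once per session without any extra locking:

	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
		return 0;	/* someone else already initialized this session */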
@@ -5000,6 +5067,10 @@ int nfs4_init_session(struct nfs_server *server)
 	if (!nfs4_has_session(clp))
 		return 0;
 
+	session = clp->cl_session;
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
 	rsize = server->rsize;
 	if (rsize == 0)
 		rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5007,7 +5078,6 @@ int nfs4_init_session(struct nfs_server *server)
 	if (wsize == 0)
 		wsize = NFS_MAX_FILE_IO_SIZE;
 
-	session = clp->cl_session;
 	session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
 	session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
 
@@ -5020,69 +5090,70 @@ int nfs4_init_session(struct nfs_server *server)
 /*
  * Renew the cl_session lease.
  */
-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
-{
+struct nfs4_sequence_data {
+	struct nfs_client *clp;
 	struct nfs4_sequence_args args;
 	struct nfs4_sequence_res res;
-
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-		.rpc_cred = cred,
-	};
-
-	args.sa_cache_this = 0;
-
-	return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
-			&res, args.sa_cache_this, 1);
-}
+};
 
 static void nfs41_sequence_release(void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
 	if (atomic_read(&clp->cl_count) > 1)
 		nfs4_schedule_state_renewal(clp);
 	nfs_put_client(clp);
+	kfree(calldata);
+}
+
+static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
 }
 
 static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp = (struct nfs_client *)data;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 
-	nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
+	if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
+		return;
 
 	if (task->tk_status < 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
 		if (atomic_read(&clp->cl_count) == 1)
 			goto out;
 
-		if (_nfs4_async_handle_error(task, NULL, clp, NULL)
-				== -EAGAIN) {
-			nfs_restart_rpc(task, clp);
+		if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
+			rpc_restart_call_prepare(task);
 			return;
 		}
 	}
 	dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
 out:
-	kfree(task->tk_msg.rpc_argp);
-	kfree(task->tk_msg.rpc_resp);
-
 	dprintk("<-- %s\n", __func__);
 }
 
 static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
 {
-	struct nfs_client *clp;
+	struct nfs4_sequence_data *calldata = data;
+	struct nfs_client *clp = calldata->clp;
 	struct nfs4_sequence_args *args;
 	struct nfs4_sequence_res *res;
 
-	clp = (struct nfs_client *)data;
 	args = task->tk_msg.rpc_argp;
 	res = task->tk_msg.rpc_resp;
 
-	if (nfs4_setup_sequence(clp, args, res, 0, task))
+	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
 		return;
 	rpc_call_start(task);
 }
@@ -5093,32 +5164,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
 	.rpc_release = nfs41_sequence_release,
 };
 
-static int nfs41_proc_async_sequence(struct nfs_client *clp,
-		struct rpc_cred *cred)
+static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 {
-	struct nfs4_sequence_args *args;
-	struct nfs4_sequence_res *res;
+	struct nfs4_sequence_data *calldata;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
 		.rpc_cred = cred,
 	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_sequence_ops,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+	};
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
-		return -EIO;
-	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	res = kzalloc(sizeof(*res), GFP_KERNEL);
-	if (!args || !res) {
-		kfree(args);
-		kfree(res);
+		return ERR_PTR(-EIO);
+	calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
+	if (calldata == NULL) {
 		nfs_put_client(clp);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
-	res->sr_slotid = NFS4_MAX_SLOT_TABLE;
-	msg.rpc_argp = args;
-	msg.rpc_resp = res;
+	calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	calldata->clp = clp;
+	task_setup_data.callback_data = calldata;
 
-	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
-			&nfs41_sequence_ops, (void *)clp);
+	return rpc_run_task(&task_setup_data);
+}
+
+static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret = 0;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else
+		rpc_put_task(task);
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
+}
+
+static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+{
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_proc_sequence(clp, cred);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret)
+		ret = task->tk_status;
+	rpc_put_task(task);
+out:
+	dprintk("<-- %s status=%d\n", __func__, ret);
+	return ret;
 }
 
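
The SEQUENCE lease-renewal machinery is rebuilt around a single _nfs41_proc_sequence() that returns the rpc_task itself instead of a status. The async wrapper simply drops its task reference, while the new synchronous nfs4_proc_sequence() waits for completion and then reads tk_status; both share the same callback ops, and the per-call nfs4_sequence_data is now freed in .rpc_release rather than in the completion callback, which keeps the argument and result buffers alive across a restarted task. The sync-over-async wrapper pattern in isolation:

	static int demo_call_sync(struct rpc_task *task)
	{
		int ret;

		if (IS_ERR(task))
			return PTR_ERR(task);
		ret = rpc_wait_for_completion_task(task); /* interruptible */
		if (ret == 0)
			ret = task->tk_status;	/* final RPC status */
		rpc_put_task(task);		/* drop our reference */
		return ret;
	}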
 struct nfs4_reclaim_complete_data {
@@ -5132,13 +5238,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 	struct nfs4_reclaim_complete_data *calldata = data;
 
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-	if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+	if (nfs41_setup_sequence(calldata->clp->cl_session,
+			&calldata->arg.seq_args,
 			&calldata->res.seq_res, 0, task))
 		return;
 
 	rpc_call_start(task);
 }
 
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+		break;
+	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
+}
+
 static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_reclaim_complete_data *calldata = data;
@@ -5146,32 +5270,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 	struct nfs4_sequence_res *res = &calldata->res.seq_res;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(clp, res, task->tk_status);
-	switch (task->tk_status) {
-	case 0:
-	case -NFS4ERR_COMPLETE_ALREADY:
-		break;
-	case -NFS4ERR_BADSESSION:
-	case -NFS4ERR_DEADSESSION:
-		/*
-		 * Handle the session error, but do not retry the operation, as
-		 * we have no way of telling whether the clientid had to be
-		 * reset before we got our reply. If reset, a new wave of
-		 * reclaim operations will follow, containing their own reclaim
-		 * complete. We don't want our retry to get on the way of
-		 * recovery by incorrectly indicating to the server that we're
-		 * done reclaiming state since the process had to be restarted.
-		 */
-		_nfs4_async_handle_error(task, NULL, clp, NULL);
-		break;
-	default:
-		if (_nfs4_async_handle_error(
-				task, NULL, clp, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
-		}
-	}
+	if (!nfs41_sequence_done(task, res))
+		return;
 
+	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -5207,7 +5312,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 	int status = -ENOMEM;
 
 	dprintk("--> %s\n", __func__);
-	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
 	if (calldata == NULL)
 		goto out;
 	calldata->clp = clp;
@@ -5285,28 +5390,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 };
 #endif
 
-/*
- * Per minor version reboot and network partition recovery ops
- */
-
-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
-	&nfs40_reboot_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_reboot_recovery_ops,
-#endif
-};
-
-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
-	&nfs40_nograce_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_nograce_recovery_ops,
-#endif
-};
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.call_sync = _nfs4_call_sync,
+	.validate_stateid = nfs4_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+	.state_renewal_ops = &nfs40_state_renewal_ops,
+};
 
-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
-	&nfs40_state_renewal_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_state_renewal_ops,
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.call_sync = _nfs4_call_sync_session,
+	.validate_stateid = nfs41_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
+};
 #endif
+
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
+#if defined(CONFIG_NFS_V4_1)
+	[1] = &nfs_v4_1_minor_ops,
+#endif
 };
 
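
The three parallel per-minor-version arrays (reboot recovery, no-grace recovery, state renewal), each indexed by clp->cl_minorversion, collapse into a single nfs4_minor_version_ops table that is looked up once and cached on the client as clp->cl_mvops. Consumers change from an array lookup per ops family to one pointer chase, as the nfs4renewd.c and nfs4state.c hunks below show:

	/* before: one array per ops family, indexed at every use */
	ops = nfs4_state_renewal_ops[clp->cl_minorversion];

	/* after: a single ops struct hangs off the client */
	ops = clp->cl_mvops->state_renewal_ops;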
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b72..72b6c580af13 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
 void
 nfs4_renew_state(struct work_struct *work)
 {
-	struct nfs4_state_maintenance_ops *ops;
+	const struct nfs4_state_maintenance_ops *ops;
 	struct nfs_client *clp =
 		container_of(work, struct nfs_client, cl_renewd.work);
 	struct rpc_cred *cred;
 	long lease;
 	unsigned long last, now;
 
-	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
+	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6c5ed51f105e..3e2f19b04c06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,6 +62,7 @@ static LIST_HEAD(nfs4_clientid_list);
 
 int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
+	struct nfs4_setclientid_res clid;
 	unsigned short port;
 	int status;
 
@@ -69,11 +70,15 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	if (clp->cl_addr.ss_family == AF_INET6)
 		port = nfs_callback_tcpport6;
 
-	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
-	if (status == 0)
-		status = nfs4_proc_setclientid_confirm(clp, cred);
-	if (status == 0)
-		nfs4_schedule_state_renewal(clp);
+	status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid);
+	if (status != 0)
+		goto out;
+	status = nfs4_proc_setclientid_confirm(clp, &clid, cred);
+	if (status != 0)
+		goto out;
+	clp->cl_clientid = clid.clientid;
+	nfs4_schedule_state_renewal(clp);
+out:
	return status;
 }
 
@@ -140,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
 	struct nfs4_session *ses = clp->cl_session;
 	int max_slots;
 
-	if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
+	if (ses == NULL)
+		return;
+	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
 		spin_lock(&ses->fc_slot_table.slot_tbl_lock);
 		max_slots = ses->fc_slot_table.max_slots;
 		while (max_slots--) {
@@ -162,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	struct nfs4_slot_table *tbl = &ses->fc_slot_table;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
+	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	if (tbl->highest_used_slotid != -1) {
 		INIT_COMPLETION(ses->complete);
 		spin_unlock(&tbl->slot_tbl_lock);
@@ -361,12 +368,11 @@ nfs4_alloc_state_owner(void)
 {
 	struct nfs4_state_owner *sp;
 
-	sp = kzalloc(sizeof(*sp),GFP_KERNEL);
+	sp = kzalloc(sizeof(*sp),GFP_NOFS);
 	if (!sp)
 		return NULL;
 	spin_lock_init(&sp->so_lock);
 	INIT_LIST_HEAD(&sp->so_states);
-	INIT_LIST_HEAD(&sp->so_delegations);
 	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
 	sp->so_seqid.sequence = &sp->so_sequence;
 	spin_lock_init(&sp->so_sequence.lock);
@@ -379,7 +385,7 @@ static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
 	if (!RB_EMPTY_NODE(&sp->so_client_node)) {
-		struct nfs_client *clp = sp->so_client;
+		struct nfs_client *clp = sp->so_server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
 		rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -401,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	new = nfs4_alloc_state_owner();
 	if (new == NULL)
 		return NULL;
-	new->so_client = clp;
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
@@ -418,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_server->nfs_client;
 	struct rpc_cred *cred = sp->so_cred;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -435,7 +440,7 @@ nfs4_alloc_open_state(void)
 {
 	struct nfs4_state *state;
 
-	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	state = kzalloc(sizeof(*state), GFP_NOFS);
 	if (!state)
 		return NULL;
 	atomic_set(&state->count, 1);
@@ -537,7 +542,8 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state,
+		fmode_t fmode, gfp_t gfp_mask, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
@@ -578,17 +584,17 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fm
 		nfs4_put_open_state(state);
 		nfs4_put_state_owner(owner);
 	} else
-		nfs4_do_close(path, state, wait);
+		nfs4_do_close(path, state, gfp_mask, wait);
 }
 
 void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 0);
+	__nfs4_close(path, state, fmode, GFP_NOFS, 0);
 }
 
 void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, fmode, 1);
+	__nfs4_close(path, state, fmode, GFP_KERNEL, 1);
 }
 
 /*
@@ -596,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
  * that is compatible with current->files
  */
 static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *pos;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (pos->ls_owner != fl_owner)
+		if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
 			continue;
+		switch (pos->ls_owner.lo_type) {
+		case NFS4_POSIX_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.posix_owner != fl_owner)
+				continue;
+			break;
+		case NFS4_FLOCK_LOCK_TYPE:
+			if (pos->ls_owner.lo_u.flock_owner != fl_pid)
+				continue;
+		}
 		atomic_inc(&pos->ls_count);
 		return pos;
 	}
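
Lock owners are no longer a bare fl_owner_t: ls_owner becomes a tagged union so that POSIX locks (keyed by fl_owner_t) and BSD flock() locks (keyed by process id) can coexist on the same nfs4_state, with NFS4_ANY_LOCK_TYPE letting stateid lookup match either kind. The owner type is defined in the NFS headers; reconstructed from its uses in this hunk, it presumably looks like:

	#define NFS4_ANY_LOCK_TYPE	(0U)
	#define NFS4_FLOCK_LOCK_TYPE	(1U << 0)
	#define NFS4_POSIX_LOCK_TYPE	(1U << 1)

	struct nfs4_lock_owner {
		unsigned int lo_type;		/* which lo_u member is valid */
		union {
			fl_owner_t posix_owner;	/* POSIX: per open file */
			pid_t flock_owner;	/* flock(): per process */
		} lo_u;
	};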
@@ -613,12 +628,12 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_server->nfs_client;
 
-	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
+	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
 	if (lsp == NULL)
 		return NULL;
 	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue");
@@ -627,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	lsp->ls_seqid.sequence = &lsp->ls_sequence;
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner = fl_owner;
+	lsp->ls_owner.lo_type = type;
+	switch (lsp->ls_owner.lo_type) {
+	case NFS4_FLOCK_LOCK_TYPE:
+		lsp->ls_owner.lo_u.flock_owner = fl_pid;
+		break;
+	case NFS4_POSIX_LOCK_TYPE:
+		lsp->ls_owner.lo_u.posix_owner = fl_owner;
+		break;
+	default:
+		kfree(lsp);
+		return NULL;
+	}
 	spin_lock(&clp->cl_lock);
 	nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
 	spin_unlock(&clp->cl_lock);
@@ -637,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 
 static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
 {
-	struct nfs_client *clp = lsp->ls_state->owner->so_client;
+	struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
 
 	spin_lock(&clp->cl_lock);
 	nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -651,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * exists, return an uninitialized one.
 *
 */
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
 {
 	struct nfs4_lock_state *lsp, *new = NULL;
 
 	for(;;) {
 		spin_lock(&state->state_lock);
-		lsp = __nfs4_find_lock_state(state, owner);
+		lsp = __nfs4_find_lock_state(state, owner, pid, type);
 		if (lsp != NULL)
 			break;
 		if (new != NULL) {
@@ -668,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
 			break;
 		}
 		spin_unlock(&state->state_lock);
-		new = nfs4_alloc_lock_state(state, owner);
+		new = nfs4_alloc_lock_state(state, owner, pid, type);
 		if (new == NULL)
 			return NULL;
 	}
@@ -695,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
+		nfs4_release_lockowner(lsp);
 	nfs4_free_lock_state(lsp);
 }
 
@@ -722,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (fl->fl_flags & FL_POSIX)
+		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
+	else if (fl->fl_flags & FL_FLOCK)
+		lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
+	else
+		return -EINVAL;
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
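
nfs4_set_lock_state() now dispatches on the lock flavor: FL_POSIX locks key the lock state on fl->fl_owner, FL_FLOCK locks on fl->fl_pid, and anything else is rejected with -EINVAL. This is what lets flock() map onto NFSv4 byte-range locking. Seen from userspace, both flavors reach the same ->lock() path on an NFSv4 mount (a hedged illustration; error handling omitted):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/file.h>

	void demo_lock(int fd)
	{
		/* l_start = 0, l_len = 0 covers the whole file */
		struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };

		fcntl(fd, F_SETLK, &fl);	/* POSIX lock: FL_POSIX, keyed on owner */
		flock(fd, LOCK_EX);		/* BSD lock: FL_FLOCK, keyed on pid */
	}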
@@ -734,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
 {
 	struct nfs4_lock_state *lsp;
 	int seq;
@@ -747,18 +780,18 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
 		return;
 
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner);
+	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
 	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
 		memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
 	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
 }
 
-struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
+struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
 {
 	struct nfs_seqid *new;
 
-	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	new = kmalloc(sizeof(*new), gfp_mask);
 	if (new != NULL) {
 		new->sequence = counter;
 		INIT_LIST_HEAD(&new->list);
@@ -1035,11 +1068,11 @@ restart:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_RECLAIM_BAD:
 		case -NFS4ERR_RECLAIM_CONFLICT:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 			break;
 		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_NO_GRACE:
-			nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+			nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -1114,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
 		return;
 
-	nfs4_reclaim_complete(clp,
-		nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
 
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1205,8 +1237,8 @@ restart:
 static int nfs4_check_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_maintenance_ops *ops =
-		nfs4_state_renewal_ops[clp->cl_minorversion];
+	const struct nfs4_state_maintenance_ops *ops =
+		clp->cl_mvops->state_renewal_ops;
 	int status = -NFS4ERR_EXPIRED;
 
 	/* Is the client already known to have an expired lease? */
@@ -1229,8 +1261,8 @@ out:
 static int nfs4_reclaim_lease(struct nfs_client *clp)
 {
 	struct rpc_cred *cred;
-	struct nfs4_state_recovery_ops *ops =
-		nfs4_reboot_recovery_ops[clp->cl_minorversion];
+	const struct nfs4_state_recovery_ops *ops =
+		clp->cl_mvops->reboot_recovery_ops;
 	int status = -ENOENT;
 
 	cred = ops->get_clid_cred(clp);
@@ -1347,7 +1379,7 @@ static int nfs4_recall_slot(struct nfs_client *clp)
 
 	nfs4_begin_drain_session(clp);
 	new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
-			GFP_KERNEL);
+			GFP_NOFS);
 	if (!new)
 		return -ENOMEM;
 
@@ -1438,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 	/* First recover reboot state... */
 	if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
 		status = nfs4_do_reclaim(clp,
-			nfs4_reboot_recovery_ops[clp->cl_minorversion]);
+			clp->cl_mvops->reboot_recovery_ops);
1442 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1474 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1443 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) 1475 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
1444 continue; 1476 continue;
@@ -1452,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1452 /* Now recover expired state... */ 1484 /* Now recover expired state... */
1453 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1485 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1454 status = nfs4_do_reclaim(clp, 1486 status = nfs4_do_reclaim(clp,
1455 nfs4_nograce_recovery_ops[clp->cl_minorversion]); 1487 clp->cl_mvops->nograce_recovery_ops);
1456 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1488 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1457 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || 1489 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
1458 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1490 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
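
The nfs4state.c hunks above thread an (owner, pid, type) triple through lock-state lookup and allocation, so POSIX locks (keyed by fl_owner) and BSD flock() locks (keyed by fl_pid) each get their own nfs4_lock_state, and the server-side lockowner is released once the last reference drops. A minimal userspace sketch of the new dispatch in nfs4_set_lock_state(); the flag values and the lookup helper here are illustrative stand-ins, not the kernel's definitions:

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins; the real values live in the kernel headers. */
#define FL_POSIX 1
#define FL_FLOCK 2

enum lock_type { POSIX_LOCK_TYPE, FLOCK_LOCK_TYPE };

struct file_lock {
        unsigned int fl_flags;
        void *fl_owner;        /* owning files_struct, POSIX locks */
        int fl_pid;            /* owning pid, flock() locks        */
};

/* Stub for the lookup/allocation done by nfs4_get_lock_state(). */
static int get_lock_state(void *owner, int pid, enum lock_type type)
{
        printf("lock state keyed by %s\n",
               type == POSIX_LOCK_TYPE ? "fl_owner" : "fl_pid");
        return 0;
}

/* Mirrors the new dispatch: choose the key by lock flavour. */
static int set_lock_state(struct file_lock *fl)
{
        if (fl->fl_flags & FL_POSIX)
                return get_lock_state(fl->fl_owner, 0, POSIX_LOCK_TYPE);
        if (fl->fl_flags & FL_FLOCK)
                return get_lock_state(NULL, fl->fl_pid, FLOCK_LOCK_TYPE);
        return -EINVAL;
}

int main(void)
{
        struct file_lock posix = { .fl_flags = FL_POSIX };
        struct file_lock bsd   = { .fl_flags = FL_FLOCK, .fl_pid = 42 };

        posix.fl_owner = &posix;
        return set_lock_state(&posix) || set_lock_state(&bsd);
}
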
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 38f3b582e7c2..08ef91291132 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int);
202#define encode_link_maxsz (op_encode_hdr_maxsz + \ 202#define encode_link_maxsz (op_encode_hdr_maxsz + \
203 nfs4_name_maxsz) 203 nfs4_name_maxsz)
204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) 204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
205#define encode_lockowner_maxsz (7)
205#define encode_lock_maxsz (op_encode_hdr_maxsz + \ 206#define encode_lock_maxsz (op_encode_hdr_maxsz + \
206 7 + \ 207 7 + \
207 1 + encode_stateid_maxsz + 8) 208 1 + encode_stateid_maxsz + 1 + \
209 encode_lockowner_maxsz)
208#define decode_lock_denied_maxsz \ 210#define decode_lock_denied_maxsz \
209 (8 + decode_lockowner_maxsz) 211 (8 + decode_lockowner_maxsz)
210#define decode_lock_maxsz (op_decode_hdr_maxsz + \ 212#define decode_lock_maxsz (op_decode_hdr_maxsz + \
211 decode_lock_denied_maxsz) 213 decode_lock_denied_maxsz)
212#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) 214#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
215 encode_lockowner_maxsz)
213#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ 216#define decode_lockt_maxsz (op_decode_hdr_maxsz + \
214 decode_lock_denied_maxsz) 217 decode_lock_denied_maxsz)
215#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ 218#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
@@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int);
217 4) 220 4)
218#define decode_locku_maxsz (op_decode_hdr_maxsz + \ 221#define decode_locku_maxsz (op_decode_hdr_maxsz + \
219 decode_stateid_maxsz) 222 decode_stateid_maxsz)
223#define encode_release_lockowner_maxsz \
224 (op_encode_hdr_maxsz + \
225 encode_lockowner_maxsz)
226#define decode_release_lockowner_maxsz \
227 (op_decode_hdr_maxsz)
220#define encode_access_maxsz (op_encode_hdr_maxsz + 1) 228#define encode_access_maxsz (op_encode_hdr_maxsz + 1)
221#define decode_access_maxsz (op_decode_hdr_maxsz + 2) 229#define decode_access_maxsz (op_decode_hdr_maxsz + 2)
222#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 230#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
@@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int);
471 decode_sequence_maxsz + \ 479 decode_sequence_maxsz + \
472 decode_putfh_maxsz + \ 480 decode_putfh_maxsz + \
473 decode_locku_maxsz) 481 decode_locku_maxsz)
482#define NFS4_enc_release_lockowner_sz \
483 (compound_encode_hdr_maxsz + \
484 encode_lockowner_maxsz)
485#define NFS4_dec_release_lockowner_sz \
486 (compound_decode_hdr_maxsz + \
487 decode_lockowner_maxsz)
474#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 488#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
475 encode_sequence_maxsz + \ 489 encode_sequence_maxsz + \
476 encode_putfh_maxsz + \ 490 encode_putfh_maxsz + \
@@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
744 struct compound_hdr *hdr) 758 struct compound_hdr *hdr)
745{ 759{
746 __be32 *p; 760 __be32 *p;
747 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 761 struct rpc_auth *auth = req->rq_cred->cr_auth;
748 762
749 /* initialize running count of expected bytes in reply. 763 /* initialize running count of expected bytes in reply.
750 * NOTE: the replied tag SHOULD be the same as the one sent, 764
@@ -862,8 +876,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
862 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 876 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
863 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); 877 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
864 *p++ = cpu_to_be32(0); 878 *p++ = cpu_to_be32(0);
865 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); 879 *p++ = cpu_to_be32(iap->ia_atime.tv_sec);
866 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); 880 *p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
867 } 881 }
868 else if (iap->ia_valid & ATTR_ATIME) { 882 else if (iap->ia_valid & ATTR_ATIME) {
869 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 883 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
@@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
1042 return fl->fl_end - fl->fl_start + 1; 1056 return fl->fl_end - fl->fl_start + 1;
1043} 1057}
1044 1058
1059static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
1060{
1061 __be32 *p;
1062
1063 p = reserve_space(xdr, 28);
1064 p = xdr_encode_hyper(p, lowner->clientid);
1065 *p++ = cpu_to_be32(16);
1066 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1067 xdr_encode_hyper(p, lowner->id);
1068}
1069
1045/* 1070/*
1046 * opcode,type,reclaim,offset,length,new_lock_owner = 32 1071 * opcode,type,reclaim,offset,length,new_lock_owner = 32
1047 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 1072 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1058 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1083 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1059 *p = cpu_to_be32(args->new_lock_owner); 1084 *p = cpu_to_be32(args->new_lock_owner);
1060 if (args->new_lock_owner){ 1085 if (args->new_lock_owner){
1061 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); 1086 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1062 *p++ = cpu_to_be32(args->open_seqid->sequence->counter); 1087 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1063 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); 1088 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1064 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); 1089 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1065 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1090 encode_lockowner(xdr, &args->lock_owner);
1066 *p++ = cpu_to_be32(16);
1067 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1068 xdr_encode_hyper(p, args->lock_owner.id);
1069 } 1091 }
1070 else { 1092 else {
1071 p = reserve_space(xdr, NFS4_STATEID_SIZE+4); 1093 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1080{ 1102{
1081 __be32 *p; 1103 __be32 *p;
1082 1104
1083 p = reserve_space(xdr, 52); 1105 p = reserve_space(xdr, 24);
1084 *p++ = cpu_to_be32(OP_LOCKT); 1106 *p++ = cpu_to_be32(OP_LOCKT);
1085 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1107 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1086 p = xdr_encode_hyper(p, args->fl->fl_start); 1108 p = xdr_encode_hyper(p, args->fl->fl_start);
1087 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1109 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1088 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1110 encode_lockowner(xdr, &args->lock_owner);
1089 *p++ = cpu_to_be32(16);
1090 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1091 xdr_encode_hyper(p, args->lock_owner.id);
1092 hdr->nops++; 1111 hdr->nops++;
1093 hdr->replen += decode_lockt_maxsz; 1112 hdr->replen += decode_lockt_maxsz;
1094} 1113}
@@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1108 hdr->replen += decode_locku_maxsz; 1127 hdr->replen += decode_locku_maxsz;
1109} 1128}
1110 1129
1130static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
1131{
1132 __be32 *p;
1133
1134 p = reserve_space(xdr, 4);
1135 *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
1136 encode_lockowner(xdr, lowner);
1137 hdr->nops++;
1138 hdr->replen += decode_release_lockowner_maxsz;
1139}
1140
1111static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1141static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1112{ 1142{
1113 int len = name->len; 1143 int len = name->len;
@@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1172 break; 1202 break;
1173 default: 1203 default:
1174 clp = arg->server->nfs_client; 1204 clp = arg->server->nfs_client;
1175 if (clp->cl_minorversion > 0) { 1205 if (clp->cl_mvops->minor_version > 0) {
1176 if (nfs4_has_persistent_session(clp)) { 1206 if (nfs4_has_persistent_session(clp)) {
1177 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1207 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1178 encode_attrs(xdr, arg->u.attrs, arg->server); 1208 encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1324 hdr->replen += decode_putrootfh_maxsz; 1354 hdr->replen += decode_putrootfh_maxsz;
1325} 1355}
1326 1356
1327static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1357static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
1328{ 1358{
1329 nfs4_stateid stateid; 1359 nfs4_stateid stateid;
1330 __be32 *p; 1360 __be32 *p;
1331 1361
1332 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1362 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1333 if (ctx->state != NULL) { 1363 if (ctx->state != NULL) {
1334 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1364 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1335 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1365 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1336 } else 1366 } else
1337 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1367 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1344 p = reserve_space(xdr, 4); 1374 p = reserve_space(xdr, 4);
1345 *p = cpu_to_be32(OP_READ); 1375 *p = cpu_to_be32(OP_READ);
1346 1376
1347 encode_stateid(xdr, args->context); 1377 encode_stateid(xdr, args->context, args->lock_context);
1348 1378
1349 p = reserve_space(xdr, 12); 1379 p = reserve_space(xdr, 12);
1350 p = xdr_encode_hyper(p, args->offset); 1380 p = xdr_encode_hyper(p, args->offset);
@@ -1504,14 +1534,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1504 hdr->replen += decode_setclientid_maxsz; 1534 hdr->replen += decode_setclientid_maxsz;
1505} 1535}
1506 1536
1507static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) 1537static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
1508{ 1538{
1509 __be32 *p; 1539 __be32 *p;
1510 1540
1511 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); 1541 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1512 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); 1542 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1513 p = xdr_encode_hyper(p, client_state->cl_clientid); 1543 p = xdr_encode_hyper(p, arg->clientid);
1514 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1544 xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE);
1515 hdr->nops++; 1545 hdr->nops++;
1516 hdr->replen += decode_setclientid_confirm_maxsz; 1546 hdr->replen += decode_setclientid_confirm_maxsz;
1517} 1547}
@@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1523 p = reserve_space(xdr, 4); 1553 p = reserve_space(xdr, 4);
1524 *p = cpu_to_be32(OP_WRITE); 1554 *p = cpu_to_be32(OP_WRITE);
1525 1555
1526 encode_stateid(xdr, args->context); 1556 encode_stateid(xdr, args->context, args->lock_context);
1527 1557
1528 p = reserve_space(xdr, 16); 1558 p = reserve_space(xdr, 16);
1529 p = xdr_encode_hyper(p, args->offset); 1559 p = xdr_encode_hyper(p, args->offset);
@@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1704{ 1734{
1705#if defined(CONFIG_NFS_V4_1) 1735#if defined(CONFIG_NFS_V4_1)
1706 if (args->sa_session) 1736 if (args->sa_session)
1707 return args->sa_session->clp->cl_minorversion; 1737 return args->sa_session->clp->cl_mvops->minor_version;
1708#endif /* CONFIG_NFS_V4_1 */ 1738#endif /* CONFIG_NFS_V4_1 */
1709 return 0; 1739 return 0;
1710} 1740}
@@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
2048 return 0; 2078 return 0;
2049} 2079}
2050 2080
2081static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
2082{
2083 struct xdr_stream xdr;
2084 struct compound_hdr hdr = {
2085 .minorversion = 0,
2086 };
2087
2088 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2089 encode_compound_hdr(&xdr, req, &hdr);
2090 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2091 encode_nops(&hdr);
2092 return 0;
2093}
2094
2051/* 2095/*
2052 * Encode a READLINK request 2096 * Encode a READLINK request
2053 */ 2097 */
@@ -2324,7 +2368,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2324/* 2368/*
2325 * a SETCLIENTID_CONFIRM request 2369 * a SETCLIENTID_CONFIRM request
2326 */ 2370 */
2327static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp) 2371static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid_res *arg)
2328{ 2372{
2329 struct xdr_stream xdr; 2373 struct xdr_stream xdr;
2330 struct compound_hdr hdr = { 2374 struct compound_hdr hdr = {
@@ -2334,7 +2378,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2334 2378
2335 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2336 encode_compound_hdr(&xdr, req, &hdr); 2380 encode_compound_hdr(&xdr, req, &hdr);
2337 encode_setclientid_confirm(&xdr, clp, &hdr); 2381 encode_setclientid_confirm(&xdr, arg, &hdr);
2338 encode_putrootfh(&xdr, &hdr); 2382 encode_putrootfh(&xdr, &hdr);
2339 encode_fsinfo(&xdr, lease_bitmap, &hdr); 2383 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2340 encode_nops(&hdr); 2384 encode_nops(&hdr);
@@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
2395{ 2439{
2396 struct xdr_stream xdr; 2440 struct xdr_stream xdr;
2397 struct compound_hdr hdr = { 2441 struct compound_hdr hdr = {
2398 .minorversion = args->client->cl_minorversion, 2442 .minorversion = args->client->cl_mvops->minor_version,
2399 }; 2443 };
2400 2444
2401 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2445 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
2413{ 2457{
2414 struct xdr_stream xdr; 2458 struct xdr_stream xdr;
2415 struct compound_hdr hdr = { 2459 struct compound_hdr hdr = {
2416 .minorversion = args->client->cl_minorversion, 2460 .minorversion = args->client->cl_mvops->minor_version,
2417 }; 2461 };
2418 2462
2419 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2463 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
2431{ 2475{
2432 struct xdr_stream xdr; 2476 struct xdr_stream xdr;
2433 struct compound_hdr hdr = { 2477 struct compound_hdr hdr = {
2434 .minorversion = session->clp->cl_minorversion, 2478 .minorversion = session->clp->cl_mvops->minor_version,
2435 }; 2479 };
2436 2480
2437 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2481 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3973 return status; 4017 return status;
3974} 4018}
3975 4019
4020static int decode_release_lockowner(struct xdr_stream *xdr)
4021{
4022 return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
4023}
4024
3976static int decode_lookup(struct xdr_stream *xdr) 4025static int decode_lookup(struct xdr_stream *xdr)
3977{ 4026{
3978 return decode_op_hdr(xdr, OP_LOOKUP); 4027 return decode_op_hdr(xdr, OP_LOOKUP);
@@ -4397,7 +4446,7 @@ out_overflow:
4397 return -EIO; 4446 return -EIO;
4398} 4447}
4399 4448
4400static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4449static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
4401{ 4450{
4402 __be32 *p; 4451 __be32 *p;
4403 uint32_t opnum; 4452 uint32_t opnum;
@@ -4417,8 +4466,8 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4417 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); 4466 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4418 if (unlikely(!p)) 4467 if (unlikely(!p))
4419 goto out_overflow; 4468 goto out_overflow;
4420 p = xdr_decode_hyper(p, &clp->cl_clientid); 4469 p = xdr_decode_hyper(p, &res->clientid);
4421 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); 4470 memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE);
4422 } else if (nfserr == NFSERR_CLID_INUSE) { 4471 } else if (nfserr == NFSERR_CLID_INUSE) {
4423 uint32_t len; 4472 uint32_t len;
4424 4473
@@ -4815,7 +4864,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4815 goto out; 4864 goto out;
4816 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4865 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4817 goto out; 4866 goto out;
4818 decode_getfattr(&xdr, &res->dir_attr, res->server, 4867 decode_getfattr(&xdr, res->dir_attr, res->server,
4819 !RPC_IS_ASYNC(rqstp->rq_task)); 4868 !RPC_IS_ASYNC(rqstp->rq_task));
4820out: 4869out:
4821 return status; 4870 return status;
@@ -5259,6 +5308,19 @@ out:
5259 return status; 5308 return status;
5260} 5309}
5261 5310
5311static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5312{
5313 struct xdr_stream xdr;
5314 struct compound_hdr hdr;
5315 int status;
5316
5317 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5318 status = decode_compound_hdr(&xdr, &hdr);
5319 if (!status)
5320 status = decode_release_lockowner(&xdr);
5321 return status;
5322}
5323
5262/* 5324/*
5263 * Decode READLINK response 5325 * Decode READLINK response
5264 */ 5326 */
@@ -5498,7 +5560,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5498 * Decode SETCLIENTID response 5560 * Decode SETCLIENTID response
5499 */ 5561 */
5500static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, 5562static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5501 struct nfs_client *clp) 5563 struct nfs4_setclientid_res *res)
5502{ 5564{
5503 struct xdr_stream xdr; 5565 struct xdr_stream xdr;
5504 struct compound_hdr hdr; 5566 struct compound_hdr hdr;
@@ -5507,7 +5569,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
5507 xdr_init_decode(&xdr, &req->rq_rcv_buf, p); 5569 xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
5508 status = decode_compound_hdr(&xdr, &hdr); 5570 status = decode_compound_hdr(&xdr, &hdr);
5509 if (!status) 5571 if (!status)
5510 status = decode_setclientid(&xdr, clp); 5572 status = decode_setclientid(&xdr, res);
5511 return status; 5573 return status;
5512} 5574}
5513 5575
@@ -5866,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5866 PROC(GETACL, enc_getacl, dec_getacl), 5928 PROC(GETACL, enc_getacl, dec_getacl),
5867 PROC(SETACL, enc_setacl, dec_setacl), 5929 PROC(SETACL, enc_setacl, dec_setacl),
5868 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 5930 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
5931 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5869#if defined(CONFIG_NFS_V4_1) 5932#if defined(CONFIG_NFS_V4_1)
5870 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 5933 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5871 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 5934 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
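
Worth noting in the nfs4xdr.c changes: the new encode_lockowner() helper centralizes the owner encoding that LOCK and LOCKT previously open-coded, and the new RELEASE_LOCKOWNER operation reuses it. The reserve_space(xdr, 28) matches the wire layout exactly: an 8-byte clientid, a 4-byte opaque length of 16, the fixed "lock id:" prefix (8 bytes), and an 8-byte id; encode_lockowner_maxsz is the same 28 bytes expressed as 7 XDR words. A self-contained sketch of that layout, doing the big-endian packing by hand rather than through the kernel's xdr_stream helpers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode 32-/64-bit values big-endian, as XDR requires. */
static uint8_t *enc32(uint8_t *p, uint32_t v)
{
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = (uint8_t)v;
        return p + 4;
}

static uint8_t *enc64(uint8_t *p, uint64_t v)
{
        p = enc32(p, (uint32_t)(v >> 32));
        return enc32(p, (uint32_t)v);
}

/* lock_owner4: clientid + opaque<> owner, where the owner string is
 * "lock id:" followed by a 64-bit id -- 28 bytes in total. */
static size_t encode_lockowner(uint8_t *p, uint64_t clientid, uint64_t id)
{
        uint8_t *start = p;

        p = enc64(p, clientid);      /*  8 bytes: clientid      */
        p = enc32(p, 16);            /*  4 bytes: opaque length */
        memcpy(p, "lock id:", 8);    /*  8 bytes: fixed prefix  */
        p = enc64(p + 8, id);        /*  8 bytes: owner id      */
        return (size_t)(p - start);
}

int main(void)
{
        uint8_t buf[28];
        size_t n = encode_lockowner(buf, 0x1122334455667788ULL, 7);

        assert(n == sizeof(buf));    /* matches reserve_space(xdr, 28) */
        printf("encoded %zu bytes\n", n);
        return 0;
}
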
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8c55b27c0de4..df101d9f546a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = "";
105static __be32 servaddr __initdata = 0; 105static __be32 servaddr __initdata = 0;
106 106
107/* Name of directory to mount */ 107/* Name of directory to mount */
108static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; 108static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, };
109 109
110/* NFS-related data */ 110/* NFS-related data */
111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 111static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -488,7 +488,6 @@ static int __init root_nfs_ports(void)
488 */ 488 */
489static int __init root_nfs_get_handle(void) 489static int __init root_nfs_get_handle(void)
490{ 490{
491 struct nfs_fh fh;
492 struct sockaddr_in sin; 491 struct sockaddr_in sin;
493 unsigned int auth_flav_len = 0; 492 unsigned int auth_flav_len = 0;
494 struct nfs_mount_request request = { 493 struct nfs_mount_request request = {
@@ -499,21 +498,24 @@ static int __init root_nfs_get_handle(void)
499 NFS_MNT3_VERSION : NFS_MNT_VERSION, 498 NFS_MNT3_VERSION : NFS_MNT_VERSION,
500 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 499 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
501 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, 500 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
502 .fh = &fh,
503 .auth_flav_len = &auth_flav_len, 501 .auth_flav_len = &auth_flav_len,
504 }; 502 };
505 int status; 503 int status = -ENOMEM;
506 504
505 request.fh = nfs_alloc_fhandle();
506 if (!request.fh)
507 goto out;
507 set_sockaddr(&sin, servaddr, htons(mount_port)); 508 set_sockaddr(&sin, servaddr, htons(mount_port));
508 status = nfs_mount(&request); 509 status = nfs_mount(&request);
509 if (status < 0) 510 if (status < 0)
510 printk(KERN_ERR "Root-NFS: Server returned error %d " 511 printk(KERN_ERR "Root-NFS: Server returned error %d "
511 "while mounting %s\n", status, nfs_export_path); 512 "while mounting %s\n", status, nfs_export_path);
512 else { 513 else {
513 nfs_data.root.size = fh.size; 514 nfs_data.root.size = request.fh->size;
514 memcpy(nfs_data.root.data, fh.data, fh.size); 515 memcpy(&nfs_data.root.data, request.fh->data, request.fh->size);
515 } 516 }
516 517 nfs_free_fhandle(request.fh);
518out:
517 return status; 519 return status;
518} 520}
519 521
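
The nfsroot.c hunk is one instance of a stack-footprint pattern repeated throughout this series: struct nfs_fh carries up to a 128-byte file handle, so instances move off the small fixed-size kernel stack and into nfs_alloc_fhandle()/nfs_free_fhandle(), with every exit funnelled through one label. A generic sketch of the shape, with plain calloc/free standing in for the NFS allocators and a stubbed fetch in place of nfs_mount():

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#define FHSIZE_MAX 128

struct fhandle {
        unsigned short size;
        unsigned char data[FHSIZE_MAX];
};

/* Stand-in for the MNT call that fills in the root file handle. */
static int fetch_handle(struct fhandle *fh)
{
        fh->size = 4;
        memcpy(fh->data, "root", 4);
        return 0;
}

/* Heap-allocate the ~130-byte struct instead of putting it on the
 * stack, and free it on the single exit path. */
static int get_root_handle(struct fhandle *out)
{
        struct fhandle *fh;
        int status = -ENOMEM;

        fh = calloc(1, sizeof(*fh));
        if (fh == NULL)
                goto out;
        status = fetch_handle(fh);
        if (status == 0)
                *out = *fh;
        free(fh);
out:
        return status;
}

int main(void)
{
        struct fhandle root;

        return get_root_handle(&root) ? 1 : 0;
}
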
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 29d9d36cd5f4..919490232e17 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -60,16 +60,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
60{ 60{
61 struct nfs_page *req; 61 struct nfs_page *req;
62 62
63 for (;;) { 63 /* try to allocate the request struct */
64 /* try to allocate the request struct */ 64 req = nfs_page_alloc();
65 req = nfs_page_alloc(); 65 if (req == NULL)
66 if (req != NULL) 66 return ERR_PTR(-ENOMEM);
67 break;
68
69 if (fatal_signal_pending(current))
70 return ERR_PTR(-ERESTARTSYS);
71 yield();
72 }
73 67
74 /* Initialize the request struct. Initially, we assume a 68 /* Initialize the request struct. Initially, we assume a
75 * long write-back delay. This will be adjusted in 69 * long write-back delay. This will be adjusted in
@@ -85,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
85 req->wb_pgbase = offset; 79 req->wb_pgbase = offset;
86 req->wb_bytes = count; 80 req->wb_bytes = count;
87 req->wb_context = get_nfs_open_context(ctx); 81 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
88 kref_init(&req->wb_kref); 83 kref_init(&req->wb_kref);
89 return req; 84 return req;
90} 85}
@@ -147,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req)
147{ 142{
148 struct page *page = req->wb_page; 143 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context; 144 struct nfs_open_context *ctx = req->wb_context;
145 struct nfs_lock_context *l_ctx = req->wb_lock_context;
150 146
151 if (page != NULL) { 147 if (page != NULL) {
152 page_cache_release(page); 148 page_cache_release(page);
153 req->wb_page = NULL; 149 req->wb_page = NULL;
154 } 150 }
151 if (l_ctx != NULL) {
152 nfs_put_lock_context(l_ctx);
153 req->wb_lock_context = NULL;
154 }
155 if (ctx != NULL) { 155 if (ctx != NULL) {
156 put_nfs_open_context(ctx); 156 put_nfs_open_context(ctx);
157 req->wb_context = NULL; 157 req->wb_context = NULL;
@@ -241,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
241{ 241{
242 if (req->wb_context->cred != prev->wb_context->cred) 242 if (req->wb_context->cred != prev->wb_context->cred)
243 return 0; 243 return 0;
244 if (req->wb_context->lockowner != prev->wb_context->lockowner) 244 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
245 return 0; 245 return 0;
246 if (req->wb_context->state != prev->wb_context->state) 246 if (req->wb_context->state != prev->wb_context->state)
247 return 0; 247 return 0;
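
Two separate improvements land in pagelist.c: nfs_create_request() no longer spins in a yield() loop when nfs_page_alloc() fails, it simply reports -ENOMEM and lets the caller decide, and every nfs_page now pins a reference-counted per-lock context that nfs_clear_request() drops. A sketch of the get/put pairing under the simplifying assumption of a bare atomic counter (the kernel hangs these contexts off the open context and manages them under the inode's lock):

#include <stdatomic.h>
#include <stdlib.h>

struct lock_context {
        atomic_int count;
        void *lockowner;
        int pid;
};

static struct lock_context *get_lock_context(struct lock_context *l)
{
        atomic_fetch_add(&l->count, 1);
        return l;
}

static void put_lock_context(struct lock_context *l)
{
        /* fetch_sub returns the old value: 1 means this was the last ref */
        if (atomic_fetch_sub(&l->count, 1) == 1)
                free(l);
}

struct page_req {
        struct lock_context *wb_lock_context;
};

/* Request creation takes a reference on the lock context... */
static void init_req(struct page_req *req, struct lock_context *l)
{
        req->wb_lock_context = get_lock_context(l);
}

/* ...and teardown drops it exactly once, clearing the pointer so a
 * second clear is a no-op, as in nfs_clear_request(). */
static void clear_req(struct page_req *req)
{
        if (req->wb_lock_context != NULL) {
                put_lock_context(req->wb_lock_context);
                req->wb_lock_context = NULL;
        }
}

int main(void)
{
        struct lock_context *l = calloc(1, sizeof(*l));
        struct page_req req;

        if (l == NULL)
                return 1;
        atomic_init(&l->count, 1);   /* creator's reference           */
        init_req(&req, l);
        clear_req(&req);             /* drops the request's reference */
        put_lock_context(l);         /* drops the creator's; frees    */
        return 0;
}
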
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0288be80444f..611bec22f552 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -224,35 +224,60 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
224 return status; 224 return status;
225} 225}
226 226
227struct nfs_createdata {
228 struct nfs_createargs arg;
229 struct nfs_diropok res;
230 struct nfs_fh fhandle;
231 struct nfs_fattr fattr;
232};
233
234static struct nfs_createdata *nfs_alloc_createdata(struct inode *dir,
235 struct dentry *dentry, struct iattr *sattr)
236{
237 struct nfs_createdata *data;
238
239 data = kmalloc(sizeof(*data), GFP_KERNEL);
240
241 if (data != NULL) {
242 data->arg.fh = NFS_FH(dir);
243 data->arg.name = dentry->d_name.name;
244 data->arg.len = dentry->d_name.len;
245 data->arg.sattr = sattr;
246 nfs_fattr_init(&data->fattr);
247 data->fhandle.size = 0;
248 data->res.fh = &data->fhandle;
249 data->res.fattr = &data->fattr;
250 }
251 return data;
252};
253
254static void nfs_free_createdata(const struct nfs_createdata *data)
255{
256 kfree(data);
257}
258
227static int 259static int
228nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 260nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
229 int flags, struct nameidata *nd) 261 int flags, struct nameidata *nd)
230{ 262{
231 struct nfs_fh fhandle; 263 struct nfs_createdata *data;
232 struct nfs_fattr fattr;
233 struct nfs_createargs arg = {
234 .fh = NFS_FH(dir),
235 .name = dentry->d_name.name,
236 .len = dentry->d_name.len,
237 .sattr = sattr
238 };
239 struct nfs_diropok res = {
240 .fh = &fhandle,
241 .fattr = &fattr
242 };
243 struct rpc_message msg = { 264 struct rpc_message msg = {
244 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 265 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
245 .rpc_argp = &arg,
246 .rpc_resp = &res,
247 }; 266 };
248 int status; 267 int status = -ENOMEM;
249 268
250 nfs_fattr_init(&fattr);
251 dprintk("NFS call create %s\n", dentry->d_name.name); 269 dprintk("NFS call create %s\n", dentry->d_name.name);
270 data = nfs_alloc_createdata(dir, dentry, sattr);
271 if (data == NULL)
272 goto out;
273 msg.rpc_argp = &data->arg;
274 msg.rpc_resp = &data->res;
252 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 275 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
253 nfs_mark_for_revalidate(dir); 276 nfs_mark_for_revalidate(dir);
254 if (status == 0) 277 if (status == 0)
255 status = nfs_instantiate(dentry, &fhandle, &fattr); 278 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
279 nfs_free_createdata(data);
280out:
256 dprintk("NFS reply create: %d\n", status); 281 dprintk("NFS reply create: %d\n", status);
257 return status; 282 return status;
258} 283}
@@ -264,24 +289,12 @@ static int
264nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 289nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
265 dev_t rdev) 290 dev_t rdev)
266{ 291{
267 struct nfs_fh fhandle; 292 struct nfs_createdata *data;
268 struct nfs_fattr fattr;
269 struct nfs_createargs arg = {
270 .fh = NFS_FH(dir),
271 .name = dentry->d_name.name,
272 .len = dentry->d_name.len,
273 .sattr = sattr
274 };
275 struct nfs_diropok res = {
276 .fh = &fhandle,
277 .fattr = &fattr
278 };
279 struct rpc_message msg = { 293 struct rpc_message msg = {
280 .rpc_proc = &nfs_procedures[NFSPROC_CREATE], 294 .rpc_proc = &nfs_procedures[NFSPROC_CREATE],
281 .rpc_argp = &arg,
282 .rpc_resp = &res,
283 }; 295 };
284 int status, mode; 296 umode_t mode;
297 int status = -ENOMEM;
285 298
286 dprintk("NFS call mknod %s\n", dentry->d_name.name); 299 dprintk("NFS call mknod %s\n", dentry->d_name.name);
287 300
@@ -294,17 +307,24 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
294 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */ 307 sattr->ia_size = new_encode_dev(rdev);/* get out your barf bag */
295 } 308 }
296 309
297 nfs_fattr_init(&fattr); 310 data = nfs_alloc_createdata(dir, dentry, sattr);
311 if (data == NULL)
312 goto out;
313 msg.rpc_argp = &data->arg;
314 msg.rpc_resp = &data->res;
315
298 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 316 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
299 nfs_mark_for_revalidate(dir); 317 nfs_mark_for_revalidate(dir);
300 318
301 if (status == -EINVAL && S_ISFIFO(mode)) { 319 if (status == -EINVAL && S_ISFIFO(mode)) {
302 sattr->ia_mode = mode; 320 sattr->ia_mode = mode;
303 nfs_fattr_init(&fattr); 321 nfs_fattr_init(data->res.fattr);
304 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 322 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
305 } 323 }
306 if (status == 0) 324 if (status == 0)
307 status = nfs_instantiate(dentry, &fhandle, &fattr); 325 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
326 nfs_free_createdata(data);
327out:
308 dprintk("NFS reply mknod: %d\n", status); 328 dprintk("NFS reply mknod: %d\n", status);
309 return status; 329 return status;
310} 330}
@@ -398,8 +418,8 @@ static int
398nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, 418nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
399 unsigned int len, struct iattr *sattr) 419 unsigned int len, struct iattr *sattr)
400{ 420{
401 struct nfs_fh fhandle; 421 struct nfs_fh *fh;
402 struct nfs_fattr fattr; 422 struct nfs_fattr *fattr;
403 struct nfs_symlinkargs arg = { 423 struct nfs_symlinkargs arg = {
404 .fromfh = NFS_FH(dir), 424 .fromfh = NFS_FH(dir),
405 .fromname = dentry->d_name.name, 425 .fromname = dentry->d_name.name,
@@ -412,12 +432,18 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
412 .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK], 432 .rpc_proc = &nfs_procedures[NFSPROC_SYMLINK],
413 .rpc_argp = &arg, 433 .rpc_argp = &arg,
414 }; 434 };
415 int status; 435 int status = -ENAMETOOLONG;
436
437 dprintk("NFS call symlink %s\n", dentry->d_name.name);
416 438
417 if (len > NFS2_MAXPATHLEN) 439 if (len > NFS2_MAXPATHLEN)
418 return -ENAMETOOLONG; 440 goto out;
419 441
420 dprintk("NFS call symlink %s\n", dentry->d_name.name); 442 fh = nfs_alloc_fhandle();
443 fattr = nfs_alloc_fattr();
444 status = -ENOMEM;
445 if (fh == NULL || fattr == NULL)
446 goto out;
421 447
422 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 448 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
423 nfs_mark_for_revalidate(dir); 449 nfs_mark_for_revalidate(dir);
@@ -427,12 +453,12 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
427 * filehandle size to zero indicates to nfs_instantiate that it 453 * filehandle size to zero indicates to nfs_instantiate that it
428 * should fill in the data with a LOOKUP call on the wire. 454 * should fill in the data with a LOOKUP call on the wire.
429 */ 455 */
430 if (status == 0) { 456 if (status == 0)
431 nfs_fattr_init(&fattr); 457 status = nfs_instantiate(dentry, fh, fattr);
432 fhandle.size = 0;
433 status = nfs_instantiate(dentry, &fhandle, &fattr);
434 }
435 458
459 nfs_free_fattr(fattr);
460 nfs_free_fhandle(fh);
461out:
436 dprintk("NFS reply symlink: %d\n", status); 462 dprintk("NFS reply symlink: %d\n", status);
437 return status; 463 return status;
438} 464}
@@ -440,31 +466,25 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
440static int 466static int
441nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 467nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
442{ 468{
443 struct nfs_fh fhandle; 469 struct nfs_createdata *data;
444 struct nfs_fattr fattr;
445 struct nfs_createargs arg = {
446 .fh = NFS_FH(dir),
447 .name = dentry->d_name.name,
448 .len = dentry->d_name.len,
449 .sattr = sattr
450 };
451 struct nfs_diropok res = {
452 .fh = &fhandle,
453 .fattr = &fattr
454 };
455 struct rpc_message msg = { 470 struct rpc_message msg = {
456 .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], 471 .rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
457 .rpc_argp = &arg,
458 .rpc_resp = &res,
459 }; 472 };
460 int status; 473 int status = -ENOMEM;
461 474
462 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 475 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
463 nfs_fattr_init(&fattr); 476 data = nfs_alloc_createdata(dir, dentry, sattr);
477 if (data == NULL)
478 goto out;
479 msg.rpc_argp = &data->arg;
480 msg.rpc_resp = &data->res;
481
464 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 482 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
465 nfs_mark_for_revalidate(dir); 483 nfs_mark_for_revalidate(dir);
466 if (status == 0) 484 if (status == 0)
467 status = nfs_instantiate(dentry, &fhandle, &fattr); 485 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
486 nfs_free_createdata(data);
487out:
468 dprintk("NFS reply mkdir: %d\n", status); 488 dprintk("NFS reply mkdir: %d\n", status);
469 return status; 489 return status;
470} 490}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index db9b360ae19d..87adc2744246 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,7 +40,7 @@ static mempool_t *nfs_rdata_mempool;
40 40
41struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) 41struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
42{ 42{
43 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); 43 struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_KERNEL);
44 44
45 if (p) { 45 if (p) {
46 memset(p, 0, sizeof(*p)); 46 memset(p, 0, sizeof(*p));
@@ -50,7 +50,7 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
50 if (pagecount <= ARRAY_SIZE(p->page_array)) 50 if (pagecount <= ARRAY_SIZE(p->page_array))
51 p->pagevec = p->page_array; 51 p->pagevec = p->page_array;
52 else { 52 else {
53 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); 53 p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
54 if (!p->pagevec) { 54 if (!p->pagevec) {
55 mempool_free(p, nfs_rdata_mempool); 55 mempool_free(p, nfs_rdata_mempool);
56 p = NULL; 56 p = NULL;
@@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
190 data->args.pages = data->pagevec; 190 data->args.pages = data->pagevec;
191 data->args.count = count; 191 data->args.count = count;
192 data->args.context = get_nfs_open_context(req->wb_context); 192 data->args.context = get_nfs_open_context(req->wb_context);
193 data->args.lock_context = req->wb_lock_context;
193 194
194 data->res.fattr = &data->fattr; 195 data->res.fattr = &data->fattr;
195 data->res.count = count; 196 data->res.count = count;
@@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
410{ 411{
411 struct nfs_read_data *data = calldata; 412 struct nfs_read_data *data = calldata;
412 413
413 if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, 414 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
414 &data->args.seq_args, &data->res.seq_res, 415 &data->args.seq_args, &data->res.seq_res,
415 0, task)) 416 0, task))
416 return; 417 return;
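
nfs_readdata_alloc() shows the embedded-array fallback that keeps small reads to a single allocation: when the page count fits in the structure's own page_array, pagevec just points at it, and only larger requests allocate a separate vector. The GFP_NOFS to GFP_KERNEL relaxation appears safe because read setup is not entered from a memory-reclaim path, so there is no recursion hazard to guard against. A sketch of the pattern, with an assumed inline capacity of 8:

#include <stdlib.h>

#define INLINE_PAGES 8

struct read_data {
        void **pagevec;                  /* points at page_array or heap */
        void *page_array[INLINE_PAGES];
};

/* Small requests reuse the embedded array; big ones allocate. */
static struct read_data *readdata_alloc(unsigned int pagecount)
{
        struct read_data *p = calloc(1, sizeof(*p));

        if (p == NULL)
                return NULL;
        if (pagecount <= INLINE_PAGES)
                p->pagevec = p->page_array;
        else {
                p->pagevec = calloc(pagecount, sizeof(void *));
                if (p->pagevec == NULL) {
                        free(p);
                        p = NULL;
                }
        }
        return p;
}

static void readdata_free(struct read_data *p)
{
        if (p->pagevec != p->page_array) /* only the heap vector */
                free(p->pagevec);
        free(p);
}

int main(void)
{
        struct read_data *p = readdata_alloc(64);

        if (p == NULL)
                return 1;
        readdata_free(p);
        return 0;
}
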
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b4148fc00f9f..f4cbf0c306c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -141,7 +141,6 @@ static const match_table_t nfs_mount_option_tokens = {
141 { Opt_resvport, "resvport" }, 141 { Opt_resvport, "resvport" },
142 { Opt_noresvport, "noresvport" }, 142 { Opt_noresvport, "noresvport" },
143 { Opt_fscache, "fsc" }, 143 { Opt_fscache, "fsc" },
144 { Opt_fscache_uniq, "fsc=%s" },
145 { Opt_nofscache, "nofsc" }, 144 { Opt_nofscache, "nofsc" },
146 145
147 { Opt_port, "port=%s" }, 146 { Opt_port, "port=%s" },
@@ -171,6 +170,7 @@ static const match_table_t nfs_mount_option_tokens = {
171 { Opt_mountaddr, "mountaddr=%s" }, 170 { Opt_mountaddr, "mountaddr=%s" },
172 171
173 { Opt_lookupcache, "lookupcache=%s" }, 172 { Opt_lookupcache, "lookupcache=%s" },
173 { Opt_fscache_uniq, "fsc=%s" },
174 174
175 { Opt_err, NULL } 175 { Opt_err, NULL }
176}; 176};
@@ -270,7 +270,7 @@ static const struct super_operations nfs_sops = {
270 .write_inode = nfs_write_inode, 270 .write_inode = nfs_write_inode,
271 .put_super = nfs_put_super, 271 .put_super = nfs_put_super,
272 .statfs = nfs_statfs, 272 .statfs = nfs_statfs,
273 .clear_inode = nfs_clear_inode, 273 .evict_inode = nfs_evict_inode,
274 .umount_begin = nfs_umount_begin, 274 .umount_begin = nfs_umount_begin,
275 .show_options = nfs_show_options, 275 .show_options = nfs_show_options,
276 .show_stats = nfs_show_stats, 276 .show_stats = nfs_show_stats,
@@ -340,7 +340,7 @@ static const struct super_operations nfs4_sops = {
340 .write_inode = nfs_write_inode, 340 .write_inode = nfs_write_inode,
341 .put_super = nfs_put_super, 341 .put_super = nfs_put_super,
342 .statfs = nfs_statfs, 342 .statfs = nfs_statfs,
343 .clear_inode = nfs4_clear_inode, 343 .evict_inode = nfs4_evict_inode,
344 .umount_begin = nfs_umount_begin, 344 .umount_begin = nfs_umount_begin,
345 .show_options = nfs_show_options, 345 .show_options = nfs_show_options,
346 .show_stats = nfs_show_stats, 346 .show_stats = nfs_show_stats,
@@ -423,15 +423,27 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
423 unsigned char blockbits; 423 unsigned char blockbits;
424 unsigned long blockres; 424 unsigned long blockres;
425 struct nfs_fh *fh = NFS_FH(dentry->d_inode); 425 struct nfs_fh *fh = NFS_FH(dentry->d_inode);
426 struct nfs_fattr fattr; 426 struct nfs_fsstat res;
427 struct nfs_fsstat res = { 427 int error = -ENOMEM;
428 .fattr = &fattr, 428
429 }; 429 res.fattr = nfs_alloc_fattr();
430 int error; 430 if (res.fattr == NULL)
431 goto out_err;
431 432
432 error = server->nfs_client->rpc_ops->statfs(server, fh, &res); 433 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
434 if (unlikely(error == -ESTALE)) {
435 struct dentry *pd_dentry;
436
437 pd_dentry = dget_parent(dentry);
438 if (pd_dentry != NULL) {
439 nfs_zap_caches(pd_dentry->d_inode);
440 dput(pd_dentry);
441 }
442 }
443 nfs_free_fattr(res.fattr);
433 if (error < 0) 444 if (error < 0)
434 goto out_err; 445 goto out_err;
446
435 buf->f_type = NFS_SUPER_MAGIC; 447 buf->f_type = NFS_SUPER_MAGIC;
436 448
437 /* 449 /*
@@ -542,6 +554,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
542{ 554{
543 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; 555 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
544 556
557 if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
558 return;
559
545 switch (sap->sa_family) { 560 switch (sap->sa_family) {
546 case AF_INET: { 561 case AF_INET: {
547 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 562 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -566,6 +581,22 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
566 nfs_show_mountd_netid(m, nfss, showdefaults); 581 nfs_show_mountd_netid(m, nfss, showdefaults);
567} 582}
568 583
584#ifdef CONFIG_NFS_V4
585static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
586 int showdefaults)
587{
588 struct nfs_client *clp = nfss->nfs_client;
589
590 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
591 seq_printf(m, ",minorversion=%u", clp->cl_minorversion);
592}
593#else
594static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,
595 int showdefaults)
596{
597}
598#endif
599
569/* 600/*
570 * Describe the mount options in force on this server representation 601 * Describe the mount options in force on this server representation
571 */ 602 */
@@ -627,13 +658,18 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
627 658
628 if (version != 4) 659 if (version != 4)
629 nfs_show_mountd_options(m, nfss, showdefaults); 660 nfs_show_mountd_options(m, nfss, showdefaults);
661 else
662 nfs_show_nfsv4_options(m, nfss, showdefaults);
630 663
631#ifdef CONFIG_NFS_V4
632 if (clp->rpc_ops->version == 4)
633 seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
634#endif
635 if (nfss->options & NFS_OPTION_FSCACHE) 664 if (nfss->options & NFS_OPTION_FSCACHE)
636 seq_printf(m, ",fsc"); 665 seq_printf(m, ",fsc");
666
667 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
668 if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
669 seq_printf(m, ",lookupcache=none");
670 else
671 seq_printf(m, ",lookupcache=pos");
672 }
637} 673}
638 674
639/* 675/*
@@ -1046,14 +1082,6 @@ static int nfs_parse_mount_options(char *raw,
1046 kfree(mnt->fscache_uniq); 1082 kfree(mnt->fscache_uniq);
1047 mnt->fscache_uniq = NULL; 1083 mnt->fscache_uniq = NULL;
1048 break; 1084 break;
1049 case Opt_fscache_uniq:
1050 string = match_strdup(args);
1051 if (!string)
1052 goto out_nomem;
1053 kfree(mnt->fscache_uniq);
1054 mnt->fscache_uniq = string;
1055 mnt->options |= NFS_OPTION_FSCACHE;
1056 break;
1057 1085
1058 /* 1086 /*
1059 * options that take numeric values 1087 * options that take numeric values
@@ -1064,7 +1092,7 @@ static int nfs_parse_mount_options(char *raw,
1064 goto out_nomem; 1092 goto out_nomem;
1065 rc = strict_strtoul(string, 10, &option); 1093 rc = strict_strtoul(string, 10, &option);
1066 kfree(string); 1094 kfree(string);
1067 if (rc != 0 || option > USHORT_MAX) 1095 if (rc != 0 || option > USHRT_MAX)
1068 goto out_invalid_value; 1096 goto out_invalid_value;
1069 mnt->nfs_server.port = option; 1097 mnt->nfs_server.port = option;
1070 break; 1098 break;
@@ -1185,7 +1213,7 @@ static int nfs_parse_mount_options(char *raw,
1185 goto out_nomem; 1213 goto out_nomem;
1186 rc = strict_strtoul(string, 10, &option); 1214 rc = strict_strtoul(string, 10, &option);
1187 kfree(string); 1215 kfree(string);
1188 if (rc != 0 || option > USHORT_MAX) 1216 if (rc != 0 || option > USHRT_MAX)
1189 goto out_invalid_value; 1217 goto out_invalid_value;
1190 mnt->mount_server.port = option; 1218 mnt->mount_server.port = option;
1191 break; 1219 break;
@@ -1384,6 +1412,14 @@ static int nfs_parse_mount_options(char *raw,
1384 return 0; 1412 return 0;
1385 }; 1413 };
1386 break; 1414 break;
1415 case Opt_fscache_uniq:
1416 string = match_strdup(args);
1417 if (string == NULL)
1418 goto out_nomem;
1419 kfree(mnt->fscache_uniq);
1420 mnt->fscache_uniq = string;
1421 mnt->options |= NFS_OPTION_FSCACHE;
1422 break;
1387 1423
1388 /* 1424 /*
1389 * Special options 1425 * Special options
@@ -1762,6 +1798,7 @@ static int nfs_validate_mount_data(void *options,
1762 * can deal with. 1798 * can deal with.
1763 */ 1799 */
1764 args->flags = data->flags & NFS_MOUNT_FLAGMASK; 1800 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1801 args->flags |= NFS_MOUNT_LEGACY_INTERFACE;
1765 args->rsize = data->rsize; 1802 args->rsize = data->rsize;
1766 args->wsize = data->wsize; 1803 args->wsize = data->wsize;
1767 args->timeo = data->timeo; 1804 args->timeo = data->timeo;
@@ -2172,7 +2209,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2172 int error = -ENOMEM; 2209 int error = -ENOMEM;
2173 2210
2174 data = nfs_alloc_parsed_mount_data(3); 2211 data = nfs_alloc_parsed_mount_data(3);
2175 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2212 mntfh = nfs_alloc_fhandle();
2176 if (data == NULL || mntfh == NULL) 2213 if (data == NULL || mntfh == NULL)
2177 goto out_free_fh; 2214 goto out_free_fh;
2178 2215
@@ -2247,7 +2284,7 @@ out:
2247 kfree(data->fscache_uniq); 2284 kfree(data->fscache_uniq);
2248 security_free_mnt_opts(&data->lsm_opts); 2285 security_free_mnt_opts(&data->lsm_opts);
2249out_free_fh: 2286out_free_fh:
2250 kfree(mntfh); 2287 nfs_free_fhandle(mntfh);
2251 kfree(data); 2288 kfree(data);
2252 return error; 2289 return error;
2253 2290
@@ -2556,7 +2593,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2556 }; 2593 };
2557 int error = -ENOMEM; 2594 int error = -ENOMEM;
2558 2595
2559 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2596 mntfh = nfs_alloc_fhandle();
2560 if (data == NULL || mntfh == NULL) 2597 if (data == NULL || mntfh == NULL)
2561 goto out_free_fh; 2598 goto out_free_fh;
2562 2599
@@ -2614,7 +2651,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2614out: 2651out:
2615 security_free_mnt_opts(&data->lsm_opts); 2652 security_free_mnt_opts(&data->lsm_opts);
2616out_free_fh: 2653out_free_fh:
2617 kfree(mntfh); 2654 nfs_free_fhandle(mntfh);
2618 return error; 2655 return error;
2619 2656
2620out_free: 2657out_free:
@@ -2669,41 +2706,120 @@ out_freepage:
2669 free_page((unsigned long)page); 2706 free_page((unsigned long)page);
2670} 2707}
2671 2708
2709struct nfs_referral_count {
2710 struct list_head list;
2711 const struct task_struct *task;
2712 unsigned int referral_count;
2713};
2714
2715static LIST_HEAD(nfs_referral_count_list);
2716static DEFINE_SPINLOCK(nfs_referral_count_list_lock);
2717
2718static struct nfs_referral_count *nfs_find_referral_count(void)
2719{
2720 struct nfs_referral_count *p;
2721
2722 list_for_each_entry(p, &nfs_referral_count_list, list) {
2723 if (p->task == current)
2724 return p;
2725 }
2726 return NULL;
2727}
2728
2729#define NFS_MAX_NESTED_REFERRALS 2
2730
2731static int nfs_referral_loop_protect(void)
2732{
2733 struct nfs_referral_count *p, *new;
2734 int ret = -ENOMEM;
2735
2736 new = kmalloc(sizeof(*new), GFP_KERNEL);
2737 if (!new)
2738 goto out;
2739 new->task = current;
2740 new->referral_count = 1;
2741
2742 ret = 0;
2743 spin_lock(&nfs_referral_count_list_lock);
2744 p = nfs_find_referral_count();
2745 if (p != NULL) {
2746 if (p->referral_count >= NFS_MAX_NESTED_REFERRALS)
2747 ret = -ELOOP;
2748 else
2749 p->referral_count++;
2750 } else {
2751 list_add(&new->list, &nfs_referral_count_list);
2752 new = NULL;
2753 }
2754 spin_unlock(&nfs_referral_count_list_lock);
2755 kfree(new);
2756out:
2757 return ret;
2758}
2759
2760static void nfs_referral_loop_unprotect(void)
2761{
2762 struct nfs_referral_count *p;
2763
2764 spin_lock(&nfs_referral_count_list_lock);
2765 p = nfs_find_referral_count();
2766 p->referral_count--;
2767 if (p->referral_count == 0)
2768 list_del(&p->list);
2769 else
2770 p = NULL;
2771 spin_unlock(&nfs_referral_count_list_lock);
2772 kfree(p);
2773}
2774
2672static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2775static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2673 const char *export_path, struct vfsmount *mnt_target) 2776 const char *export_path, struct vfsmount *mnt_target)
2674{ 2777{
2778 struct nameidata *nd = NULL;
2675 struct mnt_namespace *ns_private; 2779 struct mnt_namespace *ns_private;
2676 struct nameidata nd;
2677 struct super_block *s; 2780 struct super_block *s;
2678 int ret; 2781 int ret;
2679 2782
2783 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2784 if (nd == NULL)
2785 return -ENOMEM;
2786
2680 ns_private = create_mnt_ns(root_mnt); 2787 ns_private = create_mnt_ns(root_mnt);
2681 ret = PTR_ERR(ns_private); 2788 ret = PTR_ERR(ns_private);
2682 if (IS_ERR(ns_private)) 2789 if (IS_ERR(ns_private))
2683 goto out_mntput; 2790 goto out_mntput;
2684 2791
2792 ret = nfs_referral_loop_protect();
2793 if (ret != 0)
2794 goto out_put_mnt_ns;
2795
2685 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2796 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2686 export_path, LOOKUP_FOLLOW, &nd); 2797 export_path, LOOKUP_FOLLOW, nd);
2687 2798
2799 nfs_referral_loop_unprotect();
2688 put_mnt_ns(ns_private); 2800 put_mnt_ns(ns_private);
2689 2801
2690 if (ret != 0) 2802 if (ret != 0)
2691 goto out_err; 2803 goto out_err;
2692 2804
2693 s = nd.path.mnt->mnt_sb; 2805 s = nd->path.mnt->mnt_sb;
2694 atomic_inc(&s->s_active); 2806 atomic_inc(&s->s_active);
2695 mnt_target->mnt_sb = s; 2807 mnt_target->mnt_sb = s;
2696 mnt_target->mnt_root = dget(nd.path.dentry); 2808 mnt_target->mnt_root = dget(nd->path.dentry);
2697 2809
2698 /* Correct the device pathname */ 2810 /* Correct the device pathname */
2699 nfs_fix_devname(&nd.path, mnt_target); 2811 nfs_fix_devname(&nd->path, mnt_target);
2700 2812
2701 path_put(&nd.path); 2813 path_put(&nd->path);
2814 kfree(nd);
2702 down_write(&s->s_umount); 2815 down_write(&s->s_umount);
2703 return 0; 2816 return 0;
2817out_put_mnt_ns:
2818 put_mnt_ns(ns_private);
2704out_mntput: 2819out_mntput:
2705 mntput(root_mnt); 2820 mntput(root_mnt);
2706out_err: 2821out_err:
2822 kfree(nd);
2707 return ret; 2823 return ret;
2708} 2824}
2709 2825
@@ -2874,17 +2990,21 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2874 struct super_block *s; 2990 struct super_block *s;
2875 struct nfs_server *server; 2991 struct nfs_server *server;
2876 struct dentry *mntroot; 2992 struct dentry *mntroot;
2877 struct nfs_fh mntfh; 2993 struct nfs_fh *mntfh;
2878 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2994 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2879 struct nfs_sb_mountdata sb_mntdata = { 2995 struct nfs_sb_mountdata sb_mntdata = {
2880 .mntflags = flags, 2996 .mntflags = flags,
2881 }; 2997 };
2882 int error; 2998 int error = -ENOMEM;
2883 2999
2884 dprintk("--> nfs4_referral_get_sb()\n"); 3000 dprintk("--> nfs4_referral_get_sb()\n");
2885 3001
3002 mntfh = nfs_alloc_fhandle();
3003 if (mntfh == NULL)
3004 goto out_err_nofh;
3005
2886 /* create a new volume representation */ 3006 /* create a new volume representation */
2887 server = nfs4_create_referral_server(data, &mntfh); 3007 server = nfs4_create_referral_server(data, mntfh);
2888 if (IS_ERR(server)) { 3008 if (IS_ERR(server)) {
2889 error = PTR_ERR(server); 3009 error = PTR_ERR(server);
2890 goto out_err_noserver; 3010 goto out_err_noserver;
@@ -2916,7 +3036,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2916 nfs_fscache_get_super_cookie(s, NULL, data); 3036 nfs_fscache_get_super_cookie(s, NULL, data);
2917 } 3037 }
2918 3038
2919 mntroot = nfs4_get_root(s, &mntfh); 3039 mntroot = nfs4_get_root(s, mntfh);
2920 if (IS_ERR(mntroot)) { 3040 if (IS_ERR(mntroot)) {
2921 error = PTR_ERR(mntroot); 3041 error = PTR_ERR(mntroot);
2922 goto error_splat_super; 3042 goto error_splat_super;
@@ -2933,12 +3053,15 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2933 3053
2934 security_sb_clone_mnt_opts(data->sb, s); 3054 security_sb_clone_mnt_opts(data->sb, s);
2935 3055
3056 nfs_free_fhandle(mntfh);
2936 dprintk("<-- nfs4_referral_get_sb() = 0\n"); 3057 dprintk("<-- nfs4_referral_get_sb() = 0\n");
2937 return 0; 3058 return 0;
2938 3059
2939out_err_nosb: 3060out_err_nosb:
2940 nfs_free_server(server); 3061 nfs_free_server(server);
2941out_err_noserver: 3062out_err_noserver:
3063 nfs_free_fhandle(mntfh);
3064out_err_nofh:
2942 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); 3065 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
2943 return error; 3066 return error;
2944 3067
@@ -2947,6 +3070,7 @@ error_splat_super:
2947 bdi_unregister(&server->backing_dev_info); 3070 bdi_unregister(&server->backing_dev_info);
2948error_splat_bdi: 3071error_splat_bdi:
2949 deactivate_locked_super(s); 3072 deactivate_locked_super(s);
3073 nfs_free_fhandle(mntfh);
2950 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 3074 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2951 return error; 3075 return error;
2952} 3076}
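
Two defensive changes in nfs_follow_remote_path() above: the large struct nameidata moves from the stack to kmalloc(), and a small per-task table caps nested referral traversals at NFS_MAX_NESTED_REFERRALS (2), so a server whose referrals point back at themselves gets -ELOOP instead of driving the client into unbounded recursion. A userspace sketch of the counting scheme, keyed by a caller-supplied task pointer and guarded by a mutex where the kernel uses a spinlock; note the allocate-before-lock dance so no allocation happens with the lock held:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

#define MAX_NESTED_REFERRALS 2

struct referral_count {
        struct referral_count *next;
        const void *task;                /* identifies the traversing task */
        unsigned int count;
};

static struct referral_count *referrals;
static pthread_mutex_t referral_lock = PTHREAD_MUTEX_INITIALIZER;

static struct referral_count *find_count(const void *task)
{
        struct referral_count *p;

        for (p = referrals; p != NULL; p = p->next)
                if (p->task == task)
                        return p;
        return NULL;
}

/* 0 if this task may follow another referral, -ELOOP if it is already
 * MAX_NESTED_REFERRALS deep. The node is allocated before locking. */
static int referral_protect(const void *task)
{
        struct referral_count *p, *new = malloc(sizeof(*new));
        int ret = -ENOMEM;

        if (new == NULL)
                return ret;
        new->task = task;
        new->count = 1;

        ret = 0;
        pthread_mutex_lock(&referral_lock);
        p = find_count(task);
        if (p != NULL) {
                if (p->count >= MAX_NESTED_REFERRALS)
                        ret = -ELOOP;
                else
                        p->count++;
        } else {
                new->next = referrals;
                referrals = new;
                new = NULL;              /* ownership moved to the list */
        }
        pthread_mutex_unlock(&referral_lock);
        free(new);
        return ret;
}

static void referral_unprotect(const void *task)
{
        struct referral_count *p, **pp;

        pthread_mutex_lock(&referral_lock);
        for (pp = &referrals; (p = *pp) != NULL; pp = &p->next)
                if (p->task == task)
                        break;
        if (p != NULL && --p->count == 0)
                *pp = p->next;           /* unlink; freed below */
        else
                p = NULL;                /* still in use; keep it */
        pthread_mutex_unlock(&referral_lock);
        free(p);
}

int main(void)
{
        int me;

        return referral_protect(&me) == 0 ? (referral_unprotect(&me), 0) : 1;
}
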
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6da3d3ff6edd..2f84adaad427 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -23,6 +23,7 @@ struct nfs_unlinkdata {
23 struct nfs_removeres res; 23 struct nfs_removeres res;
24 struct inode *dir; 24 struct inode *dir;
25 struct rpc_cred *cred; 25 struct rpc_cred *cred;
26 struct nfs_fattr dir_attr;
26}; 27};
27 28
28/** 29/**
@@ -109,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
109 struct nfs_unlinkdata *data = calldata; 110 struct nfs_unlinkdata *data = calldata;
110 struct nfs_server *server = NFS_SERVER(data->dir); 111 struct nfs_server *server = NFS_SERVER(data->dir);
111 112
112 if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, 113 if (nfs4_setup_sequence(server, &data->args.seq_args,
113 &data->res.seq_res, 1, task)) 114 &data->res.seq_res, 1, task))
114 return; 115 return;
115 rpc_call_start(task); 116 rpc_call_start(task);
@@ -169,7 +170,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
169 } 170 }
170 nfs_sb_active(dir->i_sb); 171 nfs_sb_active(dir->i_sb);
171 data->args.fh = NFS_FH(dir); 172 data->args.fh = NFS_FH(dir);
172 nfs_fattr_init(&data->res.dir_attr); 173 nfs_fattr_init(data->res.dir_attr);
173 174
174 NFS_PROTO(dir)->unlink_setup(&msg, dir); 175 NFS_PROTO(dir)->unlink_setup(&msg, dir);
175 176
@@ -259,6 +260,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
259 goto out_free; 260 goto out_free;
260 } 261 }
261 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; 262 data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
263 data->res.dir_attr = &data->dir_attr;
262 264
263 status = -EBUSY; 265 status = -EBUSY;
264 spin_lock(&dentry->d_lock); 266 spin_lock(&dentry->d_lock);
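
The unlink.c change gives struct nfs_unlinkdata its own nfs_fattr and points res.dir_attr at it once, instead of carrying the attributes in the result by value. A hedged sketch of that ownership pattern, with simplified, invented types:

    #include <stdlib.h>

    struct fattr { long change_attr; };           /* stand-in for nfs_fattr */
    struct removeres { struct fattr *dir_attr; }; /* result holds a pointer */

    struct unlinkdata {
        struct removeres res;
        struct fattr dir_attr;          /* storage travels with the call */
    };

    static struct unlinkdata *unlinkdata_alloc(void)
    {
        struct unlinkdata *d = calloc(1, sizeof(*d));

        if (d)
            d->res.dir_attr = &d->dir_attr;  /* wired up once, at allocation */
        return d;
    }

The RPC result structure stays small, and the nfs_fattr_init(data->res.dir_attr) call in the setup hunk then initializes that same storage through the pointer.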
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..874972d9427c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page)
222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
223} 223}
224 224
225static struct nfs_page *nfs_find_and_lock_request(struct page *page) 225static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
226{ 226{
227 struct inode *inode = page->mapping->host; 227 struct inode *inode = page->mapping->host;
228 struct nfs_page *req; 228 struct nfs_page *req;
@@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
241 * request as dirty (in which case we don't care). 241 * request as dirty (in which case we don't care).
242 */ 242 */
243 spin_unlock(&inode->i_lock); 243 spin_unlock(&inode->i_lock);
244 ret = nfs_wait_on_request(req); 244 if (!nonblock)
245 ret = nfs_wait_on_request(req);
246 else
247 ret = -EAGAIN;
245 nfs_release_request(req); 248 nfs_release_request(req);
246 if (ret != 0) 249 if (ret != 0)
247 return ERR_PTR(ret); 250 return ERR_PTR(ret);
@@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page)
256 * May return an error if the user signalled nfs_wait_on_request(). 259 * May return an error if the user signalled nfs_wait_on_request().
257 */ 260 */
258static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 261static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
259 struct page *page) 262 struct page *page, bool nonblock)
260{ 263{
261 struct nfs_page *req; 264 struct nfs_page *req;
262 int ret = 0; 265 int ret = 0;
263 266
264 req = nfs_find_and_lock_request(page); 267 req = nfs_find_and_lock_request(page, nonblock);
265 if (!req) 268 if (!req)
266 goto out; 269 goto out;
267 ret = PTR_ERR(req); 270 ret = PTR_ERR(req);
@@ -283,12 +286,20 @@ out:
283static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 286static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
284{ 287{
285 struct inode *inode = page->mapping->host; 288 struct inode *inode = page->mapping->host;
289 int ret;
286 290
287 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 291 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
288 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 292 nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
289 293
290 nfs_pageio_cond_complete(pgio, page->index); 294 nfs_pageio_cond_complete(pgio, page->index);
291 return nfs_page_async_flush(pgio, page); 295 ret = nfs_page_async_flush(pgio, page,
296 wbc->sync_mode == WB_SYNC_NONE ||
297 wbc->nonblocking != 0);
298 if (ret == -EAGAIN) {
299 redirty_page_for_writepage(wbc, page);
300 ret = 0;
301 }
302 return ret;
292} 303}
293 304
294/* 305/*
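
The write.c hunks above thread a nonblock flag from the writeback control into the request-locking path: WB_SYNC_NONE (or wbc->nonblocking) writeback gets -EAGAIN instead of sleeping on a busy request, and the caller turns that into "redirty the page and report success". A sketch of the two halves, with stubs in place of the NFS functions:

    #include <stdbool.h>

    #define EAGAIN 11

    static int wait_on_request(void) { return 0; }   /* stub: may sleep */

    /* Lower half: sleep on a locked request only if we may block. */
    static int lock_request_sketch(bool nonblock)
    {
        if (!nonblock)
            return wait_on_request();   /* sync writeback: wait it out */
        return -EAGAIN;                 /* async writeback: never stall */
    }

    /* Upper half: -EAGAIN is not an error, just "try again later". */
    static int do_writepage_sketch(bool sync_none, bool nonblocking)
    {
        int ret = lock_request_sketch(sync_none || nonblocking);

        if (ret == -EAGAIN) {
            /* redirty_page_for_writepage(wbc, page) in the real code */
            ret = 0;
        }
        return ret;
    }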
@@ -689,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
689 req = nfs_page_find_request(page); 700 req = nfs_page_find_request(page);
690 if (req == NULL) 701 if (req == NULL)
691 return 0; 702 return 0;
692 do_flush = req->wb_page != page || req->wb_context != ctx; 703 do_flush = req->wb_page != page || req->wb_context != ctx ||
704 req->wb_lock_context->lockowner != current->files ||
705 req->wb_lock_context->pid != current->tgid;
693 nfs_release_request(req); 706 nfs_release_request(req);
694 if (!do_flush) 707 if (!do_flush)
695 return 0; 708 return 0;
@@ -813,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
813 data->args.pages = data->pagevec; 826 data->args.pages = data->pagevec;
814 data->args.count = count; 827 data->args.count = count;
815 data->args.context = get_nfs_open_context(req->wb_context); 828 data->args.context = get_nfs_open_context(req->wb_context);
829 data->args.lock_context = req->wb_lock_context;
816 data->args.stable = NFS_UNSTABLE; 830 data->args.stable = NFS_UNSTABLE;
817 if (how & FLUSH_STABLE) { 831 if (how & FLUSH_STABLE) {
818 data->args.stable = NFS_DATA_SYNC; 832 data->args.stable = NFS_DATA_SYNC;
@@ -1036,9 +1050,9 @@ out:
1036void nfs_write_prepare(struct rpc_task *task, void *calldata) 1050void nfs_write_prepare(struct rpc_task *task, void *calldata)
1037{ 1051{
1038 struct nfs_write_data *data = calldata; 1052 struct nfs_write_data *data = calldata;
1039 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1040 1053
1041 if (nfs4_setup_sequence(clp, &data->args.seq_args, 1054 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1055 &data->args.seq_args,
1042 &data->res.seq_res, 1, task)) 1056 &data->res.seq_res, 1, task))
1043 return; 1057 return;
1044 rpc_call_start(task); 1058 rpc_call_start(task);
@@ -1379,14 +1393,14 @@ static const struct rpc_call_ops nfs_commit_ops = {
1379 .rpc_release = nfs_commit_release, 1393 .rpc_release = nfs_commit_release,
1380}; 1394};
1381 1395
1382static int nfs_commit_inode(struct inode *inode, int how) 1396int nfs_commit_inode(struct inode *inode, int how)
1383{ 1397{
1384 LIST_HEAD(head); 1398 LIST_HEAD(head);
1385 int may_wait = how & FLUSH_SYNC; 1399 int may_wait = how & FLUSH_SYNC;
1386 int res = 0; 1400 int res = 0;
1387 1401
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1402 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1403 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1404 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1405 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1406 spin_unlock(&inode->i_lock);
@@ -1398,9 +1412,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1412 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1413 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1414 TASK_KILLABLE);
1415 else
1416 goto out_mark_dirty;
1401 } else 1417 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1418 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1419 return res;
1420 /* Note: If we exit without ensuring that the commit is complete,
1421 * we must mark the inode as dirty. Otherwise, future calls to
1422 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1423 * that the data is on the disk.
1424 */
1425out_mark_dirty:
1426 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1427 return res;
1405} 1428}
1406 1429
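
The nfs_commit_inode() rework routes every early exit — commit-lock contention, or a caller that does not wait — through __mark_inode_dirty(inode, I_DIRTY_DATASYNC), for the reason the added comment gives: a later WB_SYNC_ALL sync_inode() must still see pending work. A compact model of that control flow (stubs, not the NFS functions):

    #include <stdbool.h>

    enum { FLUSH_SYNC = 4, I_DIRTY_DATASYNC = 2 };  /* illustrative values */

    static bool commit_trylock(bool may_wait) { return may_wait; } /* stub */
    static void mark_inode_dirty(int flags)   { (void)flags; }     /* stub */

    static int commit_inode_sketch(int how)
    {
        bool may_wait = how & FLUSH_SYNC;
        int res = 0;

        if (!commit_trylock(may_wait))
            goto out_mark_dirty;    /* another committer holds the lock */
        /* ... scan the commit list and send the COMMIT ... */
        if (!may_wait)
            goto out_mark_dirty;    /* fired it off but did not wait */
        return res;                 /* commit known complete */

    out_mark_dirty:
        /* Completion not guaranteed: keep the inode dirty so a later
         * WB_SYNC_ALL pass retries instead of skipping it. */
        mark_inode_dirty(I_DIRTY_DATASYNC);
        return res;
    }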
@@ -1434,11 +1457,6 @@ out_mark_dirty:
1434 return ret; 1457 return ret;
1435} 1458}
1436#else 1459#else
1437static int nfs_commit_inode(struct inode *inode, int how)
1438{
1439 return 0;
1440}
1441
1442static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) 1460static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1443{ 1461{
1444 return 0; 1462 return 0;
@@ -1509,14 +1527,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1527 };
1510 int ret; 1528 int ret;
1511 1529
1512 while(PagePrivate(page)) { 1530 for (;;) {
1513 wait_on_page_writeback(page); 1531 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1532 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1533 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1534 if (ret < 0)
1517 goto out_error; 1535 goto out_error;
1536 continue;
1518 } 1537 }
1519 ret = sync_inode(inode, &wbc); 1538 if (!PagePrivate(page))
1539 break;
1540 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1541 if (ret < 0)
1521 goto out_error; 1542 goto out_error;
1522 } 1543 }
@@ -1534,7 +1555,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1534 1555
1535 nfs_fscache_release_page(page, GFP_KERNEL); 1556 nfs_fscache_release_page(page, GFP_KERNEL);
1536 1557
1537 req = nfs_find_and_lock_request(page); 1558 req = nfs_find_and_lock_request(page, false);
1538 ret = PTR_ERR(req); 1559 ret = PTR_ERR(req);
1539 if (IS_ERR(req)) 1560 if (IS_ERR(req))
1540 goto out; 1561 goto out;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a3..4264377552e2 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,7 @@ config NFSD_V4
69 depends on NFSD && PROC_FS && EXPERIMENTAL 69 depends on NFSD && PROC_FS && EXPERIMENTAL
70 select NFSD_V3 70 select NFSD_V3
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select RPCSEC_GSS_KRB5 72 select SUNRPC_GSS
73 help 73 help
74 This option enables support in your system's NFS server for 74 This option enables support in your system's NFS server for
75 version 4 of the NFS protocol (RFC 3530). 75 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 872a5ef550c7..c2a4f71d87dd 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -259,10 +259,9 @@ static struct cache_detail svc_expkey_cache = {
259 .alloc = expkey_alloc, 259 .alloc = expkey_alloc,
260}; 260};
261 261
262static struct svc_expkey * 262static int
263svc_expkey_lookup(struct svc_expkey *item) 263svc_expkey_hash(struct svc_expkey *item)
264{ 264{
265 struct cache_head *ch;
266 int hash = item->ek_fsidtype; 265 int hash = item->ek_fsidtype;
267 char * cp = (char*)item->ek_fsid; 266 char * cp = (char*)item->ek_fsid;
268 int len = key_len(item->ek_fsidtype); 267 int len = key_len(item->ek_fsidtype);
@@ -270,6 +269,14 @@ svc_expkey_lookup(struct svc_expkey *item)
270 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); 269 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
271 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); 270 hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS);
272 hash &= EXPKEY_HASHMASK; 271 hash &= EXPKEY_HASHMASK;
272 return hash;
273}
274
275static struct svc_expkey *
276svc_expkey_lookup(struct svc_expkey *item)
277{
278 struct cache_head *ch;
279 int hash = svc_expkey_hash(item);
273 280
274 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h, 281 ch = sunrpc_cache_lookup(&svc_expkey_cache, &item->h,
275 hash); 282 hash);
@@ -283,13 +290,7 @@ static struct svc_expkey *
283svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) 290svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
284{ 291{
285 struct cache_head *ch; 292 struct cache_head *ch;
286 int hash = new->ek_fsidtype; 293 int hash = svc_expkey_hash(new);
287 char * cp = (char*)new->ek_fsid;
288 int len = key_len(new->ek_fsidtype);
289
290 hash ^= hash_mem(cp, len, EXPKEY_HASHBITS);
291 hash ^= hash_ptr(new->ek_client, EXPKEY_HASHBITS);
292 hash &= EXPKEY_HASHMASK;
293 294
294 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h, 295 ch = sunrpc_cache_update(&svc_expkey_cache, &new->h,
295 &old->h, hash); 296 &old->h, hash);
@@ -738,14 +739,22 @@ struct cache_detail svc_export_cache = {
738 .alloc = svc_export_alloc, 739 .alloc = svc_export_alloc,
739}; 740};
740 741
741static struct svc_export * 742static int
742svc_export_lookup(struct svc_export *exp) 743svc_export_hash(struct svc_export *exp)
743{ 744{
744 struct cache_head *ch;
745 int hash; 745 int hash;
746
746 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); 747 hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS);
747 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); 748 hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS);
748 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); 749 hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS);
750 return hash;
751}
752
753static struct svc_export *
754svc_export_lookup(struct svc_export *exp)
755{
756 struct cache_head *ch;
757 int hash = svc_export_hash(exp);
749 758
750 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h, 759 ch = sunrpc_cache_lookup(&svc_export_cache, &exp->h,
751 hash); 760 hash);
@@ -759,10 +768,7 @@ static struct svc_export *
759svc_export_update(struct svc_export *new, struct svc_export *old) 768svc_export_update(struct svc_export *new, struct svc_export *old)
760{ 769{
761 struct cache_head *ch; 770 struct cache_head *ch;
762 int hash; 771 int hash = svc_export_hash(old);
763 hash = hash_ptr(old->ex_client, EXPORT_HASHBITS);
764 hash ^= hash_ptr(old->ex_path.dentry, EXPORT_HASHBITS);
765 hash ^= hash_ptr(old->ex_path.mnt, EXPORT_HASHBITS);
766 772
767 ch = sunrpc_cache_update(&svc_export_cache, &new->h, 773 ch = sunrpc_cache_update(&svc_export_cache, &new->h,
768 &old->h, 774 &old->h,
@@ -1071,9 +1077,9 @@ exp_export(struct nfsctl_export *nxp)
1071 err = 0; 1077 err = 0;
1072finish: 1078finish:
1073 kfree(new.ex_pathname); 1079 kfree(new.ex_pathname);
1074 if (exp) 1080 if (!IS_ERR_OR_NULL(exp))
1075 exp_put(exp); 1081 exp_put(exp);
1076 if (fsid_key && !IS_ERR(fsid_key)) 1082 if (!IS_ERR_OR_NULL(fsid_key))
1077 cache_put(&fsid_key->h, &svc_expkey_cache); 1083 cache_put(&fsid_key->h, &svc_expkey_cache);
1078 path_put(&path); 1084 path_put(&path);
1079out_put_clp: 1085out_put_clp:
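
Two cleanups in export.c: the fsid/export hash, previously computed by hand in both the lookup and update paths, moves into single helpers (svc_expkey_hash, svc_export_hash) so the call sites cannot drift apart, and the error-path puts switch to IS_ERR_OR_NULL, which covers "never obtained" and "lookup returned an error" in one test. A standalone sketch of the factored-hash idea, using a toy hash rather than the kernel's hash_mem/hash_ptr:

    #include <stdint.h>
    #include <stddef.h>

    #define HASHBITS 8
    #define HASHMASK ((1u << HASHBITS) - 1)

    struct expkey { const void *client; const char *fsid; size_t len; int type; };

    static unsigned toy_hash_mem(const char *p, size_t n)
    {
        unsigned h = 2166136261u;            /* FNV-1a, purely illustrative */
        while (n--)
            h = (h ^ (unsigned char)*p++) * 16777619u;
        return h;
    }

    /* One helper, two callers: lookup and update now share this. */
    static int expkey_hash(const struct expkey *k)
    {
        unsigned h = (unsigned)k->type;

        h ^= toy_hash_mem(k->fsid, k->len);
        h ^= (unsigned)((uintptr_t)k->client >> 4);
        return (int)(h & HASHMASK);
    }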
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b9..5b7e3021e06b 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
169 169
170 fh_copy(&resp->fh, &argp->fh); 170 fh_copy(&resp->fh, &argp->fh);
171 nfserr = nfsd_read(rqstp, &resp->fh, NULL, 171 nfserr = nfsd_read(rqstp, &resp->fh,
172 argp->offset, 172 argp->offset,
173 rqstp->rq_vec, argp->vlen, 173 rqstp->rq_vec, argp->vlen,
174 &resp->count); 174 &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
271 fh_init(&resp->fh, NFS3_FHSIZE); 271 fh_init(&resp->fh, NFS3_FHSIZE);
272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
273 &argp->attrs, S_IFDIR, 0, &resp->fh); 273 &argp->attrs, S_IFDIR, 0, &resp->fh);
274 274 fh_unlock(&resp->dirfh);
275 RETURN_STATUS(nfserr); 275 RETURN_STATUS(nfserr);
276} 276}
277 277
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
327 type = nfs3_ftypes[argp->ftype]; 327 type = nfs3_ftypes[argp->ftype];
328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
329 &argp->attrs, type, rdev, &resp->fh); 329 &argp->attrs, type, rdev, &resp->fh);
330 330 fh_unlock(&resp->dirfh);
331 RETURN_STATUS(nfserr); 331 RETURN_STATUS(nfserr);
332} 332}
333 333
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
348 /* Unlink. -S_IFDIR means file must not be a directory */ 348 /* Unlink. -S_IFDIR means file must not be a directory */
349 fh_copy(&resp->fh, &argp->fh); 349 fh_copy(&resp->fh, &argp->fh);
350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); 350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
351 fh_unlock(&resp->fh);
351 RETURN_STATUS(nfserr); 352 RETURN_STATUS(nfserr);
352} 353}
353 354
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
367 368
368 fh_copy(&resp->fh, &argp->fh); 369 fh_copy(&resp->fh, &argp->fh);
369 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); 370 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
371 fh_unlock(&resp->fh);
370 RETURN_STATUS(nfserr); 372 RETURN_STATUS(nfserr);
371} 373}
372 374
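
The nfs3proc.c hunks add an explicit fh_unlock() on the directory handle once nfsd_create()/nfsd_unlink() returns, so the parent directory is released (and, in nfsd, its post-op attributes captured) before the reply is built rather than at filehandle release. A rough model of that idempotent pairing, with hypothetical names:

    #include <pthread.h>

    struct svcfh_sketch {
        pthread_mutex_t *dir_mutex;
        int locked;
    };

    /* Safe to call whether or not the handle is still locked. */
    static void fh_unlock_sketch(struct svcfh_sketch *fh)
    {
        if (fh->locked) {
            /* the real fh_unlock also snapshots post-op attrs here */
            fh->locked = 0;
            pthread_mutex_unlock(fh->dir_mutex);
        }
    }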
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7e32bd394e86..988cbb3a19b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/svc_xprt.h>
35#include <linux/slab.h> 36#include <linux/slab.h>
36#include "nfsd.h" 37#include "nfsd.h"
37#include "state.h" 38#include "state.h"
@@ -79,11 +80,6 @@ enum nfs_cb_opnum4 {
79 cb_sequence_dec_sz + \ 80 cb_sequence_dec_sz + \
80 op_dec_sz) 81 op_dec_sz)
81 82
82struct nfs4_rpc_args {
83 void *args_op;
84 struct nfsd4_cb_sequence args_seq;
85};
86
87/* 83/*
88* Generic encode routines from fs/nfs/nfs4xdr.c 84* Generic encode routines from fs/nfs/nfs4xdr.c
89*/ 85*/
@@ -147,8 +143,6 @@ struct nfs4_cb_compound_hdr {
147 u32 minorversion; 143 u32 minorversion;
148 /* res */ 144 /* res */
149 int status; 145 int status;
150 u32 taglen;
151 char *tag;
152}; 146};
153 147
154static struct { 148static struct {
@@ -209,6 +203,16 @@ nfs_cb_stat_to_errno(int stat)
209 */ 203 */
210 204
211static void 205static void
206encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
207{
208 __be32 *p;
209
210 RESERVE_SPACE(sizeof(stateid_t));
211 WRITE32(sid->si_generation);
212 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
213}
214
215static void
212encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 216encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
213{ 217{
214 __be32 * p; 218 __be32 * p;
@@ -233,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
233 __be32 *p; 237 __be32 *p;
234 int len = dp->dl_fh.fh_size; 238 int len = dp->dl_fh.fh_size;
235 239
236 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); 240 RESERVE_SPACE(4);
237 WRITE32(OP_CB_RECALL); 241 WRITE32(OP_CB_RECALL);
238 WRITE32(dp->dl_stateid.si_generation); 242 encode_stateid(xdr, &dp->dl_stateid);
239 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); 243 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
240 WRITE32(0); /* truncate optimization not implemented */ 244 WRITE32(0); /* truncate optimization not implemented */
241 WRITE32(len); 245 WRITE32(len);
242 WRITEMEM(&dp->dl_fh.fh_base, len); 246 WRITEMEM(&dp->dl_fh.fh_base, len);
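
encode_cb_recall() now reserves buffer space field by field and delegates the stateid to encode_stateid(). XDR lays everything out in 4-byte units, so an opaque body of len bytes consumes XDR_QUADLEN(len) quads — which is why the second reservation is 8 + (XDR_QUADLEN(len) << 2). A self-contained sketch of that alignment rule over a flat buffer (not the kernel's xdr_stream API):

    #include <stdint.h>
    #include <string.h>

    #define XDR_QUADLEN(n) (((n) + 3) >> 2)    /* bytes -> 4-byte XDR units */

    struct xdrbuf { unsigned char *p; };

    static void write32(struct xdrbuf *x, uint32_t v)  /* big-endian word */
    {
        x->p[0] = v >> 24; x->p[1] = v >> 16; x->p[2] = v >> 8; x->p[3] = v;
        x->p += 4;
    }

    static void writemem(struct xdrbuf *x, const void *buf, size_t len)
    {
        memcpy(x->p, buf, len);
        memset(x->p + len, 0, (XDR_QUADLEN(len) << 2) - len);  /* pad */
        x->p += XDR_QUADLEN(len) << 2;
    }

    /* Stateid on the wire: 4-byte generation, then 12 opaque bytes. */
    static void encode_stateid_sketch(struct xdrbuf *x, uint32_t generation,
                                      const unsigned char opaque[12])
    {
        write32(x, generation);
        writemem(x, opaque, 12);
    }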
@@ -297,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
297static int 301static int
298decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
299 __be32 *p; 303 __be32 *p;
304 u32 taglen;
300 305
301 READ_BUF(8); 306 READ_BUF(8);
302 READ32(hdr->status); 307 READ32(hdr->status);
303 READ32(hdr->taglen); 308 /* We've got no use for the tag; ignore it: */
304 READ_BUF(hdr->taglen + 4); 309 READ32(taglen);
305 hdr->tag = (char *)p; 310 READ_BUF(taglen + 4);
306 p += XDR_QUADLEN(hdr->taglen); 311 p += XDR_QUADLEN(taglen);
307 READ32(hdr->nops); 312 READ32(hdr->nops);
308 return 0; 313 return 0;
309} 314}
@@ -428,13 +433,19 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
428}; 433};
429 434
430static struct rpc_version nfs_cb_version4 = { 435static struct rpc_version nfs_cb_version4 = {
436/*
437 * Note on the callback rpc program version number: despite language in rfc
438 * 5661 section 18.36.3 requiring servers to use 4 in this field, the
439 * official xdr descriptions for both 4.0 and 4.1 specify version 1, and
440 * in practice that appears to be what implementations use. The section
441 * 18.36.3 language is expected to be fixed in an erratum.
442 */
431 .number = 1, 443 .number = 1,
432 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures), 444 .nrprocs = ARRAY_SIZE(nfs4_cb_procedures),
433 .procs = nfs4_cb_procedures 445 .procs = nfs4_cb_procedures
434}; 446};
435 447
436static struct rpc_version * nfs_cb_version[] = { 448static struct rpc_version * nfs_cb_version[] = {
437 NULL,
438 &nfs_cb_version4, 449 &nfs_cb_version4,
439}; 450};
440 451
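
With the NULL placeholder gone from nfs_cb_version[], the .version field in the client create args becomes 0: as I read the sunrpc client code, that field indexes the program's version array, while the number the comment above discusses (1) is what actually goes on the wire. A toy illustration of keeping the two separate, with invented names:

    #include <stdio.h>
    #include <stddef.h>

    struct rpc_ver_sketch { unsigned number; };   /* wire version number */

    static struct rpc_ver_sketch cb_v4 = { .number = 1 };

    /* Slot 0 is the only entry; no NULL placeholder needed. */
    static struct rpc_ver_sketch *cb_versions[] = { &cb_v4 };

    int main(void)
    {
        size_t idx = 0;        /* what .version selects: an array slot */
        printf("slot %zu encodes wire version %u\n",
               idx, cb_versions[idx]->number);
        return 0;
    }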
@@ -456,15 +467,14 @@ static struct rpc_program cb_program = {
456 467
457static int max_cb_time(void) 468static int max_cb_time(void)
458{ 469{
459 return max(NFSD_LEASE_TIME/10, (time_t)1) * HZ; 470 return max(nfsd4_lease/10, (time_t)1) * HZ;
460} 471}
461 472
462/* Reference counting, callback cleanup, etc., all look racy as heck. 473/* Reference counting, callback cleanup, etc., all look racy as heck.
463 * And why is cb_set an atomic? */ 474 * And why is cl_cb_set an atomic? */
464 475
465int setup_callback_client(struct nfs4_client *clp) 476int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
466{ 477{
467 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
468 struct rpc_timeout timeparms = { 478 struct rpc_timeout timeparms = {
469 .to_initval = max_cb_time(), 479 .to_initval = max_cb_time(),
470 .to_retries = 0, 480 .to_retries = 0,
@@ -476,7 +486,7 @@ int setup_callback_client(struct nfs4_client *clp)
476 .timeout = &timeparms, 486 .timeout = &timeparms,
477 .program = &cb_program, 487 .program = &cb_program,
478 .prognumber = cb->cb_prog, 488 .prognumber = cb->cb_prog,
479 .version = nfs_cb_version[1]->number, 489 .version = 0,
480 .authflavor = clp->cl_flavor, 490 .authflavor = clp->cl_flavor,
481 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 491 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
482 .client_name = clp->cl_principal, 492 .client_name = clp->cl_principal,
@@ -486,7 +496,7 @@ int setup_callback_client(struct nfs4_client *clp)
486 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 496 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
487 return -EINVAL; 497 return -EINVAL;
488 if (cb->cb_minorversion) { 498 if (cb->cb_minorversion) {
489 args.bc_xprt = clp->cl_cb_xprt; 499 args.bc_xprt = cb->cb_xprt;
490 args.protocol = XPRT_TRANSPORT_BC_TCP; 500 args.protocol = XPRT_TRANSPORT_BC_TCP;
491 } 501 }
492 /* Create RPC client */ 502 /* Create RPC client */
@@ -496,7 +506,7 @@ int setup_callback_client(struct nfs4_client *clp)
496 PTR_ERR(client)); 506 PTR_ERR(client));
497 return PTR_ERR(client); 507 return PTR_ERR(client);
498 } 508 }
499 cb->cb_client = client; 509 nfsd4_set_callback_client(clp, client);
500 return 0; 510 return 0;
501 511
502} 512}
@@ -514,8 +524,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
514 if (task->tk_status) 524 if (task->tk_status)
515 warn_no_callback_path(clp, task->tk_status); 525 warn_no_callback_path(clp, task->tk_status);
516 else 526 else
517 atomic_set(&clp->cl_cb_conn.cb_set, 1); 527 atomic_set(&clp->cl_cb_set, 1);
518 put_nfs4_client(clp);
519} 528}
520 529
521static const struct rpc_call_ops nfsd4_cb_probe_ops = { 530static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -537,7 +546,6 @@ int set_callback_cred(void)
537 546
538void do_probe_callback(struct nfs4_client *clp) 547void do_probe_callback(struct nfs4_client *clp)
539{ 548{
540 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
541 struct rpc_message msg = { 549 struct rpc_message msg = {
542 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 550 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
543 .rpc_argp = clp, 551 .rpc_argp = clp,
@@ -545,34 +553,27 @@ void do_probe_callback(struct nfs4_client *clp)
545 }; 553 };
546 int status; 554 int status;
547 555
548 status = rpc_call_async(cb->cb_client, &msg, 556 status = rpc_call_async(clp->cl_cb_client, &msg,
549 RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 557 RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
550 &nfsd4_cb_probe_ops, (void *)clp); 558 &nfsd4_cb_probe_ops, (void *)clp);
551 if (status) { 559 if (status)
552 warn_no_callback_path(clp, status); 560 warn_no_callback_path(clp, status);
553 put_nfs4_client(clp);
554 }
555} 561}
556 562
557/* 563/*
558 * Set up the callback client and put an NFSPROC4_CB_NULL on the wire... 564
559 */ 565 */
560void 566void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *cb)
561nfsd4_probe_callback(struct nfs4_client *clp)
562{ 567{
563 int status; 568 int status;
564 569
565 BUG_ON(atomic_read(&clp->cl_cb_conn.cb_set)); 570 BUG_ON(atomic_read(&clp->cl_cb_set));
566 571
567 status = setup_callback_client(clp); 572 status = setup_callback_client(clp, cb);
568 if (status) { 573 if (status) {
569 warn_no_callback_path(clp, status); 574 warn_no_callback_path(clp, status);
570 return; 575 return;
571 } 576 }
572
573 /* the task holds a reference to the nfs4_client struct */
574 atomic_inc(&clp->cl_count);
575
576 do_probe_callback(clp); 577 do_probe_callback(clp);
577} 578}
578 579
@@ -658,47 +659,57 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
658 } 659 }
659} 660}
660 661
662
661static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 663static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
662{ 664{
663 struct nfs4_delegation *dp = calldata; 665 struct nfs4_delegation *dp = calldata;
664 struct nfs4_client *clp = dp->dl_client; 666 struct nfs4_client *clp = dp->dl_client;
667 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
665 668
666 nfsd4_cb_done(task, calldata); 669 nfsd4_cb_done(task, calldata);
667 670
671 if (current_rpc_client == NULL) {
672 /* We're shutting down; give up. */
673 /* XXX: err, or is it ok just to fall through
674 * and rpc_restart_call? */
675 return;
676 }
677
668 switch (task->tk_status) { 678 switch (task->tk_status) {
669 case -EIO: 679 case 0:
670 /* Network partition? */ 680 return;
671 atomic_set(&clp->cl_cb_conn.cb_set, 0);
672 warn_no_callback_path(clp, task->tk_status);
673 case -EBADHANDLE: 681 case -EBADHANDLE:
674 case -NFS4ERR_BAD_STATEID: 682 case -NFS4ERR_BAD_STATEID:
675 /* Race: client probably got cb_recall 683 /* Race: client probably got cb_recall
676 * before open reply granting delegation */ 684 * before open reply granting delegation */
677 break; 685 break;
678 default: 686 default:
679 /* success, or error we can't handle */ 687 /* Network partition? */
680 goto done; 688 atomic_set(&clp->cl_cb_set, 0);
689 warn_no_callback_path(clp, task->tk_status);
690 if (current_rpc_client != task->tk_client) {
691 /* queue a callback on the new connection: */
692 atomic_inc(&dp->dl_count);
693 nfsd4_cb_recall(dp);
694 return;
695 }
681 } 696 }
682 if (dp->dl_retries--) { 697 if (dp->dl_retries--) {
683 rpc_delay(task, 2*HZ); 698 rpc_delay(task, 2*HZ);
684 task->tk_status = 0; 699 task->tk_status = 0;
685 rpc_restart_call(task); 700 rpc_restart_call_prepare(task);
686 return; 701 return;
687 } else { 702 } else {
688 atomic_set(&clp->cl_cb_conn.cb_set, 0); 703 atomic_set(&clp->cl_cb_set, 0);
689 warn_no_callback_path(clp, task->tk_status); 704 warn_no_callback_path(clp, task->tk_status);
690 } 705 }
691done:
692 kfree(task->tk_msg.rpc_argp);
693} 706}
694 707
695static void nfsd4_cb_recall_release(void *calldata) 708static void nfsd4_cb_recall_release(void *calldata)
696{ 709{
697 struct nfs4_delegation *dp = calldata; 710 struct nfs4_delegation *dp = calldata;
698 struct nfs4_client *clp = dp->dl_client;
699 711
700 nfs4_put_delegation(dp); 712 nfs4_put_delegation(dp);
701 put_nfs4_client(clp);
702} 713}
703 714
704static const struct rpc_call_ops nfsd4_cb_recall_ops = { 715static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -707,33 +718,73 @@ static const struct rpc_call_ops nfsd4_cb_recall_ops = {
707 .rpc_release = nfsd4_cb_recall_release, 718 .rpc_release = nfsd4_cb_recall_release,
708}; 719};
709 720
721static struct workqueue_struct *callback_wq;
722
723int nfsd4_create_callback_queue(void)
724{
725 callback_wq = create_singlethread_workqueue("nfsd4_callbacks");
726 if (!callback_wq)
727 return -ENOMEM;
728 return 0;
729}
730
731void nfsd4_destroy_callback_queue(void)
732{
733 destroy_workqueue(callback_wq);
734}
735
736/* must be called under the state lock */
737void nfsd4_set_callback_client(struct nfs4_client *clp, struct rpc_clnt *new)
738{
739 struct rpc_clnt *old = clp->cl_cb_client;
740
741 clp->cl_cb_client = new;
742 /*
743 * After this, any work that saw the old value of cl_cb_client will
744 * be gone:
745 */
746 flush_workqueue(callback_wq);
747 /* So we can safely shut it down: */
748 if (old)
749 rpc_shutdown_client(old);
750}
751
710/* 752/*
711 * called with dp->dl_count inc'ed. 753 * called with dp->dl_count inc'ed.
712 */ 754 */
713void 755static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
714nfsd4_cb_recall(struct nfs4_delegation *dp)
715{ 756{
716 struct nfs4_client *clp = dp->dl_client; 757 struct nfs4_client *clp = dp->dl_client;
717 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 758 struct rpc_clnt *clnt = clp->cl_cb_client;
718 struct nfs4_rpc_args *args; 759 struct nfs4_rpc_args *args = &dp->dl_recall.cb_args;
719 struct rpc_message msg = { 760 struct rpc_message msg = {
720 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
721 .rpc_cred = callback_cred 762 .rpc_cred = callback_cred
722 }; 763 };
723 int status = -ENOMEM;
724 764
725 args = kzalloc(sizeof(*args), GFP_KERNEL); 765 if (clnt == NULL) {
726 if (!args) 766 nfs4_put_delegation(dp);
727 goto out; 767 return; /* Client is shutting down; give up. */
768 }
769
728 args->args_op = dp; 770 args->args_op = dp;
729 msg.rpc_argp = args; 771 msg.rpc_argp = args;
730 dp->dl_retries = 1; 772 dp->dl_retries = 1;
731 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
732 &nfsd4_cb_recall_ops, dp); 774}
733out: 775
734 if (status) { 776void nfsd4_do_callback_rpc(struct work_struct *w)
735 kfree(args); 777{
736 put_nfs4_client(clp); 778 /* XXX: for now, just send off delegation recall. */
737 nfs4_put_delegation(dp); 779 /* In future, generalize to handle any sort of callback. */
738 } 780 struct nfsd4_callback *c = container_of(w, struct nfsd4_callback, cb_work);
781 struct nfs4_delegation *dp = container_of(c, struct nfs4_delegation, dl_recall);
782
783 _nfsd4_cb_recall(dp);
784}
785
786
787void nfsd4_cb_recall(struct nfs4_delegation *dp)
788{
789 queue_work(callback_wq, &dp->dl_recall.cb_work);
739} 790}
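
The structural change in nfs4callback.c: recalls are no longer issued directly but queued as nfsd4_callback work items on a dedicated single-threaded workqueue, and nfsd4_set_callback_client() flushes that queue before shutting the old RPC client down — after the flush, no queued work can still hold the stale pointer. A stripped-down kernel-module-style sketch of the pattern (names invented, not the nfsd ones):

    #include <linux/workqueue.h>
    #include <linux/errno.h>
    #include <linux/kernel.h>

    static struct workqueue_struct *cb_wq;

    struct my_callback {
        struct work_struct cb_work;
        /* ... per-callback state (delegation, client, ...) ... */
    };

    static void my_do_callback(struct work_struct *w)
    {
        struct my_callback *c = container_of(w, struct my_callback, cb_work);

        /* issue the RPC with whatever client pointer is current */
        (void)c;
    }

    static int my_cb_init(void)
    {
        cb_wq = create_singlethread_workqueue("my_callbacks");
        return cb_wq ? 0 : -ENOMEM;
    }

    static void my_cb_queue(struct my_callback *c)
    {
        INIT_WORK(&c->cb_work, my_do_callback);
        queue_work(cb_wq, &c->cb_work);
    }

    static void my_cb_swap_client(void)
    {
        /* Any work that saw the old client has finished after this: */
        flush_workqueue(cb_wq);
        /* ... now safe to rpc_shutdown_client(old) ... */
    }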
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2ab9e8501bfe..59ec449b0c7f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -969,20 +969,36 @@ static struct nfsd4_operation nfsd4_ops[];
969static const char *nfsd4_op_name(unsigned opnum); 969static const char *nfsd4_op_name(unsigned opnum);
970 970
971/* 971/*
972 * Enforce NFSv4.1 COMPOUND ordering rules. 972 * Enforce NFSv4.1 COMPOUND ordering rules:
973 * 973 *
974 * TODO: 974 * Also note, enforced elsewhere:
975 * - enforce NFS4ERR_NOT_ONLY_OP, 975 * - SEQUENCE other than as first op results in
976 * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. 976 * NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
977 * - BIND_CONN_TO_SESSION must be the only op in its compound
978 * (Will be enforced in nfsd4_bind_conn_to_session().)
979 * - DESTROY_SESSION must be the final operation in a compound, if
980 * sessionid's in SEQUENCE and DESTROY_SESSION are the same.
981 * (Enforced in nfsd4_destroy_session().)
977 */ 982 */
978static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) 983static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
979{ 984{
980 if (args->minorversion && args->opcnt > 0) { 985 struct nfsd4_op *op = &args->ops[0];
981 struct nfsd4_op *op = &args->ops[0]; 986
982 return (op->status == nfserr_op_illegal) || 987 /* These ordering requirements don't apply to NFSv4.0: */
983 (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); 988 if (args->minorversion == 0)
984 } 989 return nfs_ok;
985 return true; 990 /* This is weird, but OK, not our problem: */
991 if (args->opcnt == 0)
992 return nfs_ok;
993 if (op->status == nfserr_op_illegal)
994 return nfs_ok;
995 if (!(nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP))
996 return nfserr_op_not_in_session;
997 if (op->opnum == OP_SEQUENCE)
998 return nfs_ok;
999 if (args->opcnt != 1)
1000 return nfserr_not_only_op;
1001 return nfs_ok;
986} 1002}
987 1003
988/* 1004/*
@@ -1012,6 +1028,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1012 resp->rqstp = rqstp; 1028 resp->rqstp = rqstp;
1013 resp->cstate.minorversion = args->minorversion; 1029 resp->cstate.minorversion = args->minorversion;
1014 resp->cstate.replay_owner = NULL; 1030 resp->cstate.replay_owner = NULL;
1031 resp->cstate.session = NULL;
1015 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); 1032 fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1016 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); 1033 fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1017 /* Use the deferral mechanism only for NFSv4.0 compounds */ 1034 /* Use the deferral mechanism only for NFSv4.0 compounds */
@@ -1024,13 +1041,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1024 if (args->minorversion > nfsd_supported_minorversion) 1041 if (args->minorversion > nfsd_supported_minorversion)
1025 goto out; 1042 goto out;
1026 1043
1027 if (!nfs41_op_ordering_ok(args)) { 1044 status = nfs41_check_op_ordering(args);
1045 if (status) {
1028 op = &args->ops[0]; 1046 op = &args->ops[0];
1029 op->status = nfserr_sequence_pos; 1047 op->status = status;
1030 goto encode_op; 1048 goto encode_op;
1031 } 1049 }
1032 1050
1033 status = nfs_ok;
1034 while (!status && resp->opcnt < args->opcnt) { 1051 while (!status && resp->opcnt < args->opcnt) {
1035 op = &args->ops[resp->opcnt++]; 1052 op = &args->ops[resp->opcnt++];
1036 1053
@@ -1295,6 +1312,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
1295 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, 1312 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1296 .op_name = "OP_SEQUENCE", 1313 .op_name = "OP_SEQUENCE",
1297 }, 1314 },
1315 [OP_RECLAIM_COMPLETE] = {
1316 .op_func = (nfsd4op_func)nfsd4_reclaim_complete,
1317 .op_flags = ALLOWED_WITHOUT_FH,
1318 .op_name = "OP_RECLAIM_COMPLETE",
1319 },
1298}; 1320};
1299 1321
1300static const char *nfsd4_op_name(unsigned opnum) 1322static const char *nfsd4_op_name(unsigned opnum)
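
nfs41_check_op_ordering() turns the old yes/no check into distinct NFSv4.1 errors: a first op that may not start a compound draws NFS4ERR_OP_NOT_IN_SESSION, and an allowed-but-non-SEQUENCE first op must be the compound's only op or it draws NFS4ERR_NOT_ONLY_OP. The ladder, as a standalone sketch:

    #include <stdbool.h>

    enum status { OK, ERR_OP_NOT_IN_SESSION, ERR_NOT_ONLY_OP };

    struct op { int opnum; bool illegal; bool allowed_as_first; };

    #define OP_SEQUENCE 53            /* the NFSv4.1 SEQUENCE opcode */

    /* Mirrors the v4.1 branch of nfs41_check_op_ordering(). */
    static enum status check_ordering(const struct op *ops, int opcnt)
    {
        if (opcnt == 0)
            return OK;                   /* "weird, but not our problem" */
        if (ops[0].illegal)
            return OK;                   /* rejected per-op later anyway */
        if (!ops[0].allowed_as_first)
            return ERR_OP_NOT_IN_SESSION;
        if (ops[0].opnum == OP_SEQUENCE)
            return OK;                   /* normal sessions compound */
        if (opcnt != 1)
            return ERR_NOT_ONLY_OP;      /* e.g. a lone EXCHANGE_ID-class op */
        return OK;
    }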
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
45 45
46/* Globals */ 46/* Globals */
47static struct path rec_dir; 47static struct file *rec_file;
48static int rec_dir_init = 0;
49 48
50static int 49static int
51nfs4_save_creds(const struct cred **original_creds) 50nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
117 return status; 116 return status;
118} 117}
119 118
120static void
121nfsd4_sync_rec_dir(void)
122{
123 vfs_fsync(NULL, rec_dir.dentry, 0);
124}
125
126int 119int
127nfsd4_create_clid_dir(struct nfs4_client *clp) 120nfsd4_create_clid_dir(struct nfs4_client *clp)
128{ 121{
129 const struct cred *original_cred; 122 const struct cred *original_cred;
130 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
131 struct dentry *dentry; 124 struct dentry *dir, *dentry;
132 int status; 125 int status;
133 126
134 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
135 128
136 if (!rec_dir_init || clp->cl_firststate) 129 if (!rec_file || clp->cl_firststate)
137 return 0; 130 return 0;
138 131
139 status = nfs4_save_creds(&original_cred); 132 status = nfs4_save_creds(&original_cred);
140 if (status < 0) 133 if (status < 0)
141 return status; 134 return status;
142 135
136 dir = rec_file->f_path.dentry;
143 /* lock the parent */ 137 /* lock the parent */
144 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 138 mutex_lock(&dir->d_inode->i_mutex);
145 139
146 dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); 140 dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
147 if (IS_ERR(dentry)) { 141 if (IS_ERR(dentry)) {
148 status = PTR_ERR(dentry); 142 status = PTR_ERR(dentry);
149 goto out_unlock; 143 goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
153 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); 147 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
154 goto out_put; 148 goto out_put;
155 } 149 }
156 status = mnt_want_write(rec_dir.mnt); 150 status = mnt_want_write(rec_file->f_path.mnt);
157 if (status) 151 if (status)
158 goto out_put; 152 goto out_put;
159 status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); 153 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
160 mnt_drop_write(rec_dir.mnt); 154 mnt_drop_write(rec_file->f_path.mnt);
161out_put: 155out_put:
162 dput(dentry); 156 dput(dentry);
163out_unlock: 157out_unlock:
164 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 158 mutex_unlock(&dir->d_inode->i_mutex);
165 if (status == 0) { 159 if (status == 0) {
166 clp->cl_firststate = 1; 160 clp->cl_firststate = 1;
167 nfsd4_sync_rec_dir(); 161 vfs_fsync(rec_file, 0);
168 } 162 }
169 nfs4_reset_creds(original_cred); 163 nfs4_reset_creds(original_cred);
170 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); 164 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
206 struct dentry *dentry; 200 struct dentry *dentry;
207 int status; 201 int status;
208 202
209 if (!rec_dir_init) 203 if (!rec_file)
210 return 0; 204 return 0;
211 205
212 status = nfs4_save_creds(&original_cred); 206 status = nfs4_save_creds(&original_cred);
213 if (status < 0) 207 if (status < 0)
214 return status; 208 return status;
215 209
216 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 210 filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
217 current_cred()); 211 current_cred());
218 status = PTR_ERR(filp); 212 status = PTR_ERR(filp);
219 if (IS_ERR(filp)) 213 if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
250static int 244static int
251nfsd4_unlink_clid_dir(char *name, int namlen) 245nfsd4_unlink_clid_dir(char *name, int namlen)
252{ 246{
253 struct dentry *dentry; 247 struct dentry *dir, *dentry;
254 int status; 248 int status;
255 249
256 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 250 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
257 251
258 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 252 dir = rec_file->f_path.dentry;
259 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 253 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
254 dentry = lookup_one_len(name, dir, namlen);
260 if (IS_ERR(dentry)) { 255 if (IS_ERR(dentry)) {
261 status = PTR_ERR(dentry); 256 status = PTR_ERR(dentry);
262 goto out_unlock; 257 goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
264 status = -ENOENT; 259 status = -ENOENT;
265 if (!dentry->d_inode) 260 if (!dentry->d_inode)
266 goto out; 261 goto out;
267 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); 262 status = vfs_rmdir(dir->d_inode, dentry);
268out: 263out:
269 dput(dentry); 264 dput(dentry);
270out_unlock: 265out_unlock:
271 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 266 mutex_unlock(&dir->d_inode->i_mutex);
272 return status; 267 return status;
273} 268}
274 269
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
278 const struct cred *original_cred; 273 const struct cred *original_cred;
279 int status; 274 int status;
280 275
281 if (!rec_dir_init || !clp->cl_firststate) 276 if (!rec_file || !clp->cl_firststate)
282 return; 277 return;
283 278
284 status = mnt_want_write(rec_dir.mnt); 279 status = mnt_want_write(rec_file->f_path.mnt);
285 if (status) 280 if (status)
286 goto out; 281 goto out;
287 clp->cl_firststate = 0; 282 clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
293 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 288 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
294 nfs4_reset_creds(original_cred); 289 nfs4_reset_creds(original_cred);
295 if (status == 0) 290 if (status == 0)
296 nfsd4_sync_rec_dir(); 291 vfs_fsync(rec_file, 0);
297 mnt_drop_write(rec_dir.mnt); 292 mnt_drop_write(rec_file->f_path.mnt);
298out: 293out:
299 if (status) 294 if (status)
300 printk("NFSD: Failed to remove expired client state directory" 295 printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
323nfsd4_recdir_purge_old(void) { 318nfsd4_recdir_purge_old(void) {
324 int status; 319 int status;
325 320
326 if (!rec_dir_init) 321 if (!rec_file)
327 return; 322 return;
328 status = mnt_want_write(rec_dir.mnt); 323 status = mnt_want_write(rec_file->f_path.mnt);
329 if (status) 324 if (status)
330 goto out; 325 goto out;
331 status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); 326 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
332 if (status == 0) 327 if (status == 0)
333 nfsd4_sync_rec_dir(); 328 vfs_fsync(rec_file, 0);
334 mnt_drop_write(rec_dir.mnt); 329 mnt_drop_write(rec_file->f_path.mnt);
335out: 330out:
336 if (status) 331 if (status)
337 printk("nfsd4: failed to purge old clients from recovery" 332 printk("nfsd4: failed to purge old clients from recovery"
338 " directory %s\n", rec_dir.dentry->d_name.name); 333 " directory %s\n", rec_file->f_path.dentry->d_name.name);
339} 334}
340 335
341static int 336static int
@@ -355,10 +350,13 @@ int
355nfsd4_recdir_load(void) { 350nfsd4_recdir_load(void) {
356 int status; 351 int status;
357 352
358 status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); 353 if (!rec_file)
354 return 0;
355
356 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
359 if (status) 357 if (status)
360 printk("nfsd4: failed loading clients from recovery" 358 printk("nfsd4: failed loading clients from recovery"
361 " directory %s\n", rec_dir.dentry->d_name.name); 359 " directory %s\n", rec_file->f_path.dentry->d_name.name);
362 return status; 360 return status;
363} 361}
364 362
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
375 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 373 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
376 rec_dirname); 374 rec_dirname);
377 375
378 BUG_ON(rec_dir_init); 376 BUG_ON(rec_file);
379 377
380 status = nfs4_save_creds(&original_cred); 378 status = nfs4_save_creds(&original_cred);
381 if (status < 0) { 379 if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
385 return; 383 return;
386 } 384 }
387 385
388 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 386 rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
389 &rec_dir); 387 if (IS_ERR(rec_file)) {
390 if (status)
391 printk("NFSD: unable to find recovery directory %s\n", 388 printk("NFSD: unable to find recovery directory %s\n",
392 rec_dirname); 389 rec_dirname);
390 rec_file = NULL;
391 }
393 392
394 if (!status)
395 rec_dir_init = 1;
396 nfs4_reset_creds(original_cred); 393 nfs4_reset_creds(original_cred);
397} 394}
398 395
399void 396void
400nfsd4_shutdown_recdir(void) 397nfsd4_shutdown_recdir(void)
401{ 398{
402 if (!rec_dir_init) 399 if (!rec_file)
403 return; 400 return;
404 rec_dir_init = 0; 401 fput(rec_file);
405 path_put(&rec_dir); 402 rec_file = NULL;
406} 403}
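
nfs4recover.c collapses the struct path plus rec_dir_init flag into one struct file held open for the server's lifetime: its non-NULLness is the initialization test, and the sync call becomes vfs_fsync(rec_file, 0) — note the old call passed a dentry, while the new vfs_fsync() signature takes only the file and a datasync flag. The same shape in plain POSIX terms:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rec_fd = -1;         /* -1 doubles as "not initialized" */

    static void recdir_init(const char *dirname)
    {
        rec_fd = open(dirname, O_RDONLY | O_DIRECTORY);
        if (rec_fd < 0)
            perror("recovery directory");
    }

    static void recdir_commit(void)
    {
        if (rec_fd >= 0)
            fsync(rec_fd);          /* persist created/removed entries */
    }

    static void recdir_shutdown(void)
    {
        if (rec_fd >= 0) {
            close(rec_fd);
            rec_fd = -1;
        }
    }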
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6a8fedaa4f55..cf0d2ffb3c84 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -45,13 +45,12 @@
45#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
46 46
47/* Globals */ 47/* Globals */
48static time_t lease_time = 90; /* default lease time */ 48time_t nfsd4_lease = 90; /* default lease time */
49static time_t user_lease_time = 90; 49time_t nfsd4_grace = 90;
50static time_t boot_time; 50static time_t boot_time;
51static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
52static u32 current_fileid = 1; 52static u32 current_fileid = 1;
53static u32 current_delegid = 1; 53static u32 current_delegid = 1;
54static u32 nfs4_init;
55static stateid_t zerostateid; /* bits all 0 */ 54static stateid_t zerostateid; /* bits all 0 */
56static stateid_t onestateid; /* bits all 1 */ 55static stateid_t onestateid; /* bits all 1 */
57static u64 current_sessionid = 1; 56static u64 current_sessionid = 1;
@@ -163,6 +162,46 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
163static struct list_head file_hashtbl[FILE_HASH_SIZE]; 162static struct list_head file_hashtbl[FILE_HASH_SIZE];
164static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; 163static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
165 164
165static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
166{
167 BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
168 atomic_inc(&fp->fi_access[oflag]);
169}
170
171static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
172{
173 if (oflag == O_RDWR) {
174 __nfs4_file_get_access(fp, O_RDONLY);
175 __nfs4_file_get_access(fp, O_WRONLY);
176 } else
177 __nfs4_file_get_access(fp, oflag);
178}
179
180static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
181{
182 if (fp->fi_fds[oflag]) {
183 fput(fp->fi_fds[oflag]);
184 fp->fi_fds[oflag] = NULL;
185 }
186}
187
188static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
189{
190 if (atomic_dec_and_test(&fp->fi_access[oflag])) {
191 nfs4_file_put_fd(fp, O_RDWR);
192 nfs4_file_put_fd(fp, oflag);
193 }
194}
195
196static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
197{
198 if (oflag == O_RDWR) {
199 __nfs4_file_put_access(fp, O_RDONLY);
200 __nfs4_file_put_access(fp, O_WRONLY);
201 } else
202 __nfs4_file_put_access(fp, oflag);
203}
204
166static struct nfs4_delegation * 205static struct nfs4_delegation *
167alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) 206alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
168{ 207{
@@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
171 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; 210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
172 211
173 dprintk("NFSD alloc_init_deleg\n"); 212 dprintk("NFSD alloc_init_deleg\n");
213 /*
214 * Major work on the lease subsystem (for example, to support
215 * callbacks on stat) will be required before we can support
216 * write delegations properly.
217 */
218 if (type != NFS4_OPEN_DELEGATE_READ)
219 return NULL;
174 if (fp->fi_had_conflict) 220 if (fp->fi_had_conflict)
175 return NULL; 221 return NULL;
176 if (num_delegations > max_delegations) 222 if (num_delegations > max_delegations)
@@ -185,12 +231,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
185 dp->dl_client = clp; 231 dp->dl_client = clp;
186 get_nfs4_file(fp); 232 get_nfs4_file(fp);
187 dp->dl_file = fp; 233 dp->dl_file = fp;
234 nfs4_file_get_access(fp, O_RDONLY);
188 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
189 get_file(stp->st_vfs_file);
190 dp->dl_vfs_file = stp->st_vfs_file;
191 dp->dl_type = type; 236 dp->dl_type = type;
192 dp->dl_ident = cb->cb_ident; 237 dp->dl_ident = cb->cb_ident;
193 dp->dl_stateid.si_boot = get_seconds(); 238 dp->dl_stateid.si_boot = boot_time;
194 dp->dl_stateid.si_stateownerid = current_delegid++; 239 dp->dl_stateid.si_stateownerid = current_delegid++;
195 dp->dl_stateid.si_fileid = 0; 240 dp->dl_stateid.si_fileid = 0;
196 dp->dl_stateid.si_generation = 0; 241 dp->dl_stateid.si_generation = 0;
@@ -199,6 +244,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
199 atomic_set(&dp->dl_count, 1); 244 atomic_set(&dp->dl_count, 1);
200 list_add(&dp->dl_perfile, &fp->fi_delegations); 245 list_add(&dp->dl_perfile, &fp->fi_delegations);
201 list_add(&dp->dl_perclnt, &clp->cl_delegations); 246 list_add(&dp->dl_perclnt, &clp->cl_delegations);
247 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
202 return dp; 248 return dp;
203} 249}
204 250
@@ -221,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
221static void 267static void
222nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
223{ 269{
224 struct file *filp = dp->dl_vfs_file; 270 struct file *filp = find_readable_file(dp->dl_file);
225 271
226 dprintk("NFSD: close_delegation dp %p\n",dp); 272 dprintk("NFSD: close_delegation dp %p\n",dp);
227 dp->dl_vfs_file = NULL;
228 /* The following nfsd_close may not actually close the file,
229 * but we want to remove the lease in any case. */
230 if (dp->dl_flock) 273 if (dp->dl_flock)
231 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 274 vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
232 nfsd_close(filp); 275 nfs4_file_put_access(dp->dl_file, O_RDONLY);
233} 276}
234 277
235/* Called under the state lock. */ 278/* Called under the state lock. */
@@ -249,6 +292,9 @@ unhash_delegation(struct nfs4_delegation *dp)
249 * SETCLIENTID state 292 * SETCLIENTID state
250 */ 293 */
251 294
295/* client_lock protects the client lru list and session hash table */
296static DEFINE_SPINLOCK(client_lock);
297
252/* Hash tables for nfs4_clientid state */ 298/* Hash tables for nfs4_clientid state */
253#define CLIENT_HASH_BITS 4 299#define CLIENT_HASH_BITS 4
254#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 300#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
@@ -298,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
298 344
299static void release_lock_stateid(struct nfs4_stateid *stp) 345static void release_lock_stateid(struct nfs4_stateid *stp)
300{ 346{
347 struct file *file;
348
301 unhash_generic_stateid(stp); 349 unhash_generic_stateid(stp);
302 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); 350 file = find_any_file(stp->st_file);
351 if (file)
352 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
303 free_generic_stateid(stp); 353 free_generic_stateid(stp);
304} 354}
305 355
@@ -337,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
337 } 387 }
338} 388}
339 389
390/*
391 * We store the NONE, READ, WRITE, and BOTH bits separately in the
392 * st_{access,deny}_bmap field of the stateid, in order to track not
393 * only what share bits are currently in force, but also what
394 * combinations of share bits previous opens have used. This allows us
395 * to enforce the recommendation of rfc 3530 14.2.19 that the server
396 * return an error if the client attempts to downgrade to a combination
397 * of share bits not explicable by closing some of its previous opens.
398 *
399 * XXX: This enforcement is actually incomplete, since we don't keep
400 * track of access/deny bit combinations; so, e.g., we allow:
401 *
402 * OPEN allow read, deny write
403 * OPEN allow both, deny none
404 * DOWNGRADE allow read, deny none
405 *
406 * which we should reject.
407 */
408static void
409set_access(unsigned int *access, unsigned long bmap) {
410 int i;
411
412 *access = 0;
413 for (i = 1; i < 4; i++) {
414 if (test_bit(i, &bmap))
415 *access |= i;
416 }
417}
418
419static void
420set_deny(unsigned int *deny, unsigned long bmap) {
421 int i;
422
423 *deny = 0;
424 for (i = 0; i < 4; i++) {
425 if (test_bit(i, &bmap))
426 *deny |= i;
427 }
428}
429
430static int
431test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
432 unsigned int access, deny;
433
434 set_access(&access, stp->st_access_bmap);
435 set_deny(&deny, stp->st_deny_bmap);
436 if ((access & open->op_share_deny) || (deny & open->op_share_access))
437 return 0;
438 return 1;
439}
440
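
The long comment above is the key to these helpers: NFSv4 share_access values (READ = 1, WRITE = 2, BOTH = 3) serve directly as bit indices in st_access_bmap, so the bitmap remembers every combination earlier opens used, and set_access()/set_deny() fold the set bit indices back into one mask for test_share()'s conflict check. A self-contained model:

    #include <stdio.h>

    #define SHARE_READ  1     /* the real NFS4_SHARE_ACCESS_* values */
    #define SHARE_WRITE 2
    #define SHARE_BOTH  3

    /* Record that some open used exactly this access combination. */
    static void remember(unsigned long *bmap, unsigned combo)
    {
        *bmap |= 1UL << combo;          /* bit index == combination value */
    }

    /* Fold the remembered combinations back into a single mask. */
    static unsigned effective(unsigned long bmap)
    {
        unsigned mask = 0;
        int i;

        for (i = 1; i < 4; i++)
            if (bmap & (1UL << i))
                mask |= i;
        return mask;
    }

    int main(void)
    {
        unsigned long bmap = 0;

        remember(&bmap, SHARE_READ);
        remember(&bmap, SHARE_BOTH);
        printf("effective access: %u\n", effective(bmap));  /* prints 3 */
        return 0;
    }

A new open then conflicts exactly when its deny bits intersect this effective access mask (or its access bits intersect the effective deny mask), which is what test_share() computes.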
441static int nfs4_access_to_omode(u32 access)
442{
443 switch (access & NFS4_SHARE_ACCESS_BOTH) {
444 case NFS4_SHARE_ACCESS_READ:
445 return O_RDONLY;
446 case NFS4_SHARE_ACCESS_WRITE:
447 return O_WRONLY;
448 case NFS4_SHARE_ACCESS_BOTH:
449 return O_RDWR;
450 }
451 BUG();
452}
453
454static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
455{
456 unsigned int access;
457
458 set_access(&access, stp->st_access_bmap);
459 return nfs4_access_to_omode(access);
460}
461
340static void release_open_stateid(struct nfs4_stateid *stp) 462static void release_open_stateid(struct nfs4_stateid *stp)
341{ 463{
464 int oflag = nfs4_access_bmap_to_omode(stp);
465
342 unhash_generic_stateid(stp); 466 unhash_generic_stateid(stp);
343 release_stateid_lockowners(stp); 467 release_stateid_lockowners(stp);
344 nfsd_close(stp->st_vfs_file); 468 nfs4_file_put_access(stp->st_file, oflag);
345 free_generic_stateid(stp); 469 free_generic_stateid(stp);
346} 470}
347 471
@@ -367,7 +491,6 @@ static void release_openowner(struct nfs4_stateowner *sop)
 	nfs4_put_stateowner(sop);
 }
 
-static DEFINE_SPINLOCK(sessionid_lock);
 #define SESSION_HASH_SIZE	512
 static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
 
@@ -454,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
 	spin_unlock(&nfsd_drc_lock);
 
 	if (fchan->maxreqs == 0)
-		return nfserr_serverfault;
+		return nfserr_jukebox;
 
 	fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
 	return 0;
@@ -539,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 	BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
 			+ sizeof(struct nfsd4_session) > PAGE_SIZE);
 
-	status = nfserr_serverfault;
+	status = nfserr_jukebox;
 	/* allocate struct nfsd4_session and slot table pointers in one piece */
 	slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
 	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
@@ -565,10 +688,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
 
 	new->se_flags = cses->flags;
 	kref_init(&new->se_ref);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
 	status = nfs_ok;
 out:
@@ -579,7 +702,7 @@ out_free:
 	goto out;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static struct nfsd4_session *
 find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 {
@@ -588,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
-	dprintk("%s: idx is %d\n", __func__, idx);
 	/* Search in the appropriate list */
 	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
-		dump_sessionid("list traversal", &elem->se_sessionid);
 		if (!memcmp(elem->se_sessionid.data, sessionid->data,
 			    NFS4_MAX_SESSIONID_LEN)) {
 			return elem;
@@ -602,7 +723,7 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
 	return NULL;
 }
 
-/* caller must hold sessionid_lock */
+/* caller must hold client_lock */
 static void
 unhash_session(struct nfsd4_session *ses)
 {
@@ -610,15 +731,6 @@ unhash_session(struct nfsd4_session *ses)
 	list_del(&ses->se_perclnt);
 }
 
-static void
-release_session(struct nfsd4_session *ses)
-{
-	spin_lock(&sessionid_lock);
-	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
-	nfsd4_put_session(ses);
-}
-
 void
 free_session(struct kref *kref)
 {
@@ -634,9 +746,18 @@ free_session(struct kref *kref)
 	kfree(ses);
 }
 
+/* must be called under the client_lock */
 static inline void
-renew_client(struct nfs4_client *clp)
+renew_client_locked(struct nfs4_client *clp)
 {
+	if (is_client_expired(clp)) {
+		dprintk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
 	/*
 	 * Move client to the end to the LRU list.
 	 */
@@ -647,6 +768,14 @@ renew_client(struct nfs4_client *clp)
 	clp->cl_time = get_seconds();
 }
 
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	spin_lock(&client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid)
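Splitting renew_client() into a locked worker plus a locking wrapper is the usual `_locked` pattern: callers that already hold client_lock call the worker directly, everyone else goes through the wrapper. A minimal pthread model of the same split (types and field names here are illustrative only, not the kernel's):

	#include <pthread.h>
	#include <time.h>

	static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

	struct client { int expired; time_t cl_time; };

	/* Caller must hold client_lock; expired clients are never renewed. */
	static void renew_client_locked(struct client *clp)
	{
		if (clp->expired)
			return;
		clp->cl_time = time(NULL);
	}

	/* Convenience wrapper for callers that do not hold the lock. */
	static void renew_client(struct client *clp)
	{
		pthread_mutex_lock(&client_lock);
		renew_client_locked(clp);
		pthread_mutex_unlock(&client_lock);
	}

	int main(void)
	{
		struct client c = { 0, 0 };
		renew_client(&c);
		return c.cl_time == 0;
	}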
@@ -680,27 +809,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	return clp;
 }
 
-static void
-shutdown_callback_client(struct nfs4_client *clp)
-{
-	struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
-
-	if (clnt) {
-		/*
-		 * Callback threads take a reference on the client, so there
-		 * should be no outstanding callbacks at this point.
-		 */
-		clp->cl_cb_conn.cb_client = NULL;
-		rpc_shutdown_client(clnt);
-	}
-}
-
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	shutdown_callback_client(clp);
-	if (clp->cl_cb_xprt)
-		svc_xprt_put(clp->cl_cb_xprt);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -709,10 +820,33 @@ free_client(struct nfs4_client *clp)
 }
 
 void
-put_nfs4_client(struct nfs4_client *clp)
+release_session_client(struct nfsd4_session *session)
 {
-	if (atomic_dec_and_test(&clp->cl_count))
+	struct nfs4_client *clp = session->se_client;
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+		return;
+	if (is_client_expired(clp)) {
 		free_client(clp);
+		session->se_client = NULL;
+	} else
+		renew_client_locked(clp);
+	spin_unlock(&client_lock);
+}
+
+/* must be called under the client_lock */
+static inline void
+unhash_client_locked(struct nfs4_client *clp)
+{
+	mark_client_expired(clp);
+	list_del(&clp->cl_lru);
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				se_perclnt);
+		unhash_session(ses);
+		nfsd4_put_session(ses);
+	}
 }
 
 static void
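release_session_client() relies on atomic_dec_and_lock(): the refcount is dropped without the lock unless it would hit zero, in which case the function returns with client_lock held, so the final free-versus-renew decision is made atomically. A user-space model of that idiom, assuming C11 atomics (a sketch, not the kernel implementation):

	#include <pthread.h>
	#include <stdatomic.h>

	/* Returns 1 with *lk held iff the count dropped to zero. */
	static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lk)
	{
		int old = atomic_load(cnt);

		/* Fast path: we are clearly not the last reference. */
		while (old > 1)
			if (atomic_compare_exchange_weak(cnt, &old, old - 1))
				return 0;
		/* Slow path: decide under the lock. */
		pthread_mutex_lock(lk);
		if (atomic_fetch_sub(cnt, 1) == 1)
			return 1;
		pthread_mutex_unlock(lk);
		return 0;
	}

	int main(void)
	{
		static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
		atomic_int cnt = 1;

		if (dec_and_lock(&cnt, &lk))
			pthread_mutex_unlock(&lk);	/* last-reference path */
		return 0;
	}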
@@ -722,9 +856,6 @@ expire_client(struct nfs4_client *clp)
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
 
-	dprintk("NFSD: expire_client cl_count %d\n",
-			atomic_read(&clp->cl_count));
-
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
@@ -740,20 +871,20 @@ expire_client(struct nfs4_client *clp)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_openowner(sop);
 	}
-	while (!list_empty(&clp->cl_sessions)) {
-		struct nfsd4_session *ses;
-		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
-				 se_perclnt);
-		release_session(ses);
-	}
-	put_nfs4_client(clp);
+	nfsd4_set_callback_client(clp, NULL);
+	if (clp->cl_cb_conn.cb_xprt)
+		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+	list_del(&clp->cl_idhash);
+	list_del(&clp->cl_strhash);
+	spin_lock(&client_lock);
+	unhash_client_locked(clp);
+	if (atomic_read(&clp->cl_refcount) == 0)
+		free_client(clp);
+	spin_unlock(&client_lock);
 }
 
 static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -839,14 +970,15 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	}
 
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
-	atomic_set(&clp->cl_count, 1);
-	atomic_set(&clp->cl_cb_conn.cb_set, 0);
+	atomic_set(&clp->cl_refcount, 0);
+	atomic_set(&clp->cl_cb_set, 0);
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
+	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
@@ -877,8 +1009,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
 	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
-	list_add_tail(&clp->cl_lru, &client_lru);
-	clp->cl_time = get_seconds();
+	renew_client(clp);
 }
 
 static void
@@ -888,10 +1019,9 @@ move_to_confirmed(struct nfs4_client *clp)
 	unsigned int strhashval;
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_del_init(&clp->cl_strhash);
 	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
 
@@ -1207,7 +1337,7 @@ out_new:
 	/* Normal case */
 	new = create_client(exid->clname, dname, rqstp, &verf);
 	if (new == NULL) {
-		status = nfserr_serverfault;
+		status = nfserr_jukebox;
 		goto out;
 	}
 
@@ -1327,15 +1457,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		cs_slot->sl_seqid++; /* from 0 to 1 */
 		move_to_confirmed(unconf);
 
-		/*
-		 * We do not support RDMA or persistent sessions
-		 */
-		cr_ses->flags &= ~SESSION4_PERSIST;
-		cr_ses->flags &= ~SESSION4_RDMA;
-
 		if (cr_ses->flags & SESSION4_BACK_CHAN) {
-			unconf->cl_cb_xprt = rqstp->rq_xprt;
-			svc_xprt_get(unconf->cl_cb_xprt);
+			unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+			svc_xprt_get(rqstp->rq_xprt);
 			rpc_copy_addr(
 				(struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
 				sa);
@@ -1344,7 +1468,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 				cstate->minorversion;
 			unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
 			unconf->cl_cb_seq_nr = 1;
-			nfsd4_probe_callback(unconf);
+			nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
 		}
 		conf = unconf;
 	} else {
@@ -1352,6 +1476,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/*
+	 * We do not support RDMA or persistent sessions
+	 */
+	cr_ses->flags &= ~SESSION4_PERSIST;
+	cr_ses->flags &= ~SESSION4_RDMA;
+
 	status = alloc_init_session(rqstp, conf, cr_ses);
 	if (status)
 		goto out;
@@ -1369,6 +1499,21 @@ out:
 	return status;
 }
 
+static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+	return argp->opcnt == resp->opcnt;
+}
+
+static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
+{
+	if (!session)
+		return 0;
+	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
+}
+
 __be32
 nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_compound_state *cstate,
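The two predicates above encode the rule that a DESTROY_SESSION aimed at the very session the compound arrived on is only allowed as the compound's final operation. A toy model of the check (hypothetical names, just to show the shape of the test):

	/* DESTROY_SESSION of the current session must be the last op. */
	static int destroy_session_allowed(int decoded_ops, int processed_ops,
					   int targets_current_session)
	{
		int is_last_op = (decoded_ops == processed_ops);

		return !targets_current_session || is_last_op;
	}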
@@ -1384,19 +1529,25 @@ nfsd4_destroy_session(struct svc_rqst *r,
 	 * - Do we need to clear any callback info from previous session?
 	 */
 
+	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
+		if (!nfsd4_last_compound_op(r))
+			return nfserr_not_only_op;
+	}
 	dump_sessionid(__func__, &sessionid->sessionid);
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
 	if (!ses) {
-		spin_unlock(&sessionid_lock);
+		spin_unlock(&client_lock);
 		goto out;
 	}
 
 	unhash_session(ses);
-	spin_unlock(&sessionid_lock);
+	spin_unlock(&client_lock);
 
+	nfs4_lock_state();
 	/* wait for callbacks */
-	shutdown_callback_client(ses->se_client);
+	nfsd4_set_callback_client(ses->se_client, NULL);
+	nfs4_unlock_state();
 	nfsd4_put_session(ses);
 	status = nfs_ok;
 out:
@@ -1417,7 +1568,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (resp->opcnt != 1)
 		return nfserr_sequence_pos;
 
-	spin_lock(&sessionid_lock);
+	spin_lock(&client_lock);
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid);
 	if (!session)
@@ -1456,23 +1607,47 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->slot = slot;
 	cstate->session = session;
 
-	/* Hold a session reference until done processing the compound:
-	 * nfsd4_put_session called only if the cstate slot is set.
-	 */
-	nfsd4_get_session(session);
 out:
-	spin_unlock(&sessionid_lock);
-	/* Renew the clientid on success and on replay */
+	/* Hold a session reference until done processing the compound. */
 	if (cstate->session) {
-		nfs4_lock_state();
-		renew_client(session->se_client);
-		nfs4_unlock_state();
+		nfsd4_get_session(cstate->session);
+		atomic_inc(&session->se_client->cl_refcount);
 	}
+	spin_unlock(&client_lock);
 	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
 }
 
 __be32
+nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
+{
+	if (rc->rca_one_fs) {
+		if (!cstate->current_fh.fh_dentry)
+			return nfserr_nofilehandle;
+		/*
+		 * We don't take advantage of the rca_one_fs case.
+		 * That's OK, it's optional, we can safely ignore it.
+		 */
+		return nfs_ok;
+	}
+	nfs4_lock_state();
+	if (is_client_expired(cstate->session->se_client)) {
+		nfs4_unlock_state();
+		/*
+		 * The following error isn't really legal.
+		 * But we only get here if the client just explicitly
+		 * destroyed the client.  Surely it no longer cares what
+		 * error it gets back on an operation for the dead
+		 * client.
+		 */
+		return nfserr_stale_clientid;
+	}
+	nfsd4_create_clid_dir(cstate->session->se_client);
+	nfs4_unlock_state();
+	return nfs_ok;
+}
+
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
 {
@@ -1631,9 +1806,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
 			status = nfserr_clid_inuse;
 		else {
-			/* XXX: We just turn off callbacks until we can handle
-			  * change request correctly. */
-			atomic_set(&conf->cl_cb_conn.cb_set, 0);
+			atomic_set(&conf->cl_cb_set, 0);
+			nfsd4_probe_callback(conf, &unconf->cl_cb_conn);
 			expire_client(unconf);
 			status = nfs_ok;
 
@@ -1667,7 +1841,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			}
 			move_to_confirmed(unconf);
 			conf = unconf;
-			nfsd4_probe_callback(conf);
+			nfsd4_probe_callback(conf, &conf->cl_cb_conn);
 			status = nfs_ok;
 		}
 	} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
@@ -1700,12 +1874,14 @@ alloc_init_file(struct inode *ino)
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
-		spin_lock(&recall_lock);
-		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
-		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+		memset(fp->fi_access, 0, sizeof(fp->fi_access));
+		spin_lock(&recall_lock);
+		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		return fp;
 	}
 	return NULL;
@@ -1827,7 +2003,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
@@ -1914,57 +2090,6 @@ static inline int deny_valid(u32 x)
 }
 
 /*
- * We store the NONE, READ, WRITE, and BOTH bits separately in the
- * st_{access,deny}_bmap field of the stateid, in order to track not
- * only what share bits are currently in force, but also what
- * combinations of share bits previous opens have used.  This allows us
- * to enforce the recommendation of rfc 3530 14.2.19 that the server
- * return an error if the client attempt to downgrade to a combination
- * of share bits not explicable by closing some of its previous opens.
- *
- * XXX: This enforcement is actually incomplete, since we don't keep
- * track of access/deny bit combinations; so, e.g., we allow:
- *
- *	OPEN allow read, deny write
- *	OPEN allow both, deny none
- *	DOWNGRADE allow read, deny none
- *
- * which we should reject.
- */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
-	int i;
-
-	*access = 0;
-	for (i = 1; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*access |= i;
-	}
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-	int i;
-
-	*deny = 0;
-	for (i = 0; i < 4; i++) {
-		if (test_bit(i, &bmap))
-			*deny |= i ;
-	}
-}
-
-static int
-test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
-	unsigned int access, deny;
-
-	set_access(&access, stp->st_access_bmap);
-	set_deny(&deny, stp->st_deny_bmap);
-	if ((access & open->op_share_deny) || (deny & open->op_share_access))
-		return 0;
-	return 1;
-}
-
-/*
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
@@ -1995,14 +2120,12 @@ out:
 }
 
 static inline void
-nfs4_file_downgrade(struct file *filp, unsigned int share_access)
+nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 {
-	if (share_access & NFS4_SHARE_ACCESS_WRITE) {
-		drop_file_write_access(filp);
-		spin_lock(&filp->f_lock);
-		filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
-		spin_unlock(&filp->f_lock);
-	}
+	if (share_access & NFS4_SHARE_ACCESS_WRITE)
+		nfs4_file_put_access(fp, O_WRONLY);
+	if (share_access & NFS4_SHARE_ACCESS_READ)
+		nfs4_file_put_access(fp, O_RDONLY);
 }
 
 /*
@@ -2028,7 +2151,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	 * lock) we know the server hasn't removed the lease yet, we know
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
-	atomic_inc(&dp->dl_client->cl_count);
 
 	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
@@ -2199,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
+int share_access_to_flags(u32 share_access)
+{
+	share_access &= ~NFS4_SHARE_WANT_MASK;
+
+	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
+}
+
 static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 		 struct nfs4_delegation **dp)
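share_access_to_flags() has to mask off the NFSv4.1 "want" bits before classifying the open as read or write; otherwise a READ open that also carries a delegation preference would be misclassified. A sketch with illustrative constants (NFS4_SHARE_WANT_MASK is 0xFF00 in include/linux/nfs4.h to the best of my recollection; the RD_STATE/WR_STATE values below are placeholders):

	#include <assert.h>

	#define SHARE_ACCESS_READ	0x0001
	#define SHARE_WANT_MASK		0xFF00	/* assumed value, see above */
	enum { RD_STATE = 1, WR_STATE = 2 };	/* placeholder values */

	static int share_access_to_flags(unsigned int share_access)
	{
		share_access &= ~SHARE_WANT_MASK;
		return share_access == SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
	}

	int main(void)
	{
		/* READ plus a "want read delegation" hint is still RD_STATE. */
		assert(share_access_to_flags(0x0101) == RD_STATE);
		return 0;
	}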
@@ -2209,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
 	if (*dp == NULL)
 		goto out;
-	flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ?
-						RD_STATE : WR_STATE;
+	flags = share_access_to_flags(open->op_share_access);
 	status = nfs4_check_delegmode(*dp, flags);
 	if (status)
 		*dp = NULL;
@@ -2252,30 +2380,53 @@ nfs4_alloc_stateid(void)
 	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
 
+static inline int nfs4_access_to_access(u32 nfs4_access)
+{
+	int flags = 0;
+
+	if (nfs4_access & NFS4_SHARE_ACCESS_READ)
+		flags |= NFSD_MAY_READ;
+	if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
+		flags |= NFSD_MAY_WRITE;
+	return flags;
+}
+
+static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
+*fp, struct svc_fh *cur_fh, u32 nfs4_access)
+{
+	__be32 status;
+	int oflag = nfs4_access_to_omode(nfs4_access);
+	int access = nfs4_access_to_access(nfs4_access);
+
+	if (!fp->fi_fds[oflag]) {
+		status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
+			&fp->fi_fds[oflag]);
+		if (status == nfserr_dropit)
+			status = nfserr_jukebox;
+		if (status)
+			return status;
+	}
+	nfs4_file_get_access(fp, oflag);
+
+	return nfs_ok;
+}
+
 static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
-		struct nfs4_delegation *dp,
-		struct svc_fh *cur_fh, int flags)
+		struct nfs4_file *fp, struct svc_fh *cur_fh,
+		struct nfsd4_open *open)
 {
 	struct nfs4_stateid *stp;
+	__be32 status;
 
 	stp = nfs4_alloc_stateid();
 	if (stp == NULL)
 		return nfserr_resource;
 
-	if (dp) {
-		get_file(dp->dl_vfs_file);
-		stp->st_vfs_file = dp->dl_vfs_file;
-	} else {
-		__be32 status;
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
-				&stp->st_vfs_file);
-		if (status) {
-			if (status == nfserr_dropit)
-				status = nfserr_jukebox;
-			kmem_cache_free(stateid_slab, stp);
-			return status;
-		}
+	status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
+	if (status) {
+		kmem_cache_free(stateid_slab, stp);
+		return status;
 	}
 	*stpp = stp;
 	return 0;
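nfs4_get_vfs_file() is the heart of the new file-descriptor sharing: each nfs4_file keeps at most one struct file per open mode in fi_fds[], opened on first use and refcounted per mode thereafter, so multiple stateids for the same file no longer pin one struct file each. A user-space sketch of that cache-on-first-use shape (not the kernel API; the slot indexing works because O_RDONLY/O_WRONLY/O_RDWR are 0/1/2):

	#include <fcntl.h>
	#include <unistd.h>

	struct file_cache {
		int fd[3];	/* one slot per O_RDONLY/O_WRONLY/O_RDWR */
		int refs[3];	/* per-mode reference counts */
	};

	static int get_cached_fd(struct file_cache *fc, const char *path, int oflag)
	{
		int slot = oflag & O_ACCMODE;

		if (fc->refs[slot] == 0) {	/* first opener of this mode */
			fc->fd[slot] = open(path, oflag);
			if (fc->fd[slot] < 0)
				return -1;
		}
		fc->refs[slot]++;		/* later openers share the fd */
		return fc->fd[slot];
	}

	static void put_cached_fd(struct file_cache *fc, int oflag)
	{
		int slot = oflag & O_ACCMODE;

		if (--fc->refs[slot] == 0)	/* last user closes the fd */
			close(fc->fd[slot]);
	}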
@@ -2297,35 +2448,28 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 }
 
 static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-	struct file *filp = stp->st_vfs_file;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	unsigned int share_access, new_writer;
+	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	bool new_access;
 	__be32 status;
 
-	set_access(&share_access, stp->st_access_bmap);
-	new_writer = (~share_access) & open->op_share_access
-			& NFS4_SHARE_ACCESS_WRITE;
-
-	if (new_writer) {
-		int err = get_write_access(inode);
-		if (err)
-			return nfserrno(err);
-		err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
-		if (err)
-			return nfserrno(err);
-		file_take_write(filp);
+	new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+	if (new_access) {
+		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
+		if (status)
+			return status;
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
-		if (new_writer)
-			put_write_access(inode);
+		if (new_access) {
+			int oflag = nfs4_access_to_omode(new_access);
+			nfs4_file_put_access(fp, oflag);
+		}
 		return status;
 	}
 	/* remember the open */
-	filp->f_mode |= open->op_share_access;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
 	return nfs_ok;
@@ -2347,7 +2491,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_cb_conn *cb = &sop->so_client->cl_cb_conn;
+	int cb_up = atomic_read(&sop->so_client->cl_cb_set);
 	struct file_lock fl, *flp = &fl;
 	int status, flag = 0;
 
@@ -2355,7 +2499,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	open->op_recall = 0;
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_PREVIOUS:
-			if (!atomic_read(&cb->cb_set))
+			if (!cb_up)
 				open->op_recall = 1;
 			flag = open->op_delegate_type;
 			if (flag == NFS4_OPEN_DELEGATE_NONE)
@@ -2366,7 +2510,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 			 * had the chance to reclaim theirs.... */
 			if (locks_in_grace())
 				goto out;
-			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+			if (!cb_up || !sop->so_confirmed)
 				goto out;
 			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 				flag = NFS4_OPEN_DELEGATE_WRITE;
@@ -2388,13 +2532,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 	fl.fl_end = OFFSET_MAX;
 	fl.fl_owner =  (fl_owner_t)dp;
-	fl.fl_file = stp->st_vfs_file;
+	fl.fl_file = find_readable_file(stp->st_file);
+	BUG_ON(!fl.fl_file);
 	fl.fl_pid = current->tgid;
 
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callbacks fl_mylease and fl_change are used
 	 */
-	if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) {
+	if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		unhash_delegation(dp);
 		flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2458,18 +2603,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	 */
 	if (stp) {
 		/* Stateid was found, this is an OPEN upgrade */
-		status = nfs4_upgrade_open(rqstp, current_fh, stp, open);
+		status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
 		if (status)
 			goto out;
 		update_stateid(&stp->st_stateid);
 	} else {
-		/* Stateid was not found, this is a new OPEN */
-		int flags = 0;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-			flags |= NFSD_MAY_READ;
-		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags |= NFSD_MAY_WRITE;
-		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
+		status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
 		if (status)
 			goto out;
 		init_stateid(stp, fp, open);
@@ -2483,10 +2622,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
-	if (nfsd4_has_session(&resp->cstate)) {
+	if (nfsd4_has_session(&resp->cstate))
 		open->op_stateowner->so_confirmed = 1;
-		nfsd4_create_clid_dir(open->op_stateowner->so_client);
-	}
 
 	/*
 	 * Attempt to hand out a delegation.  No error return, because the
@@ -2537,7 +2674,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	renew_client(clp);
 	status = nfserr_cb_path_down;
 	if (!list_empty(&clp->cl_delegations)
-			&& !atomic_read(&clp->cl_cb_conn.cb_set))
+			&& !atomic_read(&clp->cl_cb_set))
 		goto out;
 	status = nfs_ok;
 out:
@@ -2554,6 +2691,12 @@ nfsd4_end_grace(void)
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
 	locks_end_grace(&nfsd4_manager);
+	/*
+	 * Now that every NFSv4 client has had the chance to recover and
+	 * to see the (possibly new, possibly shorter) lease time, we
+	 * can safely set the next grace time to the current lease time:
+	 */
+	nfsd4_grace = nfsd4_lease;
 }
 
 static time_t
@@ -2563,15 +2706,17 @@ nfs4_laundromat(void)
 	struct nfs4_stateowner *sop;
 	struct nfs4_delegation *dp;
 	struct list_head *pos, *next, reaplist;
-	time_t cutoff = get_seconds() - NFSD_LEASE_TIME;
-	time_t t, clientid_val = NFSD_LEASE_TIME;
-	time_t u, test_val = NFSD_LEASE_TIME;
+	time_t cutoff = get_seconds() - nfsd4_lease;
+	time_t t, clientid_val = nfsd4_lease;
+	time_t u, test_val = nfsd4_lease;
 
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
 	if (locks_in_grace())
 		nfsd4_end_grace();
+	INIT_LIST_HEAD(&reaplist);
+	spin_lock(&client_lock);
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -2580,12 +2725,22 @@ nfs4_laundromat(void)
 			clientid_val = t;
 			break;
 		}
+		if (atomic_read(&clp->cl_refcount)) {
+			dprintk("NFSD: client in use (clientid %08x)\n",
+				clp->cl_clientid.cl_id);
+			continue;
+		}
+		unhash_client_locked(clp);
+		list_add(&clp->cl_lru, &reaplist);
+	}
+	spin_unlock(&client_lock);
+	list_for_each_safe(pos, next, &reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
 		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
-	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
 	list_for_each_safe(pos, next, &del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
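The laundromat now reaps clients in two phases: candidates are unhashed onto a private reaplist while client_lock is held, then expired after the lock is dropped, since expire_client() does work that must not run under a spinlock. A compact user-space model of the pattern (illustrative types only):

	#include <pthread.h>
	#include <stdlib.h>

	struct client { struct client *next; int refs; };

	static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

	static void reap_expired(struct client **lru)
	{
		struct client *reaplist = NULL, *clp, **pp;

		pthread_mutex_lock(&client_lock);
		for (pp = lru; (clp = *pp) != NULL; ) {
			if (clp->refs) {	/* still in use: keep it */
				pp = &clp->next;
				continue;
			}
			*pp = clp->next;	/* phase 1: unhash under lock */
			clp->next = reaplist;
			reaplist = clp;
		}
		pthread_mutex_unlock(&client_lock);

		while ((clp = reaplist) != NULL) {	/* phase 2: free unlocked */
			reaplist = clp->next;
			free(clp);
		}
	}

	int main(void)
	{
		struct client *lru = calloc(1, sizeof(*lru));	/* refs == 0 */
		reap_expired(&lru);
		return lru != NULL;
	}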
@@ -2605,7 +2760,7 @@ nfs4_laundromat(void)
 		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
-	test_val = NFSD_LEASE_TIME;
+	test_val = nfsd4_lease;
 	list_for_each_safe(pos, next, &close_lru) {
 		sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
 		if (time_after((unsigned long)sop->so_time, (unsigned long)cutoff)) {
@@ -2655,45 +2810,17 @@ search_close_lru(u32 st_id, int flags)
 static inline int
 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
 {
-	return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode;
+	return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
 }
 
 static int
 STALE_STATEID(stateid_t *stateid)
 {
-	if (time_after((unsigned long)boot_time,
-			(unsigned long)stateid->si_boot)) {
-		dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static int
-EXPIRED_STATEID(stateid_t *stateid)
-{
-	if (time_before((unsigned long)boot_time,
-			((unsigned long)stateid->si_boot)) &&
-	    time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-		dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-			STATEID_VAL(stateid));
-		return 1;
-	}
-	return 0;
-}
-
-static __be32
-stateid_error_map(stateid_t *stateid)
-{
-	if (STALE_STATEID(stateid))
-		return nfserr_stale_stateid;
-	if (EXPIRED_STATEID(stateid))
-		return nfserr_expired;
-
-	dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
+	if (stateid->si_boot == boot_time)
+		return 0;
+	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
 		STATEID_VAL(stateid));
-	return nfserr_bad_stateid;
+	return 1;
 }
 
 static inline int
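With si_boot now always stamped with boot_time at creation, staleness reduces to an equality test, and the old EXPIRED_STATEID()/stateid_error_map() machinery, which tried to infer expiry from a per-stateid timestamp, can go away. The whole check is effectively this one-liner (a sketch of the logic, not the kernel source):

	/* A stateid is stale iff issued by a different server instance. */
	static int stateid_is_stale(unsigned long si_boot, unsigned long boot_time)
	{
		return si_boot != boot_time;
	}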
@@ -2716,6 +2843,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
 	__be32 status = nfserr_openmode;
 
+	/* For lock stateid's, we test the parent open, not the lock: */
+	if (stp->st_openstp)
+		stp = stp->st_openstp;
 	if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
 		goto out;
 	if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2817,10 +2947,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 	status = nfserr_bad_stateid;
 	if (is_delegation_stateid(stateid)) {
 		dp = find_delegation_stateid(ino, stateid);
-		if (!dp) {
-			status = stateid_error_map(stateid);
+		if (!dp)
 			goto out;
-		}
 		status = check_stateid_generation(stateid, &dp->dl_stateid,
 						  flags);
 		if (status)
@@ -2830,13 +2958,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			goto out;
 		renew_client(dp->dl_client);
 		if (filpp)
-			*filpp = dp->dl_vfs_file;
+			*filpp = find_readable_file(dp->dl_file);
+		BUG_ON(!*filpp);
 	} else { /* open or lock stateid */
 		stp = find_stateid(stateid, flags);
-		if (!stp) {
-			status = stateid_error_map(stateid);
+		if (!stp)
 			goto out;
-		}
 		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
@@ -2849,8 +2976,12 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
-		if (filpp)
-			*filpp = stp->st_vfs_file;
+		if (filpp) {
+			if (flags & RD_STATE)
+				*filpp = find_readable_file(stp->st_file);
+			else
+				*filpp = find_writeable_file(stp->st_file);
+		}
 	}
 	status = nfs_ok;
 out:
@@ -2908,7 +3039,7 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 		 */
 		sop = search_close_lru(stateid->si_stateownerid, flags);
 		if (sop == NULL)
-			return stateid_error_map(stateid);
+			return nfserr_bad_stateid;
 		*sopp = sop;
 		goto check_replay;
 	}
@@ -3086,8 +3217,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		goto out;
 	}
 	set_access(&share_access, stp->st_access_bmap);
-	nfs4_file_downgrade(stp->st_vfs_file,
-			    share_access & ~od->od_share_access);
+	nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
 
 	reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
 	reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3175,10 +3305,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!is_delegation_stateid(stateid))
 		goto out;
 	dp = find_delegation_stateid(inode, stateid);
-	if (!dp) {
-		status = stateid_error_map(stateid);
+	if (!dp)
 		goto out;
-	}
 	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
 	if (status)
 		goto out;
@@ -3308,11 +3436,9 @@ static inline void
 nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 {
 	struct nfs4_stateowner *sop;
-	unsigned int hval;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
 		sop = (struct nfs4_stateowner *) fl->fl_owner;
-		hval = lockownerid_hashval(sop->so_id);
 		kref_get(&sop->so_ref);
 		deny->ld_sop = sop;
 		deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3404,12 +3530,10 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
-	stp->st_stateid.si_boot = get_seconds();
+	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
-	stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
-	stp->st_access_bmap = open_stp->st_access_bmap;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
 
@@ -3434,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_stateowner *open_sop = NULL;
 	struct nfs4_stateowner *lock_sop = NULL;
 	struct nfs4_stateid *lock_stp;
-	struct file *filp;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
@@ -3464,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * lock stateid.
 		 */
 		struct nfs4_stateid *open_stp = NULL;
-		struct nfs4_file *fp;
 
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -3507,9 +3631,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		lock_sop = lock->lk_replay_owner;
+		fp = lock_stp->st_file;
 	}
 	/* lock->lk_replay_owner and lock_stp have been created or found */
-	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
 	if (locks_in_grace() && !lock->lk_reclaim)
@@ -3522,11 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (find_readable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+				filp = find_readable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_RDLCK;
 			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (find_writeable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+				filp = find_writeable_file(lock_stp->st_file);
+			}
			file_lock.fl_type = F_WRLCK;
 			cmd = F_SETLK;
 		break;
@@ -3534,6 +3666,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			status = nfserr_inval;
 		goto out;
 	}
+	if (!filp) {
+		status = nfserr_openmode;
+		goto out;
+	}
 	file_lock.fl_owner = (fl_owner_t)lock_sop;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_file = filp;
@@ -3702,7 +3838,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&locku->lu_stateowner, &stp, NULL)))
 		goto out;
 
-	filp = stp->st_vfs_file;
+	filp = find_any_file(stp->st_file);
+	if (!filp) {
+		status = nfserr_lock_range;
+		goto out;
+	}
 	BUG_ON(!filp);
 	locks_init_lock(&file_lock);
 	file_lock.fl_type = F_UNLCK;
@@ -3749,10 +3889,10 @@ out_nfserr:
  *	0: no locks held by lockowner
  */
 static int
-check_for_locks(struct file *filp, struct nfs4_stateowner *lowner)
+check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
 {
 	struct file_lock **flpp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
 	lock_kernel();
@@ -3803,7 +3943,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 			continue;
 		list_for_each_entry(stp, &sop->so_stateids,
 				st_perstateowner) {
-			if (check_for_locks(stp->st_vfs_file, sop))
+			if (check_for_locks(stp->st_file, sop))
 				goto out;
 			/* Note: so_perclient unused for lockowners,
 			 * so it's OK to fool with here. */
@@ -3976,12 +4116,6 @@ nfsd4_load_reboot_recovery_data(void)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
 
-unsigned long
-get_nfs4_grace_period(void)
-{
-	return max(user_lease_time, lease_time) * HZ;
-}
-
 /*
  * Since the lifetime of a delegation isn't limited to that of an open, a
  * client may quite reasonably hang on to a delegation as long as it has
@@ -4008,41 +4142,34 @@ set_max_delegations(void)
 static int
 __nfs4_state_start(void)
 {
-	unsigned long grace_time;
+	int ret;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs4_grace_period();
-	lease_time = user_lease_time;
 	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
-	       grace_time/HZ);
+	       nfsd4_grace);
+	ret = set_callback_cred();
+	if (ret)
+		return -ENOMEM;
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	if (laundry_wq == NULL)
 		return -ENOMEM;
-	queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
+	ret = nfsd4_create_callback_queue();
+	if (ret)
+		goto out_free_laundry;
+	queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
 	set_max_delegations();
-	return set_callback_cred();
+	return 0;
+out_free_laundry:
+	destroy_workqueue(laundry_wq);
+	return ret;
 }
 
 int
 nfs4_state_start(void)
 {
-	int ret;
-
-	if (nfs4_init)
-		return 0;
 	nfsd4_load_reboot_recovery_data();
-	ret = __nfs4_state_start();
-	if (ret)
-		return ret;
-	nfs4_init = 1;
-	return 0;
-}
-
-time_t
-nfs4_lease_time(void)
-{
-	return lease_time;
+	return __nfs4_state_start();
 }
 
 static void
@@ -4077,7 +4204,6 @@ __nfs4_state_shutdown(void)
 	}
 
 	nfsd4_shutdown_recdir();
-	nfs4_init = 0;
 }
 
 void
@@ -4090,6 +4216,7 @@ nfs4_state_shutdown(void)
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
 	nfs4_unlock_state();
+	nfsd4_destroy_callback_queue();
 }
 
 /*
@@ -4128,21 +4255,3 @@ nfs4_recoverydir(void)
 {
 	return user_recovery_dirname;
 }
-
-/*
- * Called when leasetime is changed.
- *
- * The only way the protocol gives us to handle on-the-fly lease changes is to
- * simulate a reboot.  Instead of doing that, we just wait till the next time
- * we start to register any changes in lease time.  If the administrator
- * really wants to change the lease time *now*, they can go ahead and bring
- * nfsd down and then back up again after changing the lease time.
- *
- * user_lease_time is protected by nfsd_mutex since it's only really accessed
- * when nfsd is starting
- */
-void
-nfs4_reset_lease(time_t leasetime)
-{
-	user_lease_time = leasetime;
-}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 34ccf815ea8a..1a468bbd330f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1234,6 +1234,16 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	DECODE_TAIL;
 }
 
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+{
+	DECODE_HEAD;
+
+	READ_BUF(4);
+	READ32(rc->rca_one_fs);
+
+	DECODE_TAIL;
+}
+
 static __be32
 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 {
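On the wire, RECLAIM_COMPLETE carries a single XDR bool (rca_one_fs), which is exactly what the READ_BUF(4)/READ32() pair above consumes. A user-space model of the same decode step (a hypothetical helper, not nfsd's XDR macros):

	#include <arpa/inet.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Decode one 4-byte big-endian XDR bool; -1 on short buffer. */
	static int decode_reclaim_complete(const unsigned char *buf, size_t len,
					   uint32_t *rca_one_fs)
	{
		uint32_t be;

		if (len < 4)
			return -1;	/* the READ_BUF(4) failure case */
		memcpy(&be, buf, 4);
		*rca_one_fs = ntohl(be);
		return 0;
	}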
@@ -1346,7 +1356,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
 	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
-	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_reclaim_complete,
 };
 
 struct nfsd4_minorversion_ops {
@@ -1746,6 +1756,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct nfs4_acl *acl = NULL;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
+	struct path path = {
+		.mnt	= exp->ex_path.mnt,
+		.dentry	= dentry,
+	};
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
 	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -1766,7 +1780,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			FATTR4_WORD0_MAXNAME)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		err = vfs_statfs(dentry, &statfs);
+		err = vfs_statfs(&path, &statfs);
 		if (err)
 			goto out_nfserr;
 	}
@@ -1900,7 +1914,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(NFSD_LEASE_TIME);
+		WRITE32(nfsd4_lease);
 	}
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
@@ -2620,7 +2634,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	}
 	read->rd_vlen = v;
 
-	nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
+	nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
 			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
@@ -3307,11 +3321,15 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
-	if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
-		nfsd4_store_cache_entry(resp);
-		dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
-		resp->cstate.slot->sl_inuse = false;
-		nfsd4_put_session(resp->cstate.session);
+	if (nfsd4_has_session(cs)) {
+		if (cs->status != nfserr_replay_cache) {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			cs->slot->sl_inuse = false;
+		}
+		/* Renew the clientid on success and on replay */
+		release_session_client(cs->session);
+		nfsd4_put_session(cs->session);
 	}
 	return 1;
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e3591073098f..b53b1d042f1f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -46,6 +46,7 @@ enum {
  */
 #ifdef CONFIG_NFSD_V4
 	NFSD_Leasetime,
+	NFSD_Gracetime,
 	NFSD_RecoveryDir,
 #endif
 };
@@ -70,6 +71,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size);
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
@@ -91,6 +93,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
+	[NFSD_Gracetime] = write_gracetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
 #endif
 };
@@ -946,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf)
 	if (err != 0)
 		return err;
 
-	err = lockd_up();
-	if (err != 0)
-		goto out;
-
 	err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
-	if (err < 0)
-		lockd_down();
+	if (err < 0) {
+		svc_destroy(nfsd_serv);
+		return err;
+	}
 
-out:
 	/* Decrease the count, but don't shut down the service */
 	nfsd_serv->sv_nrthreads--;
 	return err;
@@ -975,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf)
 	if (nfsd_serv != NULL)
 		len = svc_sock_names(nfsd_serv, buf,
 					SIMPLE_TRANSACTION_LIMIT, toclose);
-	if (len >= 0)
-		lockd_down();
-
 	kfree(toclose);
 	return len;
 }
@@ -995,7 +992,7 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1011,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf)
1011 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1008 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1012 if (err < 0 && err != -EAFNOSUPPORT) 1009 if (err < 0 && err != -EAFNOSUPPORT)
1013 goto out_close; 1010 goto out_close;
1011
1012 /* Decrease the count, but don't shut down the service */
1013 nfsd_serv->sv_nrthreads--;
1014 return 0; 1014 return 0;
1015out_close: 1015out_close:
1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); 1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1019,8 +1019,7 @@ out_close:
1019 svc_xprt_put(xprt); 1019 svc_xprt_put(xprt);
1020 } 1020 }
1021out_err: 1021out_err:
1022 /* Decrease the count, but don't shut down the service */ 1022 svc_destroy(nfsd_serv);
1023 nfsd_serv->sv_nrthreads--;
1024 return err; 1023 return err;
1025} 1024}
1026 1025
@@ -1037,7 +1036,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1037 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1036 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1038 return -EINVAL; 1037 return -EINVAL;
1039 1038
1040 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1039 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1041 return -EINVAL; 1040 return -EINVAL;
1042 1041
1043 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1042 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
@@ -1191,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1191 bsize = NFSSVC_MAXBLKSIZE; 1190 bsize = NFSSVC_MAXBLKSIZE;
1192 bsize &= ~(1024-1); 1191 bsize &= ~(1024-1);
1193 mutex_lock(&nfsd_mutex); 1192 mutex_lock(&nfsd_mutex);
1194 if (nfsd_serv && nfsd_serv->sv_nrthreads) { 1193 if (nfsd_serv) {
1195 mutex_unlock(&nfsd_mutex); 1194 mutex_unlock(&nfsd_mutex);
1196 return -EBUSY; 1195 return -EBUSY;
1197 } 1196 }
@@ -1204,29 +1203,45 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1204} 1203}
1205 1204
1206#ifdef CONFIG_NFSD_V4 1205#ifdef CONFIG_NFSD_V4
1207extern time_t nfs4_leasetime(void); 1206static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1208
1209static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1210{ 1207{
1211 /* if size > 10 seconds, call
1212 * nfs4_reset_lease() then write out the new lease (seconds) as reply
1213 */
1214 char *mesg = buf; 1208 char *mesg = buf;
1215 int rv, lease; 1209 int rv, i;
1216 1210
1217 if (size > 0) { 1211 if (size > 0) {
1218 if (nfsd_serv) 1212 if (nfsd_serv)
1219 return -EBUSY; 1213 return -EBUSY;
1220 rv = get_int(&mesg, &lease); 1214 rv = get_int(&mesg, &i);
1221 if (rv) 1215 if (rv)
1222 return rv; 1216 return rv;
1223 if (lease < 10 || lease > 3600) 1217 /*
1218 * Some sanity checking. We don't have a reason for
1219 * these particular numbers, but problems with the
1220 * extremes are:
1221 * - Too short: the briefest network outage may
1222 * cause clients to lose all their locks. Also,
1223 * the frequent polling may be wasteful.
1224 * - Too long: do you really want reboot recovery
1225 * to take more than an hour? Or to make other
1226 * clients wait an hour before being able to
1227 * revoke a dead client's locks?
1228 */
1229 if (i < 10 || i > 3600)
1224 return -EINVAL; 1230 return -EINVAL;
1225 nfs4_reset_lease(lease); 1231 *time = i;
1226 } 1232 }
1227 1233
1228 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", 1234 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
1229 nfs4_lease_time()); 1235}
1236
1237static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
1238{
1239 ssize_t rv;
1240
1241 mutex_lock(&nfsd_mutex);
1242 rv = __nfsd4_write_time(file, buf, size, time);
1243 mutex_unlock(&nfsd_mutex);
1244 return rv;
1230} 1245}
1231 1246
1232/** 1247/**
@@ -1252,12 +1267,22 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
1252 */ 1267 */
1253static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1268static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
1254{ 1269{
1255 ssize_t rv; 1270 return nfsd4_write_time(file, buf, size, &nfsd4_lease);
1271}
1256 1272
1257 mutex_lock(&nfsd_mutex); 1273/**
1258 rv = __write_leasetime(file, buf, size); 1274 * write_gracetime - Set or report current NFSv4 grace period time
1259 mutex_unlock(&nfsd_mutex); 1275 *
1260 return rv; 1276 * As above, but sets the time of the NFSv4 grace period.
1277 *
1278 * Note this should never be set to less than the *previous*
1279 * lease-period time, but we don't try to enforce this. (In the common
1280 * case (a new boot), we don't know what the previous lease time was
1281 * anyway.)
1282 */
1283static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
1284{
1285 return nfsd4_write_time(file, buf, size, &nfsd4_grace);
1261} 1286}
1262 1287
1263extern char *nfs4_recoverydir(void); 1288extern char *nfs4_recoverydir(void);
@@ -1281,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1281 return -EINVAL; 1306 return -EINVAL;
1282 1307
1283 status = nfs4_reset_recoverydir(recdir); 1308 status = nfs4_reset_recoverydir(recdir);
1309 if (status)
1310 return status;
1284 } 1311 }
1285 1312
1286 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", 1313 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
@@ -1351,6 +1378,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1351 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1378 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1352#ifdef CONFIG_NFSD_V4 1379#ifdef CONFIG_NFSD_V4
1353 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1380 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1381 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1354 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1382 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1355#endif 1383#endif
1356 /* last one */ {""} 1384 /* last one */ {""}
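
The nfsv4leasetime and new nfsv4gracetime entries are simple-transaction files: a write stores the request, the handler runs, and the handler's scnprintf() output becomes the reply returned by a subsequent read on the same descriptor. A minimal userspace sketch of driving them, assuming the nfsd filesystem is mounted at /proc/fs/nfsd and nfsd is not running (the handlers return -EBUSY otherwise); the helper name is invented for illustration:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* set_nfsd_time() is a hypothetical helper, not part of the patch */
	static void set_nfsd_time(const char *path, const char *secs)
	{
		char reply[64];
		ssize_t n;
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return;
		/* write the new value, then read the reply from the same fd */
		if (write(fd, secs, strlen(secs)) > 0 &&
		    (n = read(fd, reply, sizeof(reply) - 1)) > 0) {
			reply[n] = '\0';	/* reply is "<seconds>\n" */
			printf("%s -> %s", path, reply);
		}
		close(fd);
	}

	int main(void)
	{
		/* 90 is within the 10..3600 range the handler accepts */
		set_nfsd_time("/proc/fs/nfsd/nfsv4leasetime", "90");
		set_nfsd_time("/proc/fs/nfsd/nfsv4gracetime", "90");
		return 0;
	}
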
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e942a1aaac92..b76ac3a82e39 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -82,7 +82,6 @@ int nfs4_state_init(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
-time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 #else
@@ -90,7 +89,6 @@ static inline int nfs4_state_init(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
-static inline time_t nfs4_lease_time(void) { return 0; }
 static inline void nfs4_reset_lease(time_t leasetime) { }
 static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
 #endif
@@ -155,6 +153,7 @@ void nfsd_lockd_shutdown(void);
 #define	nfserr_bad_seqid	cpu_to_be32(NFSERR_BAD_SEQID)
 #define	nfserr_symlink		cpu_to_be32(NFSERR_SYMLINK)
 #define	nfserr_not_same		cpu_to_be32(NFSERR_NOT_SAME)
+#define	nfserr_lock_range	cpu_to_be32(NFSERR_LOCK_RANGE)
 #define	nfserr_restorefh	cpu_to_be32(NFSERR_RESTOREFH)
 #define	nfserr_attrnotsupp	cpu_to_be32(NFSERR_ATTRNOTSUPP)
 #define	nfserr_bad_xdr		cpu_to_be32(NFSERR_BAD_XDR)
@@ -229,6 +228,9 @@ extern struct timeval nfssvc_boot;
 
 #ifdef CONFIG_NFSD_V4
 
+extern time_t nfsd4_lease;
+extern time_t nfsd4_grace;
+
 /* before processing a COMPOUND operation, we have to check that there
  * is enough space in the buffer for XDR encode to succeed.  otherwise,
  * we might process an operation with side effects, and be unable to
@@ -247,7 +249,6 @@ extern struct timeval nfssvc_boot;
 #define COMPOUND_SLACK_SPACE		140    /* OP_GETFH */
 #define COMPOUND_ERR_SLACK_SPACE	12     /* OP_SETATTR */
 
-#define NFSD_LEASE_TIME			(nfs4_lease_time())
 #define NFSD_LAUNDROMAT_MINTIMEOUT	10           /* seconds */
 
 /*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
 static inline void
 fh_unlock(struct svc_fh *fhp)
 {
-	BUG_ON(!fhp->fh_dentry);
-
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
 		mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111ef..08e17264784b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
-	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
+	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 		 * gospel of sun micro
 		 */
 		if (type != S_IFREG) {
-			int	is_borc = 0;
 			if (type != S_IFBLK && type != S_IFCHR) {
 				rdev = 0;
 			} else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 				type = S_IFIFO;
 			} else {
 				/* Okay, char or block special */
-				is_borc = 1;
 				if (!rdev)
 					rdev = wanted;
 			}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 171699eb07c8..e2c43464f237 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -120,7 +120,7 @@ u32 nfsd_supported_minorversion;
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
-		return -1;
+		return 0;
 	switch(change) {
 	case NFSD_SET:
 		nfsd_versions[vers] = nfsd_version[vers];
@@ -180,15 +180,80 @@ int nfsd_nrthreads(void)
 	return rv;
 }
 
+static int nfsd_init_socks(int port)
+{
+	int error;
+	if (!list_empty(&nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
+					SVC_SOCK_DEFAULTS);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+static bool nfsd_up = false;
+
+static int nfsd_startup(unsigned short port, int nrservs)
+{
+	int ret;
+
+	if (nfsd_up)
+		return 0;
+	/*
+	 * Readahead param cache - will no-op if it already exists.
+	 * (Note therefore results will be suboptimal if number of
+	 * threads is modified after nfsd start.)
+	 */
+	ret = nfsd_racache_init(2*nrservs);
+	if (ret)
+		return ret;
+	ret = nfsd_init_socks(port);
+	if (ret)
+		goto out_racache;
+	ret = lockd_up();
+	if (ret)
+		goto out_racache;
+	ret = nfs4_state_start();
+	if (ret)
+		goto out_lockd;
+	nfsd_up = true;
+	return 0;
+out_lockd:
+	lockd_down();
+out_racache:
+	nfsd_racache_shutdown();
+	return ret;
+}
+
+static void nfsd_shutdown(void)
+{
+	/*
+	 * write_ports can create the server without actually starting
+	 * any threads--if we get shut down before any threads are
+	 * started, then nfsd_last_thread will be run before any of this
+	 * other initialization has been done.
+	 */
+	if (!nfsd_up)
+		return;
+	nfs4_state_shutdown();
+	lockd_down();
+	nfsd_racache_shutdown();
+	nfsd_up = false;
+}
+
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
-	struct svc_xprt *xprt;
-	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
-		lockd_down();
 	nfsd_serv = NULL;
-	nfsd_racache_shutdown();
-	nfs4_state_shutdown();
+	nfsd_shutdown();
 
 	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
 	       "cache\n");
@@ -263,45 +328,18 @@ int nfsd_create_serv(void)
 		    nfsd_max_blksize >= 8*1024*2)
 			nfsd_max_blksize /= 2;
 	}
+	nfsd_reset_versions();
 
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
-		err = -ENOMEM;
-	else
-		set_max_drc();
+		return -ENOMEM;
 
+	set_max_drc();
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
 
-static int nfsd_init_socks(int port)
-{
-	int error;
-	if (!list_empty(&nfsd_serv->sv_permsocks))
-		return 0;
-
-	error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
-					SVC_SOCK_DEFAULTS);
-	if (error < 0)
-		return error;
-
-	error = lockd_up();
-	if (error < 0)
-		return error;
-
-	return 0;
-}
-
 int nfsd_nrpools(void)
 {
 	if (nfsd_serv == NULL)
@@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 	return err;
 }
 
+/*
+ * Adjust the number of threads and return the new number of threads.
+ * This is also the function that starts the server if necessary, if
+ * this is the first time nrservs is nonzero.
+ */
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
+	bool	nfsd_up_before;
 
 	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
@@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs)
 	if (nrservs == 0 && nfsd_serv == NULL)
 		goto out;
 
-	/* Readahead param cache - will no-op if it already exists */
-	error = nfsd_racache_init(2*nrservs);
-	if (error<0)
-		goto out;
-	error = nfs4_state_start();
+	error = nfsd_create_serv();
 	if (error)
 		goto out;
 
-	nfsd_reset_versions();
-
-	error = nfsd_create_serv();
+	nfsd_up_before = nfsd_up;
 
+	error = nfsd_startup(port, nrservs);
 	if (error)
-		goto out;
-	error = nfsd_init_socks(port);
-	if (error)
-		goto failure;
-
+		goto out_destroy;
 	error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
-	if (error == 0)
-		/* We are holding a reference to nfsd_serv which
-		 * we don't want to count in the return value,
-		 * so subtract 1
-		 */
-		error = nfsd_serv->sv_nrthreads - 1;
- failure:
+	if (error)
+		goto out_shutdown;
+	/* We are holding a reference to nfsd_serv which
+	 * we don't want to count in the return value,
+	 * so subtract 1
+	 */
+	error = nfsd_serv->sv_nrthreads - 1;
+out_shutdown:
+	if (error < 0 && !nfsd_up_before)
+		nfsd_shutdown();
+out_destroy:
 	svc_destroy(nfsd_serv);		/* Release server */
- out:
+out:
 	mutex_unlock(&nfsd_mutex);
 	return error;
 }
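
The nfsd_startup()/nfsd_shutdown() pair above consolidates initialization that used to be scattered across nfsd_svc(), nfsd_init_socks() and the write_ports handlers. The shape of the change is the classic guarded bring-up idiom: a single nfsd_up flag makes startup idempotent, and a failure unwinds in reverse order of initialization. A stand-alone sketch of that idiom (the subsystem names are stand-ins, not the kernel APIs):

	#include <stdbool.h>
	#include <stdio.h>

	static bool svc_up;	/* plays the role of nfsd_up */

	static int cache_init(void)      { return 0; }	/* cf. nfsd_racache_init */
	static int locks_up(void)        { return 0; }	/* cf. lockd_up */
	static int state_start(void)     { return -1; }	/* cf. nfs4_state_start; fails here */
	static void locks_down(void)     { puts("locks_down"); }
	static void cache_shutdown(void) { puts("cache_shutdown"); }

	static int svc_startup(void)
	{
		int ret;

		if (svc_up)		/* a second call is a no-op */
			return 0;
		ret = cache_init();
		if (ret)
			return ret;
		ret = locks_up();
		if (ret)
			goto out_cache;
		ret = state_start();
		if (ret)
			goto out_locks;
		svc_up = true;
		return 0;
	out_locks:			/* unwind in reverse order of setup */
		locks_down();
	out_cache:
		cache_shutdown();
		return ret;
	}

	int main(void)
	{
		printf("startup: %d\n", svc_startup());
		return 0;
	}
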
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index fefeae27f25e..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,16 @@ struct nfsd4_cb_sequence {
 	struct nfs4_client	*cbs_clp;
 };
 
+struct nfs4_rpc_args {
+	void				*args_op;
+	struct nfsd4_cb_sequence	args_seq;
+};
+
+struct nfsd4_callback {
+	struct nfs4_rpc_args cb_args;
+	struct work_struct cb_work;
+};
+
 struct nfs4_delegation {
 	struct list_head	dl_perfile;
 	struct list_head	dl_perclnt;
@@ -78,7 +88,6 @@ struct nfs4_delegation {
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
 	struct file_lock	*dl_flock;
-	struct file		*dl_vfs_file;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
@@ -86,6 +95,7 @@ struct nfs4_delegation {
 	stateid_t		dl_stateid;
 	struct knfsd_fh		dl_fh;
 	int			dl_retries;
+	struct nfsd4_callback	dl_recall;
 };
 
 /* client delegation callback info */
@@ -96,9 +106,7 @@ struct nfs4_cb_conn {
 	u32			cb_prog;
 	u32			cb_minorversion;
 	u32			cb_ident;	/* minorversion 0 only */
-	/* RPC client info */
-	atomic_t		cb_set;     /* successful CB_NULL call */
-	struct rpc_clnt *	cb_client;
+	struct svc_xprt		*cb_xprt;	/* minorversion 1 only */
 };
 
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
@@ -157,7 +165,7 @@ struct nfsd4_session {
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
 	u32			se_flags;
-	struct nfs4_client	*se_client;	/* for expire_client */
+	struct nfs4_client	*se_client;
 	struct nfs4_sessionid	se_sessionid;
 	struct nfsd4_channel_attrs se_fchannel;
 	struct nfsd4_channel_attrs se_bchannel;
@@ -212,25 +220,41 @@ struct nfs4_client {
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
-	struct nfs4_cb_conn	cl_cb_conn; /* callback info */
-	atomic_t		cl_count;	/* ref count */
 	u32			cl_firststate;	/* recovery dir creation */
 
+	/* for v4.0 and v4.1 callbacks: */
+	struct nfs4_cb_conn	cl_cb_conn;
+	struct rpc_clnt		*cl_cb_client;
+	atomic_t		cl_cb_set;
+
 	/* for nfs41 */
 	struct list_head	cl_sessions;
 	struct nfsd4_clid_slot	cl_cs_slot;	/* create_session slot */
 	u32			cl_exchange_flags;
 	struct nfs4_sessionid	cl_sessionid;
+	/* number of rpc's in progress over an associated session: */
+	atomic_t		cl_refcount;
 
 	/* for nfs41 callbacks */
 	/* We currently support a single back channel with a single slot */
 	unsigned long		cl_cb_slot_busy;
 	u32			cl_cb_seq_nr;
-	struct svc_xprt		*cl_cb_xprt;	/* 4.1 callback transport */
 	struct rpc_wait_queue	cl_cb_waitq;	/* backchannel callers may */
 						/* wait here for slots */
 };
 
+static inline void
+mark_client_expired(struct nfs4_client *clp)
+{
+	clp->cl_time = 0;
+}
+
+static inline bool
+is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
 /* struct nfs4_client_reset
  * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
  * upon lease reset, or from upcall to state_daemon (to read in state
@@ -317,12 +341,50 @@ struct nfs4_file {
 	struct list_head        fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
+	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
+	struct file *		fi_fds[3];
+	/* One each for O_RDONLY, O_WRONLY: */
+	atomic_t		fi_access[2];
+	/*
+	 * Each open stateid contributes 1 to either fi_readers or
+	 * fi_writers, or both, depending on the open mode. A
+	 * delegation also takes an fi_readers reference. Lock
+	 * stateid's take none.
+	 */
+	atomic_t		fi_readers;
+	atomic_t		fi_writers;
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id
 					     * for stateid_hashtbl hash */
 	bool			fi_had_conflict;
 };
 
+/* XXX: for first cut may fall back on returning file that doesn't work
+ * at all? */
+static inline struct file *find_writeable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_readable_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDONLY])
+		return f->fi_fds[O_RDONLY];
+	return f->fi_fds[O_RDWR];
+}
+
+static inline struct file *find_any_file(struct nfs4_file *f)
+{
+	if (f->fi_fds[O_RDWR])
+		return f->fi_fds[O_RDWR];
+	else if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	else
+		return f->fi_fds[O_RDONLY];
+}
+
 /*
 * nfs4_stateid can either be an open stateid or (eventually) a lock stateid
 *
@@ -348,7 +410,6 @@ struct nfs4_stateid {
 	struct nfs4_stateowner      * st_stateowner;
 	struct nfs4_file            * st_file;
 	stateid_t                     st_stateid;
-	struct file                 * st_vfs_file;
 	unsigned long                 st_access_bmap;
 	unsigned long                 st_deny_bmap;
 	struct nfs4_stateid         * st_openstp;
@@ -377,11 +438,14 @@ extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
 extern int nfs4_in_grace(void);
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
-extern void put_nfs4_client(struct nfs4_client *clp);
 extern void nfs4_free_stateowner(struct kref *kref);
 extern int set_callback_cred(void);
-extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd4_do_callback_rpc(struct work_struct *);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern int nfsd4_create_callback_queue(void);
+extern void nfsd4_destroy_callback_queue(void);
+extern void nfsd4_set_callback_client(struct nfs4_client *, struct rpc_clnt *);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 extern void nfsd4_init_recdir(char *recdir_name);
@@ -392,6 +456,7 @@ extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
 extern void nfsd4_recdir_purge_old(void);
 extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+extern void release_session_client(struct nfsd4_session *);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
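
The new fi_fds[] array caches at most one struct file per open mode, and the find_*_file() helpers pick the best cached descriptor for an operation: a writer prefers the O_WRONLY file and falls back to O_RDWR, a reader prefers O_RDONLY. The direct indexing works because O_RDONLY, O_WRONLY and O_RDWR are 0, 1 and 2 on Linux. A user-space model of the lookup rules (struct file is just an opaque token here; an illustration, not the kernel code):

	#include <fcntl.h>
	#include <stdio.h>

	struct file;				/* opaque in this sketch */

	struct nfs4_file_model {
		struct file *fi_fds[3];		/* indexed by O_RDONLY/O_WRONLY/O_RDWR */
	};

	static struct file *writeable(struct nfs4_file_model *f)
	{
		return f->fi_fds[O_WRONLY] ? f->fi_fds[O_WRONLY] : f->fi_fds[O_RDWR];
	}

	static struct file *readable(struct nfs4_file_model *f)
	{
		return f->fi_fds[O_RDONLY] ? f->fi_fds[O_RDONLY] : f->fi_fds[O_RDWR];
	}

	int main(void)
	{
		struct nfs4_file_model f = { { 0 } };

		f.fi_fds[O_RDWR] = (struct file *)0x1;	/* only an O_RDWR open cached */
		printf("writer -> %p, reader -> %p\n",
		       (void *)writeable(&f), (void *)readable(&f));
		return 0;
	}
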
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6dd5f1970e01..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
-		if (EX_ISSYNC(fhp->fh_export))
-			write_inode_now(inode, 1);
+		commit_metadata(fhp);
 out:
 	return err;
 
@@ -605,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#endif /* defined(CONFIG_NFS_V4) */
+#endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
 /*
@@ -724,7 +723,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	struct inode	*inode;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
-	int		host_err;
+	int		host_err = 0;
 
 	validate_process_creds();
 
@@ -761,7 +760,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
+	if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
+		host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
 	if (host_err == -EWOULDBLOCK)
 		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
@@ -903,7 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	      loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
-	struct raparms	*ra;
 	mm_segment_t	oldfs;
 	__be32		err;
 	int		host_err;
@@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
 		goto out;
 
-	/* Get readahead parameters */
-	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
-	if (ra && ra->p_set)
-		file->f_ra = ra->p_ra;
-
 	if (file->f_op->splice_read && rqstp->rq_splice_ok) {
 		struct splice_desc sd = {
 			.len		= 0,
@@ -937,21 +930,11 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		set_fs(oldfs);
 	}
 
-	/* Write back readahead params */
-	if (ra) {
-		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-		spin_lock(&rab->pb_lock);
-		ra->p_ra = file->f_ra;
-		ra->p_set = 1;
-		ra->p_count--;
-		spin_unlock(&rab->pb_lock);
-	}
-
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		err = 0;
-		fsnotify_access(file->f_path.dentry);
+		fsnotify_access(file);
 	} else
 		err = nfserrno(host_err);
 out:
@@ -998,7 +981,7 @@ static int wait_for_concurrent_writes(struct file *file)
 
 	if (inode->i_state & I_DIRTY) {
 		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = vfs_fsync(file, file->f_path.dentry, 0);
+		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
@@ -1062,7 +1045,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		goto out_nfserr;
 	*cnt = host_err;
 	nfsdstats.io_write += host_err;
-	fsnotify_modify(file->f_path.dentry);
+	fsnotify_modify(file);
 
 	/* clear setuid/setgid flag after write */
 	if (inode->i_mode & (S_ISUID | S_ISGID))
@@ -1086,8 +1069,45 @@ out:
 * on entry. On return, *count contains the number of bytes actually read.
 * N.B. After this call fhp needs an fh_put
 */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+	struct file *file;
+	struct inode *inode;
+	struct raparms	*ra;
+	__be32 err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+
+	inode = file->f_path.dentry->d_inode;
+
+	/* Get readahead parameters */
+	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+
+	if (ra && ra->p_set)
+		file->f_ra = ra->p_ra;
+
+	err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+
+	/* Write back readahead params */
+	if (ra) {
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
+		ra->p_ra = file->f_ra;
+		ra->p_set = 1;
+		ra->p_count--;
+		spin_unlock(&rab->pb_lock);
+	}
+
+	nfsd_close(file);
+	return err;
+}
+
+/* As above, but use the provided file descriptor. */
 __be32
-nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
+nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
@@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-		if (err)
-			goto out;
-		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
-		nfsd_close(file);
-	}
+	} else /* Note file may still be NULL in NFSv4 special stateid case: */
+		err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
 out:
 	return err;
 }
@@ -1169,12 +1184,12 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out;
 	}
 
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	err = nfsd_open(rqstp, fhp, S_IFREG,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, file->f_path.dentry,
-					   offset, end, 0);
+		int err2 = vfs_fsync_range(file, offset, end, 0);
 
 		if (err2 != -EINVAL)
 			err = nfserrno(err2);
@@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 				char *name, int len, struct svc_fh *tfhp)
 {
 	struct dentry	*ddir, *dnew, *dold;
-	struct inode	*dirp, *dest;
+	struct inode	*dirp;
 	__be32		err;
 	int		host_err;
 
@@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		goto out_nfserr;
 
 	dold = tfhp->fh_dentry;
-	dest = dold->d_inode;
 
 	host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
 	if (host_err) {
@@ -2019,9 +2033,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(fhp->fh_dentry,stat))
-		err = nfserr_io;
+	__be32 err;
+
+	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
 	return err;
 }
 
@@ -2038,7 +2060,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		    struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
-	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2132,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
-	if (err)
-		goto nfsd_out;
 
-	/* Do integrity (permission) checking now, but defer incrementing
-	 * IMA counts to the actual file open.
-	 */
-	path.mnt = exp->ex_path.mnt;
-	path.dentry = dentry;
-nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
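
Every vfs_statfs() call site in this patch follows the v2.6.36 VFS convention of passing a struct path (vfsmount plus dentry) rather than a bare dentry; nfsd builds the path from the export's ex_path.mnt and the filehandle's dentry, as the nfsd_statfs() hunk above shows. The pattern, reduced to a hypothetical kernel-side helper for illustration:

	#include <linux/fs.h>
	#include <linux/path.h>
	#include <linux/statfs.h>

	/* example_statfs() is illustrative only, not part of the patch */
	static int example_statfs(struct vfsmount *mnt, struct dentry *dentry,
				  struct kstatfs *st)
	{
		struct path path = {
			.mnt	= mnt,
			.dentry	= dentry,
		};

		return vfs_statfs(&path, st);	/* was: vfs_statfs(dentry, st) */
	}
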
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 4b1de0a9ea75..9a370a5e36b7 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -20,6 +20,7 @@
 #define NFSD_MAY_OWNER_OVERRIDE	64
 #define NFSD_MAY_LOCAL_ACCESS	128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+#define NFSD_MAY_NOT_BREAK_LEASE 512
 
 #define NFSD_MAY_CREATE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE		(NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -63,7 +64,9 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, int,
 				int, struct file **);
 void		nfsd_close(struct file *);
-__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+__be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
+				loff_t, struct kvec *, int, unsigned long *);
+__be32 		nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 				loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index efa337739534..4d476ff08ae6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -381,6 +381,10 @@ struct nfsd4_destroy_session {
 	struct nfs4_sessionid	sessionid;
 };
 
+struct nfsd4_reclaim_complete {
+	u32 rca_one_fs;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -421,6 +425,7 @@ struct nfsd4_op {
 		struct nfsd4_create_session	create_session;
 		struct nfsd4_destroy_session	destroy_session;
 		struct nfsd4_sequence		sequence;
+		struct nfsd4_reclaim_complete	reclaim_complete;
 	} u;
 	struct nfs4_replay *			replay;
 };
@@ -513,9 +518,8 @@ extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
 extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 		struct nfsd4_sequence *seq);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
-		struct nfsd4_compound_state *,
-struct nfsd4_exchange_id *);
- extern __be32 nfsd4_create_session(struct svc_rqst *,
+		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_create_session *);
 extern __be32 nfsd4_sequence(struct svc_rqst *,
@@ -524,6 +528,7 @@ extern __be32 nfsd4_sequence(struct svc_rqst *,
 extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_destroy_session *);
+__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
 extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
 		struct nfsd4_open *open);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7cfb87e692da..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -31,6 +31,11 @@
31#include "alloc.h" 31#include "alloc.h"
32 32
33 33
34/**
35 * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
36 * descriptor block can maintain
37 * @inode: inode of metadata file using this allocator
38 */
34static inline unsigned long 39static inline unsigned long
35nilfs_palloc_groups_per_desc_block(const struct inode *inode) 40nilfs_palloc_groups_per_desc_block(const struct inode *inode)
36{ 41{
@@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
38 sizeof(struct nilfs_palloc_group_desc); 43 sizeof(struct nilfs_palloc_group_desc);
39} 44}
40 45
46/**
47 * nilfs_palloc_groups_count - get maximum number of groups
48 * @inode: inode of metadata file using this allocator
49 */
41static inline unsigned long 50static inline unsigned long
42nilfs_palloc_groups_count(const struct inode *inode) 51nilfs_palloc_groups_count(const struct inode *inode)
43{ 52{
44 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); 53 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
45} 54}
46 55
56/**
57 * nilfs_palloc_init_blockgroup - initialize private variables for allocator
58 * @inode: inode of metadata file using this allocator
59 * @entry_size: size of the persistent object
60 */
47int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) 61int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
48{ 62{
49 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 63 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
69 return 0; 83 return 0;
70} 84}
71 85
86/**
87 * nilfs_palloc_group - get group number and offset from an entry number
88 * @inode: inode of metadata file using this allocator
89 * @nr: serial number of the entry (e.g. inode number)
90 * @offset: pointer to store offset number in the group
91 */
72static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, 92static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
73 unsigned long *offset) 93 unsigned long *offset)
74{ 94{
@@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
78 return group; 98 return group;
79} 99}
80 100
101/**
102 * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
103 * @inode: inode of metadata file using this allocator
104 * @group: group number
105 *
106 * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
107 * block which contains a descriptor of the specified group.
108 */
81static unsigned long 109static unsigned long
82nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) 110nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
83{ 111{
@@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
86 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; 114 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
87} 115}
88 116
117/**
118 * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
119 * @inode: inode of metadata file using this allocator
120 * @group: group number
121 *
122 * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
123 * block used to allocate/deallocate entries in the specified group.
124 */
89static unsigned long 125static unsigned long
90nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) 126nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
91{ 127{
@@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
95 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; 131 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
96} 132}
97 133
134/**
135 * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
136 * @inode: inode of metadata file using this allocator
137 * @group: group number
138 * @desc: pointer to descriptor structure for the group
139 */
98static unsigned long 140static unsigned long
99nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, 141nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
100 const struct nilfs_palloc_group_desc *desc) 142 const struct nilfs_palloc_group_desc *desc)
@@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
107 return nfree; 149 return nfree;
108} 150}
109 151
152/**
153 * nilfs_palloc_group_desc_add_entries - adjust count of free entries
154 * @inode: inode of metadata file using this allocator
155 * @group: group number
156 * @desc: pointer to descriptor structure for the group
157 * @n: delta to be added
158 */
110static void 159static void
111nilfs_palloc_group_desc_add_entries(struct inode *inode, 160nilfs_palloc_group_desc_add_entries(struct inode *inode,
112 unsigned long group, 161 unsigned long group,
@@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
118 spin_unlock(nilfs_mdt_bgl_lock(inode, group)); 167 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
119} 168}
120 169
170/**
171 * nilfs_palloc_entry_blkoff - get block offset of an entry block
172 * @inode: inode of metadata file using this allocator
173 * @nr: serial number of the entry (e.g. inode number)
174 */
121static unsigned long 175static unsigned long
122nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) 176nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
123{ 177{
@@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
129 group_offset / NILFS_MDT(inode)->mi_entries_per_block; 183 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
130} 184}
131 185
186/**
187 * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
188 * @inode: inode of metadata file
189 * @bh: buffer head of the buffer to be initialized
190 * @kaddr: kernel address mapped for the page including the buffer
191 */
132static void nilfs_palloc_desc_block_init(struct inode *inode, 192static void nilfs_palloc_desc_block_init(struct inode *inode,
133 struct buffer_head *bh, void *kaddr) 193 struct buffer_head *bh, void *kaddr)
134{ 194{
@@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
179 return ret; 239 return ret;
180} 240}
181 241
242/**
243 * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
244 * @inode: inode of metadata file using this allocator
245 * @group: group number
246 * @create: create flag
247 * @bhp: pointer to store the resultant buffer head
248 */
182static int nilfs_palloc_get_desc_block(struct inode *inode, 249static int nilfs_palloc_get_desc_block(struct inode *inode,
183 unsigned long group, 250 unsigned long group,
184 int create, struct buffer_head **bhp) 251 int create, struct buffer_head **bhp)
@@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
191 bhp, &cache->prev_desc, &cache->lock); 258 bhp, &cache->prev_desc, &cache->lock);
192} 259}
193 260
261/**
262 * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
263 * @inode: inode of metadata file using this allocator
264 * @group: group number
265 * @create: create flag
266 * @bhp: pointer to store the resultant buffer head
267 */
194static int nilfs_palloc_get_bitmap_block(struct inode *inode, 268static int nilfs_palloc_get_bitmap_block(struct inode *inode,
195 unsigned long group, 269 unsigned long group,
196 int create, struct buffer_head **bhp) 270 int create, struct buffer_head **bhp)
@@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
203 &cache->prev_bitmap, &cache->lock); 277 &cache->prev_bitmap, &cache->lock);
204} 278}
205 279
280/**
281 * nilfs_palloc_get_entry_block - get buffer head of an entry block
282 * @inode: inode of metadata file using this allocator
283 * @nr: serial number of the entry (e.g. inode number)
284 * @create: create flag
285 * @bhp: pointer to store the resultant buffer head
286 */
206int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, 287int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
207 int create, struct buffer_head **bhp) 288 int create, struct buffer_head **bhp)
208{ 289{
@@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
214 &cache->prev_entry, &cache->lock); 295 &cache->prev_entry, &cache->lock);
215} 296}
216 297
298/**
299 * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
300 * @inode: inode of metadata file using this allocator
301 * @group: group number
302 * @bh: buffer head of the buffer storing the group descriptor block
303 * @kaddr: kernel address mapped for the page including the buffer
304 */
217static struct nilfs_palloc_group_desc * 305static struct nilfs_palloc_group_desc *
218nilfs_palloc_block_get_group_desc(const struct inode *inode, 306nilfs_palloc_block_get_group_desc(const struct inode *inode,
219 unsigned long group, 307 unsigned long group,
@@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
223 group % nilfs_palloc_groups_per_desc_block(inode); 311 group % nilfs_palloc_groups_per_desc_block(inode);
224} 312}
225 313
314/**
315 * nilfs_palloc_block_get_entry - get kernel address of an entry
316 * @inode: inode of metadata file using this allocator
317 * @nr: serial number of the entry (e.g. inode number)
318 * @bh: buffer head of the buffer storing the entry block
319 * @kaddr: kernel address mapped for the page including the buffer
320 */
226void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, 321void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
227 const struct buffer_head *bh, void *kaddr) 322 const struct buffer_head *bh, void *kaddr)
228{ 323{
@@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
235 entry_offset * NILFS_MDT(inode)->mi_entry_size; 330 entry_offset * NILFS_MDT(inode)->mi_entry_size;
236} 331}
237 332
333/**
334 * nilfs_palloc_find_available_slot - find available slot in a group
335 * @inode: inode of metadata file using this allocator
336 * @group: group number
337 * @target: offset number of an entry in the group (start point)
338 * @bitmap: bitmap of the group
339 * @bsize: size in bits
340 */
238static int nilfs_palloc_find_available_slot(struct inode *inode, 341static int nilfs_palloc_find_available_slot(struct inode *inode,
239 unsigned long group, 342 unsigned long group,
240 unsigned long target, 343 unsigned long target,
241 unsigned char *bitmap, 344 unsigned char *bitmap,
242 int bsize) /* size in bits */ 345 int bsize)
243{ 346{
244 int curr, pos, end, i; 347 int curr, pos, end, i;
245 348
@@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
277 return -ENOSPC; 380 return -ENOSPC;
278} 381}
279 382
383/**
384 * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
385 * in a group descriptor block
386 * @inode: inode of metadata file using this allocator
387 * @curr: current group number
388 * @max: maximum number of groups
389 */
280static unsigned long 390static unsigned long
281nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, 391nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
282 unsigned long curr, unsigned long max) 392 unsigned long curr, unsigned long max)
@@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
287 max - curr + 1); 397 max - curr + 1);
288} 398}
289 399
400/**
401 * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
402 * @inode: inode of metadata file using this allocator
403 * @req: nilfs_palloc_req structure exchanged for the allocation
404 */
290int nilfs_palloc_prepare_alloc_entry(struct inode *inode, 405int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
291 struct nilfs_palloc_req *req) 406 struct nilfs_palloc_req *req)
292{ 407{
@@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
366 return ret; 481 return ret;
367} 482}
368 483
484/**
485 * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
486 * @inode: inode of metadata file using this allocator
487 * @req: nilfs_palloc_req structure exchanged for the allocation
488 */
369void nilfs_palloc_commit_alloc_entry(struct inode *inode, 489void nilfs_palloc_commit_alloc_entry(struct inode *inode,
370 struct nilfs_palloc_req *req) 490 struct nilfs_palloc_req *req)
371{ 491{
@@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
377 brelse(req->pr_desc_bh); 497 brelse(req->pr_desc_bh);
378} 498}
379 499
500/**
501 * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
502 * @inode: inode of metadata file using this allocator
503 * @req: nilfs_palloc_req structure exchanged for the removal
504 */
380void nilfs_palloc_commit_free_entry(struct inode *inode, 505void nilfs_palloc_commit_free_entry(struct inode *inode,
381 struct nilfs_palloc_req *req) 506 struct nilfs_palloc_req *req)
382{ 507{
@@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
410 brelse(req->pr_desc_bh); 535 brelse(req->pr_desc_bh);
411} 536}
412 537
538/**
539 * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
540 * @inode: inode of metadata file using this allocator
541 * @req: nilfs_palloc_req structure exchanged for the allocation
542 */
413void nilfs_palloc_abort_alloc_entry(struct inode *inode, 543void nilfs_palloc_abort_alloc_entry(struct inode *inode,
414 struct nilfs_palloc_req *req) 544 struct nilfs_palloc_req *req)
415{ 545{
@@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
442 req->pr_desc_bh = NULL; 572 req->pr_desc_bh = NULL;
443} 573}
444 574
575/**
576 * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
577 * @inode: inode of metadata file using this allocator
578 * @req: nilfs_palloc_req structure exchanged for the removal
579 */
445int nilfs_palloc_prepare_free_entry(struct inode *inode, 580int nilfs_palloc_prepare_free_entry(struct inode *inode,
446 struct nilfs_palloc_req *req) 581 struct nilfs_palloc_req *req)
447{ 582{
@@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
464 return 0; 599 return 0;
465} 600}
466 601
602/**
603 * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
604 * @inode: inode of metadata file using this allocator
605 * @req: nilfs_palloc_req structure exchanged for the removal
606 */
467void nilfs_palloc_abort_free_entry(struct inode *inode, 607void nilfs_palloc_abort_free_entry(struct inode *inode,
468 struct nilfs_palloc_req *req) 608 struct nilfs_palloc_req *req)
469{ 609{
@@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
475 req->pr_desc_bh = NULL; 615 req->pr_desc_bh = NULL;
476} 616}
477 617
618/**
 619 * nilfs_palloc_group_is_in - test whether an entry is in a group
620 * @inode: inode of metadata file using this allocator
621 * @group: group number
622 * @nr: serial number of the entry (e.g. inode number)
623 */
478static int 624static int
479nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) 625nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
480{ 626{
@@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
485 return (nr >= first) && (nr <= last); 631 return (nr >= first) && (nr <= last);
486} 632}
487 633
634/**
635 * nilfs_palloc_freev - deallocate a set of persistent objects
636 * @inode: inode of metadata file using this allocator
637 * @entry_nrs: array of entry numbers to be deallocated
638 * @nitems: number of entries stored in @entry_nrs
639 */
488int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) 640int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
489{ 641{
490 struct buffer_head *desc_bh, *bitmap_bh; 642 struct buffer_head *desc_bh, *bitmap_bh;
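
The kernel-doc added through this hunk spells out the allocator's two-phase protocol: a caller prepares an allocation (tentatively reserving a slot found in a group bitmap), then either commits it or aborts it to roll the reservation back, and deallocation mirrors the same shape. A minimal userspace sketch of that prepare/commit/abort pattern, with a byte array standing in for the group bitmap (all names here are illustrative, not nilfs2 API):

#include <stdio.h>

#define GROUP_BITS 64

struct req { int slot; };                 /* mirrors nilfs_palloc_req's role */
static unsigned char bitmap[GROUP_BITS / 8];  /* zero-initialized: all free */

static int prepare_alloc(struct req *r)   /* reserve the first clear bit */
{
	for (int i = 0; i < GROUP_BITS; i++)
		if (!(bitmap[i / 8] & (1 << (i % 8)))) {
			bitmap[i / 8] |= 1 << (i % 8); /* tentatively taken */
			r->slot = i;
			return 0;
		}
	return -1;                        /* -ENOSPC in the kernel */
}

static void commit_alloc(struct req *r) { (void)r; /* nothing left to undo */ }

static void abort_alloc(struct req *r)    /* roll the reservation back */
{
	bitmap[r->slot / 8] &= ~(1 << (r->slot % 8));
}

int main(void)
{
	struct req r;
	if (prepare_alloc(&r) == 0) {
		/* ...finish the transaction; on failure call abort_alloc(&r) */
		commit_alloc(&r);
		printf("allocated slot %d\n", r.slot);
	}
	return 0;
}
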
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
29#include <linux/buffer_head.h> 29#include <linux/buffer_head.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31 31
32/**
33 * nilfs_palloc_entries_per_group - get the number of entries per group
34 * @inode: inode of metadata file using this allocator
35 *
36 * The number of entries per group is defined by the number of bits
37 * that a bitmap block can maintain.
38 */
32static inline unsigned long 39static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode) 40nilfs_palloc_entries_per_group(const struct inode *inode)
34{ 41{
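
As the new comment notes, group capacity falls straight out of the bitmap block: one entry per bit. A quick sanity check of that arithmetic, assuming 8-bit bytes and a 4 KiB block (typical, but not required by the format):

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;              /* assumed block size */
	unsigned long entries = blocksize * 8;       /* one entry per bitmap bit */
	printf("%lu entries per group\n", entries);  /* prints 32768 */
	return 0;
}
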
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c11..3dbdc1d356bf 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h" 28#include "sb.h"
29#include "btree.h"
30#include "direct.h"
29#include "btnode.h" 31#include "btnode.h"
30#include "mdt.h" 32#include "mdt.h"
31#include "dat.h" 33#include "dat.h"
@@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 535
534void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
535{ 537{
536 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); 538 memcpy(gcbmap, bmap, sizeof(*bmap));
537 init_rwsem(&gcbmap->b_sem); 539 init_rwsem(&gcbmap->b_sem);
538 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
539 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
@@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
541 543
542void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
543{ 545{
544 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); 546 memcpy(bmap, gcbmap, sizeof(*bmap));
545 init_rwsem(&bmap->b_sem); 547 init_rwsem(&bmap->b_sem);
546 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
547 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
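
Both hunks keep the same discipline: the bmap is copied wholesale with memcpy() and the embedded rw-semaphore is then re-initialized, because lock state must never survive a raw byte copy; the sizeof(*bmap) form also stays correct now that the containing union is gone. The same copy-then-reinit idea in portable C with pthreads (toy struct, illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct bmap_like {                   /* toy stand-in, not the kernel struct */
	long data[4];
	pthread_rwlock_t lock;       /* must not be memcpy'd in a usable state */
};

int main(void)
{
	struct bmap_like src, dst;

	memset(&src, 0, sizeof(src));
	pthread_rwlock_init(&src.lock, NULL);

	memcpy(&dst, &src, sizeof(dst));      /* copies the lock bytes too... */
	pthread_rwlock_init(&dst.lock, NULL); /* ...so give dst a fresh lock */

	puts("copied and re-initialized");
	return 0;
}
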
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab91..a20569b19929 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
32 32
33#define NILFS_BMAP_INVALID_PTR 0 33#define NILFS_BMAP_INVALID_PTR 0
34 34
35#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
36#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
37#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
38#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
39
40#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) 35#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
41 36
42 37
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
71 int (*bop_delete)(struct nilfs_bmap *, __u64); 66 int (*bop_delete)(struct nilfs_bmap *, __u64);
72 void (*bop_clear)(struct nilfs_bmap *); 67 void (*bop_clear)(struct nilfs_bmap *);
73 68
74 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); 69 int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
75 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, 70 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
76 struct list_head *); 71 struct list_head *);
77 72
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
110 * @b_last_allocated_ptr: last allocated ptr for data block 105 * @b_last_allocated_ptr: last allocated ptr for data block
111 * @b_ptr_type: pointer type 106 * @b_ptr_type: pointer type
112 * @b_state: state 107 * @b_state: state
108 * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
113 */ 109 */
114struct nilfs_bmap { 110struct nilfs_bmap {
115 union { 111 union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
123 __u64 b_last_allocated_ptr; 119 __u64 b_last_allocated_ptr;
124 int b_ptr_type; 120 int b_ptr_type;
125 int b_state; 121 int b_state;
122 __u16 b_nchildren_per_block;
126}; 123};
127 124
128/* pointer type */ 125/* pointer type */
@@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
224 nilfs_dat_abort_end(dat, &req->bpr_req); 221 nilfs_dat_abort_end(dat, &req->bpr_req);
225} 222}
226 223
224static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
225 __u64 ptr)
226{
227 bmap->b_last_allocated_key = key;
228 bmap->b_last_allocated_ptr = ptr;
229}
230
227__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, 231__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
228 const struct buffer_head *); 232 const struct buffer_head *);
229 233
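
The new b_nchildren_per_block field caches a capacity that earlier code recomputed from the node size on every access. With the usual nilfs2 node layout, each child costs one 64-bit key plus one 64-bit pointer after the node header, so the value is a pure function of the block size; a hedged sketch of that computation (the header layout below is assumed, not taken from this patch):

#include <stdint.h>
#include <stdio.h>

/* assumed on-disk header; the real struct nilfs_btree_node may differ */
struct node_header { uint8_t flags, level; uint16_t nchildren; uint32_t pad; };

static int nchildren_per_block(size_t blocksize)
{
	size_t payload = blocksize - sizeof(struct node_header);
	return (int)(payload / (sizeof(uint64_t) + sizeof(uint64_t)));
}

int main(void)
{
	printf("4 KiB node holds up to %d children\n", nchildren_per_block(4096));
	return 0;
}
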
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47b..000000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * nilfs_bmap_union -
32 * @bi_bmap: bmap structure
33 * @bi_btree: direct map structure
34 * @bi_direct: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
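
Dropping bmap_union.h moves the code from a union of variants to the common C embedding idiom: each variant carries struct nilfs_bmap as its first member, so a variant pointer is usable wherever a bmap pointer is expected. In miniature (names invented for illustration):

#include <stdio.h>

struct bmap  { int b_state; };
struct btree { struct bmap base; int bt_extra; };  /* base must come first */

static void bmap_op(struct bmap *b) { b->b_state = 1; }

int main(void)
{
	struct btree t = { { 0 }, 42 };
	bmap_op(&t.base);            /* the variant is usable through its base */
	printf("state=%d extra=%d\n", t.base.b_state, t.bt_extra);
	return 0;
}
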
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a3306..f78ab1044d1d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
96} 96}
97 97
98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, 98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
99 sector_t pblocknr, struct buffer_head **pbh) 99 sector_t pblocknr, int mode,
100 struct buffer_head **pbh, sector_t *submit_ptr)
100{ 101{
101 struct buffer_head *bh; 102 struct buffer_head *bh;
102 struct inode *inode = NILFS_BTNC_I(btnc); 103 struct inode *inode = NILFS_BTNC_I(btnc);
104 struct page *page;
103 int err; 105 int err;
104 106
105 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); 107 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
107 return -ENOMEM; 109 return -ENOMEM;
108 110
109 err = -EEXIST; /* internal code */ 111 err = -EEXIST; /* internal code */
112 page = bh->b_page;
110 113
111 if (buffer_uptodate(bh) || buffer_dirty(bh)) 114 if (buffer_uptodate(bh) || buffer_dirty(bh))
112 goto found; 115 goto found;
@@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
125 } 128 }
126 } 129 }
127 } 130 }
128 lock_buffer(bh); 131
132 if (mode == READA) {
133 if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
134 err = -EBUSY; /* internal code */
135 brelse(bh);
136 goto out_locked;
137 }
138 } else { /* mode == READ */
139 lock_buffer(bh);
140 }
129 if (buffer_uptodate(bh)) { 141 if (buffer_uptodate(bh)) {
130 unlock_buffer(bh); 142 unlock_buffer(bh);
131 err = -EEXIST; /* internal code */ 143 err = -EEXIST; /* internal code */
@@ -136,15 +148,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
136 bh->b_blocknr = pblocknr; /* set block address for read */ 148 bh->b_blocknr = pblocknr; /* set block address for read */
137 bh->b_end_io = end_buffer_read_sync; 149 bh->b_end_io = end_buffer_read_sync;
138 get_bh(bh); 150 get_bh(bh);
139 submit_bh(READ, bh); 151 submit_bh(mode, bh);
140 bh->b_blocknr = blocknr; /* set back to the given block address */ 152 bh->b_blocknr = blocknr; /* set back to the given block address */
153 *submit_ptr = pblocknr;
141 err = 0; 154 err = 0;
142found: 155found:
143 *pbh = bh; 156 *pbh = bh;
144 157
145out_locked: 158out_locked:
146 unlock_page(bh->b_page); 159 unlock_page(page);
147 page_cache_release(bh->b_page); 160 page_cache_release(page);
148 return err; 161 return err;
149} 162}
150 163
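
The reworked submit path treats readahead as strictly best-effort: a READA request is abandoned unless the target block is physically contiguous with the previous submission (pblocknr == *submit_ptr + 1) and the buffer lock can be taken without blocking. A small model of that gate in plain C (no kernel types):

#include <stdbool.h>
#include <stdio.h>

/* stand-in for trylock_buffer(); always succeeds in this toy model */
static bool try_lock(void) { return true; }

/* returns true if a readahead for 'pblocknr' should be submitted */
static bool may_read_ahead(unsigned long pblocknr, unsigned long *submit_ptr)
{
	if (pblocknr != *submit_ptr + 1 || !try_lock())
		return false;          /* non-contiguous or contended: skip */
	*submit_ptr = pblocknr;        /* remember the last submitted block */
	return true;
}

int main(void)
{
	unsigned long last = 99;
	printf("%d\n", may_read_ahead(100, &last)); /* 1: contiguous, submit */
	printf("%d\n", may_read_ahead(205, &last)); /* 0: gap, skipped */
	return 0;
}
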
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f07712..79037494f1e0 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 42void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
44 __u64 blocknr); 44 __u64 blocknr);
45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, 45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
46 struct buffer_head **); 46 struct buffer_head **, sector_t *);
47void nilfs_btnode_delete(struct buffer_head *); 47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *, 48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *); 49 struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 76c38e3e19d2..300c2bc00c3f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
31#include "alloc.h" 31#include "alloc.h"
32#include "dat.h" 32#include "dat.h"
33 33
34/** 34static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
35 * struct nilfs_btree_path - A path on which B-tree operations are executed
36 * @bp_bh: buffer head of node block
37 * @bp_sib_bh: buffer head of sibling node block
38 * @bp_index: index of child node
39 * @bp_oldreq: ptr end request for old ptr
40 * @bp_newreq: ptr alloc request for new ptr
41 * @bp_op: rebalance operation
42 */
43struct nilfs_btree_path {
44 struct buffer_head *bp_bh;
45 struct buffer_head *bp_sib_bh;
46 int bp_index;
47 union nilfs_bmap_ptr_req bp_oldreq;
48 union nilfs_bmap_ptr_req bp_newreq;
49 struct nilfs_btnode_chkey_ctxt bp_ctxt;
50 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
51 int, __u64 *, __u64 *);
52};
53
54/*
55 * B-tree path operations
56 */
57
58static struct kmem_cache *nilfs_btree_path_cache;
59
60int __init nilfs_btree_path_cache_init(void)
61{
62 nilfs_btree_path_cache =
63 kmem_cache_create("nilfs2_btree_path_cache",
64 sizeof(struct nilfs_btree_path) *
65 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
66 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
67}
68
69void nilfs_btree_path_cache_destroy(void)
70{
71 kmem_cache_destroy(nilfs_btree_path_cache);
72}
73
74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75{
76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
77}
78
79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
80{ 35{
81 kmem_cache_free(nilfs_btree_path_cache, path); 36 struct nilfs_btree_path *path;
82} 37 int level = NILFS_BTREE_LEVEL_DATA;
83 38
84static void nilfs_btree_init_path(struct nilfs_btree_path *path) 39 path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
85{ 40 if (path == NULL)
86 int level; 41 goto out;
87 42
88 for (level = NILFS_BTREE_LEVEL_DATA; 43 for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
89 level < NILFS_BTREE_LEVEL_MAX;
90 level++) {
91 path[level].bp_bh = NULL; 44 path[level].bp_bh = NULL;
92 path[level].bp_sib_bh = NULL; 45 path[level].bp_sib_bh = NULL;
93 path[level].bp_index = 0; 46 path[level].bp_index = 0;
@@ -95,44 +48,28 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
95 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; 48 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
96 path[level].bp_op = NULL; 49 path[level].bp_op = NULL;
97 } 50 }
51
52out:
53 return path;
98} 54}
99 55
100static void nilfs_btree_release_path(struct nilfs_btree_path *path) 56static void nilfs_btree_free_path(struct nilfs_btree_path *path)
101{ 57{
102 int level; 58 int level = NILFS_BTREE_LEVEL_DATA;
103 59
104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; 60 for (; level < NILFS_BTREE_LEVEL_MAX; level++)
105 level++)
106 brelse(path[level].bp_bh); 61 brelse(path[level].bp_bh);
62
63 kmem_cache_free(nilfs_btree_path_cache, path);
107} 64}
108 65
109/* 66/*
110 * B-tree node operations 67 * B-tree node operations
111 */ 68 */
112static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, 69static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
113 struct buffer_head **bhp)
114{
115 struct address_space *btnc =
116 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
117 int err;
118
119 err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
120 if (err)
121 return err == -EEXIST ? 0 : err;
122
123 wait_on_buffer(*bhp);
124 if (!buffer_uptodate(*bhp)) {
125 brelse(*bhp);
126 return -EIO;
127 }
128 return 0;
129}
130
131static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
132 __u64 ptr, struct buffer_head **bhp) 70 __u64 ptr, struct buffer_head **bhp)
133{ 71{
134 struct address_space *btnc = 72 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
135 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
136 struct buffer_head *bh; 73 struct buffer_head *bh;
137 74
138 bh = nilfs_btnode_create_block(btnc, ptr); 75 bh = nilfs_btnode_create_block(btnc, ptr);
@@ -144,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
144 return 0; 81 return 0;
145} 82}
146 83
147static inline int 84static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
148nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
149{ 85{
150 return node->bn_flags; 86 return node->bn_flags;
151} 87}
152 88
153static inline void 89static void
154nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) 90nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
155{ 91{
156 node->bn_flags = flags; 92 node->bn_flags = flags;
157} 93}
158 94
159static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) 95static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
160{ 96{
161 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; 97 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
162} 98}
163 99
164static inline int 100static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
165nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
166{ 101{
167 return node->bn_level; 102 return node->bn_level;
168} 103}
169 104
170static inline void 105static void
171nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) 106nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
172{ 107{
173 node->bn_level = level; 108 node->bn_level = level;
174} 109}
175 110
176static inline int 111static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
177nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
178{ 112{
179 return le16_to_cpu(node->bn_nchildren); 113 return le16_to_cpu(node->bn_nchildren);
180} 114}
181 115
182static inline void 116static void
183nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) 117nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
184{ 118{
185 node->bn_nchildren = cpu_to_le16(nchildren); 119 node->bn_nchildren = cpu_to_le16(nchildren);
186} 120}
187 121
188static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) 122static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
189{
190 return 1 << btree->bt_bmap.b_inode->i_blkbits;
191}
192
193static inline int
194nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
195 const struct nilfs_btree *btree)
196{ 123{
197 return nilfs_btree_node_root(node) ? 124 return 1 << btree->b_inode->i_blkbits;
198 NILFS_BTREE_ROOT_NCHILDREN_MIN :
199 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
200} 125}
201 126
202static inline int 127static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
203nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
204 const struct nilfs_btree *btree)
205{ 128{
206 return nilfs_btree_node_root(node) ? 129 return btree->b_nchildren_per_block;
207 NILFS_BTREE_ROOT_NCHILDREN_MAX :
208 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
209} 130}
210 131
211static inline __le64 * 132static __le64 *
212nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 133nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
213{ 134{
214 return (__le64 *)((char *)(node + 1) + 135 return (__le64 *)((char *)(node + 1) +
@@ -216,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
216 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 137 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
217} 138}
218 139
219static inline __le64 * 140static __le64 *
220nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, 141nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
221 const struct nilfs_btree *btree)
222{ 142{
223 return (__le64 *)(nilfs_btree_node_dkeys(node) + 143 return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
224 nilfs_btree_node_nchildren_max(node, btree));
225} 144}
226 145
227static inline __u64 146static __u64
228nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) 147nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
229{ 148{
230 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); 149 return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
231} 150}
232 151
233static inline void 152static void
234nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) 153nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
235{ 154{
236 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); 155 *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
237} 156}
238 157
239static inline __u64 158static __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 159nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
241 const struct nilfs_btree_node *node, int index) 160 int ncmax)
242{ 161{
243 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + 162 return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
244 index));
245} 163}
246 164
247static inline void 165static void
248nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 166nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
249 struct nilfs_btree_node *node, int index, __u64 ptr) 167 int ncmax)
250{ 168{
251 *(nilfs_btree_node_dptrs(node, btree) + index) = 169 *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
252 nilfs_bmap_ptr_to_dptr(ptr);
253} 170}
254 171
255static void nilfs_btree_node_init(struct nilfs_btree *btree, 172static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
256 struct nilfs_btree_node *node, 173 int level, int nchildren, int ncmax,
257 int flags, int level, int nchildren,
258 const __u64 *keys, const __u64 *ptrs) 174 const __u64 *keys, const __u64 *ptrs)
259{ 175{
260 __le64 *dkeys; 176 __le64 *dkeys;
@@ -266,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
266 nilfs_btree_node_set_nchildren(node, nchildren); 182 nilfs_btree_node_set_nchildren(node, nchildren);
267 183
268 dkeys = nilfs_btree_node_dkeys(node); 184 dkeys = nilfs_btree_node_dkeys(node);
269 dptrs = nilfs_btree_node_dptrs(node, btree); 185 dptrs = nilfs_btree_node_dptrs(node, ncmax);
270 for (i = 0; i < nchildren; i++) { 186 for (i = 0; i < nchildren; i++) {
271 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 187 dkeys[i] = cpu_to_le64(keys[i]);
272 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 188 dptrs[i] = cpu_to_le64(ptrs[i]);
273 } 189 }
274} 190}
275 191
276/* Assume the buffer heads corresponding to left and right are locked. */ 192/* Assume the buffer heads corresponding to left and right are locked. */
277static void nilfs_btree_node_move_left(struct nilfs_btree *btree, 193static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
278 struct nilfs_btree_node *left,
279 struct nilfs_btree_node *right, 194 struct nilfs_btree_node *right,
280 int n) 195 int n, int lncmax, int rncmax)
281{ 196{
282 __le64 *ldkeys, *rdkeys; 197 __le64 *ldkeys, *rdkeys;
283 __le64 *ldptrs, *rdptrs; 198 __le64 *ldptrs, *rdptrs;
284 int lnchildren, rnchildren; 199 int lnchildren, rnchildren;
285 200
286 ldkeys = nilfs_btree_node_dkeys(left); 201 ldkeys = nilfs_btree_node_dkeys(left);
287 ldptrs = nilfs_btree_node_dptrs(left, btree); 202 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
288 lnchildren = nilfs_btree_node_get_nchildren(left); 203 lnchildren = nilfs_btree_node_get_nchildren(left);
289 204
290 rdkeys = nilfs_btree_node_dkeys(right); 205 rdkeys = nilfs_btree_node_dkeys(right);
291 rdptrs = nilfs_btree_node_dptrs(right, btree); 206 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
292 rnchildren = nilfs_btree_node_get_nchildren(right); 207 rnchildren = nilfs_btree_node_get_nchildren(right);
293 208
294 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 209 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
@@ -303,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
303} 218}
304 219
305/* Assume that the buffer heads corresponding to left and right are locked. */ 220/* Assume that the buffer heads corresponding to left and right are locked. */
306static void nilfs_btree_node_move_right(struct nilfs_btree *btree, 221static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
307 struct nilfs_btree_node *left,
308 struct nilfs_btree_node *right, 222 struct nilfs_btree_node *right,
309 int n) 223 int n, int lncmax, int rncmax)
310{ 224{
311 __le64 *ldkeys, *rdkeys; 225 __le64 *ldkeys, *rdkeys;
312 __le64 *ldptrs, *rdptrs; 226 __le64 *ldptrs, *rdptrs;
313 int lnchildren, rnchildren; 227 int lnchildren, rnchildren;
314 228
315 ldkeys = nilfs_btree_node_dkeys(left); 229 ldkeys = nilfs_btree_node_dkeys(left);
316 ldptrs = nilfs_btree_node_dptrs(left, btree); 230 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
317 lnchildren = nilfs_btree_node_get_nchildren(left); 231 lnchildren = nilfs_btree_node_get_nchildren(left);
318 232
319 rdkeys = nilfs_btree_node_dkeys(right); 233 rdkeys = nilfs_btree_node_dkeys(right);
320 rdptrs = nilfs_btree_node_dptrs(right, btree); 234 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
321 rnchildren = nilfs_btree_node_get_nchildren(right); 235 rnchildren = nilfs_btree_node_get_nchildren(right);
322 236
323 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 237 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -332,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
332} 246}
333 247
334/* Assume that the buffer head corresponding to node is locked. */ 248/* Assume that the buffer head corresponding to node is locked. */
335static void nilfs_btree_node_insert(struct nilfs_btree *btree, 249static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
336 struct nilfs_btree_node *node, 250 __u64 key, __u64 ptr, int ncmax)
337 __u64 key, __u64 ptr, int index)
338{ 251{
339 __le64 *dkeys; 252 __le64 *dkeys;
340 __le64 *dptrs; 253 __le64 *dptrs;
341 int nchildren; 254 int nchildren;
342 255
343 dkeys = nilfs_btree_node_dkeys(node); 256 dkeys = nilfs_btree_node_dkeys(node);
344 dptrs = nilfs_btree_node_dptrs(node, btree); 257 dptrs = nilfs_btree_node_dptrs(node, ncmax);
345 nchildren = nilfs_btree_node_get_nchildren(node); 258 nchildren = nilfs_btree_node_get_nchildren(node);
346 if (index < nchildren) { 259 if (index < nchildren) {
347 memmove(dkeys + index + 1, dkeys + index, 260 memmove(dkeys + index + 1, dkeys + index,
@@ -349,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
349 memmove(dptrs + index + 1, dptrs + index, 262 memmove(dptrs + index + 1, dptrs + index,
350 (nchildren - index) * sizeof(*dptrs)); 263 (nchildren - index) * sizeof(*dptrs));
351 } 264 }
352 dkeys[index] = nilfs_bmap_key_to_dkey(key); 265 dkeys[index] = cpu_to_le64(key);
353 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 266 dptrs[index] = cpu_to_le64(ptr);
354 nchildren++; 267 nchildren++;
355 nilfs_btree_node_set_nchildren(node, nchildren); 268 nilfs_btree_node_set_nchildren(node, nchildren);
356} 269}
357 270
358/* Assume that the buffer head corresponding to node is locked. */ 271/* Assume that the buffer head corresponding to node is locked. */
359static void nilfs_btree_node_delete(struct nilfs_btree *btree, 272static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
360 struct nilfs_btree_node *node, 273 __u64 *keyp, __u64 *ptrp, int ncmax)
361 __u64 *keyp, __u64 *ptrp, int index)
362{ 274{
363 __u64 key; 275 __u64 key;
364 __u64 ptr; 276 __u64 ptr;
@@ -367,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
367 int nchildren; 279 int nchildren;
368 280
369 dkeys = nilfs_btree_node_dkeys(node); 281 dkeys = nilfs_btree_node_dkeys(node);
370 dptrs = nilfs_btree_node_dptrs(node, btree); 282 dptrs = nilfs_btree_node_dptrs(node, ncmax);
371 key = nilfs_bmap_dkey_to_key(dkeys[index]); 283 key = le64_to_cpu(dkeys[index]);
372 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 284 ptr = le64_to_cpu(dptrs[index]);
373 nchildren = nilfs_btree_node_get_nchildren(node); 285 nchildren = nilfs_btree_node_get_nchildren(node);
374 if (keyp != NULL) 286 if (keyp != NULL)
375 *keyp = key; 287 *keyp = key;
@@ -425,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
425 return s == 0; 337 return s == 0;
426} 338}
427 339
428static inline struct nilfs_btree_node * 340/**
429nilfs_btree_get_root(const struct nilfs_btree *btree) 341 * nilfs_btree_node_broken - verify consistency of btree node
342 * @node: btree node block to be examined
343 * @size: node size (in bytes)
344 * @blocknr: block number
345 *
346 * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
347 */
348static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
349 size_t size, sector_t blocknr)
350{
351 int level, flags, nchildren;
352 int ret = 0;
353
354 level = nilfs_btree_node_get_level(node);
355 flags = nilfs_btree_node_get_flags(node);
356 nchildren = nilfs_btree_node_get_nchildren(node);
357
358 if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
359 level >= NILFS_BTREE_LEVEL_MAX ||
360 (flags & NILFS_BTREE_NODE_ROOT) ||
361 nchildren < 0 ||
362 nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
363 printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
364 "level = %d, flags = 0x%x, nchildren = %d\n",
365 (unsigned long long)blocknr, level, flags, nchildren);
366 ret = 1;
367 }
368 return ret;
369}
370
371int nilfs_btree_broken_node_block(struct buffer_head *bh)
430{ 372{
431 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; 373 int ret;
374
375 if (buffer_nilfs_checked(bh))
376 return 0;
377
378 ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
379 bh->b_size, bh->b_blocknr);
380 if (likely(!ret))
381 set_buffer_nilfs_checked(bh);
382 return ret;
432} 383}
433 384
434static inline struct nilfs_btree_node * 385static struct nilfs_btree_node *
386nilfs_btree_get_root(const struct nilfs_bmap *btree)
387{
388 return (struct nilfs_btree_node *)btree->b_u.u_data;
389}
390
391static struct nilfs_btree_node *
435nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) 392nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
436{ 393{
437 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 394 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
438} 395}
439 396
440static inline struct nilfs_btree_node * 397static struct nilfs_btree_node *
441nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) 398nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
442{ 399{
443 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 400 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
444} 401}
445 402
446static inline int nilfs_btree_height(const struct nilfs_btree *btree) 403static int nilfs_btree_height(const struct nilfs_bmap *btree)
447{ 404{
448 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; 405 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
449} 406}
450 407
451static inline struct nilfs_btree_node * 408static struct nilfs_btree_node *
452nilfs_btree_get_node(const struct nilfs_btree *btree, 409nilfs_btree_get_node(const struct nilfs_bmap *btree,
453 const struct nilfs_btree_path *path, 410 const struct nilfs_btree_path *path,
454 int level) 411 int level, int *ncmaxp)
455{ 412{
456 return (level == nilfs_btree_height(btree) - 1) ? 413 struct nilfs_btree_node *node;
457 nilfs_btree_get_root(btree) : 414
458 nilfs_btree_get_nonroot_node(path, level); 415 if (level == nilfs_btree_height(btree) - 1) {
416 node = nilfs_btree_get_root(btree);
417 *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
418 } else {
419 node = nilfs_btree_get_nonroot_node(path, level);
420 *ncmaxp = nilfs_btree_nchildren_per_block(btree);
421 }
422 return node;
459} 423}
460 424
461static inline int 425static int
462nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) 426nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
463{ 427{
464 if (unlikely(nilfs_btree_node_get_level(node) != level)) { 428 if (unlikely(nilfs_btree_node_get_level(node) != level)) {
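
nilfs_btree_broken_node_block() in the previous hunk validates a node block once and memoizes a successful verdict in a buffer flag, so repeated reads of a hot node skip the check while failures are re-examined. The check-once shape, modeled in userspace (the flag is a stand-in for buffer_nilfs_checked()):

#include <stdbool.h>
#include <stdio.h>

struct block {
	bool checked;          /* plays the role of the nilfs_checked bit */
	int level;             /* field the validator inspects */
};

static bool node_broken(const struct block *b) { return b->level < 1; }

static int verify_once(struct block *b)
{
	if (b->checked)
		return 0;              /* already validated earlier */
	if (node_broken(b))
		return -1;             /* do not cache a failure */
	b->checked = true;             /* cache success only */
	return 0;
}

int main(void)
{
	struct block b = { false, 2 };
	printf("%d %d\n", verify_once(&b), verify_once(&b)); /* 0 0 */
	return 0;
}
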
@@ -470,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
470 return 0; 434 return 0;
471} 435}
472 436
473static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 437struct nilfs_btree_readahead_info {
438 struct nilfs_btree_node *node; /* parent node */
 439 int max_ra_blocks; /* max number of blocks to read ahead */
 440 int index; /* current index on the parent node */
 441 int ncmax; /* number of children in the parent node */
442};
443
444static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
445 struct buffer_head **bhp,
446 const struct nilfs_btree_readahead_info *ra)
447{
448 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
449 struct buffer_head *bh, *ra_bh;
450 sector_t submit_ptr = 0;
451 int ret;
452
453 ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
454 if (ret) {
455 if (ret != -EEXIST)
456 return ret;
457 goto out_check;
458 }
459
460 if (ra) {
461 int i, n;
462 __u64 ptr2;
463
464 /* read ahead sibling nodes */
465 for (n = ra->max_ra_blocks, i = ra->index + 1;
466 n > 0 && i < ra->ncmax; n--, i++) {
467 ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
468
469 ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
470 &ra_bh, &submit_ptr);
471 if (likely(!ret || ret == -EEXIST))
472 brelse(ra_bh);
473 else if (ret != -EBUSY)
474 break;
475 if (!buffer_locked(bh))
476 goto out_no_wait;
477 }
478 }
479
480 wait_on_buffer(bh);
481
482 out_no_wait:
483 if (!buffer_uptodate(bh)) {
484 brelse(bh);
485 return -EIO;
486 }
487
488 out_check:
489 if (nilfs_btree_broken_node_block(bh)) {
490 clear_buffer_uptodate(bh);
491 brelse(bh);
492 return -EINVAL;
493 }
494
495 *bhp = bh;
496 return 0;
497}
498
499static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
500 struct buffer_head **bhp)
501{
502 return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
503}
504
505static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
474 struct nilfs_btree_path *path, 506 struct nilfs_btree_path *path,
475 __u64 key, __u64 *ptrp, int minlevel) 507 __u64 key, __u64 *ptrp, int minlevel,
508 int readahead)
476{ 509{
477 struct nilfs_btree_node *node; 510 struct nilfs_btree_node *node;
511 struct nilfs_btree_readahead_info p, *ra;
478 __u64 ptr; 512 __u64 ptr;
479 int level, index, found, ret; 513 int level, index, found, ncmax, ret;
480 514
481 node = nilfs_btree_get_root(btree); 515 node = nilfs_btree_get_root(btree);
482 level = nilfs_btree_node_get_level(node); 516 level = nilfs_btree_node_get_level(node);
@@ -484,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
484 return -ENOENT; 518 return -ENOENT;
485 519
486 found = nilfs_btree_node_lookup(node, key, &index); 520 found = nilfs_btree_node_lookup(node, key, &index);
487 ptr = nilfs_btree_node_get_ptr(btree, node, index); 521 ptr = nilfs_btree_node_get_ptr(node, index,
522 NILFS_BTREE_ROOT_NCHILDREN_MAX);
488 path[level].bp_bh = NULL; 523 path[level].bp_bh = NULL;
489 path[level].bp_index = index; 524 path[level].bp_index = index;
490 525
491 for (level--; level >= minlevel; level--) { 526 ncmax = nilfs_btree_nchildren_per_block(btree);
492 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 527
528 while (--level >= minlevel) {
529 ra = NULL;
530 if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
531 p.node = nilfs_btree_get_node(btree, path, level + 1,
532 &p.ncmax);
533 p.index = index;
534 p.max_ra_blocks = 7;
535 ra = &p;
536 }
537 ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
538 ra);
493 if (ret < 0) 539 if (ret < 0)
494 return ret; 540 return ret;
541
495 node = nilfs_btree_get_nonroot_node(path, level); 542 node = nilfs_btree_get_nonroot_node(path, level);
496 if (nilfs_btree_bad_node(node, level)) 543 if (nilfs_btree_bad_node(node, level))
497 return -EINVAL; 544 return -EINVAL;
@@ -499,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
499 found = nilfs_btree_node_lookup(node, key, &index); 546 found = nilfs_btree_node_lookup(node, key, &index);
500 else 547 else
501 index = 0; 548 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(node, btree)) 549 if (index < ncmax) {
503 ptr = nilfs_btree_node_get_ptr(btree, node, index); 550 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
504 else { 551 } else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 552 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */ 553 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR; 554 ptr = NILFS_BMAP_INVALID_PTR;
@@ -517,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
517 return 0; 564 return 0;
518} 565}
519 566
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, 567static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
521 struct nilfs_btree_path *path, 568 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp) 569 __u64 *keyp, __u64 *ptrp)
523{ 570{
524 struct nilfs_btree_node *node; 571 struct nilfs_btree_node *node;
525 __u64 ptr; 572 __u64 ptr;
526 int index, level, ret; 573 int index, level, ncmax, ret;
527 574
528 node = nilfs_btree_get_root(btree); 575 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(node) - 1; 576 index = nilfs_btree_node_get_nchildren(node) - 1;
530 if (index < 0) 577 if (index < 0)
531 return -ENOENT; 578 return -ENOENT;
532 level = nilfs_btree_node_get_level(node); 579 level = nilfs_btree_node_get_level(node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index); 580 ptr = nilfs_btree_node_get_ptr(node, index,
581 NILFS_BTREE_ROOT_NCHILDREN_MAX);
534 path[level].bp_bh = NULL; 582 path[level].bp_bh = NULL;
535 path[level].bp_index = index; 583 path[level].bp_index = index;
584 ncmax = nilfs_btree_nchildren_per_block(btree);
536 585
537 for (level--; level > 0; level--) { 586 for (level--; level > 0; level--) {
538 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 587 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -542,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
542 if (nilfs_btree_bad_node(node, level)) 591 if (nilfs_btree_bad_node(node, level))
543 return -EINVAL; 592 return -EINVAL;
544 index = nilfs_btree_node_get_nchildren(node) - 1; 593 index = nilfs_btree_node_get_nchildren(node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index); 594 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
546 path[level].bp_index = index; 595 path[level].bp_index = index;
547 } 596 }
548 597
@@ -554,53 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
554 return 0; 603 return 0;
555} 604}
556 605
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, 606static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
558 __u64 key, int level, __u64 *ptrp) 607 __u64 key, int level, __u64 *ptrp)
559{ 608{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path; 609 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret; 610 int ret;
564 611
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(); 612 path = nilfs_btree_alloc_path();
567 if (path == NULL) 613 if (path == NULL)
568 return -ENOMEM; 614 return -ENOMEM;
569 nilfs_btree_init_path(path);
570 615
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 616 ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575 617
576 nilfs_btree_release_path(path);
577 nilfs_btree_free_path(path); 618 nilfs_btree_free_path(path);
578 619
579 return ret; 620 return ret;
580} 621}
581 622
582static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, 623static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
583 __u64 key, __u64 *ptrp, unsigned maxblocks) 624 __u64 key, __u64 *ptrp, unsigned maxblocks)
584{ 625{
585 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
586 struct nilfs_btree_path *path; 626 struct nilfs_btree_path *path;
587 struct nilfs_btree_node *node; 627 struct nilfs_btree_node *node;
588 struct inode *dat = NULL; 628 struct inode *dat = NULL;
589 __u64 ptr, ptr2; 629 __u64 ptr, ptr2;
590 sector_t blocknr; 630 sector_t blocknr;
591 int level = NILFS_BTREE_LEVEL_NODE_MIN; 631 int level = NILFS_BTREE_LEVEL_NODE_MIN;
592 int ret, cnt, index, maxlevel; 632 int ret, cnt, index, maxlevel, ncmax;
633 struct nilfs_btree_readahead_info p;
593 634
594 path = nilfs_btree_alloc_path(); 635 path = nilfs_btree_alloc_path();
595 if (path == NULL) 636 if (path == NULL)
596 return -ENOMEM; 637 return -ENOMEM;
597 nilfs_btree_init_path(path); 638
598 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 639 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
599 if (ret < 0) 640 if (ret < 0)
600 goto out; 641 goto out;
601 642
602 if (NILFS_BMAP_USE_VBN(bmap)) { 643 if (NILFS_BMAP_USE_VBN(btree)) {
603 dat = nilfs_bmap_get_dat(bmap); 644 dat = nilfs_bmap_get_dat(btree);
604 ret = nilfs_dat_translate(dat, ptr, &blocknr); 645 ret = nilfs_dat_translate(dat, ptr, &blocknr);
605 if (ret < 0) 646 if (ret < 0)
606 goto out; 647 goto out;
@@ -611,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
611 goto end; 652 goto end;
612 653
613 maxlevel = nilfs_btree_height(btree) - 1; 654 maxlevel = nilfs_btree_height(btree) - 1;
614 node = nilfs_btree_get_node(btree, path, level); 655 node = nilfs_btree_get_node(btree, path, level, &ncmax);
615 index = path[level].bp_index + 1; 656 index = path[level].bp_index + 1;
616 for (;;) { 657 for (;;) {
617 while (index < nilfs_btree_node_get_nchildren(node)) { 658 while (index < nilfs_btree_node_get_nchildren(node)) {
618 if (nilfs_btree_node_get_key(node, index) != 659 if (nilfs_btree_node_get_key(node, index) !=
619 key + cnt) 660 key + cnt)
620 goto end; 661 goto end;
621 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 662 ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
622 if (dat) { 663 if (dat) {
623 ret = nilfs_dat_translate(dat, ptr2, &blocknr); 664 ret = nilfs_dat_translate(dat, ptr2, &blocknr);
624 if (ret < 0) 665 if (ret < 0)
@@ -634,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
634 break; 675 break;
635 676
636 /* look-up right sibling node */ 677 /* look-up right sibling node */
637 node = nilfs_btree_get_node(btree, path, level + 1); 678 p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
638 index = path[level + 1].bp_index + 1; 679 p.index = path[level + 1].bp_index + 1;
639 if (index >= nilfs_btree_node_get_nchildren(node) || 680 p.max_ra_blocks = 7;
640 nilfs_btree_node_get_key(node, index) != key + cnt) 681 if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
682 nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
641 break; 683 break;
642 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 684 ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
643 path[level + 1].bp_index = index; 685 path[level + 1].bp_index = p.index;
644 686
645 brelse(path[level].bp_bh); 687 brelse(path[level].bp_bh);
646 path[level].bp_bh = NULL; 688 path[level].bp_bh = NULL;
647 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); 689
690 ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
691 &p);
648 if (ret < 0) 692 if (ret < 0)
649 goto out; 693 goto out;
650 node = nilfs_btree_get_nonroot_node(path, level); 694 node = nilfs_btree_get_nonroot_node(path, level);
695 ncmax = nilfs_btree_nchildren_per_block(btree);
651 index = 0; 696 index = 0;
652 path[level].bp_index = index; 697 path[level].bp_index = index;
653 } 698 }
@@ -655,12 +700,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
655 *ptrp = ptr; 700 *ptrp = ptr;
656 ret = cnt; 701 ret = cnt;
657 out: 702 out:
658 nilfs_btree_release_path(path);
659 nilfs_btree_free_path(path); 703 nilfs_btree_free_path(path);
660 return ret; 704 return ret;
661} 705}
662 706
663static void nilfs_btree_promote_key(struct nilfs_btree *btree, 707static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
664 struct nilfs_btree_path *path, 708 struct nilfs_btree_path *path,
665 int level, __u64 key) 709 int level, __u64 key)
666{ 710{
@@ -682,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
682 } 726 }
683} 727}
684 728
685static void nilfs_btree_do_insert(struct nilfs_btree *btree, 729static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
686 struct nilfs_btree_path *path, 730 struct nilfs_btree_path *path,
687 int level, __u64 *keyp, __u64 *ptrp) 731 int level, __u64 *keyp, __u64 *ptrp)
688{ 732{
689 struct nilfs_btree_node *node; 733 struct nilfs_btree_node *node;
734 int ncblk;
690 735
691 if (level < nilfs_btree_height(btree) - 1) { 736 if (level < nilfs_btree_height(btree) - 1) {
692 node = nilfs_btree_get_nonroot_node(path, level); 737 node = nilfs_btree_get_nonroot_node(path, level);
693 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 738 ncblk = nilfs_btree_nchildren_per_block(btree);
694 path[level].bp_index); 739 nilfs_btree_node_insert(node, path[level].bp_index,
740 *keyp, *ptrp, ncblk);
695 if (!buffer_dirty(path[level].bp_bh)) 741 if (!buffer_dirty(path[level].bp_bh))
696 nilfs_btnode_mark_dirty(path[level].bp_bh); 742 nilfs_btnode_mark_dirty(path[level].bp_bh);
697 743
@@ -701,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
701 0)); 747 0));
702 } else { 748 } else {
703 node = nilfs_btree_get_root(btree); 749 node = nilfs_btree_get_root(btree);
704 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 750 nilfs_btree_node_insert(node, path[level].bp_index,
705 path[level].bp_index); 751 *keyp, *ptrp,
752 NILFS_BTREE_ROOT_NCHILDREN_MAX);
706 } 753 }
707} 754}
708 755
709static void nilfs_btree_carry_left(struct nilfs_btree *btree, 756static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
710 struct nilfs_btree_path *path, 757 struct nilfs_btree_path *path,
711 int level, __u64 *keyp, __u64 *ptrp) 758 int level, __u64 *keyp, __u64 *ptrp)
712{ 759{
713 struct nilfs_btree_node *node, *left; 760 struct nilfs_btree_node *node, *left;
714 int nchildren, lnchildren, n, move; 761 int nchildren, lnchildren, n, move, ncblk;
715 762
716 node = nilfs_btree_get_nonroot_node(path, level); 763 node = nilfs_btree_get_nonroot_node(path, level);
717 left = nilfs_btree_get_sib_node(path, level); 764 left = nilfs_btree_get_sib_node(path, level);
718 nchildren = nilfs_btree_node_get_nchildren(node); 765 nchildren = nilfs_btree_node_get_nchildren(node);
719 lnchildren = nilfs_btree_node_get_nchildren(left); 766 lnchildren = nilfs_btree_node_get_nchildren(left);
767 ncblk = nilfs_btree_nchildren_per_block(btree);
720 move = 0; 768 move = 0;
721 769
722 n = (nchildren + lnchildren + 1) / 2 - lnchildren; 770 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -726,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
726 move = 1; 774 move = 1;
727 } 775 }
728 776
729 nilfs_btree_node_move_left(btree, left, node, n); 777 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
730 778
731 if (!buffer_dirty(path[level].bp_bh)) 779 if (!buffer_dirty(path[level].bp_bh))
732 nilfs_btnode_mark_dirty(path[level].bp_bh); 780 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
751 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 799 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
752} 800}
753 801
754static void nilfs_btree_carry_right(struct nilfs_btree *btree, 802static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
755 struct nilfs_btree_path *path, 803 struct nilfs_btree_path *path,
756 int level, __u64 *keyp, __u64 *ptrp) 804 int level, __u64 *keyp, __u64 *ptrp)
757{ 805{
758 struct nilfs_btree_node *node, *right; 806 struct nilfs_btree_node *node, *right;
759 int nchildren, rnchildren, n, move; 807 int nchildren, rnchildren, n, move, ncblk;
760 808
761 node = nilfs_btree_get_nonroot_node(path, level); 809 node = nilfs_btree_get_nonroot_node(path, level);
762 right = nilfs_btree_get_sib_node(path, level); 810 right = nilfs_btree_get_sib_node(path, level);
763 nchildren = nilfs_btree_node_get_nchildren(node); 811 nchildren = nilfs_btree_node_get_nchildren(node);
764 rnchildren = nilfs_btree_node_get_nchildren(right); 812 rnchildren = nilfs_btree_node_get_nchildren(right);
813 ncblk = nilfs_btree_nchildren_per_block(btree);
765 move = 0; 814 move = 0;
766 815
767 n = (nchildren + rnchildren + 1) / 2 - rnchildren; 816 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -771,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
771 move = 1; 820 move = 1;
772 } 821 }
773 822
774 nilfs_btree_node_move_right(btree, node, right, n); 823 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
775 824
776 if (!buffer_dirty(path[level].bp_bh)) 825 if (!buffer_dirty(path[level].bp_bh))
777 nilfs_btnode_mark_dirty(path[level].bp_bh); 826 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -797,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
797 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 846 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
798} 847}
799 848
800static void nilfs_btree_split(struct nilfs_btree *btree, 849static void nilfs_btree_split(struct nilfs_bmap *btree,
801 struct nilfs_btree_path *path, 850 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp) 851 int level, __u64 *keyp, __u64 *ptrp)
803{ 852{
804 struct nilfs_btree_node *node, *right; 853 struct nilfs_btree_node *node, *right;
805 __u64 newkey; 854 __u64 newkey;
806 __u64 newptr; 855 __u64 newptr;
807 int nchildren, n, move; 856 int nchildren, n, move, ncblk;
808 857
809 node = nilfs_btree_get_nonroot_node(path, level); 858 node = nilfs_btree_get_nonroot_node(path, level);
810 right = nilfs_btree_get_sib_node(path, level); 859 right = nilfs_btree_get_sib_node(path, level);
811 nchildren = nilfs_btree_node_get_nchildren(node); 860 nchildren = nilfs_btree_node_get_nchildren(node);
861 ncblk = nilfs_btree_nchildren_per_block(btree);
812 move = 0; 862 move = 0;
813 863
814 n = (nchildren + 1) / 2; 864 n = (nchildren + 1) / 2;
@@ -817,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
817 move = 1; 867 move = 1;
818 } 868 }
819 869
820 nilfs_btree_node_move_right(btree, node, right, n); 870 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
821 871
822 if (!buffer_dirty(path[level].bp_bh)) 872 if (!buffer_dirty(path[level].bp_bh))
823 nilfs_btnode_mark_dirty(path[level].bp_bh); 873 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -829,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
829 879
830 if (move) { 880 if (move) {
831 path[level].bp_index -= nilfs_btree_node_get_nchildren(node); 881 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
832 nilfs_btree_node_insert(btree, right, *keyp, *ptrp, 882 nilfs_btree_node_insert(right, path[level].bp_index,
833 path[level].bp_index); 883 *keyp, *ptrp, ncblk);
834 884
835 *keyp = nilfs_btree_node_get_key(right, 0); 885 *keyp = nilfs_btree_node_get_key(right, 0);
836 *ptrp = path[level].bp_newreq.bpr_ptr; 886 *ptrp = path[level].bp_newreq.bpr_ptr;
@@ -851,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
851 path[level + 1].bp_index++; 901 path[level + 1].bp_index++;
852} 902}
853 903
854static void nilfs_btree_grow(struct nilfs_btree *btree, 904static void nilfs_btree_grow(struct nilfs_bmap *btree,
855 struct nilfs_btree_path *path, 905 struct nilfs_btree_path *path,
856 int level, __u64 *keyp, __u64 *ptrp) 906 int level, __u64 *keyp, __u64 *ptrp)
857{ 907{
858 struct nilfs_btree_node *root, *child; 908 struct nilfs_btree_node *root, *child;
859 int n; 909 int n, ncblk;
860 910
861 root = nilfs_btree_get_root(btree); 911 root = nilfs_btree_get_root(btree);
862 child = nilfs_btree_get_sib_node(path, level); 912 child = nilfs_btree_get_sib_node(path, level);
913 ncblk = nilfs_btree_nchildren_per_block(btree);
863 914
864 n = nilfs_btree_node_get_nchildren(root); 915 n = nilfs_btree_node_get_nchildren(root);
865 916
866 nilfs_btree_node_move_right(btree, root, child, n); 917 nilfs_btree_node_move_right(root, child, n,
918 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
867 nilfs_btree_node_set_level(root, level + 1); 919 nilfs_btree_node_set_level(root, level + 1);
868 920
869 if (!buffer_dirty(path[level].bp_sib_bh)) 921 if (!buffer_dirty(path[level].bp_sib_bh))
@@ -878,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
878 *ptrp = path[level].bp_newreq.bpr_ptr; 930 *ptrp = path[level].bp_newreq.bpr_ptr;
879} 931}
880 932
881static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, 933static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
882 const struct nilfs_btree_path *path) 934 const struct nilfs_btree_path *path)
883{ 935{
884 struct nilfs_btree_node *node; 936 struct nilfs_btree_node *node;
885 int level; 937 int level, ncmax;
886 938
887 if (path == NULL) 939 if (path == NULL)
888 return NILFS_BMAP_INVALID_PTR; 940 return NILFS_BMAP_INVALID_PTR;
@@ -890,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
890 /* left sibling */ 942 /* left sibling */
891 level = NILFS_BTREE_LEVEL_NODE_MIN; 943 level = NILFS_BTREE_LEVEL_NODE_MIN;
892 if (path[level].bp_index > 0) { 944 if (path[level].bp_index > 0) {
893 node = nilfs_btree_get_node(btree, path, level); 945 node = nilfs_btree_get_node(btree, path, level, &ncmax);
894 return nilfs_btree_node_get_ptr(btree, node, 946 return nilfs_btree_node_get_ptr(node,
895 path[level].bp_index - 1); 947 path[level].bp_index - 1,
948 ncmax);
896 } 949 }
897 950
898 /* parent */ 951 /* parent */
899 level = NILFS_BTREE_LEVEL_NODE_MIN + 1; 952 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
900 if (level <= nilfs_btree_height(btree) - 1) { 953 if (level <= nilfs_btree_height(btree) - 1) {
901 node = nilfs_btree_get_node(btree, path, level); 954 node = nilfs_btree_get_node(btree, path, level, &ncmax);
902 return nilfs_btree_node_get_ptr(btree, node, 955 return nilfs_btree_node_get_ptr(node, path[level].bp_index,
903 path[level].bp_index); 956 ncmax);
904 } 957 }
905 958
906 return NILFS_BMAP_INVALID_PTR; 959 return NILFS_BMAP_INVALID_PTR;
907} 960}
908 961
909static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, 962static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
910 const struct nilfs_btree_path *path, 963 const struct nilfs_btree_path *path,
911 __u64 key) 964 __u64 key)
912{ 965{
913 __u64 ptr; 966 __u64 ptr;
914 967
915 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); 968 ptr = nilfs_bmap_find_target_seq(btree, key);
916 if (ptr != NILFS_BMAP_INVALID_PTR) 969 if (ptr != NILFS_BMAP_INVALID_PTR)
917 /* sequential access */ 970 /* sequential access */
918 return ptr; 971 return ptr;
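nilfs_btree_find_near() is an allocation-locality heuristic: when a new block needs a home, prefer the pointer just left of the insertion point, else the pointer the parent holds for the current node, so related blocks tend to land near each other on disk. A minimal model of that two-step fallback (simplified types, illustrative names):

#include <stdint.h>

#define MODEL_INVALID_PTR ((uint64_t)0)

/* ptrs[]: child pointers of the lowest node on the lookup path;
 * index: the insertion position in that node; parent_ptr: the block
 * holding the node itself, or MODEL_INVALID_PTR when unavailable. */
static uint64_t model_find_near(const uint64_t *ptrs, int index,
				uint64_t parent_ptr)
{
	if (index > 0)
		return ptrs[index - 1];	/* left neighbor: best locality */
	return parent_ptr;		/* else allocate near the parent */
}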
@@ -923,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
923 return ptr; 976 return ptr;
924 } 977 }
925 /* block group */ 978 /* block group */
926 return nilfs_bmap_find_target_in_group(&btree->bt_bmap); 979 return nilfs_bmap_find_target_in_group(btree);
927} 980}
928 981
929static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, 982static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
930 __u64 ptr)
931{
932 btree->bt_bmap.b_last_allocated_key = key;
933 btree->bt_bmap.b_last_allocated_ptr = ptr;
934}
935
936static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
937 struct nilfs_btree_path *path, 983 struct nilfs_btree_path *path,
938 int *levelp, __u64 key, __u64 ptr, 984 int *levelp, __u64 key, __u64 ptr,
939 struct nilfs_bmap_stats *stats) 985 struct nilfs_bmap_stats *stats)
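The helper deleted in the left column above did nothing but cache the last allocated key/pointer pair on the bmap, and the nilfs_bmap_set_target_v() that replaces it presumably does the same one layer down. A sketch of that cache, with illustrative names:

#include <stdint.h>

struct model_bmap {
	uint64_t last_allocated_key;
	uint64_t last_allocated_ptr;
};

/* Remember the most recent allocation so the next one can detect
 * sequential access and place its block immediately afterwards. */
static inline void model_set_target_v(struct model_bmap *bmap,
				      uint64_t key, uint64_t ptr)
{
	bmap->last_allocated_key = key;
	bmap->last_allocated_ptr = ptr;
}

nilfs_btree_find_target_v() above is the consumer: a hit from nilfs_bmap_find_target_seq() means sequential access, otherwise it tries the sibling/parent heuristic and finally falls back to the block group.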
@@ -941,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
941 struct buffer_head *bh; 987 struct buffer_head *bh;
942 struct nilfs_btree_node *node, *parent, *sib; 988 struct nilfs_btree_node *node, *parent, *sib;
943 __u64 sibptr; 989 __u64 sibptr;
944 int pindex, level, ret; 990 int pindex, level, ncmax, ncblk, ret;
945 struct inode *dat = NULL; 991 struct inode *dat = NULL;
946 992
947 stats->bs_nblocks = 0; 993 stats->bs_nblocks = 0;
948 level = NILFS_BTREE_LEVEL_DATA; 994 level = NILFS_BTREE_LEVEL_DATA;
949 995
950 /* allocate a new ptr for data block */ 996 /* allocate a new ptr for data block */
951 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 997 if (NILFS_BMAP_USE_VBN(btree)) {
952 path[level].bp_newreq.bpr_ptr = 998 path[level].bp_newreq.bpr_ptr =
953 nilfs_btree_find_target_v(btree, path, key); 999 nilfs_btree_find_target_v(btree, path, key);
954 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1000 dat = nilfs_bmap_get_dat(btree);
955 } 1001 }
956 1002
957 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1003 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
958 &path[level].bp_newreq, dat);
959 if (ret < 0) 1004 if (ret < 0)
960 goto err_out_data; 1005 goto err_out_data;
961 1006
1007 ncblk = nilfs_btree_nchildren_per_block(btree);
1008
962 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1009 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
963 level < nilfs_btree_height(btree) - 1; 1010 level < nilfs_btree_height(btree) - 1;
964 level++) { 1011 level++) {
965 node = nilfs_btree_get_nonroot_node(path, level); 1012 node = nilfs_btree_get_nonroot_node(path, level);
966 if (nilfs_btree_node_get_nchildren(node) < 1013 if (nilfs_btree_node_get_nchildren(node) < ncblk) {
967 nilfs_btree_node_nchildren_max(node, btree)) {
968 path[level].bp_op = nilfs_btree_do_insert; 1014 path[level].bp_op = nilfs_btree_do_insert;
969 stats->bs_nblocks++; 1015 stats->bs_nblocks++;
970 goto out; 1016 goto out;
971 } 1017 }
972 1018
973 parent = nilfs_btree_get_node(btree, path, level + 1); 1019 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
974 pindex = path[level + 1].bp_index; 1020 pindex = path[level + 1].bp_index;
975 1021
976 /* left sibling */ 1022 /* left sibling */
977 if (pindex > 0) { 1023 if (pindex > 0) {
978 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1024 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
979 pindex - 1); 1025 ncmax);
980 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1026 ret = nilfs_btree_get_block(btree, sibptr, &bh);
981 if (ret < 0) 1027 if (ret < 0)
982 goto err_out_child_node; 1028 goto err_out_child_node;
983 sib = (struct nilfs_btree_node *)bh->b_data; 1029 sib = (struct nilfs_btree_node *)bh->b_data;
984 if (nilfs_btree_node_get_nchildren(sib) < 1030 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
985 nilfs_btree_node_nchildren_max(sib, btree)) {
986 path[level].bp_sib_bh = bh; 1031 path[level].bp_sib_bh = bh;
987 path[level].bp_op = nilfs_btree_carry_left; 1032 path[level].bp_op = nilfs_btree_carry_left;
988 stats->bs_nblocks++; 1033 stats->bs_nblocks++;
989 goto out; 1034 goto out;
990 } else 1035 } else {
991 brelse(bh); 1036 brelse(bh);
1037 }
992 } 1038 }
993 1039
994 /* right sibling */ 1040 /* right sibling */
995 if (pindex < 1041 if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
996 nilfs_btree_node_get_nchildren(parent) - 1) { 1042 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
997 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1043 ncmax);
998 pindex + 1);
999 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1044 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1000 if (ret < 0) 1045 if (ret < 0)
1001 goto err_out_child_node; 1046 goto err_out_child_node;
1002 sib = (struct nilfs_btree_node *)bh->b_data; 1047 sib = (struct nilfs_btree_node *)bh->b_data;
1003 if (nilfs_btree_node_get_nchildren(sib) < 1048 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
1004 nilfs_btree_node_nchildren_max(sib, btree)) {
1005 path[level].bp_sib_bh = bh; 1049 path[level].bp_sib_bh = bh;
1006 path[level].bp_op = nilfs_btree_carry_right; 1050 path[level].bp_op = nilfs_btree_carry_right;
1007 stats->bs_nblocks++; 1051 stats->bs_nblocks++;
1008 goto out; 1052 goto out;
1009 } else 1053 } else {
1010 brelse(bh); 1054 brelse(bh);
1055 }
1011 } 1056 }
1012 1057
1013 /* split */ 1058 /* split */
1014 path[level].bp_newreq.bpr_ptr = 1059 path[level].bp_newreq.bpr_ptr =
1015 path[level - 1].bp_newreq.bpr_ptr + 1; 1060 path[level - 1].bp_newreq.bpr_ptr + 1;
1016 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1061 ret = nilfs_bmap_prepare_alloc_ptr(btree,
1017 &path[level].bp_newreq, dat); 1062 &path[level].bp_newreq, dat);
1018 if (ret < 0) 1063 if (ret < 0)
1019 goto err_out_child_node; 1064 goto err_out_child_node;
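The loop above ascends from the leaf deciding, level by level, the cheapest way to make room: insert in place when the node has a free slot, otherwise shift entries into a left or right sibling with spare capacity, otherwise split. A compact model of the decision ladder (illustrative names; the real code additionally reserves a new block pointer before committing to a split):

enum insert_op { DO_INSERT, CARRY_LEFT, CARRY_RIGHT, SPLIT };

/* ncblk: child capacity of a full node block; a sibling count of -1
 * means that sibling does not exist. */
static enum insert_op pick_insert_op(int nchildren, int ncblk,
				     int left_nchildren,
				     int right_nchildren)
{
	if (nchildren < ncblk)
		return DO_INSERT;	/* room in this node */
	if (left_nchildren >= 0 && left_nchildren < ncblk)
		return CARRY_LEFT;	/* rebalance into the left sibling */
	if (right_nchildren >= 0 && right_nchildren < ncblk)
		return CARRY_RIGHT;	/* rebalance into the right sibling */
	return SPLIT;			/* both full: allocate a sibling */
}

Escalation stops at the first level that can absorb the change; everything above it stays untouched.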
@@ -1025,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1025 1070
1026 stats->bs_nblocks++; 1071 stats->bs_nblocks++;
1027 1072
1028 nilfs_btree_node_init(btree, 1073 sib = (struct nilfs_btree_node *)bh->b_data;
1029 (struct nilfs_btree_node *)bh->b_data, 1074 nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
1030 0, level, 0, NULL, NULL);
1031 path[level].bp_sib_bh = bh; 1075 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_split; 1076 path[level].bp_op = nilfs_btree_split;
1033 } 1077 }
@@ -1035,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1035 /* root */ 1079 /* root */
1036 node = nilfs_btree_get_root(btree); 1080 node = nilfs_btree_get_root(btree);
1037 if (nilfs_btree_node_get_nchildren(node) < 1081 if (nilfs_btree_node_get_nchildren(node) <
1038 nilfs_btree_node_nchildren_max(node, btree)) { 1082 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1039 path[level].bp_op = nilfs_btree_do_insert; 1083 path[level].bp_op = nilfs_btree_do_insert;
1040 stats->bs_nblocks++; 1084 stats->bs_nblocks++;
1041 goto out; 1085 goto out;
@@ -1043,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1043 1087
1044 /* grow */ 1088 /* grow */
1045 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; 1089 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
1046 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1090 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
1047 &path[level].bp_newreq, dat);
1048 if (ret < 0) 1091 if (ret < 0)
1049 goto err_out_child_node; 1092 goto err_out_child_node;
1050 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, 1093 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1052,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1052 if (ret < 0) 1095 if (ret < 0)
1053 goto err_out_curr_node; 1096 goto err_out_curr_node;
1054 1097
1055 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 1098 nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
1056 0, level, 0, NULL, NULL); 1099 0, level, 0, ncblk, NULL, NULL);
1057 path[level].bp_sib_bh = bh; 1100 path[level].bp_sib_bh = bh;
1058 path[level].bp_op = nilfs_btree_grow; 1101 path[level].bp_op = nilfs_btree_grow;
1059 1102
@@ -1070,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1070 1113
1071 /* error */ 1114 /* error */
1072 err_out_curr_node: 1115 err_out_curr_node:
1073 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1116 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1074 dat);
1075 err_out_child_node: 1117 err_out_child_node:
1076 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { 1118 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1077 nilfs_btnode_delete(path[level].bp_sib_bh); 1119 nilfs_btnode_delete(path[level].bp_sib_bh);
1078 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, 1120 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1079 &path[level].bp_newreq, dat);
1080 1121
1081 } 1122 }
1082 1123
1083 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1124 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1084 dat);
1085 err_out_data: 1125 err_out_data:
1086 *levelp = level; 1126 *levelp = level;
1087 stats->bs_nblocks = 0; 1127 stats->bs_nblocks = 0;
1088 return ret; 1128 return ret;
1089} 1129}
1090 1130
1091static void nilfs_btree_commit_insert(struct nilfs_btree *btree, 1131static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
1092 struct nilfs_btree_path *path, 1132 struct nilfs_btree_path *path,
1093 int maxlevel, __u64 key, __u64 ptr) 1133 int maxlevel, __u64 key, __u64 ptr)
1094{ 1134{
@@ -1097,36 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1097 1137
1098 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1138 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1099 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; 1139 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1100 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 1140 if (NILFS_BMAP_USE_VBN(btree)) {
1101 nilfs_btree_set_target_v(btree, key, ptr); 1141 nilfs_bmap_set_target_v(btree, key, ptr);
1102 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1142 dat = nilfs_bmap_get_dat(btree);
1103 } 1143 }
1104 1144
1105 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1145 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1106 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, 1146 nilfs_bmap_commit_alloc_ptr(btree,
1107 &path[level - 1].bp_newreq, dat); 1147 &path[level - 1].bp_newreq, dat);
1108 path[level].bp_op(btree, path, level, &key, &ptr); 1148 path[level].bp_op(btree, path, level, &key, &ptr);
1109 } 1149 }
1110 1150
1111 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1151 if (!nilfs_bmap_dirty(btree))
1112 nilfs_bmap_set_dirty(&btree->bt_bmap); 1152 nilfs_bmap_set_dirty(btree);
1113} 1153}
1114 1154
1115static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 1155static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1116{ 1156{
1117 struct nilfs_btree *btree;
1118 struct nilfs_btree_path *path; 1157 struct nilfs_btree_path *path;
1119 struct nilfs_bmap_stats stats; 1158 struct nilfs_bmap_stats stats;
1120 int level, ret; 1159 int level, ret;
1121 1160
1122 btree = (struct nilfs_btree *)bmap;
1123 path = nilfs_btree_alloc_path(); 1161 path = nilfs_btree_alloc_path();
1124 if (path == NULL) 1162 if (path == NULL)
1125 return -ENOMEM; 1163 return -ENOMEM;
1126 nilfs_btree_init_path(path);
1127 1164
1128 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1165 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1129 NILFS_BTREE_LEVEL_NODE_MIN); 1166 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1130 if (ret != -ENOENT) { 1167 if (ret != -ENOENT) {
1131 if (ret == 0) 1168 if (ret == 0)
1132 ret = -EEXIST; 1169 ret = -EEXIST;
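nilfs_btree_commit_insert() replays the plan recorded during preparation: per level it commits the reserved pointer, then invokes the operation stashed in path[level].bp_op. A stripped-down model of that dispatch:

#include <stdint.h>

struct model_path;
typedef void (*model_op)(struct model_path *path, int level,
			 uint64_t *keyp, uint64_t *ptrp);

struct model_path {
	model_op op;	/* chosen during the prepare phase */
	/* per-level buffers and pointer reservations elided */
};

/* Commit phase: no decisions remain, just run the plan bottom-up.
 * Each op may rewrite *keyp/*ptrp as input for the level above. */
static void model_commit(struct model_path *path, int maxlevel,
			 uint64_t key, uint64_t ptr)
{
	int level;

	for (level = 1; level <= maxlevel; level++)
		path[level].op(path, level, &key, &ptr);
}

Splitting prepare from commit keeps failure handling in one place: by the time any bp_op runs, every allocation has already succeeded, so the commit functions are all void and cannot fail halfway.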
@@ -1137,24 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1137 if (ret < 0) 1174 if (ret < 0)
1138 goto out; 1175 goto out;
1139 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1140 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1141 1178
1142 out: 1179 out:
1143 nilfs_btree_release_path(path);
1144 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
1145 return ret; 1181 return ret;
1146} 1182}
1147 1183
1148static void nilfs_btree_do_delete(struct nilfs_btree *btree, 1184static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
1149 struct nilfs_btree_path *path, 1185 struct nilfs_btree_path *path,
1150 int level, __u64 *keyp, __u64 *ptrp) 1186 int level, __u64 *keyp, __u64 *ptrp)
1151{ 1187{
1152 struct nilfs_btree_node *node; 1188 struct nilfs_btree_node *node;
1189 int ncblk;
1153 1190
1154 if (level < nilfs_btree_height(btree) - 1) { 1191 if (level < nilfs_btree_height(btree) - 1) {
1155 node = nilfs_btree_get_nonroot_node(path, level); 1192 node = nilfs_btree_get_nonroot_node(path, level);
1156 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1193 ncblk = nilfs_btree_nchildren_per_block(btree);
1157 path[level].bp_index); 1194 nilfs_btree_node_delete(node, path[level].bp_index,
1195 keyp, ptrp, ncblk);
1158 if (!buffer_dirty(path[level].bp_bh)) 1196 if (!buffer_dirty(path[level].bp_bh))
1159 nilfs_btnode_mark_dirty(path[level].bp_bh); 1197 nilfs_btnode_mark_dirty(path[level].bp_bh);
1160 if (path[level].bp_index == 0) 1198 if (path[level].bp_index == 0)
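nilfs_btree_insert() shows the three-act structure that repeats throughout this file: look up the key expecting -ENOENT, prepare, then commit and account the new blocks. The skeleton, as model code (the prototypes stand in for the real helpers):

#include <errno.h>
#include <stdint.h>

struct model_tree;
struct model_path;
struct model_path *model_alloc_path(void);
void model_free_path(struct model_path *path);
int model_do_lookup(struct model_tree *tree, struct model_path *path,
		    uint64_t key);
int model_prepare_insert(struct model_tree *tree, struct model_path *path,
			 int *levelp, uint64_t key, uint64_t ptr);
void model_commit_insert(struct model_tree *tree, struct model_path *path,
			 int maxlevel, uint64_t key, uint64_t ptr);

static int model_insert(struct model_tree *tree, uint64_t key, uint64_t ptr)
{
	struct model_path *path = model_alloc_path();
	int level, ret;

	if (!path)
		return -ENOMEM;
	ret = model_do_lookup(tree, path, key);
	if (ret != -ENOENT) {
		if (ret == 0)
			ret = -EEXIST;	/* lookup success means duplicate */
		goto out;
	}
	ret = model_prepare_insert(tree, path, &level, key, ptr);
	if (ret == 0)
		model_commit_insert(tree, path, level, key, ptr);
out:
	model_free_path(path);
	return ret;
}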
@@ -1162,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1162 nilfs_btree_node_get_key(node, 0)); 1200 nilfs_btree_node_get_key(node, 0));
1163 } else { 1201 } else {
1164 node = nilfs_btree_get_root(btree); 1202 node = nilfs_btree_get_root(btree);
1165 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1203 nilfs_btree_node_delete(node, path[level].bp_index,
1166 path[level].bp_index); 1204 keyp, ptrp,
1205 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1167 } 1206 }
1168} 1207}
1169 1208
1170static void nilfs_btree_borrow_left(struct nilfs_btree *btree, 1209static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
1171 struct nilfs_btree_path *path, 1210 struct nilfs_btree_path *path,
1172 int level, __u64 *keyp, __u64 *ptrp) 1211 int level, __u64 *keyp, __u64 *ptrp)
1173{ 1212{
1174 struct nilfs_btree_node *node, *left; 1213 struct nilfs_btree_node *node, *left;
1175 int nchildren, lnchildren, n; 1214 int nchildren, lnchildren, n, ncblk;
1176 1215
1177 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1216 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1178 1217
@@ -1180,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1180 left = nilfs_btree_get_sib_node(path, level); 1219 left = nilfs_btree_get_sib_node(path, level);
1181 nchildren = nilfs_btree_node_get_nchildren(node); 1220 nchildren = nilfs_btree_node_get_nchildren(node);
1182 lnchildren = nilfs_btree_node_get_nchildren(left); 1221 lnchildren = nilfs_btree_node_get_nchildren(left);
1222 ncblk = nilfs_btree_nchildren_per_block(btree);
1183 1223
1184 n = (nchildren + lnchildren) / 2 - nchildren; 1224 n = (nchildren + lnchildren) / 2 - nchildren;
1185 1225
1186 nilfs_btree_node_move_right(btree, left, node, n); 1226 nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
1187 1227
1188 if (!buffer_dirty(path[level].bp_bh)) 1228 if (!buffer_dirty(path[level].bp_bh))
1189 nilfs_btnode_mark_dirty(path[level].bp_bh); 1229 nilfs_btnode_mark_dirty(path[level].bp_bh);
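The borrow operations equalize occupancy rather than moving a single entry: n = (nchildren + lnchildren) / 2 - nchildren shifts just enough entries that both nodes end up holding about half of the combined total. A worked check of the formula:

#include <assert.h>

/* Number of entries to pull from a sibling to balance two nodes. */
static int borrow_count(int nchildren, int sib_nchildren)
{
	return (nchildren + sib_nchildren) / 2 - nchildren;
}

int main(void)
{
	/* Node fell to 5 entries, sibling holds 13: move 4, ending 9/9. */
	assert(borrow_count(5, 13) == 4);
	/* Odd totals round in the sibling's favor: 5 and 12 end 8/9. */
	assert(borrow_count(5, 12) == 3);
	return 0;
}

Taking several entries at once means the node will not underflow again on the very next removal, so runs of deletions in one spot do not trigger a rebalance every time.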
@@ -1198,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1198 path[level].bp_index += n; 1238 path[level].bp_index += n;
1199} 1239}
1200 1240
1201static void nilfs_btree_borrow_right(struct nilfs_btree *btree, 1241static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
1202 struct nilfs_btree_path *path, 1242 struct nilfs_btree_path *path,
1203 int level, __u64 *keyp, __u64 *ptrp) 1243 int level, __u64 *keyp, __u64 *ptrp)
1204{ 1244{
1205 struct nilfs_btree_node *node, *right; 1245 struct nilfs_btree_node *node, *right;
1206 int nchildren, rnchildren, n; 1246 int nchildren, rnchildren, n, ncblk;
1207 1247
1208 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1248 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1209 1249
@@ -1211,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1211 right = nilfs_btree_get_sib_node(path, level); 1251 right = nilfs_btree_get_sib_node(path, level);
1212 nchildren = nilfs_btree_node_get_nchildren(node); 1252 nchildren = nilfs_btree_node_get_nchildren(node);
1213 rnchildren = nilfs_btree_node_get_nchildren(right); 1253 rnchildren = nilfs_btree_node_get_nchildren(right);
1254 ncblk = nilfs_btree_nchildren_per_block(btree);
1214 1255
1215 n = (nchildren + rnchildren) / 2 - nchildren; 1256 n = (nchildren + rnchildren) / 2 - nchildren;
1216 1257
1217 nilfs_btree_node_move_left(btree, node, right, n); 1258 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1218 1259
1219 if (!buffer_dirty(path[level].bp_bh)) 1260 if (!buffer_dirty(path[level].bp_bh))
1220 nilfs_btnode_mark_dirty(path[level].bp_bh); 1261 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1230,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1230 path[level].bp_sib_bh = NULL; 1271 path[level].bp_sib_bh = NULL;
1231} 1272}
1232 1273
1233static void nilfs_btree_concat_left(struct nilfs_btree *btree, 1274static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
1234 struct nilfs_btree_path *path, 1275 struct nilfs_btree_path *path,
1235 int level, __u64 *keyp, __u64 *ptrp) 1276 int level, __u64 *keyp, __u64 *ptrp)
1236{ 1277{
1237 struct nilfs_btree_node *node, *left; 1278 struct nilfs_btree_node *node, *left;
1238 int n; 1279 int n, ncblk;
1239 1280
1240 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1281 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1241 1282
1242 node = nilfs_btree_get_nonroot_node(path, level); 1283 node = nilfs_btree_get_nonroot_node(path, level);
1243 left = nilfs_btree_get_sib_node(path, level); 1284 left = nilfs_btree_get_sib_node(path, level);
1285 ncblk = nilfs_btree_nchildren_per_block(btree);
1244 1286
1245 n = nilfs_btree_node_get_nchildren(node); 1287 n = nilfs_btree_node_get_nchildren(node);
1246 1288
1247 nilfs_btree_node_move_left(btree, left, node, n); 1289 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
1248 1290
1249 if (!buffer_dirty(path[level].bp_sib_bh)) 1291 if (!buffer_dirty(path[level].bp_sib_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1292 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1255,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1255 path[level].bp_index += nilfs_btree_node_get_nchildren(left); 1297 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1256} 1298}
1257 1299
1258static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1300static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
1259 struct nilfs_btree_path *path, 1301 struct nilfs_btree_path *path,
1260 int level, __u64 *keyp, __u64 *ptrp) 1302 int level, __u64 *keyp, __u64 *ptrp)
1261{ 1303{
1262 struct nilfs_btree_node *node, *right; 1304 struct nilfs_btree_node *node, *right;
1263 int n; 1305 int n, ncblk;
1264 1306
1265 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1307 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1266 1308
1267 node = nilfs_btree_get_nonroot_node(path, level); 1309 node = nilfs_btree_get_nonroot_node(path, level);
1268 right = nilfs_btree_get_sib_node(path, level); 1310 right = nilfs_btree_get_sib_node(path, level);
1311 ncblk = nilfs_btree_nchildren_per_block(btree);
1269 1312
1270 n = nilfs_btree_node_get_nchildren(right); 1313 n = nilfs_btree_node_get_nchildren(right);
1271 1314
1272 nilfs_btree_node_move_left(btree, node, right, n); 1315 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1273 1316
1274 if (!buffer_dirty(path[level].bp_bh)) 1317 if (!buffer_dirty(path[level].bp_bh))
1275 nilfs_btnode_mark_dirty(path[level].bp_bh); 1318 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1279,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1279 path[level + 1].bp_index++; 1322 path[level + 1].bp_index++;
1280} 1323}
1281 1324
1282static void nilfs_btree_shrink(struct nilfs_btree *btree, 1325static void nilfs_btree_shrink(struct nilfs_bmap *btree,
1283 struct nilfs_btree_path *path, 1326 struct nilfs_btree_path *path,
1284 int level, __u64 *keyp, __u64 *ptrp) 1327 int level, __u64 *keyp, __u64 *ptrp)
1285{ 1328{
1286 struct nilfs_btree_node *root, *child; 1329 struct nilfs_btree_node *root, *child;
1287 int n; 1330 int n, ncblk;
1288 1331
1289 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1332 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1290 1333
1291 root = nilfs_btree_get_root(btree); 1334 root = nilfs_btree_get_root(btree);
1292 child = nilfs_btree_get_nonroot_node(path, level); 1335 child = nilfs_btree_get_nonroot_node(path, level);
1336 ncblk = nilfs_btree_nchildren_per_block(btree);
1293 1337
1294 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1338 nilfs_btree_node_delete(root, 0, NULL, NULL,
1339 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1295 nilfs_btree_node_set_level(root, level); 1340 nilfs_btree_node_set_level(root, level);
1296 n = nilfs_btree_node_get_nchildren(child); 1341 n = nilfs_btree_node_get_nchildren(child);
1297 nilfs_btree_node_move_left(btree, root, child, n); 1342 nilfs_btree_node_move_left(root, child, n,
1343 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
1298 1344
1299 nilfs_btnode_delete(path[level].bp_bh); 1345 nilfs_btnode_delete(path[level].bp_bh);
1300 path[level].bp_bh = NULL; 1346 path[level].bp_bh = NULL;
1301} 1347}
1302 1348
1303 1349
1304static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1350static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
1305 struct nilfs_btree_path *path, 1351 struct nilfs_btree_path *path,
1306 int *levelp, 1352 int *levelp,
1307 struct nilfs_bmap_stats *stats, 1353 struct nilfs_bmap_stats *stats,
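nilfs_btree_shrink() is the inverse of nilfs_btree_grow(): when the root is down to a single child whose entries all fit in the root itself, the child's contents are pulled up, the root takes over the child's level, and the child's block is deleted, reducing the tree height by one. In model form:

#include <string.h>
#include <stdint.h>

#define MODEL_ROOT_MAX 3	/* tiny capacity of an inode-embedded root */

struct model_node {
	int level;
	int nchildren;
	uint64_t keys[MODEL_ROOT_MAX];
	uint64_t ptrs[MODEL_ROOT_MAX];
};

/* Collapse one level: the root's lone child pointer is replaced by
 * the child's own entries; the caller then frees the child's block. */
static void model_shrink(struct model_node *root,
			 const struct model_node *child)
{
	root->level = child->level;
	memcpy(root->keys, child->keys, child->nchildren * sizeof(uint64_t));
	memcpy(root->ptrs, child->ptrs, child->nchildren * sizeof(uint64_t));
	root->nchildren = child->nchildren;
}

MODEL_ROOT_MAX = 3 is only an illustration of why the root and block capacities differ; the actual NILFS_BTREE_ROOT_NCHILDREN_MAX derives from the bmap area inside the on-disk inode.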
@@ -1310,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1310 struct buffer_head *bh; 1356 struct buffer_head *bh;
1311 struct nilfs_btree_node *node, *parent, *sib; 1357 struct nilfs_btree_node *node, *parent, *sib;
1312 __u64 sibptr; 1358 __u64 sibptr;
1313 int pindex, level, ret; 1359 int pindex, level, ncmin, ncmax, ncblk, ret;
1314 1360
1315 ret = 0; 1361 ret = 0;
1316 stats->bs_nblocks = 0; 1362 stats->bs_nblocks = 0;
1363 ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
1364 ncblk = nilfs_btree_nchildren_per_block(btree);
1365
1317 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1366 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1318 level < nilfs_btree_height(btree) - 1; 1367 level < nilfs_btree_height(btree) - 1;
1319 level++) { 1368 level++) {
1320 node = nilfs_btree_get_nonroot_node(path, level); 1369 node = nilfs_btree_get_nonroot_node(path, level);
1321 path[level].bp_oldreq.bpr_ptr = 1370 path[level].bp_oldreq.bpr_ptr =
1322 nilfs_btree_node_get_ptr(btree, node, 1371 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1323 path[level].bp_index); 1372 ncblk);
1324 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1373 ret = nilfs_bmap_prepare_end_ptr(btree,
1325 &path[level].bp_oldreq, dat); 1374 &path[level].bp_oldreq, dat);
1326 if (ret < 0) 1375 if (ret < 0)
1327 goto err_out_child_node; 1376 goto err_out_child_node;
1328 1377
1329 if (nilfs_btree_node_get_nchildren(node) > 1378 if (nilfs_btree_node_get_nchildren(node) > ncmin) {
1330 nilfs_btree_node_nchildren_min(node, btree)) {
1331 path[level].bp_op = nilfs_btree_do_delete; 1379 path[level].bp_op = nilfs_btree_do_delete;
1332 stats->bs_nblocks++; 1380 stats->bs_nblocks++;
1333 goto out; 1381 goto out;
1334 } 1382 }
1335 1383
1336 parent = nilfs_btree_get_node(btree, path, level + 1); 1384 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1337 pindex = path[level + 1].bp_index; 1385 pindex = path[level + 1].bp_index;
1338 1386
1339 if (pindex > 0) { 1387 if (pindex > 0) {
1340 /* left sibling */ 1388 /* left sibling */
1341 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1389 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
1342 pindex - 1); 1390 ncmax);
1343 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1391 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1344 if (ret < 0) 1392 if (ret < 0)
1345 goto err_out_curr_node; 1393 goto err_out_curr_node;
1346 sib = (struct nilfs_btree_node *)bh->b_data; 1394 sib = (struct nilfs_btree_node *)bh->b_data;
1347 if (nilfs_btree_node_get_nchildren(sib) > 1395 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1348 nilfs_btree_node_nchildren_min(sib, btree)) {
1349 path[level].bp_sib_bh = bh; 1396 path[level].bp_sib_bh = bh;
1350 path[level].bp_op = nilfs_btree_borrow_left; 1397 path[level].bp_op = nilfs_btree_borrow_left;
1351 stats->bs_nblocks++; 1398 stats->bs_nblocks++;
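Deletion prepares a mirrored ladder with a lower bound ncmin, computed above via NILFS_BTREE_NODE_NCHILDREN_MIN() from the node size: delete in place while the node stays above the minimum, otherwise borrow from a sibling that has entries to spare, otherwise concatenate with it. A compact model (the real code, as the hunks show, prefers the left sibling and only considers the right one when no left neighbor exists):

enum delete_op { DO_DELETE, BORROW_LEFT, BORROW_RIGHT,
		 CONCAT_LEFT, CONCAT_RIGHT };

/* ncmin: minimum entries a non-root node must keep, roughly half of
 * its capacity; a sibling count of -1 marks a missing sibling. */
static enum delete_op pick_delete_op(int nchildren, int ncmin,
				     int left_nchildren,
				     int right_nchildren)
{
	if (nchildren > ncmin)
		return DO_DELETE;	/* removal keeps the node legal */
	if (left_nchildren >= 0)
		return left_nchildren > ncmin ? BORROW_LEFT : CONCAT_LEFT;
	if (right_nchildren >= 0)
		return right_nchildren > ncmin ? BORROW_RIGHT : CONCAT_RIGHT;
	return DO_DELETE;	/* no siblings: the root-child case */
}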
@@ -1359,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1359 } else if (pindex < 1406 } else if (pindex <
1360 nilfs_btree_node_get_nchildren(parent) - 1) { 1407 nilfs_btree_node_get_nchildren(parent) - 1) {
1361 /* right sibling */ 1408 /* right sibling */
1362 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1409 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
1363 pindex + 1); 1410 ncmax);
1364 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1411 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1365 if (ret < 0) 1412 if (ret < 0)
1366 goto err_out_curr_node; 1413 goto err_out_curr_node;
1367 sib = (struct nilfs_btree_node *)bh->b_data; 1414 sib = (struct nilfs_btree_node *)bh->b_data;
1368 if (nilfs_btree_node_get_nchildren(sib) > 1415 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1369 nilfs_btree_node_nchildren_min(sib, btree)) {
1370 path[level].bp_sib_bh = bh; 1416 path[level].bp_sib_bh = bh;
1371 path[level].bp_op = nilfs_btree_borrow_right; 1417 path[level].bp_op = nilfs_btree_borrow_right;
1372 stats->bs_nblocks++; 1418 stats->bs_nblocks++;
@@ -1397,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1397 1443
1398 node = nilfs_btree_get_root(btree); 1444 node = nilfs_btree_get_root(btree);
1399 path[level].bp_oldreq.bpr_ptr = 1445 path[level].bp_oldreq.bpr_ptr =
1400 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1446 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1447 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1401 1448
1402 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1449 ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
1403 &path[level].bp_oldreq, dat);
1404 if (ret < 0) 1450 if (ret < 0)
1405 goto err_out_child_node; 1451 goto err_out_child_node;
1406 1452
@@ -1415,99 +1461,87 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1415 1461
1416 /* error */ 1462 /* error */
1417 err_out_curr_node: 1463 err_out_curr_node:
1418 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); 1464 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1419 err_out_child_node: 1465 err_out_child_node:
1420 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1466 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1421 brelse(path[level].bp_sib_bh); 1467 brelse(path[level].bp_sib_bh);
1422 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1468 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1423 &path[level].bp_oldreq, dat);
1424 } 1469 }
1425 *levelp = level; 1470 *levelp = level;
1426 stats->bs_nblocks = 0; 1471 stats->bs_nblocks = 0;
1427 return ret; 1472 return ret;
1428} 1473}
1429 1474
1430static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1475static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
1431 struct nilfs_btree_path *path, 1476 struct nilfs_btree_path *path,
1432 int maxlevel, struct inode *dat) 1477 int maxlevel, struct inode *dat)
1433{ 1478{
1434 int level; 1479 int level;
1435 1480
1436 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1481 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1437 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1482 nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
1438 &path[level].bp_oldreq, dat);
1439 path[level].bp_op(btree, path, level, NULL, NULL); 1483 path[level].bp_op(btree, path, level, NULL, NULL);
1440 } 1484 }
1441 1485
1442 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1486 if (!nilfs_bmap_dirty(btree))
1443 nilfs_bmap_set_dirty(&btree->bt_bmap); 1487 nilfs_bmap_set_dirty(btree);
1444} 1488}
1445 1489
1446static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) 1490static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1447 1491
1448{ 1492{
1449 struct nilfs_btree *btree;
1450 struct nilfs_btree_path *path; 1493 struct nilfs_btree_path *path;
1451 struct nilfs_bmap_stats stats; 1494 struct nilfs_bmap_stats stats;
1452 struct inode *dat; 1495 struct inode *dat;
1453 int level, ret; 1496 int level, ret;
1454 1497
1455 btree = (struct nilfs_btree *)bmap;
1456 path = nilfs_btree_alloc_path(); 1498 path = nilfs_btree_alloc_path();
1457 if (path == NULL) 1499 if (path == NULL)
1458 return -ENOMEM; 1500 return -ENOMEM;
1459 nilfs_btree_init_path(path); 1501
1460 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1502 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1461 NILFS_BTREE_LEVEL_NODE_MIN); 1503 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1462 if (ret < 0) 1504 if (ret < 0)
1463 goto out; 1505 goto out;
1464 1506
1465 1507
1466 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? 1508 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1467 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1468 1509
1469 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); 1510 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1470 if (ret < 0) 1511 if (ret < 0)
1471 goto out; 1512 goto out;
1472 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1473 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
1474 1515
1475out: 1516out:
1476 nilfs_btree_release_path(path);
1477 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
1478 return ret; 1518 return ret;
1479} 1519}
1480 1520
1481static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 1521static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
1482{ 1522{
1483 struct nilfs_btree *btree;
1484 struct nilfs_btree_path *path; 1523 struct nilfs_btree_path *path;
1485 int ret; 1524 int ret;
1486 1525
1487 btree = (struct nilfs_btree *)bmap;
1488 path = nilfs_btree_alloc_path(); 1526 path = nilfs_btree_alloc_path();
1489 if (path == NULL) 1527 if (path == NULL)
1490 return -ENOMEM; 1528 return -ENOMEM;
1491 nilfs_btree_init_path(path);
1492 1529
1493 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1530 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1494 1531
1495 nilfs_btree_release_path(path);
1496 nilfs_btree_free_path(path); 1532 nilfs_btree_free_path(path);
1497 1533
1498 return ret; 1534 return ret;
1499} 1535}
1500 1536
1501static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) 1537static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
1502{ 1538{
1503 struct buffer_head *bh; 1539 struct buffer_head *bh;
1504 struct nilfs_btree *btree;
1505 struct nilfs_btree_node *root, *node; 1540 struct nilfs_btree_node *root, *node;
1506 __u64 maxkey, nextmaxkey; 1541 __u64 maxkey, nextmaxkey;
1507 __u64 ptr; 1542 __u64 ptr;
1508 int nchildren, ret; 1543 int nchildren, ret;
1509 1544
1510 btree = (struct nilfs_btree *)bmap;
1511 root = nilfs_btree_get_root(btree); 1545 root = nilfs_btree_get_root(btree);
1512 switch (nilfs_btree_height(btree)) { 1546 switch (nilfs_btree_height(btree)) {
1513 case 2: 1547 case 2:
@@ -1518,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1518 nchildren = nilfs_btree_node_get_nchildren(root); 1552 nchildren = nilfs_btree_node_get_nchildren(root);
1519 if (nchildren > 1) 1553 if (nchildren > 1)
1520 return 0; 1554 return 0;
1521 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1555 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1556 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1522 ret = nilfs_btree_get_block(btree, ptr, &bh); 1557 ret = nilfs_btree_get_block(btree, ptr, &bh);
1523 if (ret < 0) 1558 if (ret < 0)
1524 return ret; 1559 return ret;
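One pattern change runs through every entry point in this hunk: the nilfs_btree_init_path() and nilfs_btree_release_path() calls disappear from callers, which implies that initialization and teardown were folded into nilfs_btree_alloc_path() and nilfs_btree_free_path(). A sketch of what the consolidated allocator might look like (illustrative; the real one presumably draws from a kmem cache rather than the heap):

#include <stdlib.h>

#define MODEL_LEVEL_MAX 14	/* illustrative level bound */

struct model_path_level {
	void *bh, *sib_bh;	/* buffer references, NULL when unused */
	int index;
};

/* Allocate and initialize in one step so no caller can forget or
 * misplace the init/release bracket. */
static struct model_path_level *model_alloc_path(void)
{
	/* calloc zeroes every level: NULL buffers, index 0 */
	return calloc(MODEL_LEVEL_MAX, sizeof(struct model_path_level));
}

static void model_free_path(struct model_path_level *path)
{
	/* a real implementation must also drop buffer references that
	 * an error path left behind (the old release step) */
	free(path);
}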
@@ -1538,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1538 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); 1573 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
1539} 1574}
1540 1575
1541static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, 1576static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
1542 __u64 *keys, __u64 *ptrs, int nitems) 1577 __u64 *keys, __u64 *ptrs, int nitems)
1543{ 1578{
1544 struct buffer_head *bh; 1579 struct buffer_head *bh;
1545 struct nilfs_btree *btree;
1546 struct nilfs_btree_node *node, *root; 1580 struct nilfs_btree_node *node, *root;
1547 __le64 *dkeys; 1581 __le64 *dkeys;
1548 __le64 *dptrs; 1582 __le64 *dptrs;
1549 __u64 ptr; 1583 __u64 ptr;
1550 int nchildren, i, ret; 1584 int nchildren, ncmax, i, ret;
1551 1585
1552 btree = (struct nilfs_btree *)bmap;
1553 root = nilfs_btree_get_root(btree); 1586 root = nilfs_btree_get_root(btree);
1554 switch (nilfs_btree_height(btree)) { 1587 switch (nilfs_btree_height(btree)) {
1555 case 2: 1588 case 2:
1556 bh = NULL; 1589 bh = NULL;
1557 node = root; 1590 node = root;
1591 ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
1558 break; 1592 break;
1559 case 3: 1593 case 3:
1560 nchildren = nilfs_btree_node_get_nchildren(root); 1594 nchildren = nilfs_btree_node_get_nchildren(root);
1561 WARN_ON(nchildren > 1); 1595 WARN_ON(nchildren > 1);
1562 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1596 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1597 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1563 ret = nilfs_btree_get_block(btree, ptr, &bh); 1598 ret = nilfs_btree_get_block(btree, ptr, &bh);
1564 if (ret < 0) 1599 if (ret < 0)
1565 return ret; 1600 return ret;
1566 node = (struct nilfs_btree_node *)bh->b_data; 1601 node = (struct nilfs_btree_node *)bh->b_data;
1602 ncmax = nilfs_btree_nchildren_per_block(btree);
1567 break; 1603 break;
1568 default: 1604 default:
1569 node = NULL; 1605 node = NULL;
@@ -1574,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1574 if (nchildren < nitems) 1610 if (nchildren < nitems)
1575 nitems = nchildren; 1611 nitems = nchildren;
1576 dkeys = nilfs_btree_node_dkeys(node); 1612 dkeys = nilfs_btree_node_dkeys(node);
1577 dptrs = nilfs_btree_node_dptrs(node, btree); 1613 dptrs = nilfs_btree_node_dptrs(node, ncmax);
1578 for (i = 0; i < nitems; i++) { 1614 for (i = 0; i < nitems; i++) {
1579 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1615 keys[i] = le64_to_cpu(dkeys[i]);
1580 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1616 ptrs[i] = le64_to_cpu(dptrs[i]);
1581 } 1617 }
1582 1618
1583 if (bh != NULL) 1619 if (bh != NULL)
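In nilfs_btree_gather_data() the bmap-private conversion wrappers give way to plain le64_to_cpu(), making the on-disk byte order explicit at the call site. The layout being walked is two parallel arrays behind a small header: keys first, then pointers starting at the capacity offset, which is why nilfs_btree_node_dptrs() now needs ncmax while dkeys does not. A model of the two accessors:

#include <stdint.h>

struct model_node_hdr {
	uint8_t  flags, level;
	uint16_t nchildren;
	uint32_t pad;
};

/* keys[0..ncmax-1] immediately follow the header; on disk both
 * arrays hold little-endian 64-bit values, hence le64_to_cpu(). */
static uint64_t *model_dkeys(struct model_node_hdr *node)
{
	return (uint64_t *)(node + 1);
}

/* ptrs[0..ncmax-1] follow the full key array, so their base depends
 * on the node's capacity, never on its current fill level. */
static uint64_t *model_dptrs(struct model_node_hdr *node, int ncmax)
{
	return model_dkeys(node) + ncmax;
}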
@@ -1587,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1587} 1623}
1588 1624
1589static int 1625static int
1590nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, 1626nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
1591 union nilfs_bmap_ptr_req *dreq, 1627 union nilfs_bmap_ptr_req *dreq,
1592 union nilfs_bmap_ptr_req *nreq, 1628 union nilfs_bmap_ptr_req *nreq,
1593 struct buffer_head **bhp, 1629 struct buffer_head **bhp,
1594 struct nilfs_bmap_stats *stats) 1630 struct nilfs_bmap_stats *stats)
1595{ 1631{
1596 struct buffer_head *bh; 1632 struct buffer_head *bh;
1597 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1598 struct inode *dat = NULL; 1633 struct inode *dat = NULL;
1599 int ret; 1634 int ret;
1600 1635
@@ -1602,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1602 1637
1603 /* for data */ 1638 /* for data */
1604 /* cannot find near ptr */ 1639 /* cannot find near ptr */
1605 if (NILFS_BMAP_USE_VBN(bmap)) { 1640 if (NILFS_BMAP_USE_VBN(btree)) {
1606 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1641 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1607 dat = nilfs_bmap_get_dat(bmap); 1642 dat = nilfs_bmap_get_dat(btree);
1608 } 1643 }
1609 1644
1610 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); 1645 ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
1611 if (ret < 0) 1646 if (ret < 0)
1612 return ret; 1647 return ret;
1613 1648
@@ -1615,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1615 stats->bs_nblocks++; 1650 stats->bs_nblocks++;
1616 if (nreq != NULL) { 1651 if (nreq != NULL) {
1617 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1652 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1618 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); 1653 ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
1619 if (ret < 0) 1654 if (ret < 0)
1620 goto err_out_dreq; 1655 goto err_out_dreq;
1621 1656
@@ -1632,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1632 1667
1633 /* error */ 1668 /* error */
1634 err_out_nreq: 1669 err_out_nreq:
1635 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); 1670 nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
1636 err_out_dreq: 1671 err_out_dreq:
1637 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); 1672 nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
1638 stats->bs_nblocks = 0; 1673 stats->bs_nblocks = 0;
1639 return ret; 1674 return ret;
1640 1675
1641} 1676}
1642 1677
1643static void 1678static void
1644nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, 1679nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
1645 __u64 key, __u64 ptr, 1680 __u64 key, __u64 ptr,
1646 const __u64 *keys, const __u64 *ptrs, 1681 const __u64 *keys, const __u64 *ptrs,
1647 int n, 1682 int n,
@@ -1649,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1649 union nilfs_bmap_ptr_req *nreq, 1684 union nilfs_bmap_ptr_req *nreq,
1650 struct buffer_head *bh) 1685 struct buffer_head *bh)
1651{ 1686{
1652 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1653 struct nilfs_btree_node *node; 1687 struct nilfs_btree_node *node;
1654 struct inode *dat; 1688 struct inode *dat;
1655 __u64 tmpptr; 1689 __u64 tmpptr;
1690 int ncblk;
1656 1691
1657 /* free resources */ 1692 /* free resources */
1658 if (bmap->b_ops->bop_clear != NULL) 1693 if (btree->b_ops->bop_clear != NULL)
1659 bmap->b_ops->bop_clear(bmap); 1694 btree->b_ops->bop_clear(btree);
1660 1695
1661 /* ptr must be a pointer to a buffer head. */ 1696 /* ptr must be a pointer to a buffer head. */
1662 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1697 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1663 1698
1664 /* convert and insert */ 1699 /* convert and insert */
1665 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; 1700 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1666 nilfs_btree_init(bmap); 1701 nilfs_btree_init(btree);
1667 if (nreq != NULL) { 1702 if (nreq != NULL) {
1668 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1703 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1669 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1704 nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
1670 1705
1671 /* create child node at level 1 */ 1706 /* create child node at level 1 */
1672 node = (struct nilfs_btree_node *)bh->b_data; 1707 node = (struct nilfs_btree_node *)bh->b_data;
1673 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1708 ncblk = nilfs_btree_nchildren_per_block(btree);
1674 nilfs_btree_node_insert(btree, node, 1709 nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
1675 key, dreq->bpr_ptr, n); 1710 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
1676 if (!buffer_dirty(bh)) 1711 if (!buffer_dirty(bh))
1677 nilfs_btnode_mark_dirty(bh); 1712 nilfs_btnode_mark_dirty(bh);
1678 if (!nilfs_bmap_dirty(bmap)) 1713 if (!nilfs_bmap_dirty(btree))
1679 nilfs_bmap_set_dirty(bmap); 1714 nilfs_bmap_set_dirty(btree);
1680 1715
1681 brelse(bh); 1716 brelse(bh);
1682 1717
1683 /* create root node at level 2 */ 1718 /* create root node at level 2 */
1684 node = nilfs_btree_get_root(btree); 1719 node = nilfs_btree_get_root(btree);
1685 tmpptr = nreq->bpr_ptr; 1720 tmpptr = nreq->bpr_ptr;
1686 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1721 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
1687 2, 1, &keys[0], &tmpptr); 1722 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1723 &keys[0], &tmpptr);
1688 } else { 1724 } else {
1689 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1725 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1690 1726
1691 /* create root node at level 1 */ 1727 /* create root node at level 1 */
1692 node = nilfs_btree_get_root(btree); 1728 node = nilfs_btree_get_root(btree);
1693 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1729 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
1694 1, n, keys, ptrs); 1730 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1695 nilfs_btree_node_insert(btree, node, 1731 keys, ptrs);
1696 key, dreq->bpr_ptr, n); 1732 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
1697 if (!nilfs_bmap_dirty(bmap)) 1733 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1698 nilfs_bmap_set_dirty(bmap); 1734 if (!nilfs_bmap_dirty(btree))
1735 nilfs_bmap_set_dirty(btree);
1699 } 1736 }
1700 1737
1701 if (NILFS_BMAP_USE_VBN(bmap)) 1738 if (NILFS_BMAP_USE_VBN(btree))
1702 nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); 1739 nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
1703} 1740}
1704 1741
1705/** 1742/**
@@ -1711,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1711 * @ptrs: 1748 * @ptrs:
1712 * @n: 1749 * @n:
1713 */ 1750 */
1714int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, 1751int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1715 __u64 key, __u64 ptr, 1752 __u64 key, __u64 ptr,
1716 const __u64 *keys, const __u64 *ptrs, int n) 1753 const __u64 *keys, const __u64 *ptrs, int n)
1717{ 1754{
@@ -1724,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1724 di = &dreq; 1761 di = &dreq;
1725 ni = NULL; 1762 ni = NULL;
1726 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( 1763 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1727 1 << bmap->b_inode->i_blkbits)) { 1764 1 << btree->b_inode->i_blkbits)) {
1728 di = &dreq; 1765 di = &dreq;
1729 ni = &nreq; 1766 ni = &nreq;
1730 } else { 1767 } else {
@@ -1733,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1733 BUG(); 1770 BUG();
1734 } 1771 }
1735 1772
1736 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, 1773 ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
1737 &stats); 1774 &stats);
1738 if (ret < 0) 1775 if (ret < 0)
1739 return ret; 1776 return ret;
1740 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1741 di, ni, bh); 1778 di, ni, bh);
1742 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1743 return 0; 1780 return 0;
1744} 1781}
1745 1782
1746static int nilfs_btree_propagate_p(struct nilfs_btree *btree, 1783static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
1747 struct nilfs_btree_path *path, 1784 struct nilfs_btree_path *path,
1748 int level, 1785 int level,
1749 struct buffer_head *bh) 1786 struct buffer_head *bh)
@@ -1755,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1755 return 0; 1792 return 0;
1756} 1793}
1757 1794
1758static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1795static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
1759 struct nilfs_btree_path *path, 1796 struct nilfs_btree_path *path,
1760 int level, struct inode *dat) 1797 int level, struct inode *dat)
1761{ 1798{
1762 struct nilfs_btree_node *parent; 1799 struct nilfs_btree_node *parent;
1763 int ret; 1800 int ncmax, ret;
1764 1801
1765 parent = nilfs_btree_get_node(btree, path, level + 1); 1802 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1766 path[level].bp_oldreq.bpr_ptr = 1803 path[level].bp_oldreq.bpr_ptr =
1767 nilfs_btree_node_get_ptr(btree, parent, 1804 nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1768 path[level + 1].bp_index); 1805 ncmax);
1769 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1806 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1770 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, 1807 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1771 &path[level].bp_newreq.bpr_req); 1808 &path[level].bp_newreq.bpr_req);
@@ -1777,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1777 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; 1814 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1778 path[level].bp_ctxt.bh = path[level].bp_bh; 1815 path[level].bp_ctxt.bh = path[level].bp_bh;
1779 ret = nilfs_btnode_prepare_change_key( 1816 ret = nilfs_btnode_prepare_change_key(
1780 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1817 &NILFS_BMAP_I(btree)->i_btnode_cache,
1781 &path[level].bp_ctxt); 1818 &path[level].bp_ctxt);
1782 if (ret < 0) { 1819 if (ret < 0) {
1783 nilfs_dat_abort_update(dat, 1820 nilfs_dat_abort_update(dat,
@@ -1790,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1790 return 0; 1827 return 0;
1791} 1828}
1792 1829
1793static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1830static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
1794 struct nilfs_btree_path *path, 1831 struct nilfs_btree_path *path,
1795 int level, struct inode *dat) 1832 int level, struct inode *dat)
1796{ 1833{
1797 struct nilfs_btree_node *parent; 1834 struct nilfs_btree_node *parent;
1835 int ncmax;
1798 1836
1799 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, 1837 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1800 &path[level].bp_newreq.bpr_req, 1838 &path[level].bp_newreq.bpr_req,
1801 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); 1839 btree->b_ptr_type == NILFS_BMAP_PTR_VS);
1802 1840
1803 if (buffer_nilfs_node(path[level].bp_bh)) { 1841 if (buffer_nilfs_node(path[level].bp_bh)) {
1804 nilfs_btnode_commit_change_key( 1842 nilfs_btnode_commit_change_key(
1805 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1843 &NILFS_BMAP_I(btree)->i_btnode_cache,
1806 &path[level].bp_ctxt); 1844 &path[level].bp_ctxt);
1807 path[level].bp_bh = path[level].bp_ctxt.bh; 1845 path[level].bp_bh = path[level].bp_ctxt.bh;
1808 } 1846 }
1809 set_buffer_nilfs_volatile(path[level].bp_bh); 1847 set_buffer_nilfs_volatile(path[level].bp_bh);
1810 1848
1811 parent = nilfs_btree_get_node(btree, path, level + 1); 1849 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1812 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, 1850 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
1813 path[level].bp_newreq.bpr_ptr); 1851 path[level].bp_newreq.bpr_ptr, ncmax);
1814} 1852}
1815 1853
1816static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1854static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
1817 struct nilfs_btree_path *path, 1855 struct nilfs_btree_path *path,
1818 int level, struct inode *dat) 1856 int level, struct inode *dat)
1819{ 1857{
@@ -1821,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1821 &path[level].bp_newreq.bpr_req); 1859 &path[level].bp_newreq.bpr_req);
1822 if (buffer_nilfs_node(path[level].bp_bh)) 1860 if (buffer_nilfs_node(path[level].bp_bh))
1823 nilfs_btnode_abort_change_key( 1861 nilfs_btnode_abort_change_key(
1824 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1862 &NILFS_BMAP_I(btree)->i_btnode_cache,
1825 &path[level].bp_ctxt); 1863 &path[level].bp_ctxt);
1826} 1864}
1827 1865
1828static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1866static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
1829 struct nilfs_btree_path *path, 1867 struct nilfs_btree_path *path,
1830 int minlevel, int *maxlevelp, 1868 int minlevel, int *maxlevelp,
1831 struct inode *dat) 1869 struct inode *dat)
@@ -1860,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1860 return ret; 1898 return ret;
1861} 1899}
1862 1900
1863static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1901static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
1864 struct nilfs_btree_path *path, 1902 struct nilfs_btree_path *path,
1865 int minlevel, int maxlevel, 1903 int minlevel, int maxlevel,
1866 struct buffer_head *bh, 1904 struct buffer_head *bh,
@@ -1875,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1875 nilfs_btree_commit_update_v(btree, path, level, dat); 1913 nilfs_btree_commit_update_v(btree, path, level, dat);
1876} 1914}
1877 1915
1878static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1916static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
1879 struct nilfs_btree_path *path, 1917 struct nilfs_btree_path *path,
1880 int level, struct buffer_head *bh) 1918 int level, struct buffer_head *bh)
1881{ 1919{
1882 int maxlevel = 0, ret; 1920 int maxlevel = 0, ret;
1883 struct nilfs_btree_node *parent; 1921 struct nilfs_btree_node *parent;
1884 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1922 struct inode *dat = nilfs_bmap_get_dat(btree);
1885 __u64 ptr; 1923 __u64 ptr;
1924 int ncmax;
1886 1925
1887 get_bh(bh); 1926 get_bh(bh);
1888 path[level].bp_bh = bh; 1927 path[level].bp_bh = bh;
@@ -1892,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1892 goto out; 1931 goto out;
1893 1932
1894 if (buffer_nilfs_volatile(path[level].bp_bh)) { 1933 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1895 parent = nilfs_btree_get_node(btree, path, level + 1); 1934 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1896 ptr = nilfs_btree_node_get_ptr(btree, parent, 1935 ptr = nilfs_btree_node_get_ptr(parent,
1897 path[level + 1].bp_index); 1936 path[level + 1].bp_index,
1937 ncmax);
1898 ret = nilfs_dat_mark_dirty(dat, ptr); 1938 ret = nilfs_dat_mark_dirty(dat, ptr);
1899 if (ret < 0) 1939 if (ret < 0)
1900 goto out; 1940 goto out;
@@ -1908,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1908 return ret; 1948 return ret;
1909} 1949}
1910 1950
1911static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, 1951static int nilfs_btree_propagate(struct nilfs_bmap *btree,
1912 struct buffer_head *bh) 1952 struct buffer_head *bh)
1913{ 1953{
1914 struct nilfs_btree *btree;
1915 struct nilfs_btree_path *path; 1954 struct nilfs_btree_path *path;
1916 struct nilfs_btree_node *node; 1955 struct nilfs_btree_node *node;
1917 __u64 key; 1956 __u64 key;
@@ -1919,22 +1958,20 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1919 1958
1920 WARN_ON(!buffer_dirty(bh)); 1959 WARN_ON(!buffer_dirty(bh));
1921 1960
1922 btree = (struct nilfs_btree *)bmap;
1923 path = nilfs_btree_alloc_path(); 1961 path = nilfs_btree_alloc_path();
1924 if (path == NULL) 1962 if (path == NULL)
1925 return -ENOMEM; 1963 return -ENOMEM;
1926 nilfs_btree_init_path(path);
1927 1964
1928 if (buffer_nilfs_node(bh)) { 1965 if (buffer_nilfs_node(bh)) {
1929 node = (struct nilfs_btree_node *)bh->b_data; 1966 node = (struct nilfs_btree_node *)bh->b_data;
1930 key = nilfs_btree_node_get_key(node, 0); 1967 key = nilfs_btree_node_get_key(node, 0);
1931 level = nilfs_btree_node_get_level(node); 1968 level = nilfs_btree_node_get_level(node);
1932 } else { 1969 } else {
1933 key = nilfs_bmap_data_get_key(bmap, bh); 1970 key = nilfs_bmap_data_get_key(btree, bh);
1934 level = NILFS_BTREE_LEVEL_DATA; 1971 level = NILFS_BTREE_LEVEL_DATA;
1935 } 1972 }
1936 1973
1937 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 1974 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
1938 if (ret < 0) { 1975 if (ret < 0) {
1939 if (unlikely(ret == -ENOENT)) 1976 if (unlikely(ret == -ENOENT))
1940 printk(KERN_CRIT "%s: key = %llu, level == %d\n", 1977 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1942,24 +1979,23 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1942 goto out; 1979 goto out;
1943 } 1980 }
1944 1981
1945 ret = NILFS_BMAP_USE_VBN(bmap) ? 1982 ret = NILFS_BMAP_USE_VBN(btree) ?
1946 nilfs_btree_propagate_v(btree, path, level, bh) : 1983 nilfs_btree_propagate_v(btree, path, level, bh) :
1947 nilfs_btree_propagate_p(btree, path, level, bh); 1984 nilfs_btree_propagate_p(btree, path, level, bh);
1948 1985
1949 out: 1986 out:
1950 nilfs_btree_release_path(path);
1951 nilfs_btree_free_path(path); 1987 nilfs_btree_free_path(path);
1952 1988
1953 return ret; 1989 return ret;
1954} 1990}
1955 1991
1956static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1992static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
1957 struct buffer_head *bh) 1993 struct buffer_head *bh)
1958{ 1994{
1959 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); 1995 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
1960} 1996}
1961 1997
1962static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1998static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
1963 struct list_head *lists, 1999 struct list_head *lists,
1964 struct buffer_head *bh) 2000 struct buffer_head *bh)
1965{ 2001{
@@ -1973,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1973 node = (struct nilfs_btree_node *)bh->b_data; 2009 node = (struct nilfs_btree_node *)bh->b_data;
1974 key = nilfs_btree_node_get_key(node, 0); 2010 key = nilfs_btree_node_get_key(node, 0);
1975 level = nilfs_btree_node_get_level(node); 2011 level = nilfs_btree_node_get_level(node);
2012 if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
2013 level >= NILFS_BTREE_LEVEL_MAX) {
2014 dump_stack();
2015 printk(KERN_WARNING
2016 "%s: invalid btree level: %d (key=%llu, ino=%lu, "
2017 "blocknr=%llu)\n",
2018 __func__, level, (unsigned long long)key,
2019 NILFS_BMAP_I(btree)->vfs_inode.i_ino,
2020 (unsigned long long)bh->b_blocknr);
2021 return;
2022 }
2023
1976 list_for_each(head, &lists[level]) { 2024 list_for_each(head, &lists[level]) {
1977 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2025 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1978 cnode = (struct nilfs_btree_node *)cbh->b_data; 2026 cnode = (struct nilfs_btree_node *)cbh->b_data;
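The block added at the top of this hunk is new behavior, not just refactoring: nilfs_btree_add_dirty_buffer() now validates the level field read from the on-disk node before using it, so a corrupted node produces a stack dump and a warning instead of an out-of-bounds access of the on-stack lists[level] array in the caller below. The guard in isolation, with illustrative bounds:

#include <stdio.h>

#define MODEL_LEVEL_NODE_MIN 1	/* illustrative; real bounds come from */
#define MODEL_LEVEL_MAX      14	/* the nilfs2 on-disk format headers   */

/* Returns nonzero when a node's self-reported level is unsafe to use
 * as an index into a lists[MODEL_LEVEL_MAX] array. */
static int model_check_level(int level, unsigned long long blocknr)
{
	if (level < MODEL_LEVEL_NODE_MIN || level >= MODEL_LEVEL_MAX) {
		fprintf(stderr, "invalid btree level: %d (blocknr=%llu)\n",
			level, blocknr);
		return -1;
	}
	return 0;
}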
@@ -1983,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1983 list_add_tail(&bh->b_assoc_buffers, head); 2031 list_add_tail(&bh->b_assoc_buffers, head);
1984} 2032}
1985 2033
1986static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, 2034static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
1987 struct list_head *listp) 2035 struct list_head *listp)
1988{ 2036{
1989 struct nilfs_btree *btree = (struct nilfs_btree *)bmap; 2037 struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
1990 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1991 struct list_head lists[NILFS_BTREE_LEVEL_MAX]; 2038 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1992 struct pagevec pvec; 2039 struct pagevec pvec;
1993 struct buffer_head *bh, *head; 2040 struct buffer_head *bh, *head;
@@ -2021,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
2021 list_splice_tail(&lists[level], listp); 2068 list_splice_tail(&lists[level], listp);
2022} 2069}
2023 2070
2024static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2071static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
2025 struct nilfs_btree_path *path, 2072 struct nilfs_btree_path *path,
2026 int level, 2073 int level,
2027 struct buffer_head **bh, 2074 struct buffer_head **bh,
@@ -2031,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2031 struct nilfs_btree_node *parent; 2078 struct nilfs_btree_node *parent;
2032 __u64 key; 2079 __u64 key;
2033 __u64 ptr; 2080 __u64 ptr;
2034 int ret; 2081 int ncmax, ret;
2035 2082
2036 parent = nilfs_btree_get_node(btree, path, level + 1); 2083 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2037 ptr = nilfs_btree_node_get_ptr(btree, parent, 2084 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2038 path[level + 1].bp_index); 2085 ncmax);
2039 if (buffer_nilfs_node(*bh)) { 2086 if (buffer_nilfs_node(*bh)) {
2040 path[level].bp_ctxt.oldkey = ptr; 2087 path[level].bp_ctxt.oldkey = ptr;
2041 path[level].bp_ctxt.newkey = blocknr; 2088 path[level].bp_ctxt.newkey = blocknr;
2042 path[level].bp_ctxt.bh = *bh; 2089 path[level].bp_ctxt.bh = *bh;
2043 ret = nilfs_btnode_prepare_change_key( 2090 ret = nilfs_btnode_prepare_change_key(
2044 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2091 &NILFS_BMAP_I(btree)->i_btnode_cache,
2045 &path[level].bp_ctxt); 2092 &path[level].bp_ctxt);
2046 if (ret < 0) 2093 if (ret < 0)
2047 return ret; 2094 return ret;
2048 nilfs_btnode_commit_change_key( 2095 nilfs_btnode_commit_change_key(
2049 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2096 &NILFS_BMAP_I(btree)->i_btnode_cache,
2050 &path[level].bp_ctxt); 2097 &path[level].bp_ctxt);
2051 *bh = path[level].bp_ctxt.bh; 2098 *bh = path[level].bp_ctxt.bh;
2052 } 2099 }
2053 2100
2054 nilfs_btree_node_set_ptr(btree, parent, 2101 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
2055 path[level + 1].bp_index, blocknr); 2102 ncmax);
2056 2103
2057 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2104 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2058 /* on-disk format */ 2105 /* on-disk format */
2059 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2106 binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
2060 binfo->bi_dat.bi_level = level; 2107 binfo->bi_dat.bi_level = level;
2061 2108
2062 return 0; 2109 return 0;
2063} 2110}
2064 2111
2065static int nilfs_btree_assign_v(struct nilfs_btree *btree, 2112static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
2066 struct nilfs_btree_path *path, 2113 struct nilfs_btree_path *path,
2067 int level, 2114 int level,
2068 struct buffer_head **bh, 2115 struct buffer_head **bh,
@@ -2070,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2070 union nilfs_binfo *binfo) 2117 union nilfs_binfo *binfo)
2071{ 2118{
2072 struct nilfs_btree_node *parent; 2119 struct nilfs_btree_node *parent;
2073 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 2120 struct inode *dat = nilfs_bmap_get_dat(btree);
2074 __u64 key; 2121 __u64 key;
2075 __u64 ptr; 2122 __u64 ptr;
2076 union nilfs_bmap_ptr_req req; 2123 union nilfs_bmap_ptr_req req;
2077 int ret; 2124 int ncmax, ret;
2078 2125
2079 parent = nilfs_btree_get_node(btree, path, level + 1); 2126 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2080 ptr = nilfs_btree_node_get_ptr(btree, parent, 2127 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2081 path[level + 1].bp_index); 2128 ncmax);
2082 req.bpr_ptr = ptr; 2129 req.bpr_ptr = ptr;
2083 ret = nilfs_dat_prepare_start(dat, &req.bpr_req); 2130 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2084 if (ret < 0) 2131 if (ret < 0)
@@ -2087,56 +2134,52 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2087 2134
2088 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2135 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2089 /* on-disk format */ 2136 /* on-disk format */
2090 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2137 binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
2091 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2138 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2092 2139
2093 return 0; 2140 return 0;
2094} 2141}
2095 2142
2096static int nilfs_btree_assign(struct nilfs_bmap *bmap, 2143static int nilfs_btree_assign(struct nilfs_bmap *btree,
2097 struct buffer_head **bh, 2144 struct buffer_head **bh,
2098 sector_t blocknr, 2145 sector_t blocknr,
2099 union nilfs_binfo *binfo) 2146 union nilfs_binfo *binfo)
2100{ 2147{
2101 struct nilfs_btree *btree;
2102 struct nilfs_btree_path *path; 2148 struct nilfs_btree_path *path;
2103 struct nilfs_btree_node *node; 2149 struct nilfs_btree_node *node;
2104 __u64 key; 2150 __u64 key;
2105 int level, ret; 2151 int level, ret;
2106 2152
2107 btree = (struct nilfs_btree *)bmap;
2108 path = nilfs_btree_alloc_path(); 2153 path = nilfs_btree_alloc_path();
2109 if (path == NULL) 2154 if (path == NULL)
2110 return -ENOMEM; 2155 return -ENOMEM;
2111 nilfs_btree_init_path(path);
2112 2156
2113 if (buffer_nilfs_node(*bh)) { 2157 if (buffer_nilfs_node(*bh)) {
2114 node = (struct nilfs_btree_node *)(*bh)->b_data; 2158 node = (struct nilfs_btree_node *)(*bh)->b_data;
2115 key = nilfs_btree_node_get_key(node, 0); 2159 key = nilfs_btree_node_get_key(node, 0);
2116 level = nilfs_btree_node_get_level(node); 2160 level = nilfs_btree_node_get_level(node);
2117 } else { 2161 } else {
2118 key = nilfs_bmap_data_get_key(bmap, *bh); 2162 key = nilfs_bmap_data_get_key(btree, *bh);
2119 level = NILFS_BTREE_LEVEL_DATA; 2163 level = NILFS_BTREE_LEVEL_DATA;
2120 } 2164 }
2121 2165
2122 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 2166 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2123 if (ret < 0) { 2167 if (ret < 0) {
2124 WARN_ON(ret == -ENOENT); 2168 WARN_ON(ret == -ENOENT);
2125 goto out; 2169 goto out;
2126 } 2170 }
2127 2171
2128 ret = NILFS_BMAP_USE_VBN(bmap) ? 2172 ret = NILFS_BMAP_USE_VBN(btree) ?
2129 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : 2173 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
2130 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2174 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2131 2175
2132 out: 2176 out:
2133 nilfs_btree_release_path(path);
2134 nilfs_btree_free_path(path); 2177 nilfs_btree_free_path(path);
2135 2178
2136 return ret; 2179 return ret;
2137} 2180}
2138 2181
2139static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, 2182static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
2140 struct buffer_head **bh, 2183 struct buffer_head **bh,
2141 sector_t blocknr, 2184 sector_t blocknr,
2142 union nilfs_binfo *binfo) 2185 union nilfs_binfo *binfo)
@@ -2145,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2145 __u64 key; 2188 __u64 key;
2146 int ret; 2189 int ret;
2147 2190
2148 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, 2191 ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
2149 blocknr); 2192 blocknr);
2150 if (ret < 0) 2193 if (ret < 0)
2151 return ret; 2194 return ret;
@@ -2154,30 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2154 node = (struct nilfs_btree_node *)(*bh)->b_data; 2197 node = (struct nilfs_btree_node *)(*bh)->b_data;
2155 key = nilfs_btree_node_get_key(node, 0); 2198 key = nilfs_btree_node_get_key(node, 0);
2156 } else 2199 } else
2157 key = nilfs_bmap_data_get_key(bmap, *bh); 2200 key = nilfs_bmap_data_get_key(btree, *bh);
2158 2201
2159 /* on-disk format */ 2202 /* on-disk format */
2160 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); 2203 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2161 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2204 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2162 2205
2163 return 0; 2206 return 0;
2164} 2207}
2165 2208
2166static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) 2209static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
2167{ 2210{
2168 struct buffer_head *bh; 2211 struct buffer_head *bh;
2169 struct nilfs_btree *btree;
2170 struct nilfs_btree_path *path; 2212 struct nilfs_btree_path *path;
2171 __u64 ptr; 2213 __u64 ptr;
2172 int ret; 2214 int ret;
2173 2215
2174 btree = (struct nilfs_btree *)bmap;
2175 path = nilfs_btree_alloc_path(); 2216 path = nilfs_btree_alloc_path();
2176 if (path == NULL) 2217 if (path == NULL)
2177 return -ENOMEM; 2218 return -ENOMEM;
2178 nilfs_btree_init_path(path);
2179 2219
2180 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2220 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
2181 if (ret < 0) { 2221 if (ret < 0) {
2182 WARN_ON(ret == -ENOENT); 2222 WARN_ON(ret == -ENOENT);
2183 goto out; 2223 goto out;
@@ -2191,11 +2231,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2191 if (!buffer_dirty(bh)) 2231 if (!buffer_dirty(bh))
2192 nilfs_btnode_mark_dirty(bh); 2232 nilfs_btnode_mark_dirty(bh);
2193 brelse(bh); 2233 brelse(bh);
2194 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 2234 if (!nilfs_bmap_dirty(btree))
2195 nilfs_bmap_set_dirty(&btree->bt_bmap); 2235 nilfs_bmap_set_dirty(btree);
2196 2236
2197 out: 2237 out:
2198 nilfs_btree_release_path(path);
2199 nilfs_btree_free_path(path); 2238 nilfs_btree_free_path(path);
2200 return ret; 2239 return ret;
2201} 2240}
@@ -2243,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2243int nilfs_btree_init(struct nilfs_bmap *bmap) 2282int nilfs_btree_init(struct nilfs_bmap *bmap)
2244{ 2283{
2245 bmap->b_ops = &nilfs_btree_ops; 2284 bmap->b_ops = &nilfs_btree_ops;
2285 bmap->b_nchildren_per_block =
2286 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2246 return 0; 2287 return 0;
2247} 2288}
2248 2289
2249void nilfs_btree_init_gc(struct nilfs_bmap *bmap) 2290void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2250{ 2291{
2251 bmap->b_ops = &nilfs_btree_ops_gc; 2292 bmap->b_ops = &nilfs_btree_ops_gc;
2293 bmap->b_nchildren_per_block =
2294 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2252} 2295}
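Note on the btree.c hunks above: with struct nilfs_btree gone, the node accessors also grow an explicit ncmax argument (the children-per-block capacity now cached in b_nchildren_per_block). The capacity is needed because pointers in a node are stored after a key array sized by that capacity, so a pointer cannot be located from the node alone. A minimal user-space model of this layout; the node_hdr struct and all names below are simplified stand-ins, not the kernel structures:

#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for struct nilfs_btree_node */
struct node_hdr {
	uint8_t  flags;
	uint8_t  level;
	uint16_t nchildren;
	uint32_t pad;
};

/* keys start right after the header */
static uint64_t *node_dkeys(struct node_hdr *node)
{
	return (uint64_t *)(node + 1);
}

/* pointers start after ncmax keys, so the capacity must be passed in */
static uint64_t *node_dptrs(struct node_hdr *node, int ncmax)
{
	return node_dkeys(node) + ncmax;
}

static uint64_t node_get_ptr(struct node_hdr *node, int index, int ncmax)
{
	return node_dptrs(node, ncmax)[index];
}

int main(void)
{
	unsigned char block[4096] = {0};
	struct node_hdr *node = (struct node_hdr *)block;
	int ncmax = (4096 - sizeof(*node)) / (2 * sizeof(uint64_t)); /* 255 */

	node_dkeys(node)[0] = 10;
	node_dptrs(node, ncmax)[0] = 12345;
	printf("key 10 -> blk %llu (ncmax=%d)\n",
	       (unsigned long long)node_get_ptr(node, 0, ncmax), ncmax);
	return 0;
}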
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..22c02e35b6ef 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,18 +30,26 @@
 #include "btnode.h"
 #include "bmap.h"
 
-struct nilfs_btree;
-struct nilfs_btree_path;
-
 /**
- * struct nilfs_btree - B-tree structure
- * @bt_bmap: bmap base structure
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
  */
-struct nilfs_btree {
-	struct nilfs_bmap bt_bmap;
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
 };
 
-
 #define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
 	((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
@@ -57,12 +65,13 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN	((__u64)0)
 #define NILFS_BTREE_KEY_MAX	(~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
-int nilfs_btree_path_cache_init(void);
-void nilfs_btree_path_cache_destroy(void);
 int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
 				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
+int nilfs_btree_broken_node_block(struct buffer_head *bh);
+
 #endif	/* _NILFS_BTREE_H */
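The header now exports the slab cache for path objects instead of private init/destroy hooks, so allocating a lookup path becomes a plain kmem_cache_alloc() plus per-level initialization. A rough kernel-context sketch of what nilfs_btree_alloc_path() then looks like, reconstructed from the fields above rather than copied from btree.c:

struct kmem_cache *nilfs_btree_path_cache;

static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
	struct nilfs_btree_path *path;
	int level = NILFS_BTREE_LEVEL_DATA;

	/* one array of NILFS_BTREE_LEVEL_MAX entries per lookup */
	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
	if (path == NULL)
		return NULL;

	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
		path[level].bp_bh = NULL;
		path[level].bp_sib_bh = NULL;
		path[level].bp_index = 0;
		path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
		path[level].bp_op = NULL;
	}
	return path;
}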
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f0..cb003c8ee1f6 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -80,23 +80,10 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static int nilfs_prepare_chunk_uninterruptible(struct page *page,
-					       struct address_space *mapping,
-					       unsigned from, unsigned to)
+static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to)
 {
 	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from,
-				 AOP_FLAG_UNINTERRUPTIBLE, &page,
-				 NULL, nilfs_get_block);
-}
-
-static int nilfs_prepare_chunk(struct page *page,
-			       struct address_space *mapping,
-			       unsigned from, unsigned to)
-{
-	loff_t pos = page_offset(page) + from;
-	return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
-				 NULL, nilfs_get_block);
+	return __block_write_begin(page, pos, to - from, nilfs_get_block);
 }
 
 static void nilfs_commit_chunk(struct page *page,
@@ -141,7 +128,7 @@ static void nilfs_check_page(struct page *page)
 	}
 	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
 		p = (struct nilfs_dir_entry *)(kaddr + offs);
-		rec_len = le16_to_cpu(p->rec_len);
+		rec_len = nilfs_rec_len_from_disk(p->rec_len);
 
 		if (rec_len < NILFS_DIR_REC_LEN(1))
 			goto Eshort;
@@ -199,13 +186,10 @@ fail:
 static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t *)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
 	if (!IS_ERR(page)) {
-		wait_on_page_locked(page);
 		kmap(page);
-		if (!PageUptodate(page))
-			goto fail;
 		if (!PageChecked(page))
 			nilfs_check_page(page);
 		if (PageError(page))
@@ -238,7 +222,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
  */
static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
{
-	return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+	return (struct nilfs_dir_entry *)((char *)p +
+					  nilfs_rec_len_from_disk(p->rec_len));
}
 
 static unsigned char
@@ -329,7 +314,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					goto success;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
@@ -444,12 +429,12 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 		    struct page *page, struct inode *inode)
 {
 	unsigned from = (char *) de - (char *) page_address(page);
-	unsigned to = from + le16_to_cpu(de->rec_len);
+	unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
 	struct address_space *mapping = page->mapping;
 	int err;
 
 	lock_page(page);
-	err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
@@ -500,7 +485,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 			/* We hit i_size */
 			name_len = 0;
 			rec_len = chunk_size;
-			de->rec_len = cpu_to_le16(chunk_size);
+			de->rec_len = nilfs_rec_len_to_disk(chunk_size);
 			de->inode = 0;
 			goto got_it;
 		}
@@ -514,7 +499,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 		if (nilfs_match(namelen, name, de))
 			goto out_unlock;
 		name_len = NILFS_DIR_REC_LEN(de->name_len);
-		rec_len = le16_to_cpu(de->rec_len);
+		rec_len = nilfs_rec_len_from_disk(de->rec_len);
 		if (!de->inode && rec_len >= reclen)
 			goto got_it;
 		if (rec_len >= name_len + reclen)
@@ -530,15 +515,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 got_it:
 	from = (char *)de - (char *)page_address(page);
 	to = from + rec_len;
-	err = nilfs_prepare_chunk(page, page->mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	if (err)
 		goto out_unlock;
 	if (de->inode) {
 		struct nilfs_dir_entry *de1;
 
 		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
-		de1->rec_len = cpu_to_le16(rec_len - name_len);
-		de->rec_len = cpu_to_le16(name_len);
+		de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
+		de->rec_len = nilfs_rec_len_to_disk(name_len);
 		de = de1;
 	}
 	de->name_len = namelen;
@@ -569,7 +554,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	struct inode *inode = mapping->host;
 	char *kaddr = page_address(page);
 	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
-	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	unsigned to = ((char *)dir - kaddr) +
+			nilfs_rec_len_from_disk(dir->rec_len);
 	struct nilfs_dir_entry *pde = NULL;
 	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
 	int err;
@@ -587,10 +573,10 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	if (pde)
 		from = (char *)pde - (char *)page_address(page);
 	lock_page(page);
-	err = nilfs_prepare_chunk(page, mapping, from, to);
+	err = nilfs_prepare_chunk(page, from, to);
 	BUG_ON(err);
 	if (pde)
-		pde->rec_len = cpu_to_le16(to - from);
+		pde->rec_len = nilfs_rec_len_to_disk(to - from);
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -615,7 +601,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	if (!page)
 		return -ENOMEM;
 
-	err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
+	err = nilfs_prepare_chunk(page, 0, chunk_size);
 	if (unlikely(err)) {
 		unlock_page(page);
 		goto fail;
@@ -624,14 +610,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	memset(kaddr, 0, chunk_size);
 	de = (struct nilfs_dir_entry *)kaddr;
 	de->name_len = 1;
-	de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
 	memcpy(de->name, ".\0\0", 4);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
 
 	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
 	de->name_len = 2;
-	de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
 	de->inode = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
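The dir.c hunks replace raw le16 conversions of rec_len with nilfs_rec_len_from_disk()/nilfs_rec_len_to_disk(). The point of the indirection is the ext2-style encoding needed once a directory chunk can be a full 64 KiB: a record length of 65536 does not fit in 16 bits and is stored as 0xffff. A plausible user-space sketch of such helpers (the kernel versions additionally do the le16 byte-swapping and guard the special case behind page-size ifdefs):

#include <assert.h>
#include <stdint.h>

#define MAX_REC_LEN ((1 << 16) - 1)	/* 0xffff encodes a 64 KiB record */

static unsigned rec_len_from_disk(uint16_t dlen)
{
	unsigned len = dlen;		/* le16_to_cpu() in the kernel */

	if (len == MAX_REC_LEN)
		return 1 << 16;
	return len;
}

static uint16_t rec_len_to_disk(unsigned len)
{
	if (len == (1 << 16))
		return MAX_REC_LEN;	/* cpu_to_le16() in the kernel */
	assert(len < (1 << 16));
	return (uint16_t)len;
}

int main(void)
{
	assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == 1 << 16);
	assert(rec_len_from_disk(rec_len_to_disk(12)) == 12);
	return 0;
}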
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cdf..324d80c57518 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
 #include "alloc.h"
 #include "dat.h"
 
-static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
 {
 	return (__le64 *)
-		((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
 }
 
 static inline __u64
-nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
+	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
 }
 
-static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
 					__u64 key, __u64 ptr)
 {
-	*(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
+	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
 }
 
-static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
 			       __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_direct *direct;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;	/* XXX: use macro for level 1 */
 	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
 		return -ENOENT;
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	*ptrp = ptr;
 	return 0;
 }
 
-static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 				      __u64 key, __u64 *ptrp,
 				      unsigned maxblocks)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(direct)) {
+		dat = nilfs_bmap_get_dat(direct);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 }
 
 static __u64
-nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(direct, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
 	else
 		/* block group */
-		return nilfs_bmap_find_target_in_group(&direct->d_bmap);
-}
-
-static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
-				      __u64 key, __u64 ptr)
-{
-	direct->d_bmap.b_last_allocated_key = key;
-	direct->d_bmap.b_last_allocated_ptr = ptr;
+		return nilfs_bmap_find_target_in_group(direct);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat = NULL;
 	struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
-	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+	if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
 	if (NILFS_BMAP_USE_VBN(bmap)) {
-		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
 		dat = nilfs_bmap_get_dat(bmap);
 	}
 	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 		set_buffer_nilfs_volatile(bh);
 
 		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
+		nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
 
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
 		if (NILFS_BMAP_USE_VBN(bmap))
-			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
+			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
 
 		nilfs_bmap_add_blocks(bmap, 1);
 	}
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat;
 	int ret;
 
 	if (key > NILFS_DIRECT_KEY_MAX ||
-	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+	    nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
 	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
 
 	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
 	if (!ret) {
 		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
 		nilfs_bmap_sub_blocks(bmap, 1);
 	}
 	return ret;
 }
 
-static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
 {
-	struct nilfs_direct *direct;
 	__u64 key, lastkey;
 
-	direct = (struct nilfs_direct *)bmap;
 	lastkey = NILFS_DIRECT_KEY_MAX + 1;
 	for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
 		if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
 	return key > NILFS_DIRECT_KEY_MAX;
 }
 
-static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
 				    __u64 *keys, __u64 *ptrs, int nitems)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 	int n;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (nitems > NILFS_DIRECT_NBLOCKS)
 		nitems = NILFS_DIRECT_NBLOCKS;
 	n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
-	struct nilfs_direct *direct;
 	__le64 *dptrs;
 	int ret, i, j;
 
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	bmap->b_ops->bop_clear(bmap);
 
 	/* convert */
-	direct = (struct nilfs_direct *)bmap;
-	dptrs = nilfs_direct_dptrs(direct);
+	dptrs = nilfs_direct_dptrs(bmap);
 	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
 		if ((j < n) && (i == keys[j])) {
 			dptrs[i] = (i != key) ?
-				nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+				cpu_to_le64(ptrs[j]) :
 				NILFS_BMAP_INVALID_PTR;
 			j++;
 		} else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct nilfs_palloc_req oldreq, newreq;
 	struct inode *dat;
 	__u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 
 	dat = nilfs_bmap_get_dat(bmap);
 	key = nilfs_bmap_data_get_key(bmap, bh);
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.pr_entry_nr = ptr;
 		newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 		nilfs_dat_commit_update(dat, &oldreq, &newreq,
 					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
+		nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
 	} else
 		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(direct);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
 	if (!ret) {
 		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
-		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+		binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+		binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 	}
 	return ret;
 }
 
-static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
 {
 	nilfs_direct_set_ptr(direct, key, blocknr);
 
-	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
 	binfo->bi_dat.bi_level = 0;
 
 	return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 			       sector_t blocknr,
 			       union nilfs_binfo *binfo)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;
 	key = nilfs_bmap_data_get_key(bmap, *bh);
 	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
 		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
 		       (unsigned long long)key);
 		return -EINVAL;
 	}
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
 		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
 		       (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 	}
 
 	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
-		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
+		nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
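With struct nilfs_direct gone, the direct mapping is just an interpretation of the bmap's inline data area: a small node header followed by one little-endian pointer per file block key. A rough user-space model of that layout (the array size and invalid-pointer value below are placeholders; the real NILFS_BMAP_SIZE and NILFS_BMAP_INVALID_PTR are defined elsewhere in the tree):

#include <stdint.h>
#include <stdio.h>

#define BMAP_SIZE	56			/* placeholder for NILFS_BMAP_SIZE */
#define NBLOCKS		(BMAP_SIZE / 8 - 1)	/* one slot taken by the header */
#define INVALID_PTR	(~(uint64_t)0)		/* placeholder invalid pointer */

struct direct_node { uint8_t dn_flags; uint8_t pad[7]; };

struct bmap { uint64_t u_data[BMAP_SIZE / 8]; };

static uint64_t *direct_dptrs(struct bmap *bmap)
{
	/* pointer array starts right after the one-word node header */
	return (uint64_t *)((struct direct_node *)bmap->u_data + 1);
}

int main(void)
{
	struct bmap b;
	int i;

	for (i = 0; i < NBLOCKS; i++)
		direct_dptrs(&b)[i] = INVALID_PTR;
	direct_dptrs(&b)[2] = 4711;	/* file block 2 -> disk block 4711 */
	printf("lookup(2) = %llu\n",
	       (unsigned long long)direct_dptrs(&b)[2]);
	return 0;
}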
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d0..dc643de20a25 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
 #include "bmap.h"
 
 
-struct nilfs_direct;
-
 /**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
 	__u8 pad[7];
 };
 
-/**
- * struct nilfs_direct - direct mapping
- * @d_bmap: bmap structure
- */
-struct nilfs_direct {
-	struct nilfs_bmap d_bmap;
-};
-
-
 #define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
 #define NILFS_DIRECT_KEY_MIN	0
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
 #include "nilfs.h"
 #include "segment.h"
 
-int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int nilfs_sync_file(struct file *file, int datasync)
 {
 	/*
 	 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * This function should be implemented when the writeback function
 	 * will be implemented.
 	 */
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	int err;
 
 	if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index dd5f7e0a95f6..84a45d1d5464 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -78,7 +78,7 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
 	struct inode *gcdat = nilfs->ns_gc_dat;
 	struct nilfs_inode_info *gii = NILFS_I(gcdat);
 
-	gcdat->i_state = I_CLEAR;
+	gcdat->i_state = I_FREEING | I_CLEAR;
 	gii->i_flags = 0;
 
 	nilfs_palloc_clear_cache(gcdat);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3e..bed3a783129b 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,6 +48,8 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "mdt.h"
 #include "dat.h"
@@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 				   __u64 vbn, struct buffer_head **out_bh)
 {
-	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh);
+	int ret;
+
+	ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+					vbn ? : pbn, pbn, READ, out_bh, &pbn);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	if (buffer_dirty(bh))
 		return -EEXIST;
 
-	if (buffer_nilfs_node(bh))
+	if (buffer_nilfs_node(bh)) {
+		if (nilfs_btree_broken_node_block(bh)) {
+			clear_buffer_uptodate(bh);
+			return -EIO;
+		}
 		nilfs_btnode_mark_dirty(bh);
-	else
+	} else {
 		nilfs_mdt_mark_buffer_dirty(bh);
+	}
 	return 0;
 }
 
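Two details in the gcinode.c hunks are worth spelling out. First, a GC-read btree node whose header fails nilfs_btree_broken_node_block() is now rejected (and its uptodate flag dropped) before it can ever be marked dirty. Second, "vbn ? : pbn" is GNU C's two-operand conditional: it yields vbn when nonzero, otherwise pbn, evaluating vbn only once. Its portable equivalent, as a tiny self-contained check:

#include <assert.h>
#include <stdint.h>

static uint64_t pick_blocknr(uint64_t vbn, uint64_t pbn)
{
	return vbn ? vbn : pbn;	/* what "vbn ? : pbn" expands to */
}

int main(void)
{
	assert(pick_blocknr(0, 42) == 42);	/* no virtual block number */
	assert(pick_blocknr(7, 42) == 7);	/* virtual number wins */
	return 0;
}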
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0957b58f909d..eccb2f2e2315 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/uio.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
@@ -197,11 +198,15 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	*pagep = NULL;
-	err = block_write_begin(file, mapping, pos, len, flags, pagep,
-				fsdata, nilfs_get_block);
-	if (unlikely(err))
+	err = block_write_begin(mapping, pos, len, flags, pagep,
+				nilfs_get_block);
+	if (unlikely(err)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+
 		nilfs_transaction_abort(inode->i_sb);
+	}
 	return err;
 }
 
@@ -237,6 +242,19 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	/* Needs synchronization with the cleaner */
 	size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, nilfs_get_block, NULL);
+
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again.
+	 */
+	if (unlikely((rw & WRITE) && size < 0)) {
+		loff_t isize = i_size_read(inode);
+		loff_t end = offset + iov_length(iov, nr_segs);
+
+		if (end > isize)
+			vmtruncate(inode, isize);
+	}
+
 	return size;
 }
 
@@ -280,16 +298,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic_inc(&sbi->s_inodes_count);
-
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
@@ -346,7 +355,6 @@ void nilfs_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
-	clear_inode(inode);
 	/* XXX: check error code? Is there any thing I can do? */
 	(void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
 	atomic_dec(&sbi->s_inodes_count);
@@ -451,7 +459,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -511,7 +519,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
@@ -606,16 +614,34 @@ void nilfs_truncate(struct inode *inode)
 	   But truncate has no return value. */
 }
 
-void nilfs_delete_inode(struct inode *inode)
+static void nilfs_clear_inode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	/*
+	 * Free resources allocated in nilfs_read_inode(), here.
+	 */
+	BUG_ON(!list_empty(&ii->i_dirty));
+	brelse(ii->i_bh);
+	ii->i_bh = NULL;
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
+
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+}
+
+void nilfs_evict_inode(struct inode *inode)
 {
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
-	if (unlikely(is_bad_inode(inode))) {
+	if (inode->i_nlink || unlikely(is_bad_inode(inode))) {
 		if (inode->i_data.nrpages)
 			truncate_inode_pages(&inode->i_data, 0);
-		clear_inode(inode);
+		end_writeback(inode);
+		nilfs_clear_inode(inode);
 		return;
 	}
 	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
@@ -625,6 +651,8 @@ void nilfs_delete_inode(struct inode *inode)
 
 	nilfs_truncate_bmap(ii, 0);
 	nilfs_mark_inode_dirty(inode);
+	end_writeback(inode);
+	nilfs_clear_inode(inode);
 	nilfs_free_inode(inode);
 	/* nilfs_free_inode() marks inode buffer dirty */
 	if (IS_SYNC(inode))
@@ -648,14 +676,27 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	err = nilfs_transaction_begin(sb, &ti, 0);
 	if (unlikely(err))
 		return err;
-	err = inode_setattr(inode, iattr);
-	if (!err && (iattr->ia_valid & ATTR_MODE))
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		err = vmtruncate(inode, iattr->ia_size);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
+
+	if (iattr->ia_valid & ATTR_MODE) {
 		err = nilfs_acl_chmod(inode);
-	if (likely(!err))
-		err = nilfs_transaction_commit(sb);
-	else
-		nilfs_transaction_abort(sb);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	return nilfs_transaction_commit(sb);
 
+out_err:
+	nilfs_transaction_abort(sb);
 	return err;
 }
 
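The inode.c hunks track the 2.6.36 VFS change that folded ->delete_inode and ->clear_inode into a single ->evict_inode, with end_writeback() marking the point after which the VFS considers writeback on the inode finished. A stubbed user-space model of the ordering the new nilfs_evict_inode preserves; every function here is a stand-in, not a kernel call:

#include <stdio.h>

struct inode { int i_nlink; int is_bad; };

static void truncate_pages(struct inode *i)	{ puts("truncate page cache"); }
static void end_writeback_(struct inode *i)	{ puts("end writeback (was clear_inode)"); }
static void clear_fs_state(struct inode *i)	{ puts("free bmap + btnode cache"); }
static void free_disk_inode(struct inode *i)	{ puts("release ifile entry"); }

static void evict_inode_model(struct inode *inode)
{
	if (inode->i_nlink || inode->is_bad) {
		/* still-linked or bad inodes: tear down in-memory state only */
		truncate_pages(inode);
		end_writeback_(inode);
		clear_fs_state(inode);
		return;
	}
	/* unlinked inodes additionally give back their on-disk resources */
	truncate_pages(inode);
	end_writeback_(inode);
	clear_fs_state(inode);
	free_disk_inode(inode);
}

int main(void)
{
	struct inode unlinked = { 0, 0 };
	evict_inode_model(&unlinked);
	return 0;
}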
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb6..d01aff4957d9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..d3d54046e5f8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 
 /*
  * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
 	__u32 i_flags;
 	unsigned long i_state;		/* Dynamic state flags */
 	struct nilfs_bmap *i_bmap;
-	union nilfs_bmap_union i_bmap_union;
+	struct nilfs_bmap i_bmap_data;
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;		/* check point number for GC inode */
@@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-	return container_of((union nilfs_bmap_union *)bmap,
-			    struct nilfs_inode_info,
-			    i_bmap_union);
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -107,6 +104,14 @@ enum {
 };
 
 /*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
+};
+
+/*
  * Macros to check inode numbers
  */
 #define NILFS_MDT_INO_BITS \
@@ -228,7 +233,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
 			   struct page *, struct inode *);
 
 /* file.c */
-extern int nilfs_sync_file(struct file *, struct dentry *, int);
+extern int nilfs_sync_file(struct file *, int);
 
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
@@ -245,7 +250,7 @@ extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
 extern struct inode *nilfs_iget(struct super_block *, unsigned long);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *);
 extern void nilfs_truncate(struct inode *);
-extern void nilfs_delete_inode(struct inode *);
+extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
 extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
 				  struct buffer_head **);
@@ -270,7 +275,14 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
 					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
 extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
 
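The simplified NILFS_BMAP_I() above is the piece that makes the whole nilfs_btree/nilfs_direct removal work: because the bmap is now embedded directly in nilfs_inode_info, container_of() recovers the owning inode info from any bmap pointer without casts through a union. A self-contained model of the idiom, with the struct fields trimmed to the essentials:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct nilfs_bmap { int b_state; };

struct nilfs_inode_info {
	unsigned long i_flags;
	struct nilfs_bmap i_bmap_data;	/* embedded, as in the hunk above */
};

static struct nilfs_inode_info *BMAP_I(struct nilfs_bmap *bmap)
{
	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}

int main(void)
{
	struct nilfs_inode_info ii = { .i_flags = 0xabc };
	printf("flags via bmap: %#lx\n", BMAP_I(&ii.i_bmap_data)->i_flags);
	return 0;
}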
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e48130..aab11db2cb08 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 
 #define NILFS_BUFFER_INHERENT_BITS \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+	 (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_nilfs_checked(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 			lock_buffer(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_checked(bh);
 			clear_buffer_uptodate(bh);
 			clear_buffer_mapped(bh);
 			unlock_buffer(bh);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f8..f53d8da41ed7 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,13 @@ enum {
 	BH_NILFS_Allocated = BH_PrivateStart,
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
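For reference, BUFFER_FNS(NILFS_Checked, nilfs_checked) expands to the usual buffer-state accessor trio, roughly as sketched below (simplified from include/linux/buffer_head.h; the real set/clear helpers use atomic bitops on bh->b_state):

static inline void set_buffer_nilfs_checked(struct buffer_head *bh)
{
	set_bit(BH_NILFS_Checked, &bh->b_state);
}

static inline void clear_buffer_nilfs_checked(struct buffer_head *bh)
{
	clear_bit(BH_NILFS_Checked, &bh->b_state);
}

static inline int buffer_nilfs_checked(const struct buffer_head *bh)
{
	return test_bit(BH_NILFS_Checked, &bh->b_state);
}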
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba43146f3c30..d0c35ef39f6a 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,25 +91,9 @@ static int nilfs_warn_segment_error(int err)
 	return -EINVAL;
 }
 
-static void store_segsum_info(struct nilfs_segsum_info *ssi,
-			      struct nilfs_segment_summary *sum,
-			      unsigned int blocksize)
-{
-	ssi->flags = le16_to_cpu(sum->ss_flags);
-	ssi->seg_seq = le64_to_cpu(sum->ss_seq);
-	ssi->ctime = le64_to_cpu(sum->ss_create);
-	ssi->next = le64_to_cpu(sum->ss_next);
-	ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
-	ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
-	ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
-
-	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
-	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
-}
-
 /**
- * calc_crc_cont - check CRC of blocks continuously
- * @sbi: nilfs_sb_info
+ * nilfs_compute_checksum - compute checksum of blocks continuously
+ * @nilfs: nilfs object
  * @bhs: buffer head of start block
  * @sum: place to store result
  * @offset: offset bytes in the first block
@@ -117,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
  * @start: DBN of start block
  * @nblock: number of blocks to be checked
  */
-static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
-			 u32 *sum, unsigned long offset, u64 check_bytes,
-			 sector_t start, unsigned long nblock)
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
+				  struct buffer_head *bhs, u32 *sum,
+				  unsigned long offset, u64 check_bytes,
+				  sector_t start, unsigned long nblock)
 {
-	unsigned long blocksize = sbi->s_super->s_blocksize;
+	unsigned int blocksize = nilfs->ns_blocksize;
 	unsigned long size;
 	u32 crc;
 
 	BUG_ON(offset >= blocksize);
 	check_bytes -= offset;
 	size = min_t(u64, check_bytes, blocksize - offset);
-	crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+	crc = crc32_le(nilfs->ns_crc_seed,
 		       (unsigned char *)bhs->b_data + offset, size);
 	if (--nblock > 0) {
 		do {
-			struct buffer_head *bh
-				= sb_bread(sbi->s_super, ++start);
+			struct buffer_head *bh;
+
+			bh = __bread(nilfs->ns_bdev, ++start, blocksize);
 			if (!bh)
 				return -EIO;
 			check_bytes -= size;
@@ -148,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
 
 /**
  * nilfs_read_super_root_block - read super root block
- * @sb: super_block
+ * @nilfs: nilfs object
  * @sr_block: disk block number of the super root block
 * @pbh: address of a buffer_head pointer to return super root buffer
 * @check: CRC check flag
 */
-int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
 				struct buffer_head **pbh, int check)
 {
 	struct buffer_head *bh_sr;
@@ -162,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	int ret;
 
 	*pbh = NULL;
-	bh_sr = sb_bread(sb, sr_block);
+	bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
 	if (unlikely(!bh_sr)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
@@ -172,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	if (check) {
 		unsigned bytes = le16_to_cpu(sr->sr_bytes);
 
-		if (bytes == 0 || bytes > sb->s_blocksize) {
+		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
 			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
 			goto failed_bh;
 		}
-		if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
-				  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+		if (nilfs_compute_checksum(
+			    nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+			    sr_block, 1)) {
 			ret = NILFS_SEG_FAIL_IO;
 			goto failed_bh;
 		}
@@ -197,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 }
 
 /**
- * load_segment_summary - read segment summary of the specified partial segment
- * @sbi: nilfs_sb_info
- * @pseg_start: start disk block number of partial segment
- * @seg_seq: sequence number requested
- * @ssi: pointer to nilfs_segsum_info struct to store information
+ * nilfs_read_log_header - read summary header of the specified log
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: pointer to return segment summary structure
 */
-static int
-load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi)
+static struct buffer_head *
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
+		      struct nilfs_segment_summary **sum)
 {
 	struct buffer_head *bh_sum;
-	struct nilfs_segment_summary *sum;
+
+	bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (bh_sum)
+		*sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+	return bh_sum;
+}
+
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+			      struct buffer_head *bh_sum,
+			      struct nilfs_segment_summary *sum)
+{
 	unsigned long nblock;
 	u32 crc;
-	int ret = NILFS_SEG_FAIL_IO;
+	int ret;
 
-	bh_sum = sb_bread(sbi->s_super, pseg_start);
-	if (!bh_sum)
+	ret = NILFS_SEG_FAIL_MAGIC;
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
 		goto out;
 
-	sum = (struct nilfs_segment_summary *)bh_sum->b_data;
-
-	/* Check consistency of segment summary */
-	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
-		ret = NILFS_SEG_FAIL_MAGIC;
-		goto failed;
-	}
-	store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
-	if (seg_seq != ssi->seg_seq) {
-		ret = NILFS_SEG_FAIL_SEQ;
-		goto failed;
-	}
+	ret = NILFS_SEG_FAIL_SEQ;
+	if (le64_to_cpu(sum->ss_seq) != seg_seq)
		goto out;
 
-	nblock = ssi->nblocks;
-	if (unlikely(nblock == 0 ||
-		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+	nblock = le32_to_cpu(sum->ss_nblocks);
+	ret = NILFS_SEG_FAIL_CONSISTENCY;
+	if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
 		/* This limits the number of blocks read in the CRC check */
-		ret = NILFS_SEG_FAIL_CONSISTENCY;
-		goto failed;
-	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
-			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
-			  pseg_start, nblock)) {
-		ret = NILFS_SEG_FAIL_IO;
-		goto failed;
-	}
-	if (crc == le32_to_cpu(sum->ss_datasum))
-		ret = 0;
-	else
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
- failed:
-	brelse(bh_sum);
+		goto out;
+
+	ret = NILFS_SEG_FAIL_IO;
+	if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
+				   ((u64)nblock << nilfs->ns_blocksize_bits),
+				   bh_sum->b_blocknr, nblock))
+		goto out;
+
+	ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	if (crc != le32_to_cpu(sum->ss_datasum))
+		goto out;
+	ret = 0;
 out:
 	return ret;
 }
 
-static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes)
+/**
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+				     struct buffer_head **pbh,
+				     unsigned int *offset, unsigned int bytes)
 {
 	void *ptr;
 	sector_t blocknr;
@@ -263,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	if (bytes > (*pbh)->b_size - *offset) {
 		blocknr = (*pbh)->b_blocknr;
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + 1);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+			       nilfs->ns_blocksize);
 		if (unlikely(!*pbh))
 			return NULL;
 		*offset = 0;
@@ -273,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	return ptr;
 }
 
-static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes,
-			unsigned long count)
+/**
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+				    struct buffer_head **pbh,
+				    unsigned int *offset, unsigned int bytes,
+				    unsigned long count)
 {
 	unsigned int rest_item_in_current_block
 		= ((*pbh)->b_size - *offset) / bytes;
@@ -292,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
 		*offset = bytes * (count - (bcnt - 1) * nitem_per_block);
 
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + bcnt);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+			       nilfs->ns_blocksize);
 	}
 }
 
-static int
-collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
-			   struct nilfs_segsum_info *ssi,
-			   struct list_head *head)
+/**
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to add nilfs_recovery_block struct
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+				struct nilfs_segment_summary *sum,
+				struct list_head *head)
 {
 	struct buffer_head *bh;
 	unsigned int offset;
-	unsigned long nfinfo = ssi->nfinfo;
-	sector_t blocknr = sum_blocknr + ssi->nsumblk;
+	u32 nfinfo, sumbytes;
+	sector_t blocknr;
 	ino_t ino;
 	int err = -EIO;
 
+	nfinfo = le32_to_cpu(sum->ss_nfinfo);
 	if (!nfinfo)
 		return 0;
 
-	bh = sb_bread(sbi->s_super, sum_blocknr);
+	sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+	bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh))
 		goto out;
 
-	offset = le16_to_cpu(
-		((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
+	offset = le16_to_cpu(sum->ss_bytes);
 	for (;;) {
 		unsigned long nblocks, ndatablk, nnodeblk;
 		struct nilfs_finfo *finfo;
 
-		finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
						sizeof(*finfo));
 		if (unlikely(!finfo))
 			goto out;
 
@@ -334,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 			struct nilfs_recovery_block *rb;
 			struct nilfs_binfo_v *binfo;
 
-			binfo = segsum_get(sbi->s_super, &bh, &offset,
-					   sizeof(*binfo));
+			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+							sizeof(*binfo));
 			if (unlikely(!binfo))
 				goto out;
 
@@ -353,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 		}
 		if (--nfinfo == 0)
 			break;
-		blocknr += nnodeblk; /* always 0 for the data sync segments */
-		segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
-			    nnodeblk);
+		blocknr += nnodeblk; /* always 0 for data sync logs */
+		nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
+					nnodeblk);
 		if (unlikely(!bh))
 			goto out;
 	}
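The walk above depends on how a log's summary area is laid out: a run of nilfs_finfo headers, each followed by its per-block binfo entries, with nilfs_read_summary_info() hopping to the next summary block whenever the current one cannot hold a whole item. A simplified sketch of the loop, with error handling and the node-block skip elided (not verbatim from the patch):

	nfinfo = le32_to_cpu(sum->ss_nfinfo);
	while (nfinfo-- > 0) {
		struct nilfs_finfo *finfo;
		u32 nblk;

		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
						sizeof(*finfo));
		nblk = le32_to_cpu(finfo->fi_ndatablk);
		while (nblk-- > 0) {
			struct nilfs_binfo_v *binfo;

			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
							sizeof(*binfo));
			/* queue binfo on the recovery list for roll forward */
		}
	}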
@@ -465,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	return err;
 }
 
-static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
 				     struct page *page)
 {
 	struct buffer_head *bh_org;
 	void *kaddr;
 
-	bh_org = sb_bread(sbi->s_super, rb->blocknr);
+	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh_org))
 		return -EIO;
 
@@ -483,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
 	return 0;
 }
 
-static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
-				struct list_head *head,
-				unsigned long *nr_salvaged_blocks)
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
+				      struct nilfs_sb_info *sbi,
+				      struct list_head *head,
+				      unsigned long *nr_salvaged_blocks)
 {
 	struct inode *inode;
 	struct nilfs_recovery_block *rb, *n;
-	unsigned blocksize = sbi->s_super->s_blocksize;
+	unsigned blocksize = nilfs->ns_blocksize;
 	struct page *page;
 	loff_t pos;
 	int err = 0, err2 = 0;
@@ -503,13 +523,16 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		}
 
 		pos = rb->blkoff << inode->i_blkbits;
-		page = NULL;
-		err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
-					0, &page, NULL, nilfs_get_block);
-		if (unlikely(err))
+		err = block_write_begin(inode->i_mapping, pos, blocksize,
+					0, &page, nilfs_get_block);
+		if (unlikely(err)) {
+			loff_t isize = inode->i_size;
+			if (pos + blocksize > isize)
+				vmtruncate(inode, isize);
 			goto failed_inode;
+		}
 
-		err = nilfs_recovery_copy_block(sbi, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, page);
 		if (unlikely(err))
 			goto failed_page;
 
@@ -549,18 +572,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 /**
 * nilfs_do_roll_forward - salvage logical segments newer than the latest
 * checkpoint
+ * @nilfs: nilfs object
 * @sbi: nilfs_sb_info
- * @nilfs: the_nilfs
 * @ri: pointer to a nilfs_recovery_info
 */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
 				 struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start;
 	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
 	unsigned long nsalvaged_blocks = 0;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	int empty_seg = 0;
@@ -579,8 +604,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+		brelse(bh_sum);
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum) {
+			err = -EIO;
+			goto failed;
+		}
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -588,33 +619,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 			}
 			goto strayed;
 		}
-		if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (flags & NILFS_SS_SR)
 			goto confused;
 
 		/* Found a valid partial segment; do recovery actions */
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		empty_seg = 0;
-		nilfs->ns_ctime = ssi.ctime;
-		if (!(ssi.flags & NILFS_SS_GC))
-			nilfs->ns_nongc_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		if (!(flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = nilfs->ns_ctime;
 
 		switch (state) {
 		case RF_INIT_ST:
-			if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_LOGBGN) ||
+			    !(flags & NILFS_SS_SYNDT))
 				goto try_next_pseg;
 			state = RF_DSYNC_ST;
 			/* Fall through */
 		case RF_DSYNC_ST:
-			if (!NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_SYNDT))
 				goto confused;
 
-			err = collect_blocks_from_segsum(
-				sbi, pseg_start, &ssi, &dsync_blocks);
+			err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
+						   &dsync_blocks);
 			if (unlikely(err))
 				goto failed;
-			if (NILFS_SEG_LOGEND(&ssi)) {
-				err = recover_dsync_blocks(
-					sbi, &dsync_blocks, &nsalvaged_blocks);
+			if (flags & NILFS_SS_LOGEND) {
+				err = nilfs_recover_dsync_blocks(
+					nilfs, sbi, &dsync_blocks,
+					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
 				state = RF_INIT_ST;
@@ -625,7 +661,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 try_next_pseg:
 		if (pseg_start == ri->ri_lsegs_end)
 			break;
-		pseg_start += ssi.nblocks;
+		pseg_start += le32_to_cpu(sum->ss_nblocks);
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -650,8 +686,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
 	}
 out:
+	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(sbi->s_nilfs, sbi);
+	nilfs_detach_writer(nilfs, sbi);
 	return err;
 
 confused:
@@ -665,7 +702,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 }
 
 static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
-				      struct nilfs_sb_info *sbi,
 				      struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh;
@@ -675,7 +711,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
 		return;
 
-	bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+	bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
 	BUG_ON(!bh);
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_dirty(bh);
@@ -688,9 +724,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 }
 
 /**
- * nilfs_recover_logical_segments - salvage logical segments written after
- * the latest super root
- * @nilfs: the_nilfs
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
+ * @nilfs: nilfs object
 * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
@@ -707,9 +742,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 *
 * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
-				   struct nilfs_sb_info *sbi,
-				   struct nilfs_recovery_info *ri)
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
+			      struct nilfs_sb_info *sbi,
+			      struct nilfs_recovery_info *ri)
 {
 	int err;
 
@@ -749,7 +784,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		nilfs_finish_roll_forward(nilfs, sbi, ri);
+		nilfs_finish_roll_forward(nilfs, ri);
 	}
 
  failed:
@@ -760,7 +795,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 /**
 * nilfs_search_super_root - search the latest valid super root
 * @nilfs: the_nilfs
- * @sbi: nilfs_sb_info
 * @ri: pointer to a nilfs_recovery_info struct to store search results.
 *
 * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -773,14 +807,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 * %-EINVAL - No valid segment found
 *
 * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
 */
-int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
-			    struct nilfs_recovery_info *ri)
+int nilfs_search_super_root(struct the_nilfs *nilfs,
+			    struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end;  /* range of full segment (block number) */
 	sector_t b, end;
+	unsigned long nblocks;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -799,17 +838,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 	/* Read ahead segment */
 	b = seg_start;
 	while (b <= seg_end)
-		sb_breadahead(sbi->s_super, b++);
+		__breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
 
 	for (;;) {
-		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		brelse(bh_sum);
+		ret = NILFS_SEG_FAIL_IO;
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum)
+			goto failed;
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
 			goto strayed;
 		}
-		pseg_end = pseg_start + ssi.nblocks - 1;
+
+		nblocks = le32_to_cpu(sum->ss_nblocks);
+		pseg_end = pseg_start + nblocks - 1;
 		if (unlikely(pseg_end > seg_end)) {
 			ret = NILFS_SEG_FAIL_CONSISTENCY;
 			goto strayed;
@@ -819,11 +865,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 		ri->ri_pseg_start = pseg_start;
 		ri->ri_seq = seg_seq;
 		ri->ri_segnum = segnum;
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
-		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+		flags = le16_to_cpu(sum->ss_flags);
+		if (!(flags & NILFS_SS_SR) && !scan_newer) {
 			/* This will never happen because a superblock
 			   (last_segment) always points to a pseg
 			   having a super root. */
@@ -834,14 +882,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 		if (pseg_start == seg_start) {
 			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
 			while (b <= end)
-				sb_breadahead(sbi->s_super, b++);
+				__breadahead(nilfs->ns_bdev, b++,
+					     nilfs->ns_blocksize);
 		}
-		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+		if (!(flags & NILFS_SS_SR)) {
+			if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
 			}
-			if (NILFS_SEG_LOGEND(&ssi))
+			if (flags & NILFS_SS_LOGEND)
 				ri->ri_lsegs_end = pseg_start;
 			goto try_next_pseg;
 		}
@@ -852,12 +901,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 			ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
 
 		nilfs_dispose_segment_list(&segments);
-		nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
-			+ ssi.nblocks - seg_start;
+		sr_pseg_start = pseg_start;
+		nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
 		nilfs->ns_seg_seq = seg_seq;
 		nilfs->ns_segnum = segnum;
 		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
-		nilfs->ns_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
 		nilfs->ns_nextnum = nextnum;
 
 		if (scan_newer)
@@ -868,15 +917,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 			scan_newer = 1;
 	}
 
-		/* reset region for roll-forward */
-		pseg_start += ssi.nblocks;
-		if (pseg_start < seg_end)
-			continue;
-		goto feed_segment;
-
 try_next_pseg:
 		/* Standing on a course, or met an inconsistent state */
-		pseg_start += ssi.nblocks;
+		pseg_start += nblocks;
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -907,6 +950,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 
 super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
+	brelse(bh_sum);
 	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -914,6 +958,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
 	return 0;
 
 failed:
+	brelse(bh_sum);
 	nilfs_dispose_segment_list(&segments);
 	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
 }
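Taken together, these recovery.c hunks split the old load_segment_summary() into a read step and a pure validation step, and the summary buffer now stays pinned across loop iterations instead of being re-read for every check. The resulting calling pattern, condensed from the hunks above (error paths abbreviated):

	bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
	if (!bh_sum)
		return -EIO;	/* summary block unreadable */

	ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
	if (ret) {
		brelse(bh_sum);
		/* positive NILFS_SEG_FAIL_* codes are mapped to errno
		 * values by nilfs_warn_segment_error() */
		return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
	}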
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 17851f77f739..4588fb9e93df 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,35 +40,10 @@ struct nilfs_write_info {
 	sector_t		blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
@@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo   = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad      = 0;
+	raw_sum->ss_cno      = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
 * CRC calculation routines
 */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
-				   u32 seed)
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+					  u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
+static void
+nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
+				    u32 seed)
+{
+	struct nilfs_super_root *raw_sr;
+	u32 crc;
+
+	raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
+		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+	raw_sr->sr_sum = cpu_to_le32(crc);
+}
+
 static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
@@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
 {
 	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
 	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 }
 
 /*
@@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs)
 	return ret;
 }
 
+/**
+ * nilfs_add_checksums_on_logs - add checksums on the logs
+ * @logs: list of segment buffers storing target logs
+ * @seed: checksum seed value
+ */
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		if (segbuf->sb_super_root)
+			nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
+		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
+	}
+}
+
 /*
 * BIO operations
 */
@@ -498,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			 * Last BIO is always sent through the following
 			 * submission.
 			 */
-			rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+			rw |= REQ_SYNC | REQ_UNPLUG;
 			res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 		}
 
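The checksum helpers all rely on the chaining property of crc32_le(): the running CRC of one chunk is passed back as the seed for the next, so a range spanning several buffer heads checksums to the same value as one contiguous buffer. A minimal sketch of the idiom (crc_of_two_chunks() is an illustrative name, not part of the patch):

	#include <linux/crc32.h>

	/* CRC over two discontiguous chunks, chained the way consecutive
	 * blocks are folded into one running checksum. */
	static u32 crc_of_two_chunks(u32 seed,
				     const unsigned char *a, size_t alen,
				     const unsigned char *b, size_t blen)
	{
		u32 crc = crc32_le(seed, a, alen);	/* first chunk */
		return crc32_le(crc, b, blen);		/* continues the sum */
	}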
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..b04f08cc2397 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
 * @sumbytes: Byte count of segment summary
 * @nfileblk: Total number of file blocks
 * @seg_seq: Segment sequence number
+ * @cno: Checkpoint number
 * @ctime: Creation time
 * @next: Block number of the next full segment
 */
@@ -48,21 +49,11 @@ struct nilfs_segsum_info {
 	unsigned long		sumbytes;
 	unsigned long		nfileblk;
 	u64			seg_seq;
+	__u64			cno;
 	time_t			ctime;
 	sector_t		next;
 };
 
-/* macro for the flags */
-#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
-#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
-#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
-#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
-#define NILFS_SEG_SIMPLEX(sum)						\
-	(((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==	\
-	 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
-
-#define NILFS_SEG_EMPTY(sum)	((sum)->nblocks == (sum)->nsumblk)
-
 /**
 * struct nilfs_segment_buffer - Segment buffer
 * @sb_super: back pointer to a superblock struct
@@ -76,6 +67,7 @@ struct nilfs_segsum_info {
 * @sb_rest_blocks: Number of residual blocks in the current segment
 * @sb_segsum_buffers: List of buffers for segment summaries
 * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_super_root: Pointer to buffer storing a super root block (if exists)
 * @sb_nbio: Number of flying bio requests
 * @sb_err: I/O error status
 * @sb_bio_event: Completion event of log writing
@@ -95,6 +87,7 @@ struct nilfs_segment_buffer {
 	/* Buffers */
 	struct list_head	sb_segsum_buffers;
 	struct list_head	sb_payload_buffers; /* including super root */
+	struct buffer_head     *sb_super_root;
 
 	/* io status */
 	int			sb_nbio;
@@ -121,9 +114,8 @@ struct nilfs_segment_buffer {
 			    b_assoc_buffers))
 #define NILFS_SEGBUF_BH_IS_LAST(bh, head)  ((bh)->b_assoc_buffers.next == head)
 
+extern struct kmem_cache *nilfs_segbuf_cachep;
 
-int __init nilfs_init_segbuf_cache(void);
-void nilfs_destroy_segbuf_cache(void);
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
@@ -132,13 +124,24 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 			  struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
+
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+	unsigned int flags = segbuf->sb_sum.flags;
+
+	return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+		(NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+	return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
 
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
 			 struct nilfs_segment_buffer *last);
 int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
+void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
 {
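The removed NILFS_SEG_* macros operated on a bare nilfs_segsum_info; their replacements are typed inline functions on the segment buffer, which the segment.c hunks below switch over to. The call-site change, in brief (handle_open_log() is a placeholder, not a real function):

	/* before: flag macro on the embedded summary info */
	if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum))
		handle_open_log(segbuf);

	/* after: typed inline helper on the segment buffer itself */
	if (!nilfs_segbuf_simplex(segbuf))
		handle_open_log(segbuf);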
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6a7dbd8451db..9fd051a33c4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
 #define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
 #define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
 
-/*
- * Transaction
- */
-static struct kmem_cache *nilfs_transaction_cachep;
-
-/**
- * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
- *
- * nilfs_init_transaction_cache() creates a slab cache for the struct
- * nilfs_transaction_info.
- *
- * Return Value: On success, it returns 0. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
- */
-int nilfs_init_transaction_cache(void)
-{
-	nilfs_transaction_cachep =
-		kmem_cache_create("nilfs2_transaction_cache",
-				  sizeof(struct nilfs_transaction_info),
-				  0, SLAB_RECLAIM_ACCOUNT, NULL);
-	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
-}
-
-/**
- * nilfs_destroy_transaction_cache - destroy the cache for transaction info
- *
- * nilfs_destroy_transaction_cache() frees the slab cache for the struct
- * nilfs_transaction_info.
- */
-void nilfs_destroy_transaction_cache(void)
-{
-	kmem_cache_destroy(nilfs_transaction_cachep);
-}
-
 static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
 
 	if (nilfs_doing_gc())
 		flags = NILFS_SS_GC;
-	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
+	err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
+				 sci->sc_sbi->s_nilfs->ns_cno);
 	if (unlikely(err))
 		return err;
 
@@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
 			return err;
 		segbuf = sci->sc_curseg;
 	}
-	err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
+	err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
 	if (likely(!err))
 		segbuf->sb_sum.flags |= NILFS_SS_SR;
 	return err;
@@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
 	*vblocknr = binfo->bi_v.bi_vblocknr;
 }
 
-struct nilfs_sc_operations nilfs_sc_file_ops = {
+static struct nilfs_sc_operations nilfs_sc_file_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_file_bmap,
@@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
 	*binfo_dat = binfo->bi_dat;
 }
 
-struct nilfs_sc_operations nilfs_sc_dat_ops = {
+static struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.collect_data = nilfs_collect_dat_data,
 	.collect_node = nilfs_collect_file_node,
 	.collect_bmap = nilfs_collect_dat_bmap,
@@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
 	.write_node_binfo = nilfs_write_dat_node_binfo,
 };
 
-struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.collect_data = nilfs_collect_file_data,
 	.collect_node = NULL,
 	.collect_bmap = NULL,
@@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
 	}
 }
 
-/*
- * CRC calculation routines
- */
-static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
-{
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
-	u32 crc;
-
-	crc = crc32_le(seed,
-		       (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
-		       NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
-	raw_sr->sr_sum = cpu_to_le32(crc);
-}
-
-static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
-					    u32 seed)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	if (sci->sc_super_root)
-		nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
-		nilfs_segbuf_fill_in_data_crc(segbuf, seed);
-	}
-}
-
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
-	struct buffer_head *bh_sr = sci->sc_super_root;
-	struct nilfs_super_root *raw_sr =
-		(struct nilfs_super_root *)bh_sr->b_data;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
 	unsigned isz = nilfs->ns_inode_size;
 
+	bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+
 	raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
 	raw_sr->sr_nongc_ctime
 		= cpu_to_le64(nilfs_doing_gc() ?
@@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 	/* Collection retry loop */
 	for (;;) {
-		sci->sc_super_root = NULL;
 		sci->sc_nblk_this_inc = 0;
 		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
 
@@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
 	ssp.offset = sizeof(struct nilfs_segment_summary);
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		if (bh == sci->sc_super_root)
+		if (bh == segbuf->sb_super_root)
 			break;
 		if (!finfo) {
 			finfo = nilfs_segctor_map_segsum_entry(
@@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					lock_page(bd_page);
 					clear_page_dirty_for_io(bd_page);
@@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
 }
 
 static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
-			     struct buffer_head *bh_sr, int err)
+			     int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
@@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == bh_sr) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
+	nilfs_abort_logs(&logs, NULL, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 	}
 
 	nilfs_destroy_logs(&logs);
-	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
 	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	int update_sr = (sci->sc_super_root != NULL);
+	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			set_buffer_uptodate(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
-			if (bh == sci->sc_super_root) {
+			if (bh == segbuf->sb_super_root) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
 				}
+				update_sr = true;
 				break;
 			}
 			if (bh->b_page != fs_page) {
@@ -1977,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			}
 		}
 
-		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
-			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+		if (!nilfs_segbuf_simplex(segbuf)) {
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
 				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 				sci->sc_lseg_stime = jiffies;
 			}
-			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
 				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 		}
 	}
@@ -2014,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2115,7 +2051,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct page *failed_page;
-	int err, has_sr = 0;
+	int err;
 
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 
@@ -2143,11 +2079,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (unlikely(err))
 			goto failed;
 
-		has_sr = (sci->sc_super_root != NULL);
-
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
-		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
+		    nilfs_segbuf_empty(sci->sc_curseg)) {
 			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
 		}
@@ -2159,7 +2093,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
 
-		if (has_sr) {
+		if (mode == SC_LSEG_SR &&
+		    sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
 				goto failed_to_write;
@@ -2171,11 +2106,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
 		if (err) {
-			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
-					 sci->sc_super_root, err);
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
 			goto failed_to_write;
 		}
-		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		nilfs_add_checksums_on_logs(&sci->sc_segbufs,
+					    nilfs->ns_crc_seed);
 
 		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
@@ -2196,8 +2132,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
-	sci->sc_super_root = NULL;
-
 out:
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
@@ -2224,9 +2158,9 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
 {
 	spin_lock(&sci->sc_state_lock);
-	if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
-		sci->sc_timer->expires = jiffies + sci->sc_interval;
-		add_timer(sci->sc_timer);
+	if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
+		sci->sc_timer.expires = jiffies + sci->sc_interval;
+		add_timer(&sci->sc_timer);
 		sci->sc_state |= NILFS_SEGCTOR_COMMIT;
 	}
 	spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2365,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 	spin_lock(&sci->sc_state_lock);
 	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
-
-	if (sci->sc_timer)
-		del_timer_sync(sci->sc_timer);
+	del_timer_sync(&sci->sc_timer);
 }
 
 /**
@@ -2459,9 +2391,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 		sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
-		if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
-		    time_before(jiffies, sci->sc_timer->expires))
-			add_timer(sci->sc_timer);
+		if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+		    time_before(jiffies, sci->sc_timer.expires))
+			add_timer(&sci->sc_timer);
 	}
 	spin_unlock(&sci->sc_state_lock);
 }
@@ -2475,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	nilfs_segctor_accept(sci);
@@ -2490,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2490 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2423 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2491 nilfs_discontinued(nilfs)) { 2424 nilfs_discontinued(nilfs)) {
2492 down_write(&nilfs->ns_sem); 2425 down_write(&nilfs->ns_sem);
2493 err = nilfs_commit_super( 2426 err = -EIO;
2494 sbi, nilfs_altsb_need_update(nilfs)); 2427 sbp = nilfs_prepare_super(sbi,
2428 nilfs_sb_will_flip(nilfs));
2429 if (likely(sbp)) {
2430 nilfs_set_log_cursor(sbp[0], nilfs);
2431 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
2432 }
2495 up_write(&nilfs->ns_sem); 2433 up_write(&nilfs->ns_sem);
2496 } 2434 }
2497 } 2435 }
@@ -2640,13 +2578,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2578{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2579 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2580 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2581 int timeout = 0;
2645 2582
2646 init_timer(&timer); 2583 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2584 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2585
2651 /* start sync. */ 2586 /* start sync. */
2652 sci->sc_task = current; 2587 sci->sc_task = current;
@@ -2695,7 +2630,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2630 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2631 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2632 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2633 sci->sc_timer.expires);
2699 2634
2700 if (should_sleep) { 2635 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2636 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2639,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2639 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2640 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2641 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2642 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2643
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2644 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2645 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2648,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2648
2714 end_thread: 2649 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2650 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2651
2719 /* end sync. */ 2652 /* end sync. */
2720 sci->sc_task = NULL; 2653 sci->sc_task = NULL;
@@ -2750,13 +2683,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2683 }
2751} 2684}
2752 2685
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2686/*
2761 * Setup & clean-up functions 2687 * Setup & clean-up functions
2762 */ 2688 */
@@ -2780,6 +2706,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2706 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2707 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2708 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2709 init_timer(&sci->sc_timer);
2783 2710
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2711 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2712 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2773,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2773
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2774 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2775
2776 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2777 kfree(sci);
2850} 2778}
2851 2779
@@ -2880,7 +2808,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2808 return -ENOMEM;
2881 2809
2882 nilfs_attach_writer(nilfs, sbi); 2810 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2811 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2812 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2813 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2814 kfree(sbi->s_sc_info);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..17c487bd8152 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
  * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
- * @sc_super_root: Pointer to the super root buffer
  * @sc_stage: Collection stage
  * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
  * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
 	struct list_head	sc_write_logs;
 	unsigned long		sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
-	struct buffer_head     *sc_super_root;
 
 	struct nilfs_cstage	sc_stage;
 
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_lseg_stime;	/* in 1/HZ seconds */
 	unsigned long		sc_watermark;
 
-	struct timer_list      *sc_timer;
+	struct timer_list	sc_timer;
 	struct task_struct     *sc_task;
 };
 
@@ -219,10 +217,10 @@ enum {
  */
 #define NILFS_SC_DEFAULT_WATERMARK  3600
 
+/* super.c */
+extern struct kmem_cache *nilfs_transaction_cachep;
 
 /* segment.c */
-extern int nilfs_init_transaction_cache(void);
-extern void nilfs_destroy_transaction_cache(void);
 extern void nilfs_relax_pressure_in_lock(struct super_block *);
 
 extern int nilfs_construct_segment(struct super_block *);
@@ -236,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
 /* recovery.c */
-extern int nilfs_read_super_root_block(struct super_block *, sector_t,
+extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
 				       struct buffer_head **, int);
-extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
+extern int nilfs_search_super_root(struct the_nilfs *,
 				   struct nilfs_recovery_info *);
-extern int nilfs_recover_logical_segments(struct the_nilfs *,
-					  struct nilfs_sb_info *,
-					  struct nilfs_recovery_info *);
+extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
+				     struct nilfs_sb_info *,
+				     struct nilfs_recovery_info *);
 extern void nilfs_dispose_segment_list(struct list_head *);
 
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 48145f505a6a..922263393c76 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -55,6 +55,8 @@
 #include "nilfs.h"
 #include "mdt.h"
 #include "alloc.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "cpfile.h"
 #include "ifile.h"
@@ -67,8 +69,32 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
+struct kmem_cache *nilfs_inode_cachep;
+struct kmem_cache *nilfs_transaction_cachep;
+struct kmem_cache *nilfs_segbuf_cachep;
+struct kmem_cache *nilfs_btree_path_cache;
+
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
+static void nilfs_set_error(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
+
+	down_write(&nilfs->ns_sem);
+	if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+		nilfs->ns_mount_state |= NILFS_ERROR_FS;
+		sbp = nilfs_prepare_super(sbi, 0);
+		if (likely(sbp)) {
+			sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			if (sbp[1])
+				sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+		}
+	}
+	up_write(&nilfs->ns_sem);
+}
+
 /**
  * nilfs_error() - report failure condition on a filesystem
  *
@@ -94,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
 	va_end(args);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		struct the_nilfs *nilfs = sbi->s_nilfs;
-
-		down_write(&nilfs->ns_sem);
-		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
-			nilfs->ns_mount_state |= NILFS_ERROR_FS;
-			nilfs->ns_sbp[0]->s_state |=
-				cpu_to_le16(NILFS_ERROR_FS);
-			nilfs_commit_super(sbi, 1);
-		}
-		up_write(&nilfs->ns_sem);
+		nilfs_set_error(sbi);
 
 		if (nilfs_test_opt(sbi, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -129,7 +146,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
-static struct kmem_cache *nilfs_inode_cachep;
 
 struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
@@ -155,83 +171,46 @@ void nilfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
 }
 
-static void init_once(void *obj)
-{
-	struct nilfs_inode_info *ii = obj;
-
-	INIT_LIST_HEAD(&ii->i_dirty);
-#ifdef CONFIG_NILFS_XATTR
-	init_rwsem(&ii->xattr_sem);
-#endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
-	inode_init_once(&ii->vfs_inode);
-}
-
-static int nilfs_init_inode_cache(void)
-{
-	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
-					       sizeof(struct nilfs_inode_info),
-					       0, SLAB_RECLAIM_ACCOUNT,
-					       init_once);
-
-	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
-}
-
-static inline void nilfs_destroy_inode_cache(void)
-{
-	kmem_cache_destroy(nilfs_inode_cachep);
-}
-
-static void nilfs_clear_inode(struct inode *inode)
-{
-	struct nilfs_inode_info *ii = NILFS_I(inode);
-
-	/*
-	 * Free resources allocated in nilfs_read_inode(), here.
-	 */
-	BUG_ON(!list_empty(&ii->i_dirty));
-	brelse(ii->i_bh);
-	ii->i_bh = NULL;
-
-	if (test_bit(NILFS_I_BMAP, &ii->i_state))
-		nilfs_bmap_clear(ii->i_bmap);
-
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
-}
-
-static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
+static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
-	int barrier_done = 0;
 
-	if (nilfs_test_opt(sbi, BARRIER)) {
-		set_buffer_ordered(nilfs->ns_sbh[0]);
-		barrier_done = 1;
-	}
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-	err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-	if (err == -EOPNOTSUPP && barrier_done) {
-		nilfs_warning(sbi->s_super, __func__,
-			      "barrier-based sync failed. "
-			      "disabling barriers\n");
-		nilfs_clear_opt(sbi, BARRIER);
-		barrier_done = 0;
-		clear_buffer_ordered(nilfs->ns_sbh[0]);
-		goto retry;
+
+	if (nilfs_test_opt(sbi, BARRIER)) {
+		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+					  WRITE_SYNC | WRITE_BARRIER);
+		if (err == -EOPNOTSUPP) {
+			nilfs_warning(sbi->s_super, __func__,
+				      "barrier-based sync failed. "
+				      "disabling barriers\n");
+			nilfs_clear_opt(sbi, BARRIER);
+			goto retry;
+		}
+	} else {
+		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
+
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: unable to write superblock (err=%d)\n", err);
 		if (err == -EIO && nilfs->ns_sbh[1]) {
+			/*
+			 * sbp[0] points to newer log than sbp[1],
+			 * so copy sbp[0] to sbp[1] to take over sbp[0].
+			 */
+			memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
+			       nilfs->ns_sbsize);
 			nilfs_fall_back_super_block(nilfs);
 			goto retry;
 		}
 	} else {
 		struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
 
+		nilfs->ns_sbwcount++;
+
 		/*
 		 * The latest segment becomes trailable from the position
 		 * written in superblock.
@@ -240,66 +219,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
 
 		/* update GC protection for recent segments */
 		if (nilfs->ns_sbh[1]) {
-			sbp = NULL;
-			if (dupsb) {
+			if (flag == NILFS_SB_COMMIT_ALL) {
 				set_buffer_dirty(nilfs->ns_sbh[1]);
-				if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
-					sbp = nilfs->ns_sbp[1];
+				if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
+					goto out;
 			}
+			if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
+			    le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
+				sbp = nilfs->ns_sbp[1];
 		}
-		if (sbp) {
-			spin_lock(&nilfs->ns_last_segment_lock);
-			nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
-			spin_unlock(&nilfs->ns_last_segment_lock);
-		}
-	}
-
+
+		spin_lock(&nilfs->ns_last_segment_lock);
+		nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+		spin_unlock(&nilfs->ns_last_segment_lock);
+	}
+ out:
 	return err;
 }
 
-int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
+void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
+			  struct the_nilfs *nilfs)
+{
+	sector_t nfreeblocks;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+	sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+	sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
+					       int flip)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
-	sector_t nfreeblocks;
-	time_t t;
-	int err;
 
-	/* nilfs->sem must be locked by the caller. */
-	if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
-		if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
-			nilfs_swap_super_block(nilfs);
-		else {
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (sbp[1] &&
+		    sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		} else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
 			       sbi->s_super->s_id);
-			return -EIO;
+			return NULL;
 		}
+	} else if (sbp[1] &&
+		   sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	}
-	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
-	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: failed to count free blocks\n");
-		return err;
-	}
-	spin_lock(&nilfs->ns_last_segment_lock);
-	sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
-	sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
-	sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
-	spin_unlock(&nilfs->ns_last_segment_lock);
 
+	if (flip && sbp[1])
+		nilfs_swap_super_block(nilfs);
+
+	return sbp;
+}
+
+int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	time_t t;
+
+	/* nilfs->ns_sem must be locked by the caller. */
 	t = get_seconds();
-	nilfs->ns_sbwtime[0] = t;
-	sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+	nilfs->ns_sbwtime = t;
 	sbp[0]->s_wtime = cpu_to_le64(t);
 	sbp[0]->s_sum = 0;
 	sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
 					     (unsigned char *)sbp[0],
 					     nilfs->ns_sbsize));
-	if (dupsb && sbp[1]) {
-		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
-		nilfs->ns_sbwtime[1] = t;
+	if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
+		sbp[1]->s_wtime = sbp[0]->s_wtime;
+		sbp[1]->s_sum = 0;
+		sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+					    (unsigned char *)sbp[1],
+					    nilfs->ns_sbsize));
 	}
 	clear_nilfs_sb_dirty(nilfs);
-	return nilfs_sync_super(sbi, dupsb);
+	return nilfs_sync_super(sbi, flag);
+}
+
+/**
+ * nilfs_cleanup_super() - write filesystem state for cleanup
+ * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
+ *
+ * This function restores state flags in the on-disk super block.
+ * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
+ * filesystem was not clean previously.
+ */
+int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
+{
+	struct nilfs_super_block **sbp;
+	int flag = NILFS_SB_COMMIT;
+	int ret = -EIO;
+
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (sbp) {
+		sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
+		nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
+		if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
+			/*
+			 * make the "clean" flag also to the opposite
+			 * super block if both super blocks point to
+			 * the same checkpoint.
+			 */
+			sbp[1]->s_state = sbp[0]->s_state;
+			flag = NILFS_SB_COMMIT_ALL;
+		}
+		ret = nilfs_commit_super(sbi, flag);
+	}
+	return ret;
 }
 
 static void nilfs_put_super(struct super_block *sb)
@@ -313,8 +348,7 @@ static void nilfs_put_super(struct super_block *sb)
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	}
 	down_write(&nilfs->ns_super_sem);
@@ -335,6 +369,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp;
 	int err = 0;
 
 	/* This function is called when super block should be written back */
@@ -342,8 +377,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 		err = nilfs_construct_segment(sb);
 
 	down_write(&nilfs->ns_sem);
-	if (nilfs_sb_dirty(nilfs))
-		nilfs_commit_super(sbi, 1);
+	if (nilfs_sb_dirty(nilfs)) {
+		sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
+		if (likely(sbp)) {
+			nilfs_set_log_cursor(sbp[0], nilfs);
+			nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+		}
+	}
 	up_write(&nilfs->ns_sem);
 
 	return err;
@@ -360,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
+	err = -ENOMEM;
 	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
-		return -ENOMEM;
+		goto delist;
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -393,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 
+ delist:
 	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
 	up_write(&nilfs->ns_super_sem);
@@ -466,20 +508,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
 	if (!nilfs_test_opt(sbi, BARRIER))
-		seq_printf(seq, ",nobarrier");
+		seq_puts(seq, ",nobarrier");
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
-	if (nilfs_test_opt(sbi, ERRORS_RO))
-		seq_printf(seq, ",errors=remount-ro");
 	if (nilfs_test_opt(sbi, ERRORS_PANIC))
-		seq_printf(seq, ",errors=panic");
+		seq_puts(seq, ",errors=panic");
+	if (nilfs_test_opt(sbi, ERRORS_CONT))
+		seq_puts(seq, ",errors=continue");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
-		seq_printf(seq, ",order=strict");
+		seq_puts(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
-		seq_printf(seq, ",norecovery");
+		seq_puts(seq, ",norecovery");
 	if (nilfs_test_opt(sbi, DISCARD))
-		seq_printf(seq, ",discard");
+		seq_puts(seq, ",discard");
 
 	return 0;
 }
@@ -491,7 +533,7 @@ static const struct super_operations nilfs_sops = {
 	/* .write_inode    = nilfs_write_inode, */
 	/* .put_inode      = nilfs_put_inode, */
 	/* .drop_inode	   = nilfs_drop_inode, */
-	.delete_inode   = nilfs_delete_inode,
+	.evict_inode    = nilfs_evict_inode,
 	.put_super      = nilfs_put_super,
 	/* .write_super    = nilfs_write_super, */
 	.sync_fs        = nilfs_sync_fs,
@@ -499,7 +541,6 @@ static const struct super_operations nilfs_sops = {
 	/* .unlockfs */
 	.statfs         = nilfs_statfs,
 	.remount_fs     = nilfs_remount,
-	.clear_inode    = nilfs_clear_inode,
 	/* .umount_begin */
 	.show_options = nilfs_show_options
 };
@@ -548,23 +589,25 @@ static const struct export_operations nilfs_export_ops = {
 
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-	Opt_discard, Opt_err,
+	Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
+	Opt_discard, Opt_nodiscard, Opt_err,
};
 
 static match_table_t tokens = {
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_barrier, "barrier"},
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
 	{Opt_norecovery, "norecovery"},
 	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, struct super_block *sb)
+static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	char *p;
@@ -581,6 +624,9 @@ static int parse_options(char *options, struct super_block *sb)
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_barrier:
+			nilfs_set_opt(sbi, BARRIER);
+			break;
 		case Opt_nobarrier:
 			nilfs_clear_opt(sbi, BARRIER);
 			break;
@@ -606,8 +652,26 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_snapshot:
 			if (match_int(&args[0], &option) || option <= 0)
 				return 0;
-			if (!(sb->s_flags & MS_RDONLY))
+			if (is_remount) {
+				if (!nilfs_test_opt(sbi, SNAPSHOT)) {
+					printk(KERN_ERR
+					       "NILFS: cannot change regular "
+					       "mount to snapshot.\n");
+					return 0;
+				} else if (option != sbi->s_snapshot_cno) {
+					printk(KERN_ERR
+					       "NILFS: cannot remount to a "
+					       "different snapshot.\n");
+					return 0;
+				}
+				break;
+			}
+			if (!(sb->s_flags & MS_RDONLY)) {
+				printk(KERN_ERR "NILFS: cannot mount snapshot "
+				       "read/write. A read-only option is "
+				       "required.\n");
 				return 0;
+			}
 			sbi->s_snapshot_cno = option;
 			nilfs_set_opt(sbi, SNAPSHOT);
 			break;
@@ -617,6 +681,9 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_discard:
 			nilfs_set_opt(sbi, DISCARD);
 			break;
+		case Opt_nodiscard:
+			nilfs_clear_opt(sbi, DISCARD);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -631,17 +698,24 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
 			  struct nilfs_super_block *sbp)
 {
 	sbi->s_mount_opt =
-		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
 static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
-	struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
-	int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
-	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
+	struct nilfs_super_block **sbp;
+	int max_mnt_count;
+	int mnt_count;
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	sbp = nilfs_prepare_super(sbi, 0);
+	if (!sbp)
+		return -EIO;
+
+	max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
+	mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
 
-	/* nilfs->sem must be locked by the caller. */
 	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
 		printk(KERN_WARNING
 		       "NILFS warning: mounting fs with errors\n");
@@ -652,12 +726,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
 #endif
 	}
 	if (!max_mnt_count)
-		sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+		sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
 
-	sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
-	sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
-	sbp->s_mtime = cpu_to_le64(get_seconds());
-	return nilfs_commit_super(sbi, 1);
+	sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp[0]->s_state =
+		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
+	sbp[0]->s_mtime = cpu_to_le64(get_seconds());
+	/* synchronize sbp[1] with sbp[0] */
+	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 
 struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -694,7 +771,31 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 	sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
 	sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
 
-	return !parse_options(data, sb) ? -EINVAL : 0 ;
+	return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
+}
+
+int nilfs_check_feature_compatibility(struct super_block *sb,
+				      struct nilfs_super_block *sbp)
+{
+	__u64 features;
+
+	features = le64_to_cpu(sbp->s_feature_incompat) &
+		~NILFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	features = le64_to_cpu(sbp->s_feature_compat_ro) &
+		~NILFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
+		       "unsupported optional features (%llx)\n",
+		       (unsigned long long)features);
+		return -EINVAL;
+	}
+	return 0;
 }
 
 /**
@@ -778,9 +879,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 				goto failed_sbi;
 			}
 			cno = sbi->s_snapshot_cno;
-		} else
-			/* Read-only mount */
-			sbi->s_snapshot_cno = cno;
+		}
 	}
 
 	err = nilfs_attach_checkpoint(sbi, cno);
@@ -845,11 +944,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_super_block *sbp;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	unsigned long old_sb_flags;
 	struct nilfs_mount_options old_opts;
-	int err;
+	int was_snapshot, err;
 
 	lock_kernel();
 
@@ -857,19 +955,18 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+	was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
 
-	if (!parse_options(data, sb)) {
+	if (!parse_options(data, sb, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
 
-	if ((*flags & MS_RDONLY) &&
-	    sbi->s_snapshot_cno != old_opts.snapshot_cno) {
-		printk(KERN_WARNING "NILFS (device %s): couldn't "
-		       "remount to a different snapshot.\n",
-		       sb->s_id);
-		err = -EINVAL;
+	err = -EINVAL;
+	if (was_snapshot && !(*flags & MS_RDONLY)) {
+		printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
+		       "read/write.\n", sb->s_id);
 		goto restore_opts;
 	}
 
@@ -877,7 +974,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
 		       "remount because the filesystem is in an "
 		       "incomplete recovery state.\n", sb->s_id);
-		err = -EINVAL;
 		goto restore_opts;
 	}
 
@@ -888,45 +984,35 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		nilfs_detach_segment_constructor(sbi);
 		sb->s_flags |= MS_RDONLY;
 
-		sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
-		/* nilfs_set_opt(sbi, SNAPSHOT); */
-
 		/*
 		 * Remounting a valid RW partition RDONLY, so set
 		 * the RDONLY flag and then mark the partition as valid again.
 		 */
 		down_write(&nilfs->ns_sem);
-		sbp = nilfs->ns_sbp[0];
-		if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
-		    (nilfs->ns_mount_state & NILFS_VALID_FS))
-			sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
-		sbp->s_mtime = cpu_to_le64(get_seconds());
-		nilfs_commit_super(sbi, 1);
+		nilfs_cleanup_super(sbi);
 		up_write(&nilfs->ns_sem);
 	} else {
+		__u64 features;
+
 		/*
 		 * Mounting a RDONLY partition read-write, so reread and
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
-		if (nilfs->ns_current && nilfs->ns_current != sbi) {
-			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because an RW-mount exists.\n",
-			       sb->s_id);
-			err = -EBUSY;
-			goto restore_opts;
-		}
-		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
+		down_read(&nilfs->ns_sem);
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		up_read(&nilfs->ns_sem);
+		if (features) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because the current RO-mount is not "
-			       "the latest one.\n",
-			       sb->s_id);
-			err = -EINVAL;
+			       "remount RDWR because of unsupported optional "
+			       "features (%llx)\n",
+			       sb->s_id, (unsigned long long)features);
+			err = -EROFS;
 			goto restore_opts;
 		}
+
 		sb->s_flags &= ~MS_RDONLY;
-		nilfs_clear_opt(sbi, SNAPSHOT);
-		sbi->s_snapshot_cno = 0;
 
 		err = nilfs_attach_segment_constructor(sbi);
 		if (err)
@@ -935,8 +1021,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
 		up_write(&nilfs->ns_sem);
-
-		nilfs->ns_current = sbi;
 	}
  out:
 	up_write(&nilfs->ns_super_sem);
@@ -1022,10 +1106,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	struct the_nilfs *nilfs;
 	int err, need_to_close = 1;
 
-	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
 		return PTR_ERR(sd.bdev);
 
@@ -1092,10 +1180,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 	/* New superblock instance created */
 	s->s_flags = flags;
+	s->s_mode = mode;
 	strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 	sb_set_blocksize(s, block_size(sd.bdev));
 
-	err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
+	err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
+			       nilfs);
 	if (err)
 		goto cancel_new;
 
@@ -1106,7 +1196,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	if (need_to_close)
-		close_bdev_exclusive(sd.bdev, flags);
+		close_bdev_exclusive(sd.bdev, mode);
 	simple_set_mnt(mnt, s);
 	return 0;
 
@@ -1114,7 +1204,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
  failed:
-	close_bdev_exclusive(sd.bdev, flags);
+	close_bdev_exclusive(sd.bdev, mode);
 
 	return err;
 
@@ -1124,7 +1214,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	put_nilfs(nilfs);
 	deactivate_locked_super(s);
 	/*
-	 * deactivate_super() invokes close_bdev_exclusive().
+	 * deactivate_locked_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
 	 * put_nilfs() needs the block device.
 	 */
@@ -1139,54 +1229,93 @@ struct file_system_type nilfs_fs_type = {
 	.fs_flags = FS_REQUIRES_DEV,
 };
 
-static int __init init_nilfs_fs(void)
+static void nilfs_inode_init_once(void *obj)
 {
-	int err;
-
-	err = nilfs_init_inode_cache();
-	if (err)
-		goto failed;
+	struct nilfs_inode_info *ii = obj;
 
-	err = nilfs_init_transaction_cache();
-	if (err)
-		goto failed_inode_cache;
+	INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+	init_rwsem(&ii->xattr_sem);
+#endif
+	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	ii->i_bmap = &ii->i_bmap_data;
+	inode_init_once(&ii->vfs_inode);
+}
 
-	err = nilfs_init_segbuf_cache();
-	if (err)
-		goto failed_transaction_cache;
+static void nilfs_segbuf_init_once(void *obj)
+{
+	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
 
-	err = nilfs_btree_path_cache_init();
-	if (err)
-		goto failed_segbuf_cache;
+static void nilfs_destroy_cachep(void)
+{
+	if (nilfs_inode_cachep)
+		kmem_cache_destroy(nilfs_inode_cachep);
+	if (nilfs_transaction_cachep)
+		kmem_cache_destroy(nilfs_transaction_cachep);
+	if (nilfs_segbuf_cachep)
+		kmem_cache_destroy(nilfs_segbuf_cachep);
+	if (nilfs_btree_path_cache)
+		kmem_cache_destroy(nilfs_btree_path_cache);
+}
 
-	err = register_filesystem(&nilfs_fs_type);
-	if (err)
-		goto failed_btree_path_cache;
+static int __init nilfs_init_cachep(void)
+{
+	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+			sizeof(struct nilfs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+	if (!nilfs_inode_cachep)
+		goto fail;
+
+	nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
+			sizeof(struct nilfs_transaction_info), 0,
+			SLAB_RECLAIM_ACCOUNT, NULL);
+	if (!nilfs_transaction_cachep)
+		goto fail;
+
+	nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
+			sizeof(struct nilfs_segment_buffer), 0,
+			SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
+	if (!nilfs_segbuf_cachep)
+		goto fail;
+
+	nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
+			sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+			0, 0, NULL);
+	if (!nilfs_btree_path_cache)
+		goto fail;
 
 	return 0;
 
- failed_btree_path_cache:
-	nilfs_btree_path_cache_destroy();
+fail:
+	nilfs_destroy_cachep();
+	return -ENOMEM;
+}
 
- failed_segbuf_cache:
-	nilfs_destroy_segbuf_cache();
+static int __init init_nilfs_fs(void)
+{
+	int err;
 
- failed_transaction_cache:
-	nilfs_destroy_transaction_cache();
+	err = nilfs_init_cachep();
+	if (err)
+		goto fail;
 
- failed_inode_cache:
-	nilfs_destroy_inode_cache();
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto free_cachep;
 
- failed:
+	printk(KERN_INFO "NILFS version 2 loaded\n");
+	return 0;
+
+free_cachep:
+	nilfs_destroy_cachep();
+fail:
 	return err;
 }
 
 static void __exit exit_nilfs_fs(void)
 {
-	nilfs_destroy_segbuf_cache();
-	nilfs_destroy_transaction_cache();
-	nilfs_destroy_inode_cache();
-	nilfs_btree_path_cache_destroy();
+	nilfs_destroy_cachep();
 	unregister_filesystem(&nilfs_fs_type);
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..ba7c10c917fc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -38,6 +38,8 @@
 static LIST_HEAD(nilfs_objects);
 static DEFINE_SPINLOCK(nilfs_lock);
 
+static int nilfs_valid_sb(struct nilfs_super_block *sbp);
+
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
 			    sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -45,6 +47,16 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 	nilfs->ns_last_pseg = start_blocknr;
 	nilfs->ns_last_seq = seq;
 	nilfs->ns_last_cno = cno;
+
+	if (!nilfs_sb_dirty(nilfs)) {
+		if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
+			goto stay_cursor;
+
+		set_nilfs_sb_dirty(nilfs);
+	}
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+
+ stay_cursor:
 	spin_unlock(&nilfs->ns_last_segment_lock);
 }
 
@@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs)
 	kfree(nilfs);
 }
 
-static int nilfs_load_super_root(struct the_nilfs *nilfs,
-				 struct nilfs_sb_info *sbi, sector_t sr_block)
+static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
 {
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
@@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
 	unsigned inode_size;
 	int err;
 
-	err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
+	err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
 	if (unlikely(err))
 		return err;
 
@@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
 }
 
 /**
+ * nilfs_store_log_cursor - load log cursor from a super block
+ * @nilfs: nilfs object
+ * @sbp: buffer storing super block to be read
+ *
+ * nilfs_store_log_cursor() reads the last position of the log
+ * containing a super root from a given super block, and initializes
+ * relevant information on the nilfs object preparatory for log
+ * scanning and recovery.
+ */
+static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
+				  struct nilfs_super_block *sbp)
+{
+	int ret = 0;
+
+	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+
+	nilfs->ns_prev_seq = nilfs->ns_last_seq;
+	nilfs->ns_seg_seq = nilfs->ns_last_seq;
+	nilfs->ns_segnum =
+		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+	nilfs->ns_cno = nilfs->ns_last_cno + 1;
+	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+		printk(KERN_ERR "NILFS invalid last segment number.\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
  * @sbi: nilfs_sb_info used to recover past segment
@@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
 	nilfs_init_recovery_info(&ri);
 
-	err = nilfs_search_super_root(nilfs, sbi, &ri);
+	err = nilfs_search_super_root(nilfs, &ri);
 	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: error searching super root.\n");
-		goto failed;
+		struct nilfs_super_block **sbp = nilfs->ns_sbp;
+		int blocksize;
+
+		if (err != -EINVAL)
+			goto scan_error;
+
+		if (!nilfs_valid_sb(sbp[1])) {
+			printk(KERN_WARNING
+			       "NILFS warning: unable to fall back to spare"
+			       "super block\n");
+			goto scan_error;
+		}
+		printk(KERN_INFO
+		       "NILFS: try rollback from an earlier position\n");
+
+		/*
+		 * restore super block with its spare and reconfigure
+		 * relevant states of the nilfs object.
+		 */
+		memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
+		nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
+		nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
+
+		/* verify consistency between two super blocks */
+		blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+		if (blocksize != nilfs->ns_blocksize) {
+			printk(KERN_WARNING
+			       "NILFS warning: blocksize differs between "
+			       "two super blocks (%d != %d)\n",
+			       blocksize, nilfs->ns_blocksize);
+			goto scan_error;
+		}
+
+		err = nilfs_store_log_cursor(nilfs, sbp[0]);
+		if (err)
+			goto scan_error;
+
+		/* drop clean flag to allow roll-forward and recovery */
+		nilfs->ns_mount_state &= ~NILFS_VALID_FS;
+		valid_fs = 0;
+
+		err = nilfs_search_super_root(nilfs, &ri);
+		if (err)
+			goto scan_error;
 	}
 
-	err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
+	err = nilfs_load_super_root(nilfs, ri.ri_super_root);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error loading super root.\n");
 		goto failed;
@@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto skip_recovery;
 
 	if (s_flags & MS_RDONLY) {
+		__u64 features;
+
 		if (nilfs_test_opt(sbi, NORECOVERY)) {
 			printk(KERN_INFO "NILFS: norecovery option specified. "
 			       "skipping roll-forward recovery\n");
 			goto skip_recovery;
 		}
+		features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
+			~NILFS_FEATURE_COMPAT_RO_SUPP;
+		if (features) {
+			printk(KERN_ERR "NILFS: couldn't proceed with "
+			       "recovery because of unsupported optional "
+			       "features (%llx)\n",
+			       (unsigned long long)features);
+			err = -EROFS;
+			goto failed_unload;
+		}
 		if (really_read_only) {
 			printk(KERN_ERR "NILFS: write access "
			       "unavailable, cannot proceed.\n");
@@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto failed_unload;
 	}
 
-	err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+	err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
 	if (err)
 		goto failed_unload;
 
 	down_write(&nilfs->ns_sem);
-	nilfs->ns_mount_state |= NILFS_VALID_FS;
-	nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
-	err = nilfs_commit_super(sbi, 1);
+	nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
+	err = nilfs_cleanup_super(sbi);
 	up_write(&nilfs->ns_sem);
 
 	if (err) {
@@ -343,10 +438,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	sbi->s_super->s_flags = s_flags;
 	return 0;
 
+ scan_error:
+	printk(KERN_ERR "NILFS: error searching super root.\n");
+	goto failed;
+
  failed_unload:
 	nilfs_mdt_destroy(nilfs->ns_cpfile);
 	nilfs_mdt_destroy(nilfs->ns_sufile);
 	nilfs_mdt_destroy(nilfs->ns_dat);
+	nilfs_mdt_destroy(nilfs->ns_gc_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -486,11 +586,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		printk(KERN_WARNING
 		       "NILFS warning: unable to read secondary superblock\n");
 
+	/*
+	 * Compare two super blocks and set 1 in swp if the secondary
+	 * super block is valid and newer.  Otherwise, set 0 in swp.
+	 */
 	valid[0] = nilfs_valid_sb(sbp[0]);
 	valid[1] = nilfs_valid_sb(sbp[1]);
-	swp = valid[1] &&
-		(!valid[0] ||
-		 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
+	swp = valid[1] && (!valid[0] ||
+			   le64_to_cpu(sbp[1]->s_last_cno) >
+			   le64_to_cpu(sbp[0]->s_last_cno));
 
 	if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
 		brelse(sbh[1]);
@@ -505,14 +609,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		return -EINVAL;
 	}
 
-	if (swp) {
+	if (!valid[!swp])
 		printk(KERN_WARNING "NILFS warning: broken superblock. "
 		       "using spare superblock.\n");
+	if (swp)
 		nilfs_swap_super_block(nilfs);
-	}
 
-	nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
-	nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
+	nilfs->ns_sbwcount = 0;
+	nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
 	nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
 	*sbpp = sbp[0];
 	return 0;
@@ -553,6 +657,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto out;
 
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto out;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
 	if (sb->s_blocksize != blocksize &&
 	    !sb_set_blocksize(sb, blocksize)) {
@@ -564,7 +672,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 		goto out;
 	}
 
-	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "NILFS: unable to set blocksize\n");
 		err = -EINVAL;
@@ -578,7 +686,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	if (err)
 		goto failed_sbh;
 
+	err = nilfs_check_feature_compatibility(sb, sbp);
+	if (err)
+		goto failed_sbh;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (blocksize < NILFS_MIN_BLOCK_SIZE ||
+	    blocksize > NILFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
+		       "filesystem blocksize %d\n", blocksize);
+		err = -EINVAL;
+		goto failed_sbh;
+	}
 	if (sb->s_blocksize != blocksize) {
 		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
@@ -600,6 +719,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	   when reloading fails. */
 	}
 	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+	nilfs->ns_blocksize = blocksize;
 
 	err = nilfs_store_disk_layout(nilfs, sbp);
 	if (err)
@@ -612,23 +732,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
 	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
 
615 /* Finding last segment */ 735 err = nilfs_store_log_cursor(nilfs, sbp);
616 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); 736 if (err)
617 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
618 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
619
620 nilfs->ns_seg_seq = nilfs->ns_last_seq;
621 nilfs->ns_segnum =
622 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
623 nilfs->ns_cno = nilfs->ns_last_cno + 1;
624 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
625 printk(KERN_ERR "NILFS invalid last segment number.\n");
626 err = -EINVAL;
627 goto failed_sbh; 737 goto failed_sbh;
628 }
629 /* Dummy values */
630 nilfs->ns_free_segments_count =
631 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
632 738
633 /* Initialize gcinode cache */ 739 /* Initialize gcinode cache */
634 err = nilfs_init_gccache(nilfs); 740 err = nilfs_init_gccache(nilfs);
@@ -670,7 +776,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 776 start * sects_per_block,
671 nblocks * sects_per_block, 777 nblocks * sects_per_block,
672 GFP_NOFS, 778 GFP_NOFS,
673 DISCARD_FL_BARRIER); 779 BLKDEV_IFL_WAIT |
780 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 781 if (ret < 0)
675 return ret; 782 return ret;
676 nblocks = 0; 783 nblocks = 0;
@@ -680,7 +787,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 787 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 788 start * sects_per_block,
682 nblocks * sects_per_block, 789 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 790 GFP_NOFS,
791 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
684 return ret; 792 return ret;
685} 793}
686 794
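
The two discard hunks above do the same conversion: a run of filesystem blocks is translated into 512-byte device sectors before blkdev_issue_discard() is called, and the 2.6.36 API takes BLKDEV_IFL_* flags where DISCARD_FL_BARRIER used to go. A minimal sketch of that call, assuming 512-byte sectors and using start_block/nblocks as hypothetical stand-in locals (ns_blocksize is the field this patch adds):

	/* Hedged sketch, not the patch itself: convert a block range into
	 * the sector range blkdev_issue_discard() expects. */
	sector_t sects_per_block = nilfs->ns_blocksize >> 9; /* assumes 512B sectors */
	int ret = blkdev_issue_discard(nilfs->ns_bdev,
				       start_block * sects_per_block,
				       nblocks * sects_per_block,
				       GFP_NOFS,
				       BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);

With BLKDEV_IFL_WAIT the call does not return until the discard completes, which matches the synchronous error handling in the loop above.
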
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab974533697..f785a7b0ab99 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -57,7 +57,8 @@ enum {
57 * @ns_current: back pointer to current mount 57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 58 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 59 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super blocks 60 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block
61 * @ns_sbsize: size of valid data in super block 62 * @ns_sbsize: size of valid data in super block
62 * @ns_supers: list of nilfs super block structs 63 * @ns_supers: list of nilfs super block structs
63 * @ns_seg_seq: segment sequence counter 64 * @ns_seg_seq: segment sequence counter
@@ -73,7 +74,7 @@ enum {
73 * @ns_last_seq: sequence value of the latest segment 74 * @ns_last_seq: sequence value of the latest segment
74 * @ns_last_cno: checkpoint number of the latest segment 75 * @ns_last_cno: checkpoint number of the latest segment
75 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 76 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
76 * @ns_free_segments_count: counter of free segments 77 * @ns_prev_seq: base sequence number used to decide if advance log cursor
77 * @ns_segctor_sem: segment constructor semaphore 78 * @ns_segctor_sem: segment constructor semaphore
78 * @ns_dat: DAT file inode 79 * @ns_dat: DAT file inode
79 * @ns_cpfile: checkpoint file inode 80 * @ns_cpfile: checkpoint file inode
@@ -82,6 +83,7 @@ enum {
82 * @ns_gc_inodes: dummy inodes to keep live blocks 83 * @ns_gc_inodes: dummy inodes to keep live blocks
83 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks 84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
84 * @ns_blocksize_bits: bit length of block size 85 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size
85 * @ns_nsegments: number of segments in filesystem 87 * @ns_nsegments: number of segments in filesystem
86 * @ns_blocks_per_segment: number of blocks per segment 88 * @ns_blocks_per_segment: number of blocks per segment
87 * @ns_r_segments_percentage: reserved segments percentage 89 * @ns_r_segments_percentage: reserved segments percentage
@@ -119,7 +121,8 @@ struct the_nilfs {
119 */ 121 */
120 struct buffer_head *ns_sbh[2]; 122 struct buffer_head *ns_sbh[2];
121 struct nilfs_super_block *ns_sbp[2]; 123 struct nilfs_super_block *ns_sbp[2];
122 time_t ns_sbwtime[2]; 124 time_t ns_sbwtime;
125 unsigned ns_sbwcount;
123 unsigned ns_sbsize; 126 unsigned ns_sbsize;
124 unsigned ns_mount_state; 127 unsigned ns_mount_state;
125 128
@@ -149,7 +152,7 @@ struct the_nilfs {
149 u64 ns_last_seq; 152 u64 ns_last_seq;
150 __u64 ns_last_cno; 153 __u64 ns_last_cno;
151 u64 ns_prot_seq; 154 u64 ns_prot_seq;
152 unsigned long ns_free_segments_count; 155 u64 ns_prev_seq;
153 156
154 struct rw_semaphore ns_segctor_sem; 157 struct rw_semaphore ns_segctor_sem;
155 158
@@ -168,6 +171,7 @@ struct the_nilfs {
168 171
169 /* Disk layout information (static) */ 172 /* Disk layout information (static) */
170 unsigned int ns_blocksize_bits; 173 unsigned int ns_blocksize_bits;
174 unsigned int ns_blocksize;
171 unsigned long ns_nsegments; 175 unsigned long ns_nsegments;
172 unsigned long ns_blocks_per_segment; 176 unsigned long ns_blocks_per_segment;
173 unsigned long ns_r_segments_percentage; 177 unsigned long ns_r_segments_percentage;
@@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
203 207
204/* Minimum interval of periodical update of superblocks (in seconds) */ 208/* Minimum interval of periodical update of superblocks (in seconds) */
205#define NILFS_SB_FREQ 10 209#define NILFS_SB_FREQ 10
206#define NILFS_ALTSB_FREQ 60 /* spare superblock */
207 210
208static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) 211static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
209{ 212{
210 u64 t = get_seconds(); 213 u64 t = get_seconds();
211 return t < nilfs->ns_sbwtime[0] || 214 return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
212 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
213} 215}
214 216
215static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) 217static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
216{ 218{
217 u64 t = get_seconds(); 219 int flip_bits = nilfs->ns_sbwcount & 0x0FL;
218 struct nilfs_super_block **sbp = nilfs->ns_sbp; 220 return (flip_bits != 0x08 && flip_bits != 0x0F);
219 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
220} 221}
221 222
222void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
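
nilfs_sb_will_flip() replaces the old time-based NILFS_ALTSB_FREQ policy with a counter-based one: the decision depends only on the low four bits of ns_sbwcount, and the flip is skipped exactly when those bits are 0x08 or 0x0F. A small standalone program (illustration only, not kernel code) makes the 16-write cycle visible:

	#include <stdio.h>

	/* Re-statement of nilfs_sb_will_flip() for illustration. */
	static int sb_will_flip(unsigned int sbwcount)
	{
		unsigned int flip_bits = sbwcount & 0x0F;
		return flip_bits != 0x08 && flip_bits != 0x0F;
	}

	int main(void)
	{
		unsigned int c;

		for (c = 0; c < 16; c++)	/* one full counter cycle */
			printf("write %2u: %s\n", c,
			       sb_will_flip(c) ? "flip" : "no flip");
		return 0;
	}

So 14 of every 16 periodic superblock writes alternate targets, keeping the spare copy fresh without tracking a second write time.
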
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index dffbb0911d02..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,3 +3,4 @@ config FSNOTIFY
3 3
4source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
6#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 0922cc826c46..ae5f33a6d868 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,6 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o
2 3
3obj-y += dnotify/ 4obj-y += dnotify/
4obj-y += inotify/ 5obj-y += inotify/
6obj-y += fanotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 7e54e52964dd..3344bdd5506e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,17 +29,17 @@
29int dir_notify_enable __read_mostly = 1; 29int dir_notify_enable __read_mostly = 1;
30 30
31static struct kmem_cache *dnotify_struct_cache __read_mostly; 31static struct kmem_cache *dnotify_struct_cache __read_mostly;
32static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; 32static struct kmem_cache *dnotify_mark_cache __read_mostly;
33static struct fsnotify_group *dnotify_group __read_mostly; 33static struct fsnotify_group *dnotify_group __read_mostly;
34static DEFINE_MUTEX(dnotify_mark_mutex); 34static DEFINE_MUTEX(dnotify_mark_mutex);
35 35
36/* 36/*
37 * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which 37 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
38 * is being watched by dnotify. If multiple userspace applications are watching 38 * is being watched by dnotify. If multiple userspace applications are watching
39 * the same directory with dnotify their information is chained in dn 39 * the same directory with dnotify their information is chained in dn
40 */ 40 */
41struct dnotify_mark_entry { 41struct dnotify_mark {
42 struct fsnotify_mark_entry fsn_entry; 42 struct fsnotify_mark fsn_mark;
43 struct dnotify_struct *dn; 43 struct dnotify_struct *dn;
44}; 44};
45 45
@@ -51,27 +51,27 @@ struct dnotify_mark_entry {
51 * it calls the fsnotify function so it can update the set of all events relevant 51 * it calls the fsnotify function so it can update the set of all events relevant
52 * to this inode. 52 * to this inode.
53 */ 53 */
54static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) 54static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
55{ 55{
56 __u32 new_mask, old_mask; 56 __u32 new_mask, old_mask;
57 struct dnotify_struct *dn; 57 struct dnotify_struct *dn;
58 struct dnotify_mark_entry *dnentry = container_of(entry, 58 struct dnotify_mark *dn_mark = container_of(fsn_mark,
59 struct dnotify_mark_entry, 59 struct dnotify_mark,
60 fsn_entry); 60 fsn_mark);
61 61
62 assert_spin_locked(&entry->lock); 62 assert_spin_locked(&fsn_mark->lock);
63 63
64 old_mask = entry->mask; 64 old_mask = fsn_mark->mask;
65 new_mask = 0; 65 new_mask = 0;
66 for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) 66 for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); 67 new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
68 entry->mask = new_mask; 68 fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
69 69
70 if (old_mask == new_mask) 70 if (old_mask == new_mask)
71 return; 71 return;
72 72
73 if (entry->inode) 73 if (fsn_mark->i.inode)
74 fsnotify_recalc_inode_mask(entry->inode); 74 fsnotify_recalc_inode_mask(fsn_mark->i.inode);
75} 75}
76 76
77/* 77/*
@@ -83,29 +83,25 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
83 * events. 83 * events.
84 */ 84 */
85static int dnotify_handle_event(struct fsnotify_group *group, 85static int dnotify_handle_event(struct fsnotify_group *group,
86 struct fsnotify_mark *inode_mark,
87 struct fsnotify_mark *vfsmount_mark,
86 struct fsnotify_event *event) 88 struct fsnotify_event *event)
87{ 89{
88 struct fsnotify_mark_entry *entry = NULL; 90 struct dnotify_mark *dn_mark;
89 struct dnotify_mark_entry *dnentry;
90 struct inode *to_tell; 91 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
95 96
96 to_tell = event->to_tell; 97 BUG_ON(vfsmount_mark);
97 98
98 spin_lock(&to_tell->i_lock); 99 to_tell = event->to_tell;
99 entry = fsnotify_find_mark_entry(group, to_tell);
100 spin_unlock(&to_tell->i_lock);
101 100
102 /* unlikely since we already passed dnotify_should_send_event() */ 101 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
103 if (unlikely(!entry))
104 return 0;
105 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
106 102
107 spin_lock(&entry->lock); 103 spin_lock(&inode_mark->lock);
108 prev = &dnentry->dn; 104 prev = &dn_mark->dn;
109 while ((dn = *prev) != NULL) { 105 while ((dn = *prev) != NULL) {
110 if ((dn->dn_mask & test_mask) == 0) { 106 if ((dn->dn_mask & test_mask) == 0) {
111 prev = &dn->dn_next; 107 prev = &dn->dn_next;
@@ -118,12 +114,11 @@ static int dnotify_handle_event(struct fsnotify_group *group,
118 else { 114 else {
119 *prev = dn->dn_next; 115 *prev = dn->dn_next;
120 kmem_cache_free(dnotify_struct_cache, dn); 116 kmem_cache_free(dnotify_struct_cache, dn);
121 dnotify_recalc_inode_mask(entry); 117 dnotify_recalc_inode_mask(inode_mark);
122 } 118 }
123 } 119 }
124 120
125 spin_unlock(&entry->lock); 121 spin_unlock(&inode_mark->lock);
126 fsnotify_put_mark(entry);
127 122
128 return 0; 123 return 0;
129} 124}
@@ -133,44 +128,27 @@ static int dnotify_handle_event(struct fsnotify_group *group,
133 * userspace notification for that pair. 128 * userspace notification for that pair.
134 */ 129 */
135static bool dnotify_should_send_event(struct fsnotify_group *group, 130static bool dnotify_should_send_event(struct fsnotify_group *group,
136 struct inode *inode, __u32 mask) 131 struct inode *inode,
132 struct fsnotify_mark *inode_mark,
133 struct fsnotify_mark *vfsmount_mark,
134 __u32 mask, void *data, int data_type)
137{ 135{
138 struct fsnotify_mark_entry *entry;
139 bool send;
140
141 /* !dir_notify_enable should never get here, don't waste time checking
142 if (!dir_notify_enable)
143 return 0; */
144
145 /* not a dir, dnotify doesn't care */ 136 /* not a dir, dnotify doesn't care */
146 if (!S_ISDIR(inode->i_mode)) 137 if (!S_ISDIR(inode->i_mode))
147 return false; 138 return false;
148 139
149 spin_lock(&inode->i_lock); 140 return true;
150 entry = fsnotify_find_mark_entry(group, inode);
151 spin_unlock(&inode->i_lock);
152
153 /* no mark means no dnotify watch */
154 if (!entry)
155 return false;
156
157 mask = (mask & ~FS_EVENT_ON_CHILD);
158 send = (mask & entry->mask);
159
160 fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
161
162 return send;
163} 141}
164 142
165static void dnotify_free_mark(struct fsnotify_mark_entry *entry) 143static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
166{ 144{
167 struct dnotify_mark_entry *dnentry = container_of(entry, 145 struct dnotify_mark *dn_mark = container_of(fsn_mark,
168 struct dnotify_mark_entry, 146 struct dnotify_mark,
169 fsn_entry); 147 fsn_mark);
170 148
171 BUG_ON(dnentry->dn); 149 BUG_ON(dn_mark->dn);
172 150
173 kmem_cache_free(dnotify_mark_entry_cache, dnentry); 151 kmem_cache_free(dnotify_mark_cache, dn_mark);
174} 152}
175 153
176static struct fsnotify_ops dnotify_fsnotify_ops = { 154static struct fsnotify_ops dnotify_fsnotify_ops = {
@@ -183,15 +161,15 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
183 161
184/* 162/*
185 * Called every time a file is closed. Looks first for a dnotify mark on the 163 * Called every time a file is closed. Looks first for a dnotify mark on the
186 * inode. If one is found run all of the ->dn entries attached to that 164 * inode. If one is found run all of the ->dn structures attached to that
187 * mark for one relevant to this process closing the file and remove that 165 * mark for one relevant to this process closing the file and remove that
188 * dnotify_struct. If that was the last dnotify_struct also remove the 166 * dnotify_struct. If that was the last dnotify_struct also remove the
189 * fsnotify_mark_entry. 167 * fsnotify_mark.
190 */ 168 */
191void dnotify_flush(struct file *filp, fl_owner_t id) 169void dnotify_flush(struct file *filp, fl_owner_t id)
192{ 170{
193 struct fsnotify_mark_entry *entry; 171 struct fsnotify_mark *fsn_mark;
194 struct dnotify_mark_entry *dnentry; 172 struct dnotify_mark *dn_mark;
195 struct dnotify_struct *dn; 173 struct dnotify_struct *dn;
196 struct dnotify_struct **prev; 174 struct dnotify_struct **prev;
197 struct inode *inode; 175 struct inode *inode;
@@ -200,38 +178,34 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
200 if (!S_ISDIR(inode->i_mode)) 178 if (!S_ISDIR(inode->i_mode))
201 return; 179 return;
202 180
203 spin_lock(&inode->i_lock); 181 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
204 entry = fsnotify_find_mark_entry(dnotify_group, inode); 182 if (!fsn_mark)
205 spin_unlock(&inode->i_lock);
206 if (!entry)
207 return; 183 return;
208 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); 184 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
209 185
210 mutex_lock(&dnotify_mark_mutex); 186 mutex_lock(&dnotify_mark_mutex);
211 187
212 spin_lock(&entry->lock); 188 spin_lock(&fsn_mark->lock);
213 prev = &dnentry->dn; 189 prev = &dn_mark->dn;
214 while ((dn = *prev) != NULL) { 190 while ((dn = *prev) != NULL) {
215 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { 191 if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
216 *prev = dn->dn_next; 192 *prev = dn->dn_next;
217 kmem_cache_free(dnotify_struct_cache, dn); 193 kmem_cache_free(dnotify_struct_cache, dn);
218 dnotify_recalc_inode_mask(entry); 194 dnotify_recalc_inode_mask(fsn_mark);
219 break; 195 break;
220 } 196 }
221 prev = &dn->dn_next; 197 prev = &dn->dn_next;
222 } 198 }
223 199
224 spin_unlock(&entry->lock); 200 spin_unlock(&fsn_mark->lock);
225 201
226 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 202 /* nothing else could have found us thanks to the dnotify_mark_mutex */
227 if (dnentry->dn == NULL) 203 if (dn_mark->dn == NULL)
228 fsnotify_destroy_mark_by_entry(entry); 204 fsnotify_destroy_mark(fsn_mark);
229
230 fsnotify_recalc_group_mask(dnotify_group);
231 205
232 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_mark_mutex);
233 207
234 fsnotify_put_mark(entry); 208 fsnotify_put_mark(fsn_mark);
235} 209}
236 210
237/* this conversion is done only at watch creation */ 211/* this conversion is done only at watch creation */
@@ -259,16 +233,16 @@ static __u32 convert_arg(unsigned long arg)
259 233
260/* 234/*
261 * If multiple processes watch the same inode with dnotify there is only one 235 * If multiple processes watch the same inode with dnotify there is only one
262 * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct 236 * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
263 * onto that mark. This function either attaches the new dnotify_struct onto 237 * onto that mark. This function either attaches the new dnotify_struct onto
264 * that list, or it |= the mask onto an existing dnotify_struct. 238 * that list, or it |= the mask onto an existing dnotify_struct.
265 */ 239 */
266static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, 240static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
267 fl_owner_t id, int fd, struct file *filp, __u32 mask) 241 fl_owner_t id, int fd, struct file *filp, __u32 mask)
268{ 242{
269 struct dnotify_struct *odn; 243 struct dnotify_struct *odn;
270 244
271 odn = dnentry->dn; 245 odn = dn_mark->dn;
272 while (odn != NULL) { 246 while (odn != NULL) {
273 /* adding more events to existing dnotify_struct? */ 247 /* adding more events to existing dnotify_struct? */
274 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { 248 if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
@@ -283,8 +257,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
283 dn->dn_fd = fd; 257 dn->dn_fd = fd;
284 dn->dn_filp = filp; 258 dn->dn_filp = filp;
285 dn->dn_owner = id; 259 dn->dn_owner = id;
286 dn->dn_next = dnentry->dn; 260 dn->dn_next = dn_mark->dn;
287 dnentry->dn = dn; 261 dn_mark->dn = dn;
288 262
289 return 0; 263 return 0;
290} 264}
@@ -296,8 +270,8 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnent
296 */ 270 */
297int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) 271int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
298{ 272{
299 struct dnotify_mark_entry *new_dnentry, *dnentry; 273 struct dnotify_mark *new_dn_mark, *dn_mark;
300 struct fsnotify_mark_entry *new_entry, *entry; 274 struct fsnotify_mark *new_fsn_mark, *fsn_mark;
301 struct dnotify_struct *dn; 275 struct dnotify_struct *dn;
302 struct inode *inode; 276 struct inode *inode;
303 fl_owner_t id = current->files; 277 fl_owner_t id = current->files;
@@ -306,7 +280,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
306 __u32 mask; 280 __u32 mask;
307 281
308 /* we use these to tell if we need to kfree */ 282 /* we use these to tell if we need to kfree */
309 new_entry = NULL; 283 new_fsn_mark = NULL;
310 dn = NULL; 284 dn = NULL;
311 285
312 if (!dir_notify_enable) { 286 if (!dir_notify_enable) {
@@ -336,8 +310,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
336 } 310 }
337 311
338 /* new fsnotify mark, we expect most fcntl calls to add a new mark */ 312 /* new fsnotify mark, we expect most fcntl calls to add a new mark */
339 new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); 313 new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
340 if (!new_dnentry) { 314 if (!new_dn_mark) {
341 error = -ENOMEM; 315 error = -ENOMEM;
342 goto out_err; 316 goto out_err;
343 } 317 }
@@ -345,29 +319,27 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
345 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ 319 /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
346 mask = convert_arg(arg); 320 mask = convert_arg(arg);
347 321
348 /* set up the new_entry and new_dnentry */ 322 /* set up the new_fsn_mark and new_dn_mark */
349 new_entry = &new_dnentry->fsn_entry; 323 new_fsn_mark = &new_dn_mark->fsn_mark;
350 fsnotify_init_mark(new_entry, dnotify_free_mark); 324 fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
351 new_entry->mask = mask; 325 new_fsn_mark->mask = mask;
352 new_dnentry->dn = NULL; 326 new_dn_mark->dn = NULL;
353 327
354 /* this is needed to prevent the fcntl/close race described below */ 328 /* this is needed to prevent the fcntl/close race described below */
355 mutex_lock(&dnotify_mark_mutex); 329 mutex_lock(&dnotify_mark_mutex);
356 330
357 /* add the new_entry or find an old one. */ 331 /* add the new_fsn_mark or find an old one. */
358 spin_lock(&inode->i_lock); 332 fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
359 entry = fsnotify_find_mark_entry(dnotify_group, inode); 333 if (fsn_mark) {
360 spin_unlock(&inode->i_lock); 334 dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
361 if (entry) { 335 spin_lock(&fsn_mark->lock);
362 dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
363 spin_lock(&entry->lock);
364 } else { 336 } else {
365 fsnotify_add_mark(new_entry, dnotify_group, inode); 337 fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
366 spin_lock(&new_entry->lock); 338 spin_lock(&new_fsn_mark->lock);
367 entry = new_entry; 339 fsn_mark = new_fsn_mark;
368 dnentry = new_dnentry; 340 dn_mark = new_dn_mark;
369 /* we used new_entry, so don't free it */ 341 /* we used new_fsn_mark, so don't free it */
370 new_entry = NULL; 342 new_fsn_mark = NULL;
371 } 343 }
372 344
373 rcu_read_lock(); 345 rcu_read_lock();
@@ -376,17 +348,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
376 348
377 /* if (f != filp) means that we lost a race and another task/thread 349 /* if (f != filp) means that we lost a race and another task/thread
378 * actually closed the fd we are still playing with before we grabbed 350 * actually closed the fd we are still playing with before we grabbed
379 * the dnotify_mark_mutex and entry->lock. Since closing the fd is the 351 * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the
380 * only time we clean up the mark entries we need to get our mark off 352 * only time we clean up the marks we need to get our mark off
381 * the list. */ 353 * the list. */
382 if (f != filp) { 354 if (f != filp) {
383 /* if we added ourselves, shoot ourselves, it's possible that 355 /* if we added ourselves, shoot ourselves, it's possible that
384 * the flush actually did shoot this entry. That's fine too 356 * the flush actually did shoot this fsn_mark. That's fine too
385 * since multiple calls to destroy_mark are perfectly safe, if 357 * since multiple calls to destroy_mark are perfectly safe, if
386 * we found a dnentry already attached to the inode, just sod 358 * we found a dn_mark already attached to the inode, just sod
387 * off silently as the flush at close time dealt with it. 359 * off silently as the flush at close time dealt with it.
388 */ 360 */
389 if (dnentry == new_dnentry) 361 if (dn_mark == new_dn_mark)
390 destroy = 1; 362 destroy = 1;
391 goto out; 363 goto out;
392 } 364 }
@@ -394,13 +366,13 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
394 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 366 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
395 if (error) { 367 if (error) {
396 /* if we added, we must shoot */ 368 /* if we added, we must shoot */
397 if (dnentry == new_dnentry) 369 if (dn_mark == new_dn_mark)
398 destroy = 1; 370 destroy = 1;
399 goto out; 371 goto out;
400 } 372 }
401 373
402 error = attach_dn(dn, dnentry, id, fd, filp, mask); 374 error = attach_dn(dn, dn_mark, id, fd, filp, mask);
403 /* !error means that we attached the dn to the dnentry, so don't free it */ 375 /* !error means that we attached the dn to the dn_mark, so don't free it */
404 if (!error) 376 if (!error)
405 dn = NULL; 377 dn = NULL;
406 /* -EEXIST means that we didn't add this new dn and used an old one. 378 /* -EEXIST means that we didn't add this new dn and used an old one.
@@ -408,20 +380,18 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
408 else if (error == -EEXIST) 380 else if (error == -EEXIST)
409 error = 0; 381 error = 0;
410 382
411 dnotify_recalc_inode_mask(entry); 383 dnotify_recalc_inode_mask(fsn_mark);
412out: 384out:
413 spin_unlock(&entry->lock); 385 spin_unlock(&fsn_mark->lock);
414 386
415 if (destroy) 387 if (destroy)
416 fsnotify_destroy_mark_by_entry(entry); 388 fsnotify_destroy_mark(fsn_mark);
417
418 fsnotify_recalc_group_mask(dnotify_group);
419 389
420 mutex_unlock(&dnotify_mark_mutex); 390 mutex_unlock(&dnotify_mark_mutex);
421 fsnotify_put_mark(entry); 391 fsnotify_put_mark(fsn_mark);
422out_err: 392out_err:
423 if (new_entry) 393 if (new_fsn_mark)
424 fsnotify_put_mark(new_entry); 394 fsnotify_put_mark(new_fsn_mark);
425 if (dn) 395 if (dn)
426 kmem_cache_free(dnotify_struct_cache, dn); 396 kmem_cache_free(dnotify_struct_cache, dn);
427 return error; 397 return error;
@@ -430,10 +400,9 @@ out_err:
430static int __init dnotify_init(void) 400static int __init dnotify_init(void)
431{ 401{
432 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); 402 dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
433 dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); 403 dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
434 404
435 dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, 405 dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
436 0, &dnotify_fsnotify_ops);
437 if (IS_ERR(dnotify_group)) 406 if (IS_ERR(dnotify_group))
438 panic("unable to allocate fsnotify group for dnotify\n"); 407 panic("unable to allocate fsnotify group for dnotify\n");
439 return 0; 408 return 0;
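
The dnotify changes are mostly a mechanical rename (fsnotify_mark_entry to fsnotify_mark), but they keep the embed-and-recover idiom the whole file relies on: the generic mark lives inside the dnotify-specific structure and is converted back with container_of(). A self-contained sketch of the idiom with simplified stand-in types (these are not the kernel structures):

	#include <stddef.h>

	struct fsnotify_mark { unsigned int mask; };	/* generic part */
	struct dnotify_mark {
		struct fsnotify_mark fsn_mark;	/* embedded, recoverable */
		void *dn;			/* dnotify-private chain */
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* Given the generic mark fsnotify hands back, recover the wrapper. */
	static struct dnotify_mark *to_dn_mark(struct fsnotify_mark *fsn_mark)
	{
		return container_of(fsn_mark, struct dnotify_mark, fsn_mark);
	}

Because the callbacks now receive the mark directly (inode_mark in dnotify_handle_event), the old find-then-put lookup under inode->i_lock disappears along with its extra reference counting.
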
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
new file mode 100644
index 000000000000..3ac36b7bf6b9
--- /dev/null
+++ b/fs/notify/fanotify/Kconfig
@@ -0,0 +1,26 @@
1config FANOTIFY
2 bool "Filesystem wide access notification"
3 select FSNOTIFY
4 select ANON_INODES
5 default n
6 ---help---
7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends
9 an open file descriptor to the userspace listener along with
10 the event.
11
12 If unsure, say Y.
13
14config FANOTIFY_ACCESS_PERMISSIONS
15 bool "fanotify permissions checking"
16 depends on FANOTIFY
17 depends on SECURITY
18 default n
19 ---help---
20 Say Y here if you want fanotify listeners to be able to make permission
21 decisions concerning filesystem events. This is used by some fanotify
22 listeners which need to scan files before allowing the system access to
23 use those files. This is used by some anti-malware vendors and by some
24 hierarchical storage management systems.
25
26 If unsure, say N.
diff --git a/fs/notify/fanotify/Makefile b/fs/notify/fanotify/Makefile
new file mode 100644
index 000000000000..0999213e7e6e
--- /dev/null
+++ b/fs/notify/fanotify/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_FANOTIFY) += fanotify.o fanotify_user.o
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
new file mode 100644
index 000000000000..85366c78cc37
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.c
@@ -0,0 +1,209 @@
1#include <linux/fanotify.h>
2#include <linux/fdtable.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/init.h>
5#include <linux/jiffies.h>
6#include <linux/kernel.h> /* UINT_MAX */
7#include <linux/mount.h>
8#include <linux/sched.h>
9#include <linux/types.h>
10#include <linux/wait.h>
11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
13{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new);
15
16 if (old->to_tell == new->to_tell &&
17 old->data_type == new->data_type &&
18 old->tgid == new->tgid) {
19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH):
21 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry))
23 return true;
24 case (FSNOTIFY_EVENT_NONE):
25 return true;
26 default:
27 BUG();
28 };
29 }
30 return false;
31}
32
33/* and the list better be locked by something too! */
34static struct fsnotify_event *fanotify_merge(struct list_head *list,
35 struct fsnotify_event *event)
36{
37 struct fsnotify_event_holder *test_holder;
38 struct fsnotify_event *test_event = NULL;
39 struct fsnotify_event *new_event;
40
41 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
42
43
44 list_for_each_entry_reverse(test_holder, list, event_list) {
45 if (should_merge(test_holder->event, event)) {
46 test_event = test_holder->event;
47 break;
48 }
49 }
50
51 if (!test_event)
52 return NULL;
53
54 fsnotify_get_event(test_event);
55
56 /* if they are exactly the same we are done */
57 if (test_event->mask == event->mask)
58 return test_event;
59
60 /*
61 * if the refcnt == 2 this is the only queue
62 * for this event and so we can update the mask
63 * in place.
64 */
65 if (atomic_read(&test_event->refcnt) == 2) {
66 test_event->mask |= event->mask;
67 return test_event;
68 }
69
70 new_event = fsnotify_clone_event(test_event);
71
72 /* done with test_event */
73 fsnotify_put_event(test_event);
74
75 /* couldn't allocate memory, merge was not possible */
76 if (unlikely(!new_event))
77 return ERR_PTR(-ENOMEM);
78
79 /* build new event and replace it on the list */
80 new_event->mask = (test_event->mask | event->mask);
81 fsnotify_replace_event(test_holder, new_event);
82
83 /* we hold a reference on new_event from clone_event */
84 return new_event;
85}
86
87#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
88static int fanotify_get_response_from_access(struct fsnotify_group *group,
89 struct fsnotify_event *event)
90{
91 int ret;
92
93 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
94
95 wait_event(group->fanotify_data.access_waitq, event->response);
96
97 /* userspace responded, convert to something usable */
98 spin_lock(&event->lock);
99 switch (event->response) {
100 case FAN_ALLOW:
101 ret = 0;
102 break;
103 case FAN_DENY:
104 default:
105 ret = -EPERM;
106 }
107 event->response = 0;
108 spin_unlock(&event->lock);
109
110 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
111 group, event, ret);
112
113 return ret;
114}
115#endif
116
117static int fanotify_handle_event(struct fsnotify_group *group,
118 struct fsnotify_mark *inode_mark,
119 struct fsnotify_mark *fanotify_mark,
120 struct fsnotify_event *event)
121{
122 int ret = 0;
123 struct fsnotify_event *notify_event = NULL;
124
125 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
126 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
127 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
128 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
129 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
130 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
131 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
132 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
133 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
134
135 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
136
137 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
138 if (IS_ERR(notify_event))
139 return PTR_ERR(notify_event);
140
141#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
142 if (event->mask & FAN_ALL_PERM_EVENTS) {
143 /* if we merged we need to wait on the new event */
144 if (notify_event)
145 event = notify_event;
146 ret = fanotify_get_response_from_access(group, event);
147 }
148#endif
149
150 if (notify_event)
151 fsnotify_put_event(notify_event);
152
153 return ret;
154}
155
156static bool fanotify_should_send_event(struct fsnotify_group *group,
157 struct inode *to_tell,
158 struct fsnotify_mark *inode_mark,
159 struct fsnotify_mark *vfsmnt_mark,
160 __u32 event_mask, void *data, int data_type)
161{
162 __u32 marks_mask, marks_ignored_mask;
163
164 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
165 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
166 inode_mark, vfsmnt_mark, event_mask, data, data_type);
167
168 /* sorry, fanotify only gives a damn about files and dirs */
169 if (!S_ISREG(to_tell->i_mode) &&
170 !S_ISDIR(to_tell->i_mode))
171 return false;
172
173 /* if we don't have enough info to send an event to userspace say no */
174 if (data_type != FSNOTIFY_EVENT_PATH)
175 return false;
176
177 if (inode_mark && vfsmnt_mark) {
178 marks_mask = (vfsmnt_mark->mask | inode_mark->mask);
179 marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask);
180 } else if (inode_mark) {
181 /*
182 * if the event is for a child and this inode doesn't care about
183 * events on the child, don't send it!
184 */
185 if ((event_mask & FS_EVENT_ON_CHILD) &&
186 !(inode_mark->mask & FS_EVENT_ON_CHILD))
187 return false;
188 marks_mask = inode_mark->mask;
189 marks_ignored_mask = inode_mark->ignored_mask;
190 } else if (vfsmnt_mark) {
191 marks_mask = vfsmnt_mark->mask;
192 marks_ignored_mask = vfsmnt_mark->ignored_mask;
193 } else {
194 BUG();
195 }
196
197 if (event_mask & marks_mask & ~marks_ignored_mask)
198 return true;
199
200 return false;
201}
202
203const struct fsnotify_ops fanotify_fsnotify_ops = {
204 .handle_event = fanotify_handle_event,
205 .should_send_event = fanotify_should_send_event,
206 .free_group_priv = NULL,
207 .free_event_priv = NULL,
208 .freeing_mark = NULL,
209};
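
fanotify_merge() walks the notification list backwards and, when the incoming event matches a queued one on target, data type, and tgid, widens the queued event's mask instead of enqueueing a duplicate; it only clones first when other holders still reference the old event (refcnt != 2). The coalescing rule in miniature, with stand-in types rather than the kernel structures:

	/* Illustration only: same-target events OR their masks. */
	struct ev {
		const void *object;	/* stands in for to_tell/path identity */
		unsigned int mask;
	};

	static int try_merge(struct ev *queued, const struct ev *incoming)
	{
		if (queued->object != incoming->object)
			return 0;		/* different target: keep both */
		queued->mask |= incoming->mask;	/* same target: widen the mask */
		return 1;			/* incoming can be dropped */
	}

This is what keeps a burst of accesses to one file from flooding the queue with near-identical events.
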
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
new file mode 100644
index 000000000000..5ed8e58d7bfc
--- /dev/null
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -0,0 +1,787 @@
1#include <linux/fanotify.h>
2#include <linux/fcntl.h>
3#include <linux/file.h>
4#include <linux/fs.h>
5#include <linux/anon_inodes.h>
6#include <linux/fsnotify_backend.h>
7#include <linux/init.h>
8#include <linux/mount.h>
9#include <linux/namei.h>
10#include <linux/poll.h>
11#include <linux/security.h>
12#include <linux/syscalls.h>
13#include <linux/slab.h>
14#include <linux/types.h>
15#include <linux/uaccess.h>
16
17#include <asm/ioctls.h>
18
19extern const struct fsnotify_ops fanotify_fsnotify_ops;
20
21static struct kmem_cache *fanotify_mark_cache __read_mostly;
22static struct kmem_cache *fanotify_response_event_cache __read_mostly;
23
24struct fanotify_response_event {
25 struct list_head list;
26 __s32 fd;
27 struct fsnotify_event *event;
28};
29
30/*
31 * Get an fsnotify notification event if one exists and is small
32 * enough to fit in "count". Return an error pointer if the count
33 * is not large enough.
34 *
35 * Called with the group->notification_mutex held.
36 */
37static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
38 size_t count)
39{
40 BUG_ON(!mutex_is_locked(&group->notification_mutex));
41
42 pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
43
44 if (fsnotify_notify_queue_is_empty(group))
45 return NULL;
46
47 if (FAN_EVENT_METADATA_LEN > count)
48 return ERR_PTR(-EINVAL);
49
50 /* held the notification_mutex the whole time, so this is the
51 * same event we peeked above */
52 return fsnotify_remove_notify_event(group);
53}
54
55static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
56{
57 int client_fd;
58 struct dentry *dentry;
59 struct vfsmount *mnt;
60 struct file *new_file;
61
62 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
63
64 client_fd = get_unused_fd();
65 if (client_fd < 0)
66 return client_fd;
67
68 if (event->data_type != FSNOTIFY_EVENT_PATH) {
69 WARN_ON(1);
70 put_unused_fd(client_fd);
71 return -EINVAL;
72 }
73
74 /*
75 * we need a new file handle for the userspace program so it can read even if it was
76 * originally opened O_WRONLY.
77 */
78 dentry = dget(event->path.dentry);
79 mnt = mntget(event->path.mnt);
80 /* it's possible this event was an overflow event. In that case dentry and mnt
81 * are NULL; that's fine, just don't call dentry_open */
82 if (dentry && mnt)
83 new_file = dentry_open(dentry, mnt,
84 group->fanotify_data.f_flags | FMODE_NONOTIFY,
85 current_cred());
86 else
87 new_file = ERR_PTR(-EOVERFLOW);
88 if (IS_ERR(new_file)) {
89 /*
90 * we still send an event even if we can't open the file. this
91 * can happen when say tasks are gone and we try to open their
92 * /proc files or we try to open a WRONLY file like in sysfs
93 * we just send the errno to userspace since there isn't much
94 * else we can do.
95 */
96 put_unused_fd(client_fd);
97 client_fd = PTR_ERR(new_file);
98 } else {
99 fd_install(client_fd, new_file);
100 }
101
102 return client_fd;
103}
104
105static ssize_t fill_event_metadata(struct fsnotify_group *group,
106 struct fanotify_event_metadata *metadata,
107 struct fsnotify_event *event)
108{
109 pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
110 group, metadata, event);
111
112 metadata->event_len = FAN_EVENT_METADATA_LEN;
113 metadata->vers = FANOTIFY_METADATA_VERSION;
114 metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
115 metadata->pid = pid_vnr(event->tgid);
116 metadata->fd = create_fd(group, event);
117
118 return metadata->fd;
119}
120
121#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
122static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
123 __s32 fd)
124{
125 struct fanotify_response_event *re, *return_re = NULL;
126
127 mutex_lock(&group->fanotify_data.access_mutex);
128 list_for_each_entry(re, &group->fanotify_data.access_list, list) {
129 if (re->fd != fd)
130 continue;
131
132 list_del_init(&re->list);
133 return_re = re;
134 break;
135 }
136 mutex_unlock(&group->fanotify_data.access_mutex);
137
138 pr_debug("%s: found return_re=%p\n", __func__, return_re);
139
140 return return_re;
141}
142
143static int process_access_response(struct fsnotify_group *group,
144 struct fanotify_response *response_struct)
145{
146 struct fanotify_response_event *re;
147 __s32 fd = response_struct->fd;
148 __u32 response = response_struct->response;
149
150 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
151 fd, response);
152 /*
153 * make sure the response is valid, if invalid we do nothing and either
154 * userspace can send a valid response or we will clean it up after the
155 * timeout
156 */
157 switch (response) {
158 case FAN_ALLOW:
159 case FAN_DENY:
160 break;
161 default:
162 return -EINVAL;
163 }
164
165 if (fd < 0)
166 return -EINVAL;
167
168 re = dequeue_re(group, fd);
169 if (!re)
170 return -ENOENT;
171
172 re->event->response = response;
173
174 wake_up(&group->fanotify_data.access_waitq);
175
176 kmem_cache_free(fanotify_response_event_cache, re);
177
178 return 0;
179}
180
181static int prepare_for_access_response(struct fsnotify_group *group,
182 struct fsnotify_event *event,
183 __s32 fd)
184{
185 struct fanotify_response_event *re;
186
187 if (!(event->mask & FAN_ALL_PERM_EVENTS))
188 return 0;
189
190 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
191 if (!re)
192 return -ENOMEM;
193
194 re->event = event;
195 re->fd = fd;
196
197 mutex_lock(&group->fanotify_data.access_mutex);
198
199 if (group->fanotify_data.bypass_perm) {
200 mutex_unlock(&group->fanotify_data.access_mutex);
201 kmem_cache_free(fanotify_response_event_cache, re);
202 event->response = FAN_ALLOW;
203 return 0;
204 }
205
206 list_add_tail(&re->list, &group->fanotify_data.access_list);
207 mutex_unlock(&group->fanotify_data.access_mutex);
208
209 return 0;
210}
211
212static void remove_access_response(struct fsnotify_group *group,
213 struct fsnotify_event *event,
214 __s32 fd)
215{
216 struct fanotify_response_event *re;
217
218 if (!(event->mask & FAN_ALL_PERM_EVENTS))
219 return;
220
221 re = dequeue_re(group, fd);
222 if (!re)
223 return;
224
225 BUG_ON(re->event != event);
226
227 kmem_cache_free(fanotify_response_event_cache, re);
228
229 return;
230}
231#else
232static int prepare_for_access_response(struct fsnotify_group *group,
233 struct fsnotify_event *event,
234 __s32 fd)
235{
236 return 0;
237}
238
239static void remove_access_response(struct fsnotify_group *group,
240 struct fsnotify_event *event,
241 __s32 fd)
242{
243 return;
244}
245#endif
246
247static ssize_t copy_event_to_user(struct fsnotify_group *group,
248 struct fsnotify_event *event,
249 char __user *buf)
250{
251 struct fanotify_event_metadata fanotify_event_metadata;
252 int fd, ret;
253
254 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
255
256 fd = fill_event_metadata(group, &fanotify_event_metadata, event);
257 if (fd < 0)
258 return fd;
259
260 ret = prepare_for_access_response(group, event, fd);
261 if (ret)
262 goto out_close_fd;
263
264 ret = -EFAULT;
265 if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN))
266 goto out_kill_access_response;
267
268 return FAN_EVENT_METADATA_LEN;
269
270out_kill_access_response:
271 remove_access_response(group, event, fd);
272out_close_fd:
273 sys_close(fd);
274 return ret;
275}
276
277/* fanotify userspace file descriptor functions */
278static unsigned int fanotify_poll(struct file *file, poll_table *wait)
279{
280 struct fsnotify_group *group = file->private_data;
281 int ret = 0;
282
283 poll_wait(file, &group->notification_waitq, wait);
284 mutex_lock(&group->notification_mutex);
285 if (!fsnotify_notify_queue_is_empty(group))
286 ret = POLLIN | POLLRDNORM;
287 mutex_unlock(&group->notification_mutex);
288
289 return ret;
290}
291
292static ssize_t fanotify_read(struct file *file, char __user *buf,
293 size_t count, loff_t *pos)
294{
295 struct fsnotify_group *group;
296 struct fsnotify_event *kevent;
297 char __user *start;
298 int ret;
299 DEFINE_WAIT(wait);
300
301 start = buf;
302 group = file->private_data;
303
304 pr_debug("%s: group=%p\n", __func__, group);
305
306 while (1) {
307 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
308
309 mutex_lock(&group->notification_mutex);
310 kevent = get_one_event(group, count);
311 mutex_unlock(&group->notification_mutex);
312
313 if (kevent) {
314 ret = PTR_ERR(kevent);
315 if (IS_ERR(kevent))
316 break;
317 ret = copy_event_to_user(group, kevent, buf);
318 fsnotify_put_event(kevent);
319 if (ret < 0)
320 break;
321 buf += ret;
322 count -= ret;
323 continue;
324 }
325
326 ret = -EAGAIN;
327 if (file->f_flags & O_NONBLOCK)
328 break;
329 ret = -EINTR;
330 if (signal_pending(current))
331 break;
332
333 if (start != buf)
334 break;
335
336 schedule();
337 }
338
339 finish_wait(&group->notification_waitq, &wait);
340 if (start != buf && ret != -EFAULT)
341 ret = buf - start;
342 return ret;
343}
344
345static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
346{
347#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
348 struct fanotify_response response = { .fd = -1, .response = -1 };
349 struct fsnotify_group *group;
350 int ret;
351
352 group = file->private_data;
353
354 if (count > sizeof(response))
355 count = sizeof(response);
356
357 pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
358
359 if (copy_from_user(&response, buf, count))
360 return -EFAULT;
361
362 ret = process_access_response(group, &response);
363 if (ret < 0)
364 count = ret;
365
366 return count;
367#else
368 return -EINVAL;
369#endif
370}
371
372static int fanotify_release(struct inode *ignored, struct file *file)
373{
374 struct fsnotify_group *group = file->private_data;
375 struct fanotify_response_event *re, *lre;
376
377 pr_debug("%s: file=%p group=%p\n", __func__, file, group);
378
379#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
380 mutex_lock(&group->fanotify_data.access_mutex);
381
382 group->fanotify_data.bypass_perm = true;
383
384 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
385 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
386 re, re->event);
387
388 list_del_init(&re->list);
389 re->event->response = FAN_ALLOW;
390
391 kmem_cache_free(fanotify_response_event_cache, re);
392 }
393 mutex_unlock(&group->fanotify_data.access_mutex);
394
395 wake_up(&group->fanotify_data.access_waitq);
396#endif
397 /* matches the fanotify_init->fsnotify_alloc_group */
398 fsnotify_put_group(group);
399
400 return 0;
401}
402
403static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
404{
405 struct fsnotify_group *group;
406 struct fsnotify_event_holder *holder;
407 void __user *p;
408 int ret = -ENOTTY;
409 size_t send_len = 0;
410
411 group = file->private_data;
412
413 p = (void __user *) arg;
414
415 switch (cmd) {
416 case FIONREAD:
417 mutex_lock(&group->notification_mutex);
418 list_for_each_entry(holder, &group->notification_list, event_list)
419 send_len += FAN_EVENT_METADATA_LEN;
420 mutex_unlock(&group->notification_mutex);
421 ret = put_user(send_len, (int __user *) p);
422 break;
423 }
424
425 return ret;
426}
427
428static const struct file_operations fanotify_fops = {
429 .poll = fanotify_poll,
430 .read = fanotify_read,
431 .write = fanotify_write,
432 .fasync = NULL,
433 .release = fanotify_release,
434 .unlocked_ioctl = fanotify_ioctl,
435 .compat_ioctl = fanotify_ioctl,
436};
437
438static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
439{
440 kmem_cache_free(fanotify_mark_cache, fsn_mark);
441}
442
443static int fanotify_find_path(int dfd, const char __user *filename,
444 struct path *path, unsigned int flags)
445{
446 int ret;
447
448 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
449 dfd, filename, flags);
450
451 if (filename == NULL) {
452 struct file *file;
453 int fput_needed;
454
455 ret = -EBADF;
456 file = fget_light(dfd, &fput_needed);
457 if (!file)
458 goto out;
459
460 ret = -ENOTDIR;
461 if ((flags & FAN_MARK_ONLYDIR) &&
462 !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
463 fput_light(file, fput_needed);
464 goto out;
465 }
466
467 *path = file->f_path;
468 path_get(path);
469 fput_light(file, fput_needed);
470 } else {
471 unsigned int lookup_flags = 0;
472
473 if (!(flags & FAN_MARK_DONT_FOLLOW))
474 lookup_flags |= LOOKUP_FOLLOW;
475 if (flags & FAN_MARK_ONLYDIR)
476 lookup_flags |= LOOKUP_DIRECTORY;
477
478 ret = user_path_at(dfd, filename, lookup_flags, path);
479 if (ret)
480 goto out;
481 }
482
483 /* you can only watch an inode if you have read permissions on it */
484 ret = inode_permission(path->dentry->d_inode, MAY_READ);
485 if (ret)
486 path_put(path);
487out:
488 return ret;
489}
490
491static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
492 __u32 mask,
493 unsigned int flags)
494{
495 __u32 oldmask;
496
497 spin_lock(&fsn_mark->lock);
498 if (!(flags & FAN_MARK_IGNORED_MASK)) {
499 oldmask = fsn_mark->mask;
500 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
501 } else {
502 oldmask = fsn_mark->ignored_mask;
503 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
504 }
505 spin_unlock(&fsn_mark->lock);
506
507 if (!(oldmask & ~mask))
508 fsnotify_destroy_mark(fsn_mark);
509
510 return mask & oldmask;
511}
512
513static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
514 struct vfsmount *mnt, __u32 mask,
515 unsigned int flags)
516{
517 struct fsnotify_mark *fsn_mark = NULL;
518 __u32 removed;
519
520 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
521 if (!fsn_mark)
522 return -ENOENT;
523
524 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
525 fsnotify_put_mark(fsn_mark);
526 if (removed & mnt->mnt_fsnotify_mask)
527 fsnotify_recalc_vfsmount_mask(mnt);
528
529 return 0;
530}
531
532static int fanotify_remove_inode_mark(struct fsnotify_group *group,
533 struct inode *inode, __u32 mask,
534 unsigned int flags)
535{
536 struct fsnotify_mark *fsn_mark = NULL;
537 __u32 removed;
538
539 fsn_mark = fsnotify_find_inode_mark(group, inode);
540 if (!fsn_mark)
541 return -ENOENT;
542
543 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags);
544 /* matches the fsnotify_find_inode_mark() */
545 fsnotify_put_mark(fsn_mark);
546 if (removed & inode->i_fsnotify_mask)
547 fsnotify_recalc_inode_mask(inode);
548
549 return 0;
550}
551
552static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
553 __u32 mask,
554 unsigned int flags)
555{
556 __u32 oldmask;
557
558 spin_lock(&fsn_mark->lock);
559 if (!(flags & FAN_MARK_IGNORED_MASK)) {
560 oldmask = fsn_mark->mask;
561 fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
562 } else {
563 oldmask = fsn_mark->ignored_mask;
564 fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask));
565 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
566 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
567 }
568 spin_unlock(&fsn_mark->lock);
569
570 return mask & ~oldmask;
571}
572
573static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
574 struct vfsmount *mnt, __u32 mask,
575 unsigned int flags)
576{
577 struct fsnotify_mark *fsn_mark;
578 __u32 added;
579
580 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
581 if (!fsn_mark) {
582 int ret;
583
584 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
585 if (!fsn_mark)
586 return -ENOMEM;
587
588 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
589 ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
590 if (ret) {
591 fanotify_free_mark(fsn_mark);
592 return ret;
593 }
594 }
595 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
596 fsnotify_put_mark(fsn_mark);
597 if (added & ~mnt->mnt_fsnotify_mask)
598 fsnotify_recalc_vfsmount_mask(mnt);
599
600 return 0;
601}
602
603static int fanotify_add_inode_mark(struct fsnotify_group *group,
604 struct inode *inode, __u32 mask,
605 unsigned int flags)
606{
607 struct fsnotify_mark *fsn_mark;
608 __u32 added;
609
610 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
611
612 fsn_mark = fsnotify_find_inode_mark(group, inode);
613 if (!fsn_mark) {
614 int ret;
615
616 fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
617 if (!fsn_mark)
618 return -ENOMEM;
619
620 fsnotify_init_mark(fsn_mark, fanotify_free_mark);
621 ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
622 if (ret) {
623 fanotify_free_mark(fsn_mark);
624 return ret;
625 }
626 }
627 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
628 fsnotify_put_mark(fsn_mark);
629 if (added & ~inode->i_fsnotify_mask)
630 fsnotify_recalc_inode_mask(inode);
631 return 0;
632}
633
634/* fanotify syscalls */
635SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
636{
637 struct fsnotify_group *group;
638 int f_flags, fd;
639
640 pr_debug("%s: flags=%d event_f_flags=%d\n",
641 __func__, flags, event_f_flags);
642
643 if (!capable(CAP_SYS_ADMIN))
644 return -EPERM;
645
646 if (flags & ~FAN_ALL_INIT_FLAGS)
647 return -EINVAL;
648
649 f_flags = O_RDWR | FMODE_NONOTIFY;
650 if (flags & FAN_CLOEXEC)
651 f_flags |= O_CLOEXEC;
652 if (flags & FAN_NONBLOCK)
653 f_flags |= O_NONBLOCK;
654
655 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
656 group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
657 if (IS_ERR(group))
658 return PTR_ERR(group);
659
660 group->fanotify_data.f_flags = event_f_flags;
661#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
662 mutex_init(&group->fanotify_data.access_mutex);
663 init_waitqueue_head(&group->fanotify_data.access_waitq);
664 INIT_LIST_HEAD(&group->fanotify_data.access_list);
665#endif
666
667 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
668 if (fd < 0)
669 goto out_put_group;
670
671 return fd;
672
673out_put_group:
674 fsnotify_put_group(group);
675 return fd;
676}
677
678SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
679 __u64 mask, int dfd,
680 const char __user * pathname)
681{
682 struct inode *inode = NULL;
683 struct vfsmount *mnt = NULL;
684 struct fsnotify_group *group;
685 struct file *filp;
686 struct path path;
687 int ret, fput_needed;
688
689 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
690 __func__, fanotify_fd, flags, dfd, pathname, mask);
691
692 /* we only use the lower 32 bits as of right now. */
693 if (mask & ((__u64)0xffffffff << 32))
694 return -EINVAL;
695
696 if (flags & ~FAN_ALL_MARK_FLAGS)
697 return -EINVAL;
698 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
699 case FAN_MARK_ADD:
700 case FAN_MARK_REMOVE:
701 case FAN_MARK_FLUSH:
702 break;
703 default:
704 return -EINVAL;
705 }
706#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
707 if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD))
708#else
709 if (mask & ~(FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD))
710#endif
711 return -EINVAL;
712
713 filp = fget_light(fanotify_fd, &fput_needed);
714 if (unlikely(!filp))
715 return -EBADF;
716
 717	/* verify that this is indeed a fanotify instance */
718 ret = -EINVAL;
719 if (unlikely(filp->f_op != &fanotify_fops))
720 goto fput_and_out;
721
722 ret = fanotify_find_path(dfd, pathname, &path, flags);
723 if (ret)
724 goto fput_and_out;
725
726 /* inode held in place by reference to path; group by fget on fd */
727 if (!(flags & FAN_MARK_MOUNT))
728 inode = path.dentry->d_inode;
729 else
730 mnt = path.mnt;
731 group = filp->private_data;
732
733 /* create/update an inode mark */
734 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
735 case FAN_MARK_ADD:
736 if (flags & FAN_MARK_MOUNT)
737 ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
738 else
739 ret = fanotify_add_inode_mark(group, inode, mask, flags);
740 break;
741 case FAN_MARK_REMOVE:
742 if (flags & FAN_MARK_MOUNT)
743 ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
744 else
745 ret = fanotify_remove_inode_mark(group, inode, mask, flags);
746 break;
747 case FAN_MARK_FLUSH:
748 if (flags & FAN_MARK_MOUNT)
749 fsnotify_clear_vfsmount_marks_by_group(group);
750 else
751 fsnotify_clear_inode_marks_by_group(group);
752 break;
753 default:
754 ret = -EINVAL;
755 }
756
757 path_put(&path);
758fput_and_out:
759 fput_light(filp, fput_needed);
760 return ret;
761}
762
763#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
764asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
765 long dfd, long pathname)
766{
767 return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
768 mask, (int) dfd,
769 (const char __user *) pathname);
770}
771SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
772#endif
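
And the matching user-space side of the mark call, again only a sketch over raw
syscall(2). Note the 64-bit mask argument: whether a plain vararg syscall(2)
marshals a __u64 correctly on 32-bit ABIs is libc-dependent, so treat this as
illustrative rather than portable.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/fanotify.h>

    int main(void)
    {
            struct fanotify_event_metadata ev;
            ssize_t len;
            int fd = syscall(__NR_fanotify_init, 0, O_RDONLY);

            if (fd < 0)
                    return 1;

            /* no FAN_MARK_MOUNT, so this is an inode mark on /tmp itself;
             * with FAN_MARK_MOUNT the mark would go on path.mnt instead */
            if (syscall(__NR_fanotify_mark, fd, FAN_MARK_ADD,
                        (__u64)(FAN_OPEN | FAN_CLOSE_WRITE), AT_FDCWD, "/tmp"))
                    return 1;

            /* each event carries an open fd on the object and the acting pid */
            len = read(fd, &ev, sizeof(ev));
            if (len >= (ssize_t)FAN_EVENT_METADATA_LEN) {
                    printf("mask=0x%llx pid=%d\n",
                           (unsigned long long)ev.mask, (int)ev.pid);
                    close(ev.fd);
            }
            return 0;
    }
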
773
774/*
 775 * fanotify_user_setup - Our initialization function. Note that we cannot return
776 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
777 * must result in panic().
778 */
779static int __init fanotify_user_setup(void)
780{
781 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
782 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
783 SLAB_PANIC);
784
785 return 0;
786}
787device_initcall(fanotify_user_setup);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fcc2f064af83..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -21,6 +21,7 @@
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/mount.h>
24#include <linux/srcu.h> 25#include <linux/srcu.h>
25 26
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
@@ -35,6 +36,11 @@ void __fsnotify_inode_delete(struct inode *inode)
35} 36}
36EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); 37EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
37 38
39void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
40{
41 fsnotify_clear_marks_by_mount(mnt);
42}
43
38/* 44/*
39 * Given an inode, first check if we care what happens to our children. Inotify 45 * Given an inode, first check if we care what happens to our children. Inotify
40 * and dnotify both tell their parents about events. If we care about any event 46 * and dnotify both tell their parents about events. If we care about any event
@@ -78,13 +84,16 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
78} 84}
79 85
80/* Notify this dentry's parent about a child's events. */ 86/* Notify this dentry's parent about a child's events. */
81void __fsnotify_parent(struct dentry *dentry, __u32 mask) 87void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
82{ 88{
83 struct dentry *parent; 89 struct dentry *parent;
84 struct inode *p_inode; 90 struct inode *p_inode;
85 bool send = false; 91 bool send = false;
86 bool should_update_children = false; 92 bool should_update_children = false;
87 93
94 if (!dentry)
95 dentry = path->dentry;
96
88 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) 97 if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
89 return; 98 return;
90 99
@@ -115,8 +124,12 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
115 * specifies these are events which came from a child. */ 124 * specifies these are events which came from a child. */
116 mask |= FS_EVENT_ON_CHILD; 125 mask |= FS_EVENT_ON_CHILD;
117 126
118 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, 127 if (path)
119 dentry->d_name.name, 0); 128 fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
129 dentry->d_name.name, 0);
130 else
131 fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
132 dentry->d_name.name, 0);
120 dput(parent); 133 dput(parent);
121 } 134 }
122 135
@@ -127,63 +140,185 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
127} 140}
128EXPORT_SYMBOL_GPL(__fsnotify_parent); 141EXPORT_SYMBOL_GPL(__fsnotify_parent);
129 142
143static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
144 struct fsnotify_mark *inode_mark,
145 struct fsnotify_mark *vfsmount_mark,
146 __u32 mask, void *data,
147 int data_is, u32 cookie,
148 const unsigned char *file_name,
149 struct fsnotify_event **event)
150{
151 struct fsnotify_group *group = NULL;
152 __u32 inode_test_mask = 0;
153 __u32 vfsmount_test_mask = 0;
154
155 if (unlikely(!inode_mark && !vfsmount_mark)) {
156 BUG();
157 return 0;
158 }
159
160 /* clear ignored on inode modification */
161 if (mask & FS_MODIFY) {
162 if (inode_mark &&
163 !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
164 inode_mark->ignored_mask = 0;
165 if (vfsmount_mark &&
166 !(vfsmount_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
167 vfsmount_mark->ignored_mask = 0;
168 }
169
170 /* does the inode mark tell us to do something? */
171 if (inode_mark) {
172 group = inode_mark->group;
173 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
174 inode_test_mask &= inode_mark->mask;
175 inode_test_mask &= ~inode_mark->ignored_mask;
176 }
177
178 /* does the vfsmount_mark tell us to do something? */
179 if (vfsmount_mark) {
180 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
181 group = vfsmount_mark->group;
182 vfsmount_test_mask &= vfsmount_mark->mask;
183 vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
184 if (inode_mark)
185 vfsmount_test_mask &= ~inode_mark->ignored_mask;
186 }
187
188 pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
189 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
190 " data=%p data_is=%d cookie=%d event=%p\n",
191 __func__, group, to_tell, mnt, mask, inode_mark,
192 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
193 data_is, cookie, *event);
194
195 if (!inode_test_mask && !vfsmount_test_mask)
196 return 0;
197
198 if (group->ops->should_send_event(group, to_tell, inode_mark,
199 vfsmount_mark, mask, data,
200 data_is) == false)
201 return 0;
202
203 if (!*event) {
204 *event = fsnotify_create_event(to_tell, mask, data,
205 data_is, file_name,
206 cookie, GFP_KERNEL);
207 if (!*event)
208 return -ENOMEM;
209 }
210 return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
211}
212
130/* 213/*
131 * This is the main call to fsnotify. The VFS calls into hook specific functions 214 * This is the main call to fsnotify. The VFS calls into hook specific functions
132 * in linux/fsnotify.h. Those functions then in turn call here. Here will call 215 * in linux/fsnotify.h. Those functions then in turn call here. Here will call
133 * out to all of the registered fsnotify_group. Those groups can then use the 216 * out to all of the registered fsnotify_group. Those groups can then use the
134 * notification event in whatever means they feel necessary. 217 * notification event in whatever means they feel necessary.
135 */ 218 */
136void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) 219int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
220 const unsigned char *file_name, u32 cookie)
137{ 221{
138 struct fsnotify_group *group; 222 struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
223 struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
224 struct fsnotify_group *inode_group, *vfsmount_group;
139 struct fsnotify_event *event = NULL; 225 struct fsnotify_event *event = NULL;
140 int idx; 226 struct vfsmount *mnt;
227 int idx, ret = 0;
141 /* global tests shouldn't care about events on child only the specific event */ 228 /* global tests shouldn't care about events on child only the specific event */
142 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); 229 __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
143 230
144 if (list_empty(&fsnotify_groups)) 231 if (data_is == FSNOTIFY_EVENT_PATH)
145 return; 232 mnt = ((struct path *)data)->mnt;
233 else
234 mnt = NULL;
146 235
147 if (!(test_mask & fsnotify_mask))
148 return;
149
150 if (!(test_mask & to_tell->i_fsnotify_mask))
151 return;
152 /* 236 /*
153 * SRCU!! the groups list is very very much read only and the path is 237 * if this is a modify event we may need to clear the ignored masks
154 * very hot. The VAST majority of events are not going to need to do 238 * otherwise return if neither the inode nor the vfsmount care about
155 * anything other than walk the list so it's crazy to pre-allocate. 239 * this type of event.
156 */ 240 */
157 idx = srcu_read_lock(&fsnotify_grp_srcu); 241 if (!(mask & FS_MODIFY) &&
158 list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { 242 !(test_mask & to_tell->i_fsnotify_mask) &&
159 if (test_mask & group->mask) { 243 !(mnt && test_mask & mnt->mnt_fsnotify_mask))
160 if (!group->ops->should_send_event(group, to_tell, mask)) 244 return 0;
161 continue; 245
162 if (!event) { 246 idx = srcu_read_lock(&fsnotify_mark_srcu);
163 event = fsnotify_create_event(to_tell, mask, data, 247
164 data_is, file_name, cookie, 248 if ((mask & FS_MODIFY) ||
165 GFP_KERNEL); 249 (test_mask & to_tell->i_fsnotify_mask))
166 /* shit, we OOM'd and now we can't tell, maybe 250 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
167 * someday someone else will want to do something 251 &fsnotify_mark_srcu);
168 * here */ 252
169 if (!event) 253 if (mnt && ((mask & FS_MODIFY) ||
170 break; 254 (test_mask & mnt->mnt_fsnotify_mask))) {
171 } 255 vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
172 group->ops->handle_event(group, event); 256 &fsnotify_mark_srcu);
257 inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
258 &fsnotify_mark_srcu);
259 }
260
261 while (inode_node || vfsmount_node) {
262 inode_group = vfsmount_group = NULL;
263
264 if (inode_node) {
265 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
266 struct fsnotify_mark, i.i_list);
267 inode_group = inode_mark->group;
173 } 268 }
269
270 if (vfsmount_node) {
271 vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
272 struct fsnotify_mark, m.m_list);
273 vfsmount_group = vfsmount_mark->group;
274 }
275
276 if (inode_group > vfsmount_group) {
277 /* handle inode */
278 send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
279 data_is, cookie, file_name, &event);
280 /* we didn't use the vfsmount_mark */
281 vfsmount_group = NULL;
282 } else if (vfsmount_group > inode_group) {
283 send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
284 data_is, cookie, file_name, &event);
285 inode_group = NULL;
286 } else {
287 send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
288 mask, data, data_is, cookie, file_name,
289 &event);
290 }
291
292 if (inode_group)
293 inode_node = srcu_dereference(inode_node->next,
294 &fsnotify_mark_srcu);
295 if (vfsmount_group)
296 vfsmount_node = srcu_dereference(vfsmount_node->next,
297 &fsnotify_mark_srcu);
174 } 298 }
175 srcu_read_unlock(&fsnotify_grp_srcu, idx); 299
300 srcu_read_unlock(&fsnotify_mark_srcu, idx);
176 /* 301 /*
177 * fsnotify_create_event() took a reference so the event can't be cleaned 302 * fsnotify_create_event() took a reference so the event can't be cleaned
178 * up while we are still trying to add it to lists, drop that one. 303 * up while we are still trying to add it to lists, drop that one.
179 */ 304 */
180 if (event) 305 if (event)
181 fsnotify_put_event(event); 306 fsnotify_put_event(event);
307
308 return ret;
182} 309}
183EXPORT_SYMBOL_GPL(fsnotify); 310EXPORT_SYMBOL_GPL(fsnotify);
184 311
185static __init int fsnotify_init(void) 312static __init int fsnotify_init(void)
186{ 313{
187 return init_srcu_struct(&fsnotify_grp_srcu); 314 int ret;
315
316 BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23);
317
318 ret = init_srcu_struct(&fsnotify_mark_srcu);
319 if (ret)
320 panic("initializing fsnotify_mark_srcu");
321
322 return 0;
188} 323}
189subsys_initcall(fsnotify_init); 324core_initcall(fsnotify_init);
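
For in-kernel users, the reworked API reduces to the pattern below. This is a
hedged sketch built only from the calls visible in this series
(fsnotify_alloc_group(), fsnotify_init_mark(), the five-argument
fsnotify_add_mark()); my_ops and its callbacks are illustrative placeholders,
not something this patch defines.

    #include <linux/fs.h>
    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/fsnotify_backend.h>

    static struct fsnotify_ops my_ops;  /* should_send_event/handle_event elided */

    static void my_free_mark(struct fsnotify_mark *mark)
    {
            kfree(mark);
    }

    static int my_watch_inode(struct inode *inode, __u32 mask)
    {
            struct fsnotify_group *group;
            struct fsnotify_mark *mark;
            int ret;

            group = fsnotify_alloc_group(&my_ops); /* holds the initial group ref */
            if (IS_ERR(group))
                    return PTR_ERR(group);

            mark = kzalloc(sizeof(*mark), GFP_KERNEL);
            if (!mark) {
                    fsnotify_put_group(group);
                    return -ENOMEM;
            }
            fsnotify_init_mark(mark, my_free_mark); /* refcnt starts at 1 */
            mark->mask = mask;  /* sketch only; fanotify uses the _locked setters */

            /* inode mark, so mnt is NULL; a vfsmount mark passes mnt instead */
            ret = fsnotify_add_mark(mark, group, inode, NULL, 0);
            if (ret) {
                    fsnotify_put_mark(mark);  /* drops to 0, calls my_free_mark */
                    fsnotify_put_group(group);
                    return ret;
            }
            fsnotify_put_mark(mark);  /* the inode's mark list keeps its own ref */
            return 0;  /* the group ref is kept for later teardown via put_group */
    }
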
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 4dc240824b2d..85e7d2b431d9 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,21 +6,34 @@
6#include <linux/srcu.h> 6#include <linux/srcu.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9/* protects reads of fsnotify_groups */
10extern struct srcu_struct fsnotify_grp_srcu;
11/* all groups which receive fsnotify events */
12extern struct list_head fsnotify_groups;
13/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
14extern __u32 fsnotify_mask;
15
16/* destroy all events sitting in this groups notification queue */ 9/* destroy all events sitting in this groups notification queue */
17extern void fsnotify_flush_notify(struct fsnotify_group *group); 10extern void fsnotify_flush_notify(struct fsnotify_group *group);
18 11
12/* protects reads of inode and vfsmount marks list */
13extern struct srcu_struct fsnotify_mark_srcu;
14
15extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
16 __u32 mask);
17/* add a mark to an inode */
18extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
19 struct fsnotify_group *group, struct inode *inode,
20 int allow_dups);
21/* add a mark to a vfsmount */
22extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups);
25
19/* final kfree of a group */ 26/* final kfree of a group */
20extern void fsnotify_final_destroy_group(struct fsnotify_group *group); 27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
21 28
29/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */
32extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
22/* run the list of all marks associated with inode and flag them to be freed */ 33/* run the list of all marks associated with inode and flag them to be freed */
23extern void fsnotify_clear_marks_by_inode(struct inode *inode); 34extern void fsnotify_clear_marks_by_inode(struct inode *inode);
35/* run the list of all marks associated with vfsmount and flag them to be freed */
36extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
24/* 37/*
25 * update the dentry->d_flags of all of inode's children to indicate if inode cares 38 * update the dentry->d_flags of all of inode's children to indicate if inode cares
26 * about events that happen to its children. 39 * about events that happen to its children.
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0e1677144bc5..d309f38449cb 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -28,64 +28,6 @@
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30 30
31/* protects writes to fsnotify_groups and fsnotify_mask */
32static DEFINE_MUTEX(fsnotify_grp_mutex);
33/* protects reads while running the fsnotify_groups list */
34struct srcu_struct fsnotify_grp_srcu;
35/* all groups registered to receive filesystem notifications */
36LIST_HEAD(fsnotify_groups);
37/* bitwise OR of all events (FS_*) interesting to some group on this system */
38__u32 fsnotify_mask;
39
40/*
41 * When a new group registers or changes it's set of interesting events
42 * this function updates the fsnotify_mask to contain all interesting events
43 */
44void fsnotify_recalc_global_mask(void)
45{
46 struct fsnotify_group *group;
47 __u32 mask = 0;
48 int idx;
49
50 idx = srcu_read_lock(&fsnotify_grp_srcu);
51 list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
52 mask |= group->mask;
53 srcu_read_unlock(&fsnotify_grp_srcu, idx);
54 fsnotify_mask = mask;
55}
56
57/*
58 * Update the group->mask by running all of the marks associated with this
59 * group and finding the bitwise | of all of the mark->mask. If we change
60 * the group->mask we need to update the global mask of events interesting
61 * to the system.
62 */
63void fsnotify_recalc_group_mask(struct fsnotify_group *group)
64{
65 __u32 mask = 0;
66 __u32 old_mask = group->mask;
67 struct fsnotify_mark_entry *entry;
68
69 spin_lock(&group->mark_lock);
70 list_for_each_entry(entry, &group->mark_entries, g_list)
71 mask |= entry->mask;
72 spin_unlock(&group->mark_lock);
73
74 group->mask = mask;
75
76 if (old_mask != mask)
77 fsnotify_recalc_global_mask();
78}
79
80/*
81 * Take a reference to a group so things found under the fsnotify_grp_mutex
82 * can't get freed under us
83 */
84static void fsnotify_get_group(struct fsnotify_group *group)
85{
86 atomic_inc(&group->refcnt);
87}
88
89/* 31/*
90 * Final freeing of a group 32 * Final freeing of a group
91 */ 33 */
@@ -110,145 +52,53 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
110 */ 52 */
111static void fsnotify_destroy_group(struct fsnotify_group *group) 53static void fsnotify_destroy_group(struct fsnotify_group *group)
112{ 54{
113 /* clear all inode mark entries for this group */ 55 /* clear all inode marks for this group */
114 fsnotify_clear_marks_by_group(group); 56 fsnotify_clear_marks_by_group(group);
115 57
58 synchronize_srcu(&fsnotify_mark_srcu);
59
116 /* past the point of no return, matches the initial value of 1 */ 60 /* past the point of no return, matches the initial value of 1 */
117 if (atomic_dec_and_test(&group->num_marks)) 61 if (atomic_dec_and_test(&group->num_marks))
118 fsnotify_final_destroy_group(group); 62 fsnotify_final_destroy_group(group);
119} 63}
120 64
121/* 65/*
122 * Remove this group from the global list of groups that will get events
123 * this can be done even if there are still references and things still using
124 * this group. This just stops the group from getting new events.
125 */
126static void __fsnotify_evict_group(struct fsnotify_group *group)
127{
128 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
129
130 if (group->on_group_list)
131 list_del_rcu(&group->group_list);
132 group->on_group_list = 0;
133}
134
135/*
136 * Called when a group is no longer interested in getting events. This can be
137 * used if a group is misbehaving or if for some reason a group should no longer
138 * get any filesystem events.
139 */
140void fsnotify_evict_group(struct fsnotify_group *group)
141{
142 mutex_lock(&fsnotify_grp_mutex);
143 __fsnotify_evict_group(group);
144 mutex_unlock(&fsnotify_grp_mutex);
145}
146
147/*
148 * Drop a reference to a group. Free it if it's through. 66 * Drop a reference to a group. Free it if it's through.
149 */ 67 */
150void fsnotify_put_group(struct fsnotify_group *group) 68void fsnotify_put_group(struct fsnotify_group *group)
151{ 69{
152 if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) 70 if (atomic_dec_and_test(&group->refcnt))
153 return; 71 fsnotify_destroy_group(group);
154
155 /*
156 * OK, now we know that there's no other users *and* we hold mutex,
157 * so no new references will appear
158 */
159 __fsnotify_evict_group(group);
160
161 /*
162 * now it's off the list, so the only thing we might care about is
163 * srcu access....
164 */
165 mutex_unlock(&fsnotify_grp_mutex);
166 synchronize_srcu(&fsnotify_grp_srcu);
167
168 /* and now it is really dead. _Nothing_ could be seeing it */
169 fsnotify_recalc_global_mask();
170 fsnotify_destroy_group(group);
171}
172
173/*
174 * Simply run the fsnotify_groups list and find a group which matches
175 * the given parameters. If a group is found we take a reference to that
176 * group.
177 */
178static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
179 const struct fsnotify_ops *ops)
180{
181 struct fsnotify_group *group_iter;
182 struct fsnotify_group *group = NULL;
183
184 BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
185
186 list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
187 if (group_iter->group_num == group_num) {
188 if ((group_iter->mask == mask) &&
189 (group_iter->ops == ops)) {
190 fsnotify_get_group(group_iter);
191 group = group_iter;
192 } else
193 group = ERR_PTR(-EEXIST);
194 }
195 }
196 return group;
197} 72}
198 73
199/* 74/*
200 * Either finds an existing group which matches the group_num, mask, and ops or 75 * Create a new fsnotify_group and hold a reference for the group returned.
201 * creates a new group and adds it to the global group list. In either case we
202 * take a reference for the group returned.
203 */ 76 */
204struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, 77struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
205 const struct fsnotify_ops *ops)
206{ 78{
207 struct fsnotify_group *group, *tgroup; 79 struct fsnotify_group *group;
208 80
209 /* very low use, simpler locking if we just always alloc */ 81 group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
210 group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
211 if (!group) 82 if (!group)
212 return ERR_PTR(-ENOMEM); 83 return ERR_PTR(-ENOMEM);
213 84
 85	/* set to 0 when there are no external references to this group */
214 atomic_set(&group->refcnt, 1); 86 atomic_set(&group->refcnt, 1);
215 87 /*
216 group->on_group_list = 0; 88 * hits 0 when there are no external references AND no marks for
217 group->group_num = group_num; 89 * this group
218 group->mask = mask; 90 */
91 atomic_set(&group->num_marks, 1);
219 92
220 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
221 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
222 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
223 group->q_len = 0;
224 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
225 97
226 spin_lock_init(&group->mark_lock); 98 spin_lock_init(&group->mark_lock);
227 atomic_set(&group->num_marks, 0); 99 INIT_LIST_HEAD(&group->marks_list);
228 INIT_LIST_HEAD(&group->mark_entries);
229 100
230 group->ops = ops; 101 group->ops = ops;
231 102
232 mutex_lock(&fsnotify_grp_mutex);
233 tgroup = fsnotify_find_group(group_num, mask, ops);
234 if (tgroup) {
235 /* group already exists */
236 mutex_unlock(&fsnotify_grp_mutex);
237 /* destroy the new one we made */
238 fsnotify_put_group(group);
239 return tgroup;
240 }
241
242 /* group not found, add a new one */
243 list_add_rcu(&group->group_list, &fsnotify_groups);
244 group->on_group_list = 1;
245 /* being on the fsnotify_groups list holds one num_marks */
246 atomic_inc(&group->num_marks);
247
248 mutex_unlock(&fsnotify_grp_mutex);
249
250 if (mask)
251 fsnotify_recalc_global_mask();
252
253 return group; 103 return group;
254} 104}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0399bcbe09c8..33297c005060 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -16,72 +16,6 @@
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */ 17 */
18 18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
 26 * which can find this object holding the appropriate locks can take a reference
 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * entry->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * entry->lock protects 2 things, entry->group and entry->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the mark_entries list anchored inside a given group
42 * and each entry is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
47 * given inode and each entry is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
 66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
 72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
 76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h> 19#include <linux/fs.h>
86#include <linux/init.h> 20#include <linux/init.h>
87#include <linux/kernel.h> 21#include <linux/kernel.h>
@@ -95,30 +29,19 @@
95#include <linux/fsnotify_backend.h> 29#include <linux/fsnotify_backend.h>
96#include "fsnotify.h" 30#include "fsnotify.h"
97 31
98void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
99{
100 atomic_inc(&entry->refcnt);
101}
102
103void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
104{
105 if (atomic_dec_and_test(&entry->refcnt))
106 entry->free_mark(entry);
107}
108
109/* 32/*
110 * Recalculate the mask of events relevant to a given inode locked. 33 * Recalculate the mask of events relevant to a given inode locked.
111 */ 34 */
112static void fsnotify_recalc_inode_mask_locked(struct inode *inode) 35static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
113{ 36{
114 struct fsnotify_mark_entry *entry; 37 struct fsnotify_mark *mark;
115 struct hlist_node *pos; 38 struct hlist_node *pos;
116 __u32 new_mask = 0; 39 __u32 new_mask = 0;
117 40
118 assert_spin_locked(&inode->i_lock); 41 assert_spin_locked(&inode->i_lock);
119 42
120 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) 43 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
121 new_mask |= entry->mask; 44 new_mask |= mark->mask;
122 inode->i_fsnotify_mask = new_mask; 45 inode->i_fsnotify_mask = new_mask;
123} 46}
124 47
@@ -135,107 +58,26 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
135 __fsnotify_update_child_dentry_flags(inode); 58 __fsnotify_update_child_dentry_flags(inode);
136} 59}
137 60
138/* 61void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
139 * Any time a mark is getting freed we end up here.
140 * The caller had better be holding a reference to this mark so we don't actually
141 * do the final put under the entry->lock
142 */
143void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
144{ 62{
145 struct fsnotify_group *group; 63 struct inode *inode = mark->i.inode;
146 struct inode *inode;
147 64
148 spin_lock(&entry->lock); 65 assert_spin_locked(&mark->lock);
66 assert_spin_locked(&mark->group->mark_lock);
149 67
150 group = entry->group;
151 inode = entry->inode;
152
153 BUG_ON(group && !inode);
154 BUG_ON(!group && inode);
155
156 /* if !group something else already marked this to die */
157 if (!group) {
158 spin_unlock(&entry->lock);
159 return;
160 }
161
162 /* 1 from caller and 1 for being on i_list/g_list */
163 BUG_ON(atomic_read(&entry->refcnt) < 2);
164
165 spin_lock(&group->mark_lock);
166 spin_lock(&inode->i_lock); 68 spin_lock(&inode->i_lock);
167 69
168 hlist_del_init(&entry->i_list); 70 hlist_del_init_rcu(&mark->i.i_list);
169 entry->inode = NULL; 71 mark->i.inode = NULL;
170
171 list_del_init(&entry->g_list);
172 entry->group = NULL;
173
174 fsnotify_put_mark(entry); /* for i_list and g_list */
175 72
176 /* 73 /*
177 * this mark is now off the inode->i_fsnotify_mark_entries list and we 74 * this mark is now off the inode->i_fsnotify_marks list and we
178 * hold the inode->i_lock, so this is the perfect time to update the 75 * hold the inode->i_lock, so this is the perfect time to update the
179 * inode->i_fsnotify_mask 76 * inode->i_fsnotify_mask
180 */ 77 */
181 fsnotify_recalc_inode_mask_locked(inode); 78 fsnotify_recalc_inode_mask_locked(inode);
182 79
183 spin_unlock(&inode->i_lock); 80 spin_unlock(&inode->i_lock);
184 spin_unlock(&group->mark_lock);
185 spin_unlock(&entry->lock);
186
187 /*
188 * Some groups like to know that marks are being freed. This is a
189 * callback to the group function to let it know that this entry
190 * is being freed.
191 */
192 if (group->ops->freeing_mark)
193 group->ops->freeing_mark(entry, group);
194
195 /*
196 * __fsnotify_update_child_dentry_flags(inode);
197 *
198 * I really want to call that, but we can't, we have no idea if the inode
199 * still exists the second we drop the entry->lock.
200 *
 201	 * The next time an event arrives at this inode from one of its children
 202	 * __fsnotify_parent will see that the inode doesn't care about its
203 * children and will update all of these flags then. So really this
204 * is just a lazy update (and could be a perf win...)
205 */
206
207
208 iput(inode);
209
210 /*
211 * it's possible that this group tried to destroy itself, but this
 212	 * mark was simultaneously being freed by the inode. If that's the
213 * case, we finish freeing the group here.
214 */
215 if (unlikely(atomic_dec_and_test(&group->num_marks)))
216 fsnotify_final_destroy_group(group);
217}
218
219/*
220 * Given a group, destroy all of the marks associated with that group.
221 */
222void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
223{
224 struct fsnotify_mark_entry *lentry, *entry;
225 LIST_HEAD(free_list);
226
227 spin_lock(&group->mark_lock);
228 list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
229 list_add(&entry->free_g_list, &free_list);
230 list_del_init(&entry->g_list);
231 fsnotify_get_mark(entry);
232 }
233 spin_unlock(&group->mark_lock);
234
235 list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
236 fsnotify_destroy_mark_by_entry(entry);
237 fsnotify_put_mark(entry);
238 }
239} 81}
240 82
241/* 83/*
@@ -243,112 +85,145 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
243 */ 85 */
244void fsnotify_clear_marks_by_inode(struct inode *inode) 86void fsnotify_clear_marks_by_inode(struct inode *inode)
245{ 87{
246 struct fsnotify_mark_entry *entry, *lentry; 88 struct fsnotify_mark *mark, *lmark;
247 struct hlist_node *pos, *n; 89 struct hlist_node *pos, *n;
248 LIST_HEAD(free_list); 90 LIST_HEAD(free_list);
249 91
250 spin_lock(&inode->i_lock); 92 spin_lock(&inode->i_lock);
251 hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { 93 hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
252 list_add(&entry->free_i_list, &free_list); 94 list_add(&mark->i.free_i_list, &free_list);
253 hlist_del_init(&entry->i_list); 95 hlist_del_init_rcu(&mark->i.i_list);
254 fsnotify_get_mark(entry); 96 fsnotify_get_mark(mark);
255 } 97 }
256 spin_unlock(&inode->i_lock); 98 spin_unlock(&inode->i_lock);
257 99
258 list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { 100 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
259 fsnotify_destroy_mark_by_entry(entry); 101 fsnotify_destroy_mark(mark);
260 fsnotify_put_mark(entry); 102 fsnotify_put_mark(mark);
261 } 103 }
262} 104}
263 105
264/* 106/*
107 * Given a group clear all of the inode marks associated with that group.
108 */
109void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
110{
111 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
112}
113
114/*
265 * given a group and inode, find the mark associated with that combination. 115 * given a group and inode, find the mark associated with that combination.
266 * if found take a reference to that mark and return it, else return NULL 116 * if found take a reference to that mark and return it, else return NULL
267 */ 117 */
268struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, 118struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group,
269 struct inode *inode) 119 struct inode *inode)
270{ 120{
271 struct fsnotify_mark_entry *entry; 121 struct fsnotify_mark *mark;
272 struct hlist_node *pos; 122 struct hlist_node *pos;
273 123
274 assert_spin_locked(&inode->i_lock); 124 assert_spin_locked(&inode->i_lock);
275 125
276 hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { 126 hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
277 if (entry->group == group) { 127 if (mark->group == group) {
278 fsnotify_get_mark(entry); 128 fsnotify_get_mark(mark);
279 return entry; 129 return mark;
280 } 130 }
281 } 131 }
282 return NULL; 132 return NULL;
283} 133}
284 134
285/* 135/*
286 * Nothing fancy, just initialize lists and locks and counters. 136 * given a group and inode, find the mark associated with that combination.
137 * if found take a reference to that mark and return it, else return NULL
287 */ 138 */
288void fsnotify_init_mark(struct fsnotify_mark_entry *entry, 139struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
289 void (*free_mark)(struct fsnotify_mark_entry *entry)) 140 struct inode *inode)
141{
142 struct fsnotify_mark *mark;
143
144 spin_lock(&inode->i_lock);
145 mark = fsnotify_find_inode_mark_locked(group, inode);
146 spin_unlock(&inode->i_lock);
290 147
148 return mark;
149}
150
151/*
152 * If we are setting a mark mask on an inode mark we should pin the inode
153 * in memory.
154 */
155void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
156 __u32 mask)
291{ 157{
292 spin_lock_init(&entry->lock); 158 struct inode *inode;
293 atomic_set(&entry->refcnt, 1); 159
294 INIT_HLIST_NODE(&entry->i_list); 160 assert_spin_locked(&mark->lock);
295 entry->group = NULL; 161
296 entry->mask = 0; 162 if (mask &&
297 entry->inode = NULL; 163 mark->i.inode &&
298 entry->free_mark = free_mark; 164 !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
165 mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
166 inode = igrab(mark->i.inode);
167 /*
168 * we shouldn't be able to get here if the inode wasn't
169 * already safely held in memory. But bug in case it
170 * ever is wrong.
171 */
172 BUG_ON(!inode);
173 }
299} 174}
300 175
301/* 176/*
302 * Attach an initialized mark entry to a given group and inode. 177 * Attach an initialized mark to a given inode.
303 * These marks may be used for the fsnotify backend to determine which 178 * These marks may be used for the fsnotify backend to determine which
304 * event types should be delivered to which group and for which inodes. 179 * event types should be delivered to which group and for which inodes. These
180 * marks are ordered according to the group's location in memory.
305 */ 181 */
306int fsnotify_add_mark(struct fsnotify_mark_entry *entry, 182int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
307 struct fsnotify_group *group, struct inode *inode) 183 struct fsnotify_group *group, struct inode *inode,
184 int allow_dups)
308{ 185{
309 struct fsnotify_mark_entry *lentry; 186 struct fsnotify_mark *lmark;
187 struct hlist_node *node, *last = NULL;
310 int ret = 0; 188 int ret = 0;
311 189
312 inode = igrab(inode); 190 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
313 if (unlikely(!inode)) 191
314 return -EINVAL; 192 assert_spin_locked(&mark->lock);
193 assert_spin_locked(&group->mark_lock);
315 194
316 /*
317 * LOCKING ORDER!!!!
318 * entry->lock
319 * group->mark_lock
320 * inode->i_lock
321 */
322 spin_lock(&entry->lock);
323 spin_lock(&group->mark_lock);
324 spin_lock(&inode->i_lock); 195 spin_lock(&inode->i_lock);
325 196
326 lentry = fsnotify_find_mark_entry(group, inode); 197 mark->i.inode = inode;
327 if (!lentry) {
328 entry->group = group;
329 entry->inode = inode;
330 198
331 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 199 /* is mark the first mark? */
332 list_add(&entry->g_list, &group->mark_entries); 200 if (hlist_empty(&inode->i_fsnotify_marks)) {
201 hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks);
202 goto out;
203 }
333 204
334 fsnotify_get_mark(entry); /* for i_list and g_list */ 205 /* should mark be in the middle of the current list? */
206 hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
207 last = node;
208
209 if ((lmark->group == group) && !allow_dups) {
210 ret = -EEXIST;
211 goto out;
212 }
335 213
336 atomic_inc(&group->num_marks); 214 if (mark->group < lmark->group)
215 continue;
337 216
338 fsnotify_recalc_inode_mask_locked(inode); 217 hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
218 goto out;
339 } 219 }
340 220
221 BUG_ON(last == NULL);
222 /* mark should be the last entry. last is the current last entry */
223 hlist_add_after_rcu(last, &mark->i.i_list);
224out:
225 fsnotify_recalc_inode_mask_locked(inode);
341 spin_unlock(&inode->i_lock); 226 spin_unlock(&inode->i_lock);
342 spin_unlock(&group->mark_lock);
343 spin_unlock(&entry->lock);
344
345 if (lentry) {
346 ret = -EEXIST;
347 iput(inode);
348 fsnotify_put_mark(lentry);
349 } else {
350 __fsnotify_update_child_dentry_flags(inode);
351 }
352 227
353 return ret; 228 return ret;
354} 229}
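
The pointer comparisons above are the other half of the new fsnotify() walk:
both the inode's and the vfsmount's mark lists are kept sorted in descending
order of group address, so a single merge-style pass can pair the inode mark
and the vfsmount mark that belong to one group and call into that group only
once per event. A stand-alone model of that walk (plain user-space C, with
small integers standing in for group pointers, purely illustrative):

    #include <stdio.h>
    #include <stddef.h>

    struct mark { int group; struct mark *next; };

    static void deliver(int group, struct mark *im, struct mark *vm)
    {
            printf("group %d: inode_mark=%s vfsmount_mark=%s\n",
                   group, im ? "yes" : "no", vm ? "yes" : "no");
    }

    static void walk(struct mark *i, struct mark *m)
    {
            while (i || m) {
                    /* -1 sorts below any real (nonnegative) key, like NULL */
                    int ig = i ? i->group : -1;
                    int mg = m ? m->group : -1;

                    if (ig > mg) {                  /* inode-only group */
                            deliver(ig, i, NULL);
                            i = i->next;
                    } else if (mg > ig) {           /* vfsmount-only group */
                            deliver(mg, NULL, m);
                            m = m->next;
                    } else {                        /* same group: pair them */
                            deliver(ig, i, m);
                            i = i->next;
                            m = m->next;
                    }
            }
    }

    int main(void)
    {
            struct mark i2 = { 2, NULL }, i5 = { 5, &i2 }; /* descending: 5, 2 */
            struct mark m2 = { 2, NULL }, m7 = { 7, &m2 }; /* descending: 7, 2 */

            walk(&i5, &m7);         /* visits 7 (mount), 5 (inode), 2 (paired) */
            return 0;
    }

Ordering by the group's address is just a cheap, stable total order; any key
shared by a group's inode mark and its vfsmount mark would do.
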
@@ -369,11 +244,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
369 struct inode *need_iput_tmp; 244 struct inode *need_iput_tmp;
370 245
371 /* 246 /*
372 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, 247 * We cannot __iget() an inode in state I_FREEING,
373 * I_WILL_FREE, or I_NEW which is fine because by that point 248 * I_WILL_FREE, or I_NEW which is fine because by that point
374 * the inode cannot have any associated watches. 249 * the inode cannot have any associated watches.
375 */ 250 */
376 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) 251 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
377 continue; 252 continue;
378 253
379 /* 254 /*
@@ -397,7 +272,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
397 /* In case the dropping of a reference would nuke next_i. */ 272 /* In case the dropping of a reference would nuke next_i. */
398 if ((&next_i->i_sb_list != list) && 273 if ((&next_i->i_sb_list != list) &&
399 atomic_read(&next_i->i_count) && 274 atomic_read(&next_i->i_count) &&
400 !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { 275 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
401 __iget(next_i); 276 __iget(next_i);
402 need_iput = next_i; 277 need_iput = next_i;
403 } 278 }
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cfd..b981fc0c8379 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,18 +1,3 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default n
4 ---help---
5 Say Y here to enable legacy in kernel inotify support. Inotify is a
6 file change notification system. It is a replacement for dnotify.
7 This option only provides the legacy inotify in kernel API. There
8 are no in tree kernel users of this interface since it is deprecated.
9 You only need this if you are loading an out of tree kernel module
10 that uses inotify.
11
12 For more information, see <file:Documentation/filesystems/inotify.txt>
13
14 If unsure, say N.
15
16config INOTIFY_USER 1config INOTIFY_USER
17 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
18 select ANON_INODES 3 select ANON_INODES
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index 943828171362..a380dabe09de 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
deleted file mode 100644
index 40b1cf914ccb..000000000000
--- a/fs/notify/inotify/inotify.c
+++ /dev/null
@@ -1,933 +0,0 @@
1/*
2 * fs/inotify.c - inode-based file event notifications
3 *
4 * Authors:
5 * John McCutchan <ttb@tentacle.dhs.org>
6 * Robert Love <rml@novell.com>
7 *
8 * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
9 *
10 * Copyright (C) 2005 John McCutchan
11 * Copyright 2006 Hewlett-Packard Development Company, L.P.
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the
15 * Free Software Foundation; either version 2, or (at your option) any
16 * later version.
17 *
18 * This program is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 */
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/spinlock.h>
27#include <linux/idr.h>
28#include <linux/slab.h>
29#include <linux/fs.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/list.h>
33#include <linux/writeback.h>
34#include <linux/inotify.h>
35#include <linux/fsnotify_backend.h>
36
37static atomic_t inotify_cookie;
38
39/*
40 * Lock ordering:
41 *
42 * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
43 * iprune_mutex (synchronize shrink_icache_memory())
44 * inode_lock (protects the super_block->s_inodes list)
45 * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
46 * inotify_handle->mutex (protects inotify_handle and watches->h_list)
47 *
 48 * The inode->inotify_mutex and inotify_handle->mutex are held during execution
49 * of a caller's event handler. Thus, the caller must not hold any locks
50 * taken in their event handler while calling any of the published inotify
51 * interfaces.
52 */
53
54/*
55 * Lifetimes of the three main data structures--inotify_handle, inode, and
56 * inotify_watch--are managed by reference count.
57 *
58 * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
59 * Additional references can bump the count via get_inotify_handle() and drop
60 * the count via put_inotify_handle().
61 *
62 * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
63 * to remove_watch_no_event(). Additional references can bump the count via
64 * get_inotify_watch() and drop the count via put_inotify_watch(). The caller
 65 * is responsible for the final put after receiving IN_IGNORED, or when using
66 * IN_ONESHOT after receiving the first event. Inotify does the final put if
67 * inotify_destroy() is called.
68 *
69 * inode: Pinned so long as the inode is associated with a watch, from
70 * inotify_add_watch() to the final put_inotify_watch().
71 */
72
73/*
74 * struct inotify_handle - represents an inotify instance
75 *
76 * This structure is protected by the mutex 'mutex'.
77 */
78struct inotify_handle {
79 struct idr idr; /* idr mapping wd -> watch */
80 struct mutex mutex; /* protects this bad boy */
81 struct list_head watches; /* list of watches */
82 atomic_t count; /* reference count */
83 u32 last_wd; /* the last wd allocated */
84 const struct inotify_operations *in_ops; /* inotify caller operations */
85};
86
87static inline void get_inotify_handle(struct inotify_handle *ih)
88{
89 atomic_inc(&ih->count);
90}
91
92static inline void put_inotify_handle(struct inotify_handle *ih)
93{
94 if (atomic_dec_and_test(&ih->count)) {
95 idr_destroy(&ih->idr);
96 kfree(ih);
97 }
98}
99
100/**
101 * get_inotify_watch - grab a reference to an inotify_watch
102 * @watch: watch to grab
103 */
104void get_inotify_watch(struct inotify_watch *watch)
105{
106 atomic_inc(&watch->count);
107}
108EXPORT_SYMBOL_GPL(get_inotify_watch);
109
110int pin_inotify_watch(struct inotify_watch *watch)
111{
112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock);
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count);
118 return 1;
119 }
120 spin_unlock(&sb_lock);
121 return 0;
122}
123
124/**
125 * put_inotify_watch - decrements the ref count on a given watch. cleans up
126 * watch references if the count reaches zero. inotify_watch is freed by
127 * inotify callers via the destroy_watch() op.
128 * @watch: watch to release
129 */
130void put_inotify_watch(struct inotify_watch *watch)
131{
132 if (atomic_dec_and_test(&watch->count)) {
133 struct inotify_handle *ih = watch->ih;
134
135 iput(watch->inode);
136 ih->in_ops->destroy_watch(watch);
137 put_inotify_handle(ih);
138 }
139}
140EXPORT_SYMBOL_GPL(put_inotify_watch);
141
142void unpin_inotify_watch(struct inotify_watch *watch)
143{
144 struct super_block *sb = watch->inode->i_sb;
145 put_inotify_watch(watch);
146 deactivate_super(sb);
147}
148
149/*
150 * inotify_handle_get_wd - returns the next WD for use by the given handle
151 *
152 * Callers must hold ih->mutex. This function can sleep.
153 */
154static int inotify_handle_get_wd(struct inotify_handle *ih,
155 struct inotify_watch *watch)
156{
157 int ret;
158
159 do {
160 if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
161 return -ENOSPC;
162 ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
163 } while (ret == -EAGAIN);
164
165 if (likely(!ret))
166 ih->last_wd = watch->wd;
167
168 return ret;
169}
170
171/*
172 * inotify_inode_watched - returns nonzero if there are watches on this inode
173 * and zero otherwise. We call this lockless, we do not care if we race.
174 */
175static inline int inotify_inode_watched(struct inode *inode)
176{
177 return !list_empty(&inode->inotify_watches);
178}
179
180/*
 181 * Get child dentry flag in sync with parent inode.
 182 * Flag should always be clear for negative dentries.
183 */
184static void set_dentry_child_flags(struct inode *inode, int watched)
185{
186 struct dentry *alias;
187
188 spin_lock(&dcache_lock);
189 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
190 struct dentry *child;
191
192 list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
193 if (!child->d_inode)
194 continue;
195
196 spin_lock(&child->d_lock);
197 if (watched)
198 child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
199 else
200 child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
201 spin_unlock(&child->d_lock);
202 }
203 }
204 spin_unlock(&dcache_lock);
205}
206
207/*
 208 * inode_find_handle - find the watch associated with the given inode and
209 * handle
210 *
211 * Callers must hold inode->inotify_mutex.
212 */
213static struct inotify_watch *inode_find_handle(struct inode *inode,
214 struct inotify_handle *ih)
215{
216 struct inotify_watch *watch;
217
218 list_for_each_entry(watch, &inode->inotify_watches, i_list) {
219 if (watch->ih == ih)
220 return watch;
221 }
222
223 return NULL;
224}
225
226/*
227 * remove_watch_no_event - remove watch without the IN_IGNORED event.
228 *
229 * Callers must hold both inode->inotify_mutex and ih->mutex.
230 */
231static void remove_watch_no_event(struct inotify_watch *watch,
232 struct inotify_handle *ih)
233{
234 list_del(&watch->i_list);
235 list_del(&watch->h_list);
236
237 if (!inotify_inode_watched(watch->inode))
238 set_dentry_child_flags(watch->inode, 0);
239
240 idr_remove(&ih->idr, watch->wd);
241}
242
243/**
244 * inotify_remove_watch_locked - Remove a watch from both the handle and the
245 * inode. Sends the IN_IGNORED event signifying that the inode is no longer
246 * watched. May be invoked from a caller's event handler.
247 * @ih: inotify handle associated with watch
248 * @watch: watch to remove
249 *
250 * Callers must hold both inode->inotify_mutex and ih->mutex.
251 */
252void inotify_remove_watch_locked(struct inotify_handle *ih,
253 struct inotify_watch *watch)
254{
255 remove_watch_no_event(watch, ih);
256 ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
257}
258EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
259
260/* Kernel API for producing events */
261
262/*
263 * inotify_d_instantiate - instantiate dcache entry for inode
264 */
265void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
266{
267 struct dentry *parent;
268
269 if (!inode)
270 return;
271
272 spin_lock(&entry->d_lock);
273 parent = entry->d_parent;
274 if (parent->d_inode && inotify_inode_watched(parent->d_inode))
275 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
276 spin_unlock(&entry->d_lock);
277}
278
279/*
280 * inotify_d_move - dcache entry has been moved
281 */
282void inotify_d_move(struct dentry *entry)
283{
284 struct dentry *parent;
285
286 parent = entry->d_parent;
287 if (inotify_inode_watched(parent->d_inode))
288 entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
289 else
290 entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
291}
292
293/**
294 * inotify_inode_queue_event - queue an event to all watches on this inode
295 * @inode: inode event is originating from
296 * @mask: event mask describing this event
297 * @cookie: cookie for synchronization, or zero
298 * @name: filename, if any
299 * @n_inode: inode associated with name
300 */
301void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
302 const char *name, struct inode *n_inode)
303{
304 struct inotify_watch *watch, *next;
305
306 if (!inotify_inode_watched(inode))
307 return;
308
309 mutex_lock(&inode->inotify_mutex);
310 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
311 u32 watch_mask = watch->mask;
312 if (watch_mask & mask) {
 313			struct inotify_handle *ih = watch->ih;
314 mutex_lock(&ih->mutex);
315 if (watch_mask & IN_ONESHOT)
316 remove_watch_no_event(watch, ih);
317 ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
318 name, n_inode);
319 mutex_unlock(&ih->mutex);
320 }
321 }
322 mutex_unlock(&inode->inotify_mutex);
323}
324EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
325
326/**
327 * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
328 * @dentry: the dentry in question, we queue against this dentry's parent
329 * @mask: event mask describing this event
330 * @cookie: cookie for synchronization, or zero
331 * @name: filename, if any
332 */
333void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
334 u32 cookie, const char *name)
335{
336 struct dentry *parent;
337 struct inode *inode;
338
339 if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
340 return;
341
342 spin_lock(&dentry->d_lock);
343 parent = dentry->d_parent;
344 inode = parent->d_inode;
345
346 if (inotify_inode_watched(inode)) {
347 dget(parent);
348 spin_unlock(&dentry->d_lock);
349 inotify_inode_queue_event(inode, mask, cookie, name,
350 dentry->d_inode);
351 dput(parent);
352 } else
353 spin_unlock(&dentry->d_lock);
354}
355EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
356
357/**
358 * inotify_get_cookie - return a unique cookie for use in synchronizing events.
359 */
360u32 inotify_get_cookie(void)
361{
362 return atomic_inc_return(&inotify_cookie);
363}
364EXPORT_SYMBOL_GPL(inotify_get_cookie);
365
366/**
367 * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
368 * @list: list of inodes being unmounted (sb->s_inodes)
369 *
370 * Called with inode_lock held, protecting the unmounting super block's list
371 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
372 * We temporarily drop inode_lock, however, and CAN block.
373 */
374void inotify_unmount_inodes(struct list_head *list)
375{
376 struct inode *inode, *next_i, *need_iput = NULL;
377
378 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
379 struct inotify_watch *watch, *next_w;
380 struct inode *need_iput_tmp;
381 struct list_head *watches;
382
383 /*
384 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
385 * I_WILL_FREE, or I_NEW which is fine because by that point
386 * the inode cannot have any associated watches.
387 */
388 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
389 continue;
390
391 /*
392 * If i_count is zero, the inode cannot have any watches and
393 * doing an __iget/iput with MS_ACTIVE clear would actually
394 * evict all inodes with zero i_count from icache which is
395 * unnecessarily violent and may in fact be illegal to do.
396 */
397 if (!atomic_read(&inode->i_count))
398 continue;
399
400 need_iput_tmp = need_iput;
401 need_iput = NULL;
402 /* In case inotify_remove_watch_locked() drops a reference. */
403 if (inode != need_iput_tmp)
404 __iget(inode);
405 else
406 need_iput_tmp = NULL;
407 /* In case the dropping of a reference would nuke next_i. */
408 if ((&next_i->i_sb_list != list) &&
409 atomic_read(&next_i->i_count) &&
410 !(next_i->i_state & (I_CLEAR | I_FREEING |
411 I_WILL_FREE))) {
412 __iget(next_i);
413 need_iput = next_i;
414 }
415
416 /*
417 * We can safely drop inode_lock here because we hold
418 * references on both inode and next_i. Also no new inodes
419 * will be added since the umount has begun. Finally,
420 * iprune_mutex keeps shrink_icache_memory() away.
421 */
422 spin_unlock(&inode_lock);
423
424 if (need_iput_tmp)
425 iput(need_iput_tmp);
426
427 /* for each watch, send IN_UNMOUNT and then remove it */
428 mutex_lock(&inode->inotify_mutex);
429 watches = &inode->inotify_watches;
430 list_for_each_entry_safe(watch, next_w, watches, i_list) {
 431			struct inotify_handle *ih = watch->ih;
432 get_inotify_watch(watch);
433 mutex_lock(&ih->mutex);
434 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
435 NULL, NULL);
436 inotify_remove_watch_locked(ih, watch);
437 mutex_unlock(&ih->mutex);
438 put_inotify_watch(watch);
439 }
440 mutex_unlock(&inode->inotify_mutex);
441 iput(inode);
442
443 spin_lock(&inode_lock);
444 }
445}
446EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
447
448/**
449 * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
450 * @inode: inode that is about to be removed
451 */
452void inotify_inode_is_dead(struct inode *inode)
453{
454 struct inotify_watch *watch, *next;
455
456 mutex_lock(&inode->inotify_mutex);
457 list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
458 struct inotify_handle *ih = watch->ih;
459 mutex_lock(&ih->mutex);
460 inotify_remove_watch_locked(ih, watch);
461 mutex_unlock(&ih->mutex);
462 }
463 mutex_unlock(&inode->inotify_mutex);
464}
465EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
466
467/* Kernel Consumer API */
468
469/**
470 * inotify_init - allocate and initialize an inotify instance
471 * @ops: caller's inotify operations
472 */
473struct inotify_handle *inotify_init(const struct inotify_operations *ops)
474{
475 struct inotify_handle *ih;
476
477 ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
478 if (unlikely(!ih))
479 return ERR_PTR(-ENOMEM);
480
481 idr_init(&ih->idr);
482 INIT_LIST_HEAD(&ih->watches);
483 mutex_init(&ih->mutex);
484 ih->last_wd = 0;
485 ih->in_ops = ops;
486 atomic_set(&ih->count, 0);
487 get_inotify_handle(ih);
488
489 return ih;
490}
491EXPORT_SYMBOL_GPL(inotify_init);
492
493/**
494 * inotify_init_watch - initialize an inotify watch
495 * @watch: watch to initialize
496 */
497void inotify_init_watch(struct inotify_watch *watch)
498{
499 INIT_LIST_HEAD(&watch->h_list);
500 INIT_LIST_HEAD(&watch->i_list);
501 atomic_set(&watch->count, 0);
502 get_inotify_watch(watch); /* initial get */
503}
504EXPORT_SYMBOL_GPL(inotify_init_watch);
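
Since struct inotify_watch is caller-allocated, consumers embed it in their own per-object state and recover the container with container_of() inside the handle_event callback; the callback's six-argument shape can be read off the in_ops->handle_event() call in inotify_unmount_inodes() above. A sketch with hypothetical names (struct my_watch, my_handle_event and my_destroy_watch are not in-tree):

	/* Hypothetical consumer-side embedding. */
	struct my_watch {
		struct inotify_watch watch;	/* set up via inotify_init_watch() */
		void *private;			/* consumer state */
	};

	static void my_handle_event(struct inotify_watch *watch, u32 wd,
				    u32 mask, u32 cookie, const char *name,
				    struct inode *inode)
	{
		struct my_watch *mw = container_of(watch, struct my_watch, watch);

		/* ... consume the event using mw->private ... */
	}

	static void my_destroy_watch(struct inotify_watch *watch)
	{
		kfree(container_of(watch, struct my_watch, watch));
	}

	static const struct inotify_operations my_ops = {
		.handle_event	= my_handle_event,
		.destroy_watch	= my_destroy_watch,
	};
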
505
506/*
507 * Watch removals suck violently. To kick the watch out we need (in this
508 * order) inode->inotify_mutex and ih->mutex. That's fine if we have
509 * a hold on inode; however, for all other cases we need to make damn sure
510 * we don't race with umount. We can *NOT* just grab a reference to a
511 * watch - inotify_unmount_inodes() will happily sail past it and we'll end
512 * up with a reference to an inode potentially outliving its superblock. Ideally
513 * we just want to grab an active reference to superblock if we can; that
514 * will make sure we won't go into inotify_unmount_inodes() until we are
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut
519 * down and the watch in question is pining for fjords. That's fine, but
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_unmount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with.
549 */
550
551/**
552 * pin_to_kill - pin the watch down for removal
553 * @ih: inotify handle
554 * @watch: watch to kill
555 *
556 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{
565 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567
568 spin_lock(&sb_lock);
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */
575 }
576 sb->s_count++;
577 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) {
581 /* fs is already shut down; the watch is dead */
582 drop_super(sb);
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597}
598
599static void unpin_and_kill(struct inotify_watch *watch, int how)
600{
601 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch);
603 switch (how) {
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610}
611
612/**
613 * inotify_destroy - clean up and destroy an inotify instance
614 * @ih: inotify handle
615 */
616void inotify_destroy(struct inotify_handle *ih)
617{
618 /*
619 * Destroy all of the watches for this handle. Unfortunately, not very
620 * pretty. We cannot do a simple iteration over the list, because we
621 * do not know the inode until we iterate to the watch. But we need to
622 * hold inode->inotify_mutex before ih->mutex. The following works.
623 *
624 * AV: it had to become even uglier to start working ;-/
625 */
626 while (1) {
627 struct inotify_watch *watch;
628 struct list_head *watches;
629 struct super_block *sb;
630 struct inode *inode;
631 int how;
632
633 mutex_lock(&ih->mutex);
634 watches = &ih->watches;
635 if (list_empty(watches)) {
636 mutex_unlock(&ih->mutex);
637 break;
638 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch);
642 if (!how)
643 continue;
644
645 inode = watch->inode;
646 mutex_lock(&inode->inotify_mutex);
647 mutex_lock(&ih->mutex);
648
649 /* make sure we didn't race with another list removal */
650 if (likely(idr_find(&ih->idr, watch->wd))) {
651 remove_watch_no_event(watch, ih);
652 put_inotify_watch(watch);
653 }
654
655 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how);
658 }
659
660 /* free this handle: the put matching the get in inotify_init() */
661 put_inotify_handle(ih);
662}
663EXPORT_SYMBOL_GPL(inotify_destroy);
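
For a consumer, the lifetime pairing is inotify_init() at setup and inotify_destroy() at teardown; the final put above then releases the handle once every watch's reference to it is gone. Continuing the hypothetical my_ops sketch from earlier:

	/* Hypothetical consumer lifetime; my_ops must outlive the handle. */
	static struct inotify_handle *my_ih;

	static int my_setup(void)
	{
		my_ih = inotify_init(&my_ops);
		return IS_ERR(my_ih) ? PTR_ERR(my_ih) : 0;
	}

	static void my_teardown(void)
	{
		inotify_destroy(my_ih);	/* kills every watch, drops the handle */
	}
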
664
665/**
666 * inotify_find_watch - find an existing watch for an (ih,inode) pair
667 * @ih: inotify handle
668 * @inode: inode to watch
669 * @watchp: pointer to existing inotify_watch
670 *
671 * Caller must pin given inode (via nameidata).
672 */
673s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
674 struct inotify_watch **watchp)
675{
676 struct inotify_watch *old;
677 int ret = -ENOENT;
678
679 mutex_lock(&inode->inotify_mutex);
680 mutex_lock(&ih->mutex);
681
682 old = inode_find_handle(inode, ih);
683 if (unlikely(old)) {
684 get_inotify_watch(old); /* caller must put watch */
685 *watchp = old;
686 ret = old->wd;
687 }
688
689 mutex_unlock(&ih->mutex);
690 mutex_unlock(&inode->inotify_mutex);
691
692 return ret;
693}
694EXPORT_SYMBOL_GPL(inotify_find_watch);
695
696/**
697 * inotify_find_update_watch - find and update the mask of an existing watch
698 * @ih: inotify handle
699 * @inode: inode's watch to update
700 * @mask: mask of events to watch
701 *
702 * Caller must pin given inode (via nameidata).
703 */
704s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
705 u32 mask)
706{
707 struct inotify_watch *old;
708 int mask_add = 0;
709 int ret;
710
711 if (mask & IN_MASK_ADD)
712 mask_add = 1;
713
714 /* don't allow invalid bits: we don't want flags set */
715 mask &= IN_ALL_EVENTS | IN_ONESHOT;
716 if (unlikely(!mask))
717 return -EINVAL;
718
719 mutex_lock(&inode->inotify_mutex);
720 mutex_lock(&ih->mutex);
721
722 /*
723 * Handle the case of re-adding a watch on an (inode,ih) pair that we
724 * are already watching. We just update the mask and return its wd.
725 */
726 old = inode_find_handle(inode, ih);
727 if (unlikely(!old)) {
728 ret = -ENOENT;
729 goto out;
730 }
731
732 if (mask_add)
733 old->mask |= mask;
734 else
735 old->mask = mask;
736 ret = old->wd;
737out:
738 mutex_unlock(&ih->mutex);
739 mutex_unlock(&inode->inotify_mutex);
740 return ret;
741}
742EXPORT_SYMBOL_GPL(inotify_find_update_watch);
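
IN_MASK_ADD selects union semantics instead of replacement, mirroring the flag of the same name in the inotify_add_watch(2) user interface. For example, with a pinned inode (values illustrative):

	/* Replace whatever mask the existing watch had: */
	wd = inotify_find_update_watch(ih, inode, IN_MODIFY);

	/* Or fold IN_ATTRIB into the existing mask: */
	wd = inotify_find_update_watch(ih, inode, IN_ATTRIB | IN_MASK_ADD);
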
743
744/**
745 * inotify_add_watch - add a watch to an inotify instance
746 * @ih: inotify handle
747 * @watch: caller allocated watch structure
748 * @inode: inode to watch
749 * @mask: mask of events to watch
750 *
751 * Caller must pin given inode (via nameidata).
752 * Caller must ensure it only calls inotify_add_watch() once per watch.
753 * Calls inotify_handle_get_wd() so may sleep.
754 */
755s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
756 struct inode *inode, u32 mask)
757{
758 int ret = 0;
759 int newly_watched;
760
761 /* don't allow invalid bits: we don't want flags set */
762 mask &= IN_ALL_EVENTS | IN_ONESHOT;
763 if (unlikely(!mask))
764 return -EINVAL;
765 watch->mask = mask;
766
767 mutex_lock(&inode->inotify_mutex);
768 mutex_lock(&ih->mutex);
769
770 /* Initialize a new watch */
771 ret = inotify_handle_get_wd(ih, watch);
772 if (unlikely(ret))
773 goto out;
774 ret = watch->wd;
775
776 /* save a reference to handle and bump the count to make it official */
777 get_inotify_handle(ih);
778 watch->ih = ih;
779
780 /*
781 * Save a reference to the inode and bump the ref count to make it
782 * official. We hold a reference to nameidata, which makes this safe.
783 */
784 watch->inode = igrab(inode);
785
786 /* Add the watch to the handle's and the inode's list */
787 newly_watched = !inotify_inode_watched(inode);
788 list_add(&watch->h_list, &ih->watches);
789 list_add(&watch->i_list, &inode->inotify_watches);
790 /*
791	 * Set child flags _after_ adding the watch, so there are no race
792 * windows where newly instantiated children could miss their parent's
793 * watched flag.
794 */
795 if (newly_watched)
796 set_dentry_child_flags(inode, 1);
797
798out:
799 mutex_unlock(&ih->mutex);
800 mutex_unlock(&inode->inotify_mutex);
801 return ret;
802}
803EXPORT_SYMBOL_GPL(inotify_add_watch);
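
Together with inotify_find_update_watch(), this yields the usual update-or-create flow: try to update an existing watch first, and only allocate and add a new one on -ENOENT. A sketch in which alloc_my_watch() and free_my_watch() are hypothetical consumer-specific helpers:

	/* Hypothetical helper; the caller must already have pinned inode. */
	static s32 example_watch_inode(struct inotify_handle *ih,
				       struct inode *inode, u32 mask)
	{
		struct inotify_watch *watch;
		s32 wd;

		wd = inotify_find_update_watch(ih, inode, mask);
		if (wd != -ENOENT)
			return wd;	/* updated an existing watch, or hard error */

		watch = alloc_my_watch();	/* hypothetical allocation */
		if (!watch)
			return -ENOMEM;
		inotify_init_watch(watch);
		wd = inotify_add_watch(ih, watch, inode, mask);
		if (wd < 0)
			free_my_watch(watch);	/* never went live; free directly */
		return wd;
	}
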
804
805/**
806 * inotify_clone_watch - put the watch next to existing one
807 * @old: already installed watch
808 * @new: new watch
809 *
810 * Caller must hold the inotify_mutex of inode we are dealing with;
811 * it is expected to remove the old watch before unlocking the inode.
812 */
813s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
814{
815 struct inotify_handle *ih = old->ih;
816 int ret = 0;
817
818 new->mask = old->mask;
819 new->ih = ih;
820
821 mutex_lock(&ih->mutex);
822
823 /* Initialize a new watch */
824 ret = inotify_handle_get_wd(ih, new);
825 if (unlikely(ret))
826 goto out;
827 ret = new->wd;
828
829 get_inotify_handle(ih);
830
831 new->inode = igrab(old->inode);
832
833 list_add(&new->h_list, &ih->watches);
834 list_add(&new->i_list, &old->inode->inotify_watches);
835out:
836 mutex_unlock(&ih->mutex);
837 return ret;
838}
839
840void inotify_evict_watch(struct inotify_watch *watch)
841{
842 get_inotify_watch(watch);
843 mutex_lock(&watch->ih->mutex);
844 inotify_remove_watch_locked(watch->ih, watch);
845 mutex_unlock(&watch->ih->mutex);
846}
847
848/**
849 * inotify_rm_wd - remove a watch from an inotify instance
850 * @ih: inotify handle
851 * @wd: watch descriptor to remove
852 *
853 * Can sleep.
854 */
855int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
856{
857 struct inotify_watch *watch;
858 struct super_block *sb;
859 struct inode *inode;
860 int how;
861
862 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd);
864 if (unlikely(!watch)) {
865 mutex_unlock(&ih->mutex);
866 return -EINVAL;
867 }
868 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch);
870 if (!how)
871 return 0;
872
873 inode = watch->inode;
874
875 mutex_lock(&inode->inotify_mutex);
876 mutex_lock(&ih->mutex);
877
878 /* make sure that we did not race */
879 if (likely(idr_find(&ih->idr, wd) == watch))
880 inotify_remove_watch_locked(ih, watch);
881
882 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how);
885
886 return 0;
887}
888EXPORT_SYMBOL_GPL(inotify_rm_wd);
889
890/**
891 * inotify_rm_watch - remove a watch from an inotify instance
892 * @ih: inotify handle
893 * @watch: watch to remove
894 *
895 * Can sleep.
896 */
897int inotify_rm_watch(struct inotify_handle *ih,
898 struct inotify_watch *watch)
899{
900 return inotify_rm_wd(ih, watch->wd);
901}
902EXPORT_SYMBOL_GPL(inotify_rm_watch);
903
904/*
905 * inotify_setup - core initialization function
906 */
907static int __init inotify_setup(void)
908{
909 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
910 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
911 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
912 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
913 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
914 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
915 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
916 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
917 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
918 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
919 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
920 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
921 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
922
923 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
924 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
925 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
926 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
927
928 atomic_set(&inotify_cookie, 0);
929
930 return 0;
931}
932
933module_init(inotify_setup);
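
The wall of BUILD_BUG_ON()s works because inotify deliberately reuses the FS_* bit values for its userspace-visible IN_* constants, so masks cross the fsnotify boundary with no translation table, and any future drift breaks the build rather than silently corrupting events. The same trick guards any compile-time invariant, for instance (a hypothetical check, to be placed in some init function):

	/* Hypothetical: the fixed header of an event is everything before name[]. */
	BUILD_BUG_ON(sizeof(struct inotify_event) !=
		     offsetof(struct inotify_event, name));
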
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a4c8ca..b6642e4de4bf 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -9,13 +9,12 @@ struct inotify_event_private_data {
9 int wd; 9 int wd;
10}; 10};
11 11
12struct inotify_inode_mark_entry { 12struct inotify_inode_mark {
13 /* fsnotify_mark_entry MUST be the first thing */ 13 struct fsnotify_mark fsn_mark;
14 struct fsnotify_mark_entry fsn_entry;
15 int wd; 14 int wd;
16}; 15};
17 16
18extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 17extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
19 struct fsnotify_group *group); 18 struct fsnotify_group *group);
20extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); 19extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
21 20
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960cd76ab..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -22,6 +22,7 @@
22 * General Public License for more details. 22 * General Public License for more details.
23 */ 23 */
24 24
25#include <linux/dcache.h> /* d_unlinked */
25#include <linux/fs.h> /* struct inode */ 26#include <linux/fs.h> /* struct inode */
26#include <linux/fsnotify_backend.h> 27#include <linux/fsnotify_backend.h>
27#include <linux/inotify.h> 28#include <linux/inotify.h>
@@ -32,26 +33,84 @@
32 33
33#include "inotify.h" 34#include "inotify.h"
34 35
35static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) 36/*
37 * Check if 2 events contain the same information. We do not compare private data
38 * but at this moment that isn't a problem for any known fsnotify listeners.
39 */
40static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
41{
42 if ((old->mask == new->mask) &&
43 (old->to_tell == new->to_tell) &&
44 (old->data_type == new->data_type) &&
45 (old->name_len == new->name_len)) {
46 switch (old->data_type) {
47 case (FSNOTIFY_EVENT_INODE):
48 /* remember, after old was put on the wait_q we aren't
49 * allowed to look at the inode any more, only thing
50 * left to check was if the file_name is the same */
51 if (!old->name_len ||
52 !strcmp(old->file_name, new->file_name))
53 return true;
54 break;
55 case (FSNOTIFY_EVENT_PATH):
56 if ((old->path.mnt == new->path.mnt) &&
57 (old->path.dentry == new->path.dentry))
58 return true;
59 break;
60 case (FSNOTIFY_EVENT_NONE):
61 if (old->mask & FS_Q_OVERFLOW)
62 return true;
63 else if (old->mask & FS_IN_IGNORED)
64 return false;
65 return true;
66 };
67 }
68 return false;
69}
70
71static struct fsnotify_event *inotify_merge(struct list_head *list,
72 struct fsnotify_event *event)
36{ 73{
37 struct fsnotify_mark_entry *entry; 74 struct fsnotify_event_holder *last_holder;
38 struct inotify_inode_mark_entry *ientry; 75 struct fsnotify_event *last_event;
76
77 /* and the list better be locked by something too */
78 spin_lock(&event->lock);
79
80 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
81 last_event = last_holder->event;
82 if (event_compare(last_event, event))
83 fsnotify_get_event(last_event);
84 else
85 last_event = NULL;
86
87 spin_unlock(&event->lock);
88
89 return last_event;
90}
91
92static int inotify_handle_event(struct fsnotify_group *group,
93 struct fsnotify_mark *inode_mark,
94 struct fsnotify_mark *vfsmount_mark,
95 struct fsnotify_event *event)
96{
97 struct inotify_inode_mark *i_mark;
39 struct inode *to_tell; 98 struct inode *to_tell;
40 struct inotify_event_private_data *event_priv; 99 struct inotify_event_private_data *event_priv;
41 struct fsnotify_event_private_data *fsn_event_priv; 100 struct fsnotify_event_private_data *fsn_event_priv;
42 int wd, ret; 101 struct fsnotify_event *added_event;
102 int wd, ret = 0;
103
104 BUG_ON(vfsmount_mark);
105
106 pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
107 event, event->to_tell, event->mask);
43 108
44 to_tell = event->to_tell; 109 to_tell = event->to_tell;
45 110
46 spin_lock(&to_tell->i_lock); 111 i_mark = container_of(inode_mark, struct inotify_inode_mark,
47 entry = fsnotify_find_mark_entry(group, to_tell); 112 fsn_mark);
48 spin_unlock(&to_tell->i_lock); 113 wd = i_mark->wd;
49	/* race with watch removal? We already passed should_send */
50 if (unlikely(!entry))
51 return 0;
52 ientry = container_of(entry, struct inotify_inode_mark_entry,
53 fsn_entry);
54 wd = ientry->wd;
55 114
56 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 115 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
57 if (unlikely(!event_priv)) 116 if (unlikely(!event_priv))
@@ -62,48 +121,40 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
62 fsn_event_priv->group = group; 121 fsn_event_priv->group = group;
63 event_priv->wd = wd; 122 event_priv->wd = wd;
64 123
65 ret = fsnotify_add_notify_event(group, event, fsn_event_priv); 124 added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
66 if (ret) { 125 if (added_event) {
67 inotify_free_event_priv(fsn_event_priv); 126 inotify_free_event_priv(fsn_event_priv);
68 /* EEXIST says we tail matched, EOVERFLOW isn't something 127 if (!IS_ERR(added_event))
69 * to report up the stack. */ 128 fsnotify_put_event(added_event);
70 if ((ret == -EEXIST) || 129 else
71 (ret == -EOVERFLOW)) 130 ret = PTR_ERR(added_event);
72 ret = 0;
73 } 131 }
74 132
75 /* 133 if (inode_mark->mask & IN_ONESHOT)
76 * If we hold the entry until after the event is on the queue 134 fsnotify_destroy_mark(inode_mark);
77 * IN_IGNORED won't be able to pass this event in the queue
78 */
79 fsnotify_put_mark(entry);
80 135
81 return ret; 136 return ret;
82} 137}
83 138
84static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) 139static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
85{ 140{
86 inotify_ignored_and_remove_idr(entry, group); 141 inotify_ignored_and_remove_idr(fsn_mark, group);
87} 142}
88 143
89static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) 144static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
145 struct fsnotify_mark *inode_mark,
146 struct fsnotify_mark *vfsmount_mark,
147 __u32 mask, void *data, int data_type)
90{ 148{
91 struct fsnotify_mark_entry *entry; 149 if ((inode_mark->mask & FS_EXCL_UNLINK) &&
92 bool send; 150 (data_type == FSNOTIFY_EVENT_PATH)) {
93 151 struct path *path = data;
94 spin_lock(&inode->i_lock);
95 entry = fsnotify_find_mark_entry(group, inode);
96 spin_unlock(&inode->i_lock);
97 if (!entry)
98 return false;
99 152
100 mask = (mask & ~FS_EVENT_ON_CHILD); 153 if (d_unlinked(path->dentry))
101 send = (entry->mask & mask); 154 return false;
102 155 }
103 /* find took a reference */
104 fsnotify_put_mark(entry);
105 156
106 return send; 157 return true;
107} 158}
108 159
109/* 160/*
@@ -115,18 +166,18 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
115 */ 166 */
116static int idr_callback(int id, void *p, void *data) 167static int idr_callback(int id, void *p, void *data)
117{ 168{
118 struct fsnotify_mark_entry *entry; 169 struct fsnotify_mark *fsn_mark;
119 struct inotify_inode_mark_entry *ientry; 170 struct inotify_inode_mark *i_mark;
120 static bool warned = false; 171 static bool warned = false;
121 172
122 if (warned) 173 if (warned)
123 return 0; 174 return 0;
124 175
125 warned = true; 176 warned = true;
126 entry = p; 177 fsn_mark = p;
127 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 178 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
128 179
129 WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " 180 WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
130 "idr. Probably leaking memory\n", id, p, data); 181 "idr. Probably leaking memory\n", id, p, data);
131 182
132 /* 183 /*
@@ -135,9 +186,9 @@ static int idr_callback(int id, void *p, void *data)
135 * out why we got here and the panic is no worse than the original 186 * out why we got here and the panic is no worse than the original
136 * BUG() that was here. 187 * BUG() that was here.
137 */ 188 */
138 if (entry) 189 if (fsn_mark)
139 printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", 190 printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
140 entry->group, entry->inode, ientry->wd); 191 fsn_mark->group, fsn_mark->i.inode, i_mark->wd);
141 return 0; 192 return 0;
142} 193}
143 194
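
The merge hook added above gives inotify O(1) event coalescing: the core calls it with the notification list locked and non-empty, only the most recently queued event is ever a candidate, and a non-NULL return tells the caller to drop the new event in favor of the old one. Stripped to its shape (a sketch of the contract, not the exact locking):

	/* Sketch; event_compare() is the helper introduced above. */
	static struct fsnotify_event *tail_merge(struct list_head *list,
						 struct fsnotify_event *new)
	{
		struct fsnotify_event_holder *h;
		struct fsnotify_event *last;

		h = list_entry(list->prev, struct fsnotify_event_holder,
			       event_list);
		last = h->event;
		if (!event_compare(last, new))
			return NULL;		/* no match: queue new normally */
		fsnotify_get_event(last);	/* caller puts this reference */
		return last;			/* new event is never queued */
	}
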
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e46ca685b9be..bf7f6d776c31 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -46,17 +46,11 @@
46/* these are configurable via /proc/sys/fs/inotify/ */ 46/* these are configurable via /proc/sys/fs/inotify/ */
47static int inotify_max_user_instances __read_mostly; 47static int inotify_max_user_instances __read_mostly;
48static int inotify_max_queued_events __read_mostly; 48static int inotify_max_queued_events __read_mostly;
49int inotify_max_user_watches __read_mostly; 49static int inotify_max_user_watches __read_mostly;
50 50
51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 51static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
52struct kmem_cache *event_priv_cachep __read_mostly; 52struct kmem_cache *event_priv_cachep __read_mostly;
53 53
54/*
55 * When inotify registers a new group it increments this and uses that
56 * value as an offset to set the fsnotify group "name" and priority.
57 */
58static atomic_t inotify_grp_num;
59
60#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
61 55
62#include <linux/sysctl.h> 56#include <linux/sysctl.h>
@@ -96,11 +90,14 @@ static inline __u32 inotify_arg_to_mask(u32 arg)
96{ 90{
97 __u32 mask; 91 __u32 mask;
98 92
99 /* everything should accept their own ignored and cares about children */ 93 /*
100 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); 94 * everything should accept their own ignored, cares about children,
95 * and should receive events when the inode is unmounted
96 */
97 mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT);
101 98
102 /* mask off the flags used to open the fd */ 99 /* mask off the flags used to open the fd */
103 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); 100 mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
104 101
105 return mask; 102 return mask;
106} 103}
@@ -144,6 +141,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
144 141
145 event = fsnotify_peek_notify_event(group); 142 event = fsnotify_peek_notify_event(group);
146 143
144 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
145
147 if (event->name_len) 146 if (event->name_len)
148 event_size += roundup(event->name_len + 1, event_size); 147 event_size += roundup(event->name_len + 1, event_size);
149 148
@@ -173,6 +172,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
173 size_t event_size = sizeof(struct inotify_event); 172 size_t event_size = sizeof(struct inotify_event);
174 size_t name_len = 0; 173 size_t name_len = 0;
175 174
175 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
176
176 /* we get the inotify watch descriptor from the event private data */ 177 /* we get the inotify watch descriptor from the event private data */
177 spin_lock(&event->lock); 178 spin_lock(&event->lock);
178 fsn_priv = fsnotify_remove_priv_from_event(group, event); 179 fsn_priv = fsnotify_remove_priv_from_event(group, event);
@@ -245,6 +246,8 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
245 kevent = get_one_event(group, count); 246 kevent = get_one_event(group, count);
246 mutex_unlock(&group->notification_mutex); 247 mutex_unlock(&group->notification_mutex);
247 248
249 pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent);
250
248 if (kevent) { 251 if (kevent) {
249 ret = PTR_ERR(kevent); 252 ret = PTR_ERR(kevent);
250 if (IS_ERR(kevent)) 253 if (IS_ERR(kevent))
@@ -289,6 +292,8 @@ static int inotify_release(struct inode *ignored, struct file *file)
289 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
290 struct user_struct *user = group->inotify_data.user; 293 struct user_struct *user = group->inotify_data.user;
291 294
295 pr_debug("%s: group=%p\n", __func__, group);
296
292 fsnotify_clear_marks_by_group(group); 297 fsnotify_clear_marks_by_group(group);
293 298
294 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
@@ -312,6 +317,8 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
312 group = file->private_data; 317 group = file->private_data;
313 p = (void __user *) arg; 318 p = (void __user *) arg;
314 319
320 pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
321
315 switch (cmd) { 322 switch (cmd) {
316 case FIONREAD: 323 case FIONREAD:
317 mutex_lock(&group->notification_mutex); 324 mutex_lock(&group->notification_mutex);
@@ -357,59 +364,159 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
357 return error; 364 return error;
358} 365}
359 366
367static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
368 int *last_wd,
369 struct inotify_inode_mark *i_mark)
370{
371 int ret;
372
373 do {
374 if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
375 return -ENOMEM;
376
377 spin_lock(idr_lock);
378 ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
379 &i_mark->wd);
380 /* we added the mark to the idr, take a reference */
381 if (!ret) {
382 *last_wd = i_mark->wd;
383 fsnotify_get_mark(&i_mark->fsn_mark);
384 }
385 spin_unlock(idr_lock);
386 } while (ret == -EAGAIN);
387
388 return ret;
389}
390
391static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
392 int wd)
393{
394 struct idr *idr = &group->inotify_data.idr;
395 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
396 struct inotify_inode_mark *i_mark;
397
398 assert_spin_locked(idr_lock);
399
400 i_mark = idr_find(idr, wd);
401 if (i_mark) {
402 struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark;
403
404 fsnotify_get_mark(fsn_mark);
405 /* One ref for being in the idr, one ref we just took */
406 BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
407 }
408
409 return i_mark;
410}
411
412static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
413 int wd)
414{
415 struct inotify_inode_mark *i_mark;
416 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
417
418 spin_lock(idr_lock);
419 i_mark = inotify_idr_find_locked(group, wd);
420 spin_unlock(idr_lock);
421
422 return i_mark;
423}
424
425static void do_inotify_remove_from_idr(struct fsnotify_group *group,
426 struct inotify_inode_mark *i_mark)
427{
428 struct idr *idr = &group->inotify_data.idr;
429 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
430 int wd = i_mark->wd;
431
432 assert_spin_locked(idr_lock);
433
434 idr_remove(idr, wd);
435
436 /* removed from the idr, drop that ref */
437 fsnotify_put_mark(&i_mark->fsn_mark);
438}
439
360/* 440/*
361 * Remove the mark from the idr (if present) and drop the reference 441 * Remove the mark from the idr (if present) and drop the reference
362 * on the mark because it was in the idr. 442 * on the mark because it was in the idr.
363 */ 443 */
364static void inotify_remove_from_idr(struct fsnotify_group *group, 444static void inotify_remove_from_idr(struct fsnotify_group *group,
365 struct inotify_inode_mark_entry *ientry) 445 struct inotify_inode_mark *i_mark)
366{ 446{
367 struct idr *idr; 447 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
368 struct fsnotify_mark_entry *entry; 448 struct inotify_inode_mark *found_i_mark = NULL;
369 struct inotify_inode_mark_entry *found_ientry;
370 int wd; 449 int wd;
371 450
372 spin_lock(&group->inotify_data.idr_lock); 451 spin_lock(idr_lock);
373 idr = &group->inotify_data.idr; 452 wd = i_mark->wd;
374 wd = ientry->wd;
375 453
376 if (wd == -1) 454 /*
455 * does this i_mark think it is in the idr? we shouldn't get called
456 * if it wasn't....
457 */
458 if (wd == -1) {
459 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
460 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
461 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
377 goto out; 462 goto out;
463 }
378 464
379 entry = idr_find(&group->inotify_data.idr, wd); 465 /* Lets look in the idr to see if we find it */
380 if (unlikely(!entry)) 466 found_i_mark = inotify_idr_find_locked(group, wd);
467 if (unlikely(!found_i_mark)) {
468 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
469 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
470 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
381 goto out; 471 goto out;
472 }
382 473
383 found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 474 /*
384 if (unlikely(found_ientry != ientry)) { 475 * We found an mark in the idr at the right wd, but it's
385 /* We found an entry in the idr with the right wd, but it's 476 * not the mark we were told to remove. eparis seriously
386 * not the entry we were told to remove. eparis seriously 477 * fucked up somewhere.
387 * fucked up somewhere. */ 478 */
388 WARN_ON(1); 479 if (unlikely(found_i_mark != i_mark)) {
389 ientry->wd = -1; 480 WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
481 "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
482 "found_i_mark->group=%p found_i_mark->inode=%p\n",
483 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
484 i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd,
485 found_i_mark->fsn_mark.group,
486 found_i_mark->fsn_mark.i.inode);
390 goto out; 487 goto out;
391 } 488 }
392 489
393 /* One ref for being in the idr, one ref held by the caller */ 490 /*
394 BUG_ON(atomic_read(&entry->refcnt) < 2); 491 * One ref for being in the idr
395 492 * one ref held by the caller trying to kill us
396 idr_remove(idr, wd); 493 * one ref grabbed by inotify_idr_find
397 ientry->wd = -1; 494 */
495 if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
496 printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
497 " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
498 i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode);
499 /* we can't really recover with bad ref cnting.. */
500 BUG();
501 }
398 502
399 /* removed from the idr, drop that ref */ 503 do_inotify_remove_from_idr(group, i_mark);
400 fsnotify_put_mark(entry);
401out: 504out:
402 spin_unlock(&group->inotify_data.idr_lock); 505 /* match the ref taken by inotify_idr_find_locked() */
506 if (found_i_mark)
507 fsnotify_put_mark(&found_i_mark->fsn_mark);
508 i_mark->wd = -1;
509 spin_unlock(idr_lock);
403} 510}
404 511
405/* 512/*
406 * Send IN_IGNORED for this wd, remove this wd from the idr. 513 * Send IN_IGNORED for this wd, remove this wd from the idr.
407 */ 514 */
408void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, 515void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
409 struct fsnotify_group *group) 516 struct fsnotify_group *group)
410{ 517{
411 struct inotify_inode_mark_entry *ientry; 518 struct inotify_inode_mark *i_mark;
412 struct fsnotify_event *ignored_event; 519 struct fsnotify_event *ignored_event, *notify_event;
413 struct inotify_event_private_data *event_priv; 520 struct inotify_event_private_data *event_priv;
414 struct fsnotify_event_private_data *fsn_event_priv; 521 struct fsnotify_event_private_data *fsn_event_priv;
415 int ret; 522 int ret;
@@ -420,7 +527,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
420 if (!ignored_event) 527 if (!ignored_event)
421 return; 528 return;
422 529
423 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 530 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
424 531
425 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 532 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
426 if (unlikely(!event_priv)) 533 if (unlikely(!event_priv))
@@ -429,37 +536,44 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
429 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 536 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
430 537
431 fsn_event_priv->group = group; 538 fsn_event_priv->group = group;
432 event_priv->wd = ientry->wd; 539 event_priv->wd = i_mark->wd;
433 540
434 ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); 541 notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
435 if (ret) 542 if (notify_event) {
543 if (IS_ERR(notify_event))
544 ret = PTR_ERR(notify_event);
545 else
546 fsnotify_put_event(notify_event);
436 inotify_free_event_priv(fsn_event_priv); 547 inotify_free_event_priv(fsn_event_priv);
548 }
437 549
438skip_send_ignore: 550skip_send_ignore:
439 551
440 /* matches the reference taken when the event was created */ 552 /* matches the reference taken when the event was created */
441 fsnotify_put_event(ignored_event); 553 fsnotify_put_event(ignored_event);
442 554
443 /* remove this entry from the idr */ 555 /* remove this mark from the idr */
444 inotify_remove_from_idr(group, ientry); 556 inotify_remove_from_idr(group, i_mark);
445 557
446 atomic_dec(&group->inotify_data.user->inotify_watches); 558 atomic_dec(&group->inotify_data.user->inotify_watches);
447} 559}
448 560
449/* ding dong the mark is dead */ 561/* ding dong the mark is dead */
450static void inotify_free_mark(struct fsnotify_mark_entry *entry) 562static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
451{ 563{
452 struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; 564 struct inotify_inode_mark *i_mark;
565
566 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
453 567
454 kmem_cache_free(inotify_inode_mark_cachep, ientry); 568 kmem_cache_free(inotify_inode_mark_cachep, i_mark);
455} 569}
456 570
457static int inotify_update_existing_watch(struct fsnotify_group *group, 571static int inotify_update_existing_watch(struct fsnotify_group *group,
458 struct inode *inode, 572 struct inode *inode,
459 u32 arg) 573 u32 arg)
460{ 574{
461 struct fsnotify_mark_entry *entry; 575 struct fsnotify_mark *fsn_mark;
462 struct inotify_inode_mark_entry *ientry; 576 struct inotify_inode_mark *i_mark;
463 __u32 old_mask, new_mask; 577 __u32 old_mask, new_mask;
464 __u32 mask; 578 __u32 mask;
465 int add = (arg & IN_MASK_ADD); 579 int add = (arg & IN_MASK_ADD);
@@ -467,52 +581,43 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
467 581
468 /* don't allow invalid bits: we don't want flags set */ 582 /* don't allow invalid bits: we don't want flags set */
469 mask = inotify_arg_to_mask(arg); 583 mask = inotify_arg_to_mask(arg);
470 if (unlikely(!mask)) 584 if (unlikely(!(mask & IN_ALL_EVENTS)))
471 return -EINVAL; 585 return -EINVAL;
472 586
473 spin_lock(&inode->i_lock); 587 fsn_mark = fsnotify_find_inode_mark(group, inode);
474 entry = fsnotify_find_mark_entry(group, inode); 588 if (!fsn_mark)
475 spin_unlock(&inode->i_lock);
476 if (!entry)
477 return -ENOENT; 589 return -ENOENT;
478 590
479 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 591 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
480 592
481 spin_lock(&entry->lock); 593 spin_lock(&fsn_mark->lock);
482 594
483 old_mask = entry->mask; 595 old_mask = fsn_mark->mask;
484 if (add) { 596 if (add)
485 entry->mask |= mask; 597 fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
486 new_mask = entry->mask; 598 else
487 } else { 599 fsnotify_set_mark_mask_locked(fsn_mark, mask);
488 entry->mask = mask; 600 new_mask = fsn_mark->mask;
489 new_mask = entry->mask;
490 }
491 601
492 spin_unlock(&entry->lock); 602 spin_unlock(&fsn_mark->lock);
493 603
494 if (old_mask != new_mask) { 604 if (old_mask != new_mask) {
495 /* more bits in old than in new? */ 605 /* more bits in old than in new? */
496 int dropped = (old_mask & ~new_mask); 606 int dropped = (old_mask & ~new_mask);
497 /* more bits in this entry than the inode's mask? */ 607 /* more bits in this fsn_mark than the inode's mask? */
498 int do_inode = (new_mask & ~inode->i_fsnotify_mask); 608 int do_inode = (new_mask & ~inode->i_fsnotify_mask);
499 /* more bits in this entry than the group? */
500 int do_group = (new_mask & ~group->mask);
501 609
502 /* update the inode with this new entry */ 610 /* update the inode with this new fsn_mark */
503 if (dropped || do_inode) 611 if (dropped || do_inode)
504 fsnotify_recalc_inode_mask(inode); 612 fsnotify_recalc_inode_mask(inode);
505 613
506 /* update the group mask with the new mask */
507 if (dropped || do_group)
508 fsnotify_recalc_group_mask(group);
509 } 614 }
510 615
511 /* return the wd */ 616 /* return the wd */
512 ret = ientry->wd; 617 ret = i_mark->wd;
513 618
514 /* match the get from fsnotify_find_mark_entry() */ 619 /* match the get from fsnotify_find_mark() */
515 fsnotify_put_mark(entry); 620 fsnotify_put_mark(fsn_mark);
516 621
517 return ret; 622 return ret;
518} 623}
@@ -521,73 +626,51 @@ static int inotify_new_watch(struct fsnotify_group *group,
521 struct inode *inode, 626 struct inode *inode,
522 u32 arg) 627 u32 arg)
523{ 628{
524 struct inotify_inode_mark_entry *tmp_ientry; 629 struct inotify_inode_mark *tmp_i_mark;
525 __u32 mask; 630 __u32 mask;
526 int ret; 631 int ret;
632 struct idr *idr = &group->inotify_data.idr;
633 spinlock_t *idr_lock = &group->inotify_data.idr_lock;
527 634
528 /* don't allow invalid bits: we don't want flags set */ 635 /* don't allow invalid bits: we don't want flags set */
529 mask = inotify_arg_to_mask(arg); 636 mask = inotify_arg_to_mask(arg);
530 if (unlikely(!mask)) 637 if (unlikely(!(mask & IN_ALL_EVENTS)))
531 return -EINVAL; 638 return -EINVAL;
532 639
533 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 640 tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
534 if (unlikely(!tmp_ientry)) 641 if (unlikely(!tmp_i_mark))
535 return -ENOMEM; 642 return -ENOMEM;
536 643
537 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); 644 fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
538 tmp_ientry->fsn_entry.mask = mask; 645 tmp_i_mark->fsn_mark.mask = mask;
539 tmp_ientry->wd = -1; 646 tmp_i_mark->wd = -1;
540 647
541 ret = -ENOSPC; 648 ret = -ENOSPC;
542 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) 649 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
543 goto out_err; 650 goto out_err;
544retry:
545 ret = -ENOMEM;
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err;
548 651
549 /* we are putting the mark on the idr, take a reference */ 652 ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
550 fsnotify_get_mark(&tmp_ientry->fsn_entry); 653 tmp_i_mark);
551 654 if (ret)
552 spin_lock(&group->inotify_data.idr_lock);
553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
554 group->inotify_data.last_wd+1,
555 &tmp_ientry->wd);
556 spin_unlock(&group->inotify_data.idr_lock);
557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
561 /* idr was out of memory allocate and try again */
562 if (ret == -EAGAIN)
563 goto retry;
564 goto out_err; 655 goto out_err;
565 }
566 656
567 /* we are on the idr, now get on the inode */ 657 /* we are on the idr, now get on the inode */
568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 658 ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
569 if (ret) { 659 if (ret) {
570 /* we failed to get on the inode, get off the idr */ 660 /* we failed to get on the inode, get off the idr */
571 inotify_remove_from_idr(group, tmp_ientry); 661 inotify_remove_from_idr(group, tmp_i_mark);
572 goto out_err; 662 goto out_err;
573 } 663 }
574 664
575 /* update the idr hint, who cares about races, it's just a hint */
576 group->inotify_data.last_wd = tmp_ientry->wd;
577
578 /* increment the number of watches the user has */ 665 /* increment the number of watches the user has */
579 atomic_inc(&group->inotify_data.user->inotify_watches); 666 atomic_inc(&group->inotify_data.user->inotify_watches);
580 667
581 /* return the watch descriptor for this new entry */ 668 /* return the watch descriptor for this new mark */
582 ret = tmp_ientry->wd; 669 ret = tmp_i_mark->wd;
583
584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group);
587 670
588out_err: 671out_err:
589 /* match the ref from fsnotify_init_markentry() */ 672 /* match the ref from fsnotify_init_mark() */
590 fsnotify_put_mark(&tmp_ientry->fsn_entry); 673 fsnotify_put_mark(&tmp_i_mark->fsn_mark);
591 674
592 return ret; 675 return ret;
593} 676}
@@ -616,11 +699,8 @@ retry:
616static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 699static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
617{ 700{
618 struct fsnotify_group *group; 701 struct fsnotify_group *group;
619 unsigned int grp_num;
620 702
621 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 703 group = fsnotify_alloc_group(&inotify_fsnotify_ops);
622 grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
623 group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
624 if (IS_ERR(group)) 704 if (IS_ERR(group))
625 return group; 705 return group;
626 706
@@ -726,7 +806,7 @@ fput_and_out:
726SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) 806SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
727{ 807{
728 struct fsnotify_group *group; 808 struct fsnotify_group *group;
729 struct fsnotify_mark_entry *entry; 809 struct inotify_inode_mark *i_mark;
730 struct file *filp; 810 struct file *filp;
731 int ret = 0, fput_needed; 811 int ret = 0, fput_needed;
732 812
@@ -735,25 +815,23 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
735 return -EBADF; 815 return -EBADF;
736 816
737 /* verify that this is indeed an inotify instance */ 817 /* verify that this is indeed an inotify instance */
738 if (unlikely(filp->f_op != &inotify_fops)) { 818 ret = -EINVAL;
739 ret = -EINVAL; 819 if (unlikely(filp->f_op != &inotify_fops))
740 goto out; 820 goto out;
741 }
742 821
743 group = filp->private_data; 822 group = filp->private_data;
744 823
745 spin_lock(&group->inotify_data.idr_lock); 824 ret = -EINVAL;
746 entry = idr_find(&group->inotify_data.idr, wd); 825 i_mark = inotify_idr_find(group, wd);
747 if (unlikely(!entry)) { 826 if (unlikely(!i_mark))
748 spin_unlock(&group->inotify_data.idr_lock);
749 ret = -EINVAL;
750 goto out; 827 goto out;
751 }
752 fsnotify_get_mark(entry);
753 spin_unlock(&group->inotify_data.idr_lock);
754 828
755 fsnotify_destroy_mark_by_entry(entry); 829 ret = 0;
756 fsnotify_put_mark(entry); 830
831 fsnotify_destroy_mark(&i_mark->fsn_mark);
832
833 /* match ref taken by inotify_idr_find */
834 fsnotify_put_mark(&i_mark->fsn_mark);
757 835
758out: 836out:
759 fput_light(filp, fput_needed); 837 fput_light(filp, fput_needed);
@@ -767,7 +845,28 @@ out:
767 */ 845 */
768static int __init inotify_user_setup(void) 846static int __init inotify_user_setup(void)
769{ 847{
770 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 848 BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
849 BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
850 BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
851 BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
852 BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
853 BUILD_BUG_ON(IN_OPEN != FS_OPEN);
854 BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
855 BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
856 BUILD_BUG_ON(IN_CREATE != FS_CREATE);
857 BUILD_BUG_ON(IN_DELETE != FS_DELETE);
858 BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
859 BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
860 BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
861 BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
862 BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
863 BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
864 BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
865 BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
866
867 BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
868
869 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
771 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 870 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
772 871
773 inotify_max_queued_events = 16384; 872 inotify_max_queued_events = 16384;
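
inotify_add_to_idr(), introduced above, is the standard two-phase idr insertion of this kernel generation: idr_pre_get() preallocates with GFP_KERNEL outside the spinlock, idr_get_new_above() consumes the preallocation under it, and -EAGAIN means another path consumed it first, so loop. The generic shape, with my_idr, my_lock, last_id and object all placeholders:

	/* Generic idr insertion pattern of this era; all names hypothetical. */
	int ret, id;

	do {
		if (!idr_pre_get(&my_idr, GFP_KERNEL))
			return -ENOMEM;		/* genuinely out of memory */
		spin_lock(&my_lock);
		ret = idr_get_new_above(&my_idr, object, last_id + 1, &id);
		if (!ret)
			last_id = id;		/* hint for the next insert */
		spin_unlock(&my_lock);
	} while (ret == -EAGAIN);		/* preallocation raced away */
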
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
new file mode 100644
index 000000000000..325185e514bb
--- /dev/null
+++ b/fs/notify/mark.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/*
20 * fsnotify inode mark locking/lifetime/and refcnting
21 *
22 * REFCNT:
23 * The mark->refcnt tells how many "things" in the kernel currently are
24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriate locks, can take a reference
27 * and the object itself is guaranteed to survive until the reference is dropped.
28 *
29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
31 * be taken in order as follows:
32 *
33 * mark->lock
34 * group->mark_lock
35 * inode->i_lock
36 *
37 * mark->lock protects 2 things, mark->group and mark->inode. You must hold
38 * that lock to dereference either of these things (they could be NULL even with
39 * the lock)
40 *
41 * group->mark_lock protects the marks_list anchored inside a given group
42 * and each mark is hooked via the g_list. It also sorta protects the
43 * free_g_list, which when used is anchored by a private list on the stack of the
44 * task which held the group->mark_lock.
45 *
46 * inode->i_lock protects the i_fsnotify_marks list anchored inside a
47 * given inode and each mark is hooked via the i_list. (and sorta the
48 * free_i_list)
49 *
50 *
51 * LIFETIME:
52 * Inode marks survive between when they are added to an inode and when their
53 * refcnt==0.
54 *
55 * The inode mark can be cleared for a number of different reasons including:
56 * - The inode is unlinked for the last time. (fsnotify_inode_remove)
57 * - The inode is being evicted from cache. (fsnotify_inode_delete)
58 * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
59 * - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
60 * - The fsnotify_group associated with the mark is going away and all such marks
61 * need to be cleaned up. (fsnotify_clear_marks_by_group)
62 *
63 * Worst case we are given an inode and need to clean up all the marks on that
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list. At this point we no
68 * longer fear anything finding the mark using the inode's list of marks.
69 *
70 * We can safely and locklessly run the private list on the stack of everything
71 * we just unattached from the original inode. For each mark on the private list
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop the reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list.
80 *
81 * This has the very interesting property of being able to run concurrently with
82 * any (or all) other directions.
83 */
84
85#include <linux/fs.h>
86#include <linux/init.h>
87#include <linux/kernel.h>
88#include <linux/kthread.h>
89#include <linux/module.h>
90#include <linux/mutex.h>
91#include <linux/slab.h>
92#include <linux/spinlock.h>
93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95
96#include <asm/atomic.h>
97
98#include <linux/fsnotify_backend.h>
99#include "fsnotify.h"
100
101struct srcu_struct fsnotify_mark_srcu;
102static DEFINE_SPINLOCK(destroy_lock);
103static LIST_HEAD(destroy_list);
104static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
105
106void fsnotify_get_mark(struct fsnotify_mark *mark)
107{
108 atomic_inc(&mark->refcnt);
109}
110
111void fsnotify_put_mark(struct fsnotify_mark *mark)
112{
113 if (atomic_dec_and_test(&mark->refcnt))
114 mark->free_mark(mark);
115}
116
117/*
118 * Any time a mark is getting freed we end up here.
119 * The caller had better be holding a reference to this mark so we don't actually
120 * do the final put under the mark->lock
121 */
122void fsnotify_destroy_mark(struct fsnotify_mark *mark)
123{
124 struct fsnotify_group *group;
125 struct inode *inode = NULL;
126
127 spin_lock(&mark->lock);
128
129 group = mark->group;
130
131 /* something else already called this function on this mark */
132 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
133 spin_unlock(&mark->lock);
134 return;
135 }
136
137 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
138
139 /* 1 from caller and 1 for being on i_list/g_list */
140 BUG_ON(atomic_read(&mark->refcnt) < 2);
141
142 spin_lock(&group->mark_lock);
143
144 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
145 inode = mark->i.inode;
146 fsnotify_destroy_inode_mark(mark);
147 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
148 fsnotify_destroy_vfsmount_mark(mark);
149 else
150 BUG();
151
152 list_del_init(&mark->g_list);
153
154 spin_unlock(&group->mark_lock);
155 spin_unlock(&mark->lock);
156
157 spin_lock(&destroy_lock);
158 list_add(&mark->destroy_list, &destroy_list);
159 spin_unlock(&destroy_lock);
160 wake_up(&destroy_waitq);
161
162 /*
163 * Some groups like to know that marks are being freed. This is a
164 * callback to the group function to let it know that this mark
165 * is being freed.
166 */
167 if (group->ops->freeing_mark)
168 group->ops->freeing_mark(mark, group);
169
170 /*
171 * __fsnotify_update_child_dentry_flags(inode);
172 *
173 * I really want to call that, but we can't, we have no idea if the inode
174 * still exists the second we drop the mark->lock.
175 *
176	 * The next time an event arrives at this inode from one of its children
177	 * __fsnotify_parent will see that the inode doesn't care about its
178 * children and will update all of these flags then. So really this
179 * is just a lazy update (and could be a perf win...)
180 */
181
182 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
183 iput(inode);
184
185 /*
186 * it's possible that this group tried to destroy itself, but this
187	 * mark was simultaneously being freed by inode.  If that's the
188 * case, we finish freeing the group here.
189 */
190 if (unlikely(atomic_dec_and_test(&group->num_marks)))
191 fsnotify_final_destroy_group(group);
192}
193
194void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
195{
196 assert_spin_locked(&mark->lock);
197
198 mark->mask = mask;
199
200 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
201 fsnotify_set_inode_mark_mask_locked(mark, mask);
202}
203
204void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
205{
206 assert_spin_locked(&mark->lock);
207
208 mark->ignored_mask = mask;
209}
210
211/*
212 * Attach an initialized mark to a given group and fs object.
213 * These marks may be used for the fsnotify backend to determine which
214 * event types should be delivered to which group.
215 */
216int fsnotify_add_mark(struct fsnotify_mark *mark,
217 struct fsnotify_group *group, struct inode *inode,
218 struct vfsmount *mnt, int allow_dups)
219{
220 int ret = 0;
221
222 BUG_ON(inode && mnt);
223 BUG_ON(!inode && !mnt);
224
225 /*
226 * LOCKING ORDER!!!!
227 * mark->lock
228 * group->mark_lock
229 * inode->i_lock
230 */
231 spin_lock(&mark->lock);
232 spin_lock(&group->mark_lock);
233
234 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
235
236 mark->group = group;
237 list_add(&mark->g_list, &group->marks_list);
238 atomic_inc(&group->num_marks);
239 fsnotify_get_mark(mark); /* for i_list and g_list */
240
241 if (inode) {
242 ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
243 if (ret)
244 goto err;
245 } else if (mnt) {
246 ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
247 if (ret)
248 goto err;
249 } else {
250 BUG();
251 }
252
253 spin_unlock(&group->mark_lock);
254
255 /* this will pin the object if appropriate */
256 fsnotify_set_mark_mask_locked(mark, mark->mask);
257
258 spin_unlock(&mark->lock);
259
260 if (inode)
261 __fsnotify_update_child_dentry_flags(inode);
262
263 return ret;
264err:
265 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
266 list_del_init(&mark->g_list);
267 mark->group = NULL;
268 atomic_dec(&group->num_marks);
269
270 spin_unlock(&group->mark_lock);
271 spin_unlock(&mark->lock);
272
273 spin_lock(&destroy_lock);
274 list_add(&mark->destroy_list, &destroy_list);
275 spin_unlock(&destroy_lock);
276 wake_up(&destroy_waitq);
277
278 return ret;
279}
280
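[Note: fsnotify_add_mark() above relies on the lock order spelled out in its comment — mark->lock, then group->mark_lock, then inode->i_lock — and the err path unwinds inside the same nesting. As long as every code path honors one global order, an ABBA deadlock on these locks is impossible. A toy pthread analogue (illustrative names only):]

#include <pthread.h>

static pthread_mutex_t mark_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t group_mark_lock = PTHREAD_MUTEX_INITIALIZER;

static void attach(void)
{
	pthread_mutex_lock(&mark_lock);         /* outer lock, always first */
	pthread_mutex_lock(&group_mark_lock);   /* inner lock, always second */
	/* ... link the mark into the group's list ... */
	pthread_mutex_unlock(&group_mark_lock);
	pthread_mutex_unlock(&mark_lock);
}
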
281/*
282 * clear any marks in a group in which mark->flags & flags is true
283 */
284void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
285 unsigned int flags)
286{
287 struct fsnotify_mark *lmark, *mark;
288 LIST_HEAD(free_list);
289
290 spin_lock(&group->mark_lock);
291 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
292 if (mark->flags & flags) {
293 list_add(&mark->free_g_list, &free_list);
294 list_del_init(&mark->g_list);
295 fsnotify_get_mark(mark);
296 }
297 }
298 spin_unlock(&group->mark_lock);
299
300 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
301 fsnotify_destroy_mark(mark);
302 fsnotify_put_mark(mark);
303 }
304}
305
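[Note: fsnotify_clear_marks_by_group_flags() above is a two-phase teardown: matching marks are moved to a private list and pinned with a reference while group->mark_lock is held, and the destruction runs only after the lock is dropped — fsnotify_destroy_mark() takes mark->lock and group->mark_lock itself, so it must not be called under them. A plain-C sketch of the pattern, with illustrative names:]

#include <pthread.h>
#include <stddef.h>

struct mark { struct mark *next; unsigned int flags; };

static struct mark *marks_list;                 /* protected by mark_lock */
static pthread_mutex_t mark_lock = PTHREAD_MUTEX_INITIALIZER;

static void clear_by_flags(unsigned int flags, void (*destroy)(struct mark *))
{
	struct mark *m, *next, *free_list = NULL, **pp;

	pthread_mutex_lock(&mark_lock);
	for (pp = &marks_list; (m = *pp) != NULL; ) {
		if (m->flags & flags) {
			*pp = m->next;          /* unlink, like list_del_init() */
			m->next = free_list;    /* park it on the private list */
			free_list = m;
		} else {
			pp = &m->next;
		}
	}
	pthread_mutex_unlock(&mark_lock);

	while (free_list) {                     /* lock no longer held */
		m = free_list;
		next = m->next;
		destroy(m);                     /* like fsnotify_destroy_mark() */
		free_list = next;
	}
}
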
306/*
307 * Given a group, destroy all of the marks associated with that group.
308 */
309void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
310{
311 fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1);
312}
313
314void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
315{
316 assert_spin_locked(&old->lock);
317 new->i.inode = old->i.inode;
318 new->m.mnt = old->m.mnt;
319 new->group = old->group;
320 new->mask = old->mask;
321 new->free_mark = old->free_mark;
322}
323
324/*
325 * Nothing fancy, just initialize lists and locks and counters.
326 */
327void fsnotify_init_mark(struct fsnotify_mark *mark,
328 void (*free_mark)(struct fsnotify_mark *mark))
329{
330 memset(mark, 0, sizeof(*mark));
331 spin_lock_init(&mark->lock);
332 atomic_set(&mark->refcnt, 1);
333 mark->free_mark = free_mark;
334}
335
336static int fsnotify_mark_destroy(void *ignored)
337{
338 struct fsnotify_mark *mark, *next;
339 LIST_HEAD(private_destroy_list);
340
341 for (;;) {
342 spin_lock(&destroy_lock);
343 /* exchange the list head */
344 list_replace_init(&destroy_list, &private_destroy_list);
345 spin_unlock(&destroy_lock);
346
347 synchronize_srcu(&fsnotify_mark_srcu);
348
349 list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) {
350 list_del_init(&mark->destroy_list);
351 fsnotify_put_mark(mark);
352 }
353
354 wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
355 }
356
357 return 0;
358}
359
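[Note: fsnotify_mark_destroy() batches the pending marks off destroy_list, waits for an SRCU grace period so lockless readers of the mark lists under fsnotify_mark_srcu have drained, and only then drops the final references. A rough userspace analogue of the control flow — sleep() stands in for synchronize_srcu(); a real grace period tracks readers — with illustrative names:]

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct node { struct node *next; };

static struct node *destroy_list;               /* protected by destroy_lock */
static pthread_mutex_t destroy_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t destroy_waitq = PTHREAD_COND_INITIALIZER;

static void *reaper(void *unused)
{
	(void)unused;
	for (;;) {
		struct node *batch, *next;

		pthread_mutex_lock(&destroy_lock);
		while (!destroy_list)
			pthread_cond_wait(&destroy_waitq, &destroy_lock);
		batch = destroy_list;           /* like list_replace_init() */
		destroy_list = NULL;
		pthread_mutex_unlock(&destroy_lock);

		sleep(1);                       /* grace-period stand-in */

		for (; batch; batch = next) {
			next = batch->next;
			free(batch);            /* like the final fsnotify_put_mark() */
		}
	}
	return NULL;
}
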
360static int __init fsnotify_mark_init(void)
361{
362 struct task_struct *thread;
363
364 thread = kthread_run(fsnotify_mark_destroy, NULL,
365 "fsnotify_mark");
366 if (IS_ERR(thread))
367 panic("unable to start fsnotify mark destruction thread.");
368
369 return 0;
370}
371device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8bf53b4c108..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -56,7 +56,7 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
56 * it is needed. Its refcnt is set to 1 at kernel init time and will never
57 * get set to 0 so it will never get 'freed'
58 */
59static struct fsnotify_event q_overflow_event; 59static struct fsnotify_event *q_overflow_event;
60static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
61
62/**
@@ -87,12 +87,15 @@ void fsnotify_put_event(struct fsnotify_event *event)
87 return;
88
89 if (atomic_dec_and_test(&event->refcnt)) {
90 pr_debug("%s: event=%p\n", __func__, event);
91
92 if (event->data_type == FSNOTIFY_EVENT_PATH)
93 path_put(&event->path);
94
95 BUG_ON(!list_empty(&event->private_data_list));
96
97 kfree(event->file_name);
98 put_pid(event->tgid);
99 kmem_cache_free(fsnotify_event_cachep, event);
100 }
101}
@@ -104,7 +107,8 @@ struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
107
108void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
109{
107 kmem_cache_free(fsnotify_event_holder_cachep, holder); 110 if (holder)
111 kmem_cache_free(fsnotify_event_holder_cachep, holder);
112}
113
110/* 114/*
@@ -129,53 +133,20 @@ struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnot
129}
130
131/*
132 * Check if 2 events contain the same information. We do not compare private data
133 * but at this moment that isn't a problem for any known fsnotify listeners.
134 */
135static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
136{
137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
141 switch (old->data_type) {
142 case (FSNOTIFY_EVENT_INODE):
143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name))
148 return true;
149 break;
150 case (FSNOTIFY_EVENT_PATH):
151 if ((old->path.mnt == new->path.mnt) &&
152 (old->path.dentry == new->path.dentry))
153 return true;
154 break;
155 case (FSNOTIFY_EVENT_NONE):
156 if (old->mask & FS_Q_OVERFLOW)
157 return true;
158 else if (old->mask & FS_IN_IGNORED)
159 return false;
160 return false;
161 };
162 }
163 return false;
164}
165
166/*
167 * Add an event to the group notification queue. The group can later pull this
168 * event off the queue to deal with. If the event is successfully added to the
169 * group's notification queue, a reference is taken on event.
170 */
171int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, 140struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
172 struct fsnotify_event_private_data *priv) 141 struct fsnotify_event_private_data *priv,
142 struct fsnotify_event *(*merge)(struct list_head *,
143 struct fsnotify_event *))
173{
145 struct fsnotify_event *return_event = NULL;
174 struct fsnotify_event_holder *holder = NULL;
175 struct list_head *list = &group->notification_list;
176 struct fsnotify_event_holder *last_holder; 148
177 struct fsnotify_event *last_event; 149 pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
178 int ret = 0;
179
180 /*
181 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -189,18 +160,40 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
189alloc_holder: 160alloc_holder:
190 holder = fsnotify_alloc_event_holder(); 161 holder = fsnotify_alloc_event_holder();
191 if (!holder) 162 if (!holder)
192 return -ENOMEM; 163 return ERR_PTR(-ENOMEM);
193 } 164 }
194 165
195 mutex_lock(&group->notification_mutex); 166 mutex_lock(&group->notification_mutex);
196 167
197 if (group->q_len >= group->max_events) { 168 if (group->q_len >= group->max_events) {
198 event = &q_overflow_event; 169 event = q_overflow_event;
199 ret = -EOVERFLOW; 170
171 /*
172 * we need to return the overflow event
173 * which means we need a ref
174 */
175 fsnotify_get_event(event);
176 return_event = event;
177
200 /* sorry, no private data on the overflow event */ 178 /* sorry, no private data on the overflow event */
201 priv = NULL; 179 priv = NULL;
202 } 180 }
203 181
182 if (!list_empty(list) && merge) {
183 struct fsnotify_event *tmp;
184
185 tmp = merge(list, event);
186 if (tmp) {
187 mutex_unlock(&group->notification_mutex);
188
189 if (return_event)
190 fsnotify_put_event(return_event);
191 if (holder != &event->holder)
192 fsnotify_destroy_event_holder(holder);
193 return tmp;
194 }
195 }
196
204 spin_lock(&event->lock); 197 spin_lock(&event->lock);
205 198
206 if (list_empty(&event->holder.event_list)) { 199 if (list_empty(&event->holder.event_list)) {
@@ -212,19 +205,13 @@ alloc_holder:
212 * event holder was used, go back and get a new one */ 205 * event holder was used, go back and get a new one */
213 spin_unlock(&event->lock); 206 spin_unlock(&event->lock);
214 mutex_unlock(&group->notification_mutex); 207 mutex_unlock(&group->notification_mutex);
215 goto alloc_holder;
216 }
217 208
218 if (!list_empty(list)) { 209 if (return_event) {
219 last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); 210 fsnotify_put_event(return_event);
220 last_event = last_holder->event; 211 return_event = NULL;
221 if (event_compare(last_event, event)) {
222 spin_unlock(&event->lock);
223 mutex_unlock(&group->notification_mutex);
224 if (holder != &event->holder)
225 fsnotify_destroy_event_holder(holder);
226 return -EEXIST;
227 } 212 }
213
214 goto alloc_holder;
228 } 215 }
229 216
230 group->q_len++; 217 group->q_len++;
@@ -238,7 +225,7 @@ alloc_holder:
238 mutex_unlock(&group->notification_mutex); 225 mutex_unlock(&group->notification_mutex);
239 226
240 wake_up(&group->notification_waitq); 227 wake_up(&group->notification_waitq);
241 return ret; 228 return return_event;
242} 229}
243 230
244/* 231/*
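[Note: fsnotify_add_notify_event() now returns the queued (or merged, or overflow) event instead of an int, and takes a per-backend merge callback so backends can coalesce duplicate events before they are queued. A rough userspace sketch of that hook shape — all names hypothetical, and the queue is simplified to LIFO:]

#include <stddef.h>

struct ev { unsigned int mask; struct ev *next; };

typedef struct ev *(*merge_fn)(struct ev *pending, struct ev *new_ev);

/* Returns NULL when new_ev was queued, or the event it merged into. */
static struct ev *queue_event(struct ev **pending, struct ev *new_ev,
			      merge_fn merge)
{
	if (*pending && merge) {
		struct ev *merged = merge(*pending, new_ev);
		if (merged)
			return merged;          /* coalesced, nothing new queued */
	}
	new_ev->next = *pending;                /* simplified LIFO add */
	*pending = new_ev;
	return NULL;
}
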
@@ -253,6 +240,8 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
253 240
254 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 241 BUG_ON(!mutex_is_locked(&group->notification_mutex));
255 242
243 pr_debug("%s: group=%p\n", __func__, group);
244
256 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); 245 holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
257 246
258 event = holder->event; 247 event = holder->event;
@@ -314,25 +303,82 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
314 303
315static void initialize_event(struct fsnotify_event *event) 304static void initialize_event(struct fsnotify_event *event)
316{ 305{
317 event->holder.event = NULL;
318 INIT_LIST_HEAD(&event->holder.event_list); 306 INIT_LIST_HEAD(&event->holder.event_list);
319 atomic_set(&event->refcnt, 1); 307 atomic_set(&event->refcnt, 1);
320 308
321 spin_lock_init(&event->lock); 309 spin_lock_init(&event->lock);
322 310
323 event->path.dentry = NULL;
324 event->path.mnt = NULL;
325 event->inode = NULL;
326 event->data_type = FSNOTIFY_EVENT_NONE;
327
328 INIT_LIST_HEAD(&event->private_data_list); 311 INIT_LIST_HEAD(&event->private_data_list);
312}
313
314/*
315 * Caller damn well better be holding whatever mutex is protecting the
316 * old_holder->event_list and the new_event must be a clean event which
317 * cannot be found anywhere else in the kernel.
318 */
319int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
320 struct fsnotify_event *new_event)
321{
322 struct fsnotify_event *old_event = old_holder->event;
323 struct fsnotify_event_holder *new_holder = &new_event->holder;
329 324
330 event->to_tell = NULL; 325 enum event_spinlock_class {
326 SPINLOCK_OLD,
327 SPINLOCK_NEW,
328 };
331 329
332 event->file_name = NULL; 330 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
333 event->name_len = 0;
334 331
335 event->sync_cookie = 0; 332 /*
333 * if the new_event's embedded holder is in use someone
334 * screwed up and didn't give us a clean new event.
335 */
336 BUG_ON(!list_empty(&new_holder->event_list));
337
338 spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
339 spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
340
341 new_holder->event = new_event;
342 list_replace_init(&old_holder->event_list, &new_holder->event_list);
343
344 spin_unlock(&new_event->lock);
345 spin_unlock(&old_event->lock);
346
347 /* old_holder == &old_event->holder means we are referenced through the embedded event holder */
348 if (old_holder != &old_event->holder)
349 fsnotify_destroy_event_holder(old_holder);
350
351 fsnotify_get_event(new_event); /* on the list take reference */
352 fsnotify_put_event(old_event); /* off the list, drop reference */
353
354 return 0;
355}
356
357struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
358{
359 struct fsnotify_event *event;
360
361 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
362 if (!event)
363 return NULL;
364
365 pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
366
367 memcpy(event, old_event, sizeof(*event));
368 initialize_event(event);
369
370 if (event->name_len) {
371 event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
372 if (!event->file_name) {
373 kmem_cache_free(fsnotify_event_cachep, event);
374 return NULL;
375 }
376 }
377 event->tgid = get_pid(old_event->tgid);
378 if (event->data_type == FSNOTIFY_EVENT_PATH)
379 path_get(&event->path);
380
381 return event;
336} 382}
337 383
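[Note: fsnotify_clone_event() starts from a raw memcpy(), so every member that carries ownership has to be re-taken afterwards: the embedded holder and refcnt are re-initialized, the name is kstrdup()ed, and the pid and path get fresh references. The same re-owning rule in plain C, with illustrative names:]

#include <stdlib.h>
#include <string.h>

struct ev { char *name; };

static struct ev *ev_clone(const struct ev *old)
{
	struct ev *ev = malloc(sizeof(*ev));

	if (!ev)
		return NULL;
	*ev = *old;                                       /* like the memcpy() above */
	ev->name = old->name ? strdup(old->name) : NULL;  /* re-own, like kstrdup() */
	if (old->name && !ev->name) {
		free(ev);
		return NULL;
	}
	return ev;
}
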
338/* 384/*
@@ -348,15 +394,18 @@ static void initialize_event(struct fsnotify_event *event)
348 * @name the filename, if available 394 * @name the filename, if available
349 */ 395 */
350struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 396struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
351 int data_type, const char *name, u32 cookie, 397 int data_type, const unsigned char *name,
352 gfp_t gfp) 398 u32 cookie, gfp_t gfp)
353{ 399{
354 struct fsnotify_event *event; 400 struct fsnotify_event *event;
355 401
356 event = kmem_cache_alloc(fsnotify_event_cachep, gfp); 402 event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
357 if (!event) 403 if (!event)
358 return NULL; 404 return NULL;
359 405
406 pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
407 __func__, event, to_tell, mask, data, data_type);
408
360 initialize_event(event); 409 initialize_event(event);
361 410
362 if (name) { 411 if (name) {
@@ -368,30 +417,21 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
368 event->name_len = strlen(event->file_name); 417 event->name_len = strlen(event->file_name);
369 } 418 }
370 419
420 event->tgid = get_pid(task_tgid(current));
371 event->sync_cookie = cookie; 421 event->sync_cookie = cookie;
372 event->to_tell = to_tell; 422 event->to_tell = to_tell;
423 event->data_type = data_type;
373 424
374 switch (data_type) { 425 switch (data_type) {
375 case FSNOTIFY_EVENT_FILE: {
376 struct file *file = data;
377 struct path *path = &file->f_path;
378 event->path.dentry = path->dentry;
379 event->path.mnt = path->mnt;
380 path_get(&event->path);
381 event->data_type = FSNOTIFY_EVENT_PATH;
382 break;
383 }
384 case FSNOTIFY_EVENT_PATH: { 426 case FSNOTIFY_EVENT_PATH: {
385 struct path *path = data; 427 struct path *path = data;
386 event->path.dentry = path->dentry; 428 event->path.dentry = path->dentry;
387 event->path.mnt = path->mnt; 429 event->path.mnt = path->mnt;
388 path_get(&event->path); 430 path_get(&event->path);
389 event->data_type = FSNOTIFY_EVENT_PATH;
390 break; 431 break;
391 } 432 }
392 case FSNOTIFY_EVENT_INODE: 433 case FSNOTIFY_EVENT_INODE:
393 event->inode = data; 434 event->inode = data;
394 event->data_type = FSNOTIFY_EVENT_INODE;
395 break; 435 break;
396 case FSNOTIFY_EVENT_NONE: 436 case FSNOTIFY_EVENT_NONE:
397 event->inode = NULL; 437 event->inode = NULL;
@@ -412,8 +452,11 @@ __init int fsnotify_notification_init(void)
412 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); 452 fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
413 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); 453 fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
414 454
415 initialize_event(&q_overflow_event); 455 q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
416 q_overflow_event.mask = FS_Q_OVERFLOW; 456 FSNOTIFY_EVENT_NONE, NULL, 0,
457 GFP_KERNEL);
458 if (!q_overflow_event)
459 panic("unable to allocate fsnotify q_overflow_event\n");
417 460
418 return 0; 461 return 0;
419} 462}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
new file mode 100644
index 000000000000..56772b578fbd
--- /dev/null
+++ b/fs/notify/vfsmount_mark.c
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2, or (at your option)
7 * any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to
16 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/mount.h>
24#include <linux/mutex.h>
25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27
28#include <asm/atomic.h>
29
30#include <linux/fsnotify_backend.h>
31#include "fsnotify.h"
32
33void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
34{
35 struct fsnotify_mark *mark, *lmark;
36 struct hlist_node *pos, *n;
37 LIST_HEAD(free_list);
38
39 spin_lock(&mnt->mnt_root->d_lock);
40 hlist_for_each_entry_safe(mark, pos, n, &mnt->mnt_fsnotify_marks, m.m_list) {
41 list_add(&mark->m.free_m_list, &free_list);
42 hlist_del_init_rcu(&mark->m.m_list);
43 fsnotify_get_mark(mark);
44 }
45 spin_unlock(&mnt->mnt_root->d_lock);
46
47 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
48 fsnotify_destroy_mark(mark);
49 fsnotify_put_mark(mark);
50 }
51}
52
53void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
54{
55 fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
56}
57
58/*
59 * Recalculate the mask of events relevant to a given vfsmount, with its lock held.
60 */
61static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
62{
63 struct fsnotify_mark *mark;
64 struct hlist_node *pos;
65 __u32 new_mask = 0;
66
67 assert_spin_locked(&mnt->mnt_root->d_lock);
68
69 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list)
70 new_mask |= mark->mask;
71 mnt->mnt_fsnotify_mask = new_mask;
72}
73
74/*
75 * Recalculate mnt->mnt_fsnotify_mask, i.e. the mask of all FS_* event types
76 * any notifier is interested in hearing about for this mount point.
77 */
78void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
79{
80 spin_lock(&mnt->mnt_root->d_lock);
81 fsnotify_recalc_vfsmount_mask_locked(mnt);
82 spin_unlock(&mnt->mnt_root->d_lock);
83}
84
85void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
86{
87 struct vfsmount *mnt = mark->m.mnt;
88
89 assert_spin_locked(&mark->lock);
90 assert_spin_locked(&mark->group->mark_lock);
91
92 spin_lock(&mnt->mnt_root->d_lock);
93
94 hlist_del_init_rcu(&mark->m.m_list);
95 mark->m.mnt = NULL;
96
97 fsnotify_recalc_vfsmount_mask_locked(mnt);
98
99 spin_unlock(&mnt->mnt_root->d_lock);
100}
101
102static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group,
103 struct vfsmount *mnt)
104{
105 struct fsnotify_mark *mark;
106 struct hlist_node *pos;
107
108 assert_spin_locked(&mnt->mnt_root->d_lock);
109
110 hlist_for_each_entry(mark, pos, &mnt->mnt_fsnotify_marks, m.m_list) {
111 if (mark->group == group) {
112 fsnotify_get_mark(mark);
113 return mark;
114 }
115 }
116 return NULL;
117}
118
119/*
120 * given a group and vfsmount, find the mark associated with that combination.
121 * if found take a reference to that mark and return it, else return NULL
122 */
123struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
124 struct vfsmount *mnt)
125{
126 struct fsnotify_mark *mark;
127
128 spin_lock(&mnt->mnt_root->d_lock);
129 mark = fsnotify_find_vfsmount_mark_locked(group, mnt);
130 spin_unlock(&mnt->mnt_root->d_lock);
131
132 return mark;
133}
134
135/*
136 * Attach an initialized mark to a given group and vfsmount.
137 * These marks may be used by the fsnotify backend to determine which
138 * event types should be delivered to which groups.
139 */
140int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
141 struct fsnotify_group *group, struct vfsmount *mnt,
142 int allow_dups)
143{
144 struct fsnotify_mark *lmark;
145 struct hlist_node *node, *last = NULL;
146 int ret = 0;
147
148 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
149
150 assert_spin_locked(&mark->lock);
151 assert_spin_locked(&group->mark_lock);
152
153 spin_lock(&mnt->mnt_root->d_lock);
154
155 mark->m.mnt = mnt;
156
157 /* is mark the first mark? */
158 if (hlist_empty(&mnt->mnt_fsnotify_marks)) {
159 hlist_add_head_rcu(&mark->m.m_list, &mnt->mnt_fsnotify_marks);
160 goto out;
161 }
162
163 /* should mark be in the middle of the current list? */
164 hlist_for_each_entry(lmark, node, &mnt->mnt_fsnotify_marks, m.m_list) {
165 last = node;
166
167 if ((lmark->group == group) && !allow_dups) {
168 ret = -EEXIST;
169 goto out;
170 }
171
172 if (mark->group < lmark->group)
173 continue;
174
175 hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
176 goto out;
177 }
178
179 BUG_ON(last == NULL);
180 /* mark should be the last entry. last is the current last entry */
181 hlist_add_after_rcu(last, &mark->m.m_list);
182out:
183 fsnotify_recalc_vfsmount_mask_locked(mnt);
184 spin_unlock(&mnt->mnt_root->d_lock);
185
186 return ret;
187}
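[Note: fsnotify_add_vfsmount_mark() above keeps each mount's mark list sorted (descending) by group pointer, so duplicate detection and finding the insertion slot are a single walk. A plain-C sketch of the same insertion policy — illustrative names; the kernel version uses an RCU-protected hlist:]

#include <stdint.h>

struct m { uintptr_t group; struct m *next; };

static int insert_mark(struct m **head, struct m *nm, int allow_dups)
{
	struct m **pp;

	for (pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->group == nm->group && !allow_dups)
			return -1;              /* like -EEXIST */
		if (nm->group >= (*pp)->group)
			break;                  /* found the slot */
	}
	nm->next = *pp;                         /* splice in before *pp */
	*pp = nm;
	return 0;
}
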
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad.
1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret;
1535 ntfs_attr na;
1536
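[Note: this hunk and the fs/ntfs/file.c hunk below are mechanical conversions to the newer ->fsync prototype, which drops the dentry argument; the inode is now reached through filp->f_mapping->host. The shape of the new hook, as a kernel-style sketch only (it will not build outside the tree, and the body is a placeholder):]

#include <linux/fs.h>

static int example_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;  /* was dentry->d_inode */

	/* ... write back data, and metadata unless datasync is set ... */
	(void)inode;
	return 0;
}
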
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c.
100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
111 * held by the caller.
112 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to lru list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
@@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149/** 2133/**
2150 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2151 * @filp: file to be synced 2135 * @filp: file to be synced
2152 * @dentry: dentry describing the file to sync
2153 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2154 * 2137 *
2155 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2165 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2166 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2167 * 2150 *
2168 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2169 * anyway.
2170 *
2171 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2172 * 2152 *
2173 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2174 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2175 * this problem for now. 2155 * this problem for now.
2176 */ 2156 */
2177static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2178 int datasync)
2179{ 2158{
2180 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2181 int err, ret = 0; 2160 int err, ret = 0;
2182 2161
2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4b57fb1eac2a..93622b175fc7 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2238,7 +2238,7 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2238} 2238}
2239 2239
2240/** 2240/**
2241 * ntfs_clear_big_inode - clean up the ntfs specific part of an inode 2241 * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
2242 * @vi: vfs inode pending annihilation 2242 * @vi: vfs inode pending annihilation
2243 * 2243 *
2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() 2244 * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
@@ -2247,10 +2247,13 @@ void ntfs_clear_extent_inode(ntfs_inode *ni)
2247 * 2247 *
2248 * If the MFT record is dirty, we commit it before doing anything else. 2248 * If the MFT record is dirty, we commit it before doing anything else.
2249 */ 2249 */
2250void ntfs_clear_big_inode(struct inode *vi) 2250void ntfs_evict_big_inode(struct inode *vi)
2251{ 2251{
2252 ntfs_inode *ni = NTFS_I(vi); 2252 ntfs_inode *ni = NTFS_I(vi);
2253 2253
2254 truncate_inode_pages(&vi->i_data, 0);
2255 end_writeback(vi);
2256
2254#ifdef NTFS_RW 2257#ifdef NTFS_RW
2255 if (NInoDirty(ni)) { 2258 if (NInoDirty(ni)) {
2256 bool was_bad = (is_bad_inode(vi)); 2259 bool was_bad = (is_bad_inode(vi));
@@ -2879,9 +2882,6 @@ void ntfs_truncate_vfs(struct inode *vi) {
2879 * 2882 *
2880 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also 2883 * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also
2881 * called with ->i_alloc_sem held for writing. 2884 * called with ->i_alloc_sem held for writing.
2882 *
2883 * Basically this is a copy of generic notify_change() and inode_setattr()
2884 * functionality, except we intercept and abort changes in i_size.
2885 */ 2885 */
2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr) 2886int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2887{ 2887{
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 9a113544605d..2dabf813456c 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -279,7 +279,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
279 279
280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); 280extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
281extern void ntfs_destroy_big_inode(struct inode *inode); 281extern void ntfs_destroy_big_inode(struct inode *inode);
282extern void ntfs_clear_big_inode(struct inode *vi); 282extern void ntfs_evict_big_inode(struct inode *vi);
283 283
284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); 284extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
285 285
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0de1db6cddbf..512806171bfa 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2700,7 +2700,7 @@ static const struct super_operations ntfs_sops = {
2700 .put_super = ntfs_put_super, /* Syscall: umount. */ 2700 .put_super = ntfs_put_super, /* Syscall: umount. */
2701 .statfs = ntfs_statfs, /* Syscall: statfs */ 2701 .statfs = ntfs_statfs, /* Syscall: statfs */
2702 .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ 2702 .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
2703 .clear_inode = ntfs_clear_big_inode, /* VFS: Called when an inode is 2703 .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is
2704 removed from memory. */ 2704 removed from memory. */
2705 //.umount_begin = NULL, /* Forced umount. */ 2705 //.umount_begin = NULL, /* Forced umount. */
2706 .show_options = ntfs_show_options, /* Show mount options in 2706 .show_options = ntfs_show_options, /* Show mount options in
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..391915093fe1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -209,7 +209,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
209 }
210
211 inode->i_mode = new_mode;
212 inode->i_ctime = CURRENT_TIME;
213 di->i_mode = cpu_to_le16(inode->i_mode);
214 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
215 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
216
217 ocfs2_journal_dirty(handle, di_bh);
218
@@ -290,12 +293,30 @@ static int ocfs2_set_acl(handle_t *handle,
290 293
291int ocfs2_check_acl(struct inode *inode, int mask) 294int ocfs2_check_acl(struct inode *inode, int mask)
292{ 295{
293 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); 296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
297 struct buffer_head *di_bh = NULL;
298 struct posix_acl *acl;
299 int ret = -EAGAIN;
294 300
295 if (IS_ERR(acl)) 301 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
302 return ret;
303
304 ret = ocfs2_read_inode_block(inode, &di_bh);
305 if (ret < 0) {
306 mlog_errno(ret);
307 return ret;
308 }
309
310 acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
311
312 brelse(di_bh);
313
314 if (IS_ERR(acl)) {
315 mlog_errno(PTR_ERR(acl));
296 return PTR_ERR(acl); 316 return PTR_ERR(acl);
317 }
297 if (acl) { 318 if (acl) {
298 int ret = posix_acl_permission(inode, acl, mask); 319 ret = posix_acl_permission(inode, acl, mask);
299 posix_acl_release(acl); 320 posix_acl_release(acl);
300 return ret; 321 return ret;
301 } 322 }
@@ -344,7 +365,7 @@ int ocfs2_init_acl(handle_t *handle,
344{ 365{
345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 366 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
346 struct posix_acl *acl = NULL; 367 struct posix_acl *acl = NULL;
347 int ret = 0; 368 int ret = 0, ret2;
348 mode_t mode; 369 mode_t mode;
349 370
350 if (!S_ISLNK(inode->i_mode)) { 371 if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +402,12 @@ int ocfs2_init_acl(handle_t *handle,
381 mode = inode->i_mode; 402 mode = inode->i_mode;
382 ret = posix_acl_create_masq(clone, &mode); 403 ret = posix_acl_create_masq(clone, &mode);
383 if (ret >= 0) { 404 if (ret >= 0) {
384 ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); 405 ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
406 if (ret2) {
407 mlog_errno(ret2);
408 ret = ret2;
409 goto cleanup;
410 }
385 if (ret > 0) { 411 if (ret > 0) {
386 ret = ocfs2_set_acl(handle, inode, 412 ret = ocfs2_set_acl(handle, inode,
387 di_bh, ACL_TYPE_ACCESS, 413 di_bh, ACL_TYPE_ACCESS,
@@ -489,7 +515,7 @@ cleanup:
489 return ret; 515 return ret;
490} 516}
491 517
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 518const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 519 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 520 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 521 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +523,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 523 .set = ocfs2_xattr_set_acl,
498}; 524};
499 525
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 526const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 527 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 528 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 529 .list = ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..592fae5007d1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
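[Note: a pattern repeated throughout the ocfs2 hunks below: ocfs2_journal_dirty() appears to have become a void function, so the per-call "status = ...; if (status < 0) mlog_errno(status); goto bail;" unwinding collapses into a bare call. The userspace toy below only illustrates that API shift — illustrative names, not ocfs2 code:]

#include <assert.h>

static int journal_dirty_old(int bh_ok)
{
	return bh_ok ? 0 : -5;          /* every caller had to check */
}

static void journal_dirty_new(int bh_ok)
{
	assert(bh_ok);                  /* failure handled centrally */
}
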
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5594 * same as ocfs2_lock_allocators(), except that it accepts a block
5595 * count for reserving some extra blocks, and it only handles
5596 * metadata allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
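
The helper above boils down to one decision: pad the metadata reservation when the extent list might need to grow. A standalone model of that decision; the numbers in main() are illustrative, and a split of one extent is budgeted at up to two new records, matching max_recs_needed above:

#include <stdio.h>

/* num_free_extents is assumed nonnegative (errors handled by caller) */
static int blocks_to_reserve(int num_free_extents, int sparse_alloc,
                             unsigned int extents_to_split, int extra_blocks,
                             int meta_needed_for_tree_growth)
{
        unsigned int max_recs_needed = 2 * extents_to_split;

        /* The tree may have to grow: budget metadata blocks for that too. */
        if (!num_free_extents ||
            (sparse_alloc && (unsigned int)num_free_extents < max_recs_needed))
                extra_blocks += meta_needed_for_tree_growth;

        return extra_blocks;
}

int main(void)
{
        printf("%d\n", blocks_to_reserve(0, 1, 1, 1, 3)); /* 4: must grow */
        printf("%d\n", blocks_to_reserve(8, 1, 1, 1, 3)); /* 1: room left */
        return 0;
}
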
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
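
ocfs2_remove_btree_range() now frees the physical clusters in one of two ways, chosen per extent. A small model of that dispatch; the OCFS2_EXT_REFCOUNTED value matches ocfs2_fs.h of this era but is restated here as an assumption:

#include <stdio.h>

#define OCFS2_EXT_REFCOUNTED 0x02 /* assumed value, per ocfs2_fs.h */

static const char *release_strategy(unsigned long long phys_blkno, int flags)
{
        if (!phys_blkno)
                return "nothing to free (hole)";
        if (flags & OCFS2_EXT_REFCOUNTED)
                return "decrease refcount";           /* shared clusters */
        return "append to truncate log";              /* exclusive clusters */
}

int main(void)
{
        printf("%s\n", release_strategy(0, 0));
        printf("%s\n", release_strategy(1234, OCFS2_EXT_REFCOUNTED));
        printf("%s\n", release_strategy(1234, 0));
        return 0;
}
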
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
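
Together with the kmalloc()-to-kzalloc() switches below, the zeroed free_bg field means "no cached group, compute it": a nonzero suballocator location stored at dealloc time wins, otherwise the group is derived from the block number. A standalone model (the real ocfs2_which_suballoc_group() is effectively block minus bit):

#include <stdint.h>
#include <stdio.h>

static uint64_t which_suballoc_group(uint64_t blkno, unsigned int bit)
{
        /* Stand-in mirroring the shape of the real helper. */
        return blkno - bit;
}

static uint64_t group_for_free(uint64_t free_bg, uint64_t free_blk,
                               unsigned int free_bit)
{
        return free_bg ? free_bg : which_suballoc_group(free_blk, free_bit);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)group_for_free(0, 4096, 7));
        printf("%llu\n", (unsigned long long)group_for_free(2048, 4096, 7));
        return 0;
}
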
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update its h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check that we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * it. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7091,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
7091 last_page_bytes = PAGE_ALIGN(end); 6672 last_page_bytes = PAGE_ALIGN(end);
7092 index = start >> PAGE_CACHE_SHIFT; 6673 index = start >> PAGE_CACHE_SHIFT;
7093 do { 6674 do {
7094 pages[numpages] = grab_cache_page(mapping, index); 6675 pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
7095 if (!pages[numpages]) { 6676 if (!pages[numpages]) {
7096 ret = -ENOMEM; 6677 ret = -ENOMEM;
7097 mlog_errno(ret); 6678 mlog_errno(ret);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. It should also be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
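
After this rewrite, each pass of ocfs2_commit_truncate() reduces to picking a (trunc_cpos, trunc_len, blkno) triple off the rightmost record and handing it to ocfs2_remove_btree_range(). A compilable model of the per-pass computation, with an arbitrary cluster-to-block shift standing in for ocfs2_clusters_to_blocks():

#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t e_cpos; uint32_t clusters; uint64_t e_blkno; };

#define CL_TO_BLK_SHIFT 4 /* example value only */

static void trunc_pass(const struct rec *r, uint32_t new_highest_cpos,
                       int is_empty, uint32_t *trunc_cpos,
                       uint32_t *trunc_len, uint64_t *blkno)
{
        uint32_t range = r->e_cpos + r->clusters;

        *trunc_cpos = *trunc_len = 0;
        *blkno = 0;
        if (is_empty) {
                *trunc_cpos = r->e_cpos;        /* drop the empty record */
        } else if (r->e_cpos >= new_highest_cpos) {
                *trunc_cpos = r->e_cpos;        /* whole record goes */
                *trunc_len = r->clusters;
                *blkno = r->e_blkno;
        } else if (range > new_highest_cpos) {
                uint32_t coff = new_highest_cpos - r->e_cpos;

                *trunc_cpos = new_highest_cpos; /* partial, final pass */
                *trunc_len = range - new_highest_cpos;
                *blkno = r->e_blkno + ((uint64_t)coff << CL_TO_BLK_SHIFT);
        }                                       /* else: truncate done */
}

int main(void)
{
        struct rec r = { 100, 50, 1 << CL_TO_BLK_SHIFT };
        uint32_t cpos, len;
        uint64_t blk;

        trunc_pass(&r, 120, 0, &cpos, &len, &blk);
        printf("cpos=%u len=%u blkno=%llu\n", cpos, len,
               (unsigned long long)blk); /* cpos=120 len=30 blkno=336 */
        return 0;
}
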
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..0de69c9a08be 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
196 dump_stack(); 196 dump_stack();
197 goto bail; 197 goto bail;
198 } 198 }
199
200 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
201 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
202 (unsigned long long)past_eof);
203
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206 } 199 }
207 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
203 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result);
206
208bail: 207bail:
209 if (err < 0) 208 if (err < 0)
210 err = -EIO; 209 err = -EIO;
@@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle,
459 return ret; 458 return ret;
460} 459}
461 460
462handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
463 struct page *page,
464 unsigned from,
465 unsigned to)
466{
467 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
468 handle_t *handle;
469 int ret = 0;
470
471 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
472 if (IS_ERR(handle)) {
473 ret = -ENOMEM;
474 mlog_errno(ret);
475 goto out;
476 }
477
478 if (ocfs2_should_order_data(inode)) {
479 ret = ocfs2_jbd2_file_inode(handle, inode);
480 if (ret < 0)
481 mlog_errno(ret);
482 }
483out:
484 if (ret) {
485 if (!IS_ERR(handle))
486 ocfs2_commit_trans(osb, handle);
487 handle = ERR_PTR(ret);
488 }
489 return handle;
490}
491
492static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 461static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
493{ 462{
494 sector_t status; 463 sector_t status;
@@ -609,7 +578,9 @@ bail:
609static void ocfs2_dio_end_io(struct kiocb *iocb, 578static void ocfs2_dio_end_io(struct kiocb *iocb,
610 loff_t offset, 579 loff_t offset,
611 ssize_t bytes, 580 ssize_t bytes,
612 void *private) 581 void *private,
582 int ret,
583 bool is_async)
613{ 584{
614 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 585 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
615 int level; 586 int level;
@@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
623 if (!level) 594 if (!level)
624 up_read(&inode->i_alloc_sem); 595 up_read(&inode->i_alloc_sem);
625 ocfs2_rw_unlock(inode, level); 596 ocfs2_rw_unlock(inode, level);
597
598 if (is_async)
599 aio_complete(iocb, ret, 0);
626} 600}
627 601
628/* 602/*
@@ -669,11 +643,10 @@ static ssize_t ocfs2_direct_IO(int rw,
669 if (i_size_read(inode) <= offset) 643 if (i_size_read(inode) <= offset)
670 return 0; 644 return 0;
671 645
672 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 646 ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
673 inode->i_sb->s_bdev, iov, offset, 647 iov, offset, nr_segs,
674 nr_segs, 648 ocfs2_direct_IO_get_blocks,
675 ocfs2_direct_IO_get_blocks, 649 ocfs2_dio_end_io, NULL, 0);
676 ocfs2_dio_end_io);
677 650
678 mlog_exit(ret); 651 mlog_exit(ret);
679 return ret; 652 return ret;
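
Both hunks above track the 2.6.36 direct-I/O interface: the end_io hook now receives the request's result and an is_async flag, and must complete asynchronous requests itself. A userspace model of that contract with stubbed types:

#include <stdbool.h>
#include <stdio.h>

struct kiocb { int done; long res; };

static void aio_complete_stub(struct kiocb *iocb, long res, long res2)
{
        iocb->done = 1;
        iocb->res = res;
        (void)res2;
}

static void dio_end_io(struct kiocb *iocb, long ret, bool is_async)
{
        /* ...drop filesystem locks here, as the real hook does... */
        if (is_async)
                aio_complete_stub(iocb, ret, 0);
}

int main(void)
{
        struct kiocb sync_io = { 0 }, async_io = { 0 };

        dio_end_io(&sync_io, 4096, false);  /* sync path returns normally */
        dio_end_io(&async_io, 4096, true);  /* async path must complete */
        printf("%d %d\n", sync_io.done, async_io.done); /* 0 1 */
        return 0;
}
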
@@ -1131,23 +1104,37 @@ out:
1131 */ 1104 */
1132static int ocfs2_grab_pages_for_write(struct address_space *mapping, 1105static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1133 struct ocfs2_write_ctxt *wc, 1106 struct ocfs2_write_ctxt *wc,
1134 u32 cpos, loff_t user_pos, int new, 1107 u32 cpos, loff_t user_pos,
1108 unsigned user_len, int new,
1135 struct page *mmap_page) 1109 struct page *mmap_page)
1136{ 1110{
1137 int ret = 0, i; 1111 int ret = 0, i;
1138 unsigned long start, target_index, index; 1112 unsigned long start, target_index, end_index, index;
1139 struct inode *inode = mapping->host; 1113 struct inode *inode = mapping->host;
1114 loff_t last_byte;
1140 1115
1141 target_index = user_pos >> PAGE_CACHE_SHIFT; 1116 target_index = user_pos >> PAGE_CACHE_SHIFT;
1142 1117
1143 /* 1118 /*
1144 * Figure out how many pages we'll be manipulating here. For 1119 * Figure out how many pages we'll be manipulating here. For
1145 * non allocating write, we just change the one 1120 * non allocating write, we just change the one
1146 * page. Otherwise, we'll need a whole cluster's worth. 1121 * page. Otherwise, we'll need a whole cluster's worth. If we're
1122 * writing past i_size, we only need enough pages to cover the
1123 * last page of the write.
1147 */ 1124 */
1148 if (new) { 1125 if (new) {
1149 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); 1126 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1150 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); 1127 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1128 /*
1129 * We need the index *past* the last page we could possibly
1130 * touch. This is the page past the end of the write or
1131 * i_size, whichever is greater.
1132 */
1133 last_byte = max(user_pos + user_len, i_size_read(inode));
1134 BUG_ON(last_byte < 1);
1135 end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
1136 if ((start + wc->w_num_pages) > end_index)
1137 wc->w_num_pages = end_index - start;
1151 } else { 1138 } else {
1152 wc->w_num_pages = 1; 1139 wc->w_num_pages = 1;
1153 start = target_index; 1140 start = target_index;
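
The added clamp stops an allocating write from grabbing a full cluster of pages when both the write and i_size end earlier. A worked model of the arithmetic, assuming 4K pages (PAGE_CACHE_SHIFT == 12):

#include <stdio.h>

#define PAGE_SHIFT 12 /* assumed 4K pages */

static unsigned long clamp_num_pages(unsigned long start,
                                     unsigned long pages_per_cluster,
                                     long long user_pos, unsigned user_len,
                                     long long i_size)
{
        long long last_byte =
                (user_pos + user_len > i_size) ? user_pos + user_len : i_size;
        unsigned long end_index = (unsigned long)((last_byte - 1) >> PAGE_SHIFT) + 1;
        unsigned long num = pages_per_cluster;

        if (start + num > end_index)    /* don't grab pages past the end */
                num = end_index - start;
        return num;
}

int main(void)
{
        /* A 1MB cluster is 256 pages, but a 4K write at offset 0 of an
         * empty file only needs one of them. */
        printf("%lu\n", clamp_num_pages(0, 256, 0, 4096, 0)); /* 1 */
        return 0;
}
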
@@ -1620,21 +1607,20 @@ out:
1620 * write path can treat it as a non-allocating write, which has no 1607 * write path can treat it as a non-allocating write, which has no
1621 * special case code for sparse/nonsparse files. 1608 * special case code for sparse/nonsparse files.
1622 */ 1609 */
1623static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, 1610static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1624 unsigned len, 1611 struct buffer_head *di_bh,
1612 loff_t pos, unsigned len,
1625 struct ocfs2_write_ctxt *wc) 1613 struct ocfs2_write_ctxt *wc)
1626{ 1614{
1627 int ret; 1615 int ret;
1628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1629 loff_t newsize = pos + len; 1616 loff_t newsize = pos + len;
1630 1617
1631 if (ocfs2_sparse_alloc(osb)) 1618 BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1632 return 0;
1633 1619
1634 if (newsize <= i_size_read(inode)) 1620 if (newsize <= i_size_read(inode))
1635 return 0; 1621 return 0;
1636 1622
1637 ret = ocfs2_extend_no_holes(inode, newsize, pos); 1623 ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
1638 if (ret) 1624 if (ret)
1639 mlog_errno(ret); 1625 mlog_errno(ret);
1640 1626
@@ -1644,6 +1630,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
1644 return ret; 1630 return ret;
1645} 1631}
1646 1632
1633static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1634 loff_t pos)
1635{
1636 int ret = 0;
1637
1638 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1639 if (pos > i_size_read(inode))
1640 ret = ocfs2_zero_extend(inode, di_bh, pos);
1641
1642 return ret;
1643}
1644
1647int ocfs2_write_begin_nolock(struct address_space *mapping, 1645int ocfs2_write_begin_nolock(struct address_space *mapping,
1648 loff_t pos, unsigned len, unsigned flags, 1646 loff_t pos, unsigned len, unsigned flags,
1649 struct page **pagep, void **fsdata, 1647 struct page **pagep, void **fsdata,
@@ -1679,7 +1677,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1679 } 1677 }
1680 } 1678 }
1681 1679
1682 ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); 1680 if (ocfs2_sparse_alloc(osb))
1681 ret = ocfs2_zero_tail(inode, di_bh, pos);
1682 else
1683 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
1684 wc);
1683 if (ret) { 1685 if (ret) {
1684 mlog_errno(ret); 1686 mlog_errno(ret);
1685 goto out; 1687 goto out;
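
The net effect of the split above: sparse volumes only zero the gap between i_size and the write position, while non-sparse volumes extend the allocation so no hole remains. A toy model of the dispatch:

#include <stdio.h>

static const char *prepare_write(int sparse, long long pos, unsigned len,
                                 long long i_size)
{
        if (sparse)
                return pos > i_size ? "zero tail up to pos" : "nothing to do";
        return pos + len > i_size ? "extend with no holes" : "nothing to do";
}

int main(void)
{
        printf("%s\n", prepare_write(1, 8192, 100, 4096));
        printf("%s\n", prepare_write(0, 8192, 100, 4096));
        return 0;
}
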
@@ -1735,6 +1737,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1737 goto out;
1736 } 1738 }
1737 1739
1740 if (data_ac)
1741 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1742
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1743 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1744 &di->id2.i_list,
1740 clusters_to_alloc); 1745 clusters_to_alloc);
@@ -1786,7 +1791,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1786 * that we can zero and flush if we error after adding the 1791 * that we can zero and flush if we error after adding the
1787 * extent. 1792 * extent.
1788 */ 1793 */
1789 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, 1794 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1790 cluster_of_pages, mmap_page); 1795 cluster_of_pages, mmap_page);
1791 if (ret) { 1796 if (ret) {
1792 mlog_errno(ret); 1797 mlog_errno(ret);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..c7ee03c22226 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
439 439
440 ocfs2_blockcheck_inc_failure(stats); 440 ocfs2_blockcheck_inc_failure(stats);
441 mlog(ML_ERROR, 441 mlog(ML_ERROR,
442 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", 442 "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
443 (unsigned int)check.bc_crc32e, (unsigned int)crc); 443 (unsigned int)check.bc_crc32e, (unsigned int)crc);
444 444
445 /* Ok, try ECC fixups */ 445 /* Ok, try ECC fixups */
@@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
453 goto out; 453 goto out;
454 } 454 }
455 455
456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", 456 mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
457 (unsigned int)check.bc_crc32e, (unsigned int)crc); 457 (unsigned int)check.bc_crc32e, (unsigned int)crc);
458 458
459 rc = -EIO; 459 rc = -EIO;
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
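
The new ML_RESERVATIONS bit feeds the standard masklog gate, where a message is emitted only if its bit is enabled in the current mask. A minimal standalone model of that gate:

#include <stdint.h>
#include <stdio.h>

#define ML_RESERVATIONS 0x0000000800000000ULL

static uint64_t enabled_mask;

static void mlog_stub(uint64_t mask, const char *msg)
{
        if (enabled_mask & mask)        /* gate on the per-subsystem bit */
                printf("%s\n", msg);
}

int main(void)
{
        mlog_stub(ML_RESERVATIONS, "suppressed: bit not enabled");
        enabled_mask |= ML_RESERVATIONS;
        mlog_stub(ML_RESERVATIONS, "reservation window moved");
        return 0;
}
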
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..cbe2f057cc28 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
@@ -974,7 +977,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
974int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 977int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
975 size_t caller_veclen, u8 target_node, int *status) 978 size_t caller_veclen, u8 target_node, int *status)
976{ 979{
977 int ret; 980 int ret = 0;
978 struct o2net_msg *msg = NULL; 981 struct o2net_msg *msg = NULL;
979 size_t veclen, caller_bytes = 0; 982 size_t veclen, caller_bytes = 0;
980 struct kvec *vec = NULL; 983 struct kvec *vec = NULL;
@@ -1756,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock)
1756 struct sockaddr_in sin; 1759 struct sockaddr_in sin;
1757 struct socket *new_sock = NULL; 1760 struct socket *new_sock = NULL;
1758 struct o2nm_node *node = NULL; 1761 struct o2nm_node *node = NULL;
1762 struct o2nm_node *local_node = NULL;
1759 struct o2net_sock_container *sc = NULL; 1763 struct o2net_sock_container *sc = NULL;
1760 struct o2net_node *nn; 1764 struct o2net_node *nn;
1761 1765
@@ -1793,11 +1797,15 @@ static int o2net_accept_one(struct socket *sock)
1793 goto out; 1797 goto out;
1794 } 1798 }
1795 1799
1796 if (o2nm_this_node() > node->nd_num) { 1800 if (o2nm_this_node() >= node->nd_num) {
1797 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1801 local_node = o2nm_get_node_by_num(o2nm_this_node());
1798 "numbered node '%s' at " "%pI4:%d with num %u\n", 1802 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
1799 node->nd_name, &sin.sin_addr.s_addr, 1803 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
1800 ntohs(sin.sin_port), node->nd_num); 1804 local_node->nd_name, local_node->nd_num,
1805 &(local_node->nd_ipv4_address),
1806 ntohs(local_node->nd_ipv4_port),
1807 node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
1808 ntohs(sin.sin_port));
1801 ret = -EINVAL; 1809 ret = -EINVAL;
1802 goto out; 1810 goto out;
1803 } 1811 }
@@ -1854,6 +1862,8 @@ out:
1854 sock_release(new_sock); 1862 sock_release(new_sock);
1855 if (node) 1863 if (node)
1856 o2nm_node_put(node); 1864 o2nm_node_put(node);
1865 if (local_node)
1866 o2nm_node_put(local_node);
1857 if (sc) 1867 if (sc)
1858 sc_put(sc); 1868 sc_put(sc);
1859 return ret; 1869 return ret;
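
Changing '>' to '>=' makes a node reject a connect attempt carrying its own node number as well as attempts from lower-numbered peers; by o2net convention the higher-numbered node initiates the connection. A tiny model of the check:

#include <stdio.h>

static int accept_ok(unsigned this_node, unsigned peer_node)
{
        return !(this_node >= peer_node);
}

int main(void)
{
        printf("%d\n", accept_ok(3, 5)); /* 1: higher-numbered peer connects in */
        printf("%d\n", accept_ok(5, 3)); /* 0: lower-numbered peer must not */
        printf("%d\n", accept_ok(4, 4)); /* 0: self-connect now rejected too */
        return 0;
}
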
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..c49f6de0e7ab 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
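
The allocator calls in this file gain a suballoc_loc out-parameter, and the caller records it (dr_suballoc_loc above) so that freeing the block later can find its group without recomputing it. A stub-based sketch of the flow; every name below is illustrative rather than the kernel API:

#include <stdint.h>
#include <stdio.h>

struct new_block { uint64_t suballoc_loc; uint16_t suballoc_bit; uint64_t blkno; };

static int claim_metadata_stub(uint64_t *suballoc_loc, uint16_t *bit,
                               uint64_t *blkno)
{
        *suballoc_loc = 2048;   /* group descriptor block */
        *bit = 7;               /* bit within the group */
        *blkno = 2048 + 7;      /* the allocated block itself */
        return 0;
}

int main(void)
{
        struct new_block nb;

        if (claim_metadata_stub(&nb.suballoc_loc, &nb.suballoc_bit, &nb.blkno))
                return 1;
        /* The on-disk block remembers where it came from. */
        printf("loc=%llu bit=%u blk=%llu\n",
               (unsigned long long)nb.suballoc_loc, nb.suballoc_bit,
               (unsigned long long)nb.blkno);
        return 0;
}
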
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
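
Here and in ocfs2_extend_dir() below, the directory code attaches the inode's local-alloc reservation to the allocation context before claiming, biasing successive claims toward contiguity. A toy model of such a hint (the mechanics are illustrative; the real code lives in fs/ocfs2/reservations.c):

#include <stdio.h>

struct resv { unsigned next_hint; };
struct alloc_ctxt { struct resv *ac_resv; };

static unsigned claim_cluster(struct alloc_ctxt *ac, unsigned fallback)
{
        unsigned bit = ac->ac_resv ? ac->ac_resv->next_hint : fallback;

        if (ac->ac_resv)
                ac->ac_resv->next_hint = bit + 1; /* keep claims adjacent */
        return bit;
}

int main(void)
{
        struct resv r = { 100 };
        struct alloc_ctxt ac = { &r };

        /* Two consecutive claims land next to each other: 100 101 */
        printf("%u %u\n", claim_cluster(&ac, 0), claim_cluster(&ac, 0));
        return 0;
}
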
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -3955,6 +3931,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3955 goto out_commit; 3931 goto out_commit;
3956 } 3932 }
3957 3933
3934 cpos = split_hash;
3935 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3936 data_ac, meta_ac, new_dx_leaves,
3937 num_dx_leaves);
3938 if (ret) {
3939 mlog_errno(ret);
3940 goto out_commit;
3941 }
3942
3958 for (i = 0; i < num_dx_leaves; i++) { 3943 for (i = 0; i < num_dx_leaves; i++) {
3959 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), 3944 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3960 orig_dx_leaves[i], 3945 orig_dx_leaves[i],
@@ -3963,15 +3948,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3963 mlog_errno(ret); 3948 mlog_errno(ret);
3964 goto out_commit; 3949 goto out_commit;
3965 } 3950 }
3966 }
3967 3951
3968 cpos = split_hash; 3952 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3969 ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle, 3953 new_dx_leaves[i],
3970 data_ac, meta_ac, new_dx_leaves, 3954 OCFS2_JOURNAL_ACCESS_WRITE);
3971 num_dx_leaves); 3955 if (ret) {
3972 if (ret) { 3956 mlog_errno(ret);
3973 mlog_errno(ret); 3957 goto out_commit;
3974 goto out_commit; 3958 }
3975 } 3959 }
3976 3960
3977 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf, 3961 ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
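
This rebalance hunk reorders the leaf split: the new cluster is claimed before any leaf is journalled, and the original and new dx leaves are then journalled pairwise in a single loop instead of two passes. Reassembled from the right-hand column (a sketch; mlog_errno() calls omitted):

	cpos = split_hash;
	ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
				       data_ac, meta_ac, new_dx_leaves,
				       num_dx_leaves);
	if (ret)
		goto out_commit;

	for (i = 0; i < num_dx_leaves; i++) {
		/* journal each old leaf together with its new twin */
		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      orig_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret)
			goto out_commit;

		ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
					      new_dx_leaves[i],
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret)
			goto out_commit;
	}
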
@@ -4490,7 +4474,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4474
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4475 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4476 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4477 if (dx_root->dr_suballoc_loc)
4478 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4479 else
4480 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4481 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4482 bit, bg_blkno, 1);
4496 if (ret) 4483 if (ret)
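
dx root blocks can now record the suballocator group they were carved from in dr_suballoc_loc, and the free path prefers that record over recomputing the group, which matters once block groups may be discontiguous. The lookup, mirroring the hunk above:

	blk = le64_to_cpu(dx_root->dr_blkno);
	bit = le16_to_cpu(dx_root->dr_suballoc_bit);
	if (dx_root->dr_suballoc_loc)
		/* trust the recorded group block number */
		bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
	else
		/* legacy blocks: derive the group from blk and bit */
		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
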
@@ -4551,8 +4538,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4538
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4539 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4540
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4541 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4542 &dealloc, 0);
4556 if (ret) { 4543 if (ret) {
4557 mlog_errno(ret); 4544 mlog_errno(ret);
4558 goto out; 4545 goto out;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 12d5eb78a11a..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
88 return 0; 88 return 0;
89} 89}
90 90
91static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 mlog_entry_void();
94 94
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
145} 145}
146 146
147 147
148static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 149{
150 mlog_entry_void(); 150 mlog_entry_void();
151 151
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 452 lock->ml.node, &status);
453 if (ret < 0) 453 if (ret < 0)
454 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
455 else { 457 else {
456 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
457 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
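
This is the first instance of a pattern repeated across the dlm files below: a bare mlog_errno() after a failed o2net send is replaced by a message naming the errno, the message type, the domain key, and the target node, so a failing node pair can be identified from the log alone. The shape, as seen in this hunk:

	ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec,
				     veclen, lock->ml.node, &status);
	if (ret < 0)
		/* say which message to which node failed; mlog_errno()
		 * reported only the errno */
		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x)"
		     " to node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
		     lock->ml.node);
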
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..765298908f1d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
@@ -1028,6 +1030,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
1028 struct dlm_lock_resource *res); 1030 struct dlm_lock_resource *res);
1029void dlm_clean_master_list(struct dlm_ctxt *dlm, 1031void dlm_clean_master_list(struct dlm_ctxt *dlm,
1030 u8 dead_node); 1032 u8 dead_node);
1033void dlm_force_free_mles(struct dlm_ctxt *dlm);
1031int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); 1034int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
1032int __dlm_lockres_has_locks(struct dlm_lock_resource *res); 1035int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
1033int __dlm_lockres_unused(struct dlm_lock_resource *res); 1036int __dlm_lockres_unused(struct dlm_lock_resource *res);
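
Besides exporting __dlm_queue_ast()/__dlm_queue_bast() for the lock-shuffling rework in dlmthread.c and declaring dlm_force_free_mles() for domain teardown, this header grows the default lockres hash from 2^14 to 2^17 bytes. Assuming the existing DLM_HASH_* derivation that follows the changed line, the bucket count works out as:

	#define DLM_HASH_SIZE_DEFAULT	(1 << 17)	/* 128 KiB of heads */
	#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
	# define DLM_HASH_PAGES		1
	#else
	# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
	#endif
	#define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
	#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
	/* 4 KiB pages, 8-byte hlist_head on 64-bit:
	 * 32 pages * 512 = 16384 buckets */
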
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90803b47cd8c..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret); 391 dlm_error(ret);
392 } else { 392 } else {
393 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
394 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
395 /* instead of logging the same network error over 397 /* instead of logging the same network error over
396 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf54396..901ca52bf86b 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
419 419
420static int debug_buffer_release(struct inode *inode, struct file *file) 420static int debug_buffer_release(struct inode *inode, struct file *file)
421{ 421{
422 struct debug_buffer *db = (struct debug_buffer *)file->private_data; 422 struct debug_buffer *db = file->private_data;
423 423
424 if (db) 424 if (db)
425 kfree(db->buf); 425 kfree(db->buf);
@@ -636,8 +636,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
636 spin_lock(&dlm->track_lock); 636 spin_lock(&dlm->track_lock);
637 if (oldres) 637 if (oldres)
638 track_list = &oldres->tracking; 638 track_list = &oldres->tracking;
639 else 639 else {
640 track_list = &dlm->tracking_list; 640 track_list = &dlm->tracking_list;
641 if (list_empty(track_list)) {
642 dl = NULL;
643 spin_unlock(&dlm->track_lock);
644 goto bail;
645 }
646 }
641 647
642 list_for_each_entry(res, track_list, tracking) { 648 list_for_each_entry(res, track_list, tracking) {
643 if (&res->tracking == &dlm->tracking_list) 649 if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +666,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
660 } else 666 } else
661 dl = NULL; 667 dl = NULL;
662 668
669bail:
663 /* passed to seq_show */ 670 /* passed to seq_show */
664 return dl; 671 return dl;
665} 672}
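
lockres_seq_start() previously ran list_for_each_entry() on a possibly empty tracking list and could end up treating the list head itself as a lockres; the new branch bails out with a NULL iterator while still under track_lock. Condensed from the hunk:

	spin_lock(&dlm->track_lock);
	if (oldres)
		track_list = &oldres->tracking;
	else {
		track_list = &dlm->tracking_list;
		if (list_empty(track_list)) {
			/* nothing tracked yet: unlock and hand
			 * seq_show a NULL entry */
			dl = NULL;
			spin_unlock(&dlm->track_lock);
			goto bail;
		}
	}
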
@@ -715,7 +722,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
715 goto bail; 722 goto bail;
716 } 723 }
717 724
718 seq = (struct seq_file *) file->private_data; 725 seq = file->private_data;
719 seq->private = dl; 726 seq->private = dl;
720 727
721 dlm_grab(dlm); 728 dlm_grab(dlm);
@@ -731,7 +738,7 @@ bail:
731 738
732static int debug_lockres_release(struct inode *inode, struct file *file) 739static int debug_lockres_release(struct inode *inode, struct file *file)
733{ 740{
734 struct seq_file *seq = (struct seq_file *)file->private_data; 741 struct seq_file *seq = file->private_data;
735 struct debug_lockres *dl = (struct debug_lockres *)seq->private; 742 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
736 743
737 if (dl->dl_res) 744 if (dl->dl_res)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..11a5c87fd7f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -691,6 +693,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
691 693
692 dlm_mark_domain_leaving(dlm); 694 dlm_mark_domain_leaving(dlm);
693 dlm_leave_domain(dlm); 695 dlm_leave_domain(dlm);
696 dlm_force_free_mles(dlm);
694 dlm_complete_dlm_shutdown(dlm); 697 dlm_complete_dlm_shutdown(dlm);
695 } 698 }
696 dlm_put(dlm); 699 dlm_put(dlm);
@@ -904,7 +907,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 907 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 908 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 909
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 910 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 911 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 912 __dlm_print_nodes(dlm);
910 913
@@ -962,7 +965,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 965 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 966 NULL);
964 if (status < 0) { 967 if (status < 0) {
965 mlog_errno(status); 968 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
969 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
970 node);
966 goto bail; 971 goto bail;
967 } 972 }
968 973
@@ -1029,10 +1034,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1034 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1035
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1036 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1037 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1038 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1039 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1040 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1041 node);
1036 goto bail; 1042 goto bail;
1037 } 1043 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1044 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1109,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1109 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1110 NULL);
1105 if (status < 0) 1111 if (status < 0)
1106 mlog_errno(status); 1112 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1113 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1114 node);
1107 1115
1108 return status; 1116 return status;
1109} 1117}
@@ -1516,7 +1524,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1524 goto leave;
1517 } 1525 }
1518 1526
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1527 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1528 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1529 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1530 kfree(dlm);
@@ -1550,7 +1558,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1558 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1559 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1560
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1561 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1562 dlm->node_num = o2nm_this_node();
1556 1563
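
dlm->name switches from an open-coded kmalloc(strlen() + 1) followed, much later, by strcpy() to a single kstrdup(), removing the window in which dlm->name pointed at uninitialized memory. The idiom:

	/* allocate and copy in one step; NULL on allocation failure */
	dlm->name = kstrdup(domain, GFP_KERNEL);
	if (dlm->name == NULL) {
		mlog_errno(-ENOMEM);
		kfree(dlm);
		dlm = NULL;
		goto leave;
	}
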
@@ -1665,7 +1672,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
1665 struct dlm_ctxt *dlm = NULL; 1672 struct dlm_ctxt *dlm = NULL;
1666 struct dlm_ctxt *new_ctxt = NULL; 1673 struct dlm_ctxt *new_ctxt = NULL;
1667 1674
1668 if (strlen(domain) > O2NM_MAX_NAME_LEN) { 1675 if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
1669 ret = -ENAMETOOLONG; 1676 ret = -ENAMETOOLONG;
1670 mlog(ML_ERROR, "domain name length too long\n"); 1677 mlog(ML_ERROR, "domain name length too long\n");
1671 goto leave; 1678 goto leave;
@@ -1703,6 +1710,7 @@ retry:
1703 } 1710 }
1704 1711
1705 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { 1712 if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
1713 spin_unlock(&dlm_domain_lock);
1706 mlog(ML_ERROR, 1714 mlog(ML_ERROR,
1707 "Requested locking protocol version is not " 1715 "Requested locking protocol version is not "
1708 "compatible with already registered domain " 1716 "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9289b4357d27..f564b0e5f80d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
511 511
512 atomic_dec(&dlm->res_cur_count); 512 atomic_dec(&dlm->res_cur_count);
513 513
514 dlm_put(dlm);
515
516 if (!hlist_unhashed(&res->hash_node) || 514 if (!hlist_unhashed(&res->hash_node) ||
517 !list_empty(&res->granted) || 515 !list_empty(&res->granted) ||
518 !list_empty(&res->converting) || 516 !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
585 res->migration_pending = 0; 583 res->migration_pending = 0;
586 res->inflight_locks = 0; 584 res->inflight_locks = 0;
587 585
588 /* put in dlm_lockres_release */
589 dlm_grab(dlm);
590 res->dlm = dlm; 586 res->dlm = dlm;
591 587
592 kref_init(&res->refs); 588 kref_init(&res->refs);
@@ -617,13 +613,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 613{
618 struct dlm_lock_resource *res = NULL; 614 struct dlm_lock_resource *res = NULL;
619 615
620 res = (struct dlm_lock_resource *) 616 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 617 if (!res)
623 goto error; 618 goto error;
624 619
625 res->lockname.name = (char *) 620 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 621 if (!res->lockname.name)
628 goto error; 622 goto error;
629 623
@@ -757,8 +751,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 751 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 752 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 753 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 754 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 755 if (!alloc_mle)
763 goto leave; 756 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 757 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1535,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1535 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1536 spin_unlock(&dlm->spinlock);
1544 1537
1545 mle = (struct dlm_master_list_entry *) 1538 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1539 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1540 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1541 mlog_errno(-ENOMEM);
@@ -1666,7 +1658,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1658 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1659 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1660 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1661 mlog(ML_ERROR, "Error %d when sending message %u (key "
1662 "0x%x) to node %u\n", tmpret,
1663 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1664 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1665 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1666 BUG();
@@ -2205,7 +2199,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2205 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2206 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2207 if (ret < 0) 2201 if (ret < 0)
2208 mlog_errno(ret); 2202 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2203 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2204 res->owner);
2209 else if (r < 0) { 2205 else if (r < 0) {
2210 /* BAD. other node says I did not have a ref. */ 2206 /* BAD. other node says I did not have a ref. */
2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2207 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2448,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2452 goto leave; 2448 goto leave;
2453 } 2449 }
2454 2450
2455 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2451 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2456 GFP_NOFS);
2457 if (!mle) { 2452 if (!mle) {
2458 mlog_errno(ret); 2453 mlog_errno(ret);
2459 goto leave; 2454 goto leave;
@@ -2809,14 +2804,8 @@ again:
2809 mlog(0, "trying again...\n"); 2804 mlog(0, "trying again...\n");
2810 goto again; 2805 goto again;
2811 } 2806 }
2812 /* now that we are sure the MIGRATING state is there, drop
2813	 * the unneeded state which blocked threads trying to DIRTY */
2814 spin_lock(&res->spinlock);
2815 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2816 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2817 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2818 spin_unlock(&res->spinlock);
2819 2807
2808 ret = 0;
2820 /* did the target go down or die? */ 2809 /* did the target go down or die? */
2821 spin_lock(&dlm->spinlock); 2810 spin_lock(&dlm->spinlock);
2822 if (!test_bit(target, dlm->domain_map)) { 2811 if (!test_bit(target, dlm->domain_map)) {
@@ -2827,9 +2816,21 @@ again:
2827 spin_unlock(&dlm->spinlock); 2816 spin_unlock(&dlm->spinlock);
2828 2817
2829 /* 2818 /*
2819 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2820 * another try; otherwise, we are sure the MIGRATING state is there,
 2821	 * drop the unneeded state which blocked threads trying to DIRTY
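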
2822 */
2823 spin_lock(&res->spinlock);
2824 BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2825 res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2826 if (!ret)
2827 BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2828 spin_unlock(&res->spinlock);
2829
2830 /*
2830 * at this point: 2831 * at this point:
2831 * 2832 *
2832 * o the DLM_LOCK_RES_MIGRATING flag is set 2833 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2833 * o there are no pending asts on this lockres 2834 * o there are no pending asts on this lockres
2834 * o all processes trying to reserve an ast on this 2835 * o all processes trying to reserve an ast on this
2835 * lockres must wait for the MIGRATING flag to clear 2836 * lockres must wait for the MIGRATING flag to clear
@@ -2975,7 +2976,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2975 &migrate, sizeof(migrate), nodenum, 2976 &migrate, sizeof(migrate), nodenum,
2976 &status); 2977 &status);
2977 if (ret < 0) { 2978 if (ret < 0) {
2978 mlog(0, "migrate_request returned %d!\n", ret); 2979 mlog(ML_ERROR, "Error %d when sending message %u (key "
2980 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2981 dlm->key, nodenum);
2979 if (!dlm_is_host_down(ret)) { 2982 if (!dlm_is_host_down(ret)) {
2980 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2983 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2981 BUG(); 2984 BUG();
@@ -3033,8 +3036,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3033 hash = dlm_lockid_hash(name, namelen); 3036 hash = dlm_lockid_hash(name, namelen);
3034 3037
3035 /* preallocate.. if this fails, abort */ 3038 /* preallocate.. if this fails, abort */
3036 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3039 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3037 GFP_NOFS);
3038 3040
3039 if (!mle) { 3041 if (!mle) {
3040 ret = -ENOMEM; 3042 ret = -ENOMEM;
@@ -3044,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3044 /* check for pre-existing lock */ 3046 /* check for pre-existing lock */
3045 spin_lock(&dlm->spinlock); 3047 spin_lock(&dlm->spinlock);
3046 res = __dlm_lookup_lockres(dlm, name, namelen, hash); 3048 res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3047 spin_lock(&dlm->master_lock);
3048
3049 if (res) { 3049 if (res) {
3050 spin_lock(&res->spinlock); 3050 spin_lock(&res->spinlock);
3051 if (res->state & DLM_LOCK_RES_RECOVERING) { 3051 if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3063,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3063 spin_unlock(&res->spinlock); 3063 spin_unlock(&res->spinlock);
3064 } 3064 }
3065 3065
3066 spin_lock(&dlm->master_lock);
3066 /* ignore status. only nonzero status would BUG. */ 3067 /* ignore status. only nonzero status would BUG. */
3067 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 3068 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3068 name, namelen, 3069 name, namelen,
3069 migrate->new_master, 3070 migrate->new_master,
3070 migrate->master); 3071 migrate->master);
3071 3072
3072unlock:
3073 spin_unlock(&dlm->master_lock); 3073 spin_unlock(&dlm->master_lock);
3074unlock:
3074 spin_unlock(&dlm->spinlock); 3075 spin_unlock(&dlm->spinlock);
3075 3076
3076 if (oldmle) { 3077 if (oldmle) {
@@ -3432,3 +3433,43 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3432 wake_up(&res->wq); 3433 wake_up(&res->wq);
3433 wake_up(&dlm->migration_wq); 3434 wake_up(&dlm->migration_wq);
3434} 3435}
3436
3437void dlm_force_free_mles(struct dlm_ctxt *dlm)
3438{
3439 int i;
3440 struct hlist_head *bucket;
3441 struct dlm_master_list_entry *mle;
3442 struct hlist_node *tmp, *list;
3443
3444 /*
3445 * We notified all other nodes that we are exiting the domain and
3446 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3447 * around we force free them and wake any processes that are waiting
3448 * on the mles
3449 */
3450 spin_lock(&dlm->spinlock);
3451 spin_lock(&dlm->master_lock);
3452
3453 BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3454 BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3455
3456 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3457 bucket = dlm_master_hash(dlm, i);
3458 hlist_for_each_safe(list, tmp, bucket) {
3459 mle = hlist_entry(list, struct dlm_master_list_entry,
3460 master_hash_node);
3461 if (mle->type != DLM_MLE_BLOCK) {
3462 mlog(ML_ERROR, "bad mle: %p\n", mle);
3463 dlm_print_one_mle(mle);
3464 }
3465 atomic_set(&mle->woken, 1);
3466 wake_up(&mle->wq);
3467
3468 __dlm_unlink_mle(dlm, mle);
3469 __dlm_mle_detach_hb_events(dlm, mle);
3470 __dlm_put_mle(mle);
3471 }
3472 }
3473 spin_unlock(&dlm->master_lock);
3474 spin_unlock(&dlm->spinlock);
3475}
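
dlm_force_free_mles() plugs a teardown leak: block MLEs created by lock requests that raced with the shutdown would otherwise linger with waiters parked on them. Its single call site is the dlmdomain.c hunk earlier in this diff:

	dlm_mark_domain_leaving(dlm);
	dlm_leave_domain(dlm);
	/* all remote nodes have acked our exit, so any MLE still
	 * hashed is stale: wake its waiters and drop it */
	dlm_force_free_mles(dlm);
	dlm_complete_dlm_shutdown(dlm);
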
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { 463 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
464 int bit; 464 int bit;
465 465
466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); 466 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
467 if (bit >= O2NM_MAX_NODES || bit < 0) 467 if (bit >= O2NM_MAX_NODES || bit < 0)
468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); 468 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
469 else 469 else
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1991,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1991 struct list_head *queue; 1997 struct list_head *queue;
1992 struct dlm_lock *lock, *next; 1998 struct dlm_lock *lock, *next;
1993 1999
2000 assert_spin_locked(&dlm->spinlock);
2001 assert_spin_locked(&res->spinlock);
1994 res->state |= DLM_LOCK_RES_RECOVERING; 2002 res->state |= DLM_LOCK_RES_RECOVERING;
1995 if (!list_empty(&res->recovering)) { 2003 if (!list_empty(&res->recovering)) {
1996 mlog(0, 2004 mlog(0,
@@ -2320,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2320 /* zero the lvb if necessary */ 2328 /* zero the lvb if necessary */
2321 dlm_revalidate_lvb(dlm, res, dead_node); 2329 dlm_revalidate_lvb(dlm, res, dead_node);
2322 if (res->owner == dead_node) { 2330 if (res->owner == dead_node) {
2323 if (res->state & DLM_LOCK_RES_DROPPING_REF) 2331 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2324 mlog(0, "%s:%.*s: owned by " 2332 mlog(ML_NOTICE, "Ignore %.*s for "
2325 "dead node %u, this node was " 2333 "recovery as it is being freed\n",
2326 "dropping its ref when it died. " 2334 res->lockname.len,
2327 "continue, dropping the flag.\n", 2335 res->lockname.name);
2328 dlm->name, res->lockname.len, 2336 } else
2329 res->lockname.name, dead_node); 2337 dlm_move_lockres_to_recovery_list(dlm,
2330 2338 res);
2331 /* the wake_up for this will happen when the
2332 * RECOVERING flag is dropped later */
2333 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
2334 2339
2335 dlm_move_lockres_to_recovery_list(dlm, res);
2336 } else if (res->owner == dlm->node_num) { 2340 } else if (res->owner == dlm->node_num) {
2337 dlm_free_dead_locks(dlm, res, dead_node); 2341 dlm_free_dead_locks(dlm, res, dead_node);
2338 __dlm_lockres_calc_usage(dlm, res); 2342 __dlm_lockres_calc_usage(dlm, res);
@@ -2640,7 +2644,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2644 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2645 /* node is down. not involved in recovery
2642 * so just keep going */ 2646 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2647 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2648 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2649 ret = 0;
2646 } 2650 }
@@ -2660,11 +2664,12 @@ retry:
2660 } 2664 }
2661 if (ret < 0) { 2665 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2666 struct dlm_lock_resource *res;
2667
2663 /* this is now a serious problem, possibly ENOMEM 2668 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2669 * in the network stack. must retry */
2665 mlog_errno(ret); 2670 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2671 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2672 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2673 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2674 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2675 if (res) {
@@ -2789,7 +2794,9 @@ stage2:
2789 if (ret >= 0) 2794 if (ret >= 0)
2790 ret = status; 2795 ret = status;
2791 if (ret < 0) { 2796 if (ret < 0) {
2792 mlog_errno(ret); 2797 mlog(ML_ERROR, "Error %d when sending message %u (key "
2798 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2799 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2800 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2801 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2802 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1fd1d35..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
92 * truly ready to be freed. */ 92 * truly ready to be freed. */
93int __dlm_lockres_unused(struct dlm_lock_resource *res) 93int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 if (!__dlm_lockres_has_locks(res) && 95 int bit;
96 (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { 96
97 /* try not to scan the bitmap unless the first two 97 if (__dlm_lockres_has_locks(res))
98 * conditions are already true */ 98 return 0;
99 int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 99
100 if (bit >= O2NM_MAX_NODES) { 100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 /* since the bit for dlm->node_num is not 101 return 0;
102 * set, inflight_locks better be zero */ 102
103 BUG_ON(res->inflight_locks != 0); 103 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 1; 104 return 0;
105 } 105
106 } 106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 return 0; 107 if (bit < O2NM_MAX_NODES)
108 return 0;
109
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1;
108} 116}
109 117
110 118
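
__dlm_lockres_unused() is rewritten from one nested condition into a chain of early returns, and gains one genuinely new check: a lockres in recovery now counts as busy. Flattened, the new body reads:

	int __dlm_lockres_unused(struct dlm_lock_resource *res)
	{
		int bit;

		if (__dlm_lockres_has_locks(res))
			return 0;	/* locks still attached */

		if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
			return 0;	/* shuffle still pending */

		if (res->state & DLM_LOCK_RES_RECOVERING)
			return 0;	/* new: mid-recovery */

		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES)
			return 0;	/* remote references remain */

		/* our own refmap bit is clear, so nothing is in flight */
		BUG_ON(res->inflight_locks != 0);
		return 1;
	}
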
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
152 spin_unlock(&dlm->spinlock); 160 spin_unlock(&dlm->spinlock);
153} 161}
154 162
155static int dlm_purge_lockres(struct dlm_ctxt *dlm, 163static void dlm_purge_lockres(struct dlm_ctxt *dlm,
156 struct dlm_lock_resource *res) 164 struct dlm_lock_resource *res)
157{ 165{
158 int master; 166 int master;
159 int ret = 0; 167 int ret = 0;
160 168
161 spin_lock(&res->spinlock); 169 assert_spin_locked(&dlm->spinlock);
162 if (!__dlm_lockres_unused(res)) { 170 assert_spin_locked(&res->spinlock);
163 mlog(0, "%s:%.*s: tried to purge but not unused\n",
164 dlm->name, res->lockname.len, res->lockname.name);
165 __dlm_print_one_lock_resource(res);
166 spin_unlock(&res->spinlock);
167 BUG();
168 }
169
170 if (res->state & DLM_LOCK_RES_MIGRATING) {
171 mlog(0, "%s:%.*s: Delay dropref as this lockres is "
172 "being remastered\n", dlm->name, res->lockname.len,
173 res->lockname.name);
174 /* Re-add the lockres to the end of the purge list */
175 if (!list_empty(&res->purge)) {
176 list_del_init(&res->purge);
177 list_add_tail(&res->purge, &dlm->purge_list);
178 }
179 spin_unlock(&res->spinlock);
180 return 0;
181 }
182 171
183 master = (res->owner == dlm->node_num); 172 master = (res->owner == dlm->node_num);
184 173
185 if (!master)
186 res->state |= DLM_LOCK_RES_DROPPING_REF;
187 spin_unlock(&res->spinlock);
188 174
189 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, 175 mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
190 res->lockname.name, master); 176 res->lockname.name, master);
191 177
192 if (!master) { 178 if (!master) {
179 res->state |= DLM_LOCK_RES_DROPPING_REF;
193 /* drop spinlock... retake below */ 180 /* drop spinlock... retake below */
181 spin_unlock(&res->spinlock);
194 spin_unlock(&dlm->spinlock); 182 spin_unlock(&dlm->spinlock);
195 183
196 spin_lock(&res->spinlock); 184 spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
208 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", 196 mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
209 dlm->name, res->lockname.len, res->lockname.name, ret); 197 dlm->name, res->lockname.len, res->lockname.name, ret);
210 spin_lock(&dlm->spinlock); 198 spin_lock(&dlm->spinlock);
199 spin_lock(&res->spinlock);
211 } 200 }
212 201
213 spin_lock(&res->spinlock);
214 if (!list_empty(&res->purge)) { 202 if (!list_empty(&res->purge)) {
215 mlog(0, "removing lockres %.*s:%p from purgelist, " 203 mlog(0, "removing lockres %.*s:%p from purgelist, "
216 "master = %d\n", res->lockname.len, res->lockname.name, 204 "master = %d\n", res->lockname.len, res->lockname.name,
217 res, master); 205 res, master);
218 list_del_init(&res->purge); 206 list_del_init(&res->purge);
219 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 207 dlm_lockres_put(res);
221 dlm->purge_count--; 208 dlm->purge_count--;
222 } else 209 }
223 spin_unlock(&res->spinlock); 210
211 if (!__dlm_lockres_unused(res)) {
212 mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
213 dlm->name, res->lockname.len, res->lockname.name);
214 __dlm_print_one_lock_resource(res);
215 BUG();
216 }
224 217
225 __dlm_unhash_lockres(res); 218 __dlm_unhash_lockres(res);
226 219
227 /* lockres is not in the hash now. drop the flag and wake up 220 /* lockres is not in the hash now. drop the flag and wake up
228 * any processes waiting in dlm_get_lock_resource. */ 221 * any processes waiting in dlm_get_lock_resource. */
229 if (!master) { 222 if (!master) {
230 spin_lock(&res->spinlock);
231 res->state &= ~DLM_LOCK_RES_DROPPING_REF; 223 res->state &= ~DLM_LOCK_RES_DROPPING_REF;
232 spin_unlock(&res->spinlock); 224 spin_unlock(&res->spinlock);
233 wake_up(&res->wq); 225 wake_up(&res->wq);
234 } 226 } else
235 return 0; 227 spin_unlock(&res->spinlock);
236} 228}
237 229
238static void dlm_run_purge_list(struct dlm_ctxt *dlm, 230static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
251 lockres = list_entry(dlm->purge_list.next, 243 lockres = list_entry(dlm->purge_list.next,
252 struct dlm_lock_resource, purge); 244 struct dlm_lock_resource, purge);
253 245
254 /* Status of the lockres *might* change so double
255 * check. If the lockres is unused, holding the dlm
256 * spinlock will prevent people from getting and more
257 * refs on it -- there's no need to keep the lockres
258 * spinlock. */
259 spin_lock(&lockres->spinlock); 246 spin_lock(&lockres->spinlock);
260 unused = __dlm_lockres_unused(lockres);
261 spin_unlock(&lockres->spinlock);
262
263 if (!unused)
264 continue;
265 247
266 purge_jiffies = lockres->last_used + 248 purge_jiffies = lockres->last_used +
267 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); 249 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
273 * in tail order, we can stop at the first 255 * in tail order, we can stop at the first
274 * unpurgable resource -- anyone added after 256 * unpurgable resource -- anyone added after
275 * him will have a greater last_used value */ 257 * him will have a greater last_used value */
258 spin_unlock(&lockres->spinlock);
276 break; 259 break;
277 } 260 }
278 261
262 /* Status of the lockres *might* change so double
263 * check. If the lockres is unused, holding the dlm
 264	 * spinlock will prevent people from getting any more
265 * refs on it. */
266 unused = __dlm_lockres_unused(lockres);
267 if (!unused ||
268 (lockres->state & DLM_LOCK_RES_MIGRATING)) {
269 mlog(0, "lockres %s:%.*s: is in use or "
270 "being remastered, used %d, state %d\n",
271 dlm->name, lockres->lockname.len,
272 lockres->lockname.name, !unused, lockres->state);
273 list_move_tail(&dlm->purge_list, &lockres->purge);
274 spin_unlock(&lockres->spinlock);
275 continue;
276 }
277
279 dlm_lockres_get(lockres); 278 dlm_lockres_get(lockres);
280 279
281 /* This may drop and reacquire the dlm spinlock if it 280 dlm_purge_lockres(dlm, lockres);
282 * has to do migration. */
283 if (dlm_purge_lockres(dlm, lockres))
284 BUG();
285 281
286 dlm_lockres_put(lockres); 282 dlm_lockres_put(lockres);
287 283
@@ -309,6 +305,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
309 * spinlock, and because we know that it is not migrating/ 305 * spinlock, and because we know that it is not migrating/
310 * recovering/in-progress, it is fine to reserve asts and 306 * recovering/in-progress, it is fine to reserve asts and
311 * basts right before queueing them all throughout */ 307 * basts right before queueing them all throughout */
308 assert_spin_locked(&dlm->ast_lock);
312 assert_spin_locked(&res->spinlock); 309 assert_spin_locked(&res->spinlock);
313 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 310 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
314 DLM_LOCK_RES_RECOVERING| 311 DLM_LOCK_RES_RECOVERING|
@@ -337,7 +334,7 @@ converting:
337 /* queue the BAST if not already */ 334 /* queue the BAST if not already */
338 if (lock->ml.highest_blocked == LKM_IVMODE) { 335 if (lock->ml.highest_blocked == LKM_IVMODE) {
339 __dlm_lockres_reserve_ast(res); 336 __dlm_lockres_reserve_ast(res);
340 dlm_queue_bast(dlm, lock); 337 __dlm_queue_bast(dlm, lock);
341 } 338 }
342 /* update the highest_blocked if needed */ 339 /* update the highest_blocked if needed */
343 if (lock->ml.highest_blocked < target->ml.convert_type) 340 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +352,7 @@ converting:
355 can_grant = 0; 352 can_grant = 0;
356 if (lock->ml.highest_blocked == LKM_IVMODE) { 353 if (lock->ml.highest_blocked == LKM_IVMODE) {
357 __dlm_lockres_reserve_ast(res); 354 __dlm_lockres_reserve_ast(res);
358 dlm_queue_bast(dlm, lock); 355 __dlm_queue_bast(dlm, lock);
359 } 356 }
360 if (lock->ml.highest_blocked < target->ml.convert_type) 357 if (lock->ml.highest_blocked < target->ml.convert_type)
361 lock->ml.highest_blocked = 358 lock->ml.highest_blocked =
@@ -383,7 +380,7 @@ converting:
383 spin_unlock(&target->spinlock); 380 spin_unlock(&target->spinlock);
384 381
385 __dlm_lockres_reserve_ast(res); 382 __dlm_lockres_reserve_ast(res);
386 dlm_queue_ast(dlm, target); 383 __dlm_queue_ast(dlm, target);
387 /* go back and check for more */ 384 /* go back and check for more */
388 goto converting; 385 goto converting;
389 } 386 }
@@ -402,7 +399,7 @@ blocked:
402 can_grant = 0; 399 can_grant = 0;
403 if (lock->ml.highest_blocked == LKM_IVMODE) { 400 if (lock->ml.highest_blocked == LKM_IVMODE) {
404 __dlm_lockres_reserve_ast(res); 401 __dlm_lockres_reserve_ast(res);
405 dlm_queue_bast(dlm, lock); 402 __dlm_queue_bast(dlm, lock);
406 } 403 }
407 if (lock->ml.highest_blocked < target->ml.type) 404 if (lock->ml.highest_blocked < target->ml.type)
408 lock->ml.highest_blocked = target->ml.type; 405 lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +415,7 @@ blocked:
418 can_grant = 0; 415 can_grant = 0;
419 if (lock->ml.highest_blocked == LKM_IVMODE) { 416 if (lock->ml.highest_blocked == LKM_IVMODE) {
420 __dlm_lockres_reserve_ast(res); 417 __dlm_lockres_reserve_ast(res);
421 dlm_queue_bast(dlm, lock); 418 __dlm_queue_bast(dlm, lock);
422 } 419 }
423 if (lock->ml.highest_blocked < target->ml.type) 420 if (lock->ml.highest_blocked < target->ml.type)
424 lock->ml.highest_blocked = target->ml.type; 421 lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +441,7 @@ blocked:
444 spin_unlock(&target->spinlock); 441 spin_unlock(&target->spinlock);
445 442
446 __dlm_lockres_reserve_ast(res); 443 __dlm_lockres_reserve_ast(res);
447 dlm_queue_ast(dlm, target); 444 __dlm_queue_ast(dlm, target);
448 /* go back and check for more */ 445 /* go back and check for more */
449 goto converting; 446 goto converting;
450 } 447 }
@@ -674,6 +671,7 @@ static int dlm_thread(void *data)
674 /* lockres can be re-dirtied/re-added to the 671 /* lockres can be re-dirtied/re-added to the
675 * dirty_list in this gap, but that is ok */ 672 * dirty_list in this gap, but that is ok */
676 673
674 spin_lock(&dlm->ast_lock);
677 spin_lock(&res->spinlock); 675 spin_lock(&res->spinlock);
678 if (res->owner != dlm->node_num) { 676 if (res->owner != dlm->node_num) {
679 __dlm_print_one_lock_resource(res); 677 __dlm_print_one_lock_resource(res);
@@ -694,6 +692,7 @@ static int dlm_thread(void *data)
694 /* move it to the tail and keep going */ 692 /* move it to the tail and keep going */
695 res->state &= ~DLM_LOCK_RES_DIRTY; 693 res->state &= ~DLM_LOCK_RES_DIRTY;
696 spin_unlock(&res->spinlock); 694 spin_unlock(&res->spinlock);
695 spin_unlock(&dlm->ast_lock);
697 mlog(0, "delaying list shuffling for in-" 696 mlog(0, "delaying list shuffling for in-"
698 "progress lockres %.*s, state=%d\n", 697 "progress lockres %.*s, state=%d\n",
699 res->lockname.len, res->lockname.name, 698 res->lockname.len, res->lockname.name,
@@ -715,6 +714,7 @@ static int dlm_thread(void *data)
715 dlm_shuffle_lists(dlm, res); 714 dlm_shuffle_lists(dlm, res);
716 res->state &= ~DLM_LOCK_RES_DIRTY; 715 res->state &= ~DLM_LOCK_RES_DIRTY;
717 spin_unlock(&res->spinlock); 716 spin_unlock(&res->spinlock);
717 spin_unlock(&dlm->ast_lock);
718 718
719 dlm_lockres_calc_usage(dlm, res); 719 dlm_lockres_calc_usage(dlm, res);
720 720
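
The dlm_thread() hunks above establish the lock ordering that lets dlm_shuffle_lists() call the unlocked __dlm_queue_ast()/__dlm_queue_bast() variants: dlm->ast_lock is taken before res->spinlock and held across the whole shuffle, so reserving and queueing an AST is atomic with the list walk. The skeleton (a sketch; the body between the locks is unchanged):

	spin_lock(&dlm->ast_lock);	/* new outer lock */
	spin_lock(&res->spinlock);

	dlm_shuffle_lists(dlm, res);	/* asserts both locks held */
	res->state &= ~DLM_LOCK_RES_DIRTY;

	spin_unlock(&res->spinlock);
	spin_unlock(&dlm->ast_lock);	/* early exits unlock in the same order */
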
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b47c1b92b82b..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
354 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
355 ret = status; 355 ret = status;
356 } else { 356 } else {
357 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
358 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
359 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
360 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b83d6107a1f5..c2903b84bb7a 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode,
182{ 182{
183 int level, status; 183 int level, status;
184 struct dlmfs_inode_private *ip = DLMFS_I(inode); 184 struct dlmfs_inode_private *ip = DLMFS_I(inode);
185 struct dlmfs_filp_private *fp = 185 struct dlmfs_filp_private *fp = file->private_data;
186 (struct dlmfs_filp_private *) file->private_data;
187 186
188 if (S_ISDIR(inode->i_mode)) 187 if (S_ISDIR(inode->i_mode))
189 BUG(); 188 BUG();
@@ -214,10 +213,12 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
214 213
215 attr->ia_valid &= ~ATTR_SIZE; 214 attr->ia_valid &= ~ATTR_SIZE;
216 error = inode_change_ok(inode, attr); 215 error = inode_change_ok(inode, attr);
217 if (!error) 216 if (error)
218 error = inode_setattr(inode, attr); 217 return error;
219 218
220 return error; 219 setattr_copy(inode, attr);
220 mark_inode_dirty(inode);
221 return 0;
221} 222}
222 223
223static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait) 224static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
@@ -355,13 +356,12 @@ static void dlmfs_destroy_inode(struct inode *inode)
355 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 356 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
356} 357}
357 358
358static void dlmfs_clear_inode(struct inode *inode) 359static void dlmfs_evict_inode(struct inode *inode)
359{ 360{
360 int status; 361 int status;
361 struct dlmfs_inode_private *ip; 362 struct dlmfs_inode_private *ip;
362 363
363 if (!inode) 364 end_writeback(inode);
364 return;
365 365
366 mlog(0, "inode %lu\n", inode->i_ino); 366 mlog(0, "inode %lu\n", inode->i_ino);
367 367
@@ -631,7 +631,7 @@ static const struct super_operations dlmfs_ops = {
631 .statfs = simple_statfs, 631 .statfs = simple_statfs,
632 .alloc_inode = dlmfs_alloc_inode, 632 .alloc_inode = dlmfs_alloc_inode,
633 .destroy_inode = dlmfs_destroy_inode, 633 .destroy_inode = dlmfs_destroy_inode,
634 .clear_inode = dlmfs_clear_inode, 634 .evict_inode = dlmfs_evict_inode,
635 .drop_inode = generic_delete_inode, 635 .drop_inode = generic_delete_inode,
636}; 636};
637 637
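
dlmfs follows the 2.6.36 VFS conversion: ->clear_inode becomes ->evict_inode, which must call end_writeback() itself and is never handed a NULL inode, and ->setattr replaces inode_setattr() with setattr_copy() plus mark_inode_dirty(). The resulting setattr, condensed from the hunk:

	static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		attr->ia_valid &= ~ATTR_SIZE;	/* size changes unsupported */
		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		/* copy the validated attributes; writeback persists them */
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
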
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..5e02a893f46e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
2966 2966
2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) 2967static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2968{ 2968{
2969 struct seq_file *seq = (struct seq_file *) file->private_data; 2969 struct seq_file *seq = file->private_data;
2970 struct ocfs2_dlm_seq_priv *priv = seq->private; 2970 struct ocfs2_dlm_seq_priv *priv = seq->private;
2971 struct ocfs2_lock_res *res = &priv->p_iter_res; 2971 struct ocfs2_lock_res *res = &priv->p_iter_res;
2972 2972
@@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
3000 goto out; 3000 goto out;
3001 } 3001 }
3002 3002
3003 seq = (struct seq_file *) file->private_data; 3003 seq = file->private_data;
3004 seq->private = priv; 3004 seq->private = priv;
3005 3005
3006 ocfs2_add_lockres_tracking(&priv->p_iter_res, 3006 ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d6..1d596d8c4a4a 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
84 OI_LS_PARENT, 84 OI_LS_PARENT,
85 OI_LS_RENAME1, 85 OI_LS_RENAME1,
86 OI_LS_RENAME2, 86 OI_LS_RENAME2,
87 OI_LS_REFLINK_TARGET,
87}; 88};
88 89
89int ocfs2_dlm_init(struct ocfs2_super *osb); 90int ocfs2_dlm_init(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5fbd9cea968..9a03c151b5ce 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,6 +36,7 @@
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h>
39 40
40#define MLOG_MASK_PREFIX ML_INODE 41#define MLOG_MASK_PREFIX ML_INODE
41#include <cluster/masklog.h> 42#include <cluster/masklog.h>
@@ -175,13 +176,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 176 return 0;
176} 177}
177 178
178static int ocfs2_sync_file(struct file *file, 179static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 180{
182 int err = 0; 181 int err = 0;
183 journal_t *journal; 182 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 183 struct dentry *dentry = file->f_path.dentry;
184 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 186
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -191,8 +191,16 @@ static int ocfs2_sync_file(struct file *file,
191 if (err) 191 if (err)
192 goto bail; 192 goto bail;
193 193
194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 194 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
195 /*
196 * We still have to flush drive's caches to get data to the
197 * platter
198 */
199 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
200 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
201 NULL, BLKDEV_IFL_WAIT);
195 goto bail; 202 goto bail;
203 }
196 204
197 journal = osb->journal->j_journal; 205 journal = osb->journal->j_journal;
198 err = jbd2_journal_force_commit(journal); 206 err = jbd2_journal_force_commit(journal);
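
ocfs2_sync_file() adapts to fsync's prototype losing its dentry argument and closes a datasync hole: when only data changed (no I_DIRTY_DATASYNC), no journal commit runs, so nothing flushed the drive's volatile cache. With barriers enabled the cache is now flushed explicitly, using the 2.6.36-era blkdev_issue_flush() signature visible above:

	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
		/* no metadata commit will happen, but the data may
		 * still sit in the drive's write cache -- flush it */
		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
					   NULL, BLKDEV_IFL_WAIT);
		goto bail;
	}
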
@@ -278,10 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 286 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 287 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 288 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 289 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 290
286out_commit: 291out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 292 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +435,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 435 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 436 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 437
433 status = ocfs2_journal_dirty(handle, fe_bh); 438 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 439
437out_commit: 440out_commit:
438 ocfs2_commit_trans(osb, handle); 441 ocfs2_commit_trans(osb, handle);
@@ -449,7 +452,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 452 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 453 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 454 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 455
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 456 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 457 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +490,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 490
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 491 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 492
493 ocfs2_resv_discard(&osb->osb_la_resmap,
494 &OCFS2_I(inode)->ip_la_data_resv);
495
491 /* 496 /*
492 * The inode lock forced other nodes to sync and drop their 497 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 498 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +522,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 522 goto bail_unlock_sem;
518 } 523 }
519 524
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 525 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 526 if (status < 0) {
528 mlog_errno(status); 527 mlog_errno(status);
529 goto bail_unlock_sem; 528 goto bail_unlock_sem;
@@ -666,11 +665,7 @@ restarted_transaction:
666 goto leave; 665 goto leave;
667 } 666 }
668 667
669 status = ocfs2_journal_dirty(handle, bh); 668 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 669
675 spin_lock(&OCFS2_I(inode)->ip_lock); 670 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 671 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -738,61 +733,113 @@ leave:
738 return status; 733 return status;
739} 734}
740 735
736/*
737 * While a write will already be ordering the data, a truncate will not.
738 * Thus, we need to explicitly order the zeroed pages.
739 */
740static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
741{
742 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
743 handle_t *handle = NULL;
744 int ret = 0;
745
746 if (!ocfs2_should_order_data(inode))
747 goto out;
748
749 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
750 if (IS_ERR(handle)) {
751 ret = -ENOMEM;
752 mlog_errno(ret);
753 goto out;
754 }
755
756 ret = ocfs2_jbd2_file_inode(handle, inode);
757 if (ret < 0)
758 mlog_errno(ret);
759
760out:
761 if (ret) {
762 if (!IS_ERR(handle))
763 ocfs2_commit_trans(osb, handle);
764 handle = ERR_PTR(ret);
765 }
766 return handle;
767}
768
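
ocfs2_zero_start_ordered_transaction() exists because page zeroing during an extend writes file data with no surrounding write(2), so nothing has put those pages on the journal's ordered-data list; ocfs2_jbd2_file_inode() makes that attachment inside a small transaction. A hedged sketch of the intended call pattern, mirroring ocfs2_write_zero_page() below (error handling elided):

        handle_t *handle = NULL;

        /* lazily start the ordering transaction the first time a
         * block is actually zeroed; a NULL return (not an ERR_PTR)
         * means the mount does not use data=ordered */
        if (!handle) {
                handle = ocfs2_zero_start_ordered_transaction(inode);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }
        /* ... zero and commit the blocks of the page ... */
        if (handle)
                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
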
741/* Some parts of this taken from generic_cont_expand, which turned out 769/* Some parts of this taken from generic_cont_expand, which turned out
742 * to be too fragile to do exactly what we need without us having to 770 * to be too fragile to do exactly what we need without us having to
743 * worry about recursive locking in ->write_begin() and ->write_end(). */ 771 * worry about recursive locking in ->write_begin() and ->write_end(). */
744static int ocfs2_write_zero_page(struct inode *inode, 772static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
745 u64 size) 773 u64 abs_to)
746{ 774{
747 struct address_space *mapping = inode->i_mapping; 775 struct address_space *mapping = inode->i_mapping;
748 struct page *page; 776 struct page *page;
749 unsigned long index; 777 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
750 unsigned int offset;
751 handle_t *handle = NULL; 778 handle_t *handle = NULL;
752 int ret; 779 int ret = 0;
780 unsigned zero_from, zero_to, block_start, block_end;
753 781
754 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 782 BUG_ON(abs_from >= abs_to);
755 /* ugh. in prepare/commit_write, if from==to==start of block, we 783 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
756 ** skip the prepare. make sure we never send an offset for the start 784 BUG_ON(abs_from & (inode->i_blkbits - 1));
757 ** of a block
758 */
759 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
760 offset++;
761 }
762 index = size >> PAGE_CACHE_SHIFT;
763 785
764 page = grab_cache_page(mapping, index); 786 page = find_or_create_page(mapping, index, GFP_NOFS);
765 if (!page) { 787 if (!page) {
766 ret = -ENOMEM; 788 ret = -ENOMEM;
767 mlog_errno(ret); 789 mlog_errno(ret);
768 goto out; 790 goto out;
769 } 791 }
770 792
771 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 793 /* Get the offsets within the page that we want to zero */
772 if (ret < 0) { 794 zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
773 mlog_errno(ret); 795 zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
774 goto out_unlock; 796 if (!zero_to)
775 } 797 zero_to = PAGE_CACHE_SIZE;
776 798
777 if (ocfs2_should_order_data(inode)) { 799 mlog(0,
778 handle = ocfs2_start_walk_page_trans(inode, page, offset, 800 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
779 offset); 801 (unsigned long long)abs_from, (unsigned long long)abs_to,
780 if (IS_ERR(handle)) { 802 index, zero_from, zero_to);
781 ret = PTR_ERR(handle); 803
782 handle = NULL; 804 /* We know that zero_from is block aligned */
805 for (block_start = zero_from; block_start < zero_to;
806 block_start = block_end) {
807 block_end = block_start + (1 << inode->i_blkbits);
808
809 /*
810 * block_start is block-aligned. Bump it by one to
811 * force ocfs2_{prepare,commit}_write() to zero the
812 * whole block.
813 */
814 ret = ocfs2_prepare_write_nolock(inode, page,
815 block_start + 1,
816 block_start + 1);
817 if (ret < 0) {
818 mlog_errno(ret);
783 goto out_unlock; 819 goto out_unlock;
784 } 820 }
785 }
786 821
787 /* must not update i_size! */ 822 if (!handle) {
788 ret = block_commit_write(page, offset, offset); 823 handle = ocfs2_zero_start_ordered_transaction(inode);
789 if (ret < 0) 824 if (IS_ERR(handle)) {
790 mlog_errno(ret); 825 ret = PTR_ERR(handle);
791 else 826 handle = NULL;
792 ret = 0; 827 break;
828 }
829 }
830
831 /* must not update i_size! */
832 ret = block_commit_write(page, block_start + 1,
833 block_start + 1);
834 if (ret < 0)
835 mlog_errno(ret);
836 else
837 ret = 0;
838 }
793 839
794 if (handle) 840 if (handle)
795 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 841 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
842
796out_unlock: 843out_unlock:
797 unlock_page(page); 844 unlock_page(page);
798 page_cache_release(page); 845 page_cache_release(page);
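
The offset arithmetic in the rewritten ocfs2_write_zero_page() is compact; a standalone userspace replay makes it concrete. This is a demo under assumed parameters (4K pages, 512-byte blocks), not filesystem code:

        #include <stdio.h>
        #include <stdint.h>

        #define PAGE_CACHE_SIZE 4096UL  /* assumed 4K pages */

        int main(void)
        {
                uint64_t abs_from = 8192 + 1024, abs_to = 8192 + 3072;
                unsigned long index = abs_from / PAGE_CACHE_SIZE;
                unsigned zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
                unsigned zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
                unsigned blkbits = 9, bs, be;   /* 512-byte blocks */

                if (!zero_to)   /* range ends exactly on a page boundary */
                        zero_to = PAGE_CACHE_SIZE;

                printf("page %lu: zero [%u, %u)\n", index, zero_from, zero_to);
                for (bs = zero_from; bs < zero_to; bs = be) {
                        be = bs + (1 << blkbits);
                        /* bs + 1 is what forces prepare/commit_write to
                         * zero the whole block, as in the hunk above */
                        printf("  block [%u, %u) via offset %u\n",
                               bs, be, bs + 1);
                }
                return 0;
        }
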
@@ -800,22 +847,114 @@ out:
800 return ret; 847 return ret;
801} 848}
802 849
803static int ocfs2_zero_extend(struct inode *inode, 850/*
804 u64 zero_to_size) 851 * Find the next range to zero. We do this in terms of bytes because
852 * that's what ocfs2_zero_extend() wants, and it is dealing with the
853 * pagecache. We may return multiple extents.
854 *
 855 * zero_start and zero_end are ocfs2_zero_extend()'s current idea of what
856 * needs to be zeroed. range_start and range_end return the next zeroing
857 * range. A subsequent call should pass the previous range_end as its
858 * zero_start. If range_end is 0, there's nothing to do.
859 *
 860 * Unwritten extents are skipped over. Refcounted extents are CoW'd.
861 */
862static int ocfs2_zero_extend_get_range(struct inode *inode,
863 struct buffer_head *di_bh,
864 u64 zero_start, u64 zero_end,
865 u64 *range_start, u64 *range_end)
805{ 866{
806 int ret = 0; 867 int rc = 0, needs_cow = 0;
807 u64 start_off; 868 u32 p_cpos, zero_clusters = 0;
808 struct super_block *sb = inode->i_sb; 869 u32 zero_cpos =
870 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
871 u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
872 unsigned int num_clusters = 0;
873 unsigned int ext_flags = 0;
809 874
810 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 875 while (zero_cpos < last_cpos) {
811 while (start_off < zero_to_size) { 876 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
812 ret = ocfs2_write_zero_page(inode, start_off); 877 &num_clusters, &ext_flags);
813 if (ret < 0) { 878 if (rc) {
814 mlog_errno(ret); 879 mlog_errno(rc);
880 goto out;
881 }
882
883 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
884 zero_clusters = num_clusters;
885 if (ext_flags & OCFS2_EXT_REFCOUNTED)
886 needs_cow = 1;
887 break;
888 }
889
890 zero_cpos += num_clusters;
891 }
892 if (!zero_clusters) {
893 *range_end = 0;
894 goto out;
895 }
896
897 while ((zero_cpos + zero_clusters) < last_cpos) {
898 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
899 &p_cpos, &num_clusters,
900 &ext_flags);
901 if (rc) {
902 mlog_errno(rc);
815 goto out; 903 goto out;
816 } 904 }
817 905
818 start_off += sb->s_blocksize; 906 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
907 break;
908 if (ext_flags & OCFS2_EXT_REFCOUNTED)
909 needs_cow = 1;
910 zero_clusters += num_clusters;
911 }
912 if ((zero_cpos + zero_clusters) > last_cpos)
913 zero_clusters = last_cpos - zero_cpos;
914
915 if (needs_cow) {
916 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
917 UINT_MAX);
918 if (rc) {
919 mlog_errno(rc);
920 goto out;
921 }
922 }
923
924 *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
925 *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
926 zero_cpos + zero_clusters);
927
928out:
929 return rc;
930}
931
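
The comment above defines an iterator contract: the caller feeds the previous range_end back in as zero_start and stops once range_end comes back 0. That is exactly how ocfs2_zero_extend(), later in this hunk, drives it; condensed, with clamping and error logging elided:

        u64 zero_start = start, range_start = 0, range_end = 0;

        while (zero_start < zero_to_size) {
                ret = ocfs2_zero_extend_get_range(inode, di_bh,
                                                  zero_start, zero_to_size,
                                                  &range_start, &range_end);
                if (ret || !range_end)
                        break;  /* error, or nothing left to zero */
                ret = ocfs2_zero_extend_range(inode, range_start, range_end);
                if (ret)
                        break;
                zero_start = range_end;
        }
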
932/*
933 * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
934 * has made sure that the entire range needs zeroing.
935 */
936static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
937 u64 range_end)
938{
939 int rc = 0;
940 u64 next_pos;
941 u64 zero_pos = range_start;
942
943 mlog(0, "range_start = %llu, range_end = %llu\n",
944 (unsigned long long)range_start,
945 (unsigned long long)range_end);
946 BUG_ON(range_start >= range_end);
947
948 while (zero_pos < range_end) {
949 next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
950 if (next_pos > range_end)
951 next_pos = range_end;
952 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
953 if (rc < 0) {
954 mlog_errno(rc);
955 break;
956 }
957 zero_pos = next_pos;
819 958
820 /* 959 /*
821 * Very large extends have the potential to lock up 960 * Very large extends have the potential to lock up
@@ -824,16 +963,63 @@ static int ocfs2_zero_extend(struct inode *inode,
824 cond_resched(); 963 cond_resched();
825 } 964 }
826 965
827out: 966 return rc;
967}
968
969int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
970 loff_t zero_to_size)
971{
972 int ret = 0;
973 u64 zero_start, range_start = 0, range_end = 0;
974 struct super_block *sb = inode->i_sb;
975
976 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
977 mlog(0, "zero_start %llu for i_size %llu\n",
978 (unsigned long long)zero_start,
979 (unsigned long long)i_size_read(inode));
980 while (zero_start < zero_to_size) {
981 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
982 zero_to_size,
983 &range_start,
984 &range_end);
985 if (ret) {
986 mlog_errno(ret);
987 break;
988 }
989 if (!range_end)
990 break;
991 /* Trim the ends */
992 if (range_start < zero_start)
993 range_start = zero_start;
994 if (range_end > zero_to_size)
995 range_end = zero_to_size;
996
997 ret = ocfs2_zero_extend_range(inode, range_start,
998 range_end);
999 if (ret) {
1000 mlog_errno(ret);
1001 break;
1002 }
1003 zero_start = range_end;
1004 }
1005
828 return ret; 1006 return ret;
829} 1007}
830 1008
831int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) 1009int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1010 u64 new_i_size, u64 zero_to)
832{ 1011{
833 int ret; 1012 int ret;
834 u32 clusters_to_add; 1013 u32 clusters_to_add;
835 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1014 struct ocfs2_inode_info *oi = OCFS2_I(inode);
836 1015
1016 /*
1017 * Only quota files call this without a bh, and they can't be
1018 * refcounted.
1019 */
1020 BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1021 BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1022
837 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); 1023 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
838 if (clusters_to_add < oi->ip_clusters) 1024 if (clusters_to_add < oi->ip_clusters)
839 clusters_to_add = 0; 1025 clusters_to_add = 0;
@@ -854,7 +1040,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
854 * still need to zero the area between the old i_size and the 1040 * still need to zero the area between the old i_size and the
855 * new i_size. 1041 * new i_size.
856 */ 1042 */
857 ret = ocfs2_zero_extend(inode, zero_to); 1043 ret = ocfs2_zero_extend(inode, di_bh, zero_to);
858 if (ret < 0) 1044 if (ret < 0)
859 mlog_errno(ret); 1045 mlog_errno(ret);
860 1046
@@ -876,27 +1062,15 @@ static int ocfs2_extend_file(struct inode *inode,
876 goto out; 1062 goto out;
877 1063
878 if (i_size_read(inode) == new_i_size) 1064 if (i_size_read(inode) == new_i_size)
879 goto out; 1065 goto out;
880 BUG_ON(new_i_size < i_size_read(inode)); 1066 BUG_ON(new_i_size < i_size_read(inode));
881 1067
882 /* 1068 /*
883 * Fall through for converting inline data, even if the fs
884 * supports sparse files.
885 *
886 * The check for inline data here is legal - nobody can add
887 * the feature since we have i_mutex. We must check it again
888 * after acquiring ip_alloc_sem though, as paths like mmap
889 * might have raced us to converting the inode to extents.
890 */
891 if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
892 && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
893 goto out_update_size;
894
895 /*
896 * The alloc sem blocks people in read/write from reading our 1069 * The alloc sem blocks people in read/write from reading our
897 * allocation until we're done changing it. We depend on 1070 * allocation until we're done changing it. We depend on
898 * i_mutex to block other extend/truncate calls while we're 1071 * i_mutex to block other extend/truncate calls while we're
899 * here. 1072 * here. We even have to hold it for sparse files because there
1073 * might be some tail zeroing.
900 */ 1074 */
901 down_write(&oi->ip_alloc_sem); 1075 down_write(&oi->ip_alloc_sem);
902 1076
@@ -913,14 +1087,16 @@ static int ocfs2_extend_file(struct inode *inode,
913 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); 1087 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
914 if (ret) { 1088 if (ret) {
915 up_write(&oi->ip_alloc_sem); 1089 up_write(&oi->ip_alloc_sem);
916
917 mlog_errno(ret); 1090 mlog_errno(ret);
918 goto out; 1091 goto out;
919 } 1092 }
920 } 1093 }
921 1094
922 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 1095 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
923 ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); 1096 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1097 else
1098 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1099 new_i_size);
924 1100
925 up_write(&oi->ip_alloc_sem); 1101 up_write(&oi->ip_alloc_sem);
926 1102
@@ -946,9 +1122,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
946 struct ocfs2_super *osb = OCFS2_SB(sb); 1122 struct ocfs2_super *osb = OCFS2_SB(sb);
947 struct buffer_head *bh = NULL; 1123 struct buffer_head *bh = NULL;
948 handle_t *handle = NULL; 1124 handle_t *handle = NULL;
949 int qtype;
950 struct dquot *transfer_from[MAXQUOTAS] = { };
951 struct dquot *transfer_to[MAXQUOTAS] = { }; 1125 struct dquot *transfer_to[MAXQUOTAS] = { };
1126 int qtype;
952 1127
953 mlog_entry("(0x%p, '%.*s')\n", dentry, 1128 mlog_entry("(0x%p, '%.*s')\n", dentry,
954 dentry->d_name.len, dentry->d_name.name); 1129 dentry->d_name.len, dentry->d_name.name);
@@ -979,10 +1154,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
979 if (status) 1154 if (status)
980 return status; 1155 return status;
981 1156
1157 if (is_quota_modification(inode, attr))
1158 dquot_initialize(inode);
982 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 1159 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
983 if (size_change) { 1160 if (size_change) {
984 dquot_initialize(inode);
985
986 status = ocfs2_rw_lock(inode, 1); 1161 status = ocfs2_rw_lock(inode, 1);
987 if (status < 0) { 1162 if (status < 0) {
988 mlog_errno(status); 1163 mlog_errno(status);
@@ -1032,9 +1207,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1032 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1207 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1033 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1208 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1034 USRQUOTA); 1209 USRQUOTA);
1035 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1210 if (!transfer_to[USRQUOTA]) {
1036 USRQUOTA);
1037 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1038 status = -ESRCH; 1211 status = -ESRCH;
1039 goto bail_unlock; 1212 goto bail_unlock;
1040 } 1213 }
@@ -1044,9 +1217,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1044 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1217 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1045 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1218 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1046 GRPQUOTA); 1219 GRPQUOTA);
1047 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1220 if (!transfer_to[GRPQUOTA]) {
1048 GRPQUOTA);
1049 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1050 status = -ESRCH; 1221 status = -ESRCH;
1051 goto bail_unlock; 1222 goto bail_unlock;
1052 } 1223 }
@@ -1058,7 +1229,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1058 mlog_errno(status); 1229 mlog_errno(status);
1059 goto bail_unlock; 1230 goto bail_unlock;
1060 } 1231 }
1061 status = dquot_transfer(inode, attr); 1232 status = __dquot_transfer(inode, transfer_to);
1062 if (status < 0) 1233 if (status < 0)
1063 goto bail_commit; 1234 goto bail_commit;
1064 } else { 1235 } else {
@@ -1071,18 +1242,26 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1071 } 1242 }
1072 1243
1073 /* 1244 /*
1074 * This will intentionally not wind up calling vmtruncate(), 1245 * This will intentionally not wind up calling truncate_setsize(),
1075 * since all the work for a size change has been done above. 1246 * since all the work for a size change has been done above.
1076 * Otherwise, we could get into problems with truncate as 1247 * Otherwise, we could get into problems with truncate as
1077 * ip_alloc_sem is used there to protect against i_size 1248 * ip_alloc_sem is used there to protect against i_size
1078 * changes. 1249 * changes.
1250 *
1251 * XXX: this means the conditional below can probably be removed.
1079 */ 1252 */
1080 status = inode_setattr(inode, attr); 1253 if ((attr->ia_valid & ATTR_SIZE) &&
1081 if (status < 0) { 1254 attr->ia_size != i_size_read(inode)) {
1082 mlog_errno(status); 1255 status = vmtruncate(inode, attr->ia_size);
1083 goto bail_commit; 1256 if (status) {
1257 mlog_errno(status);
1258 goto bail_commit;
1259 }
1084 } 1260 }
1085 1261
1262 setattr_copy(inode, attr);
1263 mark_inode_dirty(inode);
1264
1086 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1265 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1087 if (status < 0) 1266 if (status < 0)
1088 mlog_errno(status); 1267 mlog_errno(status);
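
The uid/gid branch above switches from dquot_transfer() to __dquot_transfer() with pre-acquired dquots: ocfs2 must dqget() the target structures before the transaction starts, while the cluster locks are held, and __dquot_transfer() derives the source quotas from the inode itself, which is why the old transfer_from array disappears. The resulting sequence, condensed from this hunk (error paths elided):

        struct dquot *transfer_to[MAXQUOTAS] = { };

        if (attr->ia_valid & ATTR_UID)
                transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, USRQUOTA);
        if (attr->ia_valid & ATTR_GID)
                transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, GRPQUOTA);

        /* inside the started transaction: */
        status = __dquot_transfer(inode, transfer_to);

        /* after the transaction, success or not (dqput(NULL) is safe): */
        for (qtype = 0; qtype < MAXQUOTAS; qtype++)
                dqput(transfer_to[qtype]);
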
@@ -1098,10 +1277,8 @@ bail:
1098 brelse(bh); 1277 brelse(bh);
1099 1278
1100 /* Release quota pointers in case we acquired them */ 1279 /* Release quota pointers in case we acquired them */
1101 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1280 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1102 dqput(transfer_to[qtype]); 1281 dqput(transfer_to[qtype]);
1103 dqput(transfer_from[qtype]);
1104 }
1105 1282
1106 if (!status && attr->ia_valid & ATTR_MODE) { 1283 if (!status && attr->ia_valid & ATTR_MODE) {
1107 status = ocfs2_acl_chmod(inode); 1284 status = ocfs2_acl_chmod(inode);
@@ -1195,9 +1372,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1195 di = (struct ocfs2_dinode *) bh->b_data; 1372 di = (struct ocfs2_dinode *) bh->b_data;
1196 di->i_mode = cpu_to_le16(inode->i_mode); 1373 di->i_mode = cpu_to_le16(inode->i_mode);
1197 1374
1198 ret = ocfs2_journal_dirty(handle, bh); 1375 ocfs2_journal_dirty(handle, bh);
1199 if (ret < 0)
1200 mlog_errno(ret);
1201 1376
1202out_trans: 1377out_trans:
1203 ocfs2_commit_trans(osb, handle); 1378 ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1609,90 @@ out:
1434 return ret; 1609 return ret;
1435} 1610}
1436 1611
1612static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1613{
1614 int i;
1615 struct ocfs2_extent_rec *rec = NULL;
1616
1617 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1618
1619 rec = &el->l_recs[i];
1620
1621 if (le32_to_cpu(rec->e_cpos) < pos)
1622 break;
1623 }
1624
1625 return i;
1626}
1627
1628/*
 1629 * Helper to calculate the punching pos and length in one run. We handle the
1630 * following three cases in order:
1631 *
1632 * - remove the entire record
1633 * - remove a partial record
1634 * - no record needs to be removed (hole-punching completed)
 1635 */
1636static void ocfs2_calc_trunc_pos(struct inode *inode,
1637 struct ocfs2_extent_list *el,
1638 struct ocfs2_extent_rec *rec,
1639 u32 trunc_start, u32 *trunc_cpos,
1640 u32 *trunc_len, u32 *trunc_end,
1641 u64 *blkno, int *done)
1642{
1643 int ret = 0;
1644 u32 coff, range;
1645
1646 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1647
1648 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1649 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1650 /*
1651 * Skip holes if any.
1652 */
1653 if (range < *trunc_end)
1654 *trunc_end = range;
1655 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1656 *blkno = le64_to_cpu(rec->e_blkno);
1657 *trunc_end = le32_to_cpu(rec->e_cpos);
1658 } else if (range > trunc_start) {
1659 *trunc_cpos = trunc_start;
1660 *trunc_len = *trunc_end - trunc_start;
1661 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1662 *blkno = le64_to_cpu(rec->e_blkno) +
1663 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1664 *trunc_end = trunc_start;
1665 } else {
1666 /*
 1667 * There are two possibilities here:
1668 *
1669 * - last record has been removed
1670 * - trunc_start was within a hole
1671 *
 1672 * Both cases mean that hole punching is complete.
1673 */
1674 ret = 1;
1675 }
1676
1677 *done = ret;
1678}
1679
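
The three cases read more easily with numbers. The standalone demo below replays the middle case, a partial record removal, for a record covering clusters [100, 150) with trunc_start = 120 and trunc_end = 150; plain integers stand in for the le32/le64 fields and a constant replaces ocfs2_clusters_to_blocks():

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                /* extent record covering clusters [100, 150) */
                uint32_t e_cpos = 100, clusters = 50;
                uint64_t e_blkno = 4096;
                uint32_t trunc_start = 120, trunc_end = 150;
                uint32_t range = e_cpos + clusters;
                uint32_t trunc_cpos, trunc_len, coff;
                uint64_t blkno = 0;
                unsigned bpc = 8;       /* blocks per cluster, stand-in */

                if (e_cpos >= trunc_start) {
                        /* case 1: the whole record lies past trunc_start */
                        trunc_cpos = e_cpos;
                        if (range < trunc_end)
                                trunc_end = range;      /* skip holes */
                        trunc_len = trunc_end - e_cpos;
                        blkno = e_blkno;
                        trunc_end = e_cpos;
                } else if (range > trunc_start) {
                        /* case 2: only the record's tail is removed */
                        trunc_cpos = trunc_start;
                        trunc_len = trunc_end - trunc_start;
                        coff = trunc_start - e_cpos;
                        blkno = e_blkno + (uint64_t)coff * bpc;
                        trunc_end = trunc_start;
                } else {
                        /* case 3: nothing left, hole punch is complete */
                        printf("done\n");
                        return 0;
                }
                printf("punch [%u, +%u) from blkno %llu, new trunc_end %u\n",
                       trunc_cpos, trunc_len, (unsigned long long)blkno,
                       trunc_end);
                return 0;
        }
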
1437static int ocfs2_remove_inode_range(struct inode *inode, 1680static int ocfs2_remove_inode_range(struct inode *inode,
1438 struct buffer_head *di_bh, u64 byte_start, 1681 struct buffer_head *di_bh, u64 byte_start,
1439 u64 byte_len) 1682 u64 byte_len)
1440{ 1683{
1441 int ret = 0; 1684 int ret = 0, flags = 0, done = 0, i;
1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1685 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1686 u32 cluster_in_el;
1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1687 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1444 struct ocfs2_cached_dealloc_ctxt dealloc; 1688 struct ocfs2_cached_dealloc_ctxt dealloc;
1445 struct address_space *mapping = inode->i_mapping; 1689 struct address_space *mapping = inode->i_mapping;
1446 struct ocfs2_extent_tree et; 1690 struct ocfs2_extent_tree et;
1691 struct ocfs2_path *path = NULL;
1692 struct ocfs2_extent_list *el = NULL;
1693 struct ocfs2_extent_rec *rec = NULL;
1694 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1695 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1447 1696
1448 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1697 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1449 ocfs2_init_dealloc_ctxt(&dealloc); 1698 ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1718,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 goto out; 1718 goto out;
1470 } 1719 }
1471 1720
1721 /*
 1722 * For reflinks, we may need to CoW two clusters which might be
 1723 * partially zeroed later, if the hole's start and end offsets fall
 1724 * within one cluster (i.e. are not exactly aligned to the clustersize).
1725 */
1726
1727 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1728
1729 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1730 if (ret) {
1731 mlog_errno(ret);
1732 goto out;
1733 }
1734
1735 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1736 if (ret) {
1737 mlog_errno(ret);
1738 goto out;
1739 }
1740 }
1741
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1742 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1743 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1744 cluster_in_el = trunc_end;
1475 trunc_len -= trunc_start;
1476 else
1477 trunc_len = 0;
1478 1745
1479 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1746 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1480 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1747 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1481 (unsigned long long)byte_start, 1748 (unsigned long long)byte_start,
1482 (unsigned long long)byte_len, trunc_start, trunc_len); 1749 (unsigned long long)byte_len, trunc_start, trunc_end);
1483 1750
1484 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1751 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1485 if (ret) { 1752 if (ret) {
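
The reflink guard above CoWs just the two boundary clusters, so that the partial-cluster zeroing done next by ocfs2_zero_partial_clusters() writes into the file's private allocation rather than a shared extent. A trivial demo of which clusters those are, with a 64K cluster size assumed:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                unsigned clustersize_bits = 16;     /* 64K clusters */
                uint64_t byte_start = 10000, byte_len = 200000;

                printf("CoW cluster %llu (hole start) and %llu (hole end)\n",
                       (unsigned long long)(byte_start >> clustersize_bits),
                       (unsigned long long)((byte_start + byte_len)
                                            >> clustersize_bits));
                return 0;
        }
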
@@ -1487,31 +1754,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1487 goto out; 1754 goto out;
1488 } 1755 }
1489 1756
1490 cpos = trunc_start; 1757 path = ocfs2_new_path_from_et(&et);
1491 while (trunc_len) { 1758 if (!path) {
1492 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1759 ret = -ENOMEM;
1493 &alloc_size, NULL); 1760 mlog_errno(ret);
1761 goto out;
1762 }
1763
1764 while (trunc_end > trunc_start) {
1765
1766 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1767 cluster_in_el);
1494 if (ret) { 1768 if (ret) {
1495 mlog_errno(ret); 1769 mlog_errno(ret);
1496 goto out; 1770 goto out;
1497 } 1771 }
1498 1772
1499 if (alloc_size > trunc_len) 1773 el = path_leaf_el(path);
1500 alloc_size = trunc_len; 1774
1775 i = ocfs2_find_rec(el, trunc_end);
1776 /*
1777 * Need to go to previous extent block.
1778 */
1779 if (i < 0) {
1780 if (path->p_tree_depth == 0)
1781 break;
1501 1782
1502 /* Only do work for non-holes */ 1783 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1503 if (phys_cpos != 0) { 1784 path,
1504 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1785 &cluster_in_el);
1505 phys_cpos, alloc_size,
1506 &dealloc);
1507 if (ret) { 1786 if (ret) {
1508 mlog_errno(ret); 1787 mlog_errno(ret);
1509 goto out; 1788 goto out;
1510 } 1789 }
1790
1791 /*
1792 * We've reached the leftmost extent block,
1793 * it's safe to leave.
1794 */
1795 if (cluster_in_el == 0)
1796 break;
1797
1798 /*
 1799 * The cpos we search with for the previous extent block is
 1800 * always one cluster less than the actual trunc_end.
1801 */
1802 trunc_end = cluster_in_el + 1;
1803
1804 ocfs2_reinit_path(path, 1);
1805
1806 continue;
1807
1808 } else
1809 rec = &el->l_recs[i];
1810
1811 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1812 &trunc_len, &trunc_end, &blkno, &done);
1813 if (done)
1814 break;
1815
1816 flags = rec->e_flags;
1817 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1818
1819 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1820 phys_cpos, trunc_len, flags,
1821 &dealloc, refcount_loc);
1822 if (ret < 0) {
1823 mlog_errno(ret);
1824 goto out;
1511 } 1825 }
1512 1826
1513 cpos += alloc_size; 1827 cluster_in_el = trunc_end;
1514 trunc_len -= alloc_size; 1828
1829 ocfs2_reinit_path(path, 1);
1515 } 1830 }
1516 1831
1517 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1832 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
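
Compared with the old forward scan over ocfs2_get_clusters(), the new loop walks the extent tree right to left: find the leaf holding cluster_in_el, pick the last record starting below trunc_end, remove all or part of it, and when ocfs2_find_rec() returns -1, hop to the previous leaf. A control-flow skeleton of the loop above, error handling elided:

        while (trunc_end > trunc_start) {
                ocfs2_find_path(INODE_CACHE(inode), path, cluster_in_el);
                el = path_leaf_el(path);

                i = ocfs2_find_rec(el, trunc_end);
                if (i < 0) {
                        /* leaf exhausted: step to the previous leaf,
                         * or stop at the leftmost one */
                        if (path->p_tree_depth == 0)
                                break;
                        ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
                                                      &cluster_in_el);
                        if (cluster_in_el == 0)
                                break;
                        trunc_end = cluster_in_el + 1;
                } else {
                        rec = &el->l_recs[i];
                        ocfs2_calc_trunc_pos(inode, el, rec, trunc_start,
                                             &trunc_cpos, &trunc_len,
                                             &trunc_end, &blkno, &done);
                        if (done)
                                break;
                        flags = rec->e_flags;
                        phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                                             blkno);
                        ocfs2_remove_btree_range(inode, &et, trunc_cpos,
                                                 phys_cpos, trunc_len,
                                                 flags, &dealloc,
                                                 refcount_loc);
                        cluster_in_el = trunc_end;
                }
                ocfs2_reinit_path(path, 1);
        }
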
@@ -2001,9 +2316,13 @@ relock:
2001 * direct write may have instantiated a few 2316 * direct write may have instantiated a few
2002 * blocks outside i_size. Trim these off again. 2317 * blocks outside i_size. Trim these off again.
2003 * Don't need i_size_read because we hold i_mutex. 2318 * Don't need i_size_read because we hold i_mutex.
2319 *
2320 * XXX(truncate): this looks buggy because ocfs2 did not
2321 * actually implement ->truncate. Take a look at
2322 * the new truncate sequence and update this accordingly
2004 */ 2323 */
2005 if (*ppos + count > inode->i_size) 2324 if (*ppos + count > inode->i_size)
2006 vmtruncate(inode, inode->i_size); 2325 truncate_setsize(inode, inode->i_size);
2007 ret = written; 2326 ret = written;
2008 goto out_dio; 2327 goto out_dio;
2009 } 2328 }
@@ -2019,7 +2338,7 @@ out_dio:
2019 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 2338 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2020 2339
2021 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || 2340 if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2022 ((file->f_flags & O_DIRECT) && has_refcount)) { 2341 ((file->f_flags & O_DIRECT) && !direct_io)) {
2023 ret = filemap_fdatawrite_range(file->f_mapping, pos, 2342 ret = filemap_fdatawrite_range(file->f_mapping, pos,
2024 pos + count - 1); 2343 pos + count - 1);
2025 if (ret < 0) 2344 if (ret < 0)
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
54int ocfs2_simple_size_update(struct inode *inode, 54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh, 55 struct buffer_head *di_bh,
56 u64 new_i_size); 56 u64 new_i_size);
57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
58 u64 zero_to); 58 u64 new_i_size, u64 zero_to);
59int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
60 loff_t zero_to);
59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
60int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
61 struct kstat *stat); 63 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index af189887201c..eece3e05d9d0 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
376 376
377 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
378 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
379 mlog_exit_void(); 383 mlog_exit_void();
380} 384}
381 385
@@ -484,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
484 OCFS2_BH_IGNORE_CACHE); 488 OCFS2_BH_IGNORE_CACHE);
485 } else { 489 } else {
486 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 490 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
487 if (!status) 491 /*
492 * If buffer is in jbd, then its checksum may not have been
493 * computed as yet.
494 */
495 if (!status && !buffer_jbd(bh))
488 status = ocfs2_validate_inode_block(osb->sb, bh); 496 status = ocfs2_validate_inode_block(osb->sb, bh);
489 } 497 }
490 if (status < 0) { 498 if (status < 0) {
@@ -539,7 +547,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
539 struct buffer_head *fe_bh) 547 struct buffer_head *fe_bh)
540{ 548{
541 int status = 0; 549 int status = 0;
542 struct ocfs2_truncate_context *tc = NULL;
543 struct ocfs2_dinode *fe; 550 struct ocfs2_dinode *fe;
544 handle_t *handle = NULL; 551 handle_t *handle = NULL;
545 552
@@ -582,13 +589,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 589 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 590 handle = NULL;
584 591
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 592 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 593 if (status < 0) {
593 mlog_errno(status); 594 mlog_errno(status);
594 goto out; 595 goto out;
@@ -659,12 +660,7 @@ static int ocfs2_remove_inode(struct inode *inode,
659 660
660 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 661 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
661 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 662 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
662 663 ocfs2_journal_dirty(handle, di_bh);
663 status = ocfs2_journal_dirty(handle, di_bh);
664 if (status < 0) {
665 mlog_errno(status);
666 goto bail_commit;
667 }
668 664
669 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 665 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
670 dquot_free_inode(inode); 666 dquot_free_inode(inode);
@@ -977,10 +973,10 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
977 truncate_inode_pages(&inode->i_data, 0); 973 truncate_inode_pages(&inode->i_data, 0);
978} 974}
979 975
980void ocfs2_delete_inode(struct inode *inode) 976static void ocfs2_delete_inode(struct inode *inode)
981{ 977{
982 int wipe, status; 978 int wipe, status;
983 sigset_t blocked, oldset; 979 sigset_t oldset;
984 struct buffer_head *di_bh = NULL; 980 struct buffer_head *di_bh = NULL;
985 981
986 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 982 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +1003,7 @@ void ocfs2_delete_inode(struct inode *inode)
1007 * messaging paths may return us -ERESTARTSYS. Which would 1003 * messaging paths may return us -ERESTARTSYS. Which would
1008 * cause us to exit early, resulting in inodes being orphaned 1004 * cause us to exit early, resulting in inodes being orphaned
1009 * forever. */ 1005 * forever. */
1010 sigfillset(&blocked); 1006 ocfs2_block_signals(&oldset);
1011 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 ocfs2_cleanup_delete_inode(inode, 1);
1015 goto bail;
1016 }
1017 1007
1018 /* 1008 /*
1019 * Synchronize us against ocfs2_get_dentry. We take this in 1009 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,24 +1077,19 @@ bail_unlock_nfs_sync:
1087 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1077 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1088 1078
1089bail_unblock: 1079bail_unblock:
1090 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1080 ocfs2_unblock_signals(&oldset);
1091 if (status < 0)
1092 mlog_errno(status);
1093bail: 1081bail:
1094 clear_inode(inode);
1095 mlog_exit_void(); 1082 mlog_exit_void();
1096} 1083}
1097 1084
1098void ocfs2_clear_inode(struct inode *inode) 1085static void ocfs2_clear_inode(struct inode *inode)
1099{ 1086{
1100 int status; 1087 int status;
1101 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1088 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1102 1089
1103 mlog_entry_void(); 1090 mlog_entry_void();
1104 1091
1105 if (!inode) 1092 end_writeback(inode);
1106 goto bail;
1107
1108 mlog(0, "Clearing inode: %llu, nlink = %u\n", 1093 mlog(0, "Clearing inode: %llu, nlink = %u\n",
1109 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 1094 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
1110 1095
@@ -1123,6 +1108,10 @@ void ocfs2_clear_inode(struct inode *inode)
1123 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1124 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1109 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1125 1110
1111 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1112 &oi->ip_la_data_resv);
1113 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1114
1126 /* We very well may get a clear_inode before all an inodes 1115 /* We very well may get a clear_inode before all an inodes
1127 * metadata has hit disk. Of course, we can't drop any cluster 1116 * metadata has hit disk. Of course, we can't drop any cluster
1128 * locks until the journal has finished with it. The only 1117 * locks until the journal has finished with it. The only
@@ -1192,16 +1181,27 @@ void ocfs2_clear_inode(struct inode *inode)
1192 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1181 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1193 &oi->ip_jinode); 1182 &oi->ip_jinode);
1194 1183
1195bail:
1196 mlog_exit_void(); 1184 mlog_exit_void();
1197} 1185}
1198 1186
1187void ocfs2_evict_inode(struct inode *inode)
1188{
1189 if (!inode->i_nlink ||
1190 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
1191 ocfs2_delete_inode(inode);
1192 } else {
1193 truncate_inode_pages(&inode->i_data, 0);
1194 }
1195 ocfs2_clear_inode(inode);
1196}
1197
1199/* Called under inode_lock, with no more references on the 1198/* Called under inode_lock, with no more references on the
1200 * struct inode, so it's safe here to check the flags field 1199 * struct inode, so it's safe here to check the flags field
1201 * and to manipulate i_nlink without any other locks. */ 1200 * and to manipulate i_nlink without any other locks. */
1202void ocfs2_drop_inode(struct inode *inode) 1201int ocfs2_drop_inode(struct inode *inode)
1203{ 1202{
1204 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1203 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1204 int res;
1205 1205
1206 mlog_entry_void(); 1206 mlog_entry_void();
1207 1207
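
In 2.6.36 the VFS folded ->delete_inode and ->clear_inode into a single ->evict_inode, called once the inode has no users; ocfs2_evict_inode() above now makes the orphan/delete decision itself. Presumably the matching super_operations update lives in fs/ocfs2/super.c, outside this excerpt; the sketch below shows only the shape, and the other entries are pre-existing ocfs2 functions:

        /* assumed wiring in fs/ocfs2/super.c, not shown in this diff */
        static const struct super_operations ocfs2_sops = {
                .statfs         = ocfs2_statfs,
                .evict_inode    = ocfs2_evict_inode, /* replaces delete/clear */
                .drop_inode     = ocfs2_drop_inode,  /* now returns int */
                .sync_fs        = ocfs2_sync_fs,
        };
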
@@ -1209,11 +1209,12 @@ void ocfs2_drop_inode(struct inode *inode)
1209 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1209 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1210 1210
1211 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1211 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1212 generic_delete_inode(inode); 1212 res = 1;
1213 else 1213 else
1214 generic_drop_inode(inode); 1214 res = generic_drop_inode(inode);
1215 1215
1216 mlog_exit_void(); 1216 mlog_exit_void();
1217 return res;
1217} 1218}
1218 1219
1219/* 1220/*
@@ -1298,13 +1299,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1298 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1299 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1299 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1300 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1300 1301
1301 status = ocfs2_journal_dirty(handle, bh); 1302 ocfs2_journal_dirty(handle, bh);
1302 if (status < 0)
1303 mlog_errno(status);
1304
1305 status = 0;
1306leave: 1303leave:
1307
1308 mlog_exit(status); 1304 mlog_exit(status);
1309 return status; 1305 return status;
1310} 1306}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0b28e1921a39..6de5a869db30 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
@@ -121,9 +123,8 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
121 return &OCFS2_I(inode)->ip_metadata_cache; 123 return &OCFS2_I(inode)->ip_metadata_cache;
122} 124}
123 125
124void ocfs2_clear_inode(struct inode *inode); 126void ocfs2_evict_inode(struct inode *inode);
125void ocfs2_delete_inode(struct inode *inode); 127int ocfs2_drop_inode(struct inode *inode);
126void ocfs2_drop_inode(struct inode *inode);
127 128
128/* Flags for ocfs2_iget() */ 129/* Flags for ocfs2_iget() */
129#define OCFS2_FI_FLAG_SYSFILE 0x1 130#define OCFS2_FI_FLAG_SYSFILE 0x1
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..9b57c0350ff9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
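
The functional change here is subtle: jbd2_journal_restart() starts a fresh transaction, so the handle must be re-provisioned with the credits it already held plus the new ones, hence the old_nblocks snapshot of handle->h_buffer_credits. Condensed from the hunk above:

        int status, old_nblocks = handle->h_buffer_credits;

        if (!nblocks)
                return 0;

        status = jbd2_journal_extend(handle, nblocks);
        if (status > 0)
                /* the running transaction is too full to extend in
                 * place: commit it and restart the handle with the
                 * combined credit count */
                status = jbd2_journal_restart(handle,
                                              old_nblocks + nblocks);
        return status;
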
@@ -469,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
469 return container_of(triggers, struct ocfs2_triggers, ot_triggers); 472 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
470} 473}
471 474
472static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 475static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
473 struct buffer_head *bh, 476 struct buffer_head *bh,
474 void *data, size_t size) 477 void *data, size_t size)
475{ 478{
@@ -488,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
488 * Quota blocks have their own trigger because the struct ocfs2_block_check 491 * Quota blocks have their own trigger because the struct ocfs2_block_check
489 * offset depends on the blocksize. 492 * offset depends on the blocksize.
490 */ 493 */
491static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 494static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
492 struct buffer_head *bh, 495 struct buffer_head *bh,
493 void *data, size_t size) 496 void *data, size_t size)
494{ 497{
@@ -508,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
508 * Directory blocks also have their own trigger because the 511 * Directory blocks also have their own trigger because the
509 * struct ocfs2_block_check offset depends on the blocksize. 512 * struct ocfs2_block_check offset depends on the blocksize.
510 */ 513 */
511static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, 514static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
512 struct buffer_head *bh, 515 struct buffer_head *bh,
513 void *data, size_t size) 516 void *data, size_t size)
514{ 517{
@@ -541,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
541 544
542static struct ocfs2_triggers di_triggers = { 545static struct ocfs2_triggers di_triggers = {
543 .ot_triggers = { 546 .ot_triggers = {
544 .t_commit = ocfs2_commit_trigger, 547 .t_frozen = ocfs2_frozen_trigger,
545 .t_abort = ocfs2_abort_trigger, 548 .t_abort = ocfs2_abort_trigger,
546 }, 549 },
547 .ot_offset = offsetof(struct ocfs2_dinode, i_check), 550 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -549,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
549 552
550static struct ocfs2_triggers eb_triggers = { 553static struct ocfs2_triggers eb_triggers = {
551 .ot_triggers = { 554 .ot_triggers = {
552 .t_commit = ocfs2_commit_trigger, 555 .t_frozen = ocfs2_frozen_trigger,
553 .t_abort = ocfs2_abort_trigger, 556 .t_abort = ocfs2_abort_trigger,
554 }, 557 },
555 .ot_offset = offsetof(struct ocfs2_extent_block, h_check), 558 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -557,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
557 560
558static struct ocfs2_triggers rb_triggers = { 561static struct ocfs2_triggers rb_triggers = {
559 .ot_triggers = { 562 .ot_triggers = {
560 .t_commit = ocfs2_commit_trigger, 563 .t_frozen = ocfs2_frozen_trigger,
561 .t_abort = ocfs2_abort_trigger, 564 .t_abort = ocfs2_abort_trigger,
562 }, 565 },
563 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), 566 .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -565,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
565 568
566static struct ocfs2_triggers gd_triggers = { 569static struct ocfs2_triggers gd_triggers = {
567 .ot_triggers = { 570 .ot_triggers = {
568 .t_commit = ocfs2_commit_trigger, 571 .t_frozen = ocfs2_frozen_trigger,
569 .t_abort = ocfs2_abort_trigger, 572 .t_abort = ocfs2_abort_trigger,
570 }, 573 },
571 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), 574 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -573,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
573 576
574static struct ocfs2_triggers db_triggers = { 577static struct ocfs2_triggers db_triggers = {
575 .ot_triggers = { 578 .ot_triggers = {
576 .t_commit = ocfs2_db_commit_trigger, 579 .t_frozen = ocfs2_db_frozen_trigger,
577 .t_abort = ocfs2_abort_trigger, 580 .t_abort = ocfs2_abort_trigger,
578 }, 581 },
579}; 582};
580 583
581static struct ocfs2_triggers xb_triggers = { 584static struct ocfs2_triggers xb_triggers = {
582 .ot_triggers = { 585 .ot_triggers = {
583 .t_commit = ocfs2_commit_trigger, 586 .t_frozen = ocfs2_frozen_trigger,
584 .t_abort = ocfs2_abort_trigger, 587 .t_abort = ocfs2_abort_trigger,
585 }, 588 },
586 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), 589 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -588,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
588 591
589static struct ocfs2_triggers dq_triggers = { 592static struct ocfs2_triggers dq_triggers = {
590 .ot_triggers = { 593 .ot_triggers = {
591 .t_commit = ocfs2_dq_commit_trigger, 594 .t_frozen = ocfs2_dq_frozen_trigger,
592 .t_abort = ocfs2_abort_trigger, 595 .t_abort = ocfs2_abort_trigger,
593 }, 596 },
594}; 597};
595 598
596static struct ocfs2_triggers dr_triggers = { 599static struct ocfs2_triggers dr_triggers = {
597 .ot_triggers = { 600 .ot_triggers = {
598 .t_commit = ocfs2_commit_trigger, 601 .t_frozen = ocfs2_frozen_trigger,
599 .t_abort = ocfs2_abort_trigger, 602 .t_abort = ocfs2_abort_trigger,
600 }, 603 },
601 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), 604 .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -603,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
603 606
604static struct ocfs2_triggers dl_triggers = { 607static struct ocfs2_triggers dl_triggers = {
605 .ot_triggers = { 608 .ot_triggers = {
606 .t_commit = ocfs2_commit_trigger, 609 .t_frozen = ocfs2_frozen_trigger,
607 .t_abort = ocfs2_abort_trigger, 610 .t_abort = ocfs2_abort_trigger,
608 }, 611 },
609 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), 612 .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
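
The s/t_commit/t_frozen/ rename across these trigger structures tracks a jbd2 change in 2.6.36: the callback now runs on the frozen copy of the buffer that jbd2 is about to write, so ocfs2's block checksum is computed over exactly the bytes that reach the journal. The trigger bodies keep the same shape; presumably each generic one reduces to something like:

        static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
                                         struct buffer_head *bh,
                                         void *data, size_t size)
        {
                struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);

                /* 'data' is the frozen copy, not bh->b_data: stamp the
                 * block check at this structure's offset within it */
                ocfs2_block_check_compute(data, size, data + ot->ot_offset);
        }
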
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
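
With the jbd2 conversion long complete, jbd2_journal_dirty_metadata() can only fail when the buffer was never passed through a journal-access call on this handle, i.e. a programming error, so ocfs2_journal_dirty() now BUG()s instead of returning a status. That is what lets all the callers earlier in this patch drop their error paths. The calling convention (also documented in fs/ocfs2/journal.h below) becomes:

        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);        /* access can still fail */
                goto out;
        }

        /* ... modify the struct ocfs2_dinode in bh ... */

        ocfs2_journal_dirty(handle, bh);        /* void: cannot fail */
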
@@ -762,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
762 if (osb->osb_commit_interval) 760 if (osb->osb_commit_interval)
763 commit_interval = osb->osb_commit_interval; 761 commit_interval = osb->osb_commit_interval;
764 762
765 spin_lock(&journal->j_state_lock); 763 write_lock(&journal->j_state_lock);
766 journal->j_commit_interval = commit_interval; 764 journal->j_commit_interval = commit_interval;
767 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 765 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
768 journal->j_flags |= JBD2_BARRIER; 766 journal->j_flags |= JBD2_BARRIER;
769 else 767 else
770 journal->j_flags &= ~JBD2_BARRIER; 768 journal->j_flags &= ~JBD2_BARRIER;
771 spin_unlock(&journal->j_state_lock); 769 write_unlock(&journal->j_state_lock);
772} 770}
773 771
774int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 772int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -1938,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
1938 mutex_lock(&os->os_lock); 1936 mutex_lock(&os->os_lock);
1939 ocfs2_queue_orphan_scan(osb); 1937 ocfs2_queue_orphan_scan(osb);
1940 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) 1938 if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
1941 schedule_delayed_work(&os->os_orphan_scan_work, 1939 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1942 ocfs2_orphan_scan_timeout()); 1940 ocfs2_orphan_scan_timeout());
1943 mutex_unlock(&os->os_lock); 1941 mutex_unlock(&os->os_lock);
1944} 1942}
@@ -1978,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1978 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 1976 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
1979 else { 1977 else {
1980 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); 1978 atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
1981 schedule_delayed_work(&os->os_orphan_scan_work, 1979 queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
1982 ocfs2_orphan_scan_timeout()); 1980 ocfs2_orphan_scan_timeout());
1983 } 1981 }
1984} 1982}
1985 1983
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
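
A usage sketch for the new helper: a caller that started its transaction from ocfs2_calc_group_alloc_credits() and then finds the block group must be allocated discontiguously extends the handle before filling the descriptor's extent list. The real caller lives in suballoc.c, outside this excerpt, so the shape below, including the alloc_is_discontig flag, is assumed:

        handle = ocfs2_start_trans(osb,
                        ocfs2_calc_group_alloc_credits(osb->sb,
                                                       osb->bitmap_cpg));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        if (alloc_is_discontig) {       /* hypothetical flag */
                status = ocfs2_extend_trans(handle,
                                ocfs2_calc_bg_discontig_credits(osb->sb));
                if (status < 0)
                        goto out_commit;
        }
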
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c983715d8d8c..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,151 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
 79 * ocfs2_la_default_mb() - determine a default size, in megabytes, of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help
 93 * - block groups are allocated in 2MB and 4MB chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
 99 * - Some file systems can't support very large sizes - 4K and 8K cluster
 100 * sizes in particular are limited to less than 128 and 256 megabytes.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int la_max_mb;
122 unsigned int megs_per_slot;
123 struct super_block *sb = osb->sb;
124
125 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
126 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
127
128 /*
 129 * This takes care of file systems with very small group
130 * descriptors - 512 byte blocksize at cluster sizes lower
131 * than 16K and also 1k blocksize with 4k cluster size.
132 */
133 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
134 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
135 return OCFS2_LA_OLD_DEFAULT;
136
137 /*
138 * Leave enough room for some block groups and make the final
139 * value we work from a multiple of 4.
140 */
141 gd_mb -= 16;
 142 gd_mb &= 0xFFFFFFFC;
143
144 la_mb = gd_mb;
145
146 /*
147 * Keep window sizes down to a reasonable default
148 */
149 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
150 /*
151 * Some clustersize / blocksize combinations will have
152 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
153 * default size, but get poor distribution when
154 * limited to exactly 256 megabytes.
155 *
156 * As an example, 16K clustersize at 4K blocksize
157 * gives us a cluster group size of 504M. Paring the
158 * local alloc size down to 256 however, would give us
159 * only one window and around 200MB left in the
160 * cluster group. Instead, find the first size below
161 * 256 which would give us an even distribution.
162 *
163 * Larger cluster group sizes actually work out pretty
164 * well when pared to 256, so we don't have to do this
165 * for any group that fits more than two
166 * OCFS2_LA_MAX_DEFAULT_MB windows.
167 */
168 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
 169 la_mb = OCFS2_LA_MAX_DEFAULT_MB;
 170 else {
 171 unsigned int gd_mult = gd_mb;
 172
 173 while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB)
174 gd_mult = gd_mult >> 1;
175
176 la_mb = gd_mult;
177 }
178 }
179
180 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
181 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
182 /* Too many nodes, too few disk clusters. */
183 if (megs_per_slot < la_mb)
184 la_mb = megs_per_slot;
185
186 /* We can't store more bits than we can in a block. */
187 la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
188 ocfs2_local_alloc_size(sb) * 8);
189 if (la_mb > la_max_mb)
190 la_mb = la_max_mb;
191
192 return la_mb;
193}
194
195void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
196{
197 struct super_block *sb = osb->sb;
198 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
199 unsigned int la_max_mb;
200
201 la_max_mb = ocfs2_clusters_to_megabytes(sb,
202 ocfs2_local_alloc_size(sb) * 8);
203
204 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
205 requested_mb, la_max_mb, la_default_mb);
206
207 if (requested_mb == -1) {
208 /* No user request - use defaults */
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, la_default_mb);
211 } else if (requested_mb > la_max_mb) {
212 /* Request is too big, we give the maximum available */
213 osb->local_alloc_default_bits =
214 ocfs2_megabytes_to_clusters(sb, la_max_mb);
215 } else {
216 osb->local_alloc_default_bits =
217 ocfs2_megabytes_to_clusters(sb, requested_mb);
218 }
219
220 osb->local_alloc_bits = osb->local_alloc_default_bits;
221}
222
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 223static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 224{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 225 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
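For reference, the "csize / group / la" table in the comment above can be reproduced with a small user-space sketch. The two header sizes below (a 64-byte group-descriptor header before bg_bitmap, and a 208-byte dinode offset before the local alloc bitmap) are assumptions chosen to match the table, not values read from the on-disk headers:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;                  /* 4K blocks, as in the table */
	unsigned int group_bits = 8 * (blocksize - 64); /* clusters per cluster group */
	unsigned int la_bits = 8 * (blocksize - 208);   /* max bits a local alloc can hold */
	unsigned int bits;                              /* log2(cluster size) */

	for (bits = 12; bits <= 20; bits++)
		printf("csize: %4uK group: %5uM la: %5uM\n",
		       1u << (bits - 10),
		       group_bits >> (20 - bits),  /* clusters -> megabytes */
		       la_bits >> (20 - bits));
	return 0;
}

At 4K clusters this prints "group: 126M la: 121M", and the remaining rows follow by doubling, matching the table.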
@@ -156,7 +302,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 302 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 303 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 304 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 305 ocfs2_la_default_mb(osb));
160 } 306 }
161 307
162 /* read the alloc off disk */ 308 /* read the alloc off disk */
@@ -262,6 +408,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 408
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 409 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 410
411 ocfs2_resmap_uninit(&osb->osb_la_resmap);
412
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 413 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 414 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 415 OCFS2_INVALID_SLOT);
@@ -305,12 +453,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 453 }
306 454
307 ocfs2_clear_local_alloc(alloc); 455 ocfs2_clear_local_alloc(alloc);
308 456 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 457
315 brelse(bh); 458 brelse(bh);
316 osb->local_alloc_bh = NULL; 459 osb->local_alloc_bh = NULL;
@@ -481,46 +624,6 @@ out:
481 return status; 624 return status;
482} 625}
483 626
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 627/*
525 * make sure we've got at least bits_wanted contiguous bits in the 628 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 629 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +716,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 716 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 717 (unsigned long long)ac->ac_max_block);
615 718
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 719 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 720 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 721 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +756,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 756 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 757 la = OCFS2_LOCAL_ALLOC(alloc);
666 758
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 759 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
760 ac->ac_resv);
668 if (start == -1) { 761 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 762 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 763 status = -ENOSPC;
@@ -674,8 +767,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 767
675 bitmap = la->la_bitmap; 768 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 769 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 770 *num_bits = bits_wanted;
680 771
681 status = ocfs2_journal_access_di(handle, 772 status = ocfs2_journal_access_di(handle,
@@ -687,18 +778,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 778 goto bail;
688 } 779 }
689 780
781 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
782 bits_wanted);
783
690 while(bits_wanted--) 784 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 785 ocfs2_set_bit(start++, bitmap);
692 786
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 787 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
788 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 789
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 790bail:
703 mlog_exit(status); 791 mlog_exit(status);
704 return status; 792 return status;
@@ -722,13 +810,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 810}
723 811
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 812static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 813 struct ocfs2_dinode *alloc,
726 u32 numbits) 814 u32 *numbits,
815 struct ocfs2_alloc_reservation *resv)
727{ 816{
728 int numfound, bitoff, left, startoff, lastzero; 817 int numfound, bitoff, left, startoff, lastzero;
818 int local_resv = 0;
819 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 820 void *bitmap = NULL;
821 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 822
731 mlog_entry("(numbits wanted = %u)\n", numbits); 823 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 824
733 if (!alloc->id1.bitmap1.i_total) { 825 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 826 mlog(0, "No bits in my window!\n");
@@ -736,6 +828,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 828 goto bail;
737 } 829 }
738 830
831 if (!resv) {
832 local_resv = 1;
833 ocfs2_resv_init_once(&r);
834 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
835 resv = &r;
836 }
837
838 numfound = *numbits;
839 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
840 if (numfound < *numbits)
841 *numbits = numfound;
842 goto bail;
843 }
844
845 /*
846 * Code error. While reservations are enabled, local
847 * allocation should _always_ go through them.
848 */
849 BUG_ON(osb->osb_resv_level != 0);
850
851 /*
852 * Reservations are disabled. Handle this the old way.
853 */
854
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 855 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 856
741 numfound = bitoff = startoff = 0; 857 numfound = bitoff = startoff = 0;
@@ -761,7 +877,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 877 startoff = bitoff+1;
762 } 878 }
763 /* we got everything we needed */ 879 /* we got everything we needed */
764 if (numfound == numbits) { 880 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 881 /* mlog(0, "Found it all!\n"); */
766 break; 882 break;
767 } 883 }
@@ -770,12 +886,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 886 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 887 numfound);
772 888
773 if (numfound == numbits) 889 if (numfound == *numbits)
774 bitoff = startoff - numfound; 890 bitoff = startoff - numfound;
775 else 891 else
776 bitoff = -1; 892 bitoff = -1;
777 893
778bail: 894bail:
895 if (local_resv)
896 ocfs2_resv_discard(resmap, resv);
897
779 mlog_exit(bitoff); 898 mlog_exit(bitoff);
780 return bitoff; 899 return bitoff;
781} 900}
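The NULL-resv fallback added above follows a common pattern: build a throwaway reservation so that even one-off callers go through the reservation map, then tear it down before returning. A minimal user-space sketch of the shape, with assumed helper names modelled loosely on ocfs2_resv_init_once() and ocfs2_resv_discard():

#include <stdio.h>

struct resv { int tmp; int active; };

/* assumed stand-ins for the real reservation helpers */
static void resv_init(struct resv *r) { r->tmp = 0; r->active = 0; }
static void resv_set_tmp(struct resv *r) { r->tmp = 1; }
static void resv_discard(struct resv *r) { r->active = 0; }

/* Callers with no long-lived reservation pass NULL; a temporary one is
 * built so the search still goes through the reservation code, and it
 * must not outlive the call. */
static int find_bits(struct resv *resv)
{
	struct resv r;
	int local_resv = 0;

	if (!resv) {
		local_resv = 1;
		resv_init(&r);
		resv_set_tmp(&r);
		resv = &r;
	}

	resv->active = 1;	/* stand-in for the actual bitmap search */

	if (local_resv)
		resv_discard(resv);
	return 0;
}

int main(void)
{
	find_bits(NULL);	/* exercises the temporary-reservation path */
	puts("ok");
	return 0;
}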
@@ -1049,7 +1168,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1049 /* we used the generic suballoc reserve function, but we set 1168 /* we used the generic suballoc reserve function, but we set
1050 * everything up nicely, so there's no reason why we can't use 1169 * everything up nicely, so there's no reason why we can't use
1051 * the more specific cluster api to claim bits. */ 1170 * the more specific cluster api to claim bits. */
1052 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1171 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1053 &cluster_off, &cluster_count); 1172 &cluster_off, &cluster_count);
1054 if (status == -ENOSPC) { 1173 if (status == -ENOSPC) {
1055retry_enospc: 1174retry_enospc:
@@ -1063,7 +1182,7 @@ retry_enospc:
1063 goto bail; 1182 goto bail;
1064 1183
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1184 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1066 status = ocfs2_claim_clusters(osb, handle, ac, 1185 status = ocfs2_claim_clusters(handle, ac,
1067 osb->local_alloc_bits, 1186 osb->local_alloc_bits,
1068 &cluster_off, 1187 &cluster_off,
1069 &cluster_count); 1188 &cluster_count);
@@ -1098,6 +1217,9 @@ retry_enospc:
1098 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1217 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1099 le16_to_cpu(la->la_size)); 1218 le16_to_cpu(la->la_size));
1100 1219
1220 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1221 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1222
1101 mlog(0, "New window allocated:\n"); 1223 mlog(0, "New window allocated:\n");
1102 mlog(0, "window la_bm_off = %u\n", 1224 mlog(0, "window la_bm_off = %u\n",
1103 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1225 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1291,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1169 } 1291 }
1170 1292
1171 ocfs2_clear_local_alloc(alloc); 1293 ocfs2_clear_local_alloc(alloc);
1172 1294 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1173 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1174 if (status < 0) {
1175 mlog_errno(status);
1176 goto bail;
1177 }
1178 1295
1179 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1296 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1180 main_bm_inode, main_bm_bh); 1297 main_bm_inode, main_bm_bh);
@@ -1192,7 +1309,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1192 1309
1193 atomic_inc(&osb->alloc_stats.moves); 1310 atomic_inc(&osb->alloc_stats.moves);
1194 1311
1195 status = 0;
1196bail: 1312bail:
1197 if (handle) 1313 if (handle)
1198 ocfs2_commit_trans(osb, handle); 1314 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7898bd3a99f5..4c18f4ad93b4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -41,44 +41,20 @@
41#include "file.h" 41#include "file.h"
42#include "inode.h" 42#include "inode.h"
43#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
44 45
45static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
46{
47 /* The best way to deal with signals in the vm path is
48 * to block them upfront, rather than allowing the
49 * locking paths to return -ERESTARTSYS. */
50 sigfillset(blocked);
51
52 /* We should technically never get a bad return value
53 * from sigprocmask */
54 return sigprocmask(SIG_BLOCK, blocked, oldset);
55}
56
57static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
58{
59 return sigprocmask(SIG_SETMASK, oldset, NULL);
60}
61 46
62static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
63{ 48{
64 sigset_t blocked, oldset; 49 sigset_t oldset;
65 int error, ret; 50 int ret;
66 51
67 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
68 53
69 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
70 if (error < 0) {
71 mlog_errno(error);
72 ret = VM_FAULT_SIGBUS;
73 goto out;
74 }
75
76 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
77 57
78 error = ocfs2_vm_op_unblock_sigs(&oldset);
79 if (error < 0)
80 mlog_errno(error);
81out:
82 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
83 return ret; 59 return ret;
84} 60}
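The deleted open-coded helpers show what the new ocfs2_block_signals()/ocfs2_unblock_signals() pair is expected to do: block every signal up front so the cluster-locking paths never see -ERESTARTSYS, then restore the old mask. A user-space analogue built on the POSIX calls the old code wrapped (the kernel versions use in-kernel primitives, so this is only an illustration):

#include <signal.h>
#include <stdio.h>

static void block_all_signals(sigset_t *oldset)
{
	sigset_t blocked;

	sigfillset(&blocked);                       /* block everything */
	sigprocmask(SIG_BLOCK, &blocked, oldset);   /* save the old mask */
}

static void restore_signals(const sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);     /* put it back exactly */
}

int main(void)
{
	sigset_t oldset;

	block_all_signals(&oldset);
	/* ... work that must not be interrupted by signal delivery ... */
	restore_signals(&oldset);
	puts("signals restored");
	return 0;
}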
@@ -98,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
98 /* 74 /*
99 * Another node might have truncated while we were waiting on 75 * Another node might have truncated while we were waiting on
100 * cluster locks. 76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
101 */ 79 */
102 last_index = size >> PAGE_CACHE_SHIFT; 80 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
103 if (page->index > last_index) { 81 if (unlikely(!size || page->index > last_index)) {
104 ret = -EINVAL; 82 ret = -EINVAL;
105 goto out; 83 goto out;
106 } 84 }
@@ -131,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
131 * because the "write" would invalidate their data. 109 * because the "write" would invalidate their data.
132 */ 110 */
133 if (page->index == last_index) 111 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK; 112 len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
135 113
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, 114 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page); 115 &fsdata, di_bh, page);
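The last_index and len changes are easiest to see with a worked example at size == PAGE_SIZE, where the old expressions let a page entirely beyond EOF through the bounds check and computed a zero-length write for the last page:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long size = PAGE_SIZE;	/* i_size of exactly one page */

	/* old: last_index = 1, so page index 1 (fully past EOF) passes the
	 * "page->index > last_index" check, and the last-page trim is 0 */
	printf("old: last_index=%lu len_on_last=%lu\n",
	       size >> PAGE_SHIFT, size & ~PAGE_MASK);

	/* new: the last valid page is 0 and it gets a full-page length */
	printf("new: last_index=%lu len_on_last=%lu\n",
	       (size - 1) >> PAGE_SHIFT, ((size - 1) & ~PAGE_MASK) + 1);
	return 0;
}

This prints "old: last_index=1 len_on_last=0" versus "new: last_index=0 len_on_last=4096"; the unlikely(!size || ...) test covers the size == 0 case that the (size - 1) shift would otherwise mishandle.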
@@ -158,14 +136,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158 struct page *page = vmf->page; 136 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 137 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 138 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 139 sigset_t oldset;
162 int ret, ret2; 140 int ret;
163 141
164 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 142 ocfs2_block_signals(&oldset);
165 if (ret < 0) {
166 mlog_errno(ret);
167 return ret;
168 }
169 143
170 /* 144 /*
171 * The cluster locks taken will block a truncate from another 145 * The cluster locks taken will block a truncate from another
@@ -193,9 +167,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
193 ocfs2_inode_unlock(inode, 1); 167 ocfs2_inode_unlock(inode, 1);
194 168
195out: 169out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 170 ocfs2_unblock_signals(&oldset);
197 if (ret2 < 0)
198 mlog_errno(ret2);
199 if (ret) 171 if (ret)
200 ret = VM_FAULT_SIGBUS; 172 ret = VM_FAULT_SIGBUS;
201 return ret; 173 return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4cbb18f26c5f..a00dda2e4f16 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
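inode_init_owner() centralizes exactly the ownership logic deleted above: uid from the caller, gid inherited from a setgid parent directory (with directories also inheriting the setgid bit), otherwise gid from the caller. A standalone sketch of that logic with simplified types (the octal constants stand in for S_ISGID and S_IFDIR):

#include <stdio.h>

#define MODE_ISGID 02000	/* S_ISGID */
#define MODE_ISDIR 040000	/* S_IFDIR */

static void init_owner(unsigned *uid, unsigned *gid, unsigned *out_mode,
		       unsigned dir_mode, unsigned dir_gid,
		       unsigned fsuid, unsigned fsgid, unsigned mode)
{
	*uid = fsuid;
	if (dir_mode & MODE_ISGID) {
		*gid = dir_gid;			/* inherit group from parent */
		if (mode & MODE_ISDIR)
			mode |= MODE_ISGID;	/* subdirs inherit setgid too */
	} else {
		*gid = fsgid;
	}
	*out_mode = mode;
}

int main(void)
{
	unsigned uid, gid, mode;

	/* setgid parent (mode 02775, gid 100); new dir ends up 042755, gid 100 */
	init_owner(&uid, &gid, &mode, 02775, 100, 1000, 1000, 040755);
	printf("uid=%u gid=%u mode=%o\n", uid, gid, mode);
	return 0;
}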
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -439,6 +434,8 @@ leave:
439 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
440 435
441 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
442 439
443 if (status == -ENOSPC) 440 if (status == -ENOSPC)
444 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -475,31 +472,23 @@ leave:
475 return status; 472 return status;
476} 473}
477 474
478static int ocfs2_mknod_locked(struct ocfs2_super *osb, 475static int __ocfs2_mknod_locked(struct inode *dir,
479 struct inode *dir, 476 struct inode *inode,
480 struct inode *inode, 477 dev_t dev,
481 dev_t dev, 478 struct buffer_head **new_fe_bh,
482 struct buffer_head **new_fe_bh, 479 struct buffer_head *parent_fe_bh,
483 struct buffer_head *parent_fe_bh, 480 handle_t *handle,
484 handle_t *handle, 481 struct ocfs2_alloc_context *inode_ac,
485 struct ocfs2_alloc_context *inode_ac) 482 u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
486{ 483{
487 int status = 0; 484 int status = 0;
485 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
488 struct ocfs2_dinode *fe = NULL; 486 struct ocfs2_dinode *fe = NULL;
489 struct ocfs2_extent_list *fel; 487 struct ocfs2_extent_list *fel;
490 u64 fe_blkno = 0;
491 u16 suballoc_bit;
492 u16 feat; 488 u16 feat;
493 489
494 *new_fe_bh = NULL; 490 *new_fe_bh = NULL;
495 491
496 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
497 inode_ac, &suballoc_bit, &fe_blkno);
498 if (status < 0) {
499 mlog_errno(status);
500 goto leave;
501 }
502
503 /* populate as many fields early on as possible - many of 492 /* populate as many fields early on as possible - many of
504 * these are used by the support functions here and in 493 * these are used by the support functions here and in
505 * callers. */ 494 * callers. */
@@ -531,6 +520,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
531 fe->i_generation = cpu_to_le32(inode->i_generation); 520 fe->i_generation = cpu_to_le32(inode->i_generation);
532 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 521 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
533 fe->i_blkno = cpu_to_le64(fe_blkno); 522 fe->i_blkno = cpu_to_le64(fe_blkno);
523 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
534 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 524 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
535 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 525 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
536 fe->i_uid = cpu_to_le32(inode->i_uid); 526 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +557,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
567 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 557 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
568 } 558 }
569 559
570 status = ocfs2_journal_dirty(handle, *new_fe_bh); 560 ocfs2_journal_dirty(handle, *new_fe_bh);
571 if (status < 0) {
572 mlog_errno(status);
573 goto leave;
574 }
575 561
576 ocfs2_populate_inode(inode, fe, 1); 562 ocfs2_populate_inode(inode, fe, 1);
577 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 563 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -596,6 +582,34 @@ leave:
596 return status; 582 return status;
597} 583}
598 584
585static int ocfs2_mknod_locked(struct ocfs2_super *osb,
586 struct inode *dir,
587 struct inode *inode,
588 dev_t dev,
589 struct buffer_head **new_fe_bh,
590 struct buffer_head *parent_fe_bh,
591 handle_t *handle,
592 struct ocfs2_alloc_context *inode_ac)
593{
594 int status = 0;
595 u64 suballoc_loc, fe_blkno = 0;
596 u16 suballoc_bit;
597
598 *new_fe_bh = NULL;
599
600 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
601 inode_ac, &suballoc_loc,
602 &suballoc_bit, &fe_blkno);
603 if (status < 0) {
604 mlog_errno(status);
605 return status;
606 }
607
608 return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
609 parent_fe_bh, handle, inode_ac,
610 fe_blkno, suballoc_loc, suballoc_bit);
611}
612
599static int ocfs2_mkdir(struct inode *dir, 613static int ocfs2_mkdir(struct inode *dir,
600 struct dentry *dentry, 614 struct dentry *dentry,
601 int mode) 615 int mode)
@@ -637,6 +651,7 @@ static int ocfs2_link(struct dentry *old_dentry,
637 struct ocfs2_dinode *fe = NULL; 651 struct ocfs2_dinode *fe = NULL;
638 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 652 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
639 struct ocfs2_dir_lookup_result lookup = { NULL, }; 653 struct ocfs2_dir_lookup_result lookup = { NULL, };
654 sigset_t oldset;
640 655
641 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 656 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
642 old_dentry->d_name.len, old_dentry->d_name.name, 657 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +708,9 @@ static int ocfs2_link(struct dentry *old_dentry,
693 goto out_unlock_inode; 708 goto out_unlock_inode;
694 } 709 }
695 710
711 /* Starting to change things, restart is no longer possible. */
712 ocfs2_block_signals(&oldset);
713
696 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 714 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
697 OCFS2_JOURNAL_ACCESS_WRITE); 715 OCFS2_JOURNAL_ACCESS_WRITE);
698 if (err < 0) { 716 if (err < 0) {
@@ -705,14 +723,7 @@ static int ocfs2_link(struct dentry *old_dentry,
705 ocfs2_set_links_count(fe, inode->i_nlink); 723 ocfs2_set_links_count(fe, inode->i_nlink);
706 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 724 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
707 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 725 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
708 726 ocfs2_journal_dirty(handle, fe_bh);
709 err = ocfs2_journal_dirty(handle, fe_bh);
710 if (err < 0) {
711 ocfs2_add_links_count(fe, -1);
712 drop_nlink(inode);
713 mlog_errno(err);
714 goto out_commit;
715 }
716 727
717 err = ocfs2_add_entry(handle, dentry, inode, 728 err = ocfs2_add_entry(handle, dentry, inode,
718 OCFS2_I(inode)->ip_blkno, 729 OCFS2_I(inode)->ip_blkno,
@@ -736,6 +747,7 @@ static int ocfs2_link(struct dentry *old_dentry,
736 747
737out_commit: 748out_commit:
738 ocfs2_commit_trans(osb, handle); 749 ocfs2_commit_trans(osb, handle);
750 ocfs2_unblock_signals(&oldset);
739out_unlock_inode: 751out_unlock_inode:
740 ocfs2_inode_unlock(inode, 1); 752 ocfs2_inode_unlock(inode, 1);
741 753
@@ -909,12 +921,7 @@ static int ocfs2_unlink(struct inode *dir,
909 drop_nlink(inode); 921 drop_nlink(inode);
910 drop_nlink(inode); 922 drop_nlink(inode);
911 ocfs2_set_links_count(fe, inode->i_nlink); 923 ocfs2_set_links_count(fe, inode->i_nlink);
912 924 ocfs2_journal_dirty(handle, fe_bh);
913 status = ocfs2_journal_dirty(handle, fe_bh);
914 if (status < 0) {
915 mlog_errno(status);
916 goto leave;
917 }
918 925
919 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 926 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
920 if (S_ISDIR(inode->i_mode)) 927 if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1332 ocfs2_set_links_count(newfe, 0); 1339 ocfs2_set_links_count(newfe, 0);
1333 else 1340 else
1334 ocfs2_add_links_count(newfe, -1); 1341 ocfs2_add_links_count(newfe, -1);
1335 1342 ocfs2_journal_dirty(handle, newfe_bh);
1336 status = ocfs2_journal_dirty(handle, newfe_bh);
1337 if (status < 0) {
1338 mlog_errno(status);
1339 goto bail;
1340 }
1341 } else { 1343 } else {
1342 /* if the name was not found in new_dir, add it now */ 1344 /* if the name was not found in new_dir, add it now */
1343 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1345 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1358,7 @@ static int ocfs2_rename(struct inode *old_dir,
1356 1358
1357 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1359 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1358 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1360 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1359 1361 ocfs2_journal_dirty(handle, old_inode_bh);
1360 status = ocfs2_journal_dirty(handle, old_inode_bh);
1361 if (status < 0)
1362 mlog_errno(status);
1363 } else 1362 } else
1364 mlog_errno(status); 1363 mlog_errno(status);
1365 1364
@@ -1431,7 +1430,7 @@ static int ocfs2_rename(struct inode *old_dir,
1431 OCFS2_JOURNAL_ACCESS_WRITE); 1430 OCFS2_JOURNAL_ACCESS_WRITE);
1432 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1431 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1433 ocfs2_set_links_count(fe, old_dir->i_nlink); 1432 ocfs2_set_links_count(fe, old_dir->i_nlink);
1434 status = ocfs2_journal_dirty(handle, old_dir_bh); 1433 ocfs2_journal_dirty(handle, old_dir_bh);
1435 } 1434 }
1436 } 1435 }
1437 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1436 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1562,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1563 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1562 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1564 bytes_left); 1563 bytes_left);
1565 1564
1566 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1565 ocfs2_journal_dirty(handle, bhs[virtual]);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 1566
1572 virtual++; 1567 virtual++;
1573 p_blkno++; 1568 p_blkno++;
@@ -1611,6 +1606,8 @@ static int ocfs2_symlink(struct inode *dir,
1611 }; 1606 };
1612 int did_quota = 0, did_quota_inode = 0; 1607 int did_quota = 0, did_quota_inode = 0;
1613 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1608 struct ocfs2_dir_lookup_result lookup = { NULL, };
1609 sigset_t oldset;
1610 int did_block_signals = 0;
1614 1611
1615 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1612 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1616 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1613 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1703,10 @@ static int ocfs2_symlink(struct inode *dir,
1706 goto bail; 1703 goto bail;
1707 } 1704 }
1708 1705
1706 /* Starting to change things, restart is no longer possible. */
1707 ocfs2_block_signals(&oldset);
1708 did_block_signals = 1;
1709
1709 status = dquot_alloc_inode(inode); 1710 status = dquot_alloc_inode(inode);
1710 if (status) 1711 if (status)
1711 goto bail; 1712 goto bail;
@@ -1814,6 +1815,8 @@ bail:
1814 ocfs2_commit_trans(osb, handle); 1815 ocfs2_commit_trans(osb, handle);
1815 1816
1816 ocfs2_inode_unlock(dir, 1); 1817 ocfs2_inode_unlock(dir, 1);
1818 if (did_block_signals)
1819 ocfs2_unblock_signals(&oldset);
1817 1820
1818 brelse(new_fe_bh); 1821 brelse(new_fe_bh);
1819 brelse(parent_fe_bh); 1822 brelse(parent_fe_bh);
@@ -1868,61 +1871,117 @@ bail:
1868 return status; 1871 return status;
1869} 1872}
1870 1873
1871static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1874static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
1872 struct inode **ret_orphan_dir, 1875 struct inode **ret_orphan_dir,
1873 u64 blkno, 1876 struct buffer_head **ret_orphan_dir_bh)
1874 char *name,
1875 struct ocfs2_dir_lookup_result *lookup)
1876{ 1877{
1877 struct inode *orphan_dir_inode; 1878 struct inode *orphan_dir_inode;
1878 struct buffer_head *orphan_dir_bh = NULL; 1879 struct buffer_head *orphan_dir_bh = NULL;
1879 int status = 0; 1880 int ret = 0;
1880
1881 status = ocfs2_blkno_stringify(blkno, name);
1882 if (status < 0) {
1883 mlog_errno(status);
1884 return status;
1885 }
1886 1881
1887 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1882 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1888 ORPHAN_DIR_SYSTEM_INODE, 1883 ORPHAN_DIR_SYSTEM_INODE,
1889 osb->slot_num); 1884 osb->slot_num);
1890 if (!orphan_dir_inode) { 1885 if (!orphan_dir_inode) {
1891 status = -ENOENT; 1886 ret = -ENOENT;
1892 mlog_errno(status); 1887 mlog_errno(ret);
1893 return status; 1888 return ret;
1894 } 1889 }
1895 1890
1896 mutex_lock(&orphan_dir_inode->i_mutex); 1891 mutex_lock(&orphan_dir_inode->i_mutex);
1897 1892
1898 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 1893 ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
1899 if (status < 0) { 1894 if (ret < 0) {
1900 mlog_errno(status); 1895 mutex_unlock(&orphan_dir_inode->i_mutex);
1901 goto leave; 1896 iput(orphan_dir_inode);
1897
1898 mlog_errno(ret);
1899 return ret;
1902 } 1900 }
1903 1901
1904 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 1902 *ret_orphan_dir = orphan_dir_inode;
1905 orphan_dir_bh, name, 1903 *ret_orphan_dir_bh = orphan_dir_bh;
1906 OCFS2_ORPHAN_NAMELEN, lookup);
1907 if (status < 0) {
1908 ocfs2_inode_unlock(orphan_dir_inode, 1);
1909 1904
1910 mlog_errno(status); 1905 return 0;
1911 goto leave; 1906}
1907
1908static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
1909 struct buffer_head *orphan_dir_bh,
1910 u64 blkno,
1911 char *name,
1912 struct ocfs2_dir_lookup_result *lookup)
1913{
1914 int ret;
1915 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
1916
1917 ret = ocfs2_blkno_stringify(blkno, name);
1918 if (ret < 0) {
1919 mlog_errno(ret);
1920 return ret;
1921 }
1922
1923 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
1924 orphan_dir_bh, name,
1925 OCFS2_ORPHAN_NAMELEN, lookup);
1926 if (ret < 0) {
1927 mlog_errno(ret);
1928 return ret;
1929 }
1930
1931 return 0;
1932}
1933
1934/**
1935 * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
1936 * insertion of an orphan.
1937 * @osb: ocfs2 file system
1938 * @ret_orphan_dir: Orphan dir inode - returned locked!
1939 * @blkno: Actual block number of the inode to be inserted into orphan dir.
1940 * @lookup: dir lookup result, to be passed back into functions like
1941 * ocfs2_orphan_add
1942 *
1943 * Returns zero on success and the ret_orphan_dir, name and lookup
1944 * fields will be populated.
1945 *
1946 * Returns non-zero on failure.
1947 */
1948static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1949 struct inode **ret_orphan_dir,
1950 u64 blkno,
1951 char *name,
1952 struct ocfs2_dir_lookup_result *lookup)
1953{
1954 struct inode *orphan_dir_inode = NULL;
1955 struct buffer_head *orphan_dir_bh = NULL;
1956 int ret = 0;
1957
1958 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
1959 &orphan_dir_bh);
1960 if (ret < 0) {
1961 mlog_errno(ret);
1962 return ret;
1963 }
1964
1965 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
1966 blkno, name, lookup);
1967 if (ret < 0) {
1968 mlog_errno(ret);
1969 goto out;
1912 } 1970 }
1913 1971
1914 *ret_orphan_dir = orphan_dir_inode; 1972 *ret_orphan_dir = orphan_dir_inode;
1915 1973
1916leave: 1974out:
1917 if (status) { 1975 brelse(orphan_dir_bh);
1976
1977 if (ret) {
1978 ocfs2_inode_unlock(orphan_dir_inode, 1);
1918 mutex_unlock(&orphan_dir_inode->i_mutex); 1979 mutex_unlock(&orphan_dir_inode->i_mutex);
1919 iput(orphan_dir_inode); 1980 iput(orphan_dir_inode);
1920 } 1981 }
1921 1982
1922 brelse(orphan_dir_bh); 1983 mlog_exit(ret);
1923 1984 return ret;
1924 mlog_exit(status);
1925 return status;
1926} 1985}
1927 1986
1928static int ocfs2_orphan_add(struct ocfs2_super *osb, 1987static int ocfs2_orphan_add(struct ocfs2_super *osb,
@@ -1961,12 +2020,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1961 if (S_ISDIR(inode->i_mode)) 2020 if (S_ISDIR(inode->i_mode))
1962 ocfs2_add_links_count(orphan_fe, 1); 2021 ocfs2_add_links_count(orphan_fe, 1);
1963 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2022 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1964 2023 ocfs2_journal_dirty(handle, orphan_dir_bh);
1965 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1966 if (status < 0) {
1967 mlog_errno(status);
1968 goto leave;
1969 }
1970 2024
1971 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2025 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1972 OCFS2_ORPHAN_NAMELEN, inode, 2026 OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2119,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2065 if (S_ISDIR(inode->i_mode)) 2119 if (S_ISDIR(inode->i_mode))
2066 ocfs2_add_links_count(orphan_fe, -1); 2120 ocfs2_add_links_count(orphan_fe, -1);
2067 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2121 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2068 2122 ocfs2_journal_dirty(handle, orphan_dir_bh);
2069 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2070 if (status < 0) {
2071 mlog_errno(status);
2072 goto leave;
2073 }
2074 2123
2075leave: 2124leave:
2076 ocfs2_free_dir_lookup_result(&lookup); 2125 ocfs2_free_dir_lookup_result(&lookup);
@@ -2079,6 +2128,99 @@ leave:
2079 return status; 2128 return status;
2080} 2129}
2081 2130
2131/**
 2132 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
 2133 * allocated file. This is different from the typical 'add to orphan dir'
 2134 * operation in that the inode does not yet exist. This is a problem because
 2135 * the orphan dir stringifies the inode block number to come up with its
 2136 * dirent. Obviously if the inode does not yet exist we have a chicken and egg
 2137 * problem. This function works around it by calling deeper into the orphan
 2138 * and suballoc code than other callers. Use this only when necessary.
 2139 * @dir: The directory which this inode will ultimately wind up under - not the
 2140 * orphan dir!
 2141 * @dir_bh: buffer_head holding the @dir inode block
 2142 * @orphan_name: string of length (OCFS2_ORPHAN_NAMELEN + 1). Will be filled
 2143 * with the string to be used for the orphan dirent. Pass back to the orphan dir
2144 * code.
2145 * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
2146 * dir code.
2147 * @ret_di_blkno: block number where the new inode will be allocated.
2148 * @orphan_insert: Dir insert context to be passed back into orphan dir code.
2149 * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
2150 *
 2151 * Returns zero on success, with ret_orphan_dir, ret_di_blkno,
 2152 * orphan_name, orphan_insert and ret_inode_ac populated.
2153 *
2154 * Returns non-zero on failure.
2155 */
2156static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2157 struct buffer_head *dir_bh,
2158 char *orphan_name,
2159 struct inode **ret_orphan_dir,
2160 u64 *ret_di_blkno,
2161 struct ocfs2_dir_lookup_result *orphan_insert,
2162 struct ocfs2_alloc_context **ret_inode_ac)
2163{
2164 int ret;
2165 u64 di_blkno;
2166 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2167 struct inode *orphan_dir = NULL;
2168 struct buffer_head *orphan_dir_bh = NULL;
2169 struct ocfs2_alloc_context *inode_ac = NULL;
2170
2171 ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
2172 if (ret < 0) {
2173 mlog_errno(ret);
2174 return ret;
2175 }
2176
2177 /* reserve an inode spot */
2178 ret = ocfs2_reserve_new_inode(osb, &inode_ac);
2179 if (ret < 0) {
2180 if (ret != -ENOSPC)
2181 mlog_errno(ret);
2182 goto out;
2183 }
2184
2185 ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
2186 &di_blkno);
2187 if (ret) {
2188 mlog_errno(ret);
2189 goto out;
2190 }
2191
2192 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2193 di_blkno, orphan_name, orphan_insert);
2194 if (ret < 0) {
2195 mlog_errno(ret);
2196 goto out;
2197 }
2198
2199out:
2200 if (ret == 0) {
2201 *ret_orphan_dir = orphan_dir;
2202 *ret_di_blkno = di_blkno;
2203 *ret_inode_ac = inode_ac;
2204 /*
2205 * orphan_name and orphan_insert are already up to
2206 * date via prepare_orphan_dir
2207 */
2208 } else {
2209 /* Unroll reserve_new_inode* */
2210 if (inode_ac)
2211 ocfs2_free_alloc_context(inode_ac);
2212
2213 /* Unroll orphan dir locking */
2214 mutex_unlock(&orphan_dir->i_mutex);
2215 ocfs2_inode_unlock(orphan_dir, 1);
2216 iput(orphan_dir);
2217 }
2218
2219 brelse(orphan_dir_bh);
2220
 2221 return ret;
2222}
2223
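Note the error contract here (with the return fixed to propagate ret): on success the caller inherits a locked, referenced orphan dir plus the inode allocation context; on failure everything is unrolled before returning, and the error must reach the caller so it never touches the uninitialized outputs. A generic sketch of that acquire/unroll shape, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

struct thing { int locked; };

static int acquire(struct thing **out, int fail)
{
	struct thing *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->locked = 1;			/* lock + take a reference */
	if (fail) {
		free(t);
		return -1;
	}
	*out = t;
	return 0;
}

static void release(struct thing *t)
{
	t->locked = 0;			/* unlock + drop the reference */
	free(t);
}

static int prep(struct thing **out, int fail_step)
{
	struct thing *t = NULL;
	int ret = acquire(&t, fail_step == 1);

	if (ret < 0)
		return ret;

	ret = (fail_step == 2) ? -1 : 0;	/* stand-in for further setup */
	if (ret == 0)
		*out = t;		/* caller now owns the locked object */
	else
		release(t);		/* unroll before reporting failure */
	return ret;			/* never return 0 unconditionally */
}

int main(void)
{
	struct thing *t = NULL;

	printf("ok path: %d\n", prep(&t, 0));
	if (t)
		release(t);
	printf("fail path: %d\n", prep(&t, 2));
	return 0;
}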
2082int ocfs2_create_inode_in_orphan(struct inode *dir, 2224int ocfs2_create_inode_in_orphan(struct inode *dir,
2083 int mode, 2225 int mode,
2084 struct inode **new_inode) 2226 struct inode **new_inode)
@@ -2094,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2094 struct buffer_head *new_di_bh = NULL; 2236 struct buffer_head *new_di_bh = NULL;
2095 struct ocfs2_alloc_context *inode_ac = NULL; 2237 struct ocfs2_alloc_context *inode_ac = NULL;
2096 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 2238 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2239 u64 uninitialized_var(di_blkno), suballoc_loc;
2240 u16 suballoc_bit;
2097 2241
2098 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2242 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2099 if (status < 0) { 2243 if (status < 0) {
@@ -2102,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2102 return status; 2246 return status;
2103 } 2247 }
2104 2248
2105 /* 2249 status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
2106 * We give the orphan dir the root blkno to fake an orphan name, 2250 orphan_name, &orphan_dir,
2107 * and allocate enough space for our insertion. 2251 &di_blkno, &orphan_insert, &inode_ac);
2108 */
2109 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2110 osb->root_blkno,
2111 orphan_name, &orphan_insert);
2112 if (status < 0) {
2113 mlog_errno(status);
2114 goto leave;
2115 }
2116
2117 /* reserve an inode spot */
2118 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2119 if (status < 0) { 2252 if (status < 0) {
2120 if (status != -ENOSPC) 2253 if (status != -ENOSPC)
2121 mlog_errno(status); 2254 mlog_errno(status);
@@ -2142,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2142 goto leave; 2275 goto leave;
2143 did_quota_inode = 1; 2276 did_quota_inode = 1;
2144 2277
2145 inode->i_nlink = 0; 2278 status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
2146 /* do the real work now. */ 2279 &suballoc_loc,
2147 status = ocfs2_mknod_locked(osb, dir, inode, 2280 &suballoc_bit, di_blkno);
2148 0, &new_di_bh, parent_di_bh, handle,
2149 inode_ac);
2150 if (status < 0) { 2281 if (status < 0) {
2151 mlog_errno(status); 2282 mlog_errno(status);
2152 goto leave; 2283 goto leave;
2153 } 2284 }
2154 2285
2155 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); 2286 inode->i_nlink = 0;
2287 /* do the real work now. */
2288 status = __ocfs2_mknod_locked(dir, inode,
2289 0, &new_di_bh, parent_di_bh, handle,
2290 inode_ac, di_blkno, suballoc_loc,
2291 suballoc_bit);
2156 if (status < 0) { 2292 if (status < 0) {
2157 mlog_errno(status); 2293 mlog_errno(status);
2158 goto leave; 2294 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index adf5e2ebc2c4..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
767{ 789{
768 ext2_set_bit(bit, bitmap); 790 ext2_set_bit(bit, bitmap);
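The new ocfs2_clusters_to_megabytes() is the inverse of ocfs2_megabytes_to_clusters(), modulo truncation in the cluster-to-megabyte direction. A quick demonstration of the shift arithmetic (bits below stands in for s_clustersize_bits):

#include <stdio.h>

static unsigned int mb_to_clusters(unsigned int megs, unsigned int bits)
{
	return megs << (20 - bits);
}

static unsigned int clusters_to_mb(unsigned int clusters, unsigned int bits)
{
	return clusters >> (20 - bits);
}

int main(void)
{
	unsigned int bits = 12;		/* 4K clusters */
	unsigned int c = mb_to_clusters(8, bits);

	printf("8MB = %u clusters, back to %uMB\n", c, clusters_to_mb(c, bits));
	/* the reverse direction truncates: a partial megabyte is dropped */
	printf("255 clusters -> %uMB\n", clusters_to_mb(255, bits));
	return 0;
}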
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..fa31d05e41b7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
 169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -231,18 +235,31 @@
231#define OCFS2_HAS_REFCOUNT_FL (0x0010) 235#define OCFS2_HAS_REFCOUNT_FL (0x0010)
232 236
233/* Inode attributes, keep in sync with EXT2 */ 237/* Inode attributes, keep in sync with EXT2 */
234#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 238#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
235#define OCFS2_UNRM_FL (0x00000002) /* Undelete */ 239#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
236#define OCFS2_COMPR_FL (0x00000004) /* Compress file */ 240#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
237#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */ 241#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
238#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */ 242#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
239#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */ 243#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
240#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ 244#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
241#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ 245#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
242#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ 246/* Reserved for compression usage... */
243 247#define OCFS2_DIRTY_FL FS_DIRTY_FL
244#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ 248#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
245#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ 249#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
250#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
251/* End compression flags --- maybe not all used */
252#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
253#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
254#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
255#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
256#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
257#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
258#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
259#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
260
261#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
262#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
246 263
247/* 264/*
248 * Extent record flags (e_node.leaf.flags) 265 * Extent record flags (e_node.leaf.flags)
@@ -283,14 +300,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 300#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 301
285/* 302/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 303 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 304 * The value chosen should be aligned to 16 byte boundaries.
296 */ 305 */
@@ -512,7 +521,10 @@ struct ocfs2_extent_block
512 block group */ 521 block group */
513 __le32 h_fs_generation; /* Must match super block */ 522 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 523 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 524/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
525 eb belongs to. Only valid
526 if allocated from a
527 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 528 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 529 of next leaf header pointing
518 to data */ 530 to data */
@@ -679,7 +691,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 691/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 692/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 693/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 694 __le64 i_suballoc_loc; /* Suballocator block group this
695 inode belongs to. Only valid
696 if allocated from a
697 discontiguous block group */
698/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 699/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 700 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 701 64bit union */
@@ -814,7 +830,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 830 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 831 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 832 * unindexed block list. */
817 __le64 dr_reserved3[15]; 833 __le64 dr_suballoc_loc; /* Suballocator block group
834 this root belongs to.
835 Only valid if allocated
836 from a discontiguous
837 block group */
838 __le64 dr_reserved3[14];
818 union { 839 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 840 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 841 * bits for maximum space
@@ -840,6 +861,13 @@ struct ocfs2_dx_leaf {
840}; 861};
841 862
842/* 863/*
864 * Largest bitmap for a block (suballocator) group in bytes. This limit
865 * does not affect cluster groups (global allocator). Cluster group
866 * bitmaps run to the end of the block.
867 */
868#define OCFS2_MAX_BG_BITMAP_SIZE 256
869
870/*
843 * On disk allocator group structure for OCFS2 871 * On disk allocator group structure for OCFS2
844 */ 872 */
845struct ocfs2_group_desc 873struct ocfs2_group_desc
@@ -860,7 +888,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 888 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 889/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 890 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 891/*40*/ union {
892 __u8 bg_bitmap[0];
893 struct {
894 /*
895 * Block groups may be discontiguous when
896 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
 897 * The extents of a discontiguous block group are
 898 * stored in bg_list. It is a flat list.
 899 * l_tree_depth must always be zero. A
 900 * discontiguous group is signified by a non-zero
 901 * bg_list->l_next_free_rec. Only block groups
 902 * can be discontiguous; cluster groups cannot.
903 * We've never made a block group with more than
904 * 2048 blocks (256 bytes of bg_bitmap). This
905 * codifies that limit so that we can fit bg_list.
906 * bg_size of a discontiguous block group will
907 * be 256 to match bg_bitmap_filler.
908 */
909 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
910/*140*/ struct ocfs2_extent_list bg_list;
911 };
912 };
913/* Actual on-disk size is one block */
864}; 914};
865 915
866struct ocfs2_refcount_rec { 916struct ocfs2_refcount_rec {
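The offset comments in the union above can be sanity-checked with a mock layout: bg_bitmap begins at 0x40, so a 256-byte filler places bg_list at 0x140. A sketch using stand-in field types (this is not the real struct, only the arithmetic):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct mock_group_desc {
	uint8_t header[0x40];			/* fields before the union */
	union {
		uint8_t bg_bitmap[1];
		struct {
			uint8_t bg_bitmap_filler[256];
			uint8_t bg_list[1];	/* stand-in for the extent list */
		} discontig;
	} u;
};

int main(void)
{
	printf("bg_bitmap at 0x%zx\n",
	       offsetof(struct mock_group_desc, u.bg_bitmap));		/* 0x40 */
	printf("bg_list   at 0x%zx\n",
	       offsetof(struct mock_group_desc, u.discontig.bg_list));	/* 0x140 */
	return 0;
}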
@@ -905,7 +955,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 955/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 956 * for the same refcount tree. */
907 __le32 rf_reserved0; 957 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 958 __le64 rf_suballoc_loc; /* Suballocator block group this
959 refcount block belongs to. Only
960 valid if allocated from a
961 discontiguous block group */
962/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 963/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 964 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 965 records */
@@ -1017,7 +1071,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1071 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1072 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1073 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1074 __le64 xb_suballoc_loc; /* Suballocator block group this
1075 xattr block belongs to. Only
1076 valid if allocated from a
1077 discontiguous block group */
1021/*30*/ union { 1078/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1079 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1080 block contains xattr */
@@ -1254,6 +1311,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1311 return size / sizeof(struct ocfs2_extent_rec);
1255} 1312}
1256 1313
1314static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1315{
1316 int size;
1317
1318 size = sb->s_blocksize -
1319 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1320
1321 return size / sizeof(struct ocfs2_extent_rec);
1322}
1323
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1324static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1325{
1259 int size; 1326 int size;
@@ -1284,13 +1351,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1351 return size;
1285} 1352}
1286 1353
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1354static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1355 int suballocator,
1356 u32 feature_incompat)
1288{ 1357{
1289 int size; 1358 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1359 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1360
1361 /*
1362 * The cluster allocator uses the entire block. Suballocators have
1363 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1364 * code expects bg_size set to the maximum. Thus we must keep
1365 * bg_size as-is unless discontig_bg is enabled.
1366 */
1367 if (suballocator &&
1368 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1369 size = OCFS2_MAX_BG_BITMAP_SIZE;
1370
1294 return size; 1371 return size;
1295} 1372}
1296 1373
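
A quick sanity check on the cap above: with a 4KB block, a cluster group keeps the whole remainder of the block for its bitmap, while a suballocator group under discontig_bg is limited to 256 bytes, which is exactly the 2048-block limit the group descriptor comment codifies. A standalone sketch; the 64-byte header is an assumed figure standing in for offsetof(struct ocfs2_group_desc, bg_bitmap):

#include <stdio.h>

int main(void)
{
	int blocksize = 4096;
	int header = 64;	/* assumed; really offsetof(struct ocfs2_group_desc, bg_bitmap) */
	int cluster_bits = (blocksize - header) * 8;	/* cluster allocator: whole block */
	int suballoc_bits = 256 * 8;			/* OCFS2_MAX_BG_BITMAP_SIZE cap */

	printf("cluster group bitmap: %d bits\n", cluster_bits);	/* 32256 */
	printf("capped suballocator bitmap: %d bits\n", suballoc_bits);	/* 2048 */
	return 0;
}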
@@ -1402,23 +1479,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1479 return size / sizeof(struct ocfs2_extent_rec);
1403} 1480}
1404 1481
1405static inline int ocfs2_local_alloc_size(int blocksize) 1482static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1483{
1407 int size; 1484 int size;
1408 1485
1409 size = blocksize - 1486 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1487 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1488
1412 return size; 1489 return size / sizeof(struct ocfs2_extent_rec);
1413} 1490}
1414 1491
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1492static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1493{
1417 int size; 1494 int size;
1418 1495
1419 size = blocksize - 1496 size = blocksize -
1497 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1498
1499 return size;
1500}
1501
1502static inline int ocfs2_group_bitmap_size(int blocksize,
1503 int suballocator,
1504 uint32_t feature_incompat)
1505{
1506 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1507 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1508
1509 /*
1510 * The cluster allocator uses the entire block. Suballocators have
1511 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1512 * code expects bg_size set to the maximum. Thus we must keep
1513 * bg_size as-is unless discontig_bg is enabled.
1514 */
1515 if (suballocator &&
1516 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1517 size = OCFS2_MAX_BG_BITMAP_SIZE;
1518
1422 return size; 1519 return size;
1423} 1520}
1424 1521
@@ -1491,5 +1588,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1588 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1589}
1493 1590
1591static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1592{
1593 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1594 le16_to_cpu(gd->bg_size)) !=
1595 offsetof(struct ocfs2_group_desc, bg_list))
1596 return 0;
1597 /*
1598 * Only valid to check l_next_free_rec if
1599 * bg_bitmap + bg_size == bg_list.
1600 */
1601 if (!gd->bg_list.l_next_free_rec)
1602 return 0;
1603 return 1;
1604}
1494#endif /* _OCFS2_FS_H */ 1605#endif /* _OCFS2_FS_H */
1495 1606
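
The test in ocfs2_gd_is_discontig() above encodes two on-disk rules: bg_list is only valid when the bitmap is capped so that bg_bitmap + bg_size lands exactly on bg_list, and a discontiguous group is then marked by a non-zero l_next_free_rec. A minimal userspace model of the same check; the struct below is a simplified stand-in, not the real ocfs2_group_desc layout:

#include <stddef.h>
#include <stdint.h>

#define MAX_BG_BITMAP_SIZE 256	/* stand-in for the real constant */

struct gd_sketch {
	uint16_t bg_size;			/* bytes of bitmap in use */
	uint8_t  bg_bitmap[MAX_BG_BITMAP_SIZE];
	uint16_t l_next_free_rec;		/* models bg_list.l_next_free_rec */
};

static int gd_is_discontig(const struct gd_sketch *gd)
{
	/* bg_list is only meaningful if the bitmap ends exactly where
	 * the embedded extent list begins */
	if (offsetof(struct gd_sketch, bg_bitmap) + gd->bg_size !=
	    offsetof(struct gd_sketch, l_next_free_rec))
		return 0;
	/* a non-zero record count then marks the group discontiguous */
	return gd->l_next_free_rec != 0;
}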
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a83..5d241505690b 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
23/* 23/*
24 * ioctl commands 24 * ioctl commands
25 */ 25 */
26#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) 26#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
27#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long) 27#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
28#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int) 28#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
29#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 29#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
30 30
31/* 31/*
32 * Space reservation / allocation / free ioctls and argument structure 32 * Space reservation / allocation / free ioctls and argument structure
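
With the ocfs2 flag ioctls now aliased to the generic FS_IOC_* codes, a generic caller works unchanged on ocfs2 files. A hypothetical userspace usage sketch; note the kernel transfers an int here even though the macro is declared with long:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int flags = 0;
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0)
		return 1;
	/* read the generic inode flags word (FS_IMMUTABLE_FL etc.) */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("inode flags: %#x\n", flags);
	close(fd);
	return 0;
}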
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Number of block with global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab42a74c7539..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -25,8 +25,44 @@
25#include "dlmglue.h" 25#include "dlmglue.h"
26#include "uptodate.h" 26#include "uptodate.h"
27#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
28#include "quota.h" 29#include "quota.h"
29 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of quota structure (either to local or global file) is protected
35 * by dqio_mutex or dquot->dq_lock.
36 * - any modification of global quota file holds inode cluster lock, i_mutex,
37 * and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for local quota file is protected by
40 * its ip_alloc_sem
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
30static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
31 67
32static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
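
What makes the rules above deadlock-free is that every path acquires its locks in one global order: cluster lock on the global quota file, then i_mutex/ip_alloc_sem, then the transaction, then dqio_mutex, then qinfo_lock. A toy pthread model of the sync path (all names are stand-ins; only the ordering is the point):

#include <pthread.h>

static pthread_mutex_t gf_cluster_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dqio_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t qinfo_lock = PTHREAD_MUTEX_INITIALIZER;

static void sync_local_to_global(void)
{
	pthread_mutex_lock(&gf_cluster_lock);	/* ocfs2_lock_global_qf */
	/* a transaction would be started here */
	pthread_mutex_lock(&dqio_mutex);
	pthread_mutex_lock(&qinfo_lock);
	/* write to the global file, then to the local file */
	pthread_mutex_unlock(&qinfo_lock);
	pthread_mutex_unlock(&dqio_mutex);
	/* commit the transaction */
	pthread_mutex_unlock(&gf_cluster_lock);
}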
@@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
91 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
92}; 128};
93 129
94static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
95 struct buffer_head *bh)
96{ 131{
97 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
98 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
110 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
111} 146}
112 147
113int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
114 struct buffer_head **bh) 149 struct buffer_head **bhp)
115{ 150{
116 int rc = 0; 151 int rc;
117 struct buffer_head *tmp = *bh; 152
118 153 *bhp = NULL;
119 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
120 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
121 "Quota file %llu is probably corrupted! Requested "
122 "to read block %Lu but file has size only %Lu\n",
123 (unsigned long long)OCFS2_I(inode)->ip_blkno,
124 (unsigned long long)v_block,
125 (unsigned long long)i_size_read(inode));
126 return -EIO;
127 }
128 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
129 ocfs2_validate_quota_block);
130 if (rc) 156 if (rc)
131 mlog_errno(rc); 157 mlog_errno(rc);
132
133 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
134 if (!rc && !*bh)
135 *bh = tmp;
136
137 return rc; 158 return rc;
138} 159}
139 160
140static int ocfs2_get_quota_block(struct inode *inode, int block,
141 struct buffer_head **bh)
142{
143 u64 pblock, pcount;
144 int err;
145
146 down_read(&OCFS2_I(inode)->ip_alloc_sem);
147 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
148 up_read(&OCFS2_I(inode)->ip_alloc_sem);
149 if (err) {
150 mlog_errno(err);
151 return err;
152 }
153 *bh = sb_getblk(inode->i_sb, pblock);
154 if (!*bh) {
155 err = -EIO;
156 mlog_errno(err);
157 }
158 return err;
159}
160
161/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
162 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
163 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
172 int err = 0; 172 int err = 0;
173 struct buffer_head *bh; 173 struct buffer_head *bh;
174 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
175 176
176 if (off > i_size) 177 if (off > i_size)
177 return 0; 178 return 0;
@@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
180 toread = len; 181 toread = len;
181 while (toread > 0) { 182 while (toread > 0) {
182 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
183 bh = NULL; 195 bh = NULL;
184 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
185 if (err) { 197 if (err) {
186 mlog_errno(err); 198 mlog_errno(err);
187 return err; 199 return err;
@@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
209 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
210 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
211 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
212 225
213 if (!handle) { 226 if (!handle) {
214 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
221 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
222 } 235 }
223 236
224 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
225 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
226 loff_t rounded_end = 238 loff_t rounded_end =
227 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
228 240
229 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
230 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
231 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
232 rounded_end); 244 rounded_end);
@@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 goto out; 246 goto out;
235 new = 1; 247 new = 1;
236 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
237 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
238 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
239 !new) { 256 !new) {
240 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
241 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
242 } else { 259 } else {
243 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
244 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
245 } 264 }
246 if (err) { 265 if (err) {
@@ -261,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
261 brelse(bh); 280 brelse(bh);
262 goto out; 281 goto out;
263 } 282 }
264 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
265 brelse(bh); 284 brelse(bh);
266 if (err < 0)
267 goto out;
268out: 285out:
269 if (err) { 286 if (err) {
270 mutex_unlock(&gqinode->i_mutex);
271 mlog_errno(err); 287 mlog_errno(err);
272 return err; 288 return err;
273 } 289 }
274 gqinode->i_version++; 290 gqinode->i_version++;
275 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
276 mutex_unlock(&gqinode->i_mutex);
277 return len; 292 return len;
278} 293}
279 294
@@ -291,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
291 else 306 else
292 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
293 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
294 return 0; 315 return 0;
295} 316}
296 317
297void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
298{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
299 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
300 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
301 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
@@ -313,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
313 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
314 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
315 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
316 int status; 344 int status;
317 345
318 mlog_entry_void(); 346 mlog_entry_void();
@@ -339,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
339 mlog_errno(status); 367 mlog_errno(status);
340 goto out_err; 368 goto out_err;
341 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
342 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
343 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
344 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
345 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
346 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
347 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -368,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
368out_err: 406out_err:
369 mlog_exit(status); 407 mlog_exit(status);
370 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
371} 413}
372 414
373/* Write information to global quota file. Expects exclusive lock on quota 415
@@ -426,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
426 468
427static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
428{ 470{
429 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
430 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
431 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
432}
433
434/* Read in information from global quota file and acquire a reference to it.
435 * dquot_acquire() has already started the transaction and locked quota file */
436int ocfs2_global_read_dquot(struct dquot *dquot)
437{
438 int err, err2, ex = 0;
439 struct super_block *sb = dquot->dq_sb;
440 int type = dquot->dq_type;
441 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
442 struct ocfs2_super *osb = OCFS2_SB(sb);
443 struct inode *gqinode = info->dqi_gqinode;
444 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
445 handle_t *handle = NULL;
446
447 err = ocfs2_qinfo_lock(info, 0);
448 if (err < 0)
449 goto out;
450 err = qtree_read_dquot(&info->dqi_gi, dquot);
451 if (err < 0)
452 goto out_qlock;
453 OCFS2_DQUOT(dquot)->dq_use_count++;
454 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
455 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
456 ocfs2_qinfo_unlock(info, 0);
457
458 if (!dquot->dq_off) { /* No real quota entry? */
459 ex = 1;
460 /*
461 * Add blocks to quota file before we start a transaction since
462 * locking allocators ranks above a transaction start
463 */
464 WARN_ON(journal_current_handle());
465 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
466 err = ocfs2_extend_no_holes(gqinode,
467 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
468 gqinode->i_size);
469 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
470 if (err < 0)
471 goto out;
472 }
473
474 handle = ocfs2_start_trans(osb,
475 ocfs2_calc_global_qinit_credits(sb, type));
476 if (IS_ERR(handle)) {
477 err = PTR_ERR(handle);
478 goto out;
479 }
480 err = ocfs2_qinfo_lock(info, ex);
481 if (err < 0)
482 goto out_trans;
483 err = qtree_write_dquot(&info->dqi_gi, dquot);
484 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
485 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
486 if (!err)
487 err = err2;
488 }
489out_qlock:
490 if (ex)
491 ocfs2_qinfo_unlock(info, 1);
492 else
493 ocfs2_qinfo_unlock(info, 0);
494out_trans:
495 if (handle)
496 ocfs2_commit_trans(osb, handle);
497out:
498 if (err < 0)
499 mlog_errno(err);
500 return err;
501} 475}
502 476
503/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
@@ -638,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
638 } 612 }
639 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
640 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
641 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
642 if (status < 0) 615 if (status < 0)
643 mlog_errno(status); 616 mlog_errno(status);
644 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
645 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
646 status = dquot_commit(dquot);
647 if (status < 0) 619 if (status < 0)
648 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
649 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
650out_ilock: 623out_ilock:
651 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -684,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
684 mlog_errno(status); 657 mlog_errno(status);
685 goto out; 658 goto out;
686 } 659 }
687 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
688 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
689out: 664out:
690 mlog_exit(status); 665 mlog_exit(status);
@@ -715,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
715 690
716 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
717 692
693 mutex_lock(&dquot->dq_lock);
694 /* Make sure we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
718 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
719 if (status < 0) 698 if (status < 0)
720 goto out; 699 goto out;
@@ -725,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
725 mlog_errno(status); 704 mlog_errno(status);
726 goto out_ilock; 705 goto out_ilock;
727 } 706 }
728 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
729 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
730out_ilock: 723out_ilock:
731 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
732out: 725out:
726 mutex_unlock(&dquot->dq_lock);
733 mlog_exit(status); 727 mlog_exit(status);
734 return status; 728 return status;
735} 729}
736 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
737static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
738{ 737{
739 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
740 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
741 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
742 747
743 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
744 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
745 * and instantiate possibly new dquot structure */ 750 /*
746 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
747 if (status < 0) 755 if (status < 0)
748 goto out; 756 goto out;
749 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
750 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode, NULL,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
751out: 812out:
813 mutex_unlock(&dquot->dq_lock);
752 mlog_exit(status); 814 mlog_exit(status);
753 return status; 815 return status;
754} 816}
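
One ordering rule survives the move from ocfs2_global_read_dquot() into ocfs2_acquire_dquot(): the quota file is extended before ocfs2_start_trans(), never under an open handle, because locking allocators ranks above a transaction start. Schematically, with stand-in prototypes rather than the real API:

struct handle;

int extend_file(void);			/* takes allocator locks internally */
struct handle *start_trans(int credits);
int write_under(struct handle *h);
void commit_trans(struct handle *h);

int acquire_sketch(int need_alloc)
{
	struct handle *h;
	int err = 0;

	if (need_alloc) {
		/* allocate first; doing this under an open handle
		 * would invert the allocator-vs-journal lock order */
		err = extend_file();
		if (err < 0)
			return err;
	}
	h = start_trans(4 /* assumed credit count */);
	if (!h)
		return -1;
	err = write_under(h);
	commit_trans(h);
	return err;
}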
@@ -770,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
770 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
771 833
772 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
773 dquot_mark_dquot_dirty(dquot);
774 835
775 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
776 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -793,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
793 mlog_errno(status); 854 mlog_errno(status);
794 goto out_ilock; 855 goto out_ilock;
795 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
796 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
797 if (status < 0) { 859 if (status < 0) {
798 mlog_errno(status); 860 mlog_errno(status);
799 goto out_trans; 861 goto out_dlock;
800 } 862 }
801 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
802 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
803out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
804 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
805out_ilock: 868out_ilock:
806 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -852,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
852} 915}
853 916
854const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
855 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
856 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
857 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
858 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9ad49305f450..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -22,6 +22,7 @@
22#include "dlmglue.h" 22#include "dlmglue.h"
23#include "quota.h" 23#include "quota.h"
24#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
25 26
26/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
27static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -119,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
119 lock_buffer(bh); 120 lock_buffer(bh);
120 modify(bh, private); 121 modify(bh, private);
121 unlock_buffer(bh); 122 unlock_buffer(bh);
122 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
123 if (status < 0) { 124
124 mlog_errno(status);
125 ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 return status;
127 }
128 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
129 if (status < 0) { 126 if (status < 0) {
130 mlog_errno(status); 127 mlog_errno(status);
@@ -133,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
133 return 0; 130 return 0;
134} 131}
135 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
165
136/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
137static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
138{ 168{
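
The constraint the new comment states, that ocfs2_read_quota_block() takes ip_alloc_sem and therefore must not run inside a transaction, can be made explicit with a guard. A sketch with stand-in helpers; journal_current_handle() is used only as an assumed probe for "is a handle open on this task":

#include <assert.h>

struct handle;
struct handle *journal_current_handle(void);	/* stand-in probe */
void down_read_alloc_sem(void);			/* stand-in lock ops */
void up_read_alloc_sem(void);

void read_quota_block_sketch(void)
{
	/* ip_alloc_sem ranks above a transaction start, so taking it
	 * while a handle is open would invert the lock order */
	assert(journal_current_handle() == NULL);
	down_read_alloc_sem();
	/* ... translate the virtual block and read it ... */
	up_read_alloc_sem();
}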
@@ -523,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
524 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
525 unlock_buffer(qbh); 555 unlock_buffer(qbh);
526 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
527 if (status < 0)
528 mlog_errno(status);
529out_commit: 557out_commit:
530 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
531 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
631 lock_buffer(bh); 659 lock_buffer(bh);
632 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
633 unlock_buffer(bh); 661 unlock_buffer(bh);
634 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
635 if (status < 0)
636 mlog_errno(status);
637out_trans: 663out_trans:
638 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
639out_bh: 665out_bh:
@@ -679,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
679 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
680 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
681 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
682 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
683 709
684 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
685 if (status < 0) 711 if (status < 0)
@@ -705,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
705 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
706 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
707 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
708 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
709 735
710 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
711 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -767,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
767{ 793{
768 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
769 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
770 ->dqi_ibh; 796 ->dqi_libh;
771 int status; 797 int status;
772 798
773 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -790,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
790 int mark_clean = 1, len; 816 int mark_clean = 1, len;
791 int status; 817 int status;
792 818
793 /* At this point we know there are no more dquots and thus
794 * even if there's some sync in the pdflush queue, it won't
795 * find any dquots and return without doing anything */
796 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
797 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
798 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
799 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -828,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
828 /* Mark local file as clean */ 850 /* Mark local file as clean */
829 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
830 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
831 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
832 olq_update_info, 854 olq_update_info,
833 info); 855 info);
834 if (status < 0) { 856 if (status < 0) {
@@ -838,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
838 860
839out: 861out:
840 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
841 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
842 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
843 kfree(oinfo); 865 kfree(oinfo);
844 return 0; 866 return 0;
@@ -866,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
866} 888}
867 889
868/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
869static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
870{ 892{
871 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
872 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
873 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
874 int status; 897 int status;
875 898
876 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
877 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
878 &bh);
879 if (status) { 901 if (status) {
880 mlog_errno(status); 902 mlog_errno(status);
881 goto out; 903 goto out;
882 } 904 }
883 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
884 olq_set_dquot, od);
885 if (status < 0) { 906 if (status < 0) {
886 mlog_errno(status); 907 mlog_errno(status);
887 goto out; 908 goto out;
@@ -950,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
950 u64 p_blkno; 971 u64 p_blkno;
951 972
952 /* We are protected by dqio_sem so no locking needed */ 973 /* We are protected by dqio_sem so no locking needed */
953 status = ocfs2_extend_no_holes(lqinode, 974 status = ocfs2_extend_no_holes(lqinode, NULL,
954 lqinode->i_size + 2 * sb->s_blocksize, 975 lqinode->i_size + 2 * sb->s_blocksize,
955 lqinode->i_size); 976 lqinode->i_size);
956 if (status < 0) { 977 if (status < 0) {
@@ -981,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
981 } 1002 }
982 1003
983 /* Initialize chunk header */ 1004 /* Initialize chunk header */
984 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
985 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
986 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
987 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
988 if (status < 0) { 1007 if (status < 0) {
989 mlog_errno(status); 1008 mlog_errno(status);
990 goto out_trans; 1009 goto out_trans;
@@ -1009,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1009 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1010 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1011 unlock_buffer(bh); 1030 unlock_buffer(bh);
1012 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto out_trans;
1016 }
1017 1032
1018 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1019 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1021 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1022 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1023 if (status < 0) { 1036 if (status < 0) {
1024 mlog_errno(status); 1037 mlog_errno(status);
1025 goto out_trans; 1038 goto out_trans;
@@ -1040,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1040 lock_buffer(dbh); 1053 lock_buffer(dbh);
1041 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1042 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1043 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto out_trans;
1047 }
1048 1057
1049 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1050 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1105,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1105 return ocfs2_local_quota_add_chunk(sb, type, offset); 1114 return ocfs2_local_quota_add_chunk(sb, type, offset);
1106 1115
1107 /* We are protected by dqio_sem so no locking needed */ 1116 /* We are protected by dqio_sem so no locking needed */
1108 status = ocfs2_extend_no_holes(lqinode, 1117 status = ocfs2_extend_no_holes(lqinode, NULL,
1109 lqinode->i_size + sb->s_blocksize, 1118 lqinode->i_size + sb->s_blocksize,
1110 lqinode->i_size); 1119 lqinode->i_size);
1111 if (status < 0) { 1120 if (status < 0) {
@@ -1120,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1120 } 1129 }
1121 1130
1122 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1123 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1125 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1126 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1127 if (status < 0) { 1134 if (status < 0) {
1128 mlog_errno(status); 1135 mlog_errno(status);
1129 goto out; 1136 goto out;
@@ -1155,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1155 lock_buffer(bh); 1162 lock_buffer(bh);
1156 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1157 unlock_buffer(bh); 1164 unlock_buffer(bh);
1158 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1159 if (status < 0) { 1166
1160 mlog_errno(status);
1161 goto out_trans;
1162 }
1163 /* Update chunk header */ 1167 /* Update chunk header */
1164 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1165 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1173,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1173 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1174 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1175 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1176 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1177 if (status < 0) { 1181
1178 mlog_errno(status);
1179 goto out_trans;
1180 }
1181 /* Update file header */ 1182 /* Update file header */
1182 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1183 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1210,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1210} 1211}
1211 1212
1212/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1213static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1214{ 1215{
1215 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1216 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1219,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1219 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1220 int offset; 1221 int offset;
1221 int status; 1222 int status;
1223 u64 pcount;
1222 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1223 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1224 if (!chunk) { 1227 if (!chunk) {
1225 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1226 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1227 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1228 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1229 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1230 } 1236 }
1231 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1232 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1233 1244
1234 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1235 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1246,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1246 goto out; 1257 goto out;
1247 } 1258 }
1248out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1249 return status; 1261 return status;
1250} 1262}
1251 1263
1252/* Create entry in local file for dquot, load data from the global file */ 1264/*
1253static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1254{ 1266 * already started a transaction and written all changes to global quota file
1255 int status; 1267 */
1256 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1257 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1258
1259 status = ocfs2_global_read_dquot(dquot);
1260 if (status < 0) {
1261 mlog_errno(status);
1262 goto out_err;
1263 }
1264
1265 /* Now create entry in the local quota file */
1266 status = ocfs2_create_local_dquot(dquot);
1267 if (status < 0) {
1268 mlog_errno(status);
1269 goto out_err;
1270 }
1271 mlog_exit(0);
1272 return 0;
1273out_err:
1274 mlog_exit(status);
1275 return status;
1276}
1277
1278/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1279 * already started a transaction and obtained exclusive lock for global
1280 * quota file. */
1281static int ocfs2_local_release_dquot(struct dquot *dquot)
1282{ 1269{
1283 int status; 1270 int status;
1284 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1286,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1286 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1287 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1288 int offset; 1275 int offset;
1289 handle_t *handle = journal_current_handle();
1290
1291 BUG_ON(!handle);
1292 /* First write all local changes to global file */
1293 status = ocfs2_global_release_dquot(dquot);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto out;
1297 }
1298 1276
1299 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1300 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1312 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1313 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1314 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1315 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1316 if (status < 0) { 1294
1317 mlog_errno(status);
1318 goto out;
1319 }
1320 status = 0;
1321out: 1295out:
1322 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1323 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1331,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1331 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1332 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1333 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1334 .read_dqblk = ocfs2_local_read_dquot,
1335 .commit_dqblk = ocfs2_local_write_dquot,
1336 .release_dqblk = ocfs2_local_release_dquot,
1337}; 1308};
1338 1309
1339struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5cbcd0f008fc..efdd75607406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
572 u32 num_got; 572 u32 num_got;
573 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
574 574
575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576 576
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
596 goto out_commit; 596 goto out_commit;
597 } 597 }
598 598
599 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
600 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
601 &first_blkno); 601 &first_blkno);
602 if (ret) { 602 if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
631 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
790 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
791 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
792 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
793 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
794 798
795 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
796 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1268 } else if (merge) 1272 } else if (merge)
1269 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1270 1274
1271 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1272 if (ret)
1273 mlog_errno(ret);
1274out: 1276out:
1275 return ret; 1277 return ret;
1276} 1278}
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1284 int ret; 1286 int ret;
1285 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1286 u32 num_got; 1288 u32 num_got;
1287 u64 blkno; 1289 u64 suballoc_loc, blkno;
1288 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1289 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1290 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1298 goto out; 1300 goto out;
1299 } 1301 }
1300 1302
1301 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1302 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1303 &blkno); 1305 &blkno);
1304 if (ret) { 1306 if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 1332
1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1334 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1335 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1524 int ret; 1527 int ret;
1525 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1526 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1527 u64 blkno; 1530 u64 suballoc_loc, blkno;
1528 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1529 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1530 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1548 goto out; 1551 goto out;
1549 } 1552 }
1550 1553
1551 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1552 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1553 &blkno); 1556 &blkno);
1554 if (ret) { 1557 if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1581 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1694 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1695 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1696 */ 1700 */
1697 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1698 if (ret < 0) { 1702 if (ret < 0) {
1699 mlog_errno(ret); 1703 mlog_errno(ret);
1700 goto out; 1704 goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1802 if (merge) 1806 if (merge)
1803 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1804 1808
1805 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810 1810
1811 if (index == 0) { 1811 if (index == 0) {
1812 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1977 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1978 } 1978 }
1979 1979
1980 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1981 if (ret)
1982 mlog_errno(ret);
1983 1981
1984out: 1982out:
1985 brelse(new_bh); 1983 brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2112 */ 2110 */
2113 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2114 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2115 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2116 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2117 if (ret) { 2116 if (ret) {
@@ -2437,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2436 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2438 le32_to_cpu(rec.r_clusters)) - cpos; 2437 le32_to_cpu(rec.r_clusters)) - cpos;
2439 /* 2438 /*
2440 * If the refcount rec already exist, cool. We just need
2441 * to check whether there is a split. Otherwise we just need
2442 * to increase the refcount.
2443 * If we will insert one, increases recs_add.
2444 *
2445 * We record all the records which will be inserted to the 2439 * We record all the records which will be inserted to the
2446 * same refcount block, so that we can tell exactly whether 2440 * same refcount block, so that we can tell exactly whether
2447 * we need a new refcount block or not. 2441 * we need a new refcount block or not.
2442 *
 2443 * If we will insert a new one, this is easy and only happens
 2444 * while adding the refcounted flag to the extent, so we don't
 2445 * have a chance of splitting. We just need one record.
 2446 *
 2447 * If the refcount rec already exists, things are a little more
 2448 * complicated. We may have to:
 2449 * 1) split at the beginning if the start pos isn't aligned.
 2450 * We need 1 more record in this case.
 2451 * 2) split at the end if the end pos isn't aligned.
 2452 * We need 1 more record in this case.
 2453 * 3) split in the middle because of file system fragmentation.
 2454 * We need 2 more records in this case (we can't detect this
 2455 * beforehand, so always assume the worst case).
2448 */ 2456 */
2449 if (rec.r_refcount) { 2457 if (rec.r_refcount) {
2458 recs_add += 2;
2450 /* Check whether we need a split at the beginning. */ 2459 /* Check whether we need a split at the beginning. */
2451 if (cpos == start_cpos && 2460 if (cpos == start_cpos &&
2452 cpos != le64_to_cpu(rec.r_cpos)) 2461 cpos != le64_to_cpu(rec.r_cpos))
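/*
 * A standalone sketch of the worst-case accounting described in the
 * comment above (illustrative names, not ocfs2 code): an existing
 * record always budgets two records for an undetectable middle split,
 * plus one each for an unaligned head or tail.
 */
#include <stdio.h>

static unsigned int recs_needed(int rec_exists, int head_aligned,
				int tail_aligned)
{
	unsigned int recs;

	if (!rec_exists)
		return 1;	/* fresh record, no split possible */

	recs = 2;		/* middle split, assumed pessimistically */
	if (!head_aligned)
		recs++;		/* split at the start of the range */
	if (!tail_aligned)
		recs++;		/* split at the end of the range */
	return recs;
}

int main(void)
{
	printf("worst case: %u records\n", recs_needed(1, 0, 0)); /* 4 */
	return 0;
}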
@@ -2516,20 +2525,19 @@ out:
2516 * 2525 *
 2517 * Normally the refcount blocks that store these refcounts should 2526 * Normally the refcount blocks that store these refcounts should
 2518 * also be contiguous, so that we can get the number easily. 2527 * also be contiguous, so that we can get the number easily.
 2519 * As for meta_ac, we will at most add split 2 refcount record and 2528 * We will add at most 2 split refcount records and 2 more
 2520 * 2 more refcount block, so just check it in a rough way. 2529 * refcount blocks, so just check it in a rough way.
2521 * 2530 *
2522 * Caller must hold refcount tree lock. 2531 * Caller must hold refcount tree lock.
2523 */ 2532 */
2524int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2533int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2525 struct buffer_head *di_bh, 2534 u64 refcount_loc,
2526 u64 phys_blkno, 2535 u64 phys_blkno,
2527 u32 clusters, 2536 u32 clusters,
2528 int *credits, 2537 int *credits,
2529 struct ocfs2_alloc_context **meta_ac) 2538 int *ref_blocks)
2530{ 2539{
2531 int ret, ref_blocks = 0; 2540 int ret;
2532 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2533 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2541 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2534 struct buffer_head *ref_root_bh = NULL; 2542 struct buffer_head *ref_root_bh = NULL;
2535 struct ocfs2_refcount_tree *tree; 2543 struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2554,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2546 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2554 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2547 2555
2548 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2556 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2549 le64_to_cpu(di->i_refcount_loc), &tree); 2557 refcount_loc, &tree);
2550 if (ret) { 2558 if (ret) {
2551 mlog_errno(ret); 2559 mlog_errno(ret);
2552 goto out; 2560 goto out;
2553 } 2561 }
2554 2562
2555 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2563 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2556 le64_to_cpu(di->i_refcount_loc),
2557 &ref_root_bh); 2564 &ref_root_bh);
2558 if (ret) { 2565 if (ret) {
2559 mlog_errno(ret); 2566 mlog_errno(ret);
@@ -2564,21 +2571,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2564 &tree->rf_ci, 2571 &tree->rf_ci,
2565 ref_root_bh, 2572 ref_root_bh,
2566 start_cpos, clusters, 2573 start_cpos, clusters,
2567 &ref_blocks, credits); 2574 ref_blocks, credits);
2568 if (ret) { 2575 if (ret) {
2569 mlog_errno(ret); 2576 mlog_errno(ret);
2570 goto out; 2577 goto out;
2571 } 2578 }
2572 2579
2573 mlog(0, "reserve new metadata %d, credits = %d\n", 2580 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2574 ref_blocks, *credits); 2581 *ref_blocks, *credits);
2575
2576 if (ref_blocks) {
2577 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2578 ref_blocks, meta_ac);
2579 if (ret)
2580 mlog_errno(ret);
2581 }
2582 2582
2583out: 2583out:
2584 brelse(ref_root_bh); 2584 brelse(ref_root_bh);
@@ -2941,6 +2941,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2941 2941
2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2942 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2943 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944 /*
 2945 * We only duplicate pages until we reach the page that contains i_size - 1.
2946 * So trim 'end' to i_size.
2947 */
2948 if (end > i_size_read(context->inode))
2949 end = i_size_read(context->inode);
2944 2950
2945 while (offset < end) { 2951 while (offset < end) {
2946 page_index = offset >> PAGE_CACHE_SHIFT; 2952 page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2954,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2954 if (map_end & (PAGE_CACHE_SIZE - 1)) 2960 if (map_end & (PAGE_CACHE_SIZE - 1))
2955 to = map_end & (PAGE_CACHE_SIZE - 1); 2961 to = map_end & (PAGE_CACHE_SIZE - 1);
2956 2962
2957 page = grab_cache_page(mapping, page_index); 2963 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2958 2964
2959 /* 2965 /*
 2960 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page 2966 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
@@ -3040,11 +3046,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3040 } 3046 }
3041 3047
3042 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3048 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3043 ret = ocfs2_journal_dirty(handle, new_bh); 3049 ocfs2_journal_dirty(handle, new_bh);
3044 if (ret) {
3045 mlog_errno(ret);
3046 break;
3047 }
3048 3050
3049 brelse(new_bh); 3051 brelse(new_bh);
3050 brelse(old_bh); 3052 brelse(old_bh);
@@ -3177,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
3177 if (map_end > end) 3179 if (map_end > end)
3178 map_end = end; 3180 map_end = end;
3179 3181
3180 page = grab_cache_page(context->inode->i_mapping, page_index); 3182 page = find_or_create_page(context->inode->i_mapping,
3183 page_index, GFP_NOFS);
3181 BUG_ON(!page); 3184 BUG_ON(!page);
3182 3185
3183 wait_on_page_writeback(page); 3186 wait_on_page_writeback(page);
@@ -3282,7 +3285,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3282 } else { 3285 } else {
3283 delete = 1; 3286 delete = 1;
3284 3287
3285 ret = __ocfs2_claim_clusters(osb, handle, 3288 ret = __ocfs2_claim_clusters(handle,
3286 context->data_ac, 3289 context->data_ac,
3287 1, set_len, 3290 1, set_len,
3288 &new_bit, &new_len); 3291 &new_bit, &new_len);
@@ -4180,6 +4183,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4180 struct inode *inode = old_dentry->d_inode; 4183 struct inode *inode = old_dentry->d_inode;
4181 struct buffer_head *new_bh = NULL; 4184 struct buffer_head *new_bh = NULL;
4182 4185
4186 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4187 ret = -EINVAL;
4188 mlog_errno(ret);
4189 goto out;
4190 }
4191
4183 ret = filemap_fdatawrite(inode->i_mapping); 4192 ret = filemap_fdatawrite(inode->i_mapping);
4184 if (ret) { 4193 if (ret) {
4185 mlog_errno(ret); 4194 mlog_errno(ret);
@@ -4192,8 +4201,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4192 goto out; 4201 goto out;
4193 } 4202 }
4194 4203
4195 mutex_lock(&new_inode->i_mutex); 4204 mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4196 ret = ocfs2_inode_lock(new_inode, &new_bh, 1); 4205 ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4206 OI_LS_REFLINK_TARGET);
4197 if (ret) { 4207 if (ret) {
4198 mlog_errno(ret); 4208 mlog_errno(ret);
4199 goto out_unlock; 4209 goto out_unlock;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..3e78db361bc7
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,844 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31#include <linux/list.h>
32
33#define MLOG_MASK_PREFIX ML_RESERVATIONS
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#ifdef CONFIG_OCFS2_DEBUG_FS
39#define OCFS2_CHECK_RESERVATIONS
40#endif
41
42DEFINE_SPINLOCK(resv_lock);
43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
46
47int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
48{
49 return (osb->osb_resv_level && osb->osb_dir_resv_level);
50}
51
52static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
53 struct ocfs2_alloc_reservation *resv)
54{
55 struct ocfs2_super *osb = resmap->m_osb;
56 unsigned int bits;
57
58 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
59 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
60 bits = 4 << osb->osb_resv_level;
61 } else {
62 bits = 4 << osb->osb_dir_resv_level;
63 }
64 return bits;
65}
66
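/*
 * So with the default osb_resv_level of 2 (OCFS2_DEFAULT_RESV_LEVEL),
 * a plain file gets a 4 << 2 = 16 bit window, and levels 1 through 8
 * map onto the 8..1024 sizes listed in the comment above. A one-line
 * check of that table (illustrative, standalone):
 */
#include <stdio.h>

int main(void)
{
	int level;

	for (level = 1; level <= 8; level++)
		printf("level %d -> %u bits\n", level, 4u << level);
	return 0;
}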
67static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
68{
69 if (resv->r_len)
70 return resv->r_start + resv->r_len - 1;
71 return resv->r_start;
72}
73
74static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
75{
76 return !!(resv->r_len == 0);
77}
78
79static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
80{
81 if (resmap->m_osb->osb_resv_level == 0)
82 return 1;
83 return 0;
84}
85
86static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
87{
88 struct ocfs2_super *osb = resmap->m_osb;
89 struct rb_node *node;
90 struct ocfs2_alloc_reservation *resv;
91 int i = 0;
92
93 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
94 osb->dev_str, resmap->m_bitmap_len);
95
96 node = rb_first(&resmap->m_reservations);
97 while (node) {
98 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
99
100 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
101 "\tlast_len: %u\n", resv->r_start,
102 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
103 resv->r_last_len);
104
105 node = rb_next(node);
106 i++;
107 }
108
109 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
110
111 i = 0;
112 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
113 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
114 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
115 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
116 resv->r_last_len);
117
118 i++;
119 }
120}
121
122#ifdef OCFS2_CHECK_RESERVATIONS
123static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
124 int i,
125 struct ocfs2_alloc_reservation *resv)
126{
127 char *disk_bitmap = resmap->m_disk_bitmap;
128 unsigned int start = resv->r_start;
129 unsigned int end = ocfs2_resv_end(resv);
130
131 while (start <= end) {
132 if (ocfs2_test_bit(start, disk_bitmap)) {
133 mlog(ML_ERROR,
134 "reservation %d covers an allocated area "
135 "starting at bit %u!\n", i, start);
136 return 1;
137 }
138
139 start++;
140 }
141 return 0;
142}
143
144static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
145{
146 unsigned int off = 0;
147 int i = 0;
148 struct rb_node *node;
149 struct ocfs2_alloc_reservation *resv;
150
151 node = rb_first(&resmap->m_reservations);
152 while (node) {
153 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
154
155 if (i > 0 && resv->r_start <= off) {
156 mlog(ML_ERROR, "reservation %d has bad start off!\n",
157 i);
158 goto bad;
159 }
160
161 if (resv->r_len == 0) {
162 mlog(ML_ERROR, "reservation %d has no length!\n",
163 i);
164 goto bad;
165 }
166
167 if (resv->r_start > ocfs2_resv_end(resv)) {
168 mlog(ML_ERROR, "reservation %d has invalid range!\n",
169 i);
170 goto bad;
171 }
172
173 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
174 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
175 i);
176 goto bad;
177 }
178
179 if (ocfs2_validate_resmap_bits(resmap, i, resv))
180 goto bad;
181
182 off = ocfs2_resv_end(resv);
183 node = rb_next(node);
184
185 i++;
186 }
187 return;
188
189bad:
190 ocfs2_dump_resv(resmap);
191 BUG();
192}
193#else
194static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
195{
196
197}
198#endif
199
200void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
201{
202 memset(resv, 0, sizeof(*resv));
203 INIT_LIST_HEAD(&resv->r_lru);
204}
205
206void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
207 unsigned int flags)
208{
209 BUG_ON(flags & ~OCFS2_RESV_TYPES);
210
211 resv->r_flags |= flags;
212}
213
214int ocfs2_resmap_init(struct ocfs2_super *osb,
215 struct ocfs2_reservation_map *resmap)
216{
217 memset(resmap, 0, sizeof(*resmap));
218
219 resmap->m_osb = osb;
220 resmap->m_reservations = RB_ROOT;
221 /* m_bitmap_len is initialized to zero by the above memset. */
222 INIT_LIST_HEAD(&resmap->m_lru);
223
224 return 0;
225}
226
227static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
228 struct ocfs2_alloc_reservation *resv)
229{
230 assert_spin_locked(&resv_lock);
231
232 if (!list_empty(&resv->r_lru))
233 list_del_init(&resv->r_lru);
234
235 list_add_tail(&resv->r_lru, &resmap->m_lru);
236}
237
238static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
239{
240 resv->r_len = 0;
241 resv->r_start = 0;
242}
243
244static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
245 struct ocfs2_alloc_reservation *resv)
246{
247 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
248 list_del_init(&resv->r_lru);
249 rb_erase(&resv->r_node, &resmap->m_reservations);
250 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
251 }
252}
253
254static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
255 struct ocfs2_alloc_reservation *resv)
256{
257 assert_spin_locked(&resv_lock);
258
259 __ocfs2_resv_trunc(resv);
260 /*
261 * last_len and last_start no longer make sense if
262 * we're changing the range of our allocations.
263 */
264 resv->r_last_len = resv->r_last_start = 0;
265
266 ocfs2_resv_remove(resmap, resv);
267}
268
 269/* Does nothing if 'resv' is NULL */
270void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
271 struct ocfs2_alloc_reservation *resv)
272{
273 if (resv) {
274 spin_lock(&resv_lock);
275 __ocfs2_resv_discard(resmap, resv);
276 spin_unlock(&resv_lock);
277 }
278}
279
280static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
281{
282 struct rb_node *node;
283 struct ocfs2_alloc_reservation *resv;
284
285 assert_spin_locked(&resv_lock);
286
287 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
288 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
289
290 __ocfs2_resv_discard(resmap, resv);
291 }
292}
293
294void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
295 unsigned int clen, char *disk_bitmap)
296{
297 if (ocfs2_resmap_disabled(resmap))
298 return;
299
300 spin_lock(&resv_lock);
301
302 ocfs2_resmap_clear_all_resv(resmap);
303 resmap->m_bitmap_len = clen;
304 resmap->m_disk_bitmap = disk_bitmap;
305
306 spin_unlock(&resv_lock);
307}
308
309void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
310{
311 /* Does nothing for now. Keep this around for API symmetry */
312}
313
314static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
315 struct ocfs2_alloc_reservation *new)
316{
317 struct rb_root *root = &resmap->m_reservations;
318 struct rb_node *parent = NULL;
319 struct rb_node **p = &root->rb_node;
320 struct ocfs2_alloc_reservation *tmp;
321
322 assert_spin_locked(&resv_lock);
323
324 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
325 new->r_len);
326
327 while (*p) {
328 parent = *p;
329
330 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
331
332 if (new->r_start < tmp->r_start) {
333 p = &(*p)->rb_left;
334
335 /*
336 * This is a good place to check for
337 * overlapping reservations.
338 */
339 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
340 } else if (new->r_start > ocfs2_resv_end(tmp)) {
341 p = &(*p)->rb_right;
342 } else {
343 /* This should never happen! */
344 mlog(ML_ERROR, "Duplicate reservation window!\n");
345 BUG();
346 }
347 }
348
349 rb_link_node(&new->r_node, parent, p);
350 rb_insert_color(&new->r_node, root);
351 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
352
353 ocfs2_resv_mark_lru(resmap, new);
354
355 ocfs2_check_resmap(resmap);
356}
357
358/**
359 * ocfs2_find_resv_lhs() - find the window which contains goal
360 * @resmap: reservation map to search
361 * @goal: which bit to search for
362 *
363 * If a window containing that goal is not found, we return the window
364 * which comes before goal. Returns NULL on empty rbtree or no window
365 * before goal.
366 */
367static struct ocfs2_alloc_reservation *
368ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
369{
370 struct ocfs2_alloc_reservation *resv = NULL;
371 struct ocfs2_alloc_reservation *prev_resv = NULL;
372 struct rb_node *node = resmap->m_reservations.rb_node;
373
374 assert_spin_locked(&resv_lock);
375
376 if (!node)
377 return NULL;
378
379 node = rb_first(&resmap->m_reservations);
380 while (node) {
381 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
382
383 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
384 break;
385
 386 /* Did we overshoot the reservation just before goal? */
387 if (resv->r_start > goal) {
388 resv = prev_resv;
389 break;
390 }
391
392 prev_resv = resv;
393 node = rb_next(node);
394 }
395
396 return resv;
397}
398
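/*
 * The LHS rule above is easier to see on a flat, sorted list of
 * [start, end] windows. A standalone sketch (illustrative, not the
 * rbtree code): return the window containing 'goal', else the closest
 * window before it, else -1.
 */
#include <stdio.h>

static int find_lhs(const unsigned int win[][2], int n, unsigned int goal)
{
	int i, prev = -1;

	for (i = 0; i < n; i++) {
		if (win[i][0] <= goal && goal <= win[i][1])
			return i;	/* goal inside window i */
		if (win[i][0] > goal)
			return prev;	/* overshot: window before goal */
		prev = i;
	}
	return prev;
}

int main(void)
{
	const unsigned int win[][2] = { {10, 19}, {40, 49}, {80, 99} };

	/* prints -1 (before all), 1 (inside), 1 (window before 60) */
	printf("%d %d %d\n", find_lhs(win, 3, 5), find_lhs(win, 3, 45),
	       find_lhs(win, 3, 60));
	return 0;
}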
399/*
400 * We are given a range within the bitmap, which corresponds to a gap
401 * inside the reservations tree (search_start, search_len). The range
402 * can be anything from the whole bitmap, to a gap between
403 * reservations.
404 *
405 * The start value of *rstart is insignificant.
406 *
407 * This function searches the bitmap range starting at search_start
408 * with length search_len for a set of contiguous free bits. We try
409 * to find up to 'wanted' bits, but can sometimes return less.
410 *
411 * Returns the length of allocation, 0 if no free bits are found.
412 *
413 * *cstart and *clen will also be populated with the result.
414 */
415static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
416 unsigned int wanted,
417 unsigned int search_start,
418 unsigned int search_len,
419 unsigned int *rstart,
420 unsigned int *rlen)
421{
422 void *bitmap = resmap->m_disk_bitmap;
423 unsigned int best_start, best_len = 0;
424 int offset, start, found;
425
426 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
427 wanted, search_start, search_len, resmap->m_bitmap_len);
428
429 found = best_start = best_len = 0;
430
431 start = search_start;
432 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
433 start)) != -1) {
434 /* Search reached end of the region */
435 if (offset >= (search_start + search_len))
436 break;
437
438 if (offset == start) {
439 /* we found a zero */
440 found++;
441 /* move start to the next bit to test */
442 start++;
443 } else {
444 /* got a zero after some ones */
445 found = 1;
446 start = offset + 1;
447 }
448 if (found > best_len) {
449 best_len = found;
450 best_start = start - found;
451 }
452
453 if (found >= wanted)
454 break;
455 }
456
457 if (best_len == 0)
458 return 0;
459
460 if (best_len >= wanted)
461 best_len = wanted;
462
463 *rlen = best_len;
464 *rstart = best_start;
465
466 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
467
468 return *rlen;
469}
470
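/*
 * The loop above is a longest-run search with an early exit once
 * 'wanted' bits are found. A compressed userspace sketch over a
 * byte-addressed bitmap (illustrative; assumes little-endian bit
 * order within each byte, as ocfs2's bitmap helpers use on x86):
 */
#include <stdio.h>

static unsigned int find_free_run(const unsigned char *bitmap,
				  unsigned int len, unsigned int wanted,
				  unsigned int *rstart)
{
	unsigned int i, run = 0, best_len = 0, best_start = 0;

	for (i = 0; i < len; i++) {
		if (bitmap[i / 8] & (1u << (i % 8))) {
			run = 0;	/* an allocated bit ends the run */
			continue;
		}
		if (++run > best_len) {
			best_len = run;
			best_start = i - run + 1;
		}
		if (best_len >= wanted)
			break;		/* early exit, as above */
	}
	*rstart = best_start;
	return best_len;
}

int main(void)
{
	unsigned char map[2] = { 0x0f, 0x00 }; /* bits 0-3 set, 4-15 free */
	unsigned int start, got = find_free_run(map, 16, 6, &start);

	printf("got %u bits at %u\n", got, start); /* got 6 bits at 4 */
	return 0;
}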
471static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
472 struct ocfs2_alloc_reservation *resv,
473 unsigned int goal, unsigned int wanted)
474{
475 struct rb_root *root = &resmap->m_reservations;
476 unsigned int gap_start, gap_end, gap_len;
477 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
478 struct rb_node *prev, *next;
479 unsigned int cstart, clen;
480 unsigned int best_start = 0, best_len = 0;
481
482 /*
483 * Nasty cases to consider:
484 *
485 * - rbtree is empty
486 * - our window should be first in all reservations
487 * - our window should be last in all reservations
488 * - need to make sure we don't go past end of bitmap
489 */
490
491 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
492 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
493
494 assert_spin_locked(&resv_lock);
495
496 if (RB_EMPTY_ROOT(root)) {
497 /*
498 * Easiest case - empty tree. We can just take
499 * whatever window of free bits we want.
500 */
501
502 mlog(0, "Empty root\n");
503
504 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
505 resmap->m_bitmap_len - goal,
506 &cstart, &clen);
507
508 /*
509 * This should never happen - the local alloc window
510 * will always have free bits when we're called.
511 */
512 BUG_ON(goal == 0 && clen == 0);
513
514 if (clen == 0)
515 return;
516
517 resv->r_start = cstart;
518 resv->r_len = clen;
519
520 ocfs2_resv_insert(resmap, resv);
521 return;
522 }
523
524 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
525
526 if (prev_resv == NULL) {
527 mlog(0, "Goal on LHS of leftmost window\n");
528
529 /*
530 * A NULL here means that the search code couldn't
531 * find a window that starts before goal.
532 *
533 * However, we can take the first window after goal,
534 * which is also by definition, the leftmost window in
535 * the entire tree. If we can find free bits in the
536 * gap between goal and the LHS window, then the
537 * reservation can safely be placed there.
538 *
539 * Otherwise we fall back to a linear search, checking
540 * the gaps in between windows for a place to
541 * allocate.
542 */
543
544 next = rb_first(root);
545 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
546 r_node);
547
548 /*
 549 * The search should never return such a window (see the
 550 * comment above).
 551 */
552 if (next_resv->r_start <= goal) {
553 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
554 goal, next_resv->r_start, next_resv->r_len);
555 ocfs2_dump_resv(resmap);
556 BUG();
557 }
558
559 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
560 next_resv->r_start - goal,
561 &cstart, &clen);
562 if (clen) {
563 best_len = clen;
564 best_start = cstart;
565 if (best_len == wanted)
566 goto out_insert;
567 }
568
569 prev_resv = next_resv;
570 next_resv = NULL;
571 }
572
573 prev = &prev_resv->r_node;
574
 575 /* Now we do a linear search for a window, starting at 'prev_resv' */
576 while (1) {
577 next = rb_next(prev);
578 if (next) {
579 mlog(0, "One more resv found in linear search\n");
580 next_resv = rb_entry(next,
581 struct ocfs2_alloc_reservation,
582 r_node);
583
584 gap_start = ocfs2_resv_end(prev_resv) + 1;
585 gap_end = next_resv->r_start - 1;
586 gap_len = gap_end - gap_start + 1;
587 } else {
588 mlog(0, "No next node\n");
589 /*
590 * We're at the rightmost edge of the
591 * tree. See if a reservation between this
592 * window and the end of the bitmap will work.
593 */
594 gap_start = ocfs2_resv_end(prev_resv) + 1;
595 gap_len = resmap->m_bitmap_len - gap_start;
596 gap_end = resmap->m_bitmap_len - 1;
597 }
598
599 /*
600 * No need to check this gap if we have already found
601 * a larger region of free bits.
602 */
603 if (gap_len <= best_len)
604 goto next_resv;
605
606 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
607 gap_len, &cstart, &clen);
608 if (clen == wanted) {
609 best_len = clen;
610 best_start = cstart;
611 goto out_insert;
612 } else if (clen > best_len) {
613 best_len = clen;
614 best_start = cstart;
615 }
616
617next_resv:
618 if (!next)
619 break;
620
621 prev = next;
622 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
623 r_node);
624 }
625
626out_insert:
627 if (best_len) {
628 resv->r_start = best_start;
629 resv->r_len = best_len;
630 ocfs2_resv_insert(resmap, resv);
631 }
632}
633
634static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
635 struct ocfs2_alloc_reservation *resv,
636 unsigned int wanted)
637{
638 struct ocfs2_alloc_reservation *lru_resv;
639 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
640 unsigned int min_bits;
641
642 if (!tmpwindow)
643 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
644 else
 645 min_bits = wanted; /* We know the temp window will use all
646 * of these bits */
647
648 /*
649 * Take the first reservation off the LRU as our 'target'. We
650 * don't try to be smart about it. There might be a case for
651 * searching based on size but I don't have enough data to be
652 * sure. --Mark (3/16/2010)
653 */
654 lru_resv = list_first_entry(&resmap->m_lru,
655 struct ocfs2_alloc_reservation, r_lru);
656
657 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
658 lru_resv->r_len, ocfs2_resv_end(lru_resv));
659
660 /*
661 * Cannibalize (some or all) of the target reservation and
662 * feed it to the current window.
663 */
664 if (lru_resv->r_len <= min_bits) {
665 /*
666 * Discard completely if size is less than or equal to a
 667 * reasonable threshold - 50% of window bits for non-temporary
668 * windows.
669 */
670 resv->r_start = lru_resv->r_start;
671 resv->r_len = lru_resv->r_len;
672
673 __ocfs2_resv_discard(resmap, lru_resv);
674 } else {
675 unsigned int shrink;
676 if (tmpwindow)
677 shrink = min_bits;
678 else
679 shrink = lru_resv->r_len / 2;
680
681 lru_resv->r_len -= shrink;
682
683 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
684 resv->r_len = shrink;
685 }
686
687 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
688 "r_len: %u r_last_start: %u r_last_len: %u\n",
689 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
690 resv->r_last_start, resv->r_last_len);
691
692 ocfs2_resv_insert(resmap, resv);
693}
694
695static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
696 struct ocfs2_alloc_reservation *resv,
697 unsigned int wanted)
698{
699 unsigned int goal = 0;
700
701 BUG_ON(!ocfs2_resv_empty(resv));
702
703 /*
704 * Begin by trying to get a window as close to the previous
705 * one as possible. Using the most recent allocation as a
706 * start goal makes sense.
707 */
708 if (resv->r_last_len) {
709 goal = resv->r_last_start + resv->r_last_len;
710 if (goal >= resmap->m_bitmap_len)
711 goal = 0;
712 }
713
714 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
715
716 /* Search from last alloc didn't work, try once more from beginning. */
717 if (ocfs2_resv_empty(resv) && goal != 0)
718 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
719
720 if (ocfs2_resv_empty(resv)) {
721 /*
722 * Still empty? Pull oldest one off the LRU, remove it from
 723 * the tree, and put this one in its place.
724 */
725 ocfs2_cannibalize_resv(resmap, resv, wanted);
726 }
727
728 BUG_ON(ocfs2_resv_empty(resv));
729}
730
731int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
732 struct ocfs2_alloc_reservation *resv,
733 int *cstart, int *clen)
734{
735 if (resv == NULL || ocfs2_resmap_disabled(resmap))
736 return -ENOSPC;
737
738 spin_lock(&resv_lock);
739
740 if (ocfs2_resv_empty(resv)) {
741 /*
742 * We don't want to over-allocate for temporary
743 * windows. Otherwise, we run the risk of fragmenting the
744 * allocation space.
745 */
746 unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
747
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen;
750
751 mlog(0, "empty reservation, find new window\n");
752 /*
753 * Try to get a window here. If it works, we must fall
 754 * through and test the bitmap. This avoids some
 755 * ping-ponging of windows due to non-reserved space
 756 * being allocated before we initialize a window for
757 * that inode.
758 */
759 ocfs2_resv_find_window(resmap, resv, wanted);
760 }
761
762 BUG_ON(ocfs2_resv_empty(resv));
763
764 *cstart = resv->r_start;
765 *clen = resv->r_len;
766
767 spin_unlock(&resv_lock);
768 return 0;
769}
770
771static void
772 ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
773 struct ocfs2_alloc_reservation *resv,
774 unsigned int start, unsigned int end)
775{
776 unsigned int rhs = 0;
777 unsigned int old_end = ocfs2_resv_end(resv);
778
779 BUG_ON(start != resv->r_start || old_end < end);
780
781 /*
782 * Completely used? We can remove it then.
783 */
784 if (old_end == end) {
785 __ocfs2_resv_discard(resmap, resv);
786 return;
787 }
788
789 rhs = old_end - end;
790
791 /*
792 * This should have been trapped above.
793 */
794 BUG_ON(rhs == 0);
795
796 resv->r_start = end + 1;
797 resv->r_len = old_end - resv->r_start + 1;
798}
799
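/*
 * Because ocfs2_resmap_resv_bits() always hands out the front of the
 * window (*cstart = resv->r_start), the adjustment above only ever
 * trims from the left. A standalone sketch of that rule, with an
 * illustrative struct in place of ocfs2_alloc_reservation:
 */
#include <assert.h>
#include <stdio.h>

struct window { unsigned int start, len; };

static void claim_front(struct window *w, unsigned int end)
{
	unsigned int old_end = w->start + w->len - 1;

	assert(end <= old_end);
	if (end == old_end) {
		w->start = w->len = 0;	/* fully consumed: discard */
		return;
	}
	w->start = end + 1;		/* shrink from the left */
	w->len = old_end - w->start + 1;
}

int main(void)
{
	struct window w = { .start = 100, .len = 16 };

	claim_front(&w, 103);		/* claim bits 100..103 */
	printf("window now [%u, len %u]\n", w.start, w.len); /* 104, 12 */
	return 0;
}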
800void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
801 struct ocfs2_alloc_reservation *resv,
802 u32 cstart, u32 clen)
803{
804 unsigned int cend = cstart + clen - 1;
805
806 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
807 return;
808
809 if (resv == NULL)
810 return;
811
812 BUG_ON(cstart != resv->r_start);
813
814 spin_lock(&resv_lock);
815
816 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
817 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
818 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
819 resv->r_len, resv->r_last_start, resv->r_last_len);
820
821 BUG_ON(cstart < resv->r_start);
822 BUG_ON(cstart > ocfs2_resv_end(resv));
823 BUG_ON(cend > ocfs2_resv_end(resv));
824
825 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
826 resv->r_last_start = cstart;
827 resv->r_last_len = clen;
828
829 /*
830 * May have been discarded above from
831 * ocfs2_adjust_resv_from_alloc().
832 */
833 if (!ocfs2_resv_empty(resv))
834 ocfs2_resv_mark_lru(resmap, resv);
835
836 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
837 "r_len: %u r_last_start: %u r_last_len: %u\n",
838 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
839 resv->r_last_start, resv->r_last_len);
840
841 ocfs2_check_resmap(resmap);
842
843 spin_unlock(&resv_lock);
844}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
 42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of an rbtree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservation map the window belongs to
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: struct ocfs2_super that owns this reservation map
 87 * @resmap: struct ocfs2_reservation_map to initialize
 88 *
 89 * Initializes the in-memory fields of @resmap. The disk bitmap
 90 * itself is attached later, via ocfs2_resmap_restart().
 91 * Only possible return value other than '0' is -ENOMEM for failure to
 92 * allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function will discard all existing reservations (via
 107 * ocfs2_resmap_clear_all_resv()). A future version will recalculate
 108 * existing reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on the new bitmap
 144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
 151 * reservations bitmap. If resv is passed, its next allocation window will be
152 * calculated. It also expects that 'cstart' is the same as we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 19ba00f28547..849c2f0e0a0e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,32 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_bg_stable_blkno; /*
61 * Doesn't change, always
62 * set to target block
63 * group descriptor
64 * block.
65 */
66 u64 sr_blkno; /* The first allocated block */
67 unsigned int sr_bit_offset; /* The bit in the bg */
68 unsigned int sr_bits; /* How many bits we claimed */
69};
70
71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72{
73 if (res->sr_blkno == 0)
74 return 0;
75
76 if (res->sr_bg_blkno)
77 return res->sr_bg_blkno;
78
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80}
81
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +86,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 86 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 87 struct buffer_head *bg_bh,
62 u64 group_blkno, 88 u64 group_blkno,
89 unsigned int group_clusters,
63 u16 my_chain, 90 u16 my_chain,
64 struct ocfs2_chain_list *cl); 91 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 92static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +100,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 100 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 101 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 102 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 103 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 104static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 105 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 106 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 107 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 108 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 109static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 110 handle_t *handle,
85 u32 bits_wanted, 111 u32 bits_wanted,
86 u32 min_bits, 112 u32 min_bits,
87 u16 *bit_off, 113 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 115 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 116static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +154,11 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
130 } 154 }
131 brelse(ac->ac_bh); 155 brelse(ac->ac_bh);
132 ac->ac_bh = NULL; 156 ac->ac_bh = NULL;
157 ac->ac_resv = NULL;
158 if (ac->ac_find_loc_priv) {
159 kfree(ac->ac_find_loc_priv);
160 ac->ac_find_loc_priv = NULL;
161 }
133} 162}
134 163
135void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 164void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +354,38 @@ out:
325 return rc; 354 return rc;
326} 355}
327 356
357static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
358 struct ocfs2_group_desc *bg,
359 struct ocfs2_chain_list *cl,
360 u64 p_blkno, unsigned int clusters)
361{
362 struct ocfs2_extent_list *el = &bg->bg_list;
363 struct ocfs2_extent_rec *rec;
364
365 BUG_ON(!ocfs2_supports_discontig_bg(osb));
366 if (!el->l_next_free_rec)
367 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
368 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
369 rec->e_blkno = cpu_to_le64(p_blkno);
370 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
371 le16_to_cpu(cl->cl_bpc));
372 rec->e_leaf_clusters = cpu_to_le16(clusters);
373 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
374 le16_add_cpu(&bg->bg_free_bits_count,
375 clusters * le16_to_cpu(cl->cl_bpc));
376 le16_add_cpu(&el->l_next_free_rec, 1);
377}
378
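/*
 * The bookkeeping above reduces to: record the region's physical
 * start, give it a logical offset equal to the bits already in the
 * group, then grow the bit totals. A standalone sketch with stand-in
 * types (not the on-disk structures):
 */
#include <stdio.h>

struct extent { unsigned long long blkno; unsigned int cpos, clusters; };

struct group {
	unsigned int bits, free_bits, bpc;	/* bpc: bits per cluster */
	unsigned int nr_recs;
	struct extent recs[4];
};

static void add_region(struct group *g, unsigned long long blkno,
		       unsigned int clusters)
{
	struct extent *rec = &g->recs[g->nr_recs++];

	rec->blkno = blkno;
	rec->cpos = g->bits / g->bpc;	/* logical offset inside group */
	rec->clusters = clusters;
	g->bits += clusters * g->bpc;
	g->free_bits += clusters * g->bpc;
}

int main(void)
{
	struct group g = { .bpc = 4 };

	add_region(&g, 2048, 8);
	add_region(&g, 8192, 4);
	printf("group now %u bits in %u regions\n", g.bits, g.nr_recs);
	return 0;
}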
328static int ocfs2_block_group_fill(handle_t *handle, 379static int ocfs2_block_group_fill(handle_t *handle,
329 struct inode *alloc_inode, 380 struct inode *alloc_inode,
330 struct buffer_head *bg_bh, 381 struct buffer_head *bg_bh,
331 u64 group_blkno, 382 u64 group_blkno,
383 unsigned int group_clusters,
332 u16 my_chain, 384 u16 my_chain,
333 struct ocfs2_chain_list *cl) 385 struct ocfs2_chain_list *cl)
334{ 386{
335 int status = 0; 387 int status = 0;
388 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
336 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 struct super_block * sb = alloc_inode->i_sb; 390 struct super_block * sb = alloc_inode->i_sb;
338 391
@@ -359,19 +412,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
359 memset(bg, 0, sb->s_blocksize); 412 memset(bg, 0, sb->s_blocksize);
360 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 413 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 414 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 415 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
363 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 416 osb->s_feature_incompat));
364 bg->bg_chain = cpu_to_le16(my_chain); 417 bg->bg_chain = cpu_to_le16(my_chain);
365 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 418 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 419 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 bg->bg_blkno = cpu_to_le64(group_blkno); 420 bg->bg_blkno = cpu_to_le64(group_blkno);
421 if (group_clusters == le16_to_cpu(cl->cl_cpg))
422 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
423 else
424 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
425 group_clusters);
426
368 /* set the 1st bit in the bitmap to account for the descriptor block */ 427 /* set the 1st bit in the bitmap to account for the descriptor block */
369 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 428 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 429 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 430
372 status = ocfs2_journal_dirty(handle, bg_bh); 431 ocfs2_journal_dirty(handle, bg_bh);
373 if (status < 0)
374 mlog_errno(status);
375 432
376 /* There is no need to zero out or otherwise initialize the 433 /* There is no need to zero out or otherwise initialize the
377 * other blocks in a group - All valid FS metadata in a block 434 * other blocks in a group - All valid FS metadata in a block
@@ -397,6 +454,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
397 return best; 454 return best;
398} 455}
399 456
457static struct buffer_head *
458ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
459 struct inode *alloc_inode,
460 struct ocfs2_alloc_context *ac,
461 struct ocfs2_chain_list *cl)
462{
463 int status;
464 u32 bit_off, num_bits;
465 u64 bg_blkno;
466 struct buffer_head *bg_bh;
467 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
468
469 status = ocfs2_claim_clusters(handle, ac,
470 le16_to_cpu(cl->cl_cpg), &bit_off,
471 &num_bits);
472 if (status < 0) {
473 if (status != -ENOSPC)
474 mlog_errno(status);
475 goto bail;
476 }
477
478 /* setup the group */
479 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480 mlog(0, "new descriptor, record %u, at block %llu\n",
481 alloc_rec, (unsigned long long)bg_blkno);
482
483 bg_bh = sb_getblk(osb->sb, bg_blkno);
484 if (!bg_bh) {
485 status = -EIO;
486 mlog_errno(status);
487 goto bail;
488 }
489 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
490
491 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
492 bg_blkno, num_bits, alloc_rec, cl);
493 if (status < 0) {
494 brelse(bg_bh);
495 mlog_errno(status);
496 }
497
498bail:
499 return status ? ERR_PTR(status) : bg_bh;
500}
501
502static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
503 handle_t *handle,
504 struct ocfs2_alloc_context *ac,
505 unsigned int min_bits,
506 u32 *bit_off, u32 *num_bits)
507{
508 int status = 0;
509
510 while (min_bits) {
511 status = ocfs2_claim_clusters(handle, ac, min_bits,
512 bit_off, num_bits);
513 if (status != -ENOSPC)
514 break;
515
516 min_bits >>= 1;
517 }
518
519 return status;
520}
521
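/*
 * The helper above is a simple back-off: on -ENOSPC, halve the
 * request and retry until even a single cluster is refused. The same
 * loop against a fake allocator (fake_claim() and the sizes are
 * invented for the example):
 */
#include <errno.h>
#include <stdio.h>

static unsigned int avail = 100;	/* pretend free space */

static int fake_claim(unsigned int want, unsigned int *got)
{
	if (want > avail)
		return -ENOSPC;
	*got = want;
	avail -= want;
	return 0;
}

static int claim_bits(unsigned int min_bits, unsigned int *got)
{
	int status = -ENOSPC;

	while (min_bits) {
		status = fake_claim(min_bits, got);
		if (status != -ENOSPC)
			break;
		min_bits >>= 1;		/* halve and retry */
	}
	return status;
}

int main(void)
{
	unsigned int got = 0;

	if (!claim_bits(512, &got))	/* 512 -> 256 -> 128 -> 64 fits */
		printf("claimed %u clusters\n", got);
	return 0;
}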
522static int ocfs2_block_group_grow_discontig(handle_t *handle,
523 struct inode *alloc_inode,
524 struct buffer_head *bg_bh,
525 struct ocfs2_alloc_context *ac,
526 struct ocfs2_chain_list *cl,
527 unsigned int min_bits)
528{
529 int status;
530 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
531 struct ocfs2_group_desc *bg =
532 (struct ocfs2_group_desc *)bg_bh->b_data;
533 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
534 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
535 u32 p_cpos, clusters;
536 u64 p_blkno;
537 struct ocfs2_extent_list *el = &bg->bg_list;
538
539 status = ocfs2_journal_access_gd(handle,
540 INODE_CACHE(alloc_inode),
541 bg_bh,
542 OCFS2_JOURNAL_ACCESS_CREATE);
543 if (status < 0) {
544 mlog_errno(status);
545 goto bail;
546 }
547
548 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
549 le16_to_cpu(el->l_count))) {
550 if (min_bits > needed)
551 min_bits = needed;
552 status = ocfs2_block_group_claim_bits(osb, handle, ac,
553 min_bits, &p_cpos,
554 &clusters);
555 if (status < 0) {
556 if (status != -ENOSPC)
557 mlog_errno(status);
558 goto bail;
559 }
560 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
561 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
562 clusters);
563
564 min_bits = clusters;
565 needed = le16_to_cpu(cl->cl_cpg) -
566 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
567 }
568
569 if (needed > 0) {
570 /*
 571 * We have used up all the extent recs but still can't
 572 * fill up the cpg. So bail out.
573 */
574 status = -ENOSPC;
575 goto bail;
576 }
577
578 ocfs2_journal_dirty(handle, bg_bh);
579
580bail:
581 return status;
582}
583
584static void ocfs2_bg_alloc_cleanup(handle_t *handle,
585 struct ocfs2_alloc_context *cluster_ac,
586 struct inode *alloc_inode,
587 struct buffer_head *bg_bh)
588{
589 int i, ret;
590 struct ocfs2_group_desc *bg;
591 struct ocfs2_extent_list *el;
592 struct ocfs2_extent_rec *rec;
593
594 if (!bg_bh)
595 return;
596
597 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
598 el = &bg->bg_list;
599 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
600 rec = &el->l_recs[i];
601 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
602 cluster_ac->ac_bh,
603 le64_to_cpu(rec->e_blkno),
604 le32_to_cpu(rec->e_leaf_clusters));
605 if (ret)
606 mlog_errno(ret);
607 /* Try all the clusters to free */
608 }
609
610 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
611 brelse(bg_bh);
612}
613
614static struct buffer_head *
615ocfs2_block_group_alloc_discontig(handle_t *handle,
616 struct inode *alloc_inode,
617 struct ocfs2_alloc_context *ac,
618 struct ocfs2_chain_list *cl)
619{
620 int status;
621 u32 bit_off, num_bits;
622 u64 bg_blkno;
623 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
624 struct buffer_head *bg_bh = NULL;
625 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
626 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
627
628 if (!ocfs2_supports_discontig_bg(osb)) {
629 status = -ENOSPC;
630 goto bail;
631 }
632
633 status = ocfs2_extend_trans(handle,
634 ocfs2_calc_bg_discontig_credits(osb->sb));
635 if (status) {
636 mlog_errno(status);
637 goto bail;
638 }
639
640 /*
641 * We're going to be grabbing from multiple cluster groups.
642 * We don't have enough credits to relink them all, and the
643 * cluster groups will be staying in cache for the duration of
644 * this operation.
645 */
646 ac->ac_allow_chain_relink = 0;
647
648 /* Claim the first region */
649 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
650 &bit_off, &num_bits);
651 if (status < 0) {
652 if (status != -ENOSPC)
653 mlog_errno(status);
654 goto bail;
655 }
656 min_bits = num_bits;
657
658 /* setup the group */
659 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660 mlog(0, "new descriptor, record %u, at block %llu\n",
661 alloc_rec, (unsigned long long)bg_blkno);
662
663 bg_bh = sb_getblk(osb->sb, bg_blkno);
664 if (!bg_bh) {
665 status = -EIO;
666 mlog_errno(status);
667 goto bail;
668 }
669 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
670
671 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
672 bg_blkno, num_bits, alloc_rec, cl);
673 if (status < 0) {
674 mlog_errno(status);
675 goto bail;
676 }
677
678 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
679 bg_bh, ac, cl, min_bits);
680 if (status)
681 mlog_errno(status);
682
683bail:
684 if (status)
685 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
686 return status ? ERR_PTR(status) : bg_bh;
687}
688
400/* 689/*
401 * We expect the block group allocator to already be locked. 690 * We expect the block group allocator to already be locked.
402 */ 691 */
@@ -412,9 +701,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
412 struct ocfs2_chain_list *cl; 701 struct ocfs2_chain_list *cl;
413 struct ocfs2_alloc_context *ac = NULL; 702 struct ocfs2_alloc_context *ac = NULL;
414 handle_t *handle = NULL; 703 handle_t *handle = NULL;
415 u32 bit_off, num_bits;
416 u16 alloc_rec; 704 u16 alloc_rec;
417 u64 bg_blkno;
418 struct buffer_head *bg_bh = NULL; 705 struct buffer_head *bg_bh = NULL;
419 struct ocfs2_group_desc *bg; 706 struct ocfs2_group_desc *bg;
420 707
@@ -447,44 +734,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
447 (unsigned long long)*last_alloc_group); 734 (unsigned long long)*last_alloc_group);
448 ac->ac_last_group = *last_alloc_group; 735 ac->ac_last_group = *last_alloc_group;
449 } 736 }
450 status = ocfs2_claim_clusters(osb, 737
451 handle, 738 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
452 ac, 739 ac, cl);
453 le16_to_cpu(cl->cl_cpg), 740 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
454 &bit_off, 741 bg_bh = ocfs2_block_group_alloc_discontig(handle,
455 &num_bits); 742 alloc_inode,
456 if (status < 0) { 743 ac, cl);
744 if (IS_ERR(bg_bh)) {
745 status = PTR_ERR(bg_bh);
746 bg_bh = NULL;
457 if (status != -ENOSPC) 747 if (status != -ENOSPC)
458 mlog_errno(status); 748 mlog_errno(status);
459 goto bail; 749 goto bail;
460 } 750 }
461
462 alloc_rec = ocfs2_find_smallest_chain(cl);
463
464 /* setup the group */
465 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466 mlog(0, "new descriptor, record %u, at block %llu\n",
467 alloc_rec, (unsigned long long)bg_blkno);
468
469 bg_bh = sb_getblk(osb->sb, bg_blkno);
470 if (!bg_bh) {
471 status = -EIO;
472 mlog_errno(status);
473 goto bail;
474 }
475 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476
477 status = ocfs2_block_group_fill(handle,
478 alloc_inode,
479 bg_bh,
480 bg_blkno,
481 alloc_rec,
482 cl);
483 if (status < 0) {
484 mlog_errno(status);
485 goto bail;
486 }
487
488 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 751 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489 752
490 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 753 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +757,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
494 goto bail; 757 goto bail;
495 } 758 }
496 759
760 alloc_rec = le16_to_cpu(bg->bg_chain);
497 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 761 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498 le16_to_cpu(bg->bg_free_bits_count)); 762 le16_to_cpu(bg->bg_free_bits_count));
499 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 763 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
500 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 764 le16_to_cpu(bg->bg_bits));
765 cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
501 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 766 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502 le16_add_cpu(&cl->cl_next_free_rec, 1); 767 le16_add_cpu(&cl->cl_next_free_rec, 1);
503 768
@@ -506,11 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
506 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 771 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 772 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508 773
509 status = ocfs2_journal_dirty(handle, bh); 774 ocfs2_journal_dirty(handle, bh);
510 if (status < 0) {
511 mlog_errno(status);
512 goto bail;
513 }
514 775
515 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 776 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 777 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1021,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
760 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1021 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761 EXTENT_ALLOC_SYSTEM_INODE, 1022 EXTENT_ALLOC_SYSTEM_INODE,
762 (u32)osb->slot_num, NULL, 1023 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP); 1024 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
764 1025
765 1026
766 if (status >= 0) { 1027 if (status >= 0) {
@@ -946,11 +1207,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
946 status = ocfs2_reserve_local_alloc_bits(osb, 1207 status = ocfs2_reserve_local_alloc_bits(osb,
947 bits_wanted, 1208 bits_wanted,
948 *ac); 1209 *ac);
949 if (status == -EFBIG) { 1210 if ((status < 0) && (status != -ENOSPC)) {
950 /* The local alloc window is outside ac_max_block.
951 * use the main bitmap. */
952 status = -ENOSPC;
953 } else if ((status < 0) && (status != -ENOSPC)) {
954 mlog_errno(status); 1211 mlog_errno(status);
955 goto bail; 1212 goto bail;
956 } 1213 }
@@ -1033,8 +1290,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033 struct buffer_head *bg_bh, 1290 struct buffer_head *bg_bh,
1034 unsigned int bits_wanted, 1291 unsigned int bits_wanted,
1035 unsigned int total_bits, 1292 unsigned int total_bits,
1036 u16 *bit_off, 1293 struct ocfs2_suballoc_result *res)
1037 u16 *bits_found)
1038{ 1294{
1039 void *bitmap; 1295 void *bitmap;
1040 u16 best_offset, best_size; 1296 u16 best_offset, best_size;
@@ -1078,14 +1334,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1078 } 1334 }
1079 } 1335 }
1080 1336
1081 /* XXX: I think the first clause is equivalent to the second 1337 if (best_size) {
1082 * - jlbec */ 1338 res->sr_bit_offset = best_offset;
1083 if (found == bits_wanted) { 1339 res->sr_bits = best_size;
1084 *bit_off = start - found;
1085 *bits_found = found;
1086 } else if (best_size) {
1087 *bit_off = best_offset;
1088 *bits_found = best_size;
1089 } else { 1340 } else {
1090 status = -ENOSPC; 1341 status = -ENOSPC;
1091 /* No error log here -- see the comment above 1342 /* No error log here -- see the comment above
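From this hunk on, the search helpers report through a single struct ocfs2_suballoc_result instead of the paired bit_off/bits_found out-parameters. The structure's definition is outside this diff; reconstructed from how its fields are used in the rest of the patch, it carries roughly the following (types and comments are assumptions, not the verbatim kernel definition):

/* Assumed shape of the search result, inferred from usage in this patch. */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* group hint; handed back as
					   *suballoc_loc and stored on disk
					   as i_suballoc_loc, cleared to 0
					   for plain contiguous groups */
	u64		sr_bg_stable_blkno; /* block of the descriptor that
					       was actually read; unlike
					       sr_bg_blkno it is never
					       rewritten by the discontig
					       fixup */
	u64		sr_blkno;	/* first allocated disk block */
	unsigned int	sr_bit_offset;	/* first claimed bit in the group */
	unsigned int	sr_bits;	/* number of bits claimed */
};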
@@ -1129,16 +1380,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1129 } 1380 }
1130 1381
1131 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1382 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132
1133 while(num_bits--) 1383 while(num_bits--)
1134 ocfs2_set_bit(bit_off++, bitmap); 1384 ocfs2_set_bit(bit_off++, bitmap);
1135 1385
1136 status = ocfs2_journal_dirty(handle, 1386 ocfs2_journal_dirty(handle, group_bh);
1137 group_bh);
1138 if (status < 0) {
1139 mlog_errno(status);
1140 goto bail;
1141 }
1142 1387
1143bail: 1388bail:
1144 mlog_exit(status); 1389 mlog_exit(status);
@@ -1202,12 +1447,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1202 } 1447 }
1203 1448
1204 prev_bg->bg_next_group = bg->bg_next_group; 1449 prev_bg->bg_next_group = bg->bg_next_group;
1205 1450 ocfs2_journal_dirty(handle, prev_bg_bh);
1206 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207 if (status < 0) {
1208 mlog_errno(status);
1209 goto out_rollback;
1210 }
1211 1451
1212 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1452 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1453 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1457,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1217 } 1457 }
1218 1458
1219 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1459 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220 1460 ocfs2_journal_dirty(handle, bg_bh);
1221 status = ocfs2_journal_dirty(handle, bg_bh);
1222 if (status < 0) {
1223 mlog_errno(status);
1224 goto out_rollback;
1225 }
1226 1461
1227 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1462 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1463 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1467,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1232 } 1467 }
1233 1468
1234 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1469 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1470 ocfs2_journal_dirty(handle, fe_bh);
1235 1471
1236 status = ocfs2_journal_dirty(handle, fe_bh);
1237 if (status < 0) {
1238 mlog_errno(status);
1239 goto out_rollback;
1240 }
1241
1242 status = 0;
1243out_rollback: 1472out_rollback:
1244 if (status < 0) { 1473 if (status < 0) {
1245 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1474 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1492,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1263 struct buffer_head *group_bh, 1492 struct buffer_head *group_bh,
1264 u32 bits_wanted, u32 min_bits, 1493 u32 bits_wanted, u32 min_bits,
1265 u64 max_block, 1494 u64 max_block,
1266 u16 *bit_off, u16 *bits_found) 1495 struct ocfs2_suballoc_result *res)
1267{ 1496{
1268 int search = -ENOSPC; 1497 int search = -ENOSPC;
1269 int ret; 1498 int ret;
1270 u64 blkoff; 1499 u64 blkoff;
1271 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1500 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1501 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273 u16 tmp_off, tmp_found;
1274 unsigned int max_bits, gd_cluster_off; 1502 unsigned int max_bits, gd_cluster_off;
1275 1503
1276 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1504 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1525,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1297 1525
1298 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1526 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299 group_bh, bits_wanted, 1527 group_bh, bits_wanted,
1300 max_bits, 1528 max_bits, res);
1301 &tmp_off, &tmp_found);
1302 if (ret) 1529 if (ret)
1303 return ret; 1530 return ret;
1304 1531
1305 if (max_block) { 1532 if (max_block) {
1306 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1533 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307 gd_cluster_off + 1534 gd_cluster_off +
1308 tmp_off + tmp_found); 1535 res->sr_bit_offset +
1536 res->sr_bits);
1309 mlog(0, "Checking %llu against %llu\n", 1537 mlog(0, "Checking %llu against %llu\n",
1310 (unsigned long long)blkoff, 1538 (unsigned long long)blkoff,
1311 (unsigned long long)max_block); 1539 (unsigned long long)max_block);
@@ -1317,16 +1545,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1317 * return success, but we still want to return 1545 * return success, but we still want to return
1318 * -ENOSPC unless it found the minimum number 1546 * -ENOSPC unless it found the minimum number
1319 * of bits. */ 1547 * of bits. */
1320 if (min_bits <= tmp_found) { 1548 if (min_bits <= res->sr_bits)
1321 *bit_off = tmp_off;
1322 *bits_found = tmp_found;
1323 search = 0; /* success */ 1549 search = 0; /* success */
1324 } else if (tmp_found) { 1550 else if (res->sr_bits) {
1325 /* 1551 /*
1326 * Don't show bits which we'll be returning 1552 * Don't show bits which we'll be returning
1327 * for allocation to the local alloc bitmap. 1553 * for allocation to the local alloc bitmap.
1328 */ 1554 */
1329 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1555 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1330 } 1556 }
1331 } 1557 }
1332 1558
@@ -1337,7 +1563,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1337 struct buffer_head *group_bh, 1563 struct buffer_head *group_bh,
1338 u32 bits_wanted, u32 min_bits, 1564 u32 bits_wanted, u32 min_bits,
1339 u64 max_block, 1565 u64 max_block,
1340 u16 *bit_off, u16 *bits_found) 1566 struct ocfs2_suballoc_result *res)
1341{ 1567{
1342 int ret = -ENOSPC; 1568 int ret = -ENOSPC;
1343 u64 blkoff; 1569 u64 blkoff;
@@ -1350,10 +1576,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1350 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1576 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351 group_bh, bits_wanted, 1577 group_bh, bits_wanted,
1352 le16_to_cpu(bg->bg_bits), 1578 le16_to_cpu(bg->bg_bits),
1353 bit_off, bits_found); 1579 res);
1354 if (!ret && max_block) { 1580 if (!ret && max_block) {
1355 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1581 blkoff = le64_to_cpu(bg->bg_blkno) +
1356 *bits_found; 1582 res->sr_bit_offset + res->sr_bits;
1357 mlog(0, "Checking %llu against %llu\n", 1583 mlog(0, "Checking %llu against %llu\n",
1358 (unsigned long long)blkoff, 1584 (unsigned long long)blkoff,
1359 (unsigned long long)max_block); 1585 (unsigned long long)max_block);
@@ -1386,33 +1612,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1386 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1612 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1613 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1614 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389 1615 ocfs2_journal_dirty(handle, di_bh);
1390 ret = ocfs2_journal_dirty(handle, di_bh);
1391 if (ret < 0)
1392 mlog_errno(ret);
1393 1616
1394out: 1617out:
1395 return ret; 1618 return ret;
1396} 1619}
1397 1620
1621static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1622 struct ocfs2_extent_rec *rec,
1623 struct ocfs2_chain_list *cl)
1624{
1625 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1626 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1627 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1628
1629 if (res->sr_bit_offset < bitoff)
1630 return 0;
1631 if (res->sr_bit_offset >= (bitoff + bitcount))
1632 return 0;
1633 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1634 (res->sr_bit_offset - bitoff);
1635 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1636 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1637 return 1;
1638}
1639
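The bit-to-block translation above is easiest to check with numbers. A self-contained userspace restatement (sample values are made up; bpc stands in for the chain list's cl_bpc, the bits-per-cluster count):

#include <assert.h>
#include <stdint.h>

struct sample_rec {		/* stand-in for the ocfs2_extent_rec fields */
	uint32_t e_cpos, e_leaf_clusters;
	uint64_t e_blkno;
};

/* Plain-integer restatement of ocfs2_bg_discontig_fix_by_rec() above */
static int fix_by_rec(uint64_t *blkno, unsigned int bit_off,
		      unsigned int *bits, const struct sample_rec *rec,
		      unsigned int bpc)
{
	unsigned int bitoff = rec->e_cpos * bpc;
	unsigned int bitcount = rec->e_leaf_clusters * bpc;

	if (bit_off < bitoff || bit_off >= bitoff + bitcount)
		return 0;		/* bit lives in another record */
	*blkno = rec->e_blkno + (bit_off - bitoff);
	if (bit_off + *bits > bitoff + bitcount)
		*bits = bitoff + bitcount - bit_off;	/* clamp to record */
	return 1;
}

int main(void)
{
	/* record covers group bits [32, 40) and starts at disk block 5000 */
	struct sample_rec rec = { .e_cpos = 8, .e_leaf_clusters = 2,
				  .e_blkno = 5000 };
	uint64_t blkno = 0;
	unsigned int bits = 8;	/* deliberately runs past bit 40 */

	assert(fix_by_rec(&blkno, 35, &bits, &rec, 4));
	assert(blkno == 5003);	/* 5000 + (35 - 32) */
	assert(bits == 5);	/* clamped at the record boundary */
	return 0;
}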
1640static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1641 struct ocfs2_group_desc *bg,
1642 struct ocfs2_suballoc_result *res)
1643{
1644 int i;
1645 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1646 struct ocfs2_extent_rec *rec;
1647 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1648 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1649
1650 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1651 res->sr_blkno = 0;
1652 return;
1653 }
1654
1655 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1656 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1657 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1658 !bg->bg_list.l_next_free_rec)
1659 return;
1660
1661 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1662 rec = &bg->bg_list.l_recs[i];
1663 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1664 res->sr_bg_blkno = bg_blkno; /* Restore */
1665 break;
1666 }
1667 }
1668}
1669
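The fixup above ends in one of three states, and the distinction matters because sr_bg_blkno is what ocfs2_claim_metadata() and ocfs2_claim_new_inode() hand back as *suballoc_loc (and what eventually lands on disk as i_suballoc_loc; see the ocfs2_free_dinode() hunk below): a cluster bitmap clears sr_blkno entirely, a contiguous group yields sr_blkno = group block + bit offset with the hint zeroed, and a discontiguous group restores the hint once the owning extent record has supplied the real block. A compact restatement of the contiguous case (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sr_bg_blkno = 4096;	/* sample descriptor block */
	unsigned int sr_bit_offset = 17;
	uint64_t sr_blkno;

	/* contiguous group: bits map 1:1 onto blocks from the group start */
	sr_blkno = sr_bg_blkno + sr_bit_offset;
	sr_bg_blkno = 0;		/* cleared; no hint needed on disk */

	assert(sr_blkno == 4113);
	assert(sr_bg_blkno == 0);
	return 0;
}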
1398static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1670static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399 handle_t *handle, 1671 handle_t *handle,
1400 u32 bits_wanted, 1672 u32 bits_wanted,
1401 u32 min_bits, 1673 u32 min_bits,
1402 u16 *bit_off, 1674 struct ocfs2_suballoc_result *res,
1403 unsigned int *num_bits,
1404 u64 gd_blkno,
1405 u16 *bits_left) 1675 u16 *bits_left)
1406{ 1676{
1407 int ret; 1677 int ret;
1408 u16 found;
1409 struct buffer_head *group_bh = NULL; 1678 struct buffer_head *group_bh = NULL;
1410 struct ocfs2_group_desc *gd; 1679 struct ocfs2_group_desc *gd;
1411 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412 struct inode *alloc_inode = ac->ac_inode; 1681 struct inode *alloc_inode = ac->ac_inode;
1413 1682
1414 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1683 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1415 &group_bh); 1684 res->sr_bg_blkno, &group_bh);
1416 if (ret < 0) { 1685 if (ret < 0) {
1417 mlog_errno(ret); 1686 mlog_errno(ret);
1418 return ret; 1687 return ret;
@@ -1420,17 +1689,27 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1420 1689
1421 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1690 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1691 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423 ac->ac_max_block, bit_off, &found); 1692 ac->ac_max_block, res);
1424 if (ret < 0) { 1693 if (ret < 0) {
1425 if (ret != -ENOSPC) 1694 if (ret != -ENOSPC)
1426 mlog_errno(ret); 1695 mlog_errno(ret);
1427 goto out; 1696 goto out;
1428 } 1697 }
1429 1698
1430 *num_bits = found; 1699 if (!ret)
1700 ocfs2_bg_discontig_fix_result(ac, gd, res);
1701
1702 /*
1703 * sr_bg_blkno might have been changed by
1704 * ocfs2_bg_discontig_fix_result
1705 */
1706 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1707
1708 if (ac->ac_find_loc_only)
1709 goto out_loc_only;
1431 1710
1432 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1711 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433 *num_bits, 1712 res->sr_bits,
1434 le16_to_cpu(gd->bg_chain)); 1713 le16_to_cpu(gd->bg_chain));
1435 if (ret < 0) { 1714 if (ret < 0) {
1436 mlog_errno(ret); 1715 mlog_errno(ret);
@@ -1438,10 +1717,11 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1438 } 1717 }
1439 1718
1440 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1719 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441 *bit_off, *num_bits); 1720 res->sr_bit_offset, res->sr_bits);
1442 if (ret < 0) 1721 if (ret < 0)
1443 mlog_errno(ret); 1722 mlog_errno(ret);
1444 1723
1724out_loc_only:
1445 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1725 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1446 1726
1447out: 1727out:
@@ -1454,14 +1734,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454 handle_t *handle, 1734 handle_t *handle,
1455 u32 bits_wanted, 1735 u32 bits_wanted,
1456 u32 min_bits, 1736 u32 min_bits,
1457 u16 *bit_off, 1737 struct ocfs2_suballoc_result *res,
1458 unsigned int *num_bits,
1459 u64 *bg_blkno,
1460 u16 *bits_left) 1738 u16 *bits_left)
1461{ 1739{
1462 int status; 1740 int status;
1463 u16 chain, tmp_bits; 1741 u16 chain;
1464 u32 tmp_used;
1465 u64 next_group; 1742 u64 next_group;
1466 struct inode *alloc_inode = ac->ac_inode; 1743 struct inode *alloc_inode = ac->ac_inode;
1467 struct buffer_head *group_bh = NULL; 1744 struct buffer_head *group_bh = NULL;
@@ -1489,8 +1766,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1489 * the 1st group with any empty bits. */ 1766 * the 1st group with any empty bits. */
1490 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1767 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491 bits_wanted, min_bits, 1768 bits_wanted, min_bits,
1492 ac->ac_max_block, bit_off, 1769 ac->ac_max_block,
1493 &tmp_bits)) == -ENOSPC) { 1770 res)) == -ENOSPC) {
1494 if (!bg->bg_next_group) 1771 if (!bg->bg_next_group)
1495 break; 1772 break;
1496 1773
@@ -1515,11 +1792,19 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1515 } 1792 }
1516 1793
1517 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1794 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1795 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1796
1797 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1519 1798
1520 *num_bits = tmp_bits; 1799 BUG_ON(res->sr_bits == 0);
1800 if (!status)
1801 ocfs2_bg_discontig_fix_result(ac, bg, res);
1521 1802
1522 BUG_ON(*num_bits == 0); 1803 /*
1804 * sr_bg_blkno might have been changed by
1805 * ocfs2_bg_discontig_fix_result
1806 */
1807 res->sr_bg_stable_blkno = group_bh->b_blocknr;
1523 1808
1524 /* 1809 /*
1525 * Keep track of previous block descriptor read. When 1810 * Keep track of previous block descriptor read. When
@@ -1536,7 +1821,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1536 */ 1821 */
1537 if (ac->ac_allow_chain_relink && 1822 if (ac->ac_allow_chain_relink &&
1538 (prev_group_bh) && 1823 (prev_group_bh) &&
1539 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1824 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1540 status = ocfs2_relink_block_group(handle, alloc_inode, 1825 status = ocfs2_relink_block_group(handle, alloc_inode,
1541 ac->ac_bh, group_bh, 1826 ac->ac_bh, group_bh,
1542 prev_group_bh, chain); 1827 prev_group_bh, chain);
@@ -1546,24 +1831,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1546 } 1831 }
1547 } 1832 }
1548 1833
1549 /* Ok, claim our bits now: set the info on dinode, chainlist 1834 if (ac->ac_find_loc_only)
1550 * and then the group */ 1835 goto out_loc_only;
1551 status = ocfs2_journal_access_di(handle,
1552 INODE_CACHE(alloc_inode),
1553 ac->ac_bh,
1554 OCFS2_JOURNAL_ACCESS_WRITE);
1555 if (status < 0) {
1556 mlog_errno(status);
1557 goto bail;
1558 }
1559
1560 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1562 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1563 1836
1564 status = ocfs2_journal_dirty(handle, 1837 status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1565 ac->ac_bh); 1838 ac->ac_bh, res->sr_bits,
1566 if (status < 0) { 1839 chain);
1840 if (status) {
1567 mlog_errno(status); 1841 mlog_errno(status);
1568 goto bail; 1842 goto bail;
1569 } 1843 }
@@ -1572,17 +1846,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1572 alloc_inode, 1846 alloc_inode,
1573 bg, 1847 bg,
1574 group_bh, 1848 group_bh,
1575 *bit_off, 1849 res->sr_bit_offset,
1576 *num_bits); 1850 res->sr_bits);
1577 if (status < 0) { 1851 if (status < 0) {
1578 mlog_errno(status); 1852 mlog_errno(status);
1579 goto bail; 1853 goto bail;
1580 } 1854 }
1581 1855
1582 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1856 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1583 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1857 (unsigned long long)le64_to_cpu(fe->i_blkno));
1584 1858
1585 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1859out_loc_only:
1586 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1860 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587bail: 1861bail:
1588 brelse(group_bh); 1862 brelse(group_bh);
@@ -1593,19 +1867,16 @@ bail:
1593} 1867}
1594 1868
1595/* will give out up to bits_wanted contiguous bits. */ 1869/* will give out up to bits_wanted contiguous bits. */
1596static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1870static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1597 struct ocfs2_alloc_context *ac,
1598 handle_t *handle, 1871 handle_t *handle,
1599 u32 bits_wanted, 1872 u32 bits_wanted,
1600 u32 min_bits, 1873 u32 min_bits,
1601 u16 *bit_off, 1874 struct ocfs2_suballoc_result *res)
1602 unsigned int *num_bits,
1603 u64 *bg_blkno)
1604{ 1875{
1605 int status; 1876 int status;
1606 u16 victim, i; 1877 u16 victim, i;
1607 u16 bits_left = 0; 1878 u16 bits_left = 0;
1608 u64 hint_blkno = ac->ac_last_group; 1879 u64 hint = ac->ac_last_group;
1609 struct ocfs2_chain_list *cl; 1880 struct ocfs2_chain_list *cl;
1610 struct ocfs2_dinode *fe; 1881 struct ocfs2_dinode *fe;
1611 1882
@@ -1623,7 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1623 1894
1624 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1895 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1896 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1897 ocfs2_error(ac->ac_inode->i_sb,
1898 "Chain allocator dinode %llu has %u used "
1627 "bits but only %u total.", 1899 "bits but only %u total.",
1628 (unsigned long long)le64_to_cpu(fe->i_blkno), 1900 (unsigned long long)le64_to_cpu(fe->i_blkno),
1629 le32_to_cpu(fe->id1.bitmap1.i_used), 1901 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1904,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1632 goto bail; 1904 goto bail;
1633 } 1905 }
1634 1906
1635 if (hint_blkno) { 1907 res->sr_bg_blkno = hint;
1908 if (res->sr_bg_blkno) {
1636 /* Attempt to short-circuit the usual search mechanism 1909 /* Attempt to short-circuit the usual search mechanism
1637 * by jumping straight to the most recently used 1910 * by jumping straight to the most recently used
1638 * allocation group. This helps us maintain some 1911 * allocation group. This helps us maintain some
1639 * contiguity across allocations. */ 1912 * contiguity across allocations. */
1640 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1913 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641 min_bits, bit_off, num_bits, 1914 min_bits, res, &bits_left);
1642 hint_blkno, &bits_left); 1915 if (!status)
1643 if (!status) {
1644 /* Be careful to update *bg_blkno here as the
1645 * caller is expecting it to be filled in, and
1646 * ocfs2_search_one_group() won't do that for
1647 * us. */
1648 *bg_blkno = hint_blkno;
1649 goto set_hint; 1916 goto set_hint;
1650 }
1651 if (status < 0 && status != -ENOSPC) { 1917 if (status < 0 && status != -ENOSPC) {
1652 mlog_errno(status); 1918 mlog_errno(status);
1653 goto bail; 1919 goto bail;
@@ -1660,10 +1926,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1660 ac->ac_chain = victim; 1926 ac->ac_chain = victim;
1661 ac->ac_allow_chain_relink = 1; 1927 ac->ac_allow_chain_relink = 1;
1662 1928
1663 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1929 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1664 num_bits, bg_blkno, &bits_left); 1930 res, &bits_left);
1665 if (!status) 1931 if (!status) {
1932 hint = ocfs2_group_from_res(res);
1666 goto set_hint; 1933 goto set_hint;
1934 }
1667 if (status < 0 && status != -ENOSPC) { 1935 if (status < 0 && status != -ENOSPC) {
1668 mlog_errno(status); 1936 mlog_errno(status);
1669 goto bail; 1937 goto bail;
@@ -1685,10 +1953,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1685 1953
1686 ac->ac_chain = i; 1954 ac->ac_chain = i;
1687 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1955 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688 bit_off, num_bits, bg_blkno, 1956 res, &bits_left);
1689 &bits_left); 1957 if (!status) {
1690 if (!status) 1958 hint = ocfs2_group_from_res(res);
1691 break; 1959 break;
1960 }
1692 if (status < 0 && status != -ENOSPC) { 1961 if (status < 0 && status != -ENOSPC) {
1693 mlog_errno(status); 1962 mlog_errno(status);
1694 goto bail; 1963 goto bail;
@@ -1703,7 +1972,7 @@ set_hint:
1703 if (bits_left < min_bits) 1972 if (bits_left < min_bits)
1704 ac->ac_last_group = 0; 1973 ac->ac_last_group = 0;
1705 else 1974 else
1706 ac->ac_last_group = *bg_blkno; 1975 ac->ac_last_group = hint;
1707 } 1976 }
1708 1977
1709bail: 1978bail:
@@ -1711,37 +1980,37 @@ bail:
1711 return status; 1980 return status;
1712} 1981}
1713 1982
1714int ocfs2_claim_metadata(struct ocfs2_super *osb, 1983int ocfs2_claim_metadata(handle_t *handle,
1715 handle_t *handle,
1716 struct ocfs2_alloc_context *ac, 1984 struct ocfs2_alloc_context *ac,
1717 u32 bits_wanted, 1985 u32 bits_wanted,
1986 u64 *suballoc_loc,
1718 u16 *suballoc_bit_start, 1987 u16 *suballoc_bit_start,
1719 unsigned int *num_bits, 1988 unsigned int *num_bits,
1720 u64 *blkno_start) 1989 u64 *blkno_start)
1721{ 1990{
1722 int status; 1991 int status;
1723 u64 bg_blkno; 1992 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1724 1993
1725 BUG_ON(!ac); 1994 BUG_ON(!ac);
1726 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1995 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1996 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728 1997
1729 status = ocfs2_claim_suballoc_bits(osb, 1998 status = ocfs2_claim_suballoc_bits(ac,
1730 ac,
1731 handle, 1999 handle,
1732 bits_wanted, 2000 bits_wanted,
1733 1, 2001 1,
1734 suballoc_bit_start, 2002 &res);
1735 num_bits,
1736 &bg_blkno);
1737 if (status < 0) { 2003 if (status < 0) {
1738 mlog_errno(status); 2004 mlog_errno(status);
1739 goto bail; 2005 goto bail;
1740 } 2006 }
1741 atomic_inc(&osb->alloc_stats.bg_allocs); 2007 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1742 2008
1743 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 2009 *suballoc_loc = res.sr_bg_blkno;
1744 ac->ac_bits_given += (*num_bits); 2010 *suballoc_bit_start = res.sr_bit_offset;
2011 *blkno_start = res.sr_blkno;
2012 ac->ac_bits_given += res.sr_bits;
2013 *num_bits = res.sr_bits;
1745 status = 0; 2014 status = 0;
1746bail: 2015bail:
1747 mlog_exit(status); 2016 mlog_exit(status);
@@ -1749,10 +2018,10 @@ bail:
1749} 2018}
1750 2019
1751static void ocfs2_init_inode_ac_group(struct inode *dir, 2020static void ocfs2_init_inode_ac_group(struct inode *dir,
1752 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_di_bh,
1753 struct ocfs2_alloc_context *ac) 2022 struct ocfs2_alloc_context *ac)
1754{ 2023{
1755 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 2024 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1756 /* 2025 /*
1757 * Try to allocate inodes from some specific group. 2026 * Try to allocate inodes from some specific group.
1758 * 2027 *
@@ -1766,10 +2035,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1766 if (OCFS2_I(dir)->ip_last_used_group && 2035 if (OCFS2_I(dir)->ip_last_used_group &&
1767 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2036 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2037 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2038 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1770 ac->ac_last_group = ocfs2_which_suballoc_group( 2039 if (di->i_suballoc_loc)
1771 le64_to_cpu(fe->i_blkno), 2040 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1772 le16_to_cpu(fe->i_suballoc_bit)); 2041 else
2042 ac->ac_last_group = ocfs2_which_suballoc_group(
2043 le64_to_cpu(di->i_blkno),
2044 le16_to_cpu(di->i_suballoc_bit));
2045 }
1773} 2046}
1774 2047
1775static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2048static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2052,146 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1779 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2052 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780} 2053}
1781 2054
1782int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2055int ocfs2_find_new_inode_loc(struct inode *dir,
1783 handle_t *handle, 2056 struct buffer_head *parent_fe_bh,
2057 struct ocfs2_alloc_context *ac,
2058 u64 *fe_blkno)
2059{
2060 int ret;
2061 handle_t *handle = NULL;
2062 struct ocfs2_suballoc_result *res;
2063
2064 BUG_ON(!ac);
2065 BUG_ON(ac->ac_bits_given != 0);
2066 BUG_ON(ac->ac_bits_wanted != 1);
2067 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2068
2069 res = kzalloc(sizeof(*res), GFP_NOFS);
2070 if (res == NULL) {
2071 ret = -ENOMEM;
2072 mlog_errno(ret);
2073 goto out;
2074 }
2075
2076 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2077
2078 /*
2079 * The handle started here is for chain relink. Alternatively,
2080 * we could just disable relink for these calls.
2081 */
2082 handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2083 if (IS_ERR(handle)) {
2084 ret = PTR_ERR(handle);
2085 handle = NULL;
2086 mlog_errno(ret);
2087 goto out;
2088 }
2089
2090 /*
2091 * This will instruct ocfs2_claim_suballoc_bits and
2092 * ocfs2_search_one_group to search but save actual allocation
2093 * for later.
2094 */
2095 ac->ac_find_loc_only = 1;
2096
2097 ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2098 if (ret < 0) {
2099 mlog_errno(ret);
2100 goto out;
2101 }
2102
2103 ac->ac_find_loc_priv = res;
2104 *fe_blkno = res->sr_blkno;
2105
2106out:
2107 if (handle)
2108 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2109
2110 if (ret)
2111 kfree(res);
2112
2113 return ret;
2114}
2115
2116int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2117 struct inode *dir,
2118 struct ocfs2_alloc_context *ac,
2119 u64 *suballoc_loc,
2120 u16 *suballoc_bit,
2121 u64 di_blkno)
2122{
2123 int ret;
2124 u16 chain;
2125 struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2126 struct buffer_head *bg_bh = NULL;
2127 struct ocfs2_group_desc *bg;
2128 struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2129
2130 /*
2131 * Since di_blkno is being passed back in, we check for any
2132 * inconsistencies which may have happened between
2133 * calls. These are code bugs as di_blkno is not expected to
2134 * change once returned from ocfs2_find_new_inode_loc().
2135 */
2136 BUG_ON(res->sr_blkno != di_blkno);
2137
2138 ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2139 res->sr_bg_stable_blkno, &bg_bh);
2140 if (ret) {
2141 mlog_errno(ret);
2142 goto out;
2143 }
2144
2145 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2146 chain = le16_to_cpu(bg->bg_chain);
2147
2148 ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2149 ac->ac_bh, res->sr_bits,
2150 chain);
2151 if (ret) {
2152 mlog_errno(ret);
2153 goto out;
2154 }
2155
2156 ret = ocfs2_block_group_set_bits(handle,
2157 ac->ac_inode,
2158 bg,
2159 bg_bh,
2160 res->sr_bit_offset,
2161 res->sr_bits);
2162 if (ret < 0) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166
2167 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
2168 (unsigned long long)di_blkno);
2169
2170 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2171
2172 BUG_ON(res->sr_bits != 1);
2173
2174 *suballoc_loc = res->sr_bg_blkno;
2175 *suballoc_bit = res->sr_bit_offset;
2176 ac->ac_bits_given++;
2177 ocfs2_save_inode_ac_group(dir, ac);
2178
2179out:
2180 brelse(bg_bh);
2181
2182 return ret;
2183}
2184
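Together with ocfs2_find_new_inode_loc() above, this completes a two-phase allocation: the search (phase one) runs in its own short transaction and stashes the result in ac_find_loc_priv, while the claim (phase two) replays exactly that location inside the caller's transaction. A sketch of the intended calling sequence (hypothetical caller; per the header comment added below, the real consumer is ocfs2_create_inode_in_orphan()):

static int two_phase_inode_alloc(handle_t *handle, struct inode *dir,
				 struct buffer_head *parent_fe_bh,
				 struct ocfs2_alloc_context *inode_ac,
				 u64 *suballoc_loc, u16 *suballoc_bit,
				 u64 *di_blkno)
{
	int ret;

	/* Phase 1: find and remember a location; nothing is claimed yet */
	ret = ocfs2_find_new_inode_loc(dir, parent_fe_bh, inode_ac,
				       di_blkno);
	if (ret)
		return ret;

	/* ... the caller may take locks and start its transaction here ... */

	/* Phase 2: claim exactly the block found above */
	return ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
					    suballoc_loc, suballoc_bit,
					    *di_blkno);
}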
2185int ocfs2_claim_new_inode(handle_t *handle,
1784 struct inode *dir, 2186 struct inode *dir,
1785 struct buffer_head *parent_fe_bh, 2187 struct buffer_head *parent_fe_bh,
1786 struct ocfs2_alloc_context *ac, 2188 struct ocfs2_alloc_context *ac,
2189 u64 *suballoc_loc,
1787 u16 *suballoc_bit, 2190 u16 *suballoc_bit,
1788 u64 *fe_blkno) 2191 u64 *fe_blkno)
1789{ 2192{
1790 int status; 2193 int status;
1791 unsigned int num_bits; 2194 struct ocfs2_suballoc_result res;
1792 u64 bg_blkno;
1793 2195
1794 mlog_entry_void(); 2196 mlog_entry_void();
1795 2197
@@ -1800,23 +2202,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1800 2202
1801 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2203 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802 2204
1803 status = ocfs2_claim_suballoc_bits(osb, 2205 status = ocfs2_claim_suballoc_bits(ac,
1804 ac,
1805 handle, 2206 handle,
1806 1, 2207 1,
1807 1, 2208 1,
1808 suballoc_bit, 2209 &res);
1809 &num_bits,
1810 &bg_blkno);
1811 if (status < 0) { 2210 if (status < 0) {
1812 mlog_errno(status); 2211 mlog_errno(status);
1813 goto bail; 2212 goto bail;
1814 } 2213 }
1815 atomic_inc(&osb->alloc_stats.bg_allocs); 2214 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1816 2215
1817 BUG_ON(num_bits != 1); 2216 BUG_ON(res.sr_bits != 1);
1818 2217
1819 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2218 *suballoc_loc = res.sr_bg_blkno;
2219 *suballoc_bit = res.sr_bit_offset;
2220 *fe_blkno = res.sr_blkno;
1820 ac->ac_bits_given++; 2221 ac->ac_bits_given++;
1821 ocfs2_save_inode_ac_group(dir, ac); 2222 ocfs2_save_inode_ac_group(dir, ac);
1822 status = 0; 2223 status = 0;
@@ -1886,8 +2287,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1886 * contig. allocation, set to '1' to indicate we can deal with extents 2287 * contig. allocation, set to '1' to indicate we can deal with extents
1887 * of any size. 2288 * of any size.
1888 */ 2289 */
1889int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2290int __ocfs2_claim_clusters(handle_t *handle,
1890 handle_t *handle,
1891 struct ocfs2_alloc_context *ac, 2291 struct ocfs2_alloc_context *ac,
1892 u32 min_clusters, 2292 u32 min_clusters,
1893 u32 max_clusters, 2293 u32 max_clusters,
@@ -1896,8 +2296,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1896{ 2296{
1897 int status; 2297 int status;
1898 unsigned int bits_wanted = max_clusters; 2298 unsigned int bits_wanted = max_clusters;
1899 u64 bg_blkno = 0; 2299 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1900 u16 bg_bit_off; 2300 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1901 2301
1902 mlog_entry_void(); 2302 mlog_entry_void();
1903 2303
@@ -1907,6 +2307,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1907 && ac->ac_which != OCFS2_AC_USE_MAIN); 2307 && ac->ac_which != OCFS2_AC_USE_MAIN);
1908 2308
1909 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2309 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2310 WARN_ON(min_clusters > 1);
2311
1910 status = ocfs2_claim_local_alloc_bits(osb, 2312 status = ocfs2_claim_local_alloc_bits(osb,
1911 handle, 2313 handle,
1912 ac, 2314 ac,
@@ -1929,20 +2331,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1929 if (bits_wanted > (osb->bitmap_cpg - 1)) 2331 if (bits_wanted > (osb->bitmap_cpg - 1))
1930 bits_wanted = osb->bitmap_cpg - 1; 2332 bits_wanted = osb->bitmap_cpg - 1;
1931 2333
1932 status = ocfs2_claim_suballoc_bits(osb, 2334 status = ocfs2_claim_suballoc_bits(ac,
1933 ac,
1934 handle, 2335 handle,
1935 bits_wanted, 2336 bits_wanted,
1936 min_clusters, 2337 min_clusters,
1937 &bg_bit_off, 2338 &res);
1938 num_clusters,
1939 &bg_blkno);
1940 if (!status) { 2339 if (!status) {
2340 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1941 *cluster_start = 2341 *cluster_start =
1942 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2342 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943 bg_blkno, 2343 res.sr_bg_blkno,
1944 bg_bit_off); 2344 res.sr_bit_offset);
1945 atomic_inc(&osb->alloc_stats.bitmap_data); 2345 atomic_inc(&osb->alloc_stats.bitmap_data);
2346 *num_clusters = res.sr_bits;
1946 } 2347 }
1947 } 2348 }
1948 if (status < 0) { 2349 if (status < 0) {
@@ -1958,8 +2359,7 @@ bail:
1958 return status; 2359 return status;
1959} 2360}
1960 2361
1961int ocfs2_claim_clusters(struct ocfs2_super *osb, 2362int ocfs2_claim_clusters(handle_t *handle,
1962 handle_t *handle,
1963 struct ocfs2_alloc_context *ac, 2363 struct ocfs2_alloc_context *ac,
1964 u32 min_clusters, 2364 u32 min_clusters,
1965 u32 *cluster_start, 2365 u32 *cluster_start,
@@ -1967,7 +2367,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1967{ 2367{
1968 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2368 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969 2369
1970 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2370 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1971 bits_wanted, cluster_start, num_clusters); 2371 bits_wanted, cluster_start, num_clusters);
1972} 2372}
1973 2373
@@ -2023,9 +2423,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2023 if (undo_fn) 2423 if (undo_fn)
2024 jbd_unlock_bh_state(group_bh); 2424 jbd_unlock_bh_state(group_bh);
2025 2425
2026 status = ocfs2_journal_dirty(handle, group_bh); 2426 ocfs2_journal_dirty(handle, group_bh);
2027 if (status < 0)
2028 mlog_errno(status);
2029bail: 2427bail:
2030 return status; 2428 return status;
2031} 2429}
@@ -2092,12 +2490,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2092 count); 2490 count);
2093 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2491 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2492 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095 2493 ocfs2_journal_dirty(handle, alloc_bh);
2096 status = ocfs2_journal_dirty(handle, alloc_bh);
2097 if (status < 0) {
2098 mlog_errno(status);
2099 goto bail;
2100 }
2101 2494
2102bail: 2495bail:
2103 brelse(group_bh); 2496 brelse(group_bh);
@@ -2126,6 +2519,8 @@ int ocfs2_free_dinode(handle_t *handle,
2126 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2519 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2520 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128 2521
2522 if (di->i_suballoc_loc)
2523 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2129 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2524 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130 inode_alloc_bh, bit, bg_blkno, 1); 2525 inode_alloc_bh, bit, bg_blkno, 1);
2131} 2526}
@@ -2338,7 +2733,8 @@ out:
2338 * suballoc_bit. 2733 * suballoc_bit.
2339 */ 2734 */
2340static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, 2735static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2341 u16 *suballoc_slot, u16 *suballoc_bit) 2736 u16 *suballoc_slot, u64 *group_blkno,
2737 u16 *suballoc_bit)
2342{ 2738{
2343 int status; 2739 int status;
2344 struct buffer_head *inode_bh = NULL; 2740 struct buffer_head *inode_bh = NULL;
@@ -2375,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2375 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); 2771 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2376 if (suballoc_bit) 2772 if (suballoc_bit)
2377 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); 2773 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2774 if (group_blkno)
2775 *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2378 2776
2379bail: 2777bail:
2380 brelse(inode_bh); 2778 brelse(inode_bh);
@@ -2392,10 +2790,11 @@ bail:
2392 */ 2790 */
2393static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, 2791static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2394 struct inode *suballoc, 2792 struct inode *suballoc,
2395 struct buffer_head *alloc_bh, u64 blkno, 2793 struct buffer_head *alloc_bh,
2794 u64 group_blkno, u64 blkno,
2396 u16 bit, int *res) 2795 u16 bit, int *res)
2397{ 2796{
2398 struct ocfs2_dinode *alloc_fe; 2797 struct ocfs2_dinode *alloc_di;
2399 struct ocfs2_group_desc *group; 2798 struct ocfs2_group_desc *group;
2400 struct buffer_head *group_bh = NULL; 2799 struct buffer_head *group_bh = NULL;
2401 u64 bg_blkno; 2800 u64 bg_blkno;
@@ -2404,17 +2803,18 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2404 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2803 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405 (unsigned int)bit); 2804 (unsigned int)bit);
2406 2805
2407 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2806 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2408 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2807 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2409 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2808 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410 (unsigned int)bit, 2809 (unsigned int)bit,
2411 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2810 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2412 status = -EINVAL; 2811 status = -EINVAL;
2413 goto bail; 2812 goto bail;
2414 } 2813 }
2415 2814
2416 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2815 bg_blkno = group_blkno ? group_blkno :
2417 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2816 ocfs2_which_suballoc_group(blkno, bit);
2817 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2418 &group_bh); 2818 &group_bh);
2419 if (status < 0) { 2819 if (status < 0) {
2420 mlog(ML_ERROR, "read group %llu failed %d\n", 2820 mlog(ML_ERROR, "read group %llu failed %d\n",
@@ -2448,6 +2848,7 @@ bail:
2448int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) 2848int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2449{ 2849{
2450 int status; 2850 int status;
2851 u64 group_blkno = 0;
2451 u16 suballoc_bit = 0, suballoc_slot = 0; 2852 u16 suballoc_bit = 0, suballoc_slot = 0;
2452 struct inode *inode_alloc_inode; 2853 struct inode *inode_alloc_inode;
2453 struct buffer_head *alloc_bh = NULL; 2854 struct buffer_head *alloc_bh = NULL;
@@ -2455,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2455 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2856 mlog_entry("blkno: %llu", (unsigned long long)blkno);
2456 2857
2457 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2858 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2458 &suballoc_bit); 2859 &group_blkno, &suballoc_bit);
2459 if (status < 0) { 2860 if (status < 0) {
2460 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); 2861 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2461 goto bail; 2862 goto bail;
@@ -2483,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2483 } 2884 }
2484 2885
2485 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, 2886 status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2486 blkno, suballoc_bit, res); 2887 group_blkno, blkno, suballoc_bit, res);
2487 if (status < 0) 2888 if (status < 0)
2488 mlog(ML_ERROR, "test suballoc bit failed %d\n", status); 2889 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2489 2890
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e0f46df357e6..b8afabfeede4 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,11 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 the same as ~0 - unlimited */ 57 the same as ~0 - unlimited */
58
59 int ac_find_loc_only; /* hack for reflink operation ordering */
60 struct ocfs2_suballoc_result *ac_find_loc_priv; /* result saved by ocfs2_find_new_inode_loc() */ 61
61
62 struct ocfs2_alloc_reservation *ac_resv;
57}; 63};
58 64
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 65void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +86,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 86 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
82 88
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 89int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 90 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 91 u32 bits_wanted,
92 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 93 u16 *suballoc_bit_start,
88 u32 *num_bits, 94 u32 *num_bits,
89 u64 *blkno_start); 95 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 96int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 97 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 98 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 99 struct ocfs2_alloc_context *ac,
100 u64 *suballoc_loc,
95 u16 *suballoc_bit, 101 u16 *suballoc_bit,
96 u64 *fe_blkno); 102 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 103int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 104 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 105 u32 min_clusters,
101 u32 *cluster_start, 106 u32 *cluster_start,
@@ -104,8 +109,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 109 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 110 * number of clusters smaller than the allocation reserved.
106 */ 111 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 112int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 113 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 114 u32 min_clusters,
111 u32 max_clusters, 115 u32 max_clusters,
@@ -196,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
196 struct ocfs2_alloc_context **meta_ac); 200 struct ocfs2_alloc_context **meta_ac);
197 201
198int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); 202int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
203
204
205
206/*
207 * The following two interfaces are for ocfs2_create_inode_in_orphan().
208 */
209int ocfs2_find_new_inode_loc(struct inode *dir,
210 struct buffer_head *parent_fe_bh,
211 struct ocfs2_alloc_context *ac,
212 u64 *fe_blkno);
213
214int ocfs2_claim_new_inode_at_loc(handle_t *handle,
215 struct inode *dir,
216 struct ocfs2_alloc_context *ac,
217 u64 *suballoc_loc,
218 u16 *suballoc_bit,
219 u64 di_blkno);
220
199#endif /* _CHAINALLOC_H_ */ 221#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..fa1be1b304d1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -143,8 +145,7 @@ static const struct super_operations ocfs2_sops = {
143 .alloc_inode = ocfs2_alloc_inode, 145 .alloc_inode = ocfs2_alloc_inode,
144 .destroy_inode = ocfs2_destroy_inode, 146 .destroy_inode = ocfs2_destroy_inode,
145 .drop_inode = ocfs2_drop_inode, 147 .drop_inode = ocfs2_drop_inode,
146 .clear_inode = ocfs2_clear_inode, 148 .evict_inode = ocfs2_evict_inode,
147 .delete_inode = ocfs2_delete_inode,
148 .sync_fs = ocfs2_sync_fs, 149 .sync_fs = ocfs2_sync_fs,
149 .put_super = ocfs2_put_super, 150 .put_super = ocfs2_put_super,
150 .remount_fs = ocfs2_remount, 151 .remount_fs = ocfs2_remount,
@@ -176,6 +177,8 @@ enum {
176 Opt_noacl, 177 Opt_noacl,
177 Opt_usrquota, 178 Opt_usrquota,
178 Opt_grpquota, 179 Opt_grpquota,
180 Opt_resv_level,
181 Opt_dir_resv_level,
179 Opt_err, 182 Opt_err,
180}; 183};
181 184
@@ -202,6 +205,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 205 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 206 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 207 {Opt_grpquota, "grpquota"},
208 {Opt_resv_level, "resv_level=%u"},
209 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 210 {Opt_err, NULL}
206}; 211};
207 212
@@ -873,13 +878,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
873 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 878 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
874 continue; 879 continue;
875 if (unsuspend) 880 if (unsuspend)
876 status = vfs_quota_enable( 881 status = dquot_resume(sb, type);
877 sb_dqopt(sb)->files[type], 882 else {
878 type, QFMT_OCFS2, 883 struct ocfs2_mem_dqinfo *oinfo;
879 DQUOT_SUSPENDED); 884
880 else 885 /* Cancel periodic syncing before suspending */
881 status = vfs_quota_disable(sb, type, 886 oinfo = sb_dqinfo(sb, type)->dqi_priv;
882 DQUOT_SUSPENDED); 887 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
888 status = dquot_suspend(sb, type);
889 }
883 if (status < 0) 890 if (status < 0)
884 break; 891 break;
885 } 892 }
@@ -910,8 +917,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
910 status = -ENOENT; 917 status = -ENOENT;
911 goto out_quota_off; 918 goto out_quota_off;
912 } 919 }
913 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 920 status = dquot_enable(inode[type], type, QFMT_OCFS2,
914 DQUOT_USAGE_ENABLED); 921 DQUOT_USAGE_ENABLED);
915 if (status < 0) 922 if (status < 0)
916 goto out_quota_off; 923 goto out_quota_off;
917 } 924 }
@@ -932,18 +939,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 939 int type;
933 struct inode *inode; 940 struct inode *inode;
934 struct super_block *sb = osb->sb; 941 struct super_block *sb = osb->sb;
942 struct ocfs2_mem_dqinfo *oinfo;
935 943
936 /* We mostly ignore errors in this function because there's not much 944 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 945 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 946 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 947 if (!sb_has_quota_loaded(sb, type))
940 continue; 948 continue;
949 /* Cancel periodic syncing before we grab dqonoff_mutex */
950 oinfo = sb_dqinfo(sb, type)->dqi_priv;
951 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 952 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 953 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 954 * memory and so they will be automatically synced to global
944 * quota files */ 955 * quota files */
945 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 956 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
946 DQUOT_LIMITS_ENABLED); 957 DQUOT_LIMITS_ENABLED);
947 if (!inode) 958 if (!inode)
948 continue; 959 continue;
949 iput(inode); 960 iput(inode);
@@ -952,7 +963,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 963
953/* Handle quota on quotactl */ 964/* Handle quota on quotactl */
954static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 965static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
955 char *path, int remount) 966 char *path)
956{ 967{
957 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 968 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
958 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 969 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -960,30 +971,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
960 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 971 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
961 return -EINVAL; 972 return -EINVAL;
962 973
963 if (remount) 974 return dquot_enable(sb_dqopt(sb)->files[type], type,
964 return 0; /* Just ignore it has been handled in 975 format_id, DQUOT_LIMITS_ENABLED);
965 * ocfs2_remount() */
966 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
967 format_id, DQUOT_LIMITS_ENABLED);
968} 976}
969 977
970/* Handle quota off quotactl */ 978/* Handle quota off quotactl */
971static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 979static int ocfs2_quota_off(struct super_block *sb, int type)
972{ 980{
973 if (remount) 981 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
974 return 0; /* Ignore now and handle later in
975 * ocfs2_remount() */
976 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
977} 982}
978 983
979static const struct quotactl_ops ocfs2_quotactl_ops = { 984static const struct quotactl_ops ocfs2_quotactl_ops = {
980 .quota_on = ocfs2_quota_on, 985 .quota_on = ocfs2_quota_on,
981 .quota_off = ocfs2_quota_off, 986 .quota_off = ocfs2_quota_off,
982 .quota_sync = vfs_quota_sync, 987 .quota_sync = dquot_quota_sync,
983 .get_info = vfs_get_dqinfo, 988 .get_info = dquot_get_dqinfo,
984 .set_info = vfs_set_dqinfo, 989 .set_info = dquot_set_dqinfo,
985 .get_dqblk = vfs_get_dqblk, 990 .get_dqblk = dquot_get_dqblk,
986 .set_dqblk = vfs_set_dqblk, 991 .set_dqblk = dquot_set_dqblk,
987}; 992};
988 993
989static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 994static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -1028,8 +1033,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1033 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1034 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1035 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1036
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1037 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1038 osb->osb_resv_level = parsed_options.resv_level;
1039 osb->osb_dir_resv_level = parsed_options.resv_level;
1040 if (parsed_options.dir_resv_level == -1)
1041 osb->osb_dir_resv_level = parsed_options.resv_level;
1042 else
1043 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1044
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1045 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1046 if (status)
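The dir_resv_level sentinel above is the only subtle part of the new options: -1 means the mount option was not given, in which case directories inherit the file reservation level. A self-contained restatement of that defaulting rule (values are illustrative):

#include <assert.h>

static int effective_dir_resv_level(int resv_level, int dir_resv_level)
{
	/* -1 is the "not set" sentinel from ocfs2_parse_options() below */
	return dir_resv_level == -1 ? resv_level : dir_resv_level;
}

int main(void)
{
	assert(effective_dir_resv_level(2, -1) == 2);	/* inherits */
	assert(effective_dir_resv_level(2, 4) == 4);	/* explicit wins */
	return 0;
}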
@@ -1285,11 +1296,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1296 options ? options : "(none)");
1286 1297
1287 mopt->commit_interval = 0; 1298 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1299 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1300 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1301 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1302 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1303 mopt->cluster_stack[0] = '\0';
1304 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1305 mopt->dir_resv_level = -1;
1293 1306
1294 if (!options) { 1307 if (!options) {
1295 status = 1; 1308 status = 1;
@@ -1380,7 +1393,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 				status = 0;
 				goto bail;
 			}
-			if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+			if (option >= 0)
 				mopt->localalloc_opt = option;
 			break;
 		case Opt_localflocks:
@@ -1433,6 +1446,28 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
 			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 			break;
+		case Opt_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->resv_level = option;
+			break;
+		case Opt_dir_resv_level:
+			if (is_remount)
+				break;
+			if (match_int(&args[0], &option)) {
+				status = 0;
+				goto bail;
+			}
+			if (option >= OCFS2_MIN_RESV_LEVEL &&
+			    option < OCFS2_MAX_RESV_LEVEL)
+				mopt->dir_resv_level = option;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -1487,7 +1522,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
 	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
-	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+	if (local_alloc_megs != ocfs2_la_default_mb(osb))
 		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1549,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else
 		seq_printf(s, ",noacl");
 
+	if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+		seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+	if (osb->osb_dir_resv_level != osb->osb_resv_level)
+		seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
 	return 0;
 }
 
@@ -1688,6 +1729,8 @@ static void ocfs2_inode_init_once(void *data)
 	oi->ip_blkno = 0ULL;
 	oi->ip_clusters = 0;
 
+	ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
 	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2085,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
+	status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
 	if (!osb->vol_label) {
 		mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2273,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+	osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
 	iput(inode);
 
-	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+	osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+						  osb->s_feature_incompat) * 8;
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
@@ -2420,7 +2471,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
-	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
+	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
 	 * we free it here.
 	 */
 	kfree(osb->journal);
@@ -2509,5 +2560,25 @@ void __ocfs2_abort(struct super_block* sb,
 	ocfs2_handle_error(sb);
 }
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+	int rc;
+	sigset_t blocked;
+
+	sigfillset(&blocked);
+	rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+	BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+	int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+	BUG_ON(rc);
+}
+
 module_init(ocfs2_init);
 module_exit(ocfs2_exit);
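The new helpers block every signal with a full mask, stash the old mask, and later restore it verbatim with SIG_SETMASK. The same block/restore pattern in plain POSIX userspace C (illustrative only, not kernel code):

/* Userspace sketch of the block-all/restore pattern used by
 * ocfs2_block_signals()/ocfs2_unblock_signals() above. */
#include <signal.h>
#include <unistd.h>

int main(void)
{
	sigset_t blocked, oldset;

	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) != 0)
		return 1;

	/* ... critical section that must not be interrupted ... */
	write(STDOUT_FILENO, "signals blocked\n", 16);

	/* Restore exactly the previous mask, as the kernel helper
	 * does with SIG_SETMASK. */
	if (sigprocmask(SIG_SETMASK, &oldset, NULL) != 0)
		return 1;
	return 0;
}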
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
 
 #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
 
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
 #endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
 	}
 
 	/* Fast symlinks can't be large */
-	len = strlen(target);
+	len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
 	link = kzalloc(len + 1, GFP_NOFS);
 	if (!link) {
 		status = -ENOMEM;
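The symlink hunk replaces an unbounded strlen() on an on-disk buffer with strnlen() capped at ocfs2_fast_symlink_chars(), so a corrupt, unterminated fast symlink can no longer run the scan past the inline area. A self-contained demonstration of the same bound in userspace C (the 8-byte array stands in for the inline symlink area):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Deliberately no terminating NUL. */
	char target[8] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h' };

	/* strlen(target) would read past the array; strnlen() stops
	 * at the buffer size, as the capped call above does. */
	size_t len = strnlen(target, sizeof(target));
	printf("len = %zu\n", len); /* prints 8 */
	return 0;
}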
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e7773089b96..06fa5e77c40e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	int set_abort;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
 };
 
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
 	&ocfs2_xattr_acl_access_handler,
 	&ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
 					= &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
 		handler = ocfs2_xattr_handler_map[name_index];
@@ -708,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int status = 0;
+	int status = 0, credits;
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -718,42 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
-	status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			       OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-	status = ocfs2_add_clusters_in_btree(handle,
-					     &et,
-					     &logical_start,
-					     clusters_to_add,
-					     0,
-					     ctxt->data_ac,
-					     ctxt->meta_ac,
-					     &why);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
-
-	/*
-	 * We should have already allocated enough space before the transaction,
-	 * so no need to restart.
-	 */
-	BUG_ON(why != RESTART_NONE || clusters_to_add);
-
-leave:
+	while (clusters_to_add) {
+		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+				       OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
+
+		prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+		status = ocfs2_add_clusters_in_btree(handle,
+						     &et,
+						     &logical_start,
+						     clusters_to_add,
+						     0,
+						     ctxt->data_ac,
+						     ctxt->meta_ac,
+						     &why);
+		if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			break;
+		}
+
+		ocfs2_journal_dirty(handle, vb->vb_bh);
+
+		clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+				   prev_clusters;
+
+		if (why != RESTART_NONE && clusters_to_add) {
+			/*
+			 * We can only fail in case the alloc file doesn't give
+			 * up enough clusters.
+			 */
+			BUG_ON(why == RESTART_META);
+
+			mlog(0, "restarting xattr value extension for %u"
+			     " clusters,.\n", clusters_to_add);
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
 
 	return status;
 }
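The rewritten loop above claims clusters a piece at a time: a short allocation is no longer a BUG; instead the code extends the transaction via ocfs2_calc_extend_credits()/ocfs2_extend_trans() and goes around again. A toy model of that restart shape, with all numbers and the claim() stub invented purely for illustration:

#include <stdio.h>

static int claim(int want)	/* pretend allocator: at most 4 at a time */
{
	return want > 4 ? 4 : want;
}

int main(void)
{
	int clusters_to_add = 10, credits = 0;

	while (clusters_to_add) {
		int got = claim(clusters_to_add);

		clusters_to_add -= got;
		if (clusters_to_add) {
			/* analogous to recomputing credits and extending
			 * the transaction before the next pass */
			credits += got;
			printf("restart: %d left, credits now %d\n",
			       clusters_to_add, credits);
		}
	}
	return 0;
}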
@@ -786,12 +799,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	}
 
 	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	ocfs2_journal_dirty(handle, vb->vb_bh);
 
 	if (ext_flags & OCFS2_EXT_REFCOUNTED)
 		ret = ocfs2_decrease_refcount(inode, handle,
@@ -1278,13 +1286,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
 	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
-	up_read(&oi->ip_xattr_sem);
 
 	return ret;
 }
@@ -1308,8 +1314,10 @@ static int ocfs2_xattr_get(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
 				     name, buffer, buffer_size);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
 
 	ocfs2_inode_unlock(inode, 0);
 
@@ -1374,11 +1382,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 			memset(bh->b_data + cp_len, 0,
 			       blocksize - cp_len);
 
-		ret = ocfs2_journal_dirty(handle, bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+		ocfs2_journal_dirty(handle, bh);
 		brelse(bh);
 		bh = NULL;
 
@@ -2148,15 +2152,19 @@ alloc_value:
 		orig_clusters = ocfs2_xa_value_clusters(loc);
 		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
 		if (rc < 0) {
-			/*
-			 * If we tried to grow an existing external value,
-			 * ocfs2_xa_cleanuP-value_truncate() is going to
-			 * let it stand.  We have to restore its original
-			 * value size.
-			 */
-			loc->xl_entry->xe_value_size = orig_value_size;
+			ctxt->set_abort = 1;
 			ocfs2_xa_cleanup_value_truncate(loc, "growing",
 							orig_clusters);
+			/*
+			 * If we were growing an existing value,
+			 * ocfs2_xa_cleanup_value_truncate() won't remove
+			 * the entry. We need to restore the original value
+			 * size.
+			 */
+			if (loc->xl_entry) {
+				BUG_ON(!orig_value_size);
+				loc->xl_entry->xe_value_size = orig_value_size;
+			}
 			mlog_errno(rc);
 		}
 	}
@@ -2479,7 +2487,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
-	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+	if (xb->xb_suballoc_loc)
+		bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+	else
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 
 	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
 				EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2605,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, di_bh);
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
@@ -2724,9 +2733,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(ctxt->handle, di_bh);
 
 out:
 	return ret;
@@ -2846,9 +2853,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	int ret;
 	u16 suballoc_bit_start;
 	u32 num_got;
-	u64 first_blkno;
+	u64 suballoc_loc, first_blkno;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_xattr_block *xblk;
 
@@ -2859,9 +2865,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 		goto end;
 	}
 
-	ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
-				   &suballoc_bit_start, &num_got,
-				   &first_blkno);
+	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+				   &suballoc_loc, &suballoc_bit_start,
+				   &num_got, &first_blkno);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto end;
@@ -2883,8 +2889,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
 	memset(xblk, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
 	xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+	xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
 	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+	xblk->xb_fs_generation =
+		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
 	if (indexed) {
 		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2964,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	ret = ocfs2_xa_set(&loc, xi, ctxt);
 	if (!ret)
 		xs->here = loc.xl_entry;
-	else if (ret != -ENOSPC)
+	else if ((ret != -ENOSPC) || ctxt->set_abort)
 		goto end;
 	else {
 		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3320,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+		ret = ocfs2_extend_trans(ctxt->handle, credits);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 		ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
-	} else if (ret == -ENOSPC) {
+	} else if ((ret == -ENOSPC) && !ctxt->set_abort) {
 		if (di->i_xattr_loc && !xbs->xattr_bh) {
 			ret = ocfs2_xattr_block_find(inode,
 						     xi->xi_name_index,
@@ -3343,8 +3350,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				goto out;
 			}
 
-			ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3378,8 +3384,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				goto out;
 			}
 
-			ret = ocfs2_extend_trans(ctxt->handle, credits +
-					ctxt->handle->h_buffer_credits);
+			ret = ocfs2_extend_trans(ctxt->handle, credits);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4249,7 +4254,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	u32 bit_off, len;
 	u64 blkno;
 	handle_t *handle = ctxt->handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
@@ -4277,7 +4281,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
@@ -4887,8 +4891,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	 * We need to update the first bucket of the old extent and all
 	 * the buckets going to the new extent.
 	 */
-	credits = ((num_buckets + 1) * blks_per_bucket) +
-		handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4958,7 +4961,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+	int ret, credits = 2 * blk_per_bucket;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -5099,7 +5102,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -5153,9 +5156,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 
 leave:
 	return ret;
@@ -5200,8 +5201,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * existing bucket.  Then we add the last existing bucket, the
 	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5477,12 +5477,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, root_bh);
 
 	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
 	if (ret)
@@ -6809,16 +6804,15 @@ out:
 	return ret;
 }
 
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
 				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
 				struct ocfs2_alloc_context *meta_ac,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_reflink_xattr_tree_args *args)
 {
 	int i, j, ret = 0;
 	struct super_block *sb = args->reflink->old_inode->i_sb;
-	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-	u32 num_buckets = clusters * bpc;
 	int bpb = args->old_bucket->bu_blocks;
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_access = ocfs2_journal_access,
@@ -6837,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			break;
 		}
 
-		/*
-		 * The real bucket num in this series of blocks is stored
-		 * in the 1st bucket.
-		 */
-		if (i == 0)
-			num_buckets = le16_to_cpu(
-				bucket_xh(args->old_bucket)->xh_num_buckets);
-
 		ret = ocfs2_xattr_bucket_journal_access(handle,
 						args->new_bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6858,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			       bucket_block(args->old_bucket, j),
 			       sb->s_blocksize);
 
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree we also set the xh_num_bucket for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
 
 		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6887,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		}
 
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
 		ocfs2_xattr_bucket_relse(args->old_bucket);
 		ocfs2_xattr_bucket_relse(args->new_bucket);
 	}
@@ -6895,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 	ocfs2_xattr_bucket_relse(args->new_bucket);
 	return ret;
 }
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in the most case.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
 /*
  * Create the same xattr extent record in the new inode's xattr tree.
  */
@@ -6906,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 			       void *para)
 {
 	int ret, credits = 0;
-	u32 p_cluster, num_clusters;
-	u64 new_blkno;
 	handle_t *handle;
 	struct ocfs2_reflink_xattr_tree_args *args =
 		(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6916,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
+	mlog(0, "reflink xattr buckets %llu len %u\n",
+	     (unsigned long long)blkno, len);
+
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
 					  args->new_blk_bh);
@@ -6935,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac,
-				   len, &p_cluster, &num_clusters);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
-	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-					  meta_ac, data_ac, args);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-	     (unsigned long long)new_blkno, len, cpos);
-	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-				  len, 0, meta_ac);
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -7234,7 +7283,7 @@ int ocfs2_init_security_set(handle_t *handle,
 					xattr_ac, data_ac);
 }
 
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ocfs2_xattr_security_list,
 	.get = ocfs2_xattr_security_get,
@@ -7278,7 +7327,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
 				   name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ocfs2_xattr_trusted_list,
 	.get = ocfs2_xattr_trusted_get,
@@ -7334,7 +7383,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
 				name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ocfs2_xattr_user_list,
 	.get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
 	size_t value_len;
 };
 
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
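All the handler declarations above gain const, matching the VFS move to const xattr handler tables, so both the ops structures and the arrays of pointers to them can live in read-only data. The pattern in self-contained C (the struct below is a simplified stand-in for struct xattr_handler, not the kernel definition):

#include <stdio.h>
#include <stddef.h>

struct handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int demo_get(const char *name) { return name != NULL; }

/* Both objects are const, so the linker can place them in .rodata. */
static const struct handler user_handler = { "user.", demo_get };
static const struct handler *handlers[] = { &user_handler, NULL };

int main(void)
{
	printf("%s -> %d\n", handlers[0]->prefix, handlers[0]->get("x"));
	return 0;
}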
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index b42d62419034..393f3f659da7 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -25,11 +25,10 @@ static struct buffer_head *omfs_get_bucket(struct inode *dir,
 				  const char *name, int namelen, int *ofs)
 {
 	int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
-	int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
 	int bucket = omfs_hash(name, namelen, nbuckets);
 
 	*ofs = OMFS_DIR_START + bucket * 8;
-	return sb_bread(dir->i_sb, block);
+	return omfs_bread(dir->i_sb, dir->i_ino);
 }
 
 static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
@@ -42,8 +41,7 @@ static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
 	*prev_block = ~0;
 
 	while (block != ~0) {
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), block));
+		bh = omfs_bread(dir->i_sb, block);
 		if (!bh) {
 			err = -EIO;
 			goto err;
@@ -86,11 +84,10 @@ static struct buffer_head *omfs_find_entry(struct inode *dir,
 int omfs_make_empty(struct inode *inode, struct super_block *sb)
 {
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
-	int block = clus_to_blk(sbi, inode->i_ino);
 	struct buffer_head *bh;
 	struct omfs_inode *oi;
 
-	bh = sb_bread(sb, block);
+	bh = omfs_bread(sb, inode->i_ino);
 	if (!bh)
 		return -ENOMEM;
 
@@ -134,7 +131,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
 	brelse(bh);
 
 	/* now set the sibling and parent pointers on the new inode */
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino));
+	bh = omfs_bread(dir->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -190,8 +187,7 @@ static int omfs_delete_entry(struct dentry *dentry)
 	if (prev != ~0) {
 		/* found in middle of list, get list ptr */
 		brelse(bh);
-		bh = sb_bread(dir->i_sb,
-			clus_to_blk(OMFS_SB(dir->i_sb), prev));
+		bh = omfs_bread(dir->i_sb, prev);
 		if (!bh)
 			goto out;
 
@@ -224,8 +220,7 @@ static int omfs_dir_is_empty(struct inode *inode)
 	u64 *ptr;
 	int i;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb),
-			inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 
 	if (!bh)
 		return 0;
@@ -353,8 +348,7 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb),
-				fsblock));
+		bh = omfs_bread(dir->i_sb, fsblock);
 		if (!bh)
 			goto out;
 
@@ -466,7 +460,7 @@ static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	hchain = (filp->f_pos >> 20) - 1;
 	hindex = filp->f_pos & 0xfffff;
 
-	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino));
+	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
 		goto out;
 
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..8a6d34fa668a 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -50,7 +50,7 @@ int omfs_shrink_inode(struct inode *inode)
 	if (inode->i_size != 0)
 		goto out;
 
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+	bh = omfs_bread(inode->i_sb, next);
 	if (!bh)
 		goto out;
 
@@ -90,7 +90,7 @@ int omfs_shrink_inode(struct inode *inode)
 		if (next == ~0)
 			break;
 
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -222,7 +222,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	struct buffer_head *bh;
 	sector_t next, offset;
 	int ret;
-	u64 new_block;
+	u64 uninitialized_var(new_block);
 	u32 max_extents;
 	int extent_count;
 	struct omfs_extent *oe;
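The uninitialized_var(new_block) annotation above only silences a false-positive gcc "may be used uninitialized" warning; it generates no initialization code. In kernels of this vintage the macro expands, roughly, to a self-assignment:

/* sketch of the compiler-gcc.h definition of this era (assumption) */
#define uninitialized_var(x) x = x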
@@ -232,7 +232,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 	int remain;
 
 	ret = -EIO;
-	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino));
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -265,7 +265,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
 			break;
 
 		brelse(bh);
-		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		bh = omfs_bread(inode->i_sb, next);
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
@@ -312,9 +312,17 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags,
-				pagep, fsdata, omfs_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep,
+				omfs_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
@@ -329,11 +337,33 @@ const struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
+static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
+}
+
 const struct inode_operations omfs_file_inops = {
+	.setattr = omfs_setattr,
 	.truncate = omfs_truncate
 };
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index c82af6acc2e7..14a22863291a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -3,7 +3,6 @@
  * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
  * Released under GPL v2.
  */
-#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -20,6 +19,15 @@ MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
 MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
 MODULE_LICENSE("GPL");
 
+struct buffer_head *omfs_bread(struct super_block *sb, sector_t block)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	if (block >= sbi->s_num_blocks)
+		return NULL;
+
+	return sb_bread(sb, clus_to_blk(sbi, block));
+}
+
 struct inode *omfs_new_inode(struct inode *dir, int mode)
 {
 	struct inode *inode;
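omfs_bread(), added above, centralizes the clus_to_blk() translation and rejects out-of-range blocks before touching the device, so every former sb_bread() caller inherits the bounds check. A userspace analogue of the same idea (the array stands in for the disk; all names are illustrative):

#include <stdio.h>
#include <stddef.h>

#define NUM_BLOCKS 16
static const char disk[NUM_BLOCKS] = "0123456789abcdef";

static const char *read_block(unsigned long block)
{
	if (block >= NUM_BLOCKS)	/* same check as omfs_bread() */
		return NULL;
	return &disk[block];
}

int main(void)
{
	printf("%c %s\n", *read_block(3),
	       read_block(99) ? "??" : "out-of-range rejected");
	return 0;
}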
@@ -38,9 +46,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -96,15 +102,13 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 	struct omfs_inode *oi;
 	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
 	struct buffer_head *bh, *bh2;
-	unsigned int block;
 	u64 ctime;
 	int i;
 	int ret = -EIO;
 	int sync_failed = 0;
 
 	/* get current inode since we may have written sibling ptrs etc. */
-	block = clus_to_blk(sbi, inode->i_ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
 		goto out;
 
@@ -143,8 +147,7 @@ static int __omfs_write_inode(struct inode *inode, int wait)
 
 	/* if mirroring writes, copy to next fsblock */
 	for (i = 1; i < sbi->s_mirrors; i++) {
-		bh2 = sb_bread(inode->i_sb, block + i *
-			(sbi->s_blocksize / sbi->s_sys_blocksize));
+		bh2 = omfs_bread(inode->i_sb, inode->i_ino + i);
 		if (!bh2)
 			goto out_brelse;
 
@@ -178,9 +181,13 @@ int omfs_sync_inode(struct inode *inode)
  * called when an entry is deleted, need to clear the bits in the
  * bitmaps.
  */
-static void omfs_delete_inode(struct inode *inode)
+static void omfs_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
+
+	if (inode->i_nlink)
+		return;
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_size = 0;
@@ -188,7 +195,6 @@ static void omfs_delete_inode(struct inode *inode)
 	}
 
 	omfs_clear_range(inode->i_sb, inode->i_ino, 2);
-	clear_inode(inode);
 }
 
 struct inode *omfs_iget(struct super_block *sb, ino_t ino)
@@ -196,7 +202,6 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	struct omfs_inode *oi;
 	struct buffer_head *bh;
-	unsigned int block;
 	u64 ctime;
 	unsigned long nsecs;
 	struct inode *inode;
@@ -207,8 +212,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
-	block = clus_to_blk(sbi, ino);
-	bh = sb_bread(inode->i_sb, block);
+	bh = omfs_bread(inode->i_sb, ino);
 	if (!bh)
 		goto iget_failed;
 
@@ -287,7 +291,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static const struct super_operations omfs_sops = {
 	.write_inode = omfs_write_inode,
-	.delete_inode = omfs_delete_inode,
+	.evict_inode = omfs_evict_inode,
 	.put_super = omfs_put_super,
 	.statfs = omfs_statfs,
 	.show_options = generic_show_options,
@@ -322,6 +326,9 @@ static int omfs_get_imap(struct super_block *sb)
 		goto nomem;
 
 	block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+	if (block >= sbi->s_num_blocks)
+		goto nomem;
+
 	ptr = sbi->s_imap;
 	for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
 		bh = sb_bread(sb, block++);
@@ -420,7 +427,6 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct omfs_root_block *omfs_rb;
 	struct omfs_sb_info *sbi;
 	struct inode *root;
-	sector_t start;
 	int ret = -EINVAL;
 
 	save_mount_options(sb, (char *) data);
@@ -489,8 +495,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
 		get_bitmask_order(sbi->s_sys_blocksize);
 
-	start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block));
-	bh2 = sb_bread(sb, start);
+	bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block));
 	if (!bh2)
 		goto out_brelse_bh;
 
@@ -507,6 +512,21 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_brelse_bh2;
 	}
 
+	if (sbi->s_bitmap_ino != ~0ULL &&
+	    sbi->s_bitmap_ino > sbi->s_num_blocks) {
+		printk(KERN_ERR "omfs: free space bitmap location is corrupt "
+			"(%llx, total blocks %llx)\n",
+			(unsigned long long) sbi->s_bitmap_ino,
+			(unsigned long long) sbi->s_num_blocks);
+		goto out_brelse_bh2;
+	}
+	if (sbi->s_clustersize < 1 ||
+	    sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) {
+		printk(KERN_ERR "omfs: cluster size out of range (%d)",
+			sbi->s_clustersize);
+		goto out_brelse_bh2;
+	}
+
 	ret = omfs_get_imap(sb);
 	if (ret)
 		goto out_brelse_bh2;
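The mount path now sanity-checks on-disk fields (bitmap location and cluster size) before using them, failing the mount instead of indexing out of range. The shape of those checks as a runnable toy; the struct layout and limits below are invented stand-ins, not the omfs definitions:

#include <stdio.h>

struct sb { unsigned long long bitmap_ino, num_blocks; int clustersize; };

static int check_sb(const struct sb *s)
{
	if (s->bitmap_ino != ~0ULL && s->bitmap_ino > s->num_blocks)
		return -1;		/* corrupt bitmap location */
	if (s->clustersize < 1 || s->clustersize > 8)
		return -1;		/* cluster size out of range */
	return 0;
}

int main(void)
{
	struct sb good = { 100, 1000, 4 }, bad = { 5000, 1000, 4 };

	printf("%d %d\n", check_sb(&good), check_sb(&bad));
	return 0;
}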
@@ -532,6 +552,8 @@ out_brelse_bh2:
 out_brelse_bh:
 	brelse(bh);
 end:
+	if (ret)
+		kfree(sbi);
 	return ret;
 }
 
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index ebe2fdbe535e..7d414fef501a 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -58,6 +58,7 @@ extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
 extern int omfs_shrink_inode(struct inode *inode);
 
 /* inode.c */
+extern struct buffer_head *omfs_bread(struct super_block *sb, sector_t block);
 extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
 extern struct inode *omfs_new_inode(struct inode *dir, int mode);
 extern int omfs_reserve_block(struct super_block *sb, sector_t block);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index 12cca245d6e8..ee5e4327de92 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -17,6 +17,7 @@
 #define OMFS_EXTENT_CONT 0x40
 #define OMFS_XOR_COUNT 19
 #define OMFS_MAX_BLOCK_SIZE 8192
+#define OMFS_MAX_CLUSTER_SIZE 8
 
 struct omfs_super_block {
 	char s_fill1[256];
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..d74e1983e8dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
 #include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
-#include <linux/vfs.h>
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -30,174 +29,10 @@
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
+#include <linux/dnotify.h>
 
 #include "internal.h"
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
-	return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
-			    0xffffffff00000000ULL)
-				return -EOVERFLOW;
-			/*
-			 * f_files and f_ffree may be -1; it's okay to stuff
-			 * that into 32 bits
-			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-		}
-
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct path path;
-	long error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
-	struct file * file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct file * file;
-	struct statfs64 tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		struct file *filp)
 {
@@ -276,7 +111,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error)
-		error = security_path_truncate(&path, length, 0);
+		error = security_path_truncate(&path);
 	if (!error)
 		error = do_truncate(path.dentry, length, 0, NULL);
 
@@ -331,8 +166,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = security_path_truncate(&file->f_path, length,
-					       ATTR_MTIME|ATTR_CTIME);
+		error = security_path_truncate(&file->f_path);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
@@ -533,7 +367,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -562,7 +396,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!S_ISDIR(inode->i_mode))
 		goto out_putf;
 
-	error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	if (!error)
 		set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -580,7 +414,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 	if (error)
 		goto out;
 
-	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
 	if (error)
 		goto dput_and_out;
 
@@ -841,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_sb_list_add(f, inode->i_sb);
 
 	error = security_dentry_open(f, cred);
 	if (error)
@@ -887,7 +721,7 @@ cleanup_all:
 			mnt_drop_write(mnt);
 		}
 	}
-	file_kill(f);
+	file_sb_list_del(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
 cleanup_file:
@@ -1054,7 +888,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 			put_unused_fd(fd);
 			fd = PTR_ERR(f);
 		} else {
-			fsnotify_open(f->f_path.dentry);
+			fsnotify_open(f);
 			fd_install(fd, f);
 		}
 	}
@@ -1197,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
 
 /*
  * This is used by subsystems that don't want seekable
- * file descriptors
+ * file descriptors. The function is not supposed to ever fail, the only
+ * reason it returns an 'int' and not 'void' is so that it can be plugged
+ * directly into file_operations structure.
  */
 int nonseekable_open(struct inode *inode, struct file *filp)
 {
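The statfs family (vfs_statfs, vfs_statfs_native, vfs_statfs64 and the four syscalls) leaves fs/open.c wholesale in this merge. The subtlest part of the removed code is the native 32-bit path: 64-bit kstatfs counters must either fit in 32 bits or fail with EOVERFLOW, with f_files/f_ffree of -1 ("unknown") exempt. A self-contained userspace sketch of that rule — fits_in_32() is an illustrative name, not a kernel function:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* Mirrors the overflow checks of the removed vfs_statfs_native(): any
 * counter with bits above 2^32 is rejected, except the "unknown" value
 * (u64)-1 for file counts, which may legally be stuffed into 32 bits. */
static int fits_in_32(uint64_t blocks, uint64_t files, uint64_t ffree)
{
	if (blocks & 0xffffffff00000000ULL)
		return -EOVERFLOW;
	if (files != (uint64_t)-1 && (files & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	if (ffree != (uint64_t)-1 && (ffree & 0xffffffff00000000ULL))
		return -EOVERFLOW;
	return 0;
}

int main(void)
{
	printf("%d\n", fits_in_32(1u << 20, 1000, 500));	/* 0: fits */
	printf("%d\n", fits_in_32(1ULL << 33, 1000, 500));	/* -EOVERFLOW */
	printf("%d\n", fits_in_32(1u << 20, (uint64_t)-1, 0));	/* 0: -1 allowed */
	return 0;
}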
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..fbeb697374d5 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
 	nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
 		   (le32_to_cpu(dr->disc_size) >> 9);
 
-	if (name)
-		printk(" [%s]", name);
+	if (name) {
+		strlcat(state->pp_buf, " [", PAGE_SIZE);
+		strlcat(state->pp_buf, name, PAGE_SIZE);
+		strlcat(state->pp_buf, "]", PAGE_SIZE);
+	}
 	put_partition(state, slot, first_sector, nr_sects);
 	return dr;
 }
@@ -70,25 +73,25 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
     defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		 unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 	Sector sect;
 	struct riscix_record *rr;
 
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 		return -1;
 
-	printk(" [RISCiX]");
+	strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
 
 
 	if (rr->magic == RISCIX_MAGIC) {
 		unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 		int part;
 
-		printk(" <");
+		strlcat(state->pp_buf, " <", PAGE_SIZE);
 
 		put_partition(state, slot++, first_sect, size);
 		for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
 				put_partition(state, slot++,
 					le32_to_cpu(rr->part[part].start),
 					le32_to_cpu(rr->part[part].length));
-				printk("(%s)", rr->part[part].name);
+				strlcat(state->pp_buf, "(", PAGE_SIZE);
+				strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
+				strlcat(state->pp_buf, ")", PAGE_SIZE);
 			}
 		}
 
-		printk(" >\n");
+		strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 	} else {
 		put_partition(state, slot++, first_sect, nr_sects);
 	}
@@ -123,23 +128,23 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
     defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 	Sector sect;
 	struct linux_part *linuxp;
 	unsigned long size = nr_sects > 2 ? 2 : nr_sects;
 
-	printk(" [Linux]");
+	strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
 
 	put_partition(state, slot++, first_sect, size);
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
-	printk(" <");
+	strlcat(state->pp_buf, " <", PAGE_SIZE);
 	while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
 	       linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
 		if (slot == state->limit)
@@ -149,7 +154,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 			      le32_to_cpu(linuxp->nr_sects));
 		linuxp ++;
 	}
-	printk(" >");
+	strlcat(state->pp_buf, " >", PAGE_SIZE);
 
 	put_dev_sector(sect);
 	return slot;
@@ -157,8 +162,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +189,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +221,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-					       slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
+					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +253,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  * hda1 = ADFS partition on first drive.
  * hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +262,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,25 +281,25 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-					       slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -308,10 +311,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +353,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
  */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +363,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -369,7 +372,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 	}
 
-	printk(" [ICS]");
+	strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
 
 	for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -392,7 +395,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is.  We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -403,7 +406,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -446,8 +449,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
  */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +457,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -464,7 +466,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 		return 0;
 	}
 
-	printk(" [POWERTEC]");
+	strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
 
 	for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
 		u32 start = le32_to_cpu(p->start);
@@ -475,7 +477,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	}
 
 	put_dev_sector(sect);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
 #endif
@@ -508,8 +510,7 @@ static const char eesox_name[] = {
  * 1. The individual ADFS boot block entries that are placed on the disk.
  * 2. The start address of the next entry.
  */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +519,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,9 +546,9 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
-		printk("\n");
+		strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	}
 
 	return i ? 1 : 0;
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
  * format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
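Across all of these parsers the bdev argument is folded into parsed_partitions, and the incremental printk() fragments become strlcat()/snprintf() appends to state->pp_buf so check.c can print the whole line in one piece. A userspace sketch of that accumulation pattern — my_strlcat() mirrors the kernel strlcat() semantics since glibc historically lacks it, and the buffer and PAGE_SIZE value are stand-ins:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Same contract as the kernel's strlcat(): append with truncation,
 * always NUL-terminate, return the length it tried to create. */
static size_t my_strlcat(char *dst, const char *src, size_t size)
{
	size_t dlen = strlen(dst);

	if (dlen >= size)
		return dlen + strlen(src);
	snprintf(dst + dlen, size - dlen, "%s", src);
	return dlen + strlen(src);
}

int main(void)
{
	char *pp_buf = calloc(1, PAGE_SIZE);	/* models state->pp_buf */

	if (!pp_buf)
		return 1;
	snprintf(pp_buf, PAGE_SIZE, " %s:", "sda");	/* check_partition() seeds it */
	my_strlcat(pp_buf, " [ICS]", PAGE_SIZE);	/* a parser tags its format */
	my_strlcat(pp_buf, " sda1 sda2", PAGE_SIZE);	/* put_partition() appends slots */
	my_strlcat(pp_buf, "\n", PAGE_SIZE);
	printf("%s", pp_buf);	/* one atomic print at the end, as check.c does */
	free(pp_buf);
	return 0;
}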
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..70cbf44a1560 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,22 +63,28 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-		       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
 	blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
 
-	printk(" RDSK (%d)", blksize * 512);	/* Be more informative */
+	{
+		char tmp[7 + 10 + 1 + 1];
+
+		/* Be more informative */
+		snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
+		strlcat(state->pp_buf, tmp, PAGE_SIZE);
+	}
 	blk = be32_to_cpu(rdb->rdb_PartitionList);
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -107,23 +112,27 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		{
 			/* Be even more informative to aid mounting */
 			char dostype[4];
+			char tmp[42];
+
 			__be32 *dt = (__be32 *)dostype;
 			*dt = pb->pb_Environment[16];
 			if (dostype[3] < ' ')
-				printk(" (%c%c%c^%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3] + '@' );
 			else
-				printk(" (%c%c%c%c)",
+				snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
 					dostype[0], dostype[1],
 					dostype[2], dostype[3]);
-			printk("(res %d spb %d)",
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
+			snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
 				be32_to_cpu(pb->pb_Environment[6]),
 				be32_to_cpu(pb->pb_Environment[4]));
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 		}
 		res = 1;
 	}
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 rdb_done:
 	return res;
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
  * fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..9875b05e80a2 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	pi = &rs->part[0];
-	printk (" AHDI");
+	strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
 	for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
 		struct rootsector *xrs;
 		Sector sect2;
@@ -81,10 +81,10 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 #ifdef ICD_PARTS
 			part_fmt = 1;
 #endif
-			printk(" XGM<");
+			strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
 			partsect = extensect = be32_to_cpu(pi->st);
 			while (1) {
-				xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+				xrs = read_part_sector(state, partsect, &sect2);
 				if (!xrs) {
 					printk (" block %ld read failed\n", partsect);
 					put_dev_sector(sect);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 					break;
 				}
 			}
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 		}
 #ifdef ICD_PARTS
 	if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
 		pi = &rs->icdpart[0];
 		/* sanity check: no ICD format if first partition invalid */
 		if (OK_id(pi->id)) {
-			printk(" ICD<");
+			strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
 			for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
 				/* accept only GEM,BGM,RAW,LNX,SWP partitions */
 				if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 					      be32_to_cpu(pi->st),
 					      be32_to_cpu(pi->siz));
 			}
-			printk(" >");
+			strlcat(state->pp_buf, " >", PAGE_SIZE);
 		}
 	}
 #endif
 	put_dev_sector(sect);
 
-	printk ("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 
 	return 1;
 }
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
 	u16 checksum;			/* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..79fbf3f390f0 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -161,12 +161,19 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
+	state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
+	if (!state->pp_buf) {
+		kfree(state);
+		return NULL;
+	}
+	state->pp_buf[0] = '\0';
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
-	printk(KERN_INFO " %s:", state->name);
+	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
@@ -174,7 +181,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 			 * But record it, and let the others do their job.
@@ -184,15 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 		}
 
 	}
-	if (res > 0)
+	if (res > 0) {
+		printk(KERN_INFO "%s", state->pp_buf);
+
+		free_page((unsigned long)state->pp_buf);
 		return state;
+	}
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
 	if (!res)
-		printk(" unknown partition table\n");
+		strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
 	else if (warn_no_part)
-		printk(" unable to read partition table\n");
+		strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
+
+	printk(KERN_INFO "%s", state->pp_buf);
+
+	free_page((unsigned long)state->pp_buf);
 	kfree(state);
 	return ERR_PTR(res);
 }
@@ -456,7 +473,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	INIT_RCU_HEAD(&p->rcu_head);
 	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
@@ -538,12 +554,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -562,8 +599,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * successfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +642,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -589,30 +650,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
 				/*
 				 * we can not ignore partitions of broken tables
@@ -620,7 +672,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}
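The old in-place try_scan retry is replaced by a rescan label plus the new disk_unlock_native_capacity() helper: any beyond-EOD condition, whether an I/O error (-ENOSPC), a partial table read, or an oversized partition, funnels into one unlock-and-retry path that fires at most once per disk thanks to the GENHD_FL_NATIVE_CAPACITY flag. A userspace model of that control flow — struct disk and scan() are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

struct disk {
	bool native_capacity;	/* models GENHD_FL_NATIVE_CAPACITY */
	long capacity;		/* sectors currently visible */
	long real_capacity;	/* sectors hidden behind e.g. an HPA */
};

/* Unlock at most once; a second failure reports truncation instead. */
static bool disk_unlock_native_capacity(struct disk *d)
{
	if (!d->native_capacity) {
		printf("enabling native capacity\n");
		d->capacity = d->real_capacity;
		d->native_capacity = true;
		return true;		/* caller should rescan */
	}
	printf("truncated\n");
	return false;
}

/* Stand-in for a partition scan: fails if the table points past EOD. */
static bool scan(const struct disk *d, long table_end)
{
	return table_end <= d->capacity;
}

int main(void)
{
	struct disk d = { false, 1000, 2000 };
	long table_end = 1500;	/* table claims sectors past visible EOD */

rescan:
	if (!scan(&d, table_end)) {
		printf("partition table beyond EOD, ");
		if (disk_unlock_native_capacity(&d))
			goto rescan;
		return 1;
	}
	printf("scan ok at capacity %ld\n", d.capacity);
	return 0;
}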
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..8e4e103ba216 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,15 +15,30 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
+	char *pp_buf;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
 	if (n < p->limit) {
+		char tmp[1 + BDEVNAME_SIZE + 10 + 1];
+
 		p->parts[n].from = from;
 		p->parts[n].size = size;
-		printk(" %s%d", p->name, n);
+		snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
+		strlcat(p->pp_buf, tmp, PAGE_SIZE);
 	}
 }
 
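read_part_sector() is the linchpin of the whole series: it refuses any read at or past the device capacity and records the attempt in access_beyond_eod, which is what later lets rescan_partitions() decide to unlock native capacity. A userspace model of just that gate — struct state stands in for parsed_partitions:

#include <stdio.h>
#include <stdbool.h>

struct state {
	long capacity;		/* models get_capacity(bdev->bd_disk) */
	bool access_beyond_eod;
};

/* Refuse beyond-EOD reads but remember that one was attempted. */
static const char *read_part_sector(struct state *s, long n)
{
	static const char sector[512] = "data";

	if (n >= s->capacity) {
		s->access_beyond_eod = true;
		return NULL;
	}
	return sector;
}

int main(void)
{
	struct state s = { 100, false };

	printf("%s\n", read_part_sector(&s, 10) ? "ok" : "null");	/* ok */
	printf("%s\n", read_part_sector(&s, 100) ? "ok" : "null");	/* null */
	printf("beyond_eod=%d\n", s.access_beyond_eod);			/* 1 */
	return 0;
}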
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..dbb44d4bb8a7 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  * the part[0] entry for this disk, and is the number of
  * physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
 		return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
  *
  * Description: Returns ptes on success, NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
  * Notes: remember to free pte when you're done!
  */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
 			(u8 *) pte,
 			count) < count) {
 		kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
  *
  * Description: returns GPT header on success, NULL on error.   Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
  * Note: remember to free gpt when finished with it.
  */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
 		gpt=NULL;
 		return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
  * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
  * This protects against devices which misreport their size, and forces
  * the user to decide to use the Alternate GPT.
  */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (!force_gpt) {
 		/* This will be added to the EFI Spec. per Intel after v1.02. */
 		legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
 		if (legacymbr) {
-			read_lba(bdev, 0, (u8 *) legacymbr,
+			read_lba(state, 0, (u8 *) legacymbr,
 				 sizeof (*legacymbr));
 			good_pmbr = is_pmbr_valid(legacymbr);
 			kfree(legacymbr);
 		}
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 			goto fail;
 	}
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
 	if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
 	if (!good_agpt && force_gpt)
-		good_agpt = is_gpt_valid(bdev, lastlba,
-					 &agpt, &aptes);
+		good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
 	/* The obviously unsuccessful case */
 	if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
  *  1 if successful
  *
  */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,10 +626,10 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
-			state->parts[i+1].flags = 1;
+			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 	}
 	kfree(ptes);
 	kfree(gpt);
-	printk("\n");
+	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 	return 1;
 }
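Worth noting in efi_partition(): GPT LBAs are in logical-block units while put_partition() and read_part_sector() work in 512-byte sectors, hence the ssz = bdev_logical_block_size()/512 scale factor applied to both start and size. A plain arithmetic sketch of that conversion — the 4096-byte logical block size is just an example value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned lbs = 4096;		/* e.g. a 4Kn disk */
	unsigned ssz = lbs / 512;	/* scale factor, as in efi_partition() */
	uint64_t start_lba = 34, size_lba = 2048;

	/* These are the values handed to put_partition(). */
	printf("start sector %llu, size %llu sectors\n",
	       (unsigned long long)(start_lba * ssz),
	       (unsigned long long)(size_lba * ssz));
	return 0;
}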
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..d513a07f44bb 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -74,6 +74,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} *label;
 	unsigned char *data;
 	Sector sect;
+	sector_t labelsect;
+	char tmp[64];
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -98,9 +100,19 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_freeall;
 
 	/*
+	 * Special case for FBA disks: label sector does not depend on
+	 * blocksize.
+	 */
+	if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+	    (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+		labelsect = info->label_block;
+	else
+		labelsect = info->label_block * (blocksize >> 9);
+
+	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, labelsect, &sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -133,13 +145,15 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		 */
 		blocksize = label->cms.block_size;
 		if (label->cms.disk_offset != 0) {
-			printk("CMS1/%8s(MDSK):", name);
+			snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 			/* disk is reserved minidisk */
 			offset = label->cms.disk_offset;
 			size = (label->cms.block_count - 1)
 				* (blocksize >> 9);
 		} else {
-			printk("CMS1/%8s:", name);
+			snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
+			strlcat(state->pp_buf, tmp, PAGE_SIZE);
 			offset = (info->label_block + 1);
 			size = label->cms.block_count
 				* (blocksize >> 9);
@@ -148,7 +162,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
148 size-offset*(blocksize >> 9)); 162 size-offset*(blocksize >> 9));
149 } else { 163 } else {
150 if (strncmp(type, "LNX1", 4) == 0) { 164 if (strncmp(type, "LNX1", 4) == 0) {
151 printk("LNX1/%8s:", name); 165 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
166 strlcat(state->pp_buf, tmp, PAGE_SIZE);
152 if (label->lnx.ldl_version == 0xf2) { 167 if (label->lnx.ldl_version == 0xf2) {
153 fmt_size = label->lnx.formatted_blocks 168 fmt_size = label->lnx.formatted_blocks
154 * (blocksize >> 9); 169 * (blocksize >> 9);
@@ -167,7 +182,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
167 offset = (info->label_block + 1); 182 offset = (info->label_block + 1);
168 } else { 183 } else {
169 /* unlabeled disk */ 184 /* unlabeled disk */
170 printk("(nonl)"); 185 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
171 size = i_size >> 9; 186 size = i_size >> 9;
172 offset = (info->label_block + 1); 187 offset = (info->label_block + 1);
173 } 188 }
@@ -186,15 +201,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
186 * if not, something is wrong, skipping partition detection 201 * if not, something is wrong, skipping partition detection
187 */ 202 */
188 if (strncmp(type, "VOL1", 4) == 0) { 203 if (strncmp(type, "VOL1", 4) == 0) {
189 printk("VOL1/%8s:", name); 204 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
205 strlcat(state->pp_buf, tmp, PAGE_SIZE);
190 /* 206 /*
191 * get block number and read then go through format1 207 * get block number and read then go through format1
192 * labels 208 * labels
193 */ 209 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 210 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 211 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 212 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 213 &sect);
198 while (data != NULL) { 214 while (data != NULL) {
199 struct vtoc_format1_label f1; 215 struct vtoc_format1_label f1;
200 216
@@ -208,9 +224,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 224 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 225 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 226 blk++;
211 data = read_dev_sector(bdev, blk * 227 data = read_part_sector(state,
212 (blocksize/512), 228 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 229 continue;
215 } 230 }
216 231
@@ -230,9 +245,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 245 size * (blocksize >> 9));
231 counter++; 246 counter++;
232 blk++; 247 blk++;
233 data = read_dev_sector(bdev, 248 data = read_part_sector(state,
234 blk * (blocksize/512), 249 blk * (blocksize/512), &sect);
235 &sect);
236 } 250 }
237 251
238 if (!data) 252 if (!data)
@@ -244,7 +258,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
244 258
245 } 259 }
246 260
247 printk("\n"); 261 strlcat(state->pp_buf, "\n", PAGE_SIZE);
248 goto out_freeall; 262 goto out_freeall;
249 263
250 264
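
Every read_dev_sector(bdev, ...) call in these checkers becomes read_part_sector(state, ...). The helper itself is not shown in this diff; presumably it is a thin inline in fs/partitions/check.h that pulls the device out of the state, roughly like the sketch below (the stand-in types exist only to make the fragment self-contained, and the whole shape is an inference, not quoted code):

/* Stand-ins for the real kernel types so this compiles on its own. */
typedef unsigned long long sector_t;
typedef struct { void *v; } Sector;
struct block_device;
struct parsed_partitions {
        struct block_device *bdev;      /* device being scanned */
        char *pp_buf;                   /* PAGE_SIZE output buffer */
};

unsigned char *read_dev_sector(struct block_device *bdev, sector_t n,
                               Sector *p);

/* Assumed shape of the new helper: forward to read_dev_sector() using
 * the device carried in the state, so checkers no longer need a
 * separate bdev argument threaded through every call. */
static inline unsigned char *read_part_sector(struct parsed_partitions *state,
                                              sector_t n, Sector *p)
{
        return read_dev_sector(state->bdev, n, p);
}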
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..0ea19312706b 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
50 } 50 }
51 slot++; 51 slot++;
52 } 52 }
53 printk("\n"); 53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect); 54 put_dev_sector(sect);
55 return 1; 55 return 1;
56} 56}
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..5bf8a04b5d9b 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
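
The rewritten ldm_parse_hexbyte() drops the open-coded character-range tests in favour of hex_to_bin() from <linux/kernel.h> (hence the include added at the top of the file), which maps '0'-'9', 'a'-'f' and 'A'-'F' to 0-15 and anything else to a negative value. A self-contained userspace equivalent of the new logic, with hex_to_bin() reimplemented:

#include <stdio.h>

/* Userspace stand-in for the kernel's hex_to_bin(). */
static int hex_to_bin(char ch)
{
        if (ch >= '0' && ch <= '9')
                return ch - '0';
        if (ch >= 'a' && ch <= 'f')
                return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
                return ch - 'A' + 10;
        return -1;
}

/* Mirrors the new ldm_parse_hexbyte(): high nibble first, and any
 * non-hex character makes the whole byte invalid. */
static int parse_hexbyte(const char *src)
{
        int hi = hex_to_bin(src[0]);
        int lo = hex_to_bin(src[1]);

        if (hi < 0 || lo < 0)
                return -1;
        return (hi << 4) + lo;
}

int main(void)
{
        printf("%d\n", parse_hexbyte("4f"));    /* 79 */
        printf("%d\n", parse_hexbyte("zz"));    /* -1 */
        return 0;
}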
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
643 return false; 643 return false;
644 } 644 }
645 645
646 printk (" [LDM]"); 646 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
647 647
648 /* Create the data partitions */ 648 /* Create the data partitions */
649 list_for_each (item, &ldb->v_part) { 649 list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
658 part_num++; 658 part_num++;
659 } 659 }
660 660
661 printk ("\n"); 661 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
662 return true; 662 return true;
663} 663}
664 664
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..68d6a216ee79 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -59,12 +59,12 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
59 put_dev_sector(sect); 59 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 60 return 0; /* not a MacOS disk */
61 } 61 }
62 printk(" [mac]"); 62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 63 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,10 +123,11 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
130 printk("\n"); 131 strlcat(state->pp_buf, "\n", PAGE_SIZE);
131 return 1; 132 return 1;
132} 133}
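
Here, as in efi.c above and msdos.c below, the bare flags = 1 becomes ADDPART_FLAG_RAID: the flag travels with the registered partition and is what triggers md's RAID autodetection, so the named constant records the intent the magic number hid. A tiny sketch of the idiom; the flag values are an assumption about the genhd headers of this era, not quoted from this diff:

#include <stdio.h>

/* Assumed values; ADDPART_FLAG_RAID matching the old flags = 1. */
#define ADDPART_FLAG_NONE       0
#define ADDPART_FLAG_RAID       1
#define ADDPART_FLAG_WHOLEDISK  2

struct parsed_part { int flags; };

int main(void)
{
        struct parsed_part p = { ADDPART_FLAG_NONE };

        p.flags = ADDPART_FLAG_RAID;    /* was: p.flags = 1; */
        printf("%d\n", p.flags);
        return 0;
}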
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..5f79a6677c69 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,17 +206,25 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
215 put_dev_sector(sect); 213 put_dev_sector(sect);
216 return; 214 return;
217 } 215 }
218 printk(" %s%d: <solaris:", state->name, origin); 216 {
217 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
218
219 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
220 strlcat(state->pp_buf, tmp, PAGE_SIZE);
221 }
219 if (le32_to_cpu(v->v_version) != 1) { 222 if (le32_to_cpu(v->v_version) != 1) {
220 printk(" cannot handle version %d vtoc>\n", 223 char tmp[64];
221 le32_to_cpu(v->v_version)); 224
225 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
226 le32_to_cpu(v->v_version));
227 strlcat(state->pp_buf, tmp, PAGE_SIZE);
222 put_dev_sector(sect); 228 put_dev_sector(sect);
223 return; 229 return;
224 } 230 }
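
The odd-looking bound char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1] above is worst-case sizing for the snprintf() that fills it: one leading space, up to BDEVNAME_SIZE characters of state->name, up to ten digits for origin, the eleven characters of the ": <solaris:" literal, and the terminating NUL. parse_unixware() and parse_minix() below size their buffers the same way, with 12 and 9 for their literals. A quick check of the arithmetic, taking BDEVNAME_SIZE as the 32 the kernel used at the time (an assumption, not shown in this diff):

#include <stdio.h>
#include <string.h>

#define BDEVNAME_SIZE 32        /* assumed era value */

int main(void)
{
        /* 1 space + name + 10 digits + strlen(": <solaris:") + NUL */
        char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];

        snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", "sda", 1);
        printf("%zu bytes used of %zu\n", strlen(tmp) + 1, sizeof(tmp));
        return 0;
}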
@@ -226,9 +232,12 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
226 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; 232 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
227 for (i=0; i<max_nparts && state->next<state->limit; i++) { 233 for (i=0; i<max_nparts && state->next<state->limit; i++) {
228 struct solaris_x86_slice *s = &v->v_slice[i]; 234 struct solaris_x86_slice *s = &v->v_slice[i];
235 char tmp[3 + 10 + 1 + 1];
236
229 if (s->s_size == 0) 237 if (s->s_size == 0)
230 continue; 238 continue;
231 printk(" [s%d]", i); 239 snprintf(tmp, sizeof(tmp), " [s%d]", i);
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
232 /* solaris partitions are relative to current MS-DOS 241 /* solaris partitions are relative to current MS-DOS
233 * one; must add the offset of the current partition */ 242 * one; must add the offset of the current partition */
234 put_partition(state, state->next++, 243 put_partition(state, state->next++,
@@ -236,7 +245,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
236 le32_to_cpu(s->s_size)); 245 le32_to_cpu(s->s_size));
237 } 246 }
238 put_dev_sector(sect); 247 put_dev_sector(sect);
239 printk(" >\n"); 248 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
240#endif 249#endif
241} 250}
242 251
@@ -245,23 +254,25 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 254 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 255 * dos-like partition. See parse_extended() for more information.
247 */ 256 */
248static void 257static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 258 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 259 int max_partitions)
251 int max_partitions)
252{ 260{
253 Sector sect; 261 Sector sect;
254 struct bsd_disklabel *l; 262 struct bsd_disklabel *l;
255 struct bsd_partition *p; 263 struct bsd_partition *p;
264 char tmp[64];
256 265
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 266 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 267 if (!l)
259 return; 268 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 269 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
261 put_dev_sector(sect); 270 put_dev_sector(sect);
262 return; 271 return;
263 } 272 }
264 printk(" %s%d: <%s:", state->name, origin, flavour); 273
274 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
275 strlcat(state->pp_buf, tmp, PAGE_SIZE);
265 276
266 if (le16_to_cpu(l->d_npartitions) < max_partitions) 277 if (le16_to_cpu(l->d_npartitions) < max_partitions)
267 max_partitions = le16_to_cpu(l->d_npartitions); 278 max_partitions = le16_to_cpu(l->d_npartitions);
@@ -278,46 +289,43 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
278 /* full parent partition, we have it already */ 289 /* full parent partition, we have it already */
279 continue; 290 continue;
280 if (offset > bsd_start || offset+size < bsd_start+bsd_size) { 291 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
281 printk("bad subpartition - ignored\n"); 292 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
282 continue; 293 continue;
283 } 294 }
284 put_partition(state, state->next++, bsd_start, bsd_size); 295 put_partition(state, state->next++, bsd_start, bsd_size);
285 } 296 }
286 put_dev_sector(sect); 297 put_dev_sector(sect);
287 if (le16_to_cpu(l->d_npartitions) > max_partitions) 298 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
288 printk(" (ignored %d more)", 299 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
289 le16_to_cpu(l->d_npartitions) - max_partitions); 300 le16_to_cpu(l->d_npartitions) - max_partitions);
290 printk(" >\n"); 301 strlcat(state->pp_buf, tmp, PAGE_SIZE);
302 }
303 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
291} 304}
292#endif 305#endif
293 306
294static void 307static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 309{
298#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 312#endif
302} 313}
303 314
304static void 315static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 316 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 317{
308#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 320#endif
312} 321}
313 322
314static void 323static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 324 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 325{
318#ifdef CONFIG_BSD_DISKLABEL 326#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 327 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 328 OPENBSD_MAXPARTITIONS);
321#endif 329#endif
322} 330}
323 331
@@ -325,16 +333,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 333 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 334 * dos-like partition. See parse_extended() for more information.
327 */ 335 */
328static void 336static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 337 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 338{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 339#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 340 Sector sect;
334 struct unixware_disklabel *l; 341 struct unixware_disklabel *l;
335 struct unixware_slice *p; 342 struct unixware_slice *p;
336 343
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 344 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 345 if (!l)
339 return; 346 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 347 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -342,7 +349,12 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
342 put_dev_sector(sect); 349 put_dev_sector(sect);
343 return; 350 return;
344 } 351 }
345 printk(" %s%d: <unixware:", state->name, origin); 352 {
353 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
354
355 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
356 strlcat(state->pp_buf, tmp, PAGE_SIZE);
357 }
346 p = &l->vtoc.v_slice[1]; 358 p = &l->vtoc.v_slice[1];
347 /* I omit the 0th slice as it is the same as whole disk. */ 359 /* I omit the 0th slice as it is the same as whole disk. */
348 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { 360 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -356,7 +368,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
356 p++; 368 p++;
357 } 369 }
358 put_dev_sector(sect); 370 put_dev_sector(sect);
359 printk(" >\n"); 371 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
360#endif 372#endif
361} 373}
362 374
@@ -365,9 +377,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 377 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 378 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 379 */
368static void 380static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 381 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 382{
372#ifdef CONFIG_MINIX_SUBPARTITION 383#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 384 Sector sect;
@@ -375,7 +386,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 386 struct partition *p;
376 int i; 387 int i;
377 388
378 data = read_dev_sector(bdev, offset, &sect); 389 data = read_part_sector(state, offset, &sect);
379 if (!data) 390 if (!data)
380 return; 391 return;
381 392
@@ -386,8 +397,10 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
386 * the normal boot sector. */ 397 * the normal boot sector. */
387 if (msdos_magic_present (data + 510) && 398 if (msdos_magic_present (data + 510) &&
388 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ 399 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
400 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
389 401
390 printk(" %s%d: <minix:", state->name, origin); 402 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
403 strlcat(state->pp_buf, tmp, PAGE_SIZE);
391 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { 404 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
392 if (state->next == state->limit) 405 if (state->next == state->limit)
393 break; 406 break;
@@ -396,7 +409,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
396 put_partition(state, state->next++, 409 put_partition(state, state->next++,
397 start_sect(p), nr_sects(p)); 410 start_sect(p), nr_sects(p));
398 } 411 }
399 printk(" >\n"); 412 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
400 } 413 }
401 put_dev_sector(sect); 414 put_dev_sector(sect);
402#endif /* CONFIG_MINIX_SUBPARTITION */ 415#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -404,8 +417,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 417
405static struct { 418static struct {
406 unsigned char id; 419 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 420 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 421} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 422 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 423 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +429,16 @@ static struct {
417 {0, NULL}, 429 {0, NULL},
418}; 430};
419 431
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 432int msdos_partition(struct parsed_partitions *state)
421{ 433{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 434 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 435 Sector sect;
424 unsigned char *data; 436 unsigned char *data;
425 struct partition *p; 437 struct partition *p;
426 struct fat_boot_sector *fb; 438 struct fat_boot_sector *fb;
427 int slot; 439 int slot;
428 440
429 data = read_dev_sector(bdev, 0, &sect); 441 data = read_part_sector(state, 0, &sect);
430 if (!data) 442 if (!data)
431 return -1; 443 return -1;
432 if (!msdos_magic_present(data + 510)) { 444 if (!msdos_magic_present(data + 510)) {
@@ -434,9 +446,9 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 446 return 0;
435 } 447 }
436 448
437 if (aix_magic_present(data, bdev)) { 449 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 450 put_dev_sector(sect);
439 printk( " [AIX]"); 451 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
440 return 0; 452 return 0;
441 } 453 }
442 454
@@ -457,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
457 fb = (struct fat_boot_sector *) data; 469 fb = (struct fat_boot_sector *) data;
458 if (slot == 1 && fb->reserved && fb->fats 470 if (slot == 1 && fb->reserved && fb->fats
459 && fat_valid_media(fb->media)) { 471 && fat_valid_media(fb->media)) {
460 printk("\n"); 472 strlcat(state->pp_buf, "\n", PAGE_SIZE);
461 put_dev_sector(sect); 473 put_dev_sector(sect);
462 return 1; 474 return 1;
463 } else { 475 } else {
@@ -502,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
502 n = min(size, max(sector_size, n)); 514 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n); 515 put_partition(state, slot, start, n);
504 516
505 printk(" <"); 517 strlcat(state->pp_buf, " <", PAGE_SIZE);
506 parse_extended(state, bdev, start, size); 518 parse_extended(state, start, size);
507 printk(" >"); 519 strlcat(state->pp_buf, " >", PAGE_SIZE);
508 continue; 520 continue;
509 } 521 }
510 put_partition(state, slot, start, size); 522 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 523 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 524 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 525 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 526 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
515 if (SYS_IND(p) == EZD_PARTITION) 527 if (SYS_IND(p) == EZD_PARTITION)
516 printk("[EZD]"); 528 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
517 } 529 }
518 530
519 printk("\n"); 531 strlcat(state->pp_buf, "\n", PAGE_SIZE);
520 532
521 /* second pass - output for each on a separate line */ 533 /* second pass - output for each on a separate line */
522 p = (struct partition *) (0x1be + data); 534 p = (struct partition *) (0x1be + data);
@@ -532,8 +544,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 544
533 if (!subtypes[n].parse) 545 if (!subtypes[n].parse)
534 continue; 546 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 547 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 548 nr_sects(p) * sector_size, slot);
537 } 549 }
538 put_dev_sector(sect); 550 put_dev_sector(sect);
539 return 1; 551 return 1;
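
With the device folded into the parse state, the subtypes[] function-pointer type above shrinks to (state, offset, size, origin) and each parse_* helper reads sectors through read_part_sector(state, ...) on its own. A reduced, compilable model of that table-driven dispatch (the struct layout, handler body and the 0xa5 FreeBSD type ID are illustrative, not quoted from the diff):

#include <stdio.h>

typedef unsigned long long sector_t;

struct parsed_partitions { const char *name; };

static void parse_freebsd(struct parsed_partitions *state,
                          sector_t offset, sector_t size, int origin)
{
        printf(" %s%d: <bsd @%llu+%llu>", state->name, origin, offset, size);
}

/* Same shape as the new subtypes[] table: no block_device in sight. */
static struct {
        unsigned char id;
        void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
        { 0xa5, parse_freebsd },        /* FREEBSD_PARTITION */
        { 0, NULL },
};

int main(void)
{
        struct parsed_partitions state = { "sda" };
        unsigned char id = 0xa5;

        for (int n = 0; subtypes[n].parse; n++)
                if (subtypes[n].id == id)
                        subtypes[n].parse(&state, 63, 1024, 1);
        printf("\n");
        return 0;
}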
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..48cec7cbca17 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
72 le32_to_cpu(partition->p_size)); 72 le32_to_cpu(partition->p_size));
73 slot++; 73 slot++;
74 } 74 }
75 printk("\n"); 75 strlcat(state->pp_buf, "\n", PAGE_SIZE);
76 put_dev_sector(sect); 76 put_dev_sector(sect);
77 return 1; 77 return 1;
78} 78}
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..ea8a86dceaf4 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
76 } 76 }
77 slot++; 77 slot++;
78 } 78 }
79 printk("\n"); 79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect); 80 put_dev_sector(sect);
81 return 1; 81 return 1;
82} 82}
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..b5b6fcfb3d36 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
116 } 116 }
117 slot++; 117 slot++;
118 } 118 }
119 printk("\n"); 119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect); 120 put_dev_sector(sect);
121 return 1; 121 return 1;
122} 122}
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9627ccffc1c4 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -54,8 +54,9 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
54 unsigned char *data; 54 unsigned char *data;
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 char tmp[64];
57 58
58 data = read_dev_sector(bdev, 0, &sect); 59 data = read_part_sector(state, 0, &sect);
59 if (!data) 60 if (!data)
60 return -1; 61 return -1;
61 62
@@ -68,12 +69,13 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 69 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 70 put_dev_sector(sect);
70 71
71 data = read_dev_sector(bdev, i, &sect); 72 data = read_part_sector(state, i, &sect);
72 if (!data) 73 if (!data)
73 return -1; 74 return -1;
74 75
75 slices -= 1; /* last slice is the whole disk */ 76 slices -= 1; /* last slice is the whole disk */
76 printk("sysV68: %s(s%u)", state->name, slices); 77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
77 slice = (struct slice *)data; 79 slice = (struct slice *)data;
78 for (i = 0; i < slices; i++, slice++) { 80 for (i = 0; i < slices; i++, slice++) {
79 if (slot == state->limit) 81 if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
82 put_partition(state, slot, 84 put_partition(state, slot,
83 be32_to_cpu(slice->blkoff), 85 be32_to_cpu(slice->blkoff),
84 be32_to_cpu(slice->nblocks)); 86 be32_to_cpu(slice->nblocks));
85 printk("(s%u)", i); 87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
86 } 89 }
87 slot++; 90 slot++;
88 } 91 }
89 printk("\n"); 92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
90 put_dev_sector(sect); 93 put_dev_sector(sect);
91 return 1; 94 return 1;
92} 95}
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..8dbaf9f77a99 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
39 label->pt_part[i].pi_blkoff, 39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks); 40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect); 41 put_dev_sector(sect);
42 printk ("\n"); 42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1; 43 return 1;
44 } else { 44 } else {
45 put_dev_sector(sect); 45 put_dev_sector(sect);
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
@@ -18,11 +19,23 @@
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
 /*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-size
+ */
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
+
+/*
  * We use a start+len construction, which provides full use of the
  * allocated memory.
  * -- Florian Coosmann (FGC)
@@ -222,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 
 	return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 	} else
 		kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 	return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
 	return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
@@ -390,7 +409,7 @@ redo:
 			if (!buf->len) {
 				buf->ops = NULL;
 				ops->release(pipe, buf);
-				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
 				pipe->curbuf = curbuf;
 				pipe->nrbufs = --bufs;
 				do_wakeup = 1;
@@ -472,7 +491,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	if (pipe->nrbufs && chars != 0) {
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(PIPE_BUFFERS-1);
+							(pipe->buffers - 1);
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		const struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
@@ -518,8 +537,8 @@ redo1:
 			break;
 		}
 		bufs = pipe->nrbufs;
-		if (bufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
 			char *src;
@@ -580,7 +599,7 @@ redo2:
 			if (!total_len)
 				break;
 		}
-		if (bufs < PIPE_BUFFERS)
+		if (bufs < pipe->buffers)
 			continue;
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
@@ -640,7 +659,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		nrbufs = pipe->nrbufs;
 		while (--nrbufs >= 0) {
 			count += pipe->bufs[buf].len;
-			buf = (buf+1) & (PIPE_BUFFERS-1);
+			buf = (buf+1) & (pipe->buffers - 1);
 		}
 		mutex_unlock(&inode->i_mutex);
 
@@ -671,7 +690,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 		/*
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
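Every masking site above moves from the compile-time PIPE_BUFFERS to the per-pipe pipe->buffers; the `index & (buffers - 1)` wrap is only equivalent to a modulo when the capacity is a power of two, which round_pipe_size() (added further down) guarantees. A standalone illustration of that invariant:

#include <assert.h>
#include <stdio.h>

/* Advance a ring index by masking; correct only for power-of-two capacities. */
static unsigned int next_slot(unsigned int cur, unsigned int buffers)
{
	assert((buffers & (buffers - 1)) == 0);
	return (cur + 1) & (buffers - 1);
}

int main(void)
{
	unsigned int slot = 15;

	slot = next_slot(slot, 16);	/* wraps from 15 back to 0 */
	printf("%u\n", slot);
	return 0;
}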
@@ -877,25 +896,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
-		init_waitqueue_head(&pipe->wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->inode = inode;
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->inode = inode;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			return pipe;
+		}
+		kfree(pipe);
 	}
 
-	return pipe;
+	return NULL;
 }
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 			buf->ops->release(pipe, buf);
 	}
 	if (pipe->tmp_page)
 		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
 	kfree(pipe);
 }
@@ -1094,6 +1120,126 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 }
 
 /*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (nr_pages < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		unsigned int tail;
+		unsigned int head;
+
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+	if (!pipe)
+		return -EBADF;
+
+	mutex_lock(&pipe->inode->i_mutex);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
+		break;
+	}
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers * PAGE_SIZE;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	mutex_unlock(&pipe->inode->i_mutex);
+	return ret;
+}
+
+/*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
  * any operations on the root directory. However, we need a non-trivial
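pipe_fcntl() above is the kernel half of the new F_SETPIPE_SZ/F_GETPIPE_SZ commands: the request is rounded up to a power-of-two number of pages, unprivileged callers are capped at pipe_max_size (tunable via /proc/sys/fs/pipe-max-size through pipe_proc_fn), and shrinking below the data currently queued fails with EBUSY. A minimal userspace sketch, assuming a glibc/header combination that exposes the two constants (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long size;

	if (pipe(fds) < 0)
		return 1;

	/* Ask for 1 MiB; the kernel rounds to pages and enforces pipe_max_size
	 * for callers without CAP_SYS_RESOURCE. */
	if (fcntl(fds[1], F_SETPIPE_SZ, 1048576) < 0)
		perror("F_SETPIPE_SZ");

	size = fcntl(fds[1], F_GETPIPE_SZ);	/* pipe->buffers * PAGE_SIZE */
	printf("pipe capacity: %ld bytes\n", size);

	close(fds[0]);
	close(fds[1]);
	return 0;
}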
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
 	return 0;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
 		prev_src_mnt = child;
 	}
 out:
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
  * other mounts its parent propagates to.
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 * collect all mounts that receive propagation from the mount in @list,
 * and return these additional mounts in the same list.
 * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
 */
 int propagate_umount(struct list_head *list)
 {
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c68153..2758e2afc518 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux proc filesystem routines.
 #
 
-obj-$(CONFIG_PROC_FS) += proc.o
+obj-y   += proc.o
 
 proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..fff6572676ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
-	cred = get_cred((struct cred *) __task_cred(p));
+	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
-		num_threads = atomic_read(&p->signal->count);
+		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
 		rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			tty_nr = new_encode_dev(tty_devnum(sig->tty));
 		}
 
-		num_threads = atomic_read(&sig->count);
+		num_threads = get_nr_threads(task);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 		cmin_flt = sig->cmin_flt;
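Both task_sig() and do_task_stat() stop reading signal->count directly and use the shared get_nr_threads() helper (whose private duplicate is deleted from fs/proc/base.c below). Its presumed definition in <linux/sched.h> in this series is a plain field read:

/* assumed shape of the core helper; its definition is not part of this diff */
static inline int get_nr_threads(struct task_struct *tsk)
{
	return tsk->signal->nr_threads;
}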
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8418fcc0a6ab..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -148,43 +149,31 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-	struct fs_struct *fs;
 	int result = -ENOENT;
 
 	task_lock(task);
-	fs = task->fs;
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = root ? fs->root : fs->pwd;
-		path_get(path);
-		read_unlock(&fs->lock);
+	if (task->fs) {
+		get_fs_root(task->fs, root);
 		result = 0;
 	}
 	task_unlock(task);
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 0);
+		task_lock(task);
+		if (task->fs) {
+			get_fs_pwd(task->fs, path);
+			result = 0;
+		}
+		task_unlock(task);
 		put_task_struct(task);
 	}
 	return result;
@@ -196,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	int result = -ENOENT;
 
 	if (task) {
-		result = get_fs_path(task, path, 1);
+		result = get_task_root(task, path);
 		put_task_struct(task);
 	}
 	return result;
@@ -439,17 +428,14 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
-/* The badness from the OOM killer */
-unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
-	struct timespec uptime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = badness(task, uptime.tv_sec);
+		points = oom_badness(task, NULL, NULL,
+					totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -573,9 +559,19 @@ static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
-	return error;
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static const struct inode_operations proc_def_inode_operations = {
@@ -601,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 		get_mnt_ns(ns);
 	}
 	rcu_read_unlock();
-	if (ns && get_fs_path(task, &root, 1) == 0)
+	if (ns && get_task_root(task, &root) == 0)
 		ret = 0;
 	put_task_struct(task);
 }
@@ -730,6 +726,7 @@ out_no_task:
 
 static const struct file_operations proc_info_file_operations = {
 	.read		= proc_info_read,
+	.llseek		= generic_file_llseek,
 };
 
 static int proc_single_show(struct seq_file *m, void *v)
@@ -987,6 +984,7 @@ out_no_task:
 
 static const struct file_operations proc_environ_operations = {
 	.read		= environ_read,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1049,8 +1047,24 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EACCES;
 	}
 
+	/*
+	 * Warn that /proc/pid/oom_adj is deprecated, see
+	 * Documentation/feature-removal-schedule.txt.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
+			"please use /proc/%d/oom_score_adj instead.\n",
+			current->comm, task_pid_nr(current),
+			task_pid_nr(task), task_pid_nr(task));
 	task->signal->oom_adj = oom_adjust;
-
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (task->signal->oom_adj == OOM_ADJUST_MAX)
+		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
+	else
+		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
+								-OOM_DISABLE;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1060,6 +1074,83 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 static const struct file_operations proc_oom_adjust_operations = {
 	.read		= oom_adjust_read,
 	.write		= oom_adjust_write,
+	.llseek		= generic_file_llseek,
+};
+
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	long oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		return -EINVAL;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX)
+		return -EINVAL;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	if (!lock_task_sighand(task, &flags)) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+	if (oom_score_adj < task->signal->oom_score_adj &&
+			!capable(CAP_SYS_RESOURCE)) {
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EACCES;
+	}
+
+	task->signal->oom_score_adj = oom_score_adj;
+	/*
+	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
+	 * always attainable.
+	 */
+	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		task->signal->oom_adj = OOM_DISABLE;
+	else
+		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
+							OOM_SCORE_ADJ_MAX;
+	unlock_task_sighand(task, &flags);
+	put_task_struct(task);
+	return count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
 };
 
 #ifdef CONFIG_AUDITSYSCALL
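The two write paths above keep the legacy oom_adj range (OOM_DISABLE..OOM_ADJUST_MAX, i.e. -17..15) and the new oom_score_adj range (-1000..1000) consistent by linear scaling, special-casing the top value because integer division alone cannot reach it. An illustrative standalone program using the kernel's constants:

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

/* Forward mapping as in oom_adjust_write(): 8 -> 470, -17 -> -1000;
 * 15 would scale to only 882, hence the explicit maximum case. */
static int adj_to_score_adj(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

int main(void)
{
	int adj;

	for (adj = OOM_DISABLE; adj <= OOM_ADJUST_MAX; adj++)
		printf("oom_adj %3d -> oom_score_adj %5d\n",
		       adj, adj_to_score_adj(adj));
	return 0;
}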
@@ -1131,6 +1222,7 @@ out_free_page:
 static const struct file_operations proc_loginuid_operations = {
 	.read		= proc_loginuid_read,
 	.write		= proc_loginuid_write,
+	.llseek		= generic_file_llseek,
 };
 
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
@@ -1151,6 +1243,7 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 
 static const struct file_operations proc_sessionid_operations = {
 	.read		= proc_sessionid_read,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1202,6 +1295,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 static const struct file_operations proc_fault_inject_operations = {
 	.read		= proc_fault_inject_read,
 	.write		= proc_fault_inject_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -1432,7 +1526,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 	if (!tmp)
 		return -ENOMEM;
 
-	pathname = d_path(path, tmp, PAGE_SIZE);
+	pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
 	len = PTR_ERR(pathname);
 	if (IS_ERR(pathname))
 		goto out;
@@ -1943,7 +2037,7 @@ static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 }
 
 static const struct file_operations proc_fdinfo_file_operations = {
-	.open		= nonseekable_open,
+	.open           = nonseekable_open,
 	.read		= proc_fdinfo_read,
 };
 
@@ -2227,6 +2321,7 @@ out_no_task:
 static const struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
@@ -2347,6 +2442,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 static const struct file_operations proc_coredump_filter_operations = {
 	.read		= proc_coredump_filter_read,
 	.write		= proc_coredump_filter_write,
+	.llseek		= generic_file_llseek,
 };
 #endif
 
@@ -2436,7 +2532,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2579,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("auxv",       S_IRUSR, proc_pid_auxv),
 	ONE("status",     S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",     S_IRUSR, proc_pid_limits),
+	INF("limits",     S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2629,6 +2725,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -2786,7 +2883,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
@@ -2914,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
 	ONE("personality", S_IRUSR, proc_pid_personality),
-	INF("limits",    S_IRUSR, proc_pid_limits),
+	INF("limits",    S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
@@ -2963,6 +3060,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid", S_IRUSR, proc_sessionid_operations),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..dd29f0337661 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -258,17 +259,22 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 
 	error = inode_change_ok(inode, iattr);
 	if (error)
-		goto out;
+		return error;
 
-	error = inode_setattr(inode, iattr);
-	if (error)
-		goto out;
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, iattr->ia_size);
+		if (error)
+			return error;
+	}
+
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 
 	de->uid = inode->i_uid;
 	de->gid = inode->i_gid;
 	de->mode = inode->i_mode;
-out:
-	return error;
+	return 0;
 }
 
 static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
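This is the same inode_setattr() removal pattern applied to proc_setattr() and proc_sys_setattr() elsewhere in this merge: validate, truncate explicitly when ATTR_SIZE actually changes, then copy the remaining attributes and dirty the inode. The resulting skeleton, extracted for reference:

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}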
@@ -343,21 +349,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d35b23238fb1..9c2b5f484879 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -25,11 +25,12 @@
 
 #include "internal.h"
 
-static void proc_delete_inode(struct inode *inode)
+static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
 
 	truncate_inode_pages(&inode->i_data, 0);
+	end_writeback(inode);
 
 	/* Stop tracking associated processes */
 	put_pid(PROC_I(inode)->pid);
@@ -40,7 +41,6 @@ static void proc_delete_inode(struct inode *inode)
 		pde_put(de);
 	if (PROC_I(inode)->sysctl)
 		sysctl_head_put(PROC_I(inode)->sysctl);
-	clear_inode(inode);
 }
 
 struct vfsmount *proc_mnt;
@@ -91,7 +91,7 @@ static const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
 	.destroy_inode	= proc_destroy_inode,
 	.drop_inode	= generic_delete_inode,
-	.delete_inode	= proc_delete_inode,
+	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
 };
 
@@ -214,8 +214,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 {
 	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
 	long rv = -ENOTTY;
-	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
-	int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);
+	long (*ioctl)(struct file *, unsigned int, unsigned long);
 
 	spin_lock(&pde->pde_unload_lock);
 	if (!pde->proc_fops) {
@@ -223,19 +222,11 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
 		return rv;
 	}
 	pde->pde_users++;
-	unlocked_ioctl = pde->proc_fops->unlocked_ioctl;
-	ioctl = pde->proc_fops->ioctl;
+	ioctl = pde->proc_fops->unlocked_ioctl;
 	spin_unlock(&pde->pde_unload_lock);
 
-	if (unlocked_ioctl) {
-		rv = unlocked_ioctl(file, cmd, arg);
-		if (rv == -ENOIOCTLCMD)
-			rv = -EINVAL;
-	} else if (ioctl) {
-		lock_kernel();
-		rv = ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
-	}
+	if (ioctl)
+		rv = ioctl(file, cmd, arg);
 
 	pde_users_dec(pde);
 	return rv;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 19979a2ce272..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,6 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
 static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
+	.llseek		= generic_file_llseek,
 };
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -587,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index cfe90a48a6e8..bd4b5a740ff1 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -53,6 +53,7 @@ static const struct file_operations proc_kmsg_operations = {
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
+	.llseek		= generic_file_llseek,
 };
 
 static int __init proc_kmsg_init(void)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 180cf5a0bd67..3b8b45660331 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
 #endif
 
-#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
 	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
 #endif
 
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index ce94801f48ca..d9396a4fc7ff 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -209,6 +209,9 @@ void proc_device_tree_add_node(struct device_node *np,
 	for (pp = np->properties; pp != NULL; pp = pp->next) {
 		p = pp->name;
 
+		if (strchr(p, '/'))
+			continue;
+
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 6ff9981f0a18..5be436ea088e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -329,10 +329,19 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 		return -EPERM;
 
 	error = inode_change_ok(inode, attr);
-	if (!error)
-		error = inode_setattr(inode, attr);
+	if (error)
+		return error;
+
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
 
-	return error;
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..1dbca4e8cc16 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,6 +210,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	int flags = vma->vm_flags;
 	unsigned long ino = 0;
 	unsigned long long pgoff = 0;
+	unsigned long start;
 	dev_t dev = 0;
 	int len;
 
@@ -220,8 +221,14 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
+	/* We don't show the stack guard page in /proc/maps */
+	start = vma->vm_start;
+	if (vma->vm_flags & VM_GROWSDOWN)
+		if (!vma_stack_continue(vma->vm_prev, vma->vm_start))
+			start += PAGE_SIZE;
+
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-			vma->vm_start,
+			start,
 			vma->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
@@ -356,13 +363,13 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			mss->referenced += PAGE_SIZE;
 		mapcount = page_mapcount(page);
 		if (mapcount >= 2) {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->shared_dirty += PAGE_SIZE;
 			else
 				mss->shared_clean += PAGE_SIZE;
 			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
 		} else {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent) || PageDirty(page))
 				mss->private_dirty += PAGE_SIZE;
 			else
 				mss->private_clean += PAGE_SIZE;
@@ -634,6 +641,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 {
 	u64 pme = 0;
@@ -664,6 +672,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	return err;
 }
+#endif /* HUGETLB_PAGE */
 
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +742,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
 	pagemap_walk.mm = mm;
 	pagemap_walk.private = &pm;
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 46d4b5d72bd3..cb6306e63843 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -122,11 +122,20 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
+static void pad_len_spaces(struct seq_file *m, int len)
+{
+	len = 25 + sizeof(void*) * 6 - len;
+	if (len < 1)
+		len = 1;
+	seq_printf(m, "%*c", len, ' ');
+}
+
 /*
  * display a single VMA to a sequenced file
  */
 static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
@@ -155,11 +164,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
-		len = 25 + sizeof(void *) * 6 - len;
-		if (len < 1)
-			len = 1;
-		seq_printf(m, "%*c", len, ' ');
+		pad_len_spaces(m, len);
 		seq_path(m, &file->f_path, "");
+	} else if (mm) {
+		if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+			pad_len_spaces(m, len);
+			seq_puts(m, "[stack]");
+		}
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9fbc99ec799a..2367fb3f70bc 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -163,6 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
+	.llseek		= default_llseek,
 };
 
 static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 277575ddc05c..16829722be93 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -320,10 +320,19 @@ static int qnx4_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host);
+	int ret;
+
 	*pagep = NULL;
-	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				qnx4_get_block,
 				&qnx4_inode->mmu_private);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b5802a7ce..aad1316a977f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,7 +82,7 @@
 
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats, dqstats structure containing statistics about the lists
+ * and quota formats.
  * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -132,7 +132,25 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
 EXPORT_SYMBOL(dq_data_lock);
 
+void __quota_error(struct super_block *sb, const char *func,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	if (printk_ratelimit()) {
+		va_start(args, fmt);
+		printk(KERN_ERR "Quota error (device %s): %s: ",
+		       sb->s_id, func);
+		vprintk(fmt, args);
+		printk("\n");
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL(__quota_error);
+
+#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
 static char *quotatypes[] = INITQFNAMES;
+#endif
 static struct quota_format_type *quota_formats;	/* List of registered formats */
 static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
 
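__quota_error() takes the function name as an explicit argument, while the call sites further down pass only (sb, fmt, ...); presumably a companion macro supplies __func__. A hedged sketch of what that macro must look like (its header is not part of this diff):

#define quota_error(sb, fmt, args...) \
	__quota_error((sb), __func__, fmt , ## args)

/* typical call site, as in dqput() below: */
quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
	    quotatypes[dquot->dq_type], dquot->dq_id);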
@@ -273,7 +291,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
 static inline void put_dquot_last(struct dquot *dquot)
 {
 	list_add_tail(&dquot->dq_free, &free_dquots);
-	dqstats.free_dquots++;
+	dqstats_inc(DQST_FREE_DQUOTS);
 }
 
 static inline void remove_free_dquot(struct dquot *dquot)
@@ -281,7 +299,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
 	if (list_empty(&dquot->dq_free))
 		return;
 	list_del_init(&dquot->dq_free);
-	dqstats.free_dquots--;
+	dqstats_dec(DQST_FREE_DQUOTS);
 }
 
 static inline void put_inuse(struct dquot *dquot)
@@ -289,12 +307,12 @@ static inline void put_inuse(struct dquot *dquot)
 	/* We add to the back of inuse list so we don't have to restart
 	 * when traversing this list and we block */
 	list_add_tail(&dquot->dq_inuse, &inuse_list);
-	dqstats.allocated_dquots++;
+	dqstats_inc(DQST_ALLOC_DQUOTS);
 }
 
 static inline void remove_inuse(struct dquot *dquot)
 {
-	dqstats.allocated_dquots--;
+	dqstats_dec(DQST_ALLOC_DQUOTS);
 	list_del(&dquot->dq_inuse);
 }
 /*
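The dqstats.* counter fields become per-cpu counters indexed by DQST_* constants; the shrink_dqcache_memory() hunk below reads one back with percpu_counter_read_positive(). The presumed accessors behind these conversions (defined outside this diff) reduce to:

/* assumed shape of the helpers; not shown in this diff */
static inline void dqstats_inc(unsigned int type)
{
	percpu_counter_inc(&dqstats.counter[type]);
}

static inline void dqstats_dec(unsigned int type)
{
	percpu_counter_dec(&dqstats.counter[type]);
}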
@@ -317,14 +335,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
 	return dquot->dq_sb->dq_op->mark_dirty(dquot);
 }
 
+/* Mark dquot dirty in an atomic manner, and return its old dirty flag state */
 int dquot_mark_dquot_dirty(struct dquot *dquot)
 {
+	int ret = 1;
+
+	/* If quota is dirty already, we don't have to acquire dq_list_lock */
+	if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+		return 1;
+
 	spin_lock(&dq_list_lock);
-	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
+	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
 		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
 				info[dquot->dq_type].dqi_dirty_list);
+		ret = 0;
+	}
 	spin_unlock(&dq_list_lock);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
@@ -550,8 +577,8 @@ int dquot_scan_active(struct super_block *sb,
 			continue;
 		/* Now we have active dquot so we can just increase use count */
 		atomic_inc(&dquot->dq_count);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
 		ret = fn(dquot, priv);
@@ -569,7 +596,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
 
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct list_head *dirty;
 	struct dquot *dquot;
@@ -596,8 +623,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 			 * holding reference so we can safely just increase
 			 * use count */
 			atomic_inc(&dquot->dq_count);
-			dqstats.lookups++;
 			spin_unlock(&dq_list_lock);
+			dqstats_inc(DQST_LOOKUPS);
 			sb->dq_op->write_dquot(dquot);
 			dqput(dquot);
 			spin_lock(&dq_list_lock);
@@ -609,9 +636,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
 		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
-	spin_lock(&dq_list_lock);
-	dqstats.syncs++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_SYNCS);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -643,7 +668,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -667,15 +692,16 @@ static void prune_dqcache(int count)
  * This is called from kswapd when we think we need some
  * more memory
  */
-
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		spin_lock(&dq_list_lock);
 		prune_dqcache(nr);
 		spin_unlock(&dq_list_lock);
 	}
-	return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure;
+	return ((unsigned)
+		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+		/100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dqcache_shrinker = {
@@ -695,18 +721,12 @@ void dqput(struct dquot *dquot)
 		return;
 #ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
-		printk("VFS: dqput: trying to free free dquot\n");
-		printk("VFS: device %s, dquot of %s %d\n",
-			dquot->dq_sb->s_id,
-			quotatypes[dquot->dq_type],
-			dquot->dq_id);
+		quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
+			    quotatypes[dquot->dq_type], dquot->dq_id);
 		BUG();
 	}
 #endif
-
-	spin_lock(&dq_list_lock);
-	dqstats.drops++;
-	spin_unlock(&dq_list_lock);
+	dqstats_inc(DQST_DROPS);
 we_slept:
 	spin_lock(&dq_list_lock);
 	if (atomic_read(&dquot->dq_count) > 1) {
@@ -725,9 +745,9 @@ we_slept:
 			/* Commit dquot before releasing */
 			ret = dquot->dq_sb->dq_op->write_dquot(dquot);
 			if (ret < 0) {
-				printk(KERN_ERR "VFS: cannot write quota structure on "
-					"device %s (error %d). Quota may get out of "
-					"sync!\n", dquot->dq_sb->s_id, ret);
+				quota_error(dquot->dq_sb, "Can't write quota structure"
+					    " (error %d). Quota may get out of sync!",
+					    ret);
 				/*
 				 * We clear dirty bit anyway, so that we avoid
 				 * infinite loop here
@@ -823,15 +843,15 @@ we_slept:
 		put_inuse(dquot);
 		/* hash it first so it can be found */
 		insert_dquot_hash(dquot);
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_LOOKUPS);
 	} else {
 		if (!atomic_read(&dquot->dq_count))
 			remove_free_dquot(dquot);
 		atomic_inc(&dquot->dq_count);
-		dqstats.cache_hits++;
-		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
+		dqstats_inc(DQST_CACHE_HITS);
+		dqstats_inc(DQST_LOOKUPS);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is
 	 * already finished or it will be canceled due to dq_count > 1 test */
@@ -878,7 +898,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -907,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
-		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
-			" was turned on thus quota information is probably "
+		quota_error(sb, "Writes happened before quota was turned on "
+			"thus quota information is probably inconsistent. "
912 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 932 "Please run quotacheck(8)");
913 } 933 }
914#endif 934#endif
915} 935}
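From here on, printk(KERN_ERR "VFS: ...") messages are converted to quota_error(), which prefixes the device and function name automatically. A sketch of the helper pair introduced alongside these conversions, assuming the __quota_error()/quota_error() shape from this series (ratelimiting and exact formatting may differ):

#define quota_error(sb, fmt, args...) \
	__quota_error((sb), __func__, fmt, ##args)

void __quota_error(struct super_block *sb, const char *func,
		   const char *fmt, ...)
{
	va_list args;

	if (printk_ratelimit()) {
		va_start(args, fmt);
		printk(KERN_ERR "Quota error (device %s): %s: ",
		       sb->s_id, func);
		vprintk(fmt, args);	/* caller's format string */
		printk("\n");
		va_end(args);
	}
}

This is why the converted call sites drop the "VFS:" prefix, the sb->s_id argument, and the trailing newline.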
@@ -940,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
940 if (dqput_blocks(dquot)) { 960 if (dqput_blocks(dquot)) {
941#ifdef CONFIG_QUOTA_DEBUG 961#ifdef CONFIG_QUOTA_DEBUG
942 if (atomic_read(&dquot->dq_count) != 1) 962 if (atomic_read(&dquot->dq_count) != 1)
943 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 963 quota_error(inode->i_sb, "Adding dquot with "
964 "dq_count %d to dispose list",
965 atomic_read(&dquot->dq_count));
944#endif 966#endif
945 spin_lock(&dq_list_lock); 967 spin_lock(&dq_list_lock);
946 /* As dquot must have currently users it can't be on 968 /* As dquot must have currently users it can't be on
@@ -979,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
979 struct list_head *tofree_head) 1001 struct list_head *tofree_head)
980{ 1002{
981 struct inode *inode; 1003 struct inode *inode;
1004 int reserved = 0;
982 1005
983 spin_lock(&inode_lock); 1006 spin_lock(&inode_lock);
984 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1007 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -988,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
988 * only quota pointers and these have separate locking 1011 * only quota pointers and these have separate locking
989 * (dqptr_sem). 1012 * (dqptr_sem).
990 */ 1013 */
991 if (!IS_NOQUOTA(inode)) 1014 if (!IS_NOQUOTA(inode)) {
1015 if (unlikely(inode_get_rsv_space(inode) > 0))
1016 reserved = 1;
992 remove_inode_dquot_ref(inode, type, tofree_head); 1017 remove_inode_dquot_ref(inode, type, tofree_head);
1018 }
993 } 1019 }
994 spin_unlock(&inode_lock); 1020 spin_unlock(&inode_lock);
1021#ifdef CONFIG_QUOTA_DEBUG
1022 if (reserved) {
1023 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
1024 " was disabled thus quota information is probably "
1025 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
1026 }
1027#endif
995} 1028}
996 1029
997/* Gather all references from inodes and drop them */ 1030/* Gather all references from inodes and drop them */
@@ -1297,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1297 return QUOTA_NL_NOWARN; 1330 return QUOTA_NL_NOWARN;
1298} 1331}
1299 1332
1333static int dquot_active(const struct inode *inode)
1334{
1335 struct super_block *sb = inode->i_sb;
1336
1337 if (IS_NOQUOTA(inode))
1338 return 0;
1339 return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
1340}
1341
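dquot_active() above deliberately uses bitwise operators: sb_any_quota_loaded() and sb_any_quota_suspended() return per-type flag masks, not booleans, so the expression is nonzero exactly when some quota type is loaded and not suspended. A worked example (bit values illustrative only):

/*
 * loaded	= 0b11	(user and group quota loaded)
 * suspended	= 0b10	(group quota suspended by a ro remount)
 * loaded & ~suspended = 0b01 -> nonzero: user quota still active
 */

The subsequent hunks then shrink every `!sb_any_quota_active(...) || IS_NOQUOTA(...)` test to `!dquot_active(inode)`.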
1300/* 1342/*
1301 * Initialize quota pointers in inode 1343 * Initialize quota pointers in inode
1302 * 1344 *
@@ -1316,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
1316 1358
1317 /* First test before acquiring mutex - solves deadlocks when we 1359 /* First test before acquiring mutex - solves deadlocks when we
1318 * re-enter the quota code and are already holding the mutex */ 1360 * re-enter the quota code and are already holding the mutex */
1319 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1361 if (!dquot_active(inode))
1320 return; 1362 return;
1321 1363
1322 /* First get references to structures we might need. */ 1364 /* First get references to structures we might need. */
@@ -1488,17 +1530,19 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1488/* 1530/*
1489 * This operation can block, but only after everything is updated 1531 * This operation can block, but only after everything is updated
1490 */ 1532 */
1491int __dquot_alloc_space(struct inode *inode, qsize_t number, 1533int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1492 int warn, int reserve)
1493{ 1534{
1494 int cnt, ret = 0; 1535 int cnt, ret = 0;
1495 char warntype[MAXQUOTAS]; 1536 char warntype[MAXQUOTAS];
1537 int warn = flags & DQUOT_SPACE_WARN;
1538 int reserve = flags & DQUOT_SPACE_RESERVE;
1539 int nofail = flags & DQUOT_SPACE_NOFAIL;
1496 1540
1497 /* 1541 /*
1498 * First test before acquiring mutex - solves deadlocks when we 1542 * First test before acquiring mutex - solves deadlocks when we
1499 * re-enter the quota code and are already holding the mutex 1543 * re-enter the quota code and are already holding the mutex
1500 */ 1544 */
1501 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1545 if (!dquot_active(inode)) {
1502 inode_incr_space(inode, number, reserve); 1546 inode_incr_space(inode, number, reserve);
1503 goto out; 1547 goto out;
1504 } 1548 }
@@ -1513,7 +1557,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1513 continue; 1557 continue;
1514 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1558 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1515 warntype+cnt); 1559 warntype+cnt);
1516 if (ret) { 1560 if (ret && !nofail) {
1517 spin_unlock(&dq_data_lock); 1561 spin_unlock(&dq_data_lock);
1518 goto out_flush_warn; 1562 goto out_flush_warn;
1519 } 1563 }
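The warn/reserve boolean pair of __dquot_alloc_space() collapses into a single flags word, and the new DQUOT_SPACE_NOFAIL bit lets a journaling filesystem push an allocation through even when check_bdq() reports over-quota (the `ret && !nofail` test above). A sketch of the flag values and two of the inline wrappers, as defined in quotaops.h in this series:

#define DQUOT_SPACE_WARN	0x1
#define DQUOT_SPACE_RESERVE	0x2
#define DQUOT_SPACE_NOFAIL	0x4

static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
}

static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
	return __dquot_alloc_space(inode, nr << inode->i_blkbits,
				   DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE);
}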
@@ -1550,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode)
1550 1594
1551 /* First test before acquiring mutex - solves deadlocks when we 1595 /* First test before acquiring mutex - solves deadlocks when we
1552 * re-enter the quota code and are already holding the mutex */ 1596 * re-enter the quota code and are already holding the mutex */
1553 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1597 if (!dquot_active(inode))
1554 return 0; 1598 return 0;
1555 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1599 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1556 warntype[cnt] = QUOTA_NL_NOWARN; 1600 warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1587,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1587{ 1631{
1588 int cnt; 1632 int cnt;
1589 1633
1590 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1634 if (!dquot_active(inode)) {
1591 inode_claim_rsv_space(inode, number); 1635 inode_claim_rsv_space(inode, number);
1592 return 0; 1636 return 0;
1593 } 1637 }
@@ -1612,14 +1656,15 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1612/* 1656/*
1613 * This operation can block, but only after everything is updated 1657 * This operation can block, but only after everything is updated
1614 */ 1658 */
1615void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1659void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1616{ 1660{
1617 unsigned int cnt; 1661 unsigned int cnt;
1618 char warntype[MAXQUOTAS]; 1662 char warntype[MAXQUOTAS];
1663 int reserve = flags & DQUOT_SPACE_RESERVE;
1619 1664
1620 /* First test before acquiring mutex - solves deadlocks when we 1665 /* First test before acquiring mutex - solves deadlocks when we
1621 * re-enter the quota code and are already holding the mutex */ 1666 * re-enter the quota code and are already holding the mutex */
1622 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1667 if (!dquot_active(inode)) {
1623 inode_decr_space(inode, number, reserve); 1668 inode_decr_space(inode, number, reserve);
1624 return; 1669 return;
1625 } 1670 }
@@ -1657,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode)
1657 1702
1658 /* First test before acquiring mutex - solves deadlocks when we 1703 /* First test before acquiring mutex - solves deadlocks when we
1659 * re-enter the quota code and are already holding the mutex */ 1704 * re-enter the quota code and are already holding the mutex */
1660 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1705 if (!dquot_active(inode))
1661 return; 1706 return;
1662 1707
1663 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1708 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1677,16 +1722,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1677 1722
1678/* 1723/*
1679 * Transfer the number of inodes and blocks from one diskquota to another. 1724 * Transfer the number of inodes and blocks from one diskquota to another.
1725 * On success, dquot references in transfer_to are consumed and references
1726 * to original dquots that need to be released are placed there. On failure,
1727 * references are kept untouched.
1680 * 1728 *
1681 * This operation can block, but only after everything is updated 1729 * This operation can block, but only after everything is updated
1682 * A transaction must be started when entering this function. 1730 * A transaction must be started when entering this function.
1731 *
1683 */ 1732 */
1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1733int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1685{ 1734{
1686 qsize_t space, cur_space; 1735 qsize_t space, cur_space;
1687 qsize_t rsv_space = 0; 1736 qsize_t rsv_space = 0;
1688 struct dquot *transfer_from[MAXQUOTAS]; 1737 struct dquot *transfer_from[MAXQUOTAS] = {};
1689 struct dquot *transfer_to[MAXQUOTAS];
1690 int cnt, ret = 0; 1738 int cnt, ret = 0;
1691 char warntype_to[MAXQUOTAS]; 1739 char warntype_to[MAXQUOTAS];
1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1740 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1696,19 +1744,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1696 if (IS_NOQUOTA(inode)) 1744 if (IS_NOQUOTA(inode))
1697 return 0; 1745 return 0;
1698 /* Initialize the arrays */ 1746 /* Initialize the arrays */
1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1747 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1700 transfer_from[cnt] = NULL;
1701 transfer_to[cnt] = NULL;
1702 warntype_to[cnt] = QUOTA_NL_NOWARN; 1748 warntype_to[cnt] = QUOTA_NL_NOWARN;
1703 }
1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1705 if (mask & (1 << cnt))
1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1707 }
1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1749 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1750 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1751 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1711 goto put_all; 1752 return 0;
1712 } 1753 }
1713 spin_lock(&dq_data_lock); 1754 spin_lock(&dq_data_lock);
1714 cur_space = inode_get_bytes(inode); 1755 cur_space = inode_get_bytes(inode);
@@ -1760,47 +1801,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1760 1801
1761 mark_all_dquot_dirty(transfer_from); 1802 mark_all_dquot_dirty(transfer_from);
1762 mark_all_dquot_dirty(transfer_to); 1803 mark_all_dquot_dirty(transfer_to);
1763 /* The reference we got is transferred to the inode */ 1804 /* Pass back references to put */
1764 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1805 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1765 transfer_to[cnt] = NULL; 1806 transfer_to[cnt] = transfer_from[cnt];
1766warn_put_all: 1807warn:
1767 flush_warnings(transfer_to, warntype_to); 1808 flush_warnings(transfer_to, warntype_to);
1768 flush_warnings(transfer_from, warntype_from_inodes); 1809 flush_warnings(transfer_from, warntype_from_inodes);
1769 flush_warnings(transfer_from, warntype_from_space); 1810 flush_warnings(transfer_from, warntype_from_space);
1770put_all:
1771 dqput_all(transfer_from);
1772 dqput_all(transfer_to);
1773 return ret; 1811 return ret;
1774over_quota: 1812over_quota:
1775 spin_unlock(&dq_data_lock); 1813 spin_unlock(&dq_data_lock);
1776 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1814 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1777 /* Clear dquot pointers we don't want to dqput() */ 1815 goto warn;
1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1779 transfer_from[cnt] = NULL;
1780 goto warn_put_all;
1781} 1816}
1817EXPORT_SYMBOL(__dquot_transfer);
1782 1818
1783/* Wrapper for transferring ownership of an inode for uid/gid only 1819/* Wrapper for transferring ownership of an inode for uid/gid only
1784 * Called from FSXXX_setattr() 1820 * Called from FSXXX_setattr()
1785 */ 1821 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr) 1822int dquot_transfer(struct inode *inode, struct iattr *iattr)
1787{ 1823{
1788 qid_t chid[MAXQUOTAS]; 1824 struct dquot *transfer_to[MAXQUOTAS] = {};
1789 unsigned long mask = 0; 1825 struct super_block *sb = inode->i_sb;
1826 int ret;
1790 1827
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1828 if (!dquot_active(inode))
1792 mask |= 1 << USRQUOTA; 1829 return 0;
1793 chid[USRQUOTA] = iattr->ia_uid; 1830
1794 } 1831 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1832 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1796 mask |= 1 << GRPQUOTA; 1833 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1797 chid[GRPQUOTA] = iattr->ia_gid; 1834 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1798 } 1835
1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1836 ret = __dquot_transfer(inode, transfer_to);
1800 dquot_initialize(inode); 1837 dqput_all(transfer_to);
1801 return __dquot_transfer(inode, chid, mask); 1838 return ret;
1802 }
1803 return 0;
1804} 1839}
1805EXPORT_SYMBOL(dquot_transfer); 1840EXPORT_SYMBOL(dquot_transfer);
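The rewritten transfer path changes reference ownership: the caller now fills transfer_to[] with dqget() references, and on success __dquot_transfer() swaps in the old dquots that must be dropped, so one release loop covers both outcomes. A usage sketch modeled on the new dquot_transfer() above (new_uid is a placeholder):

struct dquot *transfer_to[MAXQUOTAS] = {};
int cnt, ret;

transfer_to[USRQUOTA] = dqget(sb, new_uid, USRQUOTA);
ret = __dquot_transfer(inode, transfer_to);
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
	dqput(transfer_to[cnt]);	/* dqput(NULL) is a no-op */

Exporting __dquot_transfer() with this convention lets callers such as ocfs2 obtain their dquot references under their own locks before performing the transfer.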
1806 1841
@@ -1831,6 +1866,7 @@ const struct dquot_operations dquot_operations = {
1831 .alloc_dquot = dquot_alloc, 1866 .alloc_dquot = dquot_alloc,
1832 .destroy_dquot = dquot_destroy, 1867 .destroy_dquot = dquot_destroy,
1833}; 1868};
1869EXPORT_SYMBOL(dquot_operations);
1834 1870
1835/* 1871/*
1836 * Generic helper for ->open on filesystems supporting disk quotas. 1872 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1849,7 +1885,7 @@ EXPORT_SYMBOL(dquot_file_open);
1849/* 1885/*
1850 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1886 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1851 */ 1887 */
1852int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1888int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1853{ 1889{
1854 int cnt, ret = 0; 1890 int cnt, ret = 0;
1855 struct quota_info *dqopt = sb_dqopt(sb); 1891 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1956,7 +1992,7 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1956 truncate_inode_pages(&toputinode[cnt]->i_data, 1992 truncate_inode_pages(&toputinode[cnt]->i_data,
1957 0); 1993 0);
1958 mutex_unlock(&toputinode[cnt]->i_mutex); 1994 mutex_unlock(&toputinode[cnt]->i_mutex);
1959 mark_inode_dirty(toputinode[cnt]); 1995 mark_inode_dirty_sync(toputinode[cnt]);
1960 } 1996 }
1961 mutex_unlock(&dqopt->dqonoff_mutex); 1997 mutex_unlock(&dqopt->dqonoff_mutex);
1962 } 1998 }
@@ -1979,14 +2015,15 @@ put_inodes:
1979 } 2015 }
1980 return ret; 2016 return ret;
1981} 2017}
1982EXPORT_SYMBOL(vfs_quota_disable); 2018EXPORT_SYMBOL(dquot_disable);
1983 2019
1984int vfs_quota_off(struct super_block *sb, int type, int remount) 2020int dquot_quota_off(struct super_block *sb, int type)
1985{ 2021{
1986 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 2022 return dquot_disable(sb, type,
1987 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 2023 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1988} 2024}
1989EXPORT_SYMBOL(vfs_quota_off); 2025EXPORT_SYMBOL(dquot_quota_off);
2026
1990/* 2027/*
1991 * Turn quotas on on a device 2028 * Turn quotas on on a device
1992 */ 2029 */
@@ -2104,36 +2141,43 @@ out_fmt:
2104} 2141}
2105 2142
2106/* Reenable quotas on remount RW */ 2143/* Reenable quotas on remount RW */
2107static int vfs_quota_on_remount(struct super_block *sb, int type) 2144int dquot_resume(struct super_block *sb, int type)
2108{ 2145{
2109 struct quota_info *dqopt = sb_dqopt(sb); 2146 struct quota_info *dqopt = sb_dqopt(sb);
2110 struct inode *inode; 2147 struct inode *inode;
2111 int ret; 2148 int ret = 0, cnt;
2112 unsigned int flags; 2149 unsigned int flags;
2113 2150
2114 mutex_lock(&dqopt->dqonoff_mutex); 2151 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2115 if (!sb_has_quota_suspended(sb, type)) { 2152 if (type != -1 && cnt != type)
2153 continue;
2154
2155 mutex_lock(&dqopt->dqonoff_mutex);
2156 if (!sb_has_quota_suspended(sb, cnt)) {
2157 mutex_unlock(&dqopt->dqonoff_mutex);
2158 continue;
2159 }
2160 inode = dqopt->files[cnt];
2161 dqopt->files[cnt] = NULL;
2162 spin_lock(&dq_state_lock);
2163 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2164 DQUOT_LIMITS_ENABLED,
2165 cnt);
2166 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2167 spin_unlock(&dq_state_lock);
2116 mutex_unlock(&dqopt->dqonoff_mutex); 2168 mutex_unlock(&dqopt->dqonoff_mutex);
2117 return 0;
2118 }
2119 inode = dqopt->files[type];
2120 dqopt->files[type] = NULL;
2121 spin_lock(&dq_state_lock);
2122 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2123 DQUOT_LIMITS_ENABLED, type);
2124 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2125 spin_unlock(&dq_state_lock);
2126 mutex_unlock(&dqopt->dqonoff_mutex);
2127 2169
2128 flags = dquot_generic_flag(flags, type); 2170 flags = dquot_generic_flag(flags, cnt);
2129 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2171 ret = vfs_load_quota_inode(inode, cnt,
2130 flags); 2172 dqopt->info[cnt].dqi_fmt_id, flags);
2131 iput(inode); 2173 iput(inode);
2174 }
2132 2175
2133 return ret; 2176 return ret;
2134} 2177}
2178EXPORT_SYMBOL(dquot_resume);
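dquot_resume() replaces the file-static vfs_quota_on_remount(): it iterates over the quota types itself (type == -1 means every suspended type), which is why the vfs_dq_quota_on_remount() wrapper is deleted further down. A hedged sketch of the intended remount pairing, assuming the companion dquot_suspend() helper from the same series:

/* in a filesystem's ->remount_fs, simplified */
if (*flags & MS_RDONLY)
	dquot_suspend(sb, -1);	/* going read-only: park quotas */
else
	dquot_resume(sb, -1);	/* going read-write: re-enable them */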
2135 2179
2136int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2180int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2137 struct path *path) 2181 struct path *path)
2138{ 2182{
2139 int error = security_quota_on(path->dentry); 2183 int error = security_quota_on(path->dentry);
@@ -2148,40 +2192,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2148 DQUOT_LIMITS_ENABLED); 2192 DQUOT_LIMITS_ENABLED);
2149 return error; 2193 return error;
2150} 2194}
2151EXPORT_SYMBOL(vfs_quota_on_path); 2195EXPORT_SYMBOL(dquot_quota_on_path);
2152 2196
2153int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2197int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2154 int remount)
2155{ 2198{
2156 struct path path; 2199 struct path path;
2157 int error; 2200 int error;
2158 2201
2159 if (remount)
2160 return vfs_quota_on_remount(sb, type);
2161
2162 error = kern_path(name, LOOKUP_FOLLOW, &path); 2202 error = kern_path(name, LOOKUP_FOLLOW, &path);
2163 if (!error) { 2203 if (!error) {
2164 error = vfs_quota_on_path(sb, type, format_id, &path); 2204 error = dquot_quota_on_path(sb, type, format_id, &path);
2165 path_put(&path); 2205 path_put(&path);
2166 } 2206 }
2167 return error; 2207 return error;
2168} 2208}
2169EXPORT_SYMBOL(vfs_quota_on); 2209EXPORT_SYMBOL(dquot_quota_on);
2170 2210
2171/* 2211/*
2172 * More powerful function for turning on quotas allowing setting 2212 * More powerful function for turning on quotas allowing setting
2173 * of individual quota flags 2213 * of individual quota flags
2174 */ 2214 */
2175int vfs_quota_enable(struct inode *inode, int type, int format_id, 2215int dquot_enable(struct inode *inode, int type, int format_id,
2176 unsigned int flags) 2216 unsigned int flags)
2177{ 2217{
2178 int ret = 0; 2218 int ret = 0;
2179 struct super_block *sb = inode->i_sb; 2219 struct super_block *sb = inode->i_sb;
2180 struct quota_info *dqopt = sb_dqopt(sb); 2220 struct quota_info *dqopt = sb_dqopt(sb);
2181 2221
2182 /* Just unsuspend quotas? */ 2222 /* Just unsuspend quotas? */
2183 if (flags & DQUOT_SUSPENDED) 2223 BUG_ON(flags & DQUOT_SUSPENDED);
2184 return vfs_quota_on_remount(sb, type); 2224
2185 if (!flags) 2225 if (!flags)
2186 return 0; 2226 return 0;
2187 /* Just updating flags needed? */ 2227 /* Just updating flags needed? */
@@ -2213,13 +2253,13 @@ out_lock:
2213load_quota: 2253load_quota:
2214 return vfs_load_quota_inode(inode, type, format_id, flags); 2254 return vfs_load_quota_inode(inode, type, format_id, flags);
2215} 2255}
2216EXPORT_SYMBOL(vfs_quota_enable); 2256EXPORT_SYMBOL(dquot_enable);
2217 2257
2218/* 2258/*
2219 * This function is used when filesystem needs to initialize quotas 2259 * This function is used when filesystem needs to initialize quotas
2220 * during mount time. 2260 * during mount time.
2221 */ 2261 */
2222int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2262int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2223 int format_id, int type) 2263 int format_id, int type)
2224{ 2264{
2225 struct dentry *dentry; 2265 struct dentry *dentry;
@@ -2245,24 +2285,7 @@ out:
2245 dput(dentry); 2285 dput(dentry);
2246 return error; 2286 return error;
2247} 2287}
2248EXPORT_SYMBOL(vfs_quota_on_mount); 2288EXPORT_SYMBOL(dquot_quota_on_mount);
2249
2250/* Wrapper to turn on quotas when remounting rw */
2251int vfs_dq_quota_on_remount(struct super_block *sb)
2252{
2253 int cnt;
2254 int ret = 0, err;
2255
2256 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2257 return -ENOSYS;
2258 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2259 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2260 if (err < 0 && !ret)
2261 ret = err;
2262 }
2263 return ret;
2264}
2265EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2266 2289
2267static inline qsize_t qbtos(qsize_t blocks) 2290static inline qsize_t qbtos(qsize_t blocks)
2268{ 2291{
@@ -2275,25 +2298,30 @@ static inline qsize_t stoqb(qsize_t space)
2275} 2298}
2276 2299
2277/* Generic routine for getting common part of quota structure */ 2300/* Generic routine for getting common part of quota structure */
2278static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2301static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2279{ 2302{
2280 struct mem_dqblk *dm = &dquot->dq_dqb; 2303 struct mem_dqblk *dm = &dquot->dq_dqb;
2281 2304
2305 memset(di, 0, sizeof(*di));
2306 di->d_version = FS_DQUOT_VERSION;
2307 di->d_flags = dquot->dq_type == USRQUOTA ?
2308 FS_USER_QUOTA : FS_GROUP_QUOTA;
2309 di->d_id = dquot->dq_id;
2310
2282 spin_lock(&dq_data_lock); 2311 spin_lock(&dq_data_lock);
2283 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2312 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2284 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2313 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2285 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2314 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2286 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2315 di->d_ino_softlimit = dm->dqb_isoftlimit;
2287 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2316 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2288 di->dqb_curinodes = dm->dqb_curinodes; 2317 di->d_icount = dm->dqb_curinodes;
2289 di->dqb_btime = dm->dqb_btime; 2318 di->d_btimer = dm->dqb_btime;
2290 di->dqb_itime = dm->dqb_itime; 2319 di->d_itimer = dm->dqb_itime;
2291 di->dqb_valid = QIF_ALL;
2292 spin_unlock(&dq_data_lock); 2320 spin_unlock(&dq_data_lock);
2293} 2321}
2294 2322
2295int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2323int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2296 struct if_dqblk *di) 2324 struct fs_disk_quota *di)
2297{ 2325{
2298 struct dquot *dquot; 2326 struct dquot *dquot;
2299 2327
@@ -2305,53 +2333,72 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2305 2333
2306 return 0; 2334 return 0;
2307} 2335}
2308EXPORT_SYMBOL(vfs_get_dqblk); 2336EXPORT_SYMBOL(dquot_get_dqblk);
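From this point the generic ->get_dqblk/->set_dqblk operations speak struct fs_disk_quota, the format the XFS-style Q_XGETQUOTA interface already used; do_get_dqblk() therefore fills the identification fields (d_version, d_flags, d_id) itself. The old if_dqblk interface survives as a conversion layer in quota.c, shown later in this patch:

/* Q_GETQUOTA flow after this change, per the quota.c hunks below */
struct fs_disk_quota fdq;
struct if_dqblk idq;
int ret;

ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
if (!ret)
	copy_to_if_dqblk(&idq, &fdq);	/* then copy_to_user() of idq */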
2337
2338#define VFS_FS_DQ_MASK \
2339 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2340 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2341 FS_DQ_BTIMER | FS_DQ_ITIMER)
2309 2342
2310/* Generic routine for setting common part of quota structure */ 2343/* Generic routine for setting common part of quota structure */
2311static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2344static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2312{ 2345{
2313 struct mem_dqblk *dm = &dquot->dq_dqb; 2346 struct mem_dqblk *dm = &dquot->dq_dqb;
2314 int check_blim = 0, check_ilim = 0; 2347 int check_blim = 0, check_ilim = 0;
2315 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2348 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2316 2349
2317 if ((di->dqb_valid & QIF_BLIMITS && 2350 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2318 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2351 return -EINVAL;
2319 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2352
2320 (di->dqb_valid & QIF_ILIMITS && 2353 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2321 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2354 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2322 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2355 ((di->d_fieldmask & FS_DQ_BHARD) &&
2356 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2357 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2358 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2359 ((di->d_fieldmask & FS_DQ_IHARD) &&
2360 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2323 return -ERANGE; 2361 return -ERANGE;
2324 2362
2325 spin_lock(&dq_data_lock); 2363 spin_lock(&dq_data_lock);
2326 if (di->dqb_valid & QIF_SPACE) { 2364 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2365 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2328 check_blim = 1; 2366 check_blim = 1;
2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2367 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2330 } 2368 }
2331 if (di->dqb_valid & QIF_BLIMITS) { 2369
2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2370 if (di->d_fieldmask & FS_DQ_BSOFT)
2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2371 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2372 if (di->d_fieldmask & FS_DQ_BHARD)
2373 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2374 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2334 check_blim = 1; 2375 check_blim = 1;
2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2376 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2336 } 2377 }
2337 if (di->dqb_valid & QIF_INODES) { 2378
2338 dm->dqb_curinodes = di->dqb_curinodes; 2379 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2380 dm->dqb_curinodes = di->d_icount;
2339 check_ilim = 1; 2381 check_ilim = 1;
2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2382 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2341 } 2383 }
2342 if (di->dqb_valid & QIF_ILIMITS) { 2384
2343 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2385 if (di->d_fieldmask & FS_DQ_ISOFT)
2344 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2386 dm->dqb_isoftlimit = di->d_ino_softlimit;
2387 if (di->d_fieldmask & FS_DQ_IHARD)
2388 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2389 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2345 check_ilim = 1; 2390 check_ilim = 1;
2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2391 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2347 } 2392 }
2348 if (di->dqb_valid & QIF_BTIME) { 2393
2349 dm->dqb_btime = di->dqb_btime; 2394 if (di->d_fieldmask & FS_DQ_BTIMER) {
2395 dm->dqb_btime = di->d_btimer;
2350 check_blim = 1; 2396 check_blim = 1;
2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2397 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2352 } 2398 }
2353 if (di->dqb_valid & QIF_ITIME) { 2399
2354 dm->dqb_itime = di->dqb_itime; 2400 if (di->d_fieldmask & FS_DQ_ITIMER) {
2401 dm->dqb_itime = di->d_itimer;
2355 check_ilim = 1; 2402 check_ilim = 1;
2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2403 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2357 } 2404 }
@@ -2361,7 +2408,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2361 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2408 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2362 dm->dqb_btime = 0; 2409 dm->dqb_btime = 0;
2363 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2410 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2364 } else if (!(di->dqb_valid & QIF_BTIME)) 2411 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2365 /* Set grace only if user hasn't provided his own... */ 2412 /* Set grace only if user hasn't provided his own... */
2366 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2413 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2367 } 2414 }
@@ -2370,7 +2417,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2370 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2417 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2371 dm->dqb_itime = 0; 2418 dm->dqb_itime = 0;
2372 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2419 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2373 } else if (!(di->dqb_valid & QIF_ITIME)) 2420 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2374 /* Set grace only if user hasn't provided his own... */ 2421 /* Set grace only if user hasn't provided his own... */
2375 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2422 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2376 } 2423 }
@@ -2385,8 +2432,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2385 return 0; 2432 return 0;
2386} 2433}
2387 2434
2388int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2435int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2389 struct if_dqblk *di) 2436 struct fs_disk_quota *di)
2390{ 2437{
2391 struct dquot *dquot; 2438 struct dquot *dquot;
2392 int rc; 2439 int rc;
@@ -2401,10 +2448,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2401out: 2448out:
2402 return rc; 2449 return rc;
2403} 2450}
2404EXPORT_SYMBOL(vfs_set_dqblk); 2451EXPORT_SYMBOL(dquot_set_dqblk);
2405 2452
2406/* Generic routine for getting common part of quota file information */ 2453/* Generic routine for getting common part of quota file information */
2407int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2454int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2408{ 2455{
2409 struct mem_dqinfo *mi; 2456 struct mem_dqinfo *mi;
2410 2457
@@ -2423,10 +2470,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2423 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2470 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2424 return 0; 2471 return 0;
2425} 2472}
2426EXPORT_SYMBOL(vfs_get_dqinfo); 2473EXPORT_SYMBOL(dquot_get_dqinfo);
2427 2474
2428/* Generic routine for setting common part of quota file information */ 2475/* Generic routine for setting common part of quota file information */
2429int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2476int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2430{ 2477{
2431 struct mem_dqinfo *mi; 2478 struct mem_dqinfo *mi;
2432 int err = 0; 2479 int err = 0;
@@ -2453,74 +2500,86 @@ out:
2453 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2500 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2454 return err; 2501 return err;
2455} 2502}
2456EXPORT_SYMBOL(vfs_set_dqinfo); 2503EXPORT_SYMBOL(dquot_set_dqinfo);
2457 2504
2458const struct quotactl_ops vfs_quotactl_ops = { 2505const struct quotactl_ops dquot_quotactl_ops = {
2459 .quota_on = vfs_quota_on, 2506 .quota_on = dquot_quota_on,
2460 .quota_off = vfs_quota_off, 2507 .quota_off = dquot_quota_off,
2461 .quota_sync = vfs_quota_sync, 2508 .quota_sync = dquot_quota_sync,
2462 .get_info = vfs_get_dqinfo, 2509 .get_info = dquot_get_dqinfo,
2463 .set_info = vfs_set_dqinfo, 2510 .set_info = dquot_set_dqinfo,
2464 .get_dqblk = vfs_get_dqblk, 2511 .get_dqblk = dquot_get_dqblk,
2465 .set_dqblk = vfs_set_dqblk 2512 .set_dqblk = dquot_set_dqblk
2466}; 2513};
2514EXPORT_SYMBOL(dquot_quotactl_ops);
2515
2516static int do_proc_dqstats(struct ctl_table *table, int write,
2517 void __user *buffer, size_t *lenp, loff_t *ppos)
2518{
2519 unsigned int type = (int *)table->data - dqstats.stat;
2520
2521 /* Update global table */
2522 dqstats.stat[type] =
2523 percpu_counter_sum_positive(&dqstats.counter[type]);
2524 return proc_dointvec(table, write, buffer, lenp, ppos);
2525}
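do_proc_dqstats() recovers which statistic is being read purely from the ctl_table entry, then folds the per-CPU counter into the legacy int that proc_dointvec() exposes. A worked example of the pointer arithmetic:

/*
 * For the "cache_hits" entry below, .data == &dqstats.stat[DQST_CACHE_HITS],
 * so (int *)table->data - dqstats.stat == DQST_CACHE_HITS, and the handler
 * snapshots percpu_counter_sum_positive() into that slot before the read.
 */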
2467 2526
2468static ctl_table fs_dqstats_table[] = { 2527static ctl_table fs_dqstats_table[] = {
2469 { 2528 {
2470 .procname = "lookups", 2529 .procname = "lookups",
2471 .data = &dqstats.lookups, 2530 .data = &dqstats.stat[DQST_LOOKUPS],
2472 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2473 .mode = 0444, 2532 .mode = 0444,
2474 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2475 }, 2534 },
2476 { 2535 {
2477 .procname = "drops", 2536 .procname = "drops",
2478 .data = &dqstats.drops, 2537 .data = &dqstats.stat[DQST_DROPS],
2479 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2480 .mode = 0444, 2539 .mode = 0444,
2481 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2482 }, 2541 },
2483 { 2542 {
2484 .procname = "reads", 2543 .procname = "reads",
2485 .data = &dqstats.reads, 2544 .data = &dqstats.stat[DQST_READS],
2486 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2487 .mode = 0444, 2546 .mode = 0444,
2488 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2489 }, 2548 },
2490 { 2549 {
2491 .procname = "writes", 2550 .procname = "writes",
2492 .data = &dqstats.writes, 2551 .data = &dqstats.stat[DQST_WRITES],
2493 .maxlen = sizeof(int), 2552 .maxlen = sizeof(int),
2494 .mode = 0444, 2553 .mode = 0444,
2495 .proc_handler = proc_dointvec, 2554 .proc_handler = do_proc_dqstats,
2496 }, 2555 },
2497 { 2556 {
2498 .procname = "cache_hits", 2557 .procname = "cache_hits",
2499 .data = &dqstats.cache_hits, 2558 .data = &dqstats.stat[DQST_CACHE_HITS],
2500 .maxlen = sizeof(int), 2559 .maxlen = sizeof(int),
2501 .mode = 0444, 2560 .mode = 0444,
2502 .proc_handler = proc_dointvec, 2561 .proc_handler = do_proc_dqstats,
2503 }, 2562 },
2504 { 2563 {
2505 .procname = "allocated_dquots", 2564 .procname = "allocated_dquots",
2506 .data = &dqstats.allocated_dquots, 2565 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2507 .maxlen = sizeof(int), 2566 .maxlen = sizeof(int),
2508 .mode = 0444, 2567 .mode = 0444,
2509 .proc_handler = proc_dointvec, 2568 .proc_handler = do_proc_dqstats,
2510 }, 2569 },
2511 { 2570 {
2512 .procname = "free_dquots", 2571 .procname = "free_dquots",
2513 .data = &dqstats.free_dquots, 2572 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2514 .maxlen = sizeof(int), 2573 .maxlen = sizeof(int),
2515 .mode = 0444, 2574 .mode = 0444,
2516 .proc_handler = proc_dointvec, 2575 .proc_handler = do_proc_dqstats,
2517 }, 2576 },
2518 { 2577 {
2519 .procname = "syncs", 2578 .procname = "syncs",
2520 .data = &dqstats.syncs, 2579 .data = &dqstats.stat[DQST_SYNCS],
2521 .maxlen = sizeof(int), 2580 .maxlen = sizeof(int),
2522 .mode = 0444, 2581 .mode = 0444,
2523 .proc_handler = proc_dointvec, 2582 .proc_handler = do_proc_dqstats,
2524 }, 2583 },
2525#ifdef CONFIG_PRINT_QUOTA_WARNING 2584#ifdef CONFIG_PRINT_QUOTA_WARNING
2526 { 2585 {
@@ -2554,7 +2613,7 @@ static ctl_table sys_table[] = {
2554 2613
2555static int __init dquot_init(void) 2614static int __init dquot_init(void)
2556{ 2615{
2557 int i; 2616 int i, ret;
2558 unsigned long nr_hash, order; 2617 unsigned long nr_hash, order;
2559 2618
2560 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2619 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2572,6 +2631,12 @@ static int __init dquot_init(void)
2572 if (!dquot_hash) 2631 if (!dquot_hash)
2573 panic("Cannot create dquot hash table"); 2632 panic("Cannot create dquot hash table");
2574 2633
2634 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2635 ret = percpu_counter_init(&dqstats.counter[i], 0);
2636 if (ret)
2637 panic("Cannot create dquot stat counters");
2638 }
2639
2575 /* Find power-of-two hlist_heads which can fit into allocation */ 2640 /* Find power-of-two hlist_heads which can fit into allocation */
2576 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2641 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2577 dq_hash_bits = 0; 2642 dq_hash_bits = 0;
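The new per-CPU counters must be set up before any quota activity; like the hash-table allocation next to it, a failure in dquot_init() simply panics. In code that can unwind, the same initialization would look roughly like this (a sketch mirroring the loop added above):

int i, err = 0;

for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
	err = percpu_counter_init(&dqstats.counter[i], 0);
	if (err)
		break;
}
if (err)			/* roll back the ones that succeeded */
	while (i-- > 0)
		percpu_counter_destroy(&dqstats.counter[i]);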
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
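quota_sync_all() drops its hand-rolled walk of super_blocks, with the s_count reference games and the restart label, in favor of iterate_supers(), which encapsulates exactly that pattern. Roughly what the helper does for its callback, simplified from fs/super.c of this era (the real one also guards against the list mutating while sb_lock is dropped):

void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		sb->s_count++;
		spin_unlock(&sb_lock);
		down_read(&sb->s_umount);
		if (sb->s_root)		/* skip superblocks being torn down */
			f(sb, arg);
		up_read(&sb->s_umount);
		spin_lock(&sb_lock);
		__put_super(sb);
	}
	spin_unlock(&sb_lock);
}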
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -87,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
87 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
88 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
91 putname(pathname); 77 putname(pathname);
92 return ret; 78 return ret;
93} 79}
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
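copy_from_if_dqblk() maps the coarse QIF_* validity bits onto the finer FS_DQ_* field mask, with the limit bits splitting into separate soft/hard flags. A worked example:

/*
 * Q_SETQUOTA with dqb_valid = QIF_BLIMITS | QIF_ITIME becomes
 * d_fieldmask = FS_DQ_BSOFT | FS_DQ_BHARD | FS_DQ_ITIMER,
 * which do_set_dqblk() accepts since it is within VFS_FS_DQ_MASK.
 */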
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
@@ -239,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
239 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
240 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
241 return -ENOSYS; 262 return -ENOSYS;
242 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
243 case Q_GETFMT: 264 case Q_GETFMT:
244 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
245 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..9e48874eabcc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,16 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 quota_error(sb, "dquota write failed");
69 if (ret >= 0)
70 ret = -EIO;
71 }
72 return ret;
66} 73}
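write_blk() now detects short writes itself and maps any non-negative shortfall to -EIO, so every caller can key off the sign of the return value alone, as the later hunks in this file do:

ret = write_blk(info, blk, buf);
if (ret < 0) {
	quota_error(dquot->dq_sb, "Can't write quota data block %u", blk);
	goto out_buf;
}

(qtree_write_dquot() keeps its own check because it writes dqi_entry_size bytes rather than a whole block, and reports -ENOSPC instead.)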
67 74
68/* Remove empty block from list and return it */ 75/* Remove empty block from list and return it */
@@ -152,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 159 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 160 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 161 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 162 quota_error(info->dqi_sb, "Can't write block (%u) "
156 "VFS: Can't write block (%u) with free entries.\n", 163 "with free entries", blk);
157 blk);
158 return 0; 164 return 0;
159out_buf: 165out_buf:
160 kfree(tmpbuf); 166 kfree(tmpbuf);
@@ -244,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 250 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 251 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 252 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 253 quota_error(dquot->dq_sb, "Can't remove block (%u) "
248 "remove block (%u) from entry free list.\n", 254 "from entry free list", blk);
249 blk);
250 goto out_buf; 255 goto out_buf;
251 } 256 }
252 } 257 }
@@ -260,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
260 } 265 }
261#ifdef __QUOTA_QT_PARANOIA 266#ifdef __QUOTA_QT_PARANOIA
262 if (i == qtree_dqstr_in_blk(info)) { 267 if (i == qtree_dqstr_in_blk(info)) {
263 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " 268 quota_error(dquot->dq_sb, "Data block full but it shouldn't");
264 "but it shouldn't.\n");
265 *err = -EIO; 269 *err = -EIO;
266 goto out_buf; 270 goto out_buf;
267 } 271 }
268#endif 272#endif
269 *err = write_blk(info, blk, buf); 273 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 274 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 275 quota_error(dquot->dq_sb, "Can't write quota data block %u",
272 "data block %u.\n", blk); 276 blk);
273 goto out_buf; 277 goto out_buf;
274 } 278 }
275 dquot->dq_off = (blk << info->dqi_blocksize_bits) + 279 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -303,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 307 } else {
304 ret = read_blk(info, *treeblk, buf); 308 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 309 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 310 quota_error(dquot->dq_sb, "Can't read tree quota "
307 "%u.\n", *treeblk); 311 "block %u", *treeblk);
308 goto out_buf; 312 goto out_buf;
309 } 313 }
310 } 314 }
@@ -315,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
315 if (depth == info->dqi_qtree_depth - 1) { 319 if (depth == info->dqi_qtree_depth - 1) {
316#ifdef __QUOTA_QT_PARANOIA 320#ifdef __QUOTA_QT_PARANOIA
317 if (newblk) { 321 if (newblk) {
318 printk(KERN_ERR "VFS: Inserting already present quota " 322 quota_error(dquot->dq_sb, "Inserting already present "
319 "entry (block %u).\n", 323 "quota entry (block %u)",
320 le32_to_cpu(ref[get_index(info, 324 le32_to_cpu(ref[get_index(info,
321 dquot->dq_id, depth)])); 325 dquot->dq_id, depth)]));
322 ret = -EIO; 326 ret = -EIO;
323 goto out_buf; 327 goto out_buf;
@@ -365,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 369 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 370 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 371 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 372 quota_error(sb, "Error %zd occurred while creating "
369 "creating quota.\n", ret); 373 "quota", ret);
370 kfree(ddquot); 374 kfree(ddquot);
371 return ret; 375 return ret;
372 } 376 }
@@ -377,14 +381,13 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 381 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 382 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 383 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 384 quota_error(sb, "dquota write failed");
381 sb->s_id);
382 if (ret >= 0) 385 if (ret >= 0)
383 ret = -ENOSPC; 386 ret = -ENOSPC;
384 } else { 387 } else {
385 ret = 0; 388 ret = 0;
386 } 389 }
387 dqstats.writes++; 390 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 391 kfree(ddquot);
389 392
390 return ret; 393 return ret;
@@ -402,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 405 if (!buf)
403 return -ENOMEM; 406 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 407 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 408 quota_error(dquot->dq_sb, "Quota structure has offset to "
406 "block (%u) than it should (%u).\n", blk, 409 "other block (%u) than it should (%u)", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 410 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 411 goto out_buf;
409 } 412 }
410 ret = read_blk(info, blk, buf); 413 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 414 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 415 quota_error(dquot->dq_sb, "Can't read quota data block %u",
416 blk);
413 goto out_buf; 417 goto out_buf;
414 } 418 }
415 dh = (struct qt_disk_dqdbheader *)buf; 419 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 423 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 424 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 425 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 426 quota_error(dquot->dq_sb, "Can't move quota data block "
423 "to free list.\n", blk); 427 "(%u) to free list", blk);
424 goto out_buf; 428 goto out_buf;
425 } 429 }
426 } else { 430 } else {
@@ -432,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 436 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 437 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 438 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 439 quota_error(dquot->dq_sb, "Can't insert quota "
436 "block (%u) to free entry list.\n", blk); 440 "data block (%u) to free entry list", blk);
437 goto out_buf; 441 goto out_buf;
438 } 442 }
439 } else { 443 } else {
440 ret = write_blk(info, blk, buf); 444 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 445 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 446 quota_error(dquot->dq_sb, "Can't write quota "
443 "block %u\n", blk); 447 "data block %u", blk);
444 goto out_buf; 448 goto out_buf;
445 } 449 }
446 } 450 }
@@ -464,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 468 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 470 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 471 quota_error(dquot->dq_sb, "Can't read quota data "
472 "block %u", blk);
468 goto out_buf; 473 goto out_buf;
469 } 474 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 493 } else {
489 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 495 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 496 quota_error(dquot->dq_sb, "Can't write quota "
492 "block %u.\n", *blk); 497 "tree block %u", blk);
493 } 498 }
494 } 499 }
495out_buf: 500out_buf:
@@ -521,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 526 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 527 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 528 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 529 quota_error(dquot->dq_sb, "Can't read quota tree "
530 "block %u", blk);
525 goto out_buf; 531 goto out_buf;
526 } 532 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 533 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 537 ddquot += info->dqi_entry_size;
532 } 538 }
533 if (i == qtree_dqstr_in_blk(info)) { 539 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 540 quota_error(dquot->dq_sb, "Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 541 "but not present", dquot->dq_id);
536 ret = -EIO; 542 ret = -EIO;
537 goto out_buf; 543 goto out_buf;
538 } else { 544 } else {
@@ -556,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 562 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 563 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 564 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 565 quota_error(dquot->dq_sb, "Can't read quota tree block %u",
566 blk);
560 goto out_buf; 567 goto out_buf;
561 } 568 }
562 ret = 0; 569 ret = 0;
@@ -590,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
590#ifdef __QUOTA_QT_PARANOIA 597#ifdef __QUOTA_QT_PARANOIA
591 /* Invalidated quota? */ 598 /* Invalidated quota? */
592 if (!sb_dqopt(dquot->dq_sb)->files[type]) { 599 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
593 printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); 600 quota_error(sb, "Quota invalidated while reading!");
594 return -EIO; 601 return -EIO;
595 } 602 }
596#endif 603#endif
@@ -599,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 606 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 607 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 608 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 609 quota_error(sb, "Can't read quota structure "
603 "structure for id %u.\n", dquot->dq_id); 610 "for id %u", dquot->dq_id);
604 dquot->dq_off = 0; 611 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 612 set_bit(DQ_FAKE_B, &dquot->dq_flags);
606 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 613 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -617,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 624 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 625 if (ret >= 0)
619 ret = -EIO; 626 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 627 quota_error(sb, "Error while reading quota structure for id %u",
621 "structure for id %u.\n", dquot->dq_id); 628 dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 629 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 630 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
624 kfree(ddquot); 631 kfree(ddquot);
@@ -634,7 +641,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 641 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 642 kfree(ddquot);
636out: 643out:
637 dqstats.reads++; 644 dqstats_inc(DQST_READS);
638 return ret; 645 return ret;
639} 646}
640EXPORT_SYMBOL(qtree_read_dquot); 647EXPORT_SYMBOL(qtree_read_dquot);
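
The quota_tree.c conversions above replace bare printk(KERN_ERR "VFS: ...") calls with quota_error(), dropping the hand-rolled "VFS:" prefix, the per-callsite device name, and the trailing "\n". That only works if quota_error() supplies that boilerplate itself. Below is a minimal userspace sketch of such a helper; the quota_error()/__quota_error() names mirror the kernel's, but the body is illustrative, not the in-tree implementation:

#include <stdarg.h>
#include <stdio.h>

struct super_block { const char *s_id; };	/* stand-in for the kernel type */

static void __quota_error(struct super_block *sb, const char *func,
			  const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	fprintf(stderr, "Quota error (device %s): %s: ", sb->s_id, func);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

/* __func__ lets every call site report its own function name for free. */
#define quota_error(sb, fmt, ...) \
	__quota_error((sb), __func__, fmt, ##__VA_ARGS__)
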
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..34b37a67bb16 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
95 (char *)&dqblk, sizeof(struct v1_disk_dqblk), 95 (char *)&dqblk, sizeof(struct v1_disk_dqblk),
96 v1_dqoff(dquot->dq_id)); 96 v1_dqoff(dquot->dq_id));
97 if (ret != sizeof(struct v1_disk_dqblk)) { 97 if (ret != sizeof(struct v1_disk_dqblk)) {
98 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 98 quota_error(dquot->dq_sb, "dquota write failed");
99 dquot->dq_sb->s_id);
100 if (ret >= 0) 99 if (ret >= 0)
101 ret = -EIO; 100 ret = -EIO;
102 goto out; 101 goto out;
@@ -104,7 +103,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 103 ret = 0;
105 104
106out: 105out:
107 dqstats.writes++; 106 dqstats_inc(DQST_WRITES);
108 107
109 return ret; 108 return ret;
110} 109}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..65444d29406b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 quota_error(sb, "Failed header read: expected=%zd got=%zd",
67 " expected=%zd got=%zd\n", 67 sizeof(struct v2_disk_dqheader), size);
68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 68 return 0;
70 } 69 }
71 return 1; 70 return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 105 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 106 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 107 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 108 quota_error(sb, "Can't read info structure");
110 sb->s_id);
111 return -1; 109 return -1;
112 } 110 }
113 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); 111 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 165 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 166 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 167 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 168 quota_error(sb, "Can't write info structure");
171 sb->s_id);
172 return -1; 169 return -1;
173 } 170 }
174 return 0; 171 return 0;
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..9eead2c796b7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,9 +146,8 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 truncate_setsize(inode, newsize);
150 150 return 0;
151 return ret;
152} 151}
153 152
154/*****************************************************************************/ 153/*****************************************************************************/
@@ -169,7 +168,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 168
170 /* pick out size-changing events */ 169 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 170 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 171 loff_t size = inode->i_size;
172
173 if (ia->ia_size != size) { 173 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 174 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 175 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +182,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 182 }
183 } 183 }
184 184
185 ret = inode_setattr(inode, ia); 185 setattr_copy(inode, ia);
186 out: 186 out:
187 ia->ia_valid = old_ia_valid; 187 ia->ia_valid = old_ia_valid;
188 return ret; 188 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c94853473ca9..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
214 return 0; 206 return 0;
215} 207}
216 208
217static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
218{ 210{
219 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
220 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
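
Both ramfs_mknod() above and new_inode_init() further down (in reiserfs/namei.c) delete the same open-coded setgid-directory logic in favour of inode_init_owner(). A self-contained sketch of what that helper does, using POSIX types and getuid()/getgid() as stand-ins for the kernel's types and current_fsuid()/current_fsgid():

#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

struct inode { mode_t i_mode; uid_t i_uid; gid_t i_gid; };

static void inode_init_owner(struct inode *inode, const struct inode *dir,
			     mode_t mode)
{
	inode->i_uid = getuid();		/* kernel: current_fsuid() */
	if (dir && (dir->i_mode & S_ISGID)) {
		inode->i_gid = dir->i_gid;	/* inherit the directory's group */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* new subdirectories stay setgid */
	} else {
		inode->i_gid = getgid();	/* kernel: current_fsgid() */
	}
	inode->i_mode = mode;
}

Passing dir == NULL, as ramfs_fill_super() now does for the root inode, simply falls through to the caller's own fsuid/fsgid.
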
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..74e36586e4d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek useable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
@@ -294,7 +311,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
294 else 311 else
295 ret = do_sync_read(file, buf, count, pos); 312 ret = do_sync_read(file, buf, count, pos);
296 if (ret > 0) { 313 if (ret > 0) {
297 fsnotify_access(file->f_path.dentry); 314 fsnotify_access(file);
298 add_rchar(current, ret); 315 add_rchar(current, ret);
299 } 316 }
300 inc_syscr(current); 317 inc_syscr(current);
@@ -350,7 +367,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
350 else 367 else
351 ret = do_sync_write(file, buf, count, pos); 368 ret = do_sync_write(file, buf, count, pos);
352 if (ret > 0) { 369 if (ret > 0) {
353 fsnotify_modify(file->f_path.dentry); 370 fsnotify_modify(file);
354 add_wchar(current, ret); 371 add_wchar(current, ret);
355 } 372 }
356 inc_syscw(current); 373 inc_syscw(current);
@@ -658,9 +675,9 @@ out:
658 kfree(iov); 675 kfree(iov);
659 if ((ret + (type == READ)) > 0) { 676 if ((ret + (type == READ)) > 0) {
660 if (type == READ) 677 if (type == READ)
661 fsnotify_access(file->f_path.dentry); 678 fsnotify_access(file);
662 else 679 else
663 fsnotify_modify(file->f_path.dentry); 680 fsnotify_modify(file);
664 } 681 }
665 return ret; 682 return ret;
666} 683}
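
The new noop_llseek() differs from the existing no_llseek() only in its return value, but the distinction matters to userspace: one reports success without moving the position, the other refuses the seek outright. A stand-alone restatement of the two semantics (the _demo suffix marks these as illustrations, not the kernel functions):

/* Seek "succeeds" but the file position never changes. */
long long noop_llseek_demo(long long *f_pos, long long offset, int origin)
{
	(void)offset;
	(void)origin;
	return *f_pos;		/* caller sees its current position */
}

/* Seek is not supported at all (pipes, sockets, ...). */
long long no_llseek_demo(long long *f_pos, long long offset, int origin)
{
	(void)f_pos;
	(void)offset;
	(void)origin;
	return -29;		/* -ESPIPE */
}
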
diff --git a/fs/readdir.c b/fs/readdir.c
index 7723401f8d8b..356f71528ad6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/stddef.h>
7#include <linux/kernel.h> 8#include <linux/kernel.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/time.h> 10#include <linux/time.h>
@@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir);
54 * anyway. Thus the special "fillonedir()" function for that 55 * anyway. Thus the special "fillonedir()" function for that
55 * case (the low-level handlers don't need to care about this). 56 * case (the low-level handlers don't need to care about this).
56 */ 57 */
57#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
58 58
59#ifdef __ARCH_WANT_OLD_READDIR 59#ifdef __ARCH_WANT_OLD_READDIR
60 60
@@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
152 struct linux_dirent __user * dirent; 152 struct linux_dirent __user * dirent;
153 struct getdents_callback * buf = (struct getdents_callback *) __buf; 153 struct getdents_callback * buf = (struct getdents_callback *) __buf;
154 unsigned long d_ino; 154 unsigned long d_ino;
155 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long)); 155 int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
156 sizeof(long));
156 157
157 buf->error = -EINVAL; /* only used if we fail.. */ 158 buf->error = -EINVAL; /* only used if we fail.. */
158 if (reclen > buf->count) 159 if (reclen > buf->count)
@@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
237{ 238{
238 struct linux_dirent64 __user *dirent; 239 struct linux_dirent64 __user *dirent;
239 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; 240 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
240 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64)); 241 int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
242 sizeof(u64));
241 243
242 buf->error = -EINVAL; /* only used if we fail.. */ 244 buf->error = -EINVAL; /* only used if we fail.. */
243 if (reclen > buf->count) 245 if (reclen > buf->count)
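
The readdir.c change swaps a macro that computed the name offset by subtracting a possibly-invalid __user pointer from its own member -- technically undefined behaviour -- for the standard offsetof(), which needs no object at all. A compilable illustration of the record-length arithmetic (the struct layout matches the exported linux_dirent64 ABI; ALIGN is redefined locally for the sketch):

#include <stddef.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct linux_dirent64 {
	unsigned long long	d_ino;
	long long		d_off;
	unsigned short		d_reclen;
	unsigned char		d_type;
	char			d_name[];	/* flexible array member */
};

int main(void)
{
	size_t namlen = sizeof("hello_world") - 1;
	/* +1 for the terminating NUL, rounded up to a u64 boundary */
	size_t reclen = ALIGN(offsetof(struct linux_dirent64, d_name)
			      + namlen + 1, sizeof(unsigned long long));

	printf("reclen for an 11-byte name: %zu\n", reclen);	/* prints 32 */
	return 0;
}
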
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
16static int reiserfs_readdir(struct file *, void *, filldir_t); 16static int reiserfs_readdir(struct file *, void *, filldir_t);
17static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 17static int reiserfs_dir_fsync(struct file *filp, int datasync);
18 int datasync);
19 18
20const struct file_operations reiserfs_dir_operations = { 19const struct file_operations reiserfs_dir_operations = {
20 .llseek = generic_file_llseek,
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
23 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
27#endif 27#endif
28}; 28};
29 29
30static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 30static int reiserfs_dir_fsync(struct file *filp, int datasync)
31 int datasync)
32{ 31{
33 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
34 int err; 33 int err;
35 reiserfs_write_lock(inode->i_sb); 34 reiserfs_write_lock(inode->i_sb);
36 err = reiserfs_commit_for_inode(inode); 35 err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..6846371498b6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,20 +38,24 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
38 38
39 BUG_ON(!S_ISREG(inode->i_mode)); 39 BUG_ON(!S_ISREG(inode->i_mode));
40 40
41 if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
42 return 0;
43
44 mutex_lock(&(REISERFS_I(inode)->tailpack));
45
46 if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
47 mutex_unlock(&(REISERFS_I(inode)->tailpack));
48 return 0;
49 }
50
41 /* fast out for when nothing needs to be done */ 51 /* fast out for when nothing needs to be done */
42 if ((atomic_read(&inode->i_count) > 1 || 52 if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
43 !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
44 !tail_has_to_be_packed(inode)) && 53 !tail_has_to_be_packed(inode)) &&
45 REISERFS_I(inode)->i_prealloc_count <= 0) { 54 REISERFS_I(inode)->i_prealloc_count <= 0) {
55 mutex_unlock(&(REISERFS_I(inode)->tailpack));
46 return 0; 56 return 0;
47 } 57 }
48 58
49 mutex_lock(&inode->i_mutex);
50
51 mutex_lock(&(REISERFS_I(inode)->i_mmap));
52 if (REISERFS_I(inode)->i_flags & i_ever_mapped)
53 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
54
55 reiserfs_write_lock(inode->i_sb); 59 reiserfs_write_lock(inode->i_sb);
56 /* freeing preallocation only involves relogging blocks that 60 /* freeing preallocation only involves relogging blocks that
57 * are already in the current transaction. preallocation gets 61 * are already in the current transaction. preallocation gets
@@ -94,9 +98,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
94 if (!err) 98 if (!err)
95 err = jbegin_failure; 99 err = jbegin_failure;
96 100
97 if (!err && atomic_read(&inode->i_count) <= 1 && 101 if (!err &&
98 (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && 102 (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
99 tail_has_to_be_packed(inode)) { 103 tail_has_to_be_packed(inode)) {
104
100 /* if regular file is released by last holder and it has been 105 /* if regular file is released by last holder and it has been
101 appended (we append by unformatted node only) or its direct 106 appended (we append by unformatted node only) or its direct
102 item(s) had to be converted, then it may have to be 107 item(s) had to be converted, then it may have to be
@@ -104,27 +109,28 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
104 err = reiserfs_truncate_file(inode, 0); 109 err = reiserfs_truncate_file(inode, 0);
105 } 110 }
106 out: 111 out:
107 mutex_unlock(&(REISERFS_I(inode)->i_mmap));
108 mutex_unlock(&inode->i_mutex);
109 reiserfs_write_unlock(inode->i_sb); 112 reiserfs_write_unlock(inode->i_sb);
113 mutex_unlock(&(REISERFS_I(inode)->tailpack));
110 return err; 114 return err;
111} 115}
112 116
113static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma) 117static int reiserfs_file_open(struct inode *inode, struct file *file)
114{ 118{
115 struct inode *inode; 119 int err = dquot_file_open(inode, file);
116 120 if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
117 inode = file->f_path.dentry->d_inode; 121 /* somebody might be tailpacking on final close; wait for it */
118 mutex_lock(&(REISERFS_I(inode)->i_mmap)); 122 mutex_lock(&(REISERFS_I(inode)->tailpack));
119 REISERFS_I(inode)->i_flags |= i_ever_mapped; 123 atomic_inc(&REISERFS_I(inode)->openers);
120 mutex_unlock(&(REISERFS_I(inode)->i_mmap)); 124 mutex_unlock(&(REISERFS_I(inode)->tailpack));
121 125 }
122 return generic_file_mmap(file, vma); 126 return err;
123} 127}
124 128
125static void reiserfs_vfs_truncate_file(struct inode *inode) 129static void reiserfs_vfs_truncate_file(struct inode *inode)
126{ 130{
131 mutex_lock(&(REISERFS_I(inode)->tailpack));
127 reiserfs_truncate_file(inode, 1); 132 reiserfs_truncate_file(inode, 1);
133 mutex_unlock(&(REISERFS_I(inode)->tailpack));
128} 134}
129 135
130/* Sync a reiserfs file. */ 136/* Sync a reiserfs file. */
@@ -134,10 +140,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 140 * be removed...
135 */ 141 */
136 142
137static int reiserfs_sync_file(struct file *filp, 143static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 144{
140 struct inode *inode = dentry->d_inode; 145 struct inode *inode = filp->f_mapping->host;
141 int err; 146 int err;
142 int barrier_done; 147 int barrier_done;
143 148
@@ -147,7 +152,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 152 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 153 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 154 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 155 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
156 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 157 if (barrier_done < 0)
152 return barrier_done; 158 return barrier_done;
153 return (err < 0) ? -EIO : 0; 159 return (err < 0) ? -EIO : 0;
@@ -288,8 +294,8 @@ const struct file_operations reiserfs_file_operations = {
288#ifdef CONFIG_COMPAT 294#ifdef CONFIG_COMPAT
289 .compat_ioctl = reiserfs_compat_ioctl, 295 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 296#endif
291 .mmap = reiserfs_file_mmap, 297 .mmap = generic_file_mmap,
292 .open = dquot_file_open, 298 .open = reiserfs_file_open,
293 .release = reiserfs_file_release, 299 .release = reiserfs_file_release,
294 .fsync = reiserfs_sync_file, 300 .fsync = reiserfs_sync_file,
295 .aio_read = generic_file_aio_read, 301 .aio_read = generic_file_aio_read,
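
The reiserfs_file_open()/reiserfs_file_release() rework above replaces the old i_mmap mutex and i_count heuristics with an atomic openers count plus a tailpack mutex: the last closer does its tail-packing under the mutex, and a racing open that sees the count at zero must wait for that close to finish. A userspace sketch of the same protocol using C11 atomics and pthreads (ifile and its fields are stand-ins, not the kernel types):

#include <pthread.h>
#include <stdatomic.h>

struct ifile {
	atomic_int	openers;
	pthread_mutex_t	tailpack;
};

static void ifile_open(struct ifile *f)
{
	int cur = atomic_load(&f->openers);

	/* atomic_inc_not_zero(): bump the count unless it is zero */
	while (cur != 0)
		if (atomic_compare_exchange_weak(&f->openers, &cur, cur + 1))
			return;

	/* somebody might be tailpacking on final close; wait for it */
	pthread_mutex_lock(&f->tailpack);
	atomic_fetch_add(&f->openers, 1);
	pthread_mutex_unlock(&f->tailpack);
}

static void ifile_release(struct ifile *f)
{
	int cur = atomic_load(&f->openers);

	/* atomic_add_unless(..., -1, 1): fast out unless we are the last */
	while (cur != 1)
		if (atomic_compare_exchange_weak(&f->openers, &cur, cur - 1))
			return;

	pthread_mutex_lock(&f->tailpack);
	if (atomic_fetch_sub(&f->openers, 1) != 1) {
		/* lost a race with a concurrent open; not the last after all */
		pthread_mutex_unlock(&f->tailpack);
		return;
	}
	/* last closer: tail packing / truncation would happen here */
	pthread_mutex_unlock(&f->tailpack);
}
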
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dc2c65e04853..caa758377d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -25,7 +25,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
25int reiserfs_prepare_write(struct file *f, struct page *page, 25int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to); 26 unsigned from, unsigned to);
27 27
28void reiserfs_delete_inode(struct inode *inode) 28void reiserfs_evict_inode(struct inode *inode)
29{ 29{
30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */ 30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 int jbegin_count = 31 int jbegin_count =
@@ -35,10 +35,12 @@ void reiserfs_delete_inode(struct inode *inode)
35 int depth; 35 int depth;
36 int err; 36 int err;
37 37
38 if (!is_bad_inode(inode)) 38 if (!inode->i_nlink && !is_bad_inode(inode))
39 dquot_initialize(inode); 39 dquot_initialize(inode);
40 40
41 truncate_inode_pages(&inode->i_data, 0); 41 truncate_inode_pages(&inode->i_data, 0);
42 if (inode->i_nlink)
43 goto no_delete;
42 44
43 depth = reiserfs_write_lock_once(inode->i_sb); 45 depth = reiserfs_write_lock_once(inode->i_sb);
44 46
@@ -77,9 +79,15 @@ void reiserfs_delete_inode(struct inode *inode)
77 ; 79 ;
78 } 80 }
79 out: 81 out:
80 clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ 82 end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */
83 dquot_drop(inode);
81 inode->i_blocks = 0; 84 inode->i_blocks = 0;
82 reiserfs_write_unlock_once(inode->i_sb, depth); 85 reiserfs_write_unlock_once(inode->i_sb, depth);
86 return;
87
88no_delete:
89 end_writeback(inode);
90 dquot_drop(inode);
83} 91}
84 92
85static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, 93static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -1138,7 +1146,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
1138 REISERFS_I(inode)->i_prealloc_count = 0; 1146 REISERFS_I(inode)->i_prealloc_count = 0;
1139 REISERFS_I(inode)->i_trans_id = 0; 1147 REISERFS_I(inode)->i_trans_id = 0;
1140 REISERFS_I(inode)->i_jl = NULL; 1148 REISERFS_I(inode)->i_jl = NULL;
1141 mutex_init(&(REISERFS_I(inode)->i_mmap));
1142 reiserfs_init_xattr_rwsem(inode); 1149 reiserfs_init_xattr_rwsem(inode);
1143 1150
1144 if (stat_data_v1(ih)) { 1151 if (stat_data_v1(ih)) {
@@ -1221,7 +1228,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
1221 inode_set_bytes(inode, 1228 inode_set_bytes(inode,
1222 to_real_used_space(inode, inode->i_blocks, 1229 to_real_used_space(inode, inode->i_blocks,
1223 SD_V2_SIZE)); 1230 SD_V2_SIZE));
1224 /* read persistent inode attributes from sd and initalise 1231 /* read persistent inode attributes from sd and initialise
1225 generic inode flags from them */ 1232 generic inode flags from them */
1226 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); 1233 REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
1227 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); 1234 sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
@@ -1841,7 +1848,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1841 REISERFS_I(inode)->i_attrs = 1848 REISERFS_I(inode)->i_attrs =
1842 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1849 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1843 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1850 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1844 mutex_init(&(REISERFS_I(inode)->i_mmap));
1845 reiserfs_init_xattr_rwsem(inode); 1851 reiserfs_init_xattr_rwsem(inode);
1846 1852
1847 /* key to search for correct place for new stat data */ 1853 /* key to search for correct place for new stat data */
@@ -2587,8 +2593,7 @@ static int reiserfs_write_begin(struct file *file,
2587 old_ref = th->t_refcount; 2593 old_ref = th->t_refcount;
2588 th->t_refcount++; 2594 th->t_refcount++;
2589 } 2595 }
2590 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2596 ret = __block_write_begin(page, pos, len, reiserfs_get_block);
2591 reiserfs_get_block);
2592 if (ret && reiserfs_transaction_running(inode->i_sb)) { 2597 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2593 struct reiserfs_transaction_handle *th = current->journal_info; 2598 struct reiserfs_transaction_handle *th = current->journal_info;
2594 /* this gets a little ugly. If reiserfs_get_block returned an 2599 /* this gets a little ugly. If reiserfs_get_block returned an
@@ -3059,10 +3064,25 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3059{ 3064{
3060 struct file *file = iocb->ki_filp; 3065 struct file *file = iocb->ki_filp;
3061 struct inode *inode = file->f_mapping->host; 3066 struct inode *inode = file->f_mapping->host;
3067 ssize_t ret;
3062 3068
3063 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3069 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3064 offset, nr_segs, 3070 offset, nr_segs,
3065 reiserfs_get_blocks_direct_io, NULL); 3071 reiserfs_get_blocks_direct_io, NULL);
3072
3073 /*
3074 * In case of error extending write may have instantiated a few
3075 * blocks outside i_size. Trim these off again.
3076 */
3077 if (unlikely((rw & WRITE) && ret < 0)) {
3078 loff_t isize = i_size_read(inode);
3079 loff_t end = offset + iov_length(iov, nr_segs);
3080
3081 if (end > isize)
3082 vmtruncate(inode, isize);
3083 }
3084
3085 return ret;
3066} 3086}
3067 3087
3068int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) 3088int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3072,13 +3092,18 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3072 int depth; 3092 int depth;
3073 int error; 3093 int error;
3074 3094
3095 error = inode_change_ok(inode, attr);
3096 if (error)
3097 return error;
3098
3075 /* must be turned off for recursive notify_change calls */ 3099 /* must be turned off for recursive notify_change calls */
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3100 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3101
3078 depth = reiserfs_write_lock_once(inode->i_sb); 3102 depth = reiserfs_write_lock_once(inode->i_sb);
3079 if (attr->ia_valid & ATTR_SIZE) { 3103 if (is_quota_modification(inode, attr))
3080 dquot_initialize(inode); 3104 dquot_initialize(inode);
3081 3105
3106 if (attr->ia_valid & ATTR_SIZE) {
3082 /* version 2 items will be caught by the s_maxbytes check 3107 /* version 2 items will be caught by the s_maxbytes check
3083 ** done for us in vmtruncate 3108 ** done for us in vmtruncate
3084 */ 3109 */
@@ -3120,56 +3145,59 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3120 goto out; 3145 goto out;
3121 } 3146 }
3122 3147
3123 error = inode_change_ok(inode, attr); 3148 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
3124 if (!error) { 3149 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
3125 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 3150 struct reiserfs_transaction_handle th;
3126 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 3151 int jbegin_count =
3127 error = reiserfs_chown_xattrs(inode, attr); 3152 2 *
3153 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
3154 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
3155 2;
3128 3156
3129 if (!error) { 3157 error = reiserfs_chown_xattrs(inode, attr);
3130 struct reiserfs_transaction_handle th; 3158
3131 int jbegin_count = 3159 if (error)
3132 2 * 3160 return error;
3133 (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + 3161
 3134 REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + 3162 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */
3135 2; 3163 error = journal_begin(&th, inode->i_sb, jbegin_count);
3136 3164 if (error)
 3137 /* (user+group)*(old+new) structure - we count quota info and inode write (sb, inode) */ 3165 goto out;
3138 error = 3166 error = dquot_transfer(inode, attr);
3139 journal_begin(&th, inode->i_sb, 3167 if (error) {
3140 jbegin_count); 3168 journal_end(&th, inode->i_sb, jbegin_count);
3141 if (error) 3169 goto out;
3142 goto out;
3143 error = dquot_transfer(inode, attr);
3144 if (error) {
3145 journal_end(&th, inode->i_sb,
3146 jbegin_count);
3147 goto out;
3148 }
3149 /* Update corresponding info in inode so that everything is in
3150 * one transaction */
3151 if (attr->ia_valid & ATTR_UID)
3152 inode->i_uid = attr->ia_uid;
3153 if (attr->ia_valid & ATTR_GID)
3154 inode->i_gid = attr->ia_gid;
3155 mark_inode_dirty(inode);
3156 error =
3157 journal_end(&th, inode->i_sb, jbegin_count);
3158 }
3159 }
3160 if (!error) {
3161 /*
3162 * Relax the lock here, as it might truncate the
3163 * inode pages and wait for inode pages locks.
3164 * To release such page lock, the owner needs the
3165 * reiserfs lock
3166 */
3167 reiserfs_write_unlock_once(inode->i_sb, depth);
3168 error = inode_setattr(inode, attr);
3169 depth = reiserfs_write_lock_once(inode->i_sb);
3170 } 3170 }
3171
3172 /* Update corresponding info in inode so that everything is in
3173 * one transaction */
3174 if (attr->ia_valid & ATTR_UID)
3175 inode->i_uid = attr->ia_uid;
3176 if (attr->ia_valid & ATTR_GID)
3177 inode->i_gid = attr->ia_gid;
3178 mark_inode_dirty(inode);
3179 error = journal_end(&th, inode->i_sb, jbegin_count);
3180 if (error)
3181 goto out;
3171 } 3182 }
3172 3183
3184 /*
3185 * Relax the lock here, as it might truncate the
3186 * inode pages and wait for inode pages locks.
3187 * To release such page lock, the owner needs the
3188 * reiserfs lock
3189 */
3190 reiserfs_write_unlock_once(inode->i_sb, depth);
3191 if ((attr->ia_valid & ATTR_SIZE) &&
3192 attr->ia_size != i_size_read(inode))
3193 error = vmtruncate(inode, attr->ia_size);
3194
3195 if (!error) {
3196 setattr_copy(inode, attr);
3197 mark_inode_dirty(inode);
3198 }
3199 depth = reiserfs_write_lock_once(inode->i_sb);
3200
3173 if (!error && reiserfs_posixacl(inode->i_sb)) { 3201 if (!error && reiserfs_posixacl(inode->i_sb)) {
3174 if (attr->ia_valid & ATTR_MODE) 3202 if (attr->ia_valid & ATTR_MODE)
3175 error = reiserfs_acl_chmod(inode); 3203 error = reiserfs_acl_chmod(inode);
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
170int reiserfs_unpack(struct inode *inode, struct file *filp) 170int reiserfs_unpack(struct inode *inode, struct file *filp)
171{ 171{
172 int retval = 0; 172 int retval = 0;
173 int depth;
173 int index; 174 int index;
174 struct page *page; 175 struct page *page;
175 struct address_space *mapping; 176 struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
188 /* we need to make sure nobody is changing the file size beneath 189 /* we need to make sure nobody is changing the file size beneath
189 ** us 190 ** us
190 */ 191 */
191 mutex_lock(&inode->i_mutex); 192 reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
192 reiserfs_write_lock(inode->i_sb); 193 depth = reiserfs_write_lock_once(inode->i_sb);
193 194
194 write_from = inode->i_size & (blocksize - 1); 195 write_from = inode->i_size & (blocksize - 1);
195 /* if we are on a block boundary, we are already unpacked. */ 196 /* if we are on a block boundary, we are already unpacked. */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
224 225
225 out: 226 out:
226 mutex_unlock(&inode->i_mutex); 227 mutex_unlock(&inode->i_mutex);
227 reiserfs_write_unlock(inode->i_sb); 228 reiserfs_write_unlock_once(inode->i_sb, depth);
228 return retval; 229 return retval;
229} 230}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e7..812e2c05aa29 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -983,7 +983,6 @@ static int flush_older_commits(struct super_block *s,
983 983
984static int reiserfs_async_progress_wait(struct super_block *s) 984static int reiserfs_async_progress_wait(struct super_block *s)
985{ 985{
986 DEFINE_WAIT(wait);
987 struct reiserfs_journal *j = SB_JOURNAL(s); 986 struct reiserfs_journal *j = SB_JOURNAL(s);
988 987
989 if (atomic_read(&j->j_async_throttle)) { 988 if (atomic_read(&j->j_async_throttle)) {
@@ -2312,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
2312 /* flush out the real blocks */ 2311 /* flush out the real blocks */
2313 for (i = 0; i < get_desc_trans_len(desc); i++) { 2312 for (i = 0; i < get_desc_trans_len(desc); i++) {
2314 set_buffer_dirty(real_blocks[i]); 2313 set_buffer_dirty(real_blocks[i]);
2315 ll_rw_block(SWRITE, 1, real_blocks + i); 2314 write_dirty_buffer(real_blocks[i], WRITE);
2316 } 2315 }
2317 for (i = 0; i < get_desc_trans_len(desc); i++) { 2316 for (i = 0; i < get_desc_trans_len(desc); i++) {
2318 wait_on_buffer(real_blocks[i]); 2317 wait_on_buffer(real_blocks[i]);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..e15ff612002d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
158#ifdef CONFIG_QUOTA 158#ifdef CONFIG_QUOTA
159 int i; 159 int i;
160 int ms_active_set; 160 int ms_active_set;
161 int quota_enabled[MAXQUOTAS];
161#endif 162#endif
162 163
163 /* compose key to look for "save" links */ 164 /* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
179 } 180 }
180 /* Turn on quotas so that they are updated correctly */ 181 /* Turn on quotas so that they are updated correctly */
181 for (i = 0; i < MAXQUOTAS; i++) { 182 for (i = 0; i < MAXQUOTAS; i++) {
183 quota_enabled[i] = 1;
182 if (REISERFS_SB(s)->s_qf_names[i]) { 184 if (REISERFS_SB(s)->s_qf_names[i]) {
183 int ret = reiserfs_quota_on_mount(s, i); 185 int ret;
186
187 if (sb_has_quota_active(s, i)) {
188 quota_enabled[i] = 0;
189 continue;
190 }
191 ret = reiserfs_quota_on_mount(s, i);
184 if (ret < 0) 192 if (ret < 0)
185 reiserfs_warning(s, "reiserfs-2500", 193 reiserfs_warning(s, "reiserfs-2500",
186 "cannot turn on journaled " 194 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
304#ifdef CONFIG_QUOTA 312#ifdef CONFIG_QUOTA
305 /* Turn quotas off */ 313 /* Turn quotas off */
306 for (i = 0; i < MAXQUOTAS; i++) { 314 for (i = 0; i < MAXQUOTAS; i++) {
307 if (sb_dqopt(s)->files[i]) 315 if (sb_dqopt(s)->files[i] && quota_enabled[i])
308 vfs_quota_off(s, i, 0); 316 dquot_quota_off(s, i);
309 } 317 }
310 if (ms_active_set) 318 if (ms_active_set)
311 /* Restore the flag back */ 319 /* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
466 struct reiserfs_transaction_handle th; 474 struct reiserfs_transaction_handle th;
467 th.t_trans_id = 0; 475 th.t_trans_id = 0;
468 476
477 dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
478
469 reiserfs_write_lock(s); 479 reiserfs_write_lock(s);
470 480
471 if (s->s_dirt) 481 if (s->s_dirt)
@@ -515,6 +525,8 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb)
515 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); 525 kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
516 if (!ei) 526 if (!ei)
517 return NULL; 527 return NULL;
528 atomic_set(&ei->openers, 0);
529 mutex_init(&ei->tailpack);
518 return &ei->vfs_inode; 530 return &ei->vfs_inode;
519} 531}
520 532
@@ -579,11 +591,6 @@ out:
579 reiserfs_write_unlock_once(inode->i_sb, lock_depth); 591 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
580} 592}
581 593
582static void reiserfs_clear_inode(struct inode *inode)
583{
584 dquot_drop(inode);
585}
586
587#ifdef CONFIG_QUOTA 594#ifdef CONFIG_QUOTA
588static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 595static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
589 size_t, loff_t); 596 size_t, loff_t);
@@ -596,8 +603,7 @@ static const struct super_operations reiserfs_sops = {
596 .destroy_inode = reiserfs_destroy_inode, 603 .destroy_inode = reiserfs_destroy_inode,
597 .write_inode = reiserfs_write_inode, 604 .write_inode = reiserfs_write_inode,
598 .dirty_inode = reiserfs_dirty_inode, 605 .dirty_inode = reiserfs_dirty_inode,
599 .clear_inode = reiserfs_clear_inode, 606 .evict_inode = reiserfs_evict_inode,
600 .delete_inode = reiserfs_delete_inode,
601 .put_super = reiserfs_put_super, 607 .put_super = reiserfs_put_super,
602 .write_super = reiserfs_write_super, 608 .write_super = reiserfs_write_super,
603 .sync_fs = reiserfs_sync_fs, 609 .sync_fs = reiserfs_sync_fs,
@@ -620,7 +626,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
620static int reiserfs_release_dquot(struct dquot *); 626static int reiserfs_release_dquot(struct dquot *);
621static int reiserfs_mark_dquot_dirty(struct dquot *); 627static int reiserfs_mark_dquot_dirty(struct dquot *);
622static int reiserfs_write_info(struct super_block *, int); 628static int reiserfs_write_info(struct super_block *, int);
623static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 629static int reiserfs_quota_on(struct super_block *, int, int, char *);
624 630
625static const struct dquot_operations reiserfs_quota_operations = { 631static const struct dquot_operations reiserfs_quota_operations = {
626 .write_dquot = reiserfs_write_dquot, 632 .write_dquot = reiserfs_write_dquot,
@@ -634,12 +640,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
634 640
635static const struct quotactl_ops reiserfs_qctl_operations = { 641static const struct quotactl_ops reiserfs_qctl_operations = {
636 .quota_on = reiserfs_quota_on, 642 .quota_on = reiserfs_quota_on,
637 .quota_off = vfs_quota_off, 643 .quota_off = dquot_quota_off,
638 .quota_sync = vfs_quota_sync, 644 .quota_sync = dquot_quota_sync,
639 .get_info = vfs_get_dqinfo, 645 .get_info = dquot_get_dqinfo,
640 .set_info = vfs_set_dqinfo, 646 .set_info = dquot_set_dqinfo,
641 .get_dqblk = vfs_get_dqblk, 647 .get_dqblk = dquot_get_dqblk,
642 .set_dqblk = vfs_set_dqblk, 648 .set_dqblk = dquot_set_dqblk,
643}; 649};
644#endif 650#endif
645 651
@@ -1242,6 +1248,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1242 if (s->s_flags & MS_RDONLY) 1248 if (s->s_flags & MS_RDONLY)
1243 /* it is read-only already */ 1249 /* it is read-only already */
1244 goto out_ok; 1250 goto out_ok;
1251
1252 err = dquot_suspend(s, -1);
1253 if (err < 0)
1254 goto out_err;
1255
1245 /* try to remount file system with read-only permissions */ 1256 /* try to remount file system with read-only permissions */
1246 if (sb_umount_state(rs) == REISERFS_VALID_FS 1257 if (sb_umount_state(rs) == REISERFS_VALID_FS
1247 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { 1258 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1306,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 s->s_dirt = 0; 1306 s->s_dirt = 0;
1296 1307
1297 if (!(*mount_flags & MS_RDONLY)) { 1308 if (!(*mount_flags & MS_RDONLY)) {
1309 dquot_resume(s, -1);
1298 finish_unfinished(s); 1310 finish_unfinished(s);
1299 reiserfs_xattr_init(s, *mount_flags); 1311 reiserfs_xattr_init(s, *mount_flags);
1300 } 1312 }
@@ -2022,15 +2034,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2022 */ 2034 */
2023static int reiserfs_quota_on_mount(struct super_block *sb, int type) 2035static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2024{ 2036{
2025 return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], 2037 return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
2026 REISERFS_SB(sb)->s_jquota_fmt, type); 2038 REISERFS_SB(sb)->s_jquota_fmt, type);
2027} 2039}
2028 2040
2029/* 2041/*
2030 * Standard function to be called on quota_on 2042 * Standard function to be called on quota_on
2031 */ 2043 */
2032static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2044static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2033 char *name, int remount) 2045 char *name)
2034{ 2046{
2035 int err; 2047 int err;
2036 struct path path; 2048 struct path path;
@@ -2039,9 +2051,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2039 2051
2040 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2052 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2041 return -EINVAL; 2053 return -EINVAL;
2042 /* No more checks needed? Path and format_id are bogus anyway... */ 2054
2043 if (remount)
2044 return vfs_quota_on(sb, type, format_id, name, 1);
2045 err = kern_path(name, LOOKUP_FOLLOW, &path); 2055 err = kern_path(name, LOOKUP_FOLLOW, &path);
2046 if (err) 2056 if (err)
2047 return err; 2057 return err;
@@ -2085,7 +2095,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2085 if (err) 2095 if (err)
2086 goto out; 2096 goto out;
2087 } 2097 }
2088 err = vfs_quota_on_path(sb, type, format_id, &path); 2098 err = dquot_quota_on_path(sb, type, format_id, &path);
2089out: 2099out:
2090 path_put(&path); 2100 path_put(&path);
2091 return err; 2101 return err;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c0..1c5a6add779d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); 88 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); 89 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 90 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
91 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
91 break; 92 break;
92 case __SI_POLL: 93 case __SI_POLL:
93 err |= __put_user(kinfo->si_band, &uinfo->ssi_band); 94 err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
111 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); 112 err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
112 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); 113 err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
113 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); 114 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
115 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
114 break; 116 break;
115 default: 117 default:
116 /* 118 /*
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
37 37
38const struct file_operations smb_dir_operations = 38const struct file_operations smb_dir_operations =
39{ 39{
40 .llseek = generic_file_llseek,
40 .read = generic_read_dir, 41 .read = generic_read_dir,
41 .readdir = smb_readdir, 42 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 43 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 44 .open = smb_dir_open,
44}; 45};
45 46
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
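smb_fsync() is adjusted for the ->fsync() prototype change in this merge, which drops the dentry argument; implementations that still need the dentry recover it from the file, exactly as above. The pattern as a minimal sketch (example_fsync is a hypothetical name):

    /* Sketch of a post-change ->fsync(): the dentry is no longer
     * passed in, so derive it from the struct file when needed. */
    static int example_fsync(struct file *file, int datasync)
    {
            struct dentry *dentry = file->f_path.dentry;
            struct inode *inode = dentry->d_inode;

            /* flush private dirty state here; datasync selects whether
             * timestamp-only updates may be skipped */
            return sync_mapping_buffers(inode->i_mapping);
    }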
@@ -437,7 +438,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 439 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 440 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 441 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 442 .mmap = smb_file_mmap,
442 .open = smb_file_open, 443 .open = smb_file_open,
443 .release = smb_file_release, 444 .release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..450c91941988 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -46,7 +46,7 @@
46 46
47#define SMB_TTL_DEFAULT 1000 47#define SMB_TTL_DEFAULT 1000
48 48
49static void smb_delete_inode(struct inode *); 49static void smb_evict_inode(struct inode *);
50static void smb_put_super(struct super_block *); 50static void smb_put_super(struct super_block *);
51static int smb_statfs(struct dentry *, struct kstatfs *); 51static int smb_statfs(struct dentry *, struct kstatfs *);
52static int smb_show_options(struct seq_file *, struct vfsmount *); 52static int smb_show_options(struct seq_file *, struct vfsmount *);
@@ -102,7 +102,7 @@ static const struct super_operations smb_sops =
102 .alloc_inode = smb_alloc_inode, 102 .alloc_inode = smb_alloc_inode,
103 .destroy_inode = smb_destroy_inode, 103 .destroy_inode = smb_destroy_inode,
104 .drop_inode = generic_delete_inode, 104 .drop_inode = generic_delete_inode,
105 .delete_inode = smb_delete_inode, 105 .evict_inode = smb_evict_inode,
106 .put_super = smb_put_super, 106 .put_super = smb_put_super,
107 .statfs = smb_statfs, 107 .statfs = smb_statfs,
108 .show_options = smb_show_options, 108 .show_options = smb_show_options,
@@ -324,15 +324,15 @@ out:
324 * All blocking cleanup operations need to go here to avoid races. 324 * All blocking cleanup operations need to go here to avoid races.
325 */ 325 */
326static void 326static void
327smb_delete_inode(struct inode *ino) 327smb_evict_inode(struct inode *ino)
328{ 328{
329 DEBUG1("ino=%ld\n", ino->i_ino); 329 DEBUG1("ino=%ld\n", ino->i_ino);
330 truncate_inode_pages(&ino->i_data, 0); 330 truncate_inode_pages(&ino->i_data, 0);
331 end_writeback(ino);
331 lock_kernel(); 332 lock_kernel();
332 if (smb_close(ino)) 333 if (smb_close(ino))
333 PARANOIA("could not close inode %ld\n", ino->i_ino); 334 PARANOIA("could not close inode %ld\n", ino->i_ino);
334 unlock_kernel(); 335 unlock_kernel();
335 clear_inode(ino);
336} 336}
337 337
338static struct option opts[] = { 338static struct option opts[] = {
@@ -714,9 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 truncate_setsize(inode, attr->ia_size);
718 if (error)
719 goto out;
720 refresh = 1; 718 refresh = 1;
721 } 719 }
722 720
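fs/smbfs/inode.c picks up two related VFS conversions from this merge: ->delete_inode() becomes ->evict_inode(), which now truncates pages and calls end_writeback() itself rather than finishing with clear_inode(), and the setattr-driven truncate switches from the failable vmtruncate() to truncate_setsize(), which cannot fail once the server-side ->truncate has succeeded. The resulting eviction shape, as a minimal sketch (example_evict_inode is a hypothetical name):

    /* Sketch of a post-conversion ->evict_inode() for a simple fs. */
    static void example_evict_inode(struct inode *inode)
    {
            truncate_inode_pages(&inode->i_data, 0);
            end_writeback(inode);   /* replaces the old clear_inode() */
            /* filesystem-private teardown (closing handles, etc.) here */
    }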
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
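This is the standard BKL-pushdown conversion: the ->ioctl() method, which was called with the big kernel lock held, becomes ->unlocked_ioctl(), the inode argument is derived from the file, and lock_kernel()/unlock_kernel() are taken explicitly inside the handler until the code is audited for finer-grained locking. The shape of the conversion as a sketch (example_unlocked_ioctl is a hypothetical name):

    static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                                       unsigned long arg)
    {
            long result = -EINVAL;

            lock_kernel();  /* keep the coverage the old ->ioctl() had */
            /* the inode is no longer passed in; when needed:
             *     struct inode *inode = filp->f_path.dentry->d_inode;
             */
            switch (cmd) {
            default:
                    break;
            }
            unlock_kernel();
            return result;
    }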
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..8f1dfaecc8f0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
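The two helpers above exist because pipe capacity is no longer the compile-time PIPE_BUFFERS: splice_pipe_desc users keep PIPE_DEF_BUFFERS-sized arrays on the stack and swap in kmalloc'd ones only when the pipe was resized larger. Every converted caller in this file follows the same bracket; a condensed sketch (example_splice_fill is a hypothetical name, and the .ops/.spd_release fields a real caller must set are omitted):

    static ssize_t example_splice_fill(struct pipe_inode_info *pipe)
    {
            struct page *pages[PIPE_DEF_BUFFERS];
            struct partial_page partial[PIPE_DEF_BUFFERS];
            struct splice_pipe_desc spd = {
                    .pages   = pages,    /* replaced by kmalloc'd arrays if grown */
                    .partial = partial,
            };
            ssize_t ret;

            if (splice_grow_spd(pipe, &spd))
                    return -ENOMEM;

            /* fill spd.pages[]/spd.partial[], never beyond pipe->buffers */
            ret = splice_to_pipe(pipe, &spd);

            splice_shrink_spd(pipe, &spd);  /* frees only if it grew */
            return ret;
    }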
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
272{ 302{
273 struct address_space *mapping = in->f_mapping; 303 struct address_space *mapping = in->f_mapping;
274 unsigned int loff, nr_pages, req_pages; 304 unsigned int loff, nr_pages, req_pages;
275 struct page *pages[PIPE_BUFFERS]; 305 struct page *pages[PIPE_DEF_BUFFERS];
276 struct partial_page partial[PIPE_BUFFERS]; 306 struct partial_page partial[PIPE_DEF_BUFFERS];
277 struct page *page; 307 struct page *page;
278 pgoff_t index, end_index; 308 pgoff_t index, end_index;
279 loff_t isize; 309 loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
286 .spd_release = spd_release_page, 316 .spd_release = spd_release_page,
287 }; 317 };
288 318
319 if (splice_grow_spd(pipe, &spd))
320 return -ENOMEM;
321
289 index = *ppos >> PAGE_CACHE_SHIFT; 322 index = *ppos >> PAGE_CACHE_SHIFT;
290 loff = *ppos & ~PAGE_CACHE_MASK; 323 loff = *ppos & ~PAGE_CACHE_MASK;
291 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 324 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
292 nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); 325 nr_pages = min(req_pages, pipe->buffers);
293 326
294 /* 327 /*
295 * Lookup the (hopefully) full range of pages we need. 328 * Lookup the (hopefully) full range of pages we need.
296 */ 329 */
297 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 330 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
298 index += spd.nr_pages; 331 index += spd.nr_pages;
299 332
300 /* 333 /*
@@ -321,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
321 break; 354 break;
322 355
323 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
324 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
325 if (unlikely(error)) { 358 if (unlikely(error)) {
326 page_cache_release(page); 359 page_cache_release(page);
327 if (error == -EEXIST) 360 if (error == -EEXIST)
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
335 unlock_page(page); 368 unlock_page(page);
336 } 369 }
337 370
338 pages[spd.nr_pages++] = page; 371 spd.pages[spd.nr_pages++] = page;
339 index++; 372 index++;
340 } 373 }
341 374
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
356 * this_len is the max we'll use from this page 389 * this_len is the max we'll use from this page
357 */ 390 */
358 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 391 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
359 page = pages[page_nr]; 392 page = spd.pages[page_nr];
360 393
361 if (PageReadahead(page)) 394 if (PageReadahead(page))
362 page_cache_async_readahead(mapping, &in->f_ra, in, 395 page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -366,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
366 * If the page isn't uptodate, we may need to start io on it 399 * If the page isn't uptodate, we may need to start io on it
367 */ 400 */
368 if (!PageUptodate(page)) { 401 if (!PageUptodate(page)) {
369 /* 402 lock_page(page);
370 * If in nonblock mode then dont block on waiting
371 * for an in-flight io page
372 */
373 if (flags & SPLICE_F_NONBLOCK) {
374 if (!trylock_page(page)) {
375 error = -EAGAIN;
376 break;
377 }
378 } else
379 lock_page(page);
380 403
381 /* 404 /*
382 * Page was truncated, or invalidated by the 405 * Page was truncated, or invalidated by the
@@ -393,8 +416,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
393 error = -ENOMEM; 416 error = -ENOMEM;
394 break; 417 break;
395 } 418 }
396 page_cache_release(pages[page_nr]); 419 page_cache_release(spd.pages[page_nr]);
397 pages[page_nr] = page; 420 spd.pages[page_nr] = page;
398 } 421 }
399 /* 422 /*
400 * page was already under io and is now done, great 423 * page was already under io and is now done, great
@@ -451,8 +474,8 @@ fill_it:
451 len = this_len; 474 len = this_len;
452 } 475 }
453 476
454 partial[page_nr].offset = loff; 477 spd.partial[page_nr].offset = loff;
455 partial[page_nr].len = this_len; 478 spd.partial[page_nr].len = this_len;
456 len -= this_len; 479 len -= this_len;
457 loff = 0; 480 loff = 0;
458 spd.nr_pages++; 481 spd.nr_pages++;
@@ -464,12 +487,13 @@ fill_it:
464 * we got, 'nr_pages' is how many pages are in the map. 487 * we got, 'nr_pages' is how many pages are in the map.
465 */ 488 */
466 while (page_nr < nr_pages) 489 while (page_nr < nr_pages)
467 page_cache_release(pages[page_nr++]); 490 page_cache_release(spd.pages[page_nr++]);
468 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 491 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
469 492
470 if (spd.nr_pages) 493 if (spd.nr_pages)
471 return splice_to_pipe(pipe, &spd); 494 error = splice_to_pipe(pipe, &spd);
472 495
496 splice_shrink_spd(pipe, &spd);
473 return error; 497 return error;
474} 498}
475 499
@@ -560,10 +584,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
560 unsigned int nr_pages; 584 unsigned int nr_pages;
561 unsigned int nr_freed; 585 unsigned int nr_freed;
562 size_t offset; 586 size_t offset;
563 struct page *pages[PIPE_BUFFERS]; 587 struct page *pages[PIPE_DEF_BUFFERS];
564 struct partial_page partial[PIPE_BUFFERS]; 588 struct partial_page partial[PIPE_DEF_BUFFERS];
565 struct iovec vec[PIPE_BUFFERS]; 589 struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
566 pgoff_t index;
567 ssize_t res; 590 ssize_t res;
568 size_t this_len; 591 size_t this_len;
569 int error; 592 int error;
@@ -576,11 +599,21 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
576 .spd_release = spd_release_page, 599 .spd_release = spd_release_page,
577 }; 600 };
578 601
579 index = *ppos >> PAGE_CACHE_SHIFT; 602 if (splice_grow_spd(pipe, &spd))
603 return -ENOMEM;
604
605 res = -ENOMEM;
606 vec = __vec;
607 if (pipe->buffers > PIPE_DEF_BUFFERS) {
608 vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
609 if (!vec)
610 goto shrink_ret;
611 }
612
580 offset = *ppos & ~PAGE_CACHE_MASK; 613 offset = *ppos & ~PAGE_CACHE_MASK;
581 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 614 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
582 615
583 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { 616 for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
584 struct page *page; 617 struct page *page;
585 618
586 page = alloc_page(GFP_USER); 619 page = alloc_page(GFP_USER);
@@ -591,7 +624,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
591 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); 624 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
592 vec[i].iov_base = (void __user *) page_address(page); 625 vec[i].iov_base = (void __user *) page_address(page);
593 vec[i].iov_len = this_len; 626 vec[i].iov_len = this_len;
594 pages[i] = page; 627 spd.pages[i] = page;
595 spd.nr_pages++; 628 spd.nr_pages++;
596 len -= this_len; 629 len -= this_len;
597 offset = 0; 630 offset = 0;
@@ -610,11 +643,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
610 nr_freed = 0; 643 nr_freed = 0;
611 for (i = 0; i < spd.nr_pages; i++) { 644 for (i = 0; i < spd.nr_pages; i++) {
612 this_len = min_t(size_t, vec[i].iov_len, res); 645 this_len = min_t(size_t, vec[i].iov_len, res);
613 partial[i].offset = 0; 646 spd.partial[i].offset = 0;
614 partial[i].len = this_len; 647 spd.partial[i].len = this_len;
615 if (!this_len) { 648 if (!this_len) {
616 __free_page(pages[i]); 649 __free_page(spd.pages[i]);
617 pages[i] = NULL; 650 spd.pages[i] = NULL;
618 nr_freed++; 651 nr_freed++;
619 } 652 }
620 res -= this_len; 653 res -= this_len;
@@ -625,13 +658,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
625 if (res > 0) 658 if (res > 0)
626 *ppos += res; 659 *ppos += res;
627 660
661shrink_ret:
662 if (vec != __vec)
663 kfree(vec);
664 splice_shrink_spd(pipe, &spd);
628 return res; 665 return res;
629 666
630err: 667err:
631 for (i = 0; i < spd.nr_pages; i++) 668 for (i = 0; i < spd.nr_pages; i++)
632 __free_page(pages[i]); 669 __free_page(spd.pages[i]);
633 670
634 return error; 671 res = error;
672 goto shrink_ret;
635} 673}
636EXPORT_SYMBOL(default_file_splice_read); 674EXPORT_SYMBOL(default_file_splice_read);
637 675
@@ -784,7 +822,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
784 if (!buf->len) { 822 if (!buf->len) {
785 buf->ops = NULL; 823 buf->ops = NULL;
786 ops->release(pipe, buf); 824 ops->release(pipe, buf);
787 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 825 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
788 pipe->nrbufs--; 826 pipe->nrbufs--;
789 if (pipe->inode) 827 if (pipe->inode)
790 sd->need_wakeup = true; 828 sd->need_wakeup = true;
@@ -1211,7 +1249,7 @@ out_release:
1211 * If we did an incomplete transfer we must release 1249 * If we did an incomplete transfer we must release
1212 * the pipe buffers in question: 1250 * the pipe buffers in question:
1213 */ 1251 */
1214 for (i = 0; i < PIPE_BUFFERS; i++) { 1252 for (i = 0; i < pipe->buffers; i++) {
1215 struct pipe_buffer *buf = pipe->bufs + i; 1253 struct pipe_buffer *buf = pipe->bufs + i;
1216 1254
1217 if (buf->ops) { 1255 if (buf->ops) {
@@ -1232,7 +1270,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
1232{ 1270{
1233 struct file *file = sd->u.file; 1271 struct file *file = sd->u.file;
1234 1272
1235 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1273 return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1274 sd->flags);
1236} 1275}
1237 1276
1238/** 1277/**
@@ -1321,8 +1360,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1321 if (off_in) 1360 if (off_in)
1322 return -ESPIPE; 1361 return -ESPIPE;
1323 if (off_out) { 1362 if (off_out) {
1324 if (!out->f_op || !out->f_op->llseek || 1363 if (!(out->f_mode & FMODE_PWRITE))
1325 out->f_op->llseek == no_llseek)
1326 return -EINVAL; 1364 return -EINVAL;
1327 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1365 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1328 return -EFAULT; 1366 return -EFAULT;
@@ -1342,8 +1380,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1342 if (off_out) 1380 if (off_out)
1343 return -ESPIPE; 1381 return -ESPIPE;
1344 if (off_in) { 1382 if (off_in) {
1345 if (!in->f_op || !in->f_op->llseek || 1383 if (!(in->f_mode & FMODE_PREAD))
1346 in->f_op->llseek == no_llseek)
1347 return -EINVAL; 1384 return -EINVAL;
1348 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1385 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1349 return -EFAULT; 1386 return -EFAULT;
@@ -1371,7 +1408,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 */ 1408 */
1372static int get_iovec_page_array(const struct iovec __user *iov, 1409static int get_iovec_page_array(const struct iovec __user *iov,
1373 unsigned int nr_vecs, struct page **pages, 1410 unsigned int nr_vecs, struct page **pages,
1374 struct partial_page *partial, int aligned) 1411 struct partial_page *partial, int aligned,
1412 unsigned int pipe_buffers)
1375{ 1413{
1376 int buffers = 0, error = 0; 1414 int buffers = 0, error = 0;
1377 1415
@@ -1414,8 +1452,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1414 break; 1452 break;
1415 1453
1416 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1454 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1417 if (npages > PIPE_BUFFERS - buffers) 1455 if (npages > pipe_buffers - buffers)
1418 npages = PIPE_BUFFERS - buffers; 1456 npages = pipe_buffers - buffers;
1419 1457
1420 error = get_user_pages_fast((unsigned long)base, npages, 1458 error = get_user_pages_fast((unsigned long)base, npages,
1421 0, &pages[buffers]); 1459 0, &pages[buffers]);
@@ -1450,7 +1488,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1450 * or if we mapped the max number of pages that we have 1488 * or if we mapped the max number of pages that we have
1451 * room for. 1489 * room for.
1452 */ 1490 */
1453 if (error < npages || buffers == PIPE_BUFFERS) 1491 if (error < npages || buffers == pipe_buffers)
1454 break; 1492 break;
1455 1493
1456 nr_vecs--; 1494 nr_vecs--;
@@ -1593,8 +1631,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1593 unsigned long nr_segs, unsigned int flags) 1631 unsigned long nr_segs, unsigned int flags)
1594{ 1632{
1595 struct pipe_inode_info *pipe; 1633 struct pipe_inode_info *pipe;
1596 struct page *pages[PIPE_BUFFERS]; 1634 struct page *pages[PIPE_DEF_BUFFERS];
1597 struct partial_page partial[PIPE_BUFFERS]; 1635 struct partial_page partial[PIPE_DEF_BUFFERS];
1598 struct splice_pipe_desc spd = { 1636 struct splice_pipe_desc spd = {
1599 .pages = pages, 1637 .pages = pages,
1600 .partial = partial, 1638 .partial = partial,
@@ -1602,17 +1640,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1602 .ops = &user_page_pipe_buf_ops, 1640 .ops = &user_page_pipe_buf_ops,
1603 .spd_release = spd_release_page, 1641 .spd_release = spd_release_page,
1604 }; 1642 };
1643 long ret;
1605 1644
1606 pipe = pipe_info(file->f_path.dentry->d_inode); 1645 pipe = pipe_info(file->f_path.dentry->d_inode);
1607 if (!pipe) 1646 if (!pipe)
1608 return -EBADF; 1647 return -EBADF;
1609 1648
1610 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1649 if (splice_grow_spd(pipe, &spd))
1611 flags & SPLICE_F_GIFT); 1650 return -ENOMEM;
1651
1652 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1653 spd.partial, flags & SPLICE_F_GIFT,
1654 pipe->buffers);
1612 if (spd.nr_pages <= 0) 1655 if (spd.nr_pages <= 0)
1613 return spd.nr_pages; 1656 ret = spd.nr_pages;
1657 else
1658 ret = splice_to_pipe(pipe, &spd);
1614 1659
1615 return splice_to_pipe(pipe, &spd); 1660 splice_shrink_spd(pipe, &spd);
1661 return ret;
1616} 1662}
1617 1663
1618/* 1664/*
@@ -1738,13 +1784,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1738 * Check ->nrbufs without the inode lock first. This function 1784 * Check ->nrbufs without the inode lock first. This function
1739 * is speculative anyways, so missing one is ok. 1785 * is speculative anyways, so missing one is ok.
1740 */ 1786 */
1741 if (pipe->nrbufs < PIPE_BUFFERS) 1787 if (pipe->nrbufs < pipe->buffers)
1742 return 0; 1788 return 0;
1743 1789
1744 ret = 0; 1790 ret = 0;
1745 pipe_lock(pipe); 1791 pipe_lock(pipe);
1746 1792
1747 while (pipe->nrbufs >= PIPE_BUFFERS) { 1793 while (pipe->nrbufs >= pipe->buffers) {
1748 if (!pipe->readers) { 1794 if (!pipe->readers) {
1749 send_sig(SIGPIPE, current, 0); 1795 send_sig(SIGPIPE, current, 0);
1750 ret = -EPIPE; 1796 ret = -EPIPE;
@@ -1810,7 +1856,7 @@ retry:
1810 * Cannot make any progress, because either the input 1856 * Cannot make any progress, because either the input
1811 * pipe is empty or the output pipe is full. 1857 * pipe is empty or the output pipe is full.
1812 */ 1858 */
1813 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { 1859 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1814 /* Already processed some buffers, break */ 1860 /* Already processed some buffers, break */
1815 if (ret) 1861 if (ret)
1816 break; 1862 break;
@@ -1831,7 +1877,7 @@ retry:
1831 } 1877 }
1832 1878
1833 ibuf = ipipe->bufs + ipipe->curbuf; 1879 ibuf = ipipe->bufs + ipipe->curbuf;
1834 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; 1880 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1835 obuf = opipe->bufs + nbuf; 1881 obuf = opipe->bufs + nbuf;
1836 1882
1837 if (len >= ibuf->len) { 1883 if (len >= ibuf->len) {
@@ -1841,7 +1887,7 @@ retry:
1841 *obuf = *ibuf; 1887 *obuf = *ibuf;
1842 ibuf->ops = NULL; 1888 ibuf->ops = NULL;
1843 opipe->nrbufs++; 1889 opipe->nrbufs++;
1844 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; 1890 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1845 ipipe->nrbufs--; 1891 ipipe->nrbufs--;
1846 input_wakeup = true; 1892 input_wakeup = true;
1847 } else { 1893 } else {
@@ -1914,11 +1960,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1914 * If we have iterated all input buffers or ran out of 1960 * If we have iterated all input buffers or ran out of
1915 * output room, break. 1961 * output room, break.
1916 */ 1962 */
1917 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1963 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1918 break; 1964 break;
1919 1965
1920 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1966 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1921 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1967 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1922 1968
1923 /* 1969 /*
1924 * Get a reference to this pipe buffer, 1970 * Get a reference to this pipe buffer,
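Every PIPE_BUFFERS use in this file becomes the per-pipe pipe->buffers, and the one remaining % PIPE_BUFFERS in the pipe-to-pipe ring arithmetic is converted to a mask. That is only correct while the buffer count stays a power of two, which the resizable-pipe code is assumed to guarantee by rounding requested sizes accordingly. The invariant as a sketch:

    /* Ring-slot arithmetic assumed by the hunks above; valid only for
     * power-of-two pipe->buffers. */
    static unsigned int next_slot(unsigned int curbuf, unsigned int nrbufs,
                                  unsigned int buffers)
    {
            return (curbuf + nrbufs) & (buffers - 1);  /* same as % buffers */
    }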
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..e5f63da64d04 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib compression to compress both 8 filesystem for Linux. It uses zlib/lzo compression to compress both
9 files, inodes and directories. Inodes in the system are very small 9 files, inodes and directories. Inodes in the system are very small
10 and all blocks are packed to minimise data overhead. Block sizes 10 and all blocks are packed to minimise data overhead. Block sizes
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 greater than 4K are supported up to a maximum of 1 Mbytes (default
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
13 (larger than 4GB), full uid/gid information, hard links and 13 (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
17 archival use (i.e. in cases where a .tar.gz file may be used), and in 17 archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,9 +26,35 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_EMBEDDED 29config SQUASHFS_XATTR
30 bool "Squashfs XATTR support"
31 depends on SQUASHFS
32 default n
33 help
34 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page).
37
38 If unsure, say N.
39
40config SQUASHFS_LZO
41 bool "Include support for LZO compressed file systems"
42 depends on SQUASHFS
43 default n
44 select LZO_DECOMPRESS
45 help
46 Saying Y here includes support for reading Squashfs file systems
47 compressed with LZO compression. LZO compression is mainly
48 aimed at embedded systems with slower CPUs where the overheads
49 of zlib are too high.
50
51 LZO is not the standard compression used in Squashfs and so most
52 file systems will be readable without selecting this option.
30 53
31 bool "Additional option for memory-constrained systems" 54 If unsure, say N.
55
56config SQUASHFS_EMBEDDED
57 bool "Additional option for memory-constrained systems"
32 depends on SQUASHFS 58 depends on SQUASHFS
33 default n 59 default n
34 help 60 help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..7672bac8d328 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac9..24af9ce9722f 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41}; 41};
42 42
43#ifndef CONFIG_SQUASHFS_LZO
43static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { 44static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
44 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
45}; 46};
47#endif
46 48
47static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 49static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
48 NULL, NULL, NULL, 0, "unknown", 0 50 NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
51static const struct squashfs_decompressor *decompressor[] = { 53static const struct squashfs_decompressor *decompressor[] = {
52 &squashfs_zlib_comp_ops, 54 &squashfs_zlib_comp_ops,
53 &squashfs_lzma_unsupported_comp_ops, 55 &squashfs_lzma_unsupported_comp_ops,
56#ifdef CONFIG_SQUASHFS_LZO
57 &squashfs_lzo_comp_ops,
58#else
54 &squashfs_lzo_unsupported_comp_ops, 59 &squashfs_lzo_unsupported_comp_ops,
60#endif
55 &squashfs_unknown_comp_ops 61 &squashfs_unknown_comp_ops
56}; 62};
57 63
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
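The new tail of squashfs_read_inode() resolves the on-disk xattr id once, caching the xattr list location, count and size in squashfs_inode_info so the xattr handlers never touch the id table again. It also charges the xattr list to i_blocks, with ((xattr_size - 1) >> 9) + 1 rounding the size up to whole 512-byte units. Worked values for that rounding:

    /* Rounding check for the i_blocks charge above:
     *   xattr_size = 1   -> ((1 - 1) >> 9) + 1   = 1 sector
     *   xattr_size = 512 -> ((512 - 1) >> 9) + 1 = 1 sector
     *   xattr_size = 513 -> ((513 - 1) >> 9) + 1 = 2 sectors
     */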
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 000000000000..5d87789bf1c1
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010 LG Electronics
5 * Chan Jeong <chan.jeong@lge.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * lzo_wrapper.c
22 */
23
24#include <linux/mutex.h>
25#include <linux/buffer_head.h>
26#include <linux/slab.h>
27#include <linux/vmalloc.h>
28#include <linux/lzo.h>
29
30#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h"
34#include "decompressor.h"
35
36struct squashfs_lzo {
37 void *input;
38 void *output;
39};
40
41static void *lzo_init(struct squashfs_sb_info *msblk)
42{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
44
45 struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
46 if (stream == NULL)
47 goto failed;
48 stream->input = vmalloc(block_size);
49 if (stream->input == NULL)
50 goto failed;
51 stream->output = vmalloc(block_size);
52 if (stream->output == NULL)
53 goto failed2;
54
55 return stream;
56
57failed2:
58 vfree(stream->input);
59failed:
60 ERROR("Failed to allocate lzo workspace\n");
61 kfree(stream);
62 return NULL;
63}
64
65
66static void lzo_free(void *strm)
67{
68 struct squashfs_lzo *stream = strm;
69
70 if (stream) {
71 vfree(stream->input);
72 vfree(stream->output);
73 }
74 kfree(stream);
75}
76
77
78static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
79 struct buffer_head **bh, int b, int offset, int length, int srclength,
80 int pages)
81{
82 struct squashfs_lzo *stream = msblk->stream;
83 void *buff = stream->input;
84 int avail, i, bytes = length, res;
85 size_t out_len = srclength;
86
87 mutex_lock(&msblk->read_data_mutex);
88
89 for (i = 0; i < b; i++) {
90 wait_on_buffer(bh[i]);
91 if (!buffer_uptodate(bh[i]))
92 goto block_release;
93
94 avail = min(bytes, msblk->devblksize - offset);
95 memcpy(buff, bh[i]->b_data + offset, avail);
96 buff += avail;
97 bytes -= avail;
98 offset = 0;
99 put_bh(bh[i]);
100 }
101
102 res = lzo1x_decompress_safe(stream->input, (size_t)length,
103 stream->output, &out_len);
104 if (res != LZO_E_OK)
105 goto failed;
106
107 res = bytes = (int)out_len;
108 for (i = 0, buff = stream->output; bytes && i < pages; i++) {
109 avail = min_t(int, bytes, PAGE_CACHE_SIZE);
110 memcpy(buffer[i], buff, avail);
111 buff += avail;
112 bytes -= avail;
113 }
114
115 mutex_unlock(&msblk->read_data_mutex);
116 return res;
117
118block_release:
119 for (; i < b; i++)
120 put_bh(bh[i]);
121
122failed:
123 mutex_unlock(&msblk->read_data_mutex);
124
125 ERROR("lzo decompression failed, data probably corrupt\n");
126 return -EIO;
127}
128
129const struct squashfs_decompressor squashfs_lzo_comp_ops = {
130 .init = lzo_init,
131 .free = lzo_free,
132 .decompress = lzo_uncompress,
133 .id = LZO_COMPRESSION,
134 .name = "lzo",
135 .supported = 1
136};
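lzo_wrapper.c is a complete instance of the squashfs_decompressor contract: lzo_init() sizes its scratch buffers for the larger of a data block and a metadata block (both kinds pass through the same stream), and lzo_uncompress() gathers the buffer_head contents, decompresses under read_data_mutex, then scatters the result over the page array. Selection happens at mount time by matching the superblock's compression field against the .id member; the lookup lives in decompressor.c, which is not shown in this hunk, but given the decompressor[] table above it is essentially a sketch like:

    /* Sketch of the mount-time lookup over the decompressor[] table;
     * the unknown entry with id 0 terminates the scan. */
    static const struct squashfs_decompressor *example_lookup(int id)
    {
            int i;

            for (i = 0; decompressor[i]->id; i++)
                    if (id == decompressor[i]->id)
                            break;

            return decompressor[i];
    }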
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..5d45569d5f72 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,21 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
107
108/* lzo_wrapper.c */
109extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..c5137fc9ab11 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -248,7 +274,7 @@ struct squashfs_base_inode {
248 __le16 uid; 274 __le16 uid;
249 __le16 guid; 275 __le16 guid;
250 __le32 mtime; 276 __le32 mtime;
251 __le32 inode_number; 277 __le32 inode_number;
252}; 278};
253 279
254struct squashfs_ipc_inode { 280struct squashfs_ipc_inode {
@@ -257,19 +283,42 @@ struct squashfs_ipc_inode {
257 __le16 uid; 283 __le16 uid;
258 __le16 guid; 284 __le16 guid;
259 __le32 mtime; 285 __le32 mtime;
260 __le32 inode_number; 286 __le32 inode_number;
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
267 __le16 uid; 304 __le16 uid;
268 __le16 guid; 305 __le16 guid;
269 __le32 mtime; 306 __le32 mtime;
270 __le32 inode_number; 307 __le32 inode_number;
308 __le32 nlink;
309 __le32 rdev;
310};
311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
271 __le32 nlink; 319 __le32 nlink;
272 __le32 rdev; 320 __le32 rdev;
321 __le32 xattr;
273}; 322};
274 323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
@@ -278,7 +327,7 @@ struct squashfs_symlink_inode {
278 __le16 uid; 327 __le16 uid;
279 __le16 guid; 328 __le16 guid;
280 __le32 mtime; 329 __le32 mtime;
281 __le32 inode_number; 330 __le32 inode_number;
282 __le32 nlink; 331 __le32 nlink;
283 __le32 symlink_size; 332 __le32 symlink_size;
284 char symlink[0]; 333 char symlink[0];
@@ -290,7 +339,7 @@ struct squashfs_reg_inode {
290 __le16 uid; 339 __le16 uid;
291 __le16 guid; 340 __le16 guid;
292 __le32 mtime; 341 __le32 mtime;
293 __le32 inode_number; 342 __le32 inode_number;
294 __le32 start_block; 343 __le32 start_block;
295 __le32 fragment; 344 __le32 fragment;
296 __le32 offset; 345 __le32 offset;
@@ -304,7 +353,7 @@ struct squashfs_lreg_inode {
304 __le16 uid; 353 __le16 uid;
305 __le16 guid; 354 __le16 guid;
306 __le32 mtime; 355 __le32 mtime;
307 __le32 inode_number; 356 __le32 inode_number;
308 __le64 start_block; 357 __le64 start_block;
309 __le64 file_size; 358 __le64 file_size;
310 __le64 sparse; 359 __le64 sparse;
@@ -321,7 +370,7 @@ struct squashfs_dir_inode {
321 __le16 uid; 370 __le16 uid;
322 __le16 guid; 371 __le16 guid;
323 __le32 mtime; 372 __le32 mtime;
324 __le32 inode_number; 373 __le32 inode_number;
325 __le32 start_block; 374 __le32 start_block;
326 __le32 nlink; 375 __le32 nlink;
327 __le16 file_size; 376 __le16 file_size;
@@ -335,7 +384,7 @@ struct squashfs_ldir_inode {
335 __le16 uid; 384 __le16 uid;
336 __le16 guid; 385 __le16 guid;
337 __le32 mtime; 386 __le32 mtime;
338 __le32 inode_number; 387 __le32 inode_number;
339 __le32 nlink; 388 __le32 nlink;
340 __le32 file_size; 389 __le32 file_size;
341 __le32 start_block; 390 __le32 start_block;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
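The new SQUASHFS_XATTR_BLK/SQUASHFS_XATTR_OFFSET macros show how a 64-bit xattr reference is packed: the upper bits hold the metadata block's byte position relative to the xattr table start, the low 16 bits the offset inside the decompressed block. squashfs_listxattr() in xattr.c below performs exactly this split. A small decoding sketch, assuming only the macros above (example_decode and the TRACE output are illustrative):

    /* Hypothetical decode of a packed xattr reference. */
    static void example_decode(struct squashfs_sb_info *msblk, u64 xattr_ref)
    {
            u64 block  = SQUASHFS_XATTR_BLK(xattr_ref);     /* upper bits  */
            int offset = SQUASHFS_XATTR_OFFSET(xattr_ref);  /* low 16 bits */
            u64 start  = msblk->xattr_table + block;  /* metadata block start */

            TRACE("xattr walk starts at %llx, offset %x\n", start, offset);
    }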
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
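Note the error handling in allocate_xattr_table: the only tolerated failure is -ENOTSUPP, which (judging from the err != -ENOTSUPP test above) the squashfs_read_xattr_id_table() stub returns when xattr support is compiled out. The mount then proceeds with xattr_id_table left NULL, and squashfs_listxattr() reports -EOPNOTSUPP, replacing the old behaviour of merely warning that xattrs would be ignored:

    /* Degradation path, as wired above:
     *   squashfs_read_xattr_id_table() -> ERR_PTR(-ENOTSUPP)
     *     => msblk->xattr_id_table stays NULL, mount continues
     *   squashfs_listxattr()           -> -EOPNOTSUPP (table == NULL)
     */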
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
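
Note (not part of the diff): a small userspace sketch of what the new squashfs_symlink_inode_ops enables. readlink(2) goes through .readlink = generic_readlink, and the l-prefixed xattr calls operate on the link itself rather than its target, so llistxattr() exercises the new .listxattr hook. Assumes the argument names a symlink on a mounted squashfs image:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char target[4096], list[1024];
	ssize_t n, len;
	char *name;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <symlink>\n", argv[0]);
		return 1;
	}

	/* readlink(2) is backed by .readlink = generic_readlink */
	n = readlink(argv[1], target, sizeof(target) - 1);
	if (n >= 0) {
		target[n] = '\0';
		printf("-> %s\n", target);
	}

	/* llistxattr() does not follow the link, so it hits the
	 * symlink inode's own .listxattr */
	len = llistxattr(argv[1], list, sizeof(list));
	for (name = list; len > 0 && name < list + len;
	     name += strlen(name) + 1)
		printf("xattr: %s\n", name);
	return 0;
}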
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..652b8541f9c6
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
 91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
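
Note (not part of the diff): once an image containing extended attributes is mounted, listxattr(2) and getxattr(2) are served by squashfs_listxattr() and the handlers above. A minimal userspace sketch that dumps every attribute of a file, assuming the values are text and fit in the fixed buffers:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char list[1024], value[1024];
	ssize_t len, vlen;
	char *name;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-squashfs>\n", argv[0]);
		return 1;
	}

	/* name enumeration is served by squashfs_listxattr() */
	len = listxattr(argv[1], list, sizeof(list));
	if (len < 0) {
		perror("listxattr");
		return 1;
	}

	/* the buffer holds consecutive NUL-terminated names */
	for (name = list; name < list + len; name += strlen(name) + 1) {
		vlen = getxattr(argv[1], name, value, sizeof(value) - 1);
		if (vlen < 0) {
			perror(name);
			continue;
		}
		value[vlen] = '\0';
		printf("%s = %s\n", name, value);
	}
	return 0;
}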
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..49fe0d719fbf
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTR
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
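
Note (not part of the diff): the stub half of this header keeps callers free of #ifdef CONFIG_SQUASHFS_XATTR. With the option off, squashfs_read_xattr_id_table() returns ERR_PTR(-ENOTSUPP), which squashfs_fill_super() deliberately tolerates, and the NULL #defines disable the VFS hooks. A standalone sketch of the same pattern, using hypothetical names (userspace errno.h has no ENOTSUPP, so EOPNOTSUPP stands in):

#include <stdio.h>
#include <errno.h>

#define CONFIG_FEATURE 0	/* flip to 1 to "build in" the feature */

#if CONFIG_FEATURE
static int feature_read_table(void) { return 0; }
#else
static inline int feature_read_table(void) { return -EOPNOTSUPP; }
#endif

int main(void)
{
	int err = feature_read_table();

	/* the caller tolerates "not supported", mirroring the
	 * err != -ENOTSUPP check in squashfs_fill_super() */
	if (err && err != -EOPNOTSUPP) {
		fprintf(stderr, "fatal error %d\n", err);
		return 1;
	}
	printf("mount continues, xattrs %s\n",
	       err ? "ignored" : "enabled");
	return 0;
}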
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
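
Note (not part of the diff): the index-to-location arithmetic hidden in the SQUASHFS_XATTR_BLOCK* macros is easy to see in isolation. A sketch under the assumption (per squashfs_fs.h) that metadata blocks are 8192 bytes and struct squashfs_xattr_id is 16 bytes, so each cached index entry covers 512 ids:

#include <stdio.h>
#include <stdint.h>

/* assumed on-disk constants, see squashfs_fs.h */
#define METADATA_SIZE	8192
#define XATTR_ID_SIZE	16
#define IDS_PER_BLOCK	(METADATA_SIZE / XATTR_ID_SIZE)

int main(void)
{
	uint32_t index = 1234;	/* hypothetical xattr id from an inode */

	/* mirrors SQUASHFS_XATTR_BLOCK()/SQUASHFS_XATTR_BLOCK_OFFSET() */
	uint32_t block  = index / IDS_PER_BLOCK;
	uint32_t offset = (index % IDS_PER_BLOCK) * XATTR_ID_SIZE;

	printf("id %u -> table block %u, byte offset %u\n",
	       index, block, offset);
	return 0;
}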
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52c5737..12e90e213900 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -68,7 +68,8 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
68} 68}
69EXPORT_SYMBOL(vfs_fstat); 69EXPORT_SYMBOL(vfs_fstat);
70 70
71int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) 71int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
72 int flag)
72{ 73{
73 struct path path; 74 struct path path;
74 int error = -EINVAL; 75 int error = -EINVAL;
@@ -91,13 +92,13 @@ out:
91} 92}
92EXPORT_SYMBOL(vfs_fstatat); 93EXPORT_SYMBOL(vfs_fstatat);
93 94
94int vfs_stat(char __user *name, struct kstat *stat) 95int vfs_stat(const char __user *name, struct kstat *stat)
95{ 96{
96 return vfs_fstatat(AT_FDCWD, name, stat, 0); 97 return vfs_fstatat(AT_FDCWD, name, stat, 0);
97} 98}
98EXPORT_SYMBOL(vfs_stat); 99EXPORT_SYMBOL(vfs_stat);
99 100
100int vfs_lstat(char __user *name, struct kstat *stat) 101int vfs_lstat(const char __user *name, struct kstat *stat)
101{ 102{
102 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); 103 return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
103} 104}
@@ -147,7 +148,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
147 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 148 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
148} 149}
149 150
150SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 151SYSCALL_DEFINE2(stat, const char __user *, filename,
152 struct __old_kernel_stat __user *, statbuf)
151{ 153{
152 struct kstat stat; 154 struct kstat stat;
153 int error; 155 int error;
@@ -159,7 +161,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
159 return cp_old_stat(&stat, statbuf); 161 return cp_old_stat(&stat, statbuf);
160} 162}
161 163
162SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) 164SYSCALL_DEFINE2(lstat, const char __user *, filename,
165 struct __old_kernel_stat __user *, statbuf)
163{ 166{
164 struct kstat stat; 167 struct kstat stat;
165 int error; 168 int error;
@@ -234,7 +237,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
234 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 237 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
235} 238}
236 239
237SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) 240SYSCALL_DEFINE2(newstat, const char __user *, filename,
241 struct stat __user *, statbuf)
238{ 242{
239 struct kstat stat; 243 struct kstat stat;
240 int error = vfs_stat(filename, &stat); 244 int error = vfs_stat(filename, &stat);
@@ -244,7 +248,8 @@ SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
244 return cp_new_stat(&stat, statbuf); 248 return cp_new_stat(&stat, statbuf);
245} 249}
246 250
247SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) 251SYSCALL_DEFINE2(newlstat, const char __user *, filename,
252 struct stat __user *, statbuf)
248{ 253{
249 struct kstat stat; 254 struct kstat stat;
250 int error; 255 int error;
@@ -257,7 +262,7 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
257} 262}
258 263
259#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) 264#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
260SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, 265SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
261 struct stat __user *, statbuf, int, flag) 266 struct stat __user *, statbuf, int, flag)
262{ 267{
263 struct kstat stat; 268 struct kstat stat;
@@ -355,7 +360,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
355 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 360 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
356} 361}
357 362
358SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf) 363SYSCALL_DEFINE2(stat64, const char __user *, filename,
364 struct stat64 __user *, statbuf)
359{ 365{
360 struct kstat stat; 366 struct kstat stat;
361 int error = vfs_stat(filename, &stat); 367 int error = vfs_stat(filename, &stat);
@@ -366,7 +372,8 @@ SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf
366 return error; 372 return error;
367} 373}
368 374
369SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf) 375SYSCALL_DEFINE2(lstat64, const char __user *, filename,
376 struct stat64 __user *, statbuf)
370{ 377{
371 struct kstat stat; 378 struct kstat stat;
372 int error = vfs_lstat(filename, &stat); 379 int error = vfs_lstat(filename, &stat);
@@ -388,7 +395,7 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
388 return error; 395 return error;
389} 396}
390 397
391SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, 398SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
392 struct stat64 __user *, statbuf, int, flag) 399 struct stat64 __user *, statbuf, int, flag)
393{ 400{
394 struct kstat stat; 401 struct kstat stat;
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..30ea8c8a996b
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,243 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/statfs.h>
8#include <linux/security.h>
9#include <linux/uaccess.h>
10
11static int flags_by_mnt(int mnt_flags)
12{
13 int flags = 0;
14
15 if (mnt_flags & MNT_READONLY)
16 flags |= ST_RDONLY;
17 if (mnt_flags & MNT_NOSUID)
18 flags |= ST_NOSUID;
19 if (mnt_flags & MNT_NODEV)
20 flags |= ST_NODEV;
21 if (mnt_flags & MNT_NOEXEC)
22 flags |= ST_NOEXEC;
23 if (mnt_flags & MNT_NOATIME)
24 flags |= ST_NOATIME;
25 if (mnt_flags & MNT_NODIRATIME)
26 flags |= ST_NODIRATIME;
27 if (mnt_flags & MNT_RELATIME)
28 flags |= ST_RELATIME;
29 return flags;
30}
31
32static int flags_by_sb(int s_flags)
33{
34 int flags = 0;
35 if (s_flags & MS_SYNCHRONOUS)
36 flags |= ST_SYNCHRONOUS;
37 if (s_flags & MS_MANDLOCK)
38 flags |= ST_MANDLOCK;
39 return flags;
40}
41
42static int calculate_f_flags(struct vfsmount *mnt)
43{
44 return ST_VALID | flags_by_mnt(mnt->mnt_flags) |
45 flags_by_sb(mnt->mnt_sb->s_flags);
46}
47
48int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
49{
50 int retval;
51
52 if (!dentry->d_sb->s_op->statfs)
53 return -ENOSYS;
54
55 memset(buf, 0, sizeof(*buf));
56 retval = security_sb_statfs(dentry);
57 if (retval)
58 return retval;
59 retval = dentry->d_sb->s_op->statfs(dentry, buf);
60 if (retval == 0 && buf->f_frsize == 0)
61 buf->f_frsize = buf->f_bsize;
62 return retval;
63}
64
65int vfs_statfs(struct path *path, struct kstatfs *buf)
66{
67 int error;
68
69 error = statfs_by_dentry(path->dentry, buf);
70 if (!error)
71 buf->f_flags = calculate_f_flags(path->mnt);
72 return error;
73}
74EXPORT_SYMBOL(vfs_statfs);
75
76static int do_statfs_native(struct path *path, struct statfs *buf)
77{
78 struct kstatfs st;
79 int retval;
80
81 retval = vfs_statfs(path, &st);
82 if (retval)
83 return retval;
84
85 if (sizeof(*buf) == sizeof(st))
86 memcpy(buf, &st, sizeof(st));
87 else {
88 if (sizeof buf->f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail |
90 st.f_bsize | st.f_frsize) &
91 0xffffffff00000000ULL)
92 return -EOVERFLOW;
93 /*
94 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits
96 */
97 if (st.f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW;
100 if (st.f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW;
103 }
104
105 buf->f_type = st.f_type;
106 buf->f_bsize = st.f_bsize;
107 buf->f_blocks = st.f_blocks;
108 buf->f_bfree = st.f_bfree;
109 buf->f_bavail = st.f_bavail;
110 buf->f_files = st.f_files;
111 buf->f_ffree = st.f_ffree;
112 buf->f_fsid = st.f_fsid;
113 buf->f_namelen = st.f_namelen;
114 buf->f_frsize = st.f_frsize;
115 buf->f_flags = st.f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare));
117 }
118 return 0;
119}
120
121static int do_statfs64(struct path *path, struct statfs64 *buf)
122{
123 struct kstatfs st;
124 int retval;
125
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else {
133 buf->f_type = st.f_type;
134 buf->f_bsize = st.f_bsize;
135 buf->f_blocks = st.f_blocks;
136 buf->f_bfree = st.f_bfree;
137 buf->f_bavail = st.f_bavail;
138 buf->f_files = st.f_files;
139 buf->f_ffree = st.f_ffree;
140 buf->f_fsid = st.f_fsid;
141 buf->f_namelen = st.f_namelen;
142 buf->f_frsize = st.f_frsize;
143 buf->f_flags = st.f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare));
145 }
146 return 0;
147}
148
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{
151 struct path path;
152 int error;
153
154 error = user_path(pathname, &path);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error;
163}
164
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{
167 struct path path;
168 long error;
169
170 if (sz != sizeof(*buf))
171 return -EINVAL;
172 error = user_path(pathname, &path);
173 if (!error) {
174 struct statfs64 tmp;
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error;
181}
182
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{
185 struct file *file;
186 struct statfs tmp;
187 int error;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{
203 struct file *file;
204 struct statfs64 tmp;
205 int error;
206
207 if (sz != sizeof(*buf))
208 return -EINVAL;
209
210 error = -EBADF;
211 file = fget(fd);
212 if (!file)
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error;
220}
221
222SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
223{
224 struct super_block *s;
225 struct ustat tmp;
226 struct kstatfs sbuf;
227 int err;
228
229 s = user_get_super(new_decode_dev(dev));
230 if (!s)
231 return -EINVAL;
232
233 err = statfs_by_dentry(s->s_root, &sbuf);
234 drop_super(s);
235 if (err)
236 return err;
237
238 memset(&tmp,0,sizeof(struct ustat));
239 tmp.f_tfree = sbuf.f_bfree;
240 tmp.f_tinode = sbuf.f_ffree;
241
242 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
243}
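
Note (not part of the diff): with calculate_f_flags() filling f_flags from the mount and superblock flags, userspace can read mount options straight from the statfs result instead of parsing /proc/mounts; on kernels with this change glibc's statvfs(3) can take its flag bits from here. A small sketch using the ST_* constants from sys/statvfs.h:

#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <path>\n", argv[0]);
		return 1;
	}
	if (statvfs(argv[1], &st) != 0) {
		perror("statvfs");
		return 1;
	}
	printf("block size %lu, blocks %llu, free %llu\n",
	       st.f_bsize, (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree);
	printf("read-only: %s, nosuid: %s\n",
	       (st.f_flag & ST_RDONLY) ? "yes" : "no",
	       (st.f_flag & ST_NOSUID) ? "yes" : "no");
	return 0;
}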
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..8819e3a7ff20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,14 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 27#include <linux/mount.h>
32#include <linux/security.h> 28#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 30#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 31#include <linux/mutex.h>
39#include <linux/file.h>
40#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
41#include <asm/uaccess.h>
42#include "internal.h" 33#include "internal.h"
43 34
44 35
@@ -63,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
63 s = NULL; 54 s = NULL;
64 goto out; 55 goto out;
65 } 56 }
57#ifdef CONFIG_SMP
58 s->s_files = alloc_percpu(struct list_head);
59 if (!s->s_files) {
60 security_sb_free(s);
61 kfree(s);
62 s = NULL;
63 goto out;
64 } else {
65 int i;
66
67 for_each_possible_cpu(i)
68 INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
69 }
70#else
66 INIT_LIST_HEAD(&s->s_files); 71 INIT_LIST_HEAD(&s->s_files);
72#endif
67 INIT_LIST_HEAD(&s->s_instances); 73 INIT_LIST_HEAD(&s->s_instances);
68 INIT_HLIST_HEAD(&s->s_anon); 74 INIT_HLIST_HEAD(&s->s_anon);
69 INIT_LIST_HEAD(&s->s_inodes); 75 INIT_LIST_HEAD(&s->s_inodes);
@@ -93,16 +99,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
93 * subclass. 99 * subclass.
94 */ 100 */
95 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 101 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
96 s->s_count = S_BIAS; 102 s->s_count = 1;
97 atomic_set(&s->s_active, 1); 103 atomic_set(&s->s_active, 1);
98 mutex_init(&s->s_vfs_rename_mutex); 104 mutex_init(&s->s_vfs_rename_mutex);
105 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
99 mutex_init(&s->s_dquot.dqio_mutex); 106 mutex_init(&s->s_dquot.dqio_mutex);
100 mutex_init(&s->s_dquot.dqonoff_mutex); 107 mutex_init(&s->s_dquot.dqonoff_mutex);
101 init_rwsem(&s->s_dquot.dqptr_sem); 108 init_rwsem(&s->s_dquot.dqptr_sem);
102 init_waitqueue_head(&s->s_wait_unfrozen); 109 init_waitqueue_head(&s->s_wait_unfrozen);
103 s->s_maxbytes = MAX_NON_LFS; 110 s->s_maxbytes = MAX_NON_LFS;
104 s->dq_op = sb_dquot_ops;
105 s->s_qcop = sb_quotactl_ops;
106 s->s_op = &default_op; 111 s->s_op = &default_op;
107 s->s_time_gran = 1000000000; 112 s->s_time_gran = 1000000000;
108 } 113 }
@@ -118,6 +123,9 @@ out:
118 */ 123 */
119static inline void destroy_super(struct super_block *s) 124static inline void destroy_super(struct super_block *s)
120{ 125{
126#ifdef CONFIG_SMP
127 free_percpu(s->s_files);
128#endif
121 security_sb_free(s); 129 security_sb_free(s);
122 kfree(s->s_subtype); 130 kfree(s->s_subtype);
123 kfree(s->s_options); 131 kfree(s->s_options);
@@ -127,39 +135,14 @@ static inline void destroy_super(struct super_block *s)
127/* Superblock refcounting */ 135/* Superblock refcounting */
128 136
129/* 137/*
130 * Drop a superblock's refcount. Returns non-zero if the superblock was 138 * Drop a superblock's refcount. The caller must hold sb_lock.
131 * destroyed. The caller must hold sb_lock.
132 */ 139 */
133static int __put_super(struct super_block *sb) 140void __put_super(struct super_block *sb)
134{ 141{
135 int ret = 0;
136
137 if (!--sb->s_count) { 142 if (!--sb->s_count) {
143 list_del_init(&sb->s_list);
138 destroy_super(sb); 144 destroy_super(sb);
139 ret = 1;
140 } 145 }
141 return ret;
142}
143
144/*
145 * Drop a superblock's refcount.
146 * Returns non-zero if the superblock is about to be destroyed and
147 * at least is already removed from super_blocks list, so if we are
148 * making a loop through super blocks then we need to restart.
149 * The caller must hold sb_lock.
150 */
151int __put_super_and_need_restart(struct super_block *sb)
152{
153 /* check for race with generic_shutdown_super() */
154 if (list_empty(&sb->s_list)) {
155 /* super block is removed, need to restart... */
156 __put_super(sb);
157 return 1;
158 }
159 /* can't be the last, since s_list is still in use */
160 sb->s_count--;
161 BUG_ON(sb->s_count == 0);
162 return 0;
163} 146}
164 147
165/** 148/**
@@ -178,57 +161,47 @@ void put_super(struct super_block *sb)
178 161
179 162
180/** 163/**
181 * deactivate_super - drop an active reference to superblock 164 * deactivate_locked_super - drop an active reference to superblock
182 * @s: superblock to deactivate 165 * @s: superblock to deactivate
183 * 166 *
184 * Drops an active reference to superblock, acquiring a temporary one if 167 * Drops an active reference to superblock, converting it into a temporary
185 * there are no active references left. In that case we lock superblock, 168 * one if there are no other active references left. In that case we
186 * tell fs driver to shut it down and drop the temporary reference we 169 * tell fs driver to shut it down and drop the temporary reference we
187 * had just acquired. 170 * had just acquired.
171 *
172 * Caller holds exclusive lock on superblock; that lock is released.
188 */ 173 */
189void deactivate_super(struct super_block *s) 174void deactivate_locked_super(struct super_block *s)
190{ 175{
191 struct file_system_type *fs = s->s_type; 176 struct file_system_type *fs = s->s_type;
192 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 177 if (atomic_dec_and_test(&s->s_active)) {
193 s->s_count -= S_BIAS-1;
194 spin_unlock(&sb_lock);
195 vfs_dq_off(s, 0);
196 down_write(&s->s_umount);
197 fs->kill_sb(s); 178 fs->kill_sb(s);
198 put_filesystem(fs); 179 put_filesystem(fs);
199 put_super(s); 180 put_super(s);
181 } else {
182 up_write(&s->s_umount);
200 } 183 }
201} 184}
202 185
203EXPORT_SYMBOL(deactivate_super); 186EXPORT_SYMBOL(deactivate_locked_super);
204 187
205/** 188/**
206 * deactivate_locked_super - drop an active reference to superblock 189 * deactivate_super - drop an active reference to superblock
207 * @s: superblock to deactivate 190 * @s: superblock to deactivate
208 * 191 *
209 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 192 * Variant of deactivate_locked_super(), except that superblock is *not*
210 * it does not unlock it until it's all over. As the result, it's safe to 193 * locked by caller. If we are going to drop the final active reference,
211 * use to dispose of new superblock on ->get_sb() failure exits - nobody 194 * lock will be acquired prior to that.
212 * will see the sucker until it's all over. Equivalent using up_write +
213 * deactivate_super is safe for that purpose only if superblock is either
214 * safe to use or has NULL ->s_root when we unlock.
215 */ 195 */
216void deactivate_locked_super(struct super_block *s) 196void deactivate_super(struct super_block *s)
217{ 197{
218 struct file_system_type *fs = s->s_type; 198 if (!atomic_add_unless(&s->s_active, -1, 1)) {
219 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 199 down_write(&s->s_umount);
220 s->s_count -= S_BIAS-1; 200 deactivate_locked_super(s);
221 spin_unlock(&sb_lock);
222 vfs_dq_off(s, 0);
223 fs->kill_sb(s);
224 put_filesystem(fs);
225 put_super(s);
226 } else {
227 up_write(&s->s_umount);
228 } 201 }
229} 202}
230 203
231EXPORT_SYMBOL(deactivate_locked_super); 204EXPORT_SYMBOL(deactivate_super);
232 205
233/** 206/**
234 * grab_super - acquire an active reference 207 * grab_super - acquire an active reference
@@ -243,22 +216,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
243 */ 216 */
244static int grab_super(struct super_block *s) __releases(sb_lock) 217static int grab_super(struct super_block *s) __releases(sb_lock)
245{ 218{
219 if (atomic_inc_not_zero(&s->s_active)) {
220 spin_unlock(&sb_lock);
221 return 1;
222 }
223 /* it's going away */
246 s->s_count++; 224 s->s_count++;
247 spin_unlock(&sb_lock); 225 spin_unlock(&sb_lock);
226 /* wait for it to die */
248 down_write(&s->s_umount); 227 down_write(&s->s_umount);
249 if (s->s_root) {
250 spin_lock(&sb_lock);
251 if (s->s_count > S_BIAS) {
252 atomic_inc(&s->s_active);
253 s->s_count--;
254 spin_unlock(&sb_lock);
255 return 1;
256 }
257 spin_unlock(&sb_lock);
258 }
259 up_write(&s->s_umount); 228 up_write(&s->s_umount);
260 put_super(s); 229 put_super(s);
261 yield();
262 return 0; 230 return 0;
263} 231}
264 232
@@ -321,8 +289,7 @@ void generic_shutdown_super(struct super_block *sb)
321 } 289 }
322 spin_lock(&sb_lock); 290 spin_lock(&sb_lock);
323 /* should be initialized for __put_super_and_need_restart() */ 291 /* should be initialized for __put_super_and_need_restart() */
324 list_del_init(&sb->s_list); 292 list_del_init(&sb->s_instances);
325 list_del(&sb->s_instances);
326 spin_unlock(&sb_lock); 293 spin_unlock(&sb_lock);
327 up_write(&sb->s_umount); 294 up_write(&sb->s_umount);
328} 295}
@@ -356,6 +323,12 @@ retry:
356 if (s) { 323 if (s) {
357 up_write(&s->s_umount); 324 up_write(&s->s_umount);
358 destroy_super(s); 325 destroy_super(s);
326 s = NULL;
327 }
328 down_write(&old->s_umount);
329 if (unlikely(!(old->s_flags & MS_BORN))) {
330 deactivate_locked_super(old);
331 goto retry;
359 } 332 }
360 return old; 333 return old;
361 } 334 }
@@ -408,11 +381,12 @@ EXPORT_SYMBOL(drop_super);
408 */ 381 */
409void sync_supers(void) 382void sync_supers(void)
410{ 383{
411 struct super_block *sb; 384 struct super_block *sb, *p = NULL;
412 385
413 spin_lock(&sb_lock); 386 spin_lock(&sb_lock);
414restart:
415 list_for_each_entry(sb, &super_blocks, s_list) { 387 list_for_each_entry(sb, &super_blocks, s_list) {
388 if (list_empty(&sb->s_instances))
389 continue;
416 if (sb->s_op->write_super && sb->s_dirt) { 390 if (sb->s_op->write_super && sb->s_dirt) {
417 sb->s_count++; 391 sb->s_count++;
418 spin_unlock(&sb_lock); 392 spin_unlock(&sb_lock);
@@ -423,10 +397,47 @@ restart:
423 up_read(&sb->s_umount); 397 up_read(&sb->s_umount);
424 398
425 spin_lock(&sb_lock); 399 spin_lock(&sb_lock);
426 if (__put_super_and_need_restart(sb)) 400 if (p)
427 goto restart; 401 __put_super(p);
402 p = sb;
428 } 403 }
429 } 404 }
405 if (p)
406 __put_super(p);
407 spin_unlock(&sb_lock);
408}
409
410/**
411 * iterate_supers - call function for all active superblocks
412 * @f: function to call
413 * @arg: argument to pass to it
414 *
415 * Scans the superblock list and calls given function, passing it
416 * locked superblock and given argument.
417 */
418void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
419{
420 struct super_block *sb, *p = NULL;
421
422 spin_lock(&sb_lock);
423 list_for_each_entry(sb, &super_blocks, s_list) {
424 if (list_empty(&sb->s_instances))
425 continue;
426 sb->s_count++;
427 spin_unlock(&sb_lock);
428
429 down_read(&sb->s_umount);
430 if (sb->s_root)
431 f(sb, arg);
432 up_read(&sb->s_umount);
433
434 spin_lock(&sb_lock);
435 if (p)
436 __put_super(p);
437 p = sb;
438 }
439 if (p)
440 __put_super(p);
430 spin_unlock(&sb_lock); 441 spin_unlock(&sb_lock);
431} 442}
432 443
@@ -438,7 +449,7 @@ restart:
438 * mounted on the device given. %NULL is returned if no match is found. 449 * mounted on the device given. %NULL is returned if no match is found.
439 */ 450 */
440 451
441struct super_block * get_super(struct block_device *bdev) 452struct super_block *get_super(struct block_device *bdev)
442{ 453{
443 struct super_block *sb; 454 struct super_block *sb;
444 455
@@ -448,17 +459,20 @@ struct super_block * get_super(struct block_device *bdev)
448 spin_lock(&sb_lock); 459 spin_lock(&sb_lock);
449rescan: 460rescan:
450 list_for_each_entry(sb, &super_blocks, s_list) { 461 list_for_each_entry(sb, &super_blocks, s_list) {
462 if (list_empty(&sb->s_instances))
463 continue;
451 if (sb->s_bdev == bdev) { 464 if (sb->s_bdev == bdev) {
452 sb->s_count++; 465 sb->s_count++;
453 spin_unlock(&sb_lock); 466 spin_unlock(&sb_lock);
454 down_read(&sb->s_umount); 467 down_read(&sb->s_umount);
468 /* still alive? */
455 if (sb->s_root) 469 if (sb->s_root)
456 return sb; 470 return sb;
457 up_read(&sb->s_umount); 471 up_read(&sb->s_umount);
458 /* restart only when sb is no longer on the list */ 472 /* nope, got unmounted */
459 spin_lock(&sb_lock); 473 spin_lock(&sb_lock);
460 if (__put_super_and_need_restart(sb)) 474 __put_super(sb);
461 goto rescan; 475 goto rescan;
462 } 476 }
463 } 477 }
464 spin_unlock(&sb_lock); 478 spin_unlock(&sb_lock);
@@ -473,7 +487,7 @@ EXPORT_SYMBOL(get_super);
473 * 487 *
474 * Scans the superblock list and finds the superblock of the file system 488 * Scans the superblock list and finds the superblock of the file system
475 * mounted on the device given. Returns the superblock with an active 489 * mounted on the device given. Returns the superblock with an active
476 * reference and s_umount held exclusively or %NULL if none was found. 490 * reference or %NULL if none was found.
477 */ 491 */
478struct super_block *get_active_super(struct block_device *bdev) 492struct super_block *get_active_super(struct block_device *bdev)
479{ 493{
@@ -482,81 +496,49 @@ struct super_block *get_active_super(struct block_device *bdev)
482 if (!bdev) 496 if (!bdev)
483 return NULL; 497 return NULL;
484 498
499restart:
485 spin_lock(&sb_lock); 500 spin_lock(&sb_lock);
486 list_for_each_entry(sb, &super_blocks, s_list) { 501 list_for_each_entry(sb, &super_blocks, s_list) {
487 if (sb->s_bdev != bdev) 502 if (list_empty(&sb->s_instances))
488 continue; 503 continue;
489 504 if (sb->s_bdev == bdev) {
490 sb->s_count++; 505 if (grab_super(sb)) /* drops sb_lock */
491 spin_unlock(&sb_lock);
492 down_write(&sb->s_umount);
493 if (sb->s_root) {
494 spin_lock(&sb_lock);
495 if (sb->s_count > S_BIAS) {
496 atomic_inc(&sb->s_active);
497 sb->s_count--;
498 spin_unlock(&sb_lock);
499 return sb; 506 return sb;
500 } 507 else
501 spin_unlock(&sb_lock); 508 goto restart;
502 } 509 }
503 up_write(&sb->s_umount);
504 put_super(sb);
505 yield();
506 spin_lock(&sb_lock);
507 } 510 }
508 spin_unlock(&sb_lock); 511 spin_unlock(&sb_lock);
509 return NULL; 512 return NULL;
510} 513}
511 514
512struct super_block * user_get_super(dev_t dev) 515struct super_block *user_get_super(dev_t dev)
513{ 516{
514 struct super_block *sb; 517 struct super_block *sb;
515 518
516 spin_lock(&sb_lock); 519 spin_lock(&sb_lock);
517rescan: 520rescan:
518 list_for_each_entry(sb, &super_blocks, s_list) { 521 list_for_each_entry(sb, &super_blocks, s_list) {
522 if (list_empty(&sb->s_instances))
523 continue;
519 if (sb->s_dev == dev) { 524 if (sb->s_dev == dev) {
520 sb->s_count++; 525 sb->s_count++;
521 spin_unlock(&sb_lock); 526 spin_unlock(&sb_lock);
522 down_read(&sb->s_umount); 527 down_read(&sb->s_umount);
528 /* still alive? */
523 if (sb->s_root) 529 if (sb->s_root)
524 return sb; 530 return sb;
525 up_read(&sb->s_umount); 531 up_read(&sb->s_umount);
526 /* restart only when sb is no longer on the list */ 532 /* nope, got unmounted */
527 spin_lock(&sb_lock); 533 spin_lock(&sb_lock);
528 if (__put_super_and_need_restart(sb)) 534 __put_super(sb);
529 goto rescan; 535 goto rescan;
530 } 536 }
531 } 537 }
532 spin_unlock(&sb_lock); 538 spin_unlock(&sb_lock);
533 return NULL; 539 return NULL;
534} 540}
535 541
536SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
537{
538 struct super_block *s;
539 struct ustat tmp;
540 struct kstatfs sbuf;
541 int err = -EINVAL;
542
543 s = user_get_super(new_decode_dev(dev));
544 if (s == NULL)
545 goto out;
546 err = vfs_statfs(s->s_root, &sbuf);
547 drop_super(s);
548 if (err)
549 goto out;
550
551 memset(&tmp,0,sizeof(struct ustat));
552 tmp.f_tfree = sbuf.f_bfree;
553 tmp.f_tinode = sbuf.f_ffree;
554
555 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
556out:
557 return err;
558}
559
560/** 542/**
561 * do_remount_sb - asks filesystem to change mount options. 543 * do_remount_sb - asks filesystem to change mount options.
562 * @sb: superblock in question 544 * @sb: superblock in question
@@ -569,7 +551,7 @@ out:
569int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 551int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
570{ 552{
571 int retval; 553 int retval;
572 int remount_rw, remount_ro; 554 int remount_ro;
573 555
574 if (sb->s_frozen != SB_UNFROZEN) 556 if (sb->s_frozen != SB_UNFROZEN)
575 return -EBUSY; 557 return -EBUSY;
@@ -585,7 +567,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
585 sync_filesystem(sb); 567 sync_filesystem(sb);
586 568
587 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 569 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
588 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
589 570
590 /* If we are remounting RDONLY and current sb is read/write, 571 /* If we are remounting RDONLY and current sb is read/write,
591 make sure there are no rw files opened */ 572 make sure there are no rw files opened */
@@ -594,9 +575,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 mark_files_ro(sb); 575 mark_files_ro(sb);
595 else if (!fs_may_remount_ro(sb)) 576 else if (!fs_may_remount_ro(sb))
596 return -EBUSY; 577 return -EBUSY;
597 retval = vfs_dq_off(sb, 1);
598 if (retval < 0 && retval != -ENOSYS)
599 return -EBUSY;
600 } 578 }
601 579
602 if (sb->s_op->remount_fs) { 580 if (sb->s_op->remount_fs) {
@@ -605,8 +583,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
605 return retval; 583 return retval;
606 } 584 }
607 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 585 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
608 if (remount_rw) 586
609 vfs_dq_quota_on_remount(sb);
610 /* 587 /*
611 * Some filesystems modify their metadata via some other path than the 588 * Some filesystems modify their metadata via some other path than the
612 * bdev buffer cache (eg. use a private mapping, or directories in 589 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -622,25 +599,29 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
622 599
623static void do_emergency_remount(struct work_struct *work) 600static void do_emergency_remount(struct work_struct *work)
624{ 601{
625 struct super_block *sb; 602 struct super_block *sb, *p = NULL;
626 603
627 spin_lock(&sb_lock); 604 spin_lock(&sb_lock);
628 list_for_each_entry(sb, &super_blocks, s_list) { 605 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (list_empty(&sb->s_instances))
607 continue;
629 sb->s_count++; 608 sb->s_count++;
630 spin_unlock(&sb_lock); 609 spin_unlock(&sb_lock);
631 down_write(&sb->s_umount); 610 down_write(&sb->s_umount);
632 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 611 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
633 /* 612 /*
634 * ->remount_fs needs lock_kernel().
635 *
636 * What lock protects sb->s_flags?? 613 * What lock protects sb->s_flags??
637 */ 614 */
638 do_remount_sb(sb, MS_RDONLY, NULL, 1); 615 do_remount_sb(sb, MS_RDONLY, NULL, 1);
639 } 616 }
640 up_write(&sb->s_umount); 617 up_write(&sb->s_umount);
641 put_super(sb);
642 spin_lock(&sb_lock); 618 spin_lock(&sb_lock);
619 if (p)
620 __put_super(p);
621 p = sb;
643 } 622 }
623 if (p)
624 __put_super(p);
644 spin_unlock(&sb_lock); 625 spin_unlock(&sb_lock);
645 kfree(work); 626 kfree(work);
646 printk("Emergency Remount complete\n"); 627 printk("Emergency Remount complete\n");
@@ -821,7 +802,16 @@ int get_sb_bdev(struct file_system_type *fs_type,
821 goto error_bdev; 802 goto error_bdev;
822 } 803 }
823 804
805 /*
806 * s_umount nests inside bd_mutex during
807 * __invalidate_device(). close_bdev_exclusive()
808 * acquires bd_mutex and can't be called under
809 * s_umount. Drop s_umount temporarily. This is safe
810 * as we're holding an active reference.
811 */
812 up_write(&s->s_umount);
824 close_bdev_exclusive(bdev, mode); 813 close_bdev_exclusive(bdev, mode);
814 down_write(&s->s_umount);
825 } else { 815 } else {
826 char b[BDEVNAME_SIZE]; 816 char b[BDEVNAME_SIZE];
827 817
@@ -957,6 +947,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
957 goto out_free_secdata; 947 goto out_free_secdata;
958 BUG_ON(!mnt->mnt_sb); 948 BUG_ON(!mnt->mnt_sb);
959 WARN_ON(!mnt->mnt_sb->s_bdi); 949 WARN_ON(!mnt->mnt_sb->s_bdi);
950 mnt->mnt_sb->s_flags |= MS_BORN;
960 951
961 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 952 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
962 if (error) 953 if (error)
@@ -990,6 +981,96 @@ out:
990 981
991EXPORT_SYMBOL_GPL(vfs_kern_mount); 982EXPORT_SYMBOL_GPL(vfs_kern_mount);
992 983
984/**
985 * freeze_super - lock the filesystem and force it into a consistent state
986 * @sb: the super to lock
987 *
988 * Syncs the super to make sure the filesystem is consistent and calls the fs's
989 * freeze_fs. Subsequent calls to this without first thawing the fs will return
990 * -EBUSY.
991 */
992int freeze_super(struct super_block *sb)
993{
994 int ret;
995
996 atomic_inc(&sb->s_active);
997 down_write(&sb->s_umount);
998 if (sb->s_frozen) {
999 deactivate_locked_super(sb);
1000 return -EBUSY;
1001 }
1002
1003 if (sb->s_flags & MS_RDONLY) {
1004 sb->s_frozen = SB_FREEZE_TRANS;
1005 smp_wmb();
1006 up_write(&sb->s_umount);
1007 return 0;
1008 }
1009
1010 sb->s_frozen = SB_FREEZE_WRITE;
1011 smp_wmb();
1012
1013 sync_filesystem(sb);
1014
1015 sb->s_frozen = SB_FREEZE_TRANS;
1016 smp_wmb();
1017
1018 sync_blockdev(sb->s_bdev);
1019 if (sb->s_op->freeze_fs) {
1020 ret = sb->s_op->freeze_fs(sb);
1021 if (ret) {
1022 printk(KERN_ERR
1023 "VFS:Filesystem freeze failed\n");
1024 sb->s_frozen = SB_UNFROZEN;
1025 deactivate_locked_super(sb);
1026 return ret;
1027 }
1028 }
1029 up_write(&sb->s_umount);
1030 return 0;
1031}
1032EXPORT_SYMBOL(freeze_super);
1033
1034/**
1035 * thaw_super -- unlock filesystem
1036 * @sb: the super to thaw
1037 *
1038 * Unlocks the filesystem and marks it writeable again after freeze_super().
1039 */
1040int thaw_super(struct super_block *sb)
1041{
1042 int error;
1043
1044 down_write(&sb->s_umount);
1045 if (sb->s_frozen == SB_UNFROZEN) {
1046 up_write(&sb->s_umount);
1047 return -EINVAL;
1048 }
1049
1050 if (sb->s_flags & MS_RDONLY)
1051 goto out;
1052
1053 if (sb->s_op->unfreeze_fs) {
1054 error = sb->s_op->unfreeze_fs(sb);
1055 if (error) {
1056 printk(KERN_ERR
1057 "VFS:Filesystem thaw failed\n");
1058 sb->s_frozen = SB_FREEZE_TRANS;
1059 up_write(&sb->s_umount);
1060 return error;
1061 }
1062 }
1063
1064out:
1065 sb->s_frozen = SB_UNFROZEN;
1066 smp_wmb();
1067 wake_up(&sb->s_wait_unfrozen);
1068 deactivate_locked_super(sb);
1069
1070 return 0;
1071}
1072EXPORT_SYMBOL(thaw_super);
1073
993static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1074static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
994{ 1075{
995 int err; 1076 int err;
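
Note (not part of the diff): freeze_super() and thaw_super() are reachable from userspace through the FIFREEZE/FITHAW ioctls, and a second FIFREEZE without an intervening FITHAW fails with EBUSY, matching the s_frozen check above. A minimal sketch (needs CAP_SYS_ADMIN and a filesystem that supports freezing):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* a second FIFREEZE before FITHAW would fail with EBUSY */
	if (ioctl(fd, FIFREEZE, 0) != 0)
		perror("FIFREEZE");
	else if (ioctl(fd, FITHAW, 0) != 0)
		perror("FITHAW");
	close(fd);
	return 0;
}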
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..ba76b9623e7e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
77} 77}
78EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
79 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
80/* 85/*
81 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
82 * emergency sync) 87 * emergency sync)
83 *
84 * This operation is careful to avoid the livelock which could easily happen
85 * if two or more filesystems are being continuously dirtied. s_need_sync
86 * is used only here. We set it against all filesystems and then clear it as
87 * we sync them. So redirtied filesystems are skipped.
88 *
89 * But if process A is currently running sync_filesystems and then process B
90 * calls sync_filesystems as well, process B will set all the s_need_sync
91 * flags again, which will cause process A to resync everything. Fix that with
92 * a local mutex.
93 */ 88 */
94static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
95{ 90{
96 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
97 static DEFINE_MUTEX(mutex);
98
99 mutex_lock(&mutex); /* Could be down_interruptible */
100 spin_lock(&sb_lock);
101 list_for_each_entry(sb, &super_blocks, s_list)
102 sb->s_need_sync = 1;
103
104restart:
105 list_for_each_entry(sb, &super_blocks, s_list) {
106 if (!sb->s_need_sync)
107 continue;
108 sb->s_need_sync = 0;
109 sb->s_count++;
110 spin_unlock(&sb_lock);
111
112 down_read(&sb->s_umount);
113 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
114 __sync_filesystem(sb, wait);
115 up_read(&sb->s_umount);
116
117 /* restart only when sb is no longer on the list */
118 spin_lock(&sb_lock);
119 if (__put_super_and_need_restart(sb))
120 goto restart;
121 }
122 spin_unlock(&sb_lock);
123 mutex_unlock(&mutex);
124} 92}
125 93
126/* 94/*
@@ -160,37 +128,9 @@ void emergency_sync(void)
160 } 128 }
161} 129}
162 130
163/*
164 * Generic function to fsync a file.
165 *
166 * filp may be NULL if called via the msync of a vma.
167 */
168int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
169{
170 struct inode * inode = dentry->d_inode;
171 struct super_block * sb;
172 int ret, err;
173
174 /* sync the inode to buffers */
175 ret = write_inode_now(inode, 0);
176
177 /* sync the superblock to buffers */
178 sb = inode->i_sb;
179 if (sb->s_dirt && sb->s_op->write_super)
180 sb->s_op->write_super(sb);
181
182 /* .. finally sync the buffers to disk */
183 err = sync_blockdev(sb->s_bdev);
184 if (!ret)
185 ret = err;
186 return ret;
187}
188EXPORT_SYMBOL(file_fsync);
189
190/** 131/**
191 * vfs_fsync_range - helper to sync a range of data & metadata to disk 132 * vfs_fsync_range - helper to sync a range of data & metadata to disk
192 * @file: file to sync 133 * @file: file to sync
193 * @dentry: dentry of @file
194 * @start: offset in bytes of the beginning of data range to sync 134 * @start: offset in bytes of the beginning of data range to sync
195 * @end: offset in bytes of the end of data range (inclusive) 135 * @end: offset in bytes of the end of data range (inclusive)
196 * @datasync: perform only datasync 136 * @datasync: perform only datasync
@@ -198,32 +138,13 @@ EXPORT_SYMBOL(file_fsync);
198 * Write back data in range @start..@end and metadata for @file to disk. If 138 * Write back data in range @start..@end and metadata for @file to disk. If
199 * @datasync is set only metadata needed to access modified file data is 139 * @datasync is set only metadata needed to access modified file data is
200 * written. 140 * written.
201 *
202 * In case this function is called from nfsd @file may be %NULL and
203 * only @dentry is set. This can only happen when the filesystem
204 * implements the export_operations API.
205 */ 141 */
206int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 142int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
207 loff_t end, int datasync)
208{ 143{
209 const struct file_operations *fop; 144 struct address_space *mapping = file->f_mapping;
210 struct address_space *mapping;
211 int err, ret; 145 int err, ret;
212 146
213 /* 147 if (!file->f_op || !file->f_op->fsync) {
214 * Get mapping and operations from the file in case we have
215 * as file, or get the default values for them in case we
216 * don't have a struct file available. Damn nfsd..
217 */
218 if (file) {
219 mapping = file->f_mapping;
220 fop = file->f_op;
221 } else {
222 mapping = dentry->d_inode->i_mapping;
223 fop = dentry->d_inode->i_fop;
224 }
225
226 if (!fop || !fop->fsync) {
227 ret = -EINVAL; 148 ret = -EINVAL;
228 goto out; 149 goto out;
229 } 150 }
@@ -235,7 +156,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
235 * livelocks in fsync_buffers_list(). 156 * livelocks in fsync_buffers_list().
236 */ 157 */
237 mutex_lock(&mapping->host->i_mutex); 158 mutex_lock(&mapping->host->i_mutex);
238 err = fop->fsync(file, dentry, datasync); 159 err = file->f_op->fsync(file, datasync);
239 if (!ret) 160 if (!ret)
240 ret = err; 161 ret = err;
241 mutex_unlock(&mapping->host->i_mutex); 162 mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +169,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
248/** 169/**
249 * vfs_fsync - perform a fsync or fdatasync on a file 170 * vfs_fsync - perform a fsync or fdatasync on a file
250 * @file: file to sync 171 * @file: file to sync
251 * @dentry: dentry of @file
252 * @datasync: only perform a fdatasync operation 172 * @datasync: only perform a fdatasync operation
253 * 173 *
254 * Write back data and metadata for @file to disk. If @datasync is 174 * Write back data and metadata for @file to disk. If @datasync is
255 * set only metadata needed to access modified file data is written. 175 * set only metadata needed to access modified file data is written.
256 *
257 * In case this function is called from nfsd @file may be %NULL and
258 * only @dentry is set. This can only happen when the filesystem
259 * implements the export_operations API.
260 */ 176 */
261int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 177int vfs_fsync(struct file *file, int datasync)
262{ 178{
263 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 179 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
264} 180}
265EXPORT_SYMBOL(vfs_fsync); 181EXPORT_SYMBOL(vfs_fsync);
266 182
@@ -271,7 +187,7 @@ static int do_fsync(unsigned int fd, int datasync)
271 187
272 file = fget(fd); 188 file = fget(fd);
273 if (file) { 189 if (file) {
274 ret = vfs_fsync(file, file->f_path.dentry, datasync); 190 ret = vfs_fsync(file, datasync);
275 fput(file); 191 fput(file);
276 } 192 }
277 return ret; 193 return ret;
@@ -299,8 +215,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
299{ 215{
300 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 216 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
301 return 0; 217 return 0;
302 return vfs_fsync_range(file, file->f_path.dentry, pos, 218 return vfs_fsync_range(file, pos, pos + count - 1,
303 pos + count - 1,
304 (file->f_flags & __O_SYNC) ? 0 : 1); 219 (file->f_flags & __O_SYNC) ? 0 : 1);
305} 220}
306EXPORT_SYMBOL(generic_write_sync); 221EXPORT_SYMBOL(generic_write_sync);
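
The fs/sync.c hunks above finish dropping the dentry argument from vfs_fsync() and vfs_fsync_range(); since nfsd no longer calls in with a NULL file, everything can be derived from the struct file itself. A minimal caller sketch against the new signatures (hypothetical function, not part of this diff — it simply restates what do_fsync() now does):

	#include <linux/fs.h>
	#include <linux/file.h>

	/* Sketch: fsync a descriptor through the post-change API. */
	static int example_sync_fd(unsigned int fd, int datasync)
	{
		struct file *file = fget(fd);
		int ret = -EBADF;

		if (file) {
			/* no dentry parameter any more */
			ret = vfs_fsync(file, datasync);
			fput(file);
		}
		return ret;
	}
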
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
46}; 46};
47 47
48static int 48static int
49fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) 49fill_read(struct file *file, char *buffer, loff_t off, size_t count)
50{ 50{
51 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 51 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 52 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 53 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
54 int rc; 54 int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
59 59
60 rc = -EIO; 60 rc = -EIO;
61 if (attr->read) 61 if (attr->read)
62 rc = attr->read(kobj, attr, buffer, off, count); 62 rc = attr->read(file, kobj, attr, buffer, off, count);
63 63
64 sysfs_put_active(attr_sd); 64 sysfs_put_active(attr_sd);
65 65
@@ -70,8 +70,7 @@ static ssize_t
70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) 70read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
71{ 71{
72 struct bin_buffer *bb = file->private_data; 72 struct bin_buffer *bb = file->private_data;
73 struct dentry *dentry = file->f_path.dentry; 73 int size = file->f_path.dentry->d_inode->i_size;
74 int size = dentry->d_inode->i_size;
75 loff_t offs = *off; 74 loff_t offs = *off;
76 int count = min_t(size_t, bytes, PAGE_SIZE); 75 int count = min_t(size_t, bytes, PAGE_SIZE);
77 char *temp; 76 char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
92 91
93 mutex_lock(&bb->mutex); 92 mutex_lock(&bb->mutex);
94 93
95 count = fill_read(dentry, bb->buffer, offs, count); 94 count = fill_read(file, bb->buffer, offs, count);
96 if (count < 0) { 95 if (count < 0) {
97 mutex_unlock(&bb->mutex); 96 mutex_unlock(&bb->mutex);
98 goto out_free; 97 goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
117} 116}
118 117
119static int 118static int
120flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) 119flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
121{ 120{
122 struct sysfs_dirent *attr_sd = dentry->d_fsdata; 121 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
123 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; 122 struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
124 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 123 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
125 int rc; 124 int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
130 129
131 rc = -EIO; 130 rc = -EIO;
132 if (attr->write) 131 if (attr->write)
133 rc = attr->write(kobj, attr, buffer, offset, count); 132 rc = attr->write(file, kobj, attr, buffer, offset, count);
134 133
135 sysfs_put_active(attr_sd); 134 sysfs_put_active(attr_sd);
136 135
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
141 size_t bytes, loff_t *off) 140 size_t bytes, loff_t *off)
142{ 141{
143 struct bin_buffer *bb = file->private_data; 142 struct bin_buffer *bb = file->private_data;
144 struct dentry *dentry = file->f_path.dentry; 143 int size = file->f_path.dentry->d_inode->i_size;
145 int size = dentry->d_inode->i_size;
146 loff_t offs = *off; 144 loff_t offs = *off;
147 int count = min_t(size_t, bytes, PAGE_SIZE); 145 int count = min_t(size_t, bytes, PAGE_SIZE);
148 char *temp; 146 char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
165 163
166 memcpy(bb->buffer, temp, count); 164 memcpy(bb->buffer, temp, count);
167 165
168 count = flush_write(dentry, bb->buffer, offs, count); 166 count = flush_write(file, bb->buffer, offs, count);
169 mutex_unlock(&bb->mutex); 167 mutex_unlock(&bb->mutex);
170 168
171 if (count > 0) 169 if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
363 if (!attr->mmap) 361 if (!attr->mmap)
364 goto out_put; 362 goto out_put;
365 363
366 rc = attr->mmap(kobj, attr, vma); 364 rc = attr->mmap(file, kobj, attr, vma);
367 if (rc) 365 if (rc)
368 goto out_put; 366 goto out_put;
369 367
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
501void sysfs_remove_bin_file(struct kobject *kobj, 499void sysfs_remove_bin_file(struct kobject *kobj,
502 const struct bin_attribute *attr) 500 const struct bin_attribute *attr)
503{ 501{
504 sysfs_hash_and_remove(kobj->sd, attr->attr.name); 502 sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
505} 503}
506 504
507EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 505EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
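
The fs/sysfs/bin.c hunks above thread the opening struct file through to the bin_attribute callbacks, whose prototypes gained a leading struct file * parameter. A provider written against the new prototypes would look roughly like this (all names hypothetical):

	#include <linux/sysfs.h>
	#include <linux/stat.h>
	#include <linux/string.h>

	static ssize_t example_bin_read(struct file *filp, struct kobject *kobj,
					struct bin_attribute *attr,
					char *buf, loff_t off, size_t count)
	{
		/* filp is now available, e.g. for per-open state */
		memset(buf, 0, count);
		return count;
	}

	static struct bin_attribute example_bin_attr = {
		.attr	= { .name = "example", .mode = S_IRUGO },
		.size	= 4096,
		.read	= example_bin_read,
	};

Registration itself is unchanged: sysfs_create_bin_file(kobj, &example_bin_attr).
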
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
380{ 380{
381 struct sysfs_inode_attrs *ps_iattr; 381 struct sysfs_inode_attrs *ps_iattr;
382 382
383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) 383 if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
384 return -EEXIST; 384 return -EEXIST;
385 385
386 sd->s_parent = sysfs_get(acxt->parent_sd); 386 sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
533 * Pointer to sysfs_dirent if found, NULL if not. 533 * Pointer to sysfs_dirent if found, NULL if not.
534 */ 534 */
535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 535struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
536 const void *ns,
536 const unsigned char *name) 537 const unsigned char *name)
537{ 538{
538 struct sysfs_dirent *sd; 539 struct sysfs_dirent *sd;
539 540
540 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) 541 for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
542 if (ns && sd->s_ns && (sd->s_ns != ns))
543 continue;
541 if (!strcmp(sd->s_name, name)) 544 if (!strcmp(sd->s_name, name))
542 return sd; 545 return sd;
546 }
543 return NULL; 547 return NULL;
544} 548}
545 549
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
558 * Pointer to sysfs_dirent if found, NULL if not. 562 * Pointer to sysfs_dirent if found, NULL if not.
559 */ 563 */
560struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 564struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
565 const void *ns,
561 const unsigned char *name) 566 const unsigned char *name)
562{ 567{
563 struct sysfs_dirent *sd; 568 struct sysfs_dirent *sd;
564 569
565 mutex_lock(&sysfs_mutex); 570 mutex_lock(&sysfs_mutex);
566 sd = sysfs_find_dirent(parent_sd, name); 571 sd = sysfs_find_dirent(parent_sd, ns, name);
567 sysfs_get(sd); 572 sysfs_get(sd);
568 mutex_unlock(&sysfs_mutex); 573 mutex_unlock(&sysfs_mutex);
569 574
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
572EXPORT_SYMBOL_GPL(sysfs_get_dirent); 577EXPORT_SYMBOL_GPL(sysfs_get_dirent);
573 578
574static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 579static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
575 const char *name, struct sysfs_dirent **p_sd) 580 enum kobj_ns_type type, const void *ns, const char *name,
581 struct sysfs_dirent **p_sd)
576{ 582{
577 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; 583 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
578 struct sysfs_addrm_cxt acxt; 584 struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
583 sd = sysfs_new_dirent(name, mode, SYSFS_DIR); 589 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
584 if (!sd) 590 if (!sd)
585 return -ENOMEM; 591 return -ENOMEM;
592
593 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
594 sd->s_ns = ns;
586 sd->s_dir.kobj = kobj; 595 sd->s_dir.kobj = kobj;
587 596
588 /* link in */ 597 /* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
601int sysfs_create_subdir(struct kobject *kobj, const char *name, 610int sysfs_create_subdir(struct kobject *kobj, const char *name,
602 struct sysfs_dirent **p_sd) 611 struct sysfs_dirent **p_sd)
603{ 612{
604 return create_dir(kobj, kobj->sd, name, p_sd); 613 return create_dir(kobj, kobj->sd,
614 KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
615}
616
617/**
 618 * sysfs_read_ns_type - return associated ns_type
619 * @kobj: the kobject being queried
620 *
621 * Each kobject can be tagged with exactly one namespace type
622 * (i.e. network or user). Return the ns_type associated with
 623 * this object, if any.
624 */
625static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
626{
627 const struct kobj_ns_type_operations *ops;
628 enum kobj_ns_type type;
629
630 ops = kobj_child_ns_ops(kobj);
631 if (!ops)
632 return KOBJ_NS_TYPE_NONE;
633
634 type = ops->type;
635 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
636 BUG_ON(type >= KOBJ_NS_TYPES);
637 BUG_ON(!kobj_ns_type_registered(type));
638
639 return type;
605} 640}
606 641
607/** 642/**
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
610 */ 645 */
611int sysfs_create_dir(struct kobject * kobj) 646int sysfs_create_dir(struct kobject * kobj)
612{ 647{
648 enum kobj_ns_type type;
613 struct sysfs_dirent *parent_sd, *sd; 649 struct sysfs_dirent *parent_sd, *sd;
650 const void *ns = NULL;
614 int error = 0; 651 int error = 0;
615 652
616 BUG_ON(!kobj); 653 BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
620 else 657 else
621 parent_sd = &sysfs_root; 658 parent_sd = &sysfs_root;
622 659
623 error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); 660 if (sysfs_ns_type(parent_sd))
661 ns = kobj->ktype->namespace(kobj);
662 type = sysfs_read_ns_type(kobj);
663
664 error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
624 if (!error) 665 if (!error)
625 kobj->sd = sd; 666 kobj->sd = sd;
626 return error; 667 return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
630 struct nameidata *nd) 671 struct nameidata *nd)
631{ 672{
632 struct dentry *ret = NULL; 673 struct dentry *ret = NULL;
633 struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata; 674 struct dentry *parent = dentry->d_parent;
675 struct sysfs_dirent *parent_sd = parent->d_fsdata;
634 struct sysfs_dirent *sd; 676 struct sysfs_dirent *sd;
635 struct inode *inode; 677 struct inode *inode;
678 enum kobj_ns_type type;
679 const void *ns;
636 680
637 mutex_lock(&sysfs_mutex); 681 mutex_lock(&sysfs_mutex);
638 682
639 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name); 683 type = sysfs_ns_type(parent_sd);
684 ns = sysfs_info(dir->i_sb)->ns[type];
685
686 sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
640 687
641 /* no such entry */ 688 /* no such entry */
642 if (!sd) { 689 if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
735} 782}
736 783
737int sysfs_rename(struct sysfs_dirent *sd, 784int sysfs_rename(struct sysfs_dirent *sd,
738 struct sysfs_dirent *new_parent_sd, const char *new_name) 785 struct sysfs_dirent *new_parent_sd, const void *new_ns,
786 const char *new_name)
739{ 787{
740 const char *dup_name = NULL; 788 const char *dup_name = NULL;
741 int error; 789 int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
743 mutex_lock(&sysfs_mutex); 791 mutex_lock(&sysfs_mutex);
744 792
745 error = 0; 793 error = 0;
746 if ((sd->s_parent == new_parent_sd) && 794 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
747 (strcmp(sd->s_name, new_name) == 0)) 795 (strcmp(sd->s_name, new_name) == 0))
748 goto out; /* nothing to rename */ 796 goto out; /* nothing to rename */
749 797
750 error = -EEXIST; 798 error = -EEXIST;
751 if (sysfs_find_dirent(new_parent_sd, new_name)) 799 if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
752 goto out; 800 goto out;
753 801
754 /* rename sysfs_dirent */ 802 /* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
770 sd->s_parent = new_parent_sd; 818 sd->s_parent = new_parent_sd;
771 sysfs_link_sibling(sd); 819 sysfs_link_sibling(sd);
772 } 820 }
821 sd->s_ns = new_ns;
773 822
774 error = 0; 823 error = 0;
775 out: 824 out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
780 829
781int sysfs_rename_dir(struct kobject *kobj, const char *new_name) 830int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
782{ 831{
783 return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name); 832 struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
833 const void *new_ns = NULL;
834
835 if (sysfs_ns_type(parent_sd))
836 new_ns = kobj->ktype->namespace(kobj);
837
838 return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
784} 839}
785 840
786int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) 841int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
787{ 842{
788 struct sysfs_dirent *sd = kobj->sd; 843 struct sysfs_dirent *sd = kobj->sd;
789 struct sysfs_dirent *new_parent_sd; 844 struct sysfs_dirent *new_parent_sd;
845 const void *new_ns = NULL;
790 846
791 BUG_ON(!sd->s_parent); 847 BUG_ON(!sd->s_parent);
848 if (sysfs_ns_type(sd->s_parent))
849 new_ns = kobj->ktype->namespace(kobj);
792 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 850 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
793 new_parent_kobj->sd : &sysfs_root; 851 new_parent_kobj->sd : &sysfs_root;
794 852
795 return sysfs_rename(sd, new_parent_sd, sd->s_name); 853 return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
796} 854}
797 855
798/* Relationship between s_mode and the DT_xxx types */ 856/* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
807 return 0; 865 return 0;
808} 866}
809 867
810static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, 868static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
811 ino_t ino, struct sysfs_dirent *pos) 869 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
812{ 870{
813 if (pos) { 871 if (pos) {
814 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && 872 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
815 pos->s_parent == parent_sd && 873 pos->s_parent == parent_sd &&
816 ino == pos->s_ino; 874 ino == pos->s_ino;
817 sysfs_put(pos); 875 sysfs_put(pos);
818 if (valid) 876 if (!valid)
819 return pos; 877 pos = NULL;
820 } 878 }
821 pos = NULL; 879 if (!pos && (ino > 1) && (ino < INT_MAX)) {
822 if ((ino > 1) && (ino < INT_MAX)) {
823 pos = parent_sd->s_dir.children; 880 pos = parent_sd->s_dir.children;
824 while (pos && (ino > pos->s_ino)) 881 while (pos && (ino > pos->s_ino))
825 pos = pos->s_sibling; 882 pos = pos->s_sibling;
826 } 883 }
884 while (pos && pos->s_ns && pos->s_ns != ns)
885 pos = pos->s_sibling;
827 return pos; 886 return pos;
828} 887}
829 888
830static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, 889static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
831 ino_t ino, struct sysfs_dirent *pos) 890 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
832{ 891{
833 pos = sysfs_dir_pos(parent_sd, ino, pos); 892 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
834 if (pos) 893 if (pos)
835 pos = pos->s_sibling; 894 pos = pos->s_sibling;
895 while (pos && pos->s_ns && pos->s_ns != ns)
896 pos = pos->s_sibling;
836 return pos; 897 return pos;
837} 898}
838 899
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
841 struct dentry *dentry = filp->f_path.dentry; 902 struct dentry *dentry = filp->f_path.dentry;
842 struct sysfs_dirent * parent_sd = dentry->d_fsdata; 903 struct sysfs_dirent * parent_sd = dentry->d_fsdata;
843 struct sysfs_dirent *pos = filp->private_data; 904 struct sysfs_dirent *pos = filp->private_data;
905 enum kobj_ns_type type;
906 const void *ns;
844 ino_t ino; 907 ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
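
Lookup and readdir in fs/sysfs/dir.c are now keyed on a namespace tag pulled from the superblock, and the tag for a new directory comes from the kobject's ktype. A minimal .namespace hook, sketched for a network-namespace-style tag (example_obj_to_ns is a hypothetical helper, not a real kernel API):

	#include <linux/kobject.h>

	static const void *example_namespace(struct kobject *kobj)
	{
		/* return the tag this object belongs to, e.g. a struct net * */
		return example_obj_to_ns(kobj);		/* hypothetical */
	}

	static struct kobj_type example_ktype = {
		.namespace = example_namespace,
	};

As the sysfs_create_dir() hunk shows, the hook is only consulted when the parent directory is itself tagged (sysfs_ns_type(parent_sd) != 0).
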
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
340 char *p; 340 char *p;
341 341
342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); 342 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
343 if (p) 343 if (!IS_ERR(p))
344 memmove(last_sysfs_file, p, strlen(p) + 1); 344 memmove(last_sysfs_file, p, strlen(p) + 1);
345 345
346 /* need attr_sd for attr and ops, its parent for kobj */ 346 /* need attr_sd for attr and ops, its parent for kobj */
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -590,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
590 * @mode: file permissions. 593 * @mode: file permissions.
591 * 594 *
592 */ 595 */
593int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 596int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
597 mode_t mode)
594{ 598{
595 struct sysfs_dirent *sd; 599 struct sysfs_dirent *sd;
596 struct iattr newattrs; 600 struct iattr newattrs;
@@ -599,7 +603,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 603 mutex_lock(&sysfs_mutex);
600 604
601 rc = -ENOENT; 605 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 606 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 607 if (!sd)
604 goto out; 608 goto out;
605 609
@@ -624,7 +628,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 628
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 629void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 630{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 631 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 632}
629 633
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 634void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +650,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 650 struct sysfs_dirent *dir_sd;
647 651
648 if (group) 652 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 653 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 654 else
651 dir_sd = sysfs_get(kobj->sd); 655 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 656 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 657 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 658 sysfs_put(dir_sd);
655 } 659 }
656} 660}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a9419711..cffb1fd8ba33 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 error = sysfs_sd_setattr(sd, iattr);
121
122 error = inode_setattr(inode, iattr);
123 if (error) 121 if (error)
124 goto out; 122 goto out;
125 123
126 error = sysfs_sd_setattr(sd, iattr); 124 /* this ignores size changes */
125 setattr_copy(inode, iattr);
126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
129 return error; 129 return error;
@@ -312,19 +312,19 @@ struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs. 312 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
313 * To prevent the sysfs inode numbers from being freed prematurely we take a 313 * To prevent the sysfs inode numbers from being freed prematurely we take a
314 * reference to sysfs_dirent from the sysfs inode. A 314 * reference to sysfs_dirent from the sysfs inode. A
315 * super_operations.delete_inode() implementation is needed to drop that 315 * super_operations.evict_inode() implementation is needed to drop that
316 * reference upon inode destruction. 316 * reference upon inode destruction.
317 */ 317 */
318void sysfs_delete_inode(struct inode *inode) 318void sysfs_evict_inode(struct inode *inode)
319{ 319{
320 struct sysfs_dirent *sd = inode->i_private; 320 struct sysfs_dirent *sd = inode->i_private;
321 321
322 truncate_inode_pages(&inode->i_data, 0); 322 truncate_inode_pages(&inode->i_data, 0);
323 clear_inode(inode); 323 end_writeback(inode);
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
328{ 328{
329 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
330 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
334 334
335 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
336 336
337 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
338 if (sd && (sd->s_ns != ns))
339 sd = NULL;
338 if (sd) 340 if (sd)
339 sysfs_remove_one(&acxt, sd); 341 sysfs_remove_one(&acxt, sd);
340 342
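
The conversion from .delete_inode to .evict_inode above follows the common 2.6.36 pattern: truncate the page cache, call end_writeback() where clear_inode() used to be, then drop fs-private references. Schematically (generic sketch, not sysfs-specific):

	#include <linux/fs.h>
	#include <linux/mm.h>

	/* Sketch of the evict_inode contract as used throughout this series. */
	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop page cache */
		end_writeback(inode);				/* replaces clear_inode() */
		/* ...release fs-private data hanging off the inode here... */
	}
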
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 776137828dca..f2af22574c50 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -29,13 +29,13 @@ struct kmem_cache *sysfs_dir_cachep;
29static const struct super_operations sysfs_ops = { 29static const struct super_operations sysfs_ops = {
30 .statfs = simple_statfs, 30 .statfs = simple_statfs,
31 .drop_inode = generic_delete_inode, 31 .drop_inode = generic_delete_inode,
32 .delete_inode = sysfs_delete_inode, 32 .evict_inode = sysfs_evict_inode,
33}; 33};
34 34
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
72 return 0; 72 return 0;
73} 73}
74 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
75static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
76 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
77{ 100{
78 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
79} 146}
80 147
81static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
82 .name = "sysfs", 149 .name = "sysfs",
83 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
84 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
85}; 152};
86 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
 164 * list, the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
175
87int __init sysfs_init(void) 176int __init sysfs_init(void)
88{ 177{
89 int err = -ENOMEM; 178 int err = -ENOMEM;
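
sysfs_get_sb() above uses the classic sget() test/set pair so that mounts from different namespace sets get distinct superblocks. The same pattern in isolation, simplified to a single opaque tag where sysfs compares a whole array of them (sketch only):

	#include <linux/fs.h>

	/* Reuse an existing sb only when it carries the same tag. */
	static int tag_test_super(struct super_block *sb, void *data)
	{
		return sb->s_fs_info == data;
	}

	static int tag_set_super(struct super_block *sb, void *data)
	{
		int error = set_anon_super(sb, data);
		if (!error)
			sb->s_fs_info = data;	/* sb now owns the tag */
		return error;
	}

A caller then does sb = sget(fs_type, tag_test_super, tag_set_super, tag) and fills the superblock only when sb->s_root is still NULL, exactly as the hunk does.
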
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index b93ec51fa7ac..a7ac78f8e67a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
28 struct sysfs_dirent *target_sd = NULL; 28 struct sysfs_dirent *target_sd = NULL;
29 struct sysfs_dirent *sd = NULL; 29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt; 30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
31 int error; 32 int error;
32 33
33 BUG_ON(!name); 34 BUG_ON(!name);
@@ -58,14 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
58 if (!sd) 59 if (!sd)
59 goto out_put; 60 goto out_put;
60 61
62 ns_type = sysfs_ns_type(parent_sd);
63 if (ns_type)
64 sd->s_ns = target->ktype->namespace(target);
61 sd->s_symlink.target_sd = target_sd; 65 sd->s_symlink.target_sd = target_sd;
62 target_sd = NULL; /* reference is now owned by the symlink */ 66 target_sd = NULL; /* reference is now owned by the symlink */
63 67
64 sysfs_addrm_start(&acxt, parent_sd); 68 sysfs_addrm_start(&acxt, parent_sd);
65 if (warn) 69 /* Symlinks must be between directories with the same ns_type */
66 error = sysfs_add_one(&acxt, sd); 70 if (!ns_type ||
67 else 71 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
68 error = __sysfs_add_one(&acxt, sd); 72 if (warn)
73 error = sysfs_add_one(&acxt, sd);
74 else
75 error = __sysfs_add_one(&acxt, sd);
76 } else {
77 error = -EINVAL;
78 WARN(1, KERN_WARNING
79 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
80 parent_sd->s_name,
81 sd->s_name,
82 sd->s_symlink.target_sd->s_parent->s_name,
83 sd->s_symlink.target_sd->s_name);
84 }
69 sysfs_addrm_finish(&acxt); 85 sysfs_addrm_finish(&acxt);
70 86
71 if (error) 87 if (error)
@@ -107,6 +123,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
107} 123}
108 124
109/** 125/**
126 * sysfs_delete_link - remove symlink in object's directory.
127 * @kobj: object we're acting for.
128 * @targ: object we're pointing to.
129 * @name: name of the symlink to remove.
130 *
 131 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
132 * to successfully delete symlinks in tagged directories.
133 */
134void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
135 const char *name)
136{
137 const void *ns = NULL;
138 spin_lock(&sysfs_assoc_lock);
139 if (targ->sd && sysfs_ns_type(kobj->sd))
140 ns = targ->sd->s_ns;
141 spin_unlock(&sysfs_assoc_lock);
142 sysfs_hash_and_remove(kobj->sd, ns, name);
143}
144
145/**
110 * sysfs_remove_link - remove symlink in object's directory. 146 * sysfs_remove_link - remove symlink in object's directory.
111 * @kobj: object we're acting for. 147 * @kobj: object we're acting for.
112 * @name: name of the symlink to remove. 148 * @name: name of the symlink to remove.
@@ -121,7 +157,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
121 else 157 else
122 parent_sd = kobj->sd; 158 parent_sd = kobj->sd;
123 159
124 sysfs_hash_and_remove(parent_sd, name); 160 sysfs_hash_and_remove(parent_sd, NULL, name);
125} 161}
126 162
127/** 163/**
@@ -137,6 +173,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new) 173 const char *old, const char *new)
138{ 174{
139 struct sysfs_dirent *parent_sd, *sd = NULL; 175 struct sysfs_dirent *parent_sd, *sd = NULL;
176 const void *old_ns = NULL, *new_ns = NULL;
140 int result; 177 int result;
141 178
142 if (!kobj) 179 if (!kobj)
@@ -144,8 +181,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
144 else 181 else
145 parent_sd = kobj->sd; 182 parent_sd = kobj->sd;
146 183
184 if (targ->sd)
185 old_ns = targ->sd->s_ns;
186
147 result = -ENOENT; 187 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old); 188 sd = sysfs_get_dirent(parent_sd, old_ns, old);
149 if (!sd) 189 if (!sd)
150 goto out; 190 goto out;
151 191
@@ -155,7 +195,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 195 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out; 196 goto out;
157 197
158 result = sysfs_rename(sd, parent_sd, new); 198 if (sysfs_ns_type(parent_sd))
199 new_ns = targ->ktype->namespace(targ);
200
201 result = sysfs_rename(sd, parent_sd, new_ns, new);
159 202
160out: 203out:
161 sysfs_put(sd); 204 sysfs_put(sd);
@@ -261,3 +304,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
261 304
262EXPORT_SYMBOL_GPL(sysfs_create_link); 305EXPORT_SYMBOL_GPL(sysfs_create_link);
263EXPORT_SYMBOL_GPL(sysfs_remove_link); 306EXPORT_SYMBOL_GPL(sysfs_remove_link);
307EXPORT_SYMBOL_GPL(sysfs_rename_link);
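
sysfs_delete_link() above exists because a plain name lookup in a tagged directory can miss, or hit the wrong, entry once namespaces are in play; reading the tag off the target's sysfs_dirent disambiguates. A caller sketch (names hypothetical):

	/* Sketch: tear down a cross-reference created with sysfs_create_link(). */
	static void example_unlink_peer(struct kobject *kobj, struct kobject *peer)
	{
		/* resolves the entry via peer->sd->s_ns, not just the name */
		sysfs_delete_link(kobj, peer, "peer");
	}
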
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..d9be60a2e956 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
133 * Each sb is associated with a set of namespace tags (i.e.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -172,14 +198,14 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
172 * inode.c 198 * inode.c
173 */ 199 */
174struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); 200struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
175void sysfs_delete_inode(struct inode *inode); 201void sysfs_evict_inode(struct inode *inode);
176int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); 202int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
177int sysfs_permission(struct inode *inode, int mask); 203int sysfs_permission(struct inode *inode, int mask);
178int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 204int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
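
The sysfs.h changes pack the namespace type into bits 8-15 of s_flags, which is why SYSFS_FLAG_REMOVED had to move from 0x0200 up to 0x020000. The round trip is plain bit arithmetic; a standalone sketch with the constants copied from the hunk (userspace C, just to show the math — 0x0001 is assumed here to be SYSFS_DIR):

	#include <stdio.h>

	#define SYSFS_NS_TYPE_MASK	0xff00
	#define SYSFS_NS_TYPE_SHIFT	8
	#define SYSFS_FLAG_REMOVED	0x020000

	int main(void)
	{
		unsigned int type = 2;	/* some enum kobj_ns_type value */
		unsigned int s_flags = 0x0001			/* SYSFS_DIR */
				     | (type << SYSFS_NS_TYPE_SHIFT)
				     | SYSFS_FLAG_REMOVED;

		/* recover the type exactly as sysfs_ns_type() does */
		printf("%u\n",
		       (s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT);
		return 0;				/* prints 2 */
	}
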
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..a77c42157620 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
@@ -218,8 +218,7 @@ got_it:
218 pos = page_offset(page) + 218 pos = page_offset(page) +
219 (char*)de - (char*)page_address(page); 219 (char*)de - (char*)page_address(page);
220 lock_page(page); 220 lock_page(page);
221 err = __sysv_write_begin(NULL, page->mapping, pos, SYSV_DIRSIZE, 221 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
222 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
223 if (err) 222 if (err)
224 goto out_unlock; 223 goto out_unlock;
225 memcpy (de->name, name, namelen); 224 memcpy (de->name, name, namelen);
@@ -239,15 +238,13 @@ out_unlock:
239 238
240int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) 239int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
241{ 240{
242 struct address_space *mapping = page->mapping; 241 struct inode *inode = page->mapping->host;
243 struct inode *inode = (struct inode*)mapping->host;
244 char *kaddr = (char*)page_address(page); 242 char *kaddr = (char*)page_address(page);
245 loff_t pos = page_offset(page) + (char *)de - kaddr; 243 loff_t pos = page_offset(page) + (char *)de - kaddr;
246 int err; 244 int err;
247 245
248 lock_page(page); 246 lock_page(page);
249 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 247 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
250 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
251 BUG_ON(err); 248 BUG_ON(err);
252 de->inode = 0; 249 de->inode = 0;
253 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 250 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
@@ -259,16 +256,14 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
259 256
260int sysv_make_empty(struct inode *inode, struct inode *dir) 257int sysv_make_empty(struct inode *inode, struct inode *dir)
261{ 258{
262 struct address_space *mapping = inode->i_mapping; 259 struct page *page = grab_cache_page(inode->i_mapping, 0);
263 struct page *page = grab_cache_page(mapping, 0);
264 struct sysv_dir_entry * de; 260 struct sysv_dir_entry * de;
265 char *base; 261 char *base;
266 int err; 262 int err;
267 263
268 if (!page) 264 if (!page)
269 return -ENOMEM; 265 return -ENOMEM;
270 err = __sysv_write_begin(NULL, mapping, 0, 2 * SYSV_DIRSIZE, 266 err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
271 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
272 if (err) { 267 if (err) {
273 unlock_page(page); 268 unlock_page(page);
274 goto fail; 269 goto fail;
@@ -341,15 +336,13 @@ not_empty:
341void sysv_set_link(struct sysv_dir_entry *de, struct page *page, 336void sysv_set_link(struct sysv_dir_entry *de, struct page *page,
342 struct inode *inode) 337 struct inode *inode)
343{ 338{
344 struct address_space *mapping = page->mapping; 339 struct inode *dir = page->mapping->host;
345 struct inode *dir = mapping->host;
346 loff_t pos = page_offset(page) + 340 loff_t pos = page_offset(page) +
347 (char *)de-(char*)page_address(page); 341 (char *)de-(char*)page_address(page);
348 int err; 342 int err;
349 343
350 lock_page(page); 344 lock_page(page);
351 err = __sysv_write_begin(NULL, mapping, pos, SYSV_DIRSIZE, 345 err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
352 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
353 BUG_ON(err); 346 BUG_ON(err);
354 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); 347 de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
355 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); 348 err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..0a65939508e9 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,11 +26,33 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
33static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
34{
35 struct inode *inode = dentry->d_inode;
36 int error;
37
38 error = inode_change_ok(inode, attr);
39 if (error)
40 return error;
41
42 if ((attr->ia_valid & ATTR_SIZE) &&
43 attr->ia_size != i_size_read(inode)) {
44 error = vmtruncate(inode, attr->ia_size);
45 if (error)
46 return error;
47 }
48
49 setattr_copy(inode, attr);
50 mark_inode_dirty(inode);
51 return 0;
52}
53
33const struct inode_operations sysv_file_inode_operations = { 54const struct inode_operations sysv_file_inode_operations = {
34 .truncate = sysv_truncate, 55 .truncate = sysv_truncate,
56 .setattr = sysv_setattr,
35 .getattr = sysv_getattr, 57 .getattr = sysv_getattr,
36}; 58};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..0c96c98bd1db 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -25,6 +25,7 @@
25#include <linux/stat.h> 25#include <linux/stat.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/writeback.h>
28#include "sysv.h" 29#include "sysv.h"
29 30
30/* We don't trust the value of 31/* We don't trust the value of
@@ -112,7 +113,6 @@ void sysv_free_inode(struct inode * inode)
112 return; 113 return;
113 } 114 }
114 raw_inode = sysv_raw_inode(sb, ino, &bh); 115 raw_inode = sysv_raw_inode(sb, ino, &bh);
115 clear_inode(inode);
116 if (!raw_inode) { 116 if (!raw_inode) {
117 printk("sysv_free_inode: unable to read inode block on device " 117 printk("sysv_free_inode: unable to read inode block on device "
118 "%s\n", inode->i_sb->s_id); 118 "%s\n", inode->i_sb->s_id);
@@ -139,6 +139,9 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
139 struct inode *inode; 139 struct inode *inode;
140 sysv_ino_t ino; 140 sysv_ino_t ino;
141 unsigned count; 141 unsigned count;
142 struct writeback_control wbc = {
143 .sync_mode = WB_SYNC_NONE
144 };
142 145
143 inode = new_inode(sb); 146 inode = new_inode(sb);
144 if (!inode) 147 if (!inode)
@@ -159,15 +162,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 162 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 163 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 164 dirty_sb(sb);
162 165 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 166 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 167 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 168 inode->i_blocks = 0;
@@ -176,8 +171,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 171 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 172 mark_inode_dirty(inode);
178 173
179 inode->i_mode = mode; /* for sysv_write_inode() */ 174 sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 175 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 176 /* That's it. */
183 unlock_super(sb); 177 unlock_super(sb);
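
inode_init_owner() is the 2.6.35 helper that replaces the open-coded uid/gid/setgid-directory logic removed above; its behavior is roughly the following (a sketch of the real helper in fs/inode.c, not a new API):

	#include <linux/fs.h>
	#include <linux/cred.h>

	/* Rough equivalent of inode_init_owner(). */
	static void example_init_owner(struct inode *inode,
				       const struct inode *dir, mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* directories inherit setgid */
		} else {
			inode->i_gid = current_fsgid();
		}
		inode->i_mode = mode;
	}
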
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..de44d067b9e6 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
@@ -70,8 +71,8 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
70 lock_super(sb); 71 lock_super(sb);
71 if (sbi->s_forced_ro) 72 if (sbi->s_forced_ro)
72 *flags |= MS_RDONLY; 73 *flags |= MS_RDONLY;
73 if (!(*flags & MS_RDONLY)) 74 if (*flags & MS_RDONLY)
74 sb->s_dirt = 1; 75 sysv_write_super(sb);
75 unlock_super(sb); 76 unlock_super(sb);
76 return 0; 77 return 0;
77} 78}
@@ -307,12 +308,17 @@ int sysv_sync_inode(struct inode *inode)
307 return __sysv_write_inode(inode, 1); 308 return __sysv_write_inode(inode, 1);
308} 309}
309 310
310static void sysv_delete_inode(struct inode *inode) 311static void sysv_evict_inode(struct inode *inode)
311{ 312{
312 truncate_inode_pages(&inode->i_data, 0); 313 truncate_inode_pages(&inode->i_data, 0);
313 inode->i_size = 0; 314 if (!inode->i_nlink) {
314 sysv_truncate(inode); 315 inode->i_size = 0;
315 sysv_free_inode(inode); 316 sysv_truncate(inode);
317 }
318 invalidate_inode_buffers(inode);
319 end_writeback(inode);
320 if (!inode->i_nlink)
321 sysv_free_inode(inode);
316} 322}
317 323
318static struct kmem_cache *sysv_inode_cachep; 324static struct kmem_cache *sysv_inode_cachep;
@@ -343,7 +349,7 @@ const struct super_operations sysv_sops = {
343 .alloc_inode = sysv_alloc_inode, 349 .alloc_inode = sysv_alloc_inode,
344 .destroy_inode = sysv_destroy_inode, 350 .destroy_inode = sysv_destroy_inode,
345 .write_inode = sysv_write_inode, 351 .write_inode = sysv_write_inode,
346 .delete_inode = sysv_delete_inode, 352 .evict_inode = sysv_evict_inode,
347 .put_super = sysv_put_super, 353 .put_super = sysv_put_super,
348 .write_super = sysv_write_super, 354 .write_super = sysv_write_super,
349 .sync_fs = sysv_sync_fs, 355 .sync_fs = sysv_sync_fs,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index f042eec464c2..9ca66276315e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -459,20 +459,25 @@ static int sysv_readpage(struct file *file, struct page *page)
459 return block_read_full_page(page,get_block); 459 return block_read_full_page(page,get_block);
460} 460}
461 461
462int __sysv_write_begin(struct file *file, struct address_space *mapping, 462int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
463 loff_t pos, unsigned len, unsigned flags,
464 struct page **pagep, void **fsdata)
465{ 463{
466 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 464 return __block_write_begin(page, pos, len, get_block);
467 get_block);
468} 465}
469 466
470static int sysv_write_begin(struct file *file, struct address_space *mapping, 467static int sysv_write_begin(struct file *file, struct address_space *mapping,
471 loff_t pos, unsigned len, unsigned flags, 468 loff_t pos, unsigned len, unsigned flags,
472 struct page **pagep, void **fsdata) 469 struct page **pagep, void **fsdata)
473{ 470{
474 *pagep = NULL; 471 int ret;
475 return __sysv_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 472
473 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
474 if (unlikely(ret)) {
475 loff_t isize = mapping->host->i_size;
476 if (pos + len > isize)
477 vmtruncate(mapping->host, isize);
478 }
479
480 return ret;
476} 481}
477 482
478static sector_t sysv_bmap(struct address_space *mapping, sector_t block) 483static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
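
The write_begin conversion above adopts the new 2.6.36 convention: block_write_begin() no longer trims blocks instantiated past EOF when it fails, so the filesystem must do that itself. The pattern the diff uses, in isolation (generic sketch with a caller-supplied get_block):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	/* Sketch: post-2.6.36 write_begin error handling. */
	static int example_write_begin(struct file *file,
				       struct address_space *mapping,
				       loff_t pos, unsigned len, unsigned flags,
				       struct page **pagep, void **fsdata,
				       get_block_t *get_block)
	{
		int ret = block_write_begin(mapping, pos, len, flags,
					    pagep, get_block);
		if (unlikely(ret) && pos + len > mapping->host->i_size)
			vmtruncate(mapping->host, mapping->host->i_size);
		return ret;
	}
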
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 5a903da54551..a0b0cda6927e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -347,7 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
347 sb->s_flags |= MS_RDONLY; 347 sb->s_flags |= MS_RDONLY;
348 if (sbi->s_truncate) 348 if (sbi->s_truncate)
349 sb->s_root->d_op = &sysv_dentry_operations; 349 sb->s_root->d_op = &sysv_dentry_operations;
350 sb->s_dirt = 1;
351 return 1; 350 return 1;
352} 351}
353 352
@@ -435,12 +434,46 @@ Ebadsize:
435 goto failed; 434 goto failed;
436} 435}
437 436
438static int v7_fill_super(struct super_block *sb, void *data, int silent) 437static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
439{ 438{
440 struct sysv_sb_info *sbi;
441 struct buffer_head *bh, *bh2 = NULL;
442 struct v7_super_block *v7sb; 439 struct v7_super_block *v7sb;
443 struct sysv_inode *v7i; 440 struct sysv_inode *v7i;
441 struct buffer_head *bh2;
442 struct sysv_sb_info *sbi;
443
444 sbi = sb->s_fs_info;
445
446 /* plausibility check on superblock */
447 v7sb = (struct v7_super_block *) bh->b_data;
448 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
449 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
450 fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
451 return 0;
452
453 /* plausibility check on root inode: it is a directory,
454 with a nonzero size that is a multiple of 16 */
455 bh2 = sb_bread(sb, 2);
456 if (bh2 == NULL)
457 return 0;
458
459 v7i = (struct sysv_inode *)(bh2->b_data + 64);
460 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
461 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
462 (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
463 (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
464 sizeof(struct sysv_dir_entry))) {
465 brelse(bh2);
466 return 0;
467 }
468
469 brelse(bh2);
470 return 1;
471}
472
473static int v7_fill_super(struct super_block *sb, void *data, int silent)
474{
475 struct sysv_sb_info *sbi;
476 struct buffer_head *bh;
444 477
445 if (440 != sizeof (struct v7_super_block)) 478 if (440 != sizeof (struct v7_super_block))
446 panic("V7 FS: bad super-block size"); 479 panic("V7 FS: bad super-block size");
@@ -454,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
454 sbi->s_sb = sb; 487 sbi->s_sb = sb;
455 sbi->s_block_base = 0; 488 sbi->s_block_base = 0;
456 sbi->s_type = FSTYPE_V7; 489 sbi->s_type = FSTYPE_V7;
457 sbi->s_bytesex = BYTESEX_PDP;
458 sb->s_fs_info = sbi; 490 sb->s_fs_info = sbi;
459 491
460 sb_set_blocksize(sb, 512); 492 sb_set_blocksize(sb, 512);
@@ -466,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
466 goto failed; 498 goto failed;
467 } 499 }
468 500
469 /* plausibility check on superblock */ 501 /* Try PDP-11 UNIX */
470 v7sb = (struct v7_super_block *) bh->b_data; 502 sbi->s_bytesex = BYTESEX_PDP;
471 if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || 503 if (v7_sanity_check(sb, bh))
472 fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || 504 goto detected;
473 fs32_to_cpu(sbi, v7sb->s_time) == 0)
474 goto failed;
475 505
476 /* plausibility check on root inode: it is a directory, 506 /* Try PC/IX, v7/x86 */
477 with a nonzero size that is a multiple of 16 */ 507 sbi->s_bytesex = BYTESEX_LE;
478 if ((bh2 = sb_bread(sb, 2)) == NULL) 508 if (v7_sanity_check(sb, bh))
479 goto failed; 509 goto detected;
480 v7i = (struct sysv_inode *)(bh2->b_data + 64);
481 if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
482 (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
483 (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
484 goto failed;
485 brelse(bh2);
486 bh2 = NULL;
487 510
511 goto failed;
512
513detected:
488 sbi->s_bh1 = bh; 514 sbi->s_bh1 = bh;
489 sbi->s_bh2 = bh; 515 sbi->s_bh2 = bh;
490 if (complete_read_super(sb, silent, 1)) 516 if (complete_read_super(sb, silent, 1))
491 return 0; 517 return 0;
492 518
493failed: 519failed:
494 brelse(bh2); 520 printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
521 sb->s_id);
495 brelse(bh); 522 brelse(bh);
496 kfree(sbi); 523 kfree(sbi);
497 return -EINVAL; 524 return -EINVAL;
@@ -560,4 +587,5 @@ static void __exit exit_sysv_fs(void)
560 587
561module_init(init_sysv_fs) 588module_init(init_sysv_fs)
562module_exit(exit_sysv_fs) 589module_exit(exit_sysv_fs)
590MODULE_ALIAS("v7");
563MODULE_LICENSE("GPL"); 591MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 94cb9b4d76c2..bb55cdb394bf 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -136,9 +136,7 @@ extern unsigned long sysv_count_free_blocks(struct super_block *);
 
 /* itree.c */
 extern void sysv_truncate(struct inode *);
-extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, void **fsdata);
+extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 98158de91d24..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 	struct timerfd_ctx *ctx = file->private_data;
 	ssize_t res;
 	u64 ticks = 0;
-	DECLARE_WAITQUEUE(wait, current);
 
 	if (count < sizeof(ticks))
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
-	res = -EAGAIN;
-	if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) {
-		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->ticks) {
-				res = 0;
-				break;
-			}
-			if (signal_pending(current)) {
-				res = -ERESTARTSYS;
-				break;
-			}
-			spin_unlock_irq(&ctx->wqh.lock);
-			schedule();
-			spin_lock_irq(&ctx->wqh.lock);
-		}
-		__remove_wait_queue(&ctx->wqh, &wait);
-		__set_current_state(TASK_RUNNING);
-	}
+	if (file->f_flags & O_NONBLOCK)
+		res = -EAGAIN;
+	else
+		res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
 	if (ctx->ticks) {
 		ticks = ctx->ticks;
 		if (ctx->expired && ctx->tintv.tv64) {
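The timerfd hunk swaps a hand-rolled wait loop for wait_event_interruptible_locked_irq(), which is entered with the waitqueue spinlock held (irqs off), drops the lock while sleeping, and returns with it re-taken; -ERESTARTSYS reports interruption by a signal. A condensed sketch of the calling convention, using the names from the hunk:

    spin_lock_irq(&ctx->wqh.lock);
    if (file->f_flags & O_NONBLOCK)
        res = -EAGAIN;
    else
        /* sleeps with ctx->wqh.lock released; re-tests ctx->ticks on wakeup */
        res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
    /* ctx->ticks can now be consumed under the same lock */
    spin_unlock_irq(&ctx->wqh.lock);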
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 076ca50e9933..c8ff0d1ae5d3 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -62,7 +62,9 @@
  */
 static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
+	down_read(&c->vfs_sb->s_umount);
 	writeback_inodes_sb(c->vfs_sb);
+	up_read(&c->vfs_sb->s_umount);
 }
 
 /**
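As the budget.c hunk implies, writeback_inodes_sb() now expects its caller to hold the superblock's s_umount rwsem for read, so UBIFS brackets the call explicitly:

    down_read(&sb->s_umount);    /* sb is the struct super_block in question */
    writeback_inodes_sb(sb);
    up_read(&sb->s_umount);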
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	 */
 	inode->i_flags |= (S_NOCMTIME);
 
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 			 ubifs_current_time(inode);
 	inode->i_mapping->nrpages = 0;
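inode_init_owner() centralizes exactly the ownership boilerplate deleted above. Judging from the removed lines, its effect is roughly the following (a sketch of the semantics, not the kernel's literal implementation):

    /* approximate behaviour of inode_init_owner(inode, dir, mode) */
    inode->i_uid = current_fsuid();
    if (dir && (dir->i_mode & S_ISGID)) {
        inode->i_gid = dir->i_gid;    /* inherit group from setgid parent */
        if (S_ISDIR(mode))
            mode |= S_ISGID;          /* setgid propagates to subdirectories */
    } else {
        inode->i_gid = current_fsgid();
    }
    inode->i_mode = mode;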
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..03ae894c45de 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,16 @@ static int do_writepage(struct page *page, int len)
  * the page locked, and it locks @ui_mutex. However, write-back does take inode
  * @i_mutex, which means other VFS operations may be run on this inode at the
  * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
  * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
  * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
  * means that @inode->i_size is changed while @ui_mutex is unlocked.
  *
+ * XXX(truncate): with the new truncate sequence this is not true anymore,
+ * and the calls to truncate_setsize can be move around freely. They should
+ * be moved to the very end of the truncate sequence.
+ *
  * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
  * inode size. How do we do this if @inode->i_size may became smaller while we
  * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,9 +1129,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 		budgeted = 0;
 	}
 
-	err = vmtruncate(inode, new_size);
-	if (err)
-		goto out_budg;
+	truncate_setsize(inode, new_size);
 
 	if (offset) {
 		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
@@ -1214,16 +1216,14 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-		err = vmtruncate(inode, new_size);
-		if (err)
-			goto out;
+		truncate_setsize(inode, new_size);
 	}
 
 	mutex_lock(&ui->ui_mutex);
 	if (attr->ia_valid & ATTR_SIZE) {
 		/* Truncation changes inode [mc]time */
 		inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-		/* 'vmtruncate()' changed @i_size, update @ui_size */
+		/* 'truncate_setsize()' changed @i_size, update @ui_size */
 		ui->ui_size = inode->i_size;
 	}
 
@@ -1245,10 +1245,6 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (IS_SYNC(inode))
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
-
-out:
-	ubifs_release_budget(c, &req);
-	return err;
 }
 
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -1304,9 +1300,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ubifs_fsync(struct file *file, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	int err;
 
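Two interface changes run through this file: the fallible vmtruncate() becomes truncate_setsize(), which returns void (it updates i_size and then drops the now-stale page-cache pages), and ->fsync loses its dentry parameter, recovering the inode from the file's mapping. Sketch of the new shapes (example_fsync is a made-up name):

    truncate_setsize(inode, new_size);    /* no error to check any more */

    int example_fsync(struct file *file, int datasync)
    {
        struct inode *inode = file->f_mapping->host;
        /* ... write the inode out as before ... */
        return 0;
    }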
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 77d5cf4a7547..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 	if (!c->ro_media) {
 		c->ro_media = 1;
 		c->no_chk_data_crc = 0;
+		c->vfs_sb->s_flags |= MS_RDONLY;
 		ubifs_warn("switched to read-only mode, error %d", err);
 		dbg_dump_stack();
 	}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ad7f67b827ea..0084a33c4c69 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
@@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 	nnode = c->nroot;
 	nnode = dirty_cow_nnode(c, nnode);
 	if (IS_ERR(nnode))
-		return ERR_PTR(PTR_ERR(nnode));
+		return ERR_CAST(nnode);
 	i = lnum - c->main_first;
 	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
 	for (h = 1; h < c->lpt_hght; h++) {
@@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 		nnode = dirty_cow_nnode(c, nnode);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	shft -= UBIFS_LPT_FANOUT_SHIFT;
 	pnode = ubifs_get_pnode(c, nnode, iip);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	pnode = dirty_cow_pnode(c, pnode);
 	if (IS_ERR(pnode))
-		return ERR_PTR(PTR_ERR(pnode));
+		return ERR_CAST(pnode);
 	iip = (i & (UBIFS_LPT_FANOUT - 1));
 	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
 	       pnode->lprops[iip].free, pnode->lprops[iip].dirty,
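ERR_PTR(PTR_ERR(p)) decodes an error pointer to a long only to re-encode it; ERR_CAST() states the intent directly. Paraphrasing its definition in linux/err.h:

    /* reinterpret an ERR_PTR of one pointee type as another */
    static inline void *ERR_CAST(const void *ptr)
    {
        return (void *) ptr;
    }

    /* so:  return ERR_PTR(PTR_ERR(nnode));  becomes  return ERR_CAST(nnode); */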
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 13cb7a4237bf..d12535b7fc78 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
 		shft -= UBIFS_LPT_FANOUT_SHIFT;
 		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
-			return ERR_PTR(PTR_ERR(nnode));
+			return ERR_CAST(nnode);
 	}
 	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
 	return ubifs_get_pnode(c, nnode, iip);
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 109c6ea03bb5..daae9e1f5382 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -24,7 +24,7 @@
  * This file implements functions needed to recover from unclean un-mounts.
  * When UBIFS is mounted, it checks a flag on the master node to determine if
  * an un-mount was completed successfully. If not, the process of mounting
- * incorparates additional checking and fixing of on-flash data structures.
+ * incorporates additional checking and fixing of on-flash data structures.
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
  * read-only, and the flash is not modified in that case.
@@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 	}
 	err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
 	if (err) {
-		if (err == -ENOSPC)
-			dbg_err("could not find a dirty LEB");
+		/*
+		 * There are no dirty or empty LEBs subject to here being
+		 * enough for the index. Try to use
+		 * 'ubifs_find_free_leb_for_idx()', which will return any empty
+		 * LEBs (ignoring index requirements). If the index then
+		 * doesn't have enough LEBs the recovery commit will fail -
+		 * which is the same result anyway i.e. recovery fails. So
+		 * there is no problem ignoring index requirements and just
+		 * grabbing a free LEB since we have already established there
+		 * is not a dirty LEB we could have used instead.
+		 */
+		if (err == -ENOSPC) {
+			dbg_rcvry("could not find a dirty LEB");
+			goto find_free;
+		}
 		return err;
 	}
 	ubifs_assert(!(lp.flags & LPROPS_INDEX));
@@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
 find_free:
 	/*
 	 * There is no GC head LEB or the free space in the GC head LEB is too
-	 * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
-	 * GC is not run.
+	 * small, or there are not dirty LEBs. Allocate gc_lnum by calling
+	 * 'ubifs_find_free_leb_for_idx()' so GC is not run.
 	 */
 	lnum = ubifs_find_free_leb_for_idx(c);
 	if (lnum < 0) {
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
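The shrinker callback grows a struct shrinker * first parameter in this release. A shrinker embedded in a larger object can then recover its context with container_of(); a hedged sketch (my_cache and my_shrink are illustrative names, not from the patch):

    struct my_cache {
        struct shrinker shrinker;    /* registered via register_shrinker() */
        /* ... cache state ... */
    };

    static int my_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
    {
        struct my_cache *c = container_of(shrink, struct my_cache, shrinker);

        if (nr_to_scan == 0)
            return 0;    /* would normally report the freeable object count */
        /* ... try to free up to nr_to_scan objects from c ... */
        return 0;
    }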
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4d2f2157dd3f..cd5900b85d38 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -327,7 +327,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return err;
 }
 
-static void ubifs_delete_inode(struct inode *inode)
+static void ubifs_evict_inode(struct inode *inode)
 {
 	int err;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -343,9 +343,12 @@ static void ubifs_delete_inode(struct inode *inode)
 
 	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
-	ubifs_assert(inode->i_nlink == 0);
 
 	truncate_inode_pages(&inode->i_data, 0);
+
+	if (inode->i_nlink)
+		goto done;
+
 	if (is_bad_inode(inode))
 		goto out;
 
@@ -367,7 +370,8 @@ out:
 		c->nospace = c->nospace_rp = 0;
 		smp_wmb();
 	}
-	clear_inode(inode);
+done:
+	end_writeback(inode);
 }
 
 static void ubifs_dirty_inode(struct inode *inode)
@@ -1307,6 +1311,8 @@ static int mount_ubifs(struct ubifs_info *c)
 		if (err)
 			goto out_orphans;
 		err = ubifs_rcvry_gc_commit(c);
+		if (err)
+			goto out_orphans;
 	} else {
 		err = take_gc_lnum(c);
 		if (err)
@@ -1318,7 +1324,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		 */
 		err = ubifs_leb_unmap(c, c->gc_lnum);
 		if (err)
-			return err;
+			goto out_orphans;
 	}
 
 	err = dbg_check_lprops(c);
@@ -1824,7 +1830,7 @@ const struct super_operations ubifs_super_operations = {
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
 	.write_inode   = ubifs_write_inode,
-	.delete_inode  = ubifs_delete_inode,
+	.evict_inode   = ubifs_evict_inode,
 	.statfs        = ubifs_statfs,
 	.dirty_inode   = ubifs_dirty_inode,
 	.remount_fs    = ubifs_remount_fs,
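->delete_inode and ->clear_inode merge into a single ->evict_inode, called for every inode that leaves memory, linked or not; the nlink test moves inside, and end_writeback() takes over clear_inode()'s role. The skeleton, as the hunks above instantiate it (example_evict_inode is a placeholder name):

    static void example_evict_inode(struct inode *inode)
    {
        truncate_inode_pages(&inode->i_data, 0);
        if (inode->i_nlink == 0 && !is_bad_inode(inode)) {
            /* old ->delete_inode work: truncate and free on-disk data */
        }
        end_writeback(inode);    /* replaces clear_inode() */
        /* old ->clear_inode work: release in-memory state */
    }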
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..0c9876b396dd 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
  * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
  * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
  * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
  * with 'ubifs_writepage()' (see file.c). All the other inode fields are
  * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
  * could consider to rework locking and base it on "shadow" fields.
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
 int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 
 /* file.c */
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_fsync(struct file *file, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
 
 #include "udfdecl.h"
 
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 			udf_debug("byte=%2x\n",
 				  ((char *)bh->b_data)[(bit + i) >> 3]);
 		} else {
-			if (inode)
-				dquot_free_block(inode, 1);
 			udf_add_free_space(sb, sbi->s_partition, 1);
 		}
 	}
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 	bit = block % (sb->s_blocksize << 3);
 
 	while (bit < (sb->s_blocksize << 3) && block_count > 0) {
-		if (!udf_test_bit(bit, bh->b_data))
+		if (!udf_clear_bit(bit, bh->b_data))
 			goto out;
-		else if (dquot_prealloc_block(inode, 1))
-			goto out;
-		else if (!udf_clear_bit(bit, bh->b_data)) {
-			udf_debug("bit already cleared for block %d\n", bit);
-			dquot_free_block(inode, 1);
-			goto out;
-		}
 		block_count--;
 		alloc_count++;
 		bit++;
@@ -338,20 +328,6 @@ search_back:
 	}
 
 got_block:
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (inode) {
-		int ret = dquot_alloc_block(inode, 1);
-
-		if (ret) {
-			mutex_unlock(&sbi->s_alloc_mutex);
-			*err = ret;
-			return 0;
-		}
-	}
-
 	newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
 		(sizeof(struct spaceBitmapDesc) << 3);
 
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 	}
 
 	iinfo = UDF_I(table);
-	/* We do this up front - There are some error conditions that
-	   could occure, but.. oh well */
-	if (inode)
-		dquot_free_block(inode, count);
 	udf_add_free_space(sb, sbi->s_partition, count);
 
 	start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 	epos.offset -= adsize;
 
 	alloc_count = (elen >> sb->s_blocksize_bits);
-	if (inode && dquot_prealloc_block(inode,
-		alloc_count > block_count ? block_count : alloc_count))
-		alloc_count = 0;
-	else if (alloc_count > block_count) {
+	if (alloc_count > block_count) {
 		alloc_count = block_count;
 		eloc.logicalBlockNum += alloc_count;
 		elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
 	newblock = goal_eloc.logicalBlockNum;
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
-	if (inode) {
-		*err = dquot_alloc_block(inode, 1);
-		if (*err) {
-			brelse(goal_epos.bh);
-			mutex_unlock(&sbi->s_alloc_mutex);
-			return 0;
-		}
-	}
 
 	if (goal_elen)
 		udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
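The directory file_operations gain an explicit .llseek and trade the BKL-mediated .ioctl for .unlocked_ioctl, whose handler must do its own locking (udf_ioctl() takes lock_kernel() itself; see the fs/udf/file.c hunks below). The prototype shift, for reference:

    /* old: invoked under the Big Kernel Lock */
    int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long);

    /* new: no implicit locking, richer return type */
    long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);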
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46ccbf46..66b9e7e7e4c5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
 
@@ -144,50 +143,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
-int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-	      unsigned long arg)
+long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	long old_block, new_block;
 	int result = -EINVAL;
 
+	lock_kernel();
+
 	if (file_permission(filp, MAY_READ) != 0) {
-		udf_debug("no permission to access inode %lu\n",
-			  inode->i_ino);
-		return -EPERM;
+		udf_debug("no permission to access inode %lu\n", inode->i_ino);
+		result = -EPERM;
+		goto out;
 	}
 
 	if (!arg) {
 		udf_debug("invalid argument to udf_ioctl\n");
-		return -EINVAL;
+		result = -EINVAL;
+		goto out;
 	}
 
 	switch (cmd) {
 	case UDF_GETVOLIDENT:
 		if (copy_to_user((char __user *)arg,
 				 UDF_SB(inode->i_sb)->s_volume_ident, 32))
-			return -EFAULT;
+			result = -EFAULT;
 		else
-			return 0;
+			result = 0;
+		goto out;
 	case UDF_RELOCATE_BLOCKS:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		if (get_user(old_block, (long __user *)arg))
-			return -EFAULT;
+		if (!capable(CAP_SYS_ADMIN)) {
+			result = -EACCES;
+			goto out;
+		}
+		if (get_user(old_block, (long __user *)arg)) {
+			result = -EFAULT;
+			goto out;
+		}
 		result = udf_relocate_blocks(inode->i_sb,
 						old_block, &new_block);
 		if (result == 0)
 			result = put_user(new_block, (long __user *)arg);
-		return result;
+		goto out;
 	case UDF_GETEASIZE:
 		result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
-		break;
+		goto out;
 	case UDF_GETEABLOCK:
 		result = copy_to_user((char __user *)arg,
 				      UDF_I(inode)->i_ext.i_data,
 				      UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
-		break;
+		goto out;
 	}
 
+out:
+	unlock_kernel();
 	return result;
 }
 
@@ -207,40 +216,39 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 const struct file_operations udf_file_operations = {
 	.read			= do_sync_read,
 	.aio_read		= generic_file_aio_read,
-	.ioctl			= udf_ioctl,
-	.open			= dquot_file_open,
+	.unlocked_ioctl		= udf_ioctl,
+	.open			= generic_file_open,
 	.mmap			= generic_file_mmap,
 	.write			= do_sync_write,
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
-	.fsync			= simple_fsync,
+	.fsync			= generic_file_fsync,
 	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
 
-int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	error = inode_change_ok(inode, iattr);
+	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
-		dquot_initialize(inode);
-
-	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
-	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = dquot_transfer(inode, iattr);
+	if ((attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size != i_size_read(inode)) {
+		error = vmtruncate(inode, attr->ia_size);
 		if (error)
 			return error;
 	}
 
-	return inode_setattr(inode, iattr);
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+	return 0;
 }
 
 const struct inode_operations udf_file_inode_operations = {
-	.truncate = udf_truncate,
 	.setattr = udf_setattr,
+	.truncate = udf_truncate,
 };
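With inode_setattr() on the way out, a ->setattr implementation now does the size change itself and applies the remaining attributes with setattr_copy() plus mark_inode_dirty(). The pattern, lifted from the udf_setattr() above (example_setattr is a placeholder name):

    static int example_setattr(struct dentry *dentry, struct iattr *attr)
    {
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);    /* permission/validity checks */
        if (error)
            return error;

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
            error = vmtruncate(inode, attr->ia_size);
            if (error)
                return error;
        }

        setattr_copy(inode, attr);    /* uid/gid/mode/timestamps into inode */
        mark_inode_dirty(inode);
        return 0;
    }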
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..75d9304d0dc3 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -32,15 +31,6 @@ void udf_free_inode(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 
-	/*
-	 * Note: we must free any quota before locking the superblock,
-	 * as writing the quota to disk may need the lock as well.
-	 */
-	dquot_free_inode(inode);
-	dquot_drop(inode);
-
-	clear_inode(inode);
-
 	mutex_lock(&sbi->s_alloc_mutex);
 	if (sbi->s_lvid_bh) {
 		struct logicalVolIntegrityDescImpUse *lvidiu =
@@ -61,7 +51,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	struct super_block *sb = dir->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct inode *inode;
-	int block, ret;
+	int block;
 	uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
@@ -124,15 +114,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		udf_updated_lvid(sb);
 	}
 	mutex_unlock(&sbi->s_alloc_mutex);
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current_fsgid();
-	}
+
+	inode_init_owner(inode, dir, mode);
 
 	iinfo->i_location.logicalBlockNum = block;
 	iinfo->i_location.partitionReferenceNum =
@@ -153,17 +136,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	dquot_initialize(inode);
-	ret = dquot_alloc_inode(inode);
-	if (ret) {
-		dquot_drop(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode->i_nlink = 0;
-		iput(inode);
-		*err = ret;
-		return NULL;
-	}
-
 	*err = 0;
 	return inode;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..fc48f37aa2dd 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
 
@@ -69,40 +68,23 @@ static void udf_update_extents(struct inode *,
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 
-void udf_delete_inode(struct inode *inode)
+void udf_evict_inode(struct inode *inode)
 {
-	if (!is_bad_inode(inode))
-		dquot_initialize(inode);
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int want_delete = 0;
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	if (is_bad_inode(inode))
-		goto no_delete;
-
-	inode->i_size = 0;
-	udf_truncate(inode);
-	lock_kernel();
-
-	udf_update_inode(inode, IS_SYNC(inode));
-	udf_free_inode(inode);
-
-	unlock_kernel();
-	return;
-
-no_delete:
-	clear_inode(inode);
-}
-
-/*
- * If we are going to release inode from memory, we truncate last inode extent
- * to proper length. We could use drop_inode() but it's called under inode_lock
- * and thus we cannot mark inode dirty there. We use clear_inode() but we have
- * to make sure to write inode as it's not written automatically.
- */
-void udf_clear_inode(struct inode *inode)
-{
-	struct udf_inode_info *iinfo = UDF_I(inode);
-
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		want_delete = 1;
+		inode->i_size = 0;
+		udf_truncate(inode);
+		lock_kernel();
+		udf_update_inode(inode, IS_SYNC(inode));
+		unlock_kernel();
+	}
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
 	    inode->i_size != iinfo->i_lenExtents) {
 		printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has "
@@ -112,10 +94,13 @@ void udf_clear_inode(struct inode *inode)
 		       (unsigned long long)inode->i_size,
 		       (unsigned long long)iinfo->i_lenExtents);
 	}
-
-	dquot_drop(inode);
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
+	if (want_delete) {
+		lock_kernel();
+		udf_free_inode(inode);
+		unlock_kernel();
+	}
 }
 
 static int udf_writepage(struct page *page, struct writeback_control *wbc)
@@ -132,9 +117,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				udf_get_block);
+	int ret;
+
+	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+	if (unlikely(ret)) {
+		loff_t isize = mapping->host->i_size;
+		if (pos + len > isize)
+			vmtruncate(mapping->host, isize);
+	}
+
+	return ret;
 }
 
 static sector_t udf_bmap(struct address_space *mapping, sector_t block)
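block_write_begin() drops its file and fsdata parameters (and no longer wants *pagep pre-cleared); on failure the caller is now responsible for trimming blocks instantiated beyond i_size, which udf does via vmtruncate(). The idiom, as used in the hunk (example_get_block stands in for the filesystem's get_block callback):

    ret = block_write_begin(mapping, pos, len, flags, pagep, example_get_block);
    if (unlikely(ret)) {
        loff_t isize = mapping->host->i_size;
        if (pos + len > isize)
            vmtruncate(mapping->host, isize);    /* drop stray blocks */
    }
    return ret;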
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/quotaops.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
 	int err;
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	inode = udf_new_inode(dir, mode, &err);
 	if (!inode) {
@@ -579,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
 	inode->i_data.a_ops = &udf_aops;
 	inode->i_op = &udf_file_inode_operations;
 	inode->i_fop = &udf_file_operations;
-	inode->i_mode = mode;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -618,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	err = -EIO;
 	inode = udf_new_inode(dir, mode, &err);
@@ -627,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		goto out;
 
 	iinfo = UDF_I(inode);
-	inode->i_uid = current_fsuid();
 	init_special_inode(inode, mode, rdev);
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi) {
@@ -666,15 +659,13 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	err = -EMLINK;
 	if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
 		goto out;
 
 	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR, &err);
+	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
 	if (!inode)
 		goto out;
 
@@ -697,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 			FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
 	udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
 	brelse(fibh.sbh);
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -805,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc *fi, cfi;
 	struct kernel_lb_addr tloc;
 
-	dquot_initialize(dir);
-
 	retval = -ENOENT;
 	lock_kernel();
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -853,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 	struct fileIdentDesc cfi;
 	struct kernel_lb_addr tloc;
 
-	dquot_initialize(dir);
-
 	retval = -ENOENT;
 	lock_kernel();
 	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -909,10 +893,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct buffer_head *bh;
 	struct udf_inode_info *iinfo;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
-	inode = udf_new_inode(dir, S_IFLNK, &err);
+	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
 	if (!inode)
 		goto out;
 
@@ -923,7 +905,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	iinfo = UDF_I(inode);
-	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
 	inode->i_op = &udf_symlink_inode_operations;
 
@@ -1081,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 	struct buffer_head *bh;
 
-	dquot_initialize(dir);
-
 	lock_kernel();
 	if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
 		unlock_kernel();
@@ -1145,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct kernel_lb_addr tloc;
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
-	dquot_initialize(old_dir);
-	dquot_initialize(new_dir);
-
 	lock_kernel();
 	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
@@ -1393,7 +1369,6 @@ const struct export_operations udf_export_ops = {
 const struct inode_operations udf_dir_inode_operations = {
 	.lookup				= udf_lookup,
 	.create				= udf_create,
-	.setattr			= udf_setattr,
 	.link				= udf_link,
 	.unlink				= udf_unlink,
 	.symlink			= udf_symlink,
@@ -1406,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-	.setattr	= udf_setattr,
 };
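A knock-on effect of moving ownership setup into udf_new_inode() (via inode_init_owner(), per the fs/udf/ialloc.c hunk): callers must now hand over the complete mode at creation instead of patching inode->i_mode afterwards, e.g.:

    inode = udf_new_inode(dir, S_IFDIR | mode, &err);       /* was: S_IFDIR, then fixups */
    inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);  /* was: S_IFLNK */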
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..65412d84a45d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,8 +175,7 @@ static const struct super_operations udf_sb_ops = {
 	.alloc_inode	= udf_alloc_inode,
 	.destroy_inode	= udf_destroy_inode,
 	.write_inode	= udf_write_inode,
-	.delete_inode	= udf_delete_inode,
-	.clear_inode	= udf_clear_inode,
+	.evict_inode	= udf_evict_inode,
 	.put_super	= udf_put_super,
 	.sync_fs	= udf_sync_fs,
 	.statfs		= udf_statfs,
@@ -557,6 +556,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
 	struct udf_options uopt;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	int error = 0;
 
 	uopt.flags = sbi->s_flags;
 	uopt.uid   = sbi->s_uid;
@@ -582,17 +582,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 		*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
-		unlock_kernel();
-		return 0;
-	}
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out_unlock;
+
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+out_unlock:
 	unlock_kernel();
-	return 0;
+	return error;
 }
597 597
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1578,9 +1578,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
 {
 	struct anchorVolDescPtr *anchor;
 	long main_s, main_e, reserve_s, reserve_e;
-	struct udf_sb_info *sbi;
 
-	sbi = UDF_SB(sb);
 	anchor = (struct anchorVolDescPtr *)bh->b_data;
 
 	/* Locate the main sequence */
@@ -1939,7 +1937,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* Fill in the rest of the superblock */
 	sb->s_op = &udf_sb_ops;
 	sb->s_export_op = &udf_export_ops;
-	sb->dq_op = NULL;
+
 	sb->s_dirt = 0;
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 702a1148e702..6995ab1f4305 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,9 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 			uint8_t *, uint8_t *);
 
 /* file.c */
-extern int udf_ioctl(struct inode *, struct file *, unsigned int,
-		     unsigned long);
-extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
+extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
@@ -141,8 +139,7 @@ extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern void udf_truncate(struct inode *);
 extern void udf_read_inode(struct inode *);
-extern void udf_delete_inode(struct inode *);
-extern void udf_clear_inode(struct inode *);
+extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..46f7a807bbc1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -118,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	unlock_super (sb);
@@ -195,7 +189,6 @@ do_more:
 	ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, 1);
-	dquot_free_block(inode, uspi->s_fpb);
 
 	fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
 	uspi->cs_total.cs_nbfree++;
@@ -212,10 +205,8 @@ do_more:
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	if (overflow) {
 		fragment += count;
@@ -511,7 +502,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
-	int ret;
 
 	UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
 	     (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +547,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-	ret = dquot_alloc_block(inode, count);
-	if (ret) {
-		*err = ret;
-		return 0;
-	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -569,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -598,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, allocsize;
 	u64 result;
-	int ret;
 
 	UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
 	     inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +649,6 @@ cg_found:
 	for (i = count; i < uspi->s_fpb; i++)
 		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 	i = uspi->s_fpb - count;
-	dquot_free_block(inode, i);
 
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
 	uspi->cs_total.cs_nffree += i;
@@ -679,11 +660,6 @@ cg_found:
 	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
 	if (result == INVBLOCK)
 		return 0;
-	ret = dquot_alloc_block(inode, count);
-	if (ret) {
-		*err = ret;
-		return 0;
-	}
 	for (i = 0; i < count; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 
@@ -698,10 +674,8 @@ cg_found:
 succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
@@ -718,7 +692,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 	struct ufs_super_block_first * usb1;
 	struct ufs_cylinder_group * ucg;
 	u64 result, blkno;
-	int ret;
 
 	UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
 
@@ -752,11 +725,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 725 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 726 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 727 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 728
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 729 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 730 uspi->cs_total.cs_nbfree--;
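
The balloc.c hunks above also drop the dquot_alloc_block()/dquot_free_block() calls: this merge removes quota support from UFS entirely, so block allocations are no longer charged to the owner. For reference, a minimal sketch of the pattern being deleted, assuming only the generic quota API from <linux/quotaops.h>; the function name and the bitmap step are placeholders, not real UFS code.

#include <linux/fs.h>
#include <linux/quotaops.h>

/* Pre-merge pattern: charge the allocation before committing bitmap
 * changes, and report failure via the UFS convention of returning 0
 * with *err set. */
static u64 example_alloc_frags(struct inode *inode, unsigned count, int *err)
{
	int ret = dquot_alloc_block(inode, count);	/* may return -EDQUOT */

	if (ret) {
		*err = ret;
		return 0;			/* 0 == nothing allocated */
	}
	/* ... clear 'count' bits in the cylinder-group free-fragment map ... */
	return 1;
}

With the quota charge gone, the only failure left on this path is bitmap exhaustion, which is why the local 'ret' variables disappear as well.
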
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..dbc90994715a 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -95,8 +95,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
95 int err; 95 int err;
96 96
97 lock_page(page); 97 lock_page(page);
98 err = __ufs_write_begin(NULL, page->mapping, pos, len, 98 err = ufs_prepare_chunk(page, pos, len);
99 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
100 BUG_ON(err); 99 BUG_ON(err);
101 100
102 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); 101 de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
@@ -381,8 +380,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
381got_it: 380got_it:
382 pos = page_offset(page) + 381 pos = page_offset(page) +
383 (char*)de - (char*)page_address(page); 382 (char*)de - (char*)page_address(page);
384 err = __ufs_write_begin(NULL, page->mapping, pos, rec_len, 383 err = ufs_prepare_chunk(page, pos, rec_len);
385 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
386 if (err) 384 if (err)
387 goto out_unlock; 385 goto out_unlock;
388 if (de->d_ino) { 386 if (de->d_ino) {
@@ -518,7 +516,6 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
518 struct page * page) 516 struct page * page)
519{ 517{
520 struct super_block *sb = inode->i_sb; 518 struct super_block *sb = inode->i_sb;
521 struct address_space *mapping = page->mapping;
522 char *kaddr = page_address(page); 519 char *kaddr = page_address(page);
523 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 520 unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
524 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); 521 unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
@@ -549,8 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
549 546
550 pos = page_offset(page) + from; 547 pos = page_offset(page) + from;
551 lock_page(page); 548 lock_page(page);
552 err = __ufs_write_begin(NULL, mapping, pos, to - from, 549 err = ufs_prepare_chunk(page, pos, to - from);
553 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
554 BUG_ON(err); 550 BUG_ON(err);
555 if (pde) 551 if (pde)
556 pde->d_reclen = cpu_to_fs16(sb, to - from); 552 pde->d_reclen = cpu_to_fs16(sb, to - from);
@@ -577,8 +573,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
577 if (!page) 573 if (!page)
578 return -ENOMEM; 574 return -ENOMEM;
579 575
580 err = __ufs_write_begin(NULL, mapping, 0, chunk_size, 576 err = ufs_prepare_chunk(page, 0, chunk_size);
581 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
582 if (err) { 577 if (err) {
583 unlock_page(page); 578 unlock_page(page);
584 goto fail; 579 goto fail;
@@ -666,6 +661,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 661const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 662 .read = generic_read_dir,
668 .readdir = ufs_readdir, 663 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 664 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 665 .llseek = generic_file_llseek,
671}; 666};
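
All four directory-update sites above switch from the write_begin-style __ufs_write_begin(), which needed a mapping, AOP flags and a page out-parameter, to ufs_prepare_chunk(), which takes just the already-locked page. A sketch of the resulting caller shape, simplified from ufs_set_link(); ufs_commit_chunk() is the existing commit-side helper in fs/ufs/dir.c, and its use here is illustrative.

/* Assumes the entry 'de' was just found on 'page'; the real function
 * also updates the entry type and the directory's timestamps. */
static void example_set_link(struct inode *dir, struct ufs_dir_entry *de,
			     struct page *page, struct inode *inode)
{
	loff_t pos = page_offset(page) +
			(char *)de - (char *)page_address(page);
	unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
	int err;

	lock_page(page);
	err = ufs_prepare_chunk(page, pos, len);  /* maps buffers, no AOP flags */
	BUG_ON(err);				  /* the page was valid a moment ago */

	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);

	ufs_commit_chunk(page, pos, len);	  /* write back and release */
}
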
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..2eabf04af3de 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,11 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode);
102
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 97 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
104 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); 98 ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
105 else { 99 else {
@@ -119,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
119 113
120 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 114 ubh_mark_buffer_dirty (USPI_UBH(uspi));
121 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 115 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
122 if (sb->s_flags & MS_SYNCHRONOUS) { 116 if (sb->s_flags & MS_SYNCHRONOUS)
123 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 117 ubh_sync_block(UCPI_UBH(ucpi));
124 ubh_wait_on_buffer (UCPI_UBH(ucpi));
125 }
126 118
127 sb->s_dirt = 1; 119 sb->s_dirt = 1;
128 unlock_super (sb); 120 unlock_super (sb);
@@ -162,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
162 154
163 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); 155 fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
164 ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); 156 ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
165 if (sb->s_flags & MS_SYNCHRONOUS) { 157 if (sb->s_flags & MS_SYNCHRONOUS)
166 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 158 ubh_sync_block(UCPI_UBH(ucpi));
167 ubh_wait_on_buffer(UCPI_UBH(ucpi));
168 }
169 159
170 UFSD("EXIT\n"); 160 UFSD("EXIT\n");
171} 161}
@@ -296,22 +286,12 @@ cg_found:
296 } 286 }
297 ubh_mark_buffer_dirty (USPI_UBH(uspi)); 287 ubh_mark_buffer_dirty (USPI_UBH(uspi));
298 ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); 288 ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
299 if (sb->s_flags & MS_SYNCHRONOUS) { 289 if (sb->s_flags & MS_SYNCHRONOUS)
300 ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); 290 ubh_sync_block(UCPI_UBH(ucpi));
301 ubh_wait_on_buffer (UCPI_UBH(ucpi));
302 }
303 sb->s_dirt = 1; 291 sb->s_dirt = 1;
304 292
305 inode->i_ino = cg * uspi->s_ipg + bit; 293 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 294 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 295 inode->i_blocks = 0;
316 inode->i_generation = 0; 296 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 297 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,21 +335,12 @@ cg_found:
355 335
356 unlock_super (sb); 336 unlock_super (sb);
357 337
358 dquot_initialize(inode);
359 err = dquot_alloc_inode(inode);
360 if (err) {
361 dquot_drop(inode);
362 goto fail_without_unlock;
363 }
364
365 UFSD("allocating inode %lu\n", inode->i_ino); 338 UFSD("allocating inode %lu\n", inode->i_ino);
366 UFSD("EXIT\n"); 339 UFSD("EXIT\n");
367 return inode; 340 return inode;
368 341
369fail_remove_inode: 342fail_remove_inode:
370 unlock_super(sb); 343 unlock_super(sb);
371fail_without_unlock:
372 inode->i_flags |= S_NOQUOTA;
373 inode->i_nlink = 0; 344 inode->i_nlink = 0;
374 iput(inode); 345 iput(inode);
375 UFSD("EXIT (FAILED): err %d\n", err); 346 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..2b251f2093af 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -559,20 +558,26 @@ static int ufs_readpage(struct file *file, struct page *page)
559 return block_read_full_page(page,ufs_getfrag_block); 558 return block_read_full_page(page,ufs_getfrag_block);
560} 559}
561 560
562int __ufs_write_begin(struct file *file, struct address_space *mapping, 561int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
563 loff_t pos, unsigned len, unsigned flags,
564 struct page **pagep, void **fsdata)
565{ 562{
566 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 563 return __block_write_begin(page, pos, len, ufs_getfrag_block);
567 ufs_getfrag_block);
568} 564}
569 565
570static int ufs_write_begin(struct file *file, struct address_space *mapping, 566static int ufs_write_begin(struct file *file, struct address_space *mapping,
571 loff_t pos, unsigned len, unsigned flags, 567 loff_t pos, unsigned len, unsigned flags,
572 struct page **pagep, void **fsdata) 568 struct page **pagep, void **fsdata)
573{ 569{
574 *pagep = NULL; 570 int ret;
575 return __ufs_write_begin(file, mapping, pos, len, flags, pagep, fsdata); 571
572 ret = block_write_begin(mapping, pos, len, flags, pagep,
573 ufs_getfrag_block);
574 if (unlikely(ret)) {
575 loff_t isize = mapping->host->i_size;
576 if (pos + len > isize)
577 vmtruncate(mapping->host, isize);
578 }
579
580 return ret;
576} 581}
577 582
578static sector_t ufs_bmap(struct address_space *mapping, sector_t block) 583static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
@@ -603,7 +608,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 608 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 609 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 610 else {
606 inode->i_op = &page_symlink_inode_operations; 611 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 612 inode->i_mapping->a_ops = &ufs_aops;
608 } 613 }
609 } else 614 } else
@@ -906,27 +911,33 @@ int ufs_sync_inode (struct inode *inode)
906 return ufs_update_inode (inode, 1); 911 return ufs_update_inode (inode, 1);
907} 912}
908 913
909void ufs_delete_inode (struct inode * inode) 914void ufs_evict_inode(struct inode * inode)
910{ 915{
911 loff_t old_i_size; 916 int want_delete = 0;
912 917
913 if (!is_bad_inode(inode)) 918 if (!inode->i_nlink && !is_bad_inode(inode))
914 dquot_initialize(inode); 919 want_delete = 1;
915 920
916 truncate_inode_pages(&inode->i_data, 0); 921 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 922 if (want_delete) {
918 goto no_delete; 923 loff_t old_i_size;
919 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
920 lock_kernel(); 925 lock_kernel();
921 mark_inode_dirty(inode); 926 mark_inode_dirty(inode);
922 ufs_update_inode(inode, IS_SYNC(inode)); 927 ufs_update_inode(inode, IS_SYNC(inode));
923 old_i_size = inode->i_size; 928 old_i_size = inode->i_size;
924 inode->i_size = 0; 929 inode->i_size = 0;
925 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 930 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
926 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
927 ufs_free_inode (inode); 932 unlock_kernel();
928 unlock_kernel(); 933 }
929 return; 934
930no_delete: 935 invalidate_inode_buffers(inode);
931 clear_inode(inode); /* We must guarantee clearing of inode... */ 936 end_writeback(inode);
937
938 if (want_delete) {
939 lock_kernel();
940 ufs_free_inode (inode);
941 unlock_kernel();
942 }
932} 943}
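
The inode.c conversion above follows the 2.6.36 VFS change that folds ->delete_inode() and ->clear_inode() into a single ->evict_inode(): the method is now called for every inode leaving the cache, must distinguish final deletion (i_nlink == 0) from plain eviction, and must call end_writeback() itself. (The ufs_write_begin() hunk earlier in this file adopts the matching 2.6.36 convention of trimming blocks instantiated past i_size with vmtruncate() when block_write_begin() fails.) A minimal skeleton of the evict pattern, with the UFS-specific work reduced to comments:

/* Sketch only; truncate_inode_pages(), invalidate_inode_buffers() and
 * end_writeback() are the mandatory VFS calls in this release. */
static void example_evict_inode(struct inode *inode)
{
	int want_delete = !inode->i_nlink && !is_bad_inode(inode);

	truncate_inode_pages(&inode->i_data, 0);
	if (want_delete) {
		/* ... write final metadata, truncate on-disk blocks ... */
	}

	invalidate_inode_buffers(inode);
	end_writeback(inode);		/* detaches the inode from writeback */

	if (want_delete) {
		/* ... free the on-disk inode, now safely out of the caches ... */
	}
}

Note how ufs_free_inode() moves after end_writeback() in the hunk above: the on-disk inode may only be reused once the VFS is done with the in-core one.
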
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -148,7 +141,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 141
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 142 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 143 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 144 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 145 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 146 err = page_symlink(inode, symname, l);
154 if (err) 147 if (err)
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
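
namei.c loses the dquot_initialize() call at the top of every namespace operation, again a consequence of dropping UFS quota support rather than of a VFS API change. The deleted entry-point pattern, shown here purely for illustration (example_unlink is not real UFS code):

#include <linux/quotaops.h>

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	dquot_initialize(dir);	/* attach dquots before any charge; now gone */
	/* ... find and remove the directory entry ... */
	return 0;
}
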
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..d510c1b91817 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .evict_inode = ufs_evict_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and no one else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
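
Three separate things happen in super.c: UFS_MAGIC_BW joins both byte-order switches (reportedly the magic written by BorderWare systems), the dq_op/quota_read/quota_write plumbing is deleted along with ufs_clear_inode(), and the super_operations table moves to ->evict_inode. The magic probe itself is just a try-both-byte-orders loop; a stand-alone sketch using only the constants visible in this patch:

#include <stdint.h>

enum probe_result { PROBE_LE, PROBE_BE, PROBE_BAD };

/* Decode the 32-bit magic both ways and report which byte order, if
 * either, yields a known UFS magic. */
static enum probe_result probe_magic(const uint8_t b[4])
{
	uint32_t le = b[0] | b[1] << 8 | b[2] << 16 | (uint32_t)b[3] << 24;
	uint32_t be = (uint32_t)b[0] << 24 | b[1] << 16 | b[2] << 8 | b[3];
	static const uint32_t known[] = {
		0x00011954,	/* UFS_MAGIC */
		0x0f242697,	/* UFS_MAGIC_BW, new in this patch */
		0x19540119,	/* UFS2_MAGIC */
	};
	unsigned i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (le == known[i])
			return PROBE_LE;
		if (be == known[i])
			return PROBE_BE;
	}
	return PROBE_BAD;
}
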
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
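
UFS now carries two symlink operation tables: the existing "fast" one for targets stored inline in the inode, and a new page-based one that previously fell back to the generic page_symlink_inode_operations. The private copy exists so that both variants can route ->setattr through ufs_setattr() (exported from truncate.c below). For contrast with the page-based table, a sketch of the fast path's follow_link, modeled on ufs_follow_link() in this file; reading the target needs no page-cache I/O at all:

#include <linux/namei.h>

static void *example_fast_follow_link(struct dentry *dentry,
				      struct nameidata *nd)
{
	struct ufs_inode_info *p = UFS_I(dentry->d_inode);

	nd_set_link(nd, (char *)p->i_u1.i_symlink);	/* in-inode target */
	return NULL;					/* nothing to put_link */
}
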
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..a58f9155fc9a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -244,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
244 ubh_bforget(ind_ubh); 243 ubh_bforget(ind_ubh);
245 ind_ubh = NULL; 244 ind_ubh = NULL;
246 } 245 }
247 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { 246 if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
248 ubh_ll_rw_block(SWRITE, ind_ubh); 247 ubh_sync_block(ind_ubh);
249 ubh_wait_on_buffer (ind_ubh);
250 }
251 ubh_brelse (ind_ubh); 248 ubh_brelse (ind_ubh);
252 249
253 UFSD("EXIT: ino %lu\n", inode->i_ino); 250 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -308,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
308 ubh_bforget(dind_bh); 305 ubh_bforget(dind_bh);
309 dind_bh = NULL; 306 dind_bh = NULL;
310 } 307 }
311 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { 308 if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
312 ubh_ll_rw_block(SWRITE, dind_bh); 309 ubh_sync_block(dind_bh);
313 ubh_wait_on_buffer (dind_bh);
314 }
315 ubh_brelse (dind_bh); 310 ubh_brelse (dind_bh);
316 311
317 UFSD("EXIT: ino %lu\n", inode->i_ino); 312 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -368,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
368 ubh_bforget(tind_bh); 363 ubh_bforget(tind_bh);
369 tind_bh = NULL; 364 tind_bh = NULL;
370 } 365 }
371 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { 366 if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
372 ubh_ll_rw_block(SWRITE, tind_bh); 367 ubh_sync_block(tind_bh);
373 ubh_wait_on_buffer (tind_bh);
374 }
375 ubh_brelse (tind_bh); 368 ubh_brelse (tind_bh);
376 369
377 UFSD("EXIT: ino %lu\n", inode->i_ino); 370 UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -501,14 +494,7 @@ out:
501 return err; 494 return err;
502} 495}
503 496
504 497int ufs_setattr(struct dentry *dentry, struct iattr *attr)
505/*
506 * We don't define our `inode->i_op->truncate', and call it here,
507 * because of:
508 * - there is no way to know old size
509 * - there is no way inform user about error, if it happens in `truncate'
510 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 498{
513 struct inode *inode = dentry->d_inode; 499 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 500 unsigned int ia_valid = attr->ia_valid;
@@ -518,26 +504,20 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 504 if (error)
519 return error; 505 return error;
520 506
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 507 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr);
524 if (error)
525 return error;
526 }
527 if (ia_valid & ATTR_SIZE &&
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 508 loff_t old_i_size = inode->i_size;
530 509
531 dquot_initialize(inode); 510 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size);
532 512
533 error = vmtruncate(inode, attr->ia_size);
534 if (error)
535 return error;
536 error = ufs_truncate(inode, old_i_size); 513 error = ufs_truncate(inode, old_i_size);
537 if (error) 514 if (error)
538 return error; 515 return error;
539 } 516 }
540 return inode_setattr(inode, attr); 517
518 setattr_copy(inode, attr);
519 mark_inode_dirty(inode);
520 return 0;
541} 521}
542 522
543const struct inode_operations ufs_file_inode_operations = { 523const struct inode_operations ufs_file_inode_operations = {
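
The rewritten ufs_setattr() follows the canonical 2.6.36 ->setattr shape: inode_setattr() and the implicit truncate path are gone, so the filesystem applies a size change by hand with truncate_setsize() and then copies the remaining attributes with setattr_copy(). A generic skeleton of that shape, with the UFS block-freeing step reduced to a comment:

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);	/* permission/limit checks */
	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != inode->i_size) {
		truncate_setsize(inode, attr->ia_size);	/* i_size + page cache */
		/* ... free on-disk blocks beyond the new size ... */
	}

	setattr_copy(inode, attr);	/* uid/gid/mode/timestamps */
	mark_inode_dirty(inode);
	return 0;
}

The XXX note in the hunk records a known ordering wart: ideally the in-core size and page cache would only be updated once the on-disk truncate can no longer fail.
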
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..c08782e1b48a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -108,7 +108,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
108extern struct inode *ufs_iget(struct super_block *, unsigned long); 108extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 109extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 110extern int ufs_sync_inode (struct inode *);
111extern void ufs_delete_inode (struct inode *); 111extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); 112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 114
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4e..d2c36d53fe66 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
113 } 113 }
114} 114}
115 115
116void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) 116void ubh_sync_block(struct ufs_buffer_head *ubh)
117{ 117{
118 if (!ubh) 118 if (ubh) {
119 return; 119 unsigned i;
120 120
121 ll_rw_block(rw, ubh->count, ubh->bh); 121 for (i = 0; i < ubh->count; i++)
122} 122 write_dirty_buffer(ubh->bh[i], WRITE);
123 123
124void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) 124 for (i = 0; i < ubh->count; i++)
125{ 125 wait_on_buffer(ubh->bh[i]);
126 unsigned i; 126 }
127 if (!ubh)
128 return;
129 for ( i = 0; i < ubh->count; i++ )
130 wait_on_buffer (ubh->bh[i]);
131} 127}
132 128
133void ubh_bforget (struct ufs_buffer_head * ubh) 129void ubh_bforget (struct ufs_buffer_head * ubh)
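
ubh_sync_block() collapses the old ubh_ll_rw_block(SWRITE, ...) + ubh_wait_on_buffer() pair: 2.6.36 removed the SWRITE lock-stealing mode from ll_rw_block(), and write_dirty_buffer() is its replacement (it takes the buffer lock and submits the buffer only if it is still dirty). The generic form of the new helper, for a plain buffer_head array:

#include <linux/buffer_head.h>

static void example_sync_buffers(struct buffer_head **bhs, unsigned count)
{
	unsigned i;

	for (i = 0; i < count; i++)
		write_dirty_buffer(bhs[i], WRITE);	/* lock, clear dirty, submit */

	for (i = 0; i < count; i++)
		wait_on_buffer(bhs[i]);			/* then wait for all I/O */
}

Submitting everything before waiting keeps the I/O batched exactly as the two old calls did.
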
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 23ceed8c8fb9..9f8775ce381c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -257,9 +257,7 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
257 257
258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); 258extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); 259extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
260extern int __ufs_write_begin(struct file *file, struct address_space *mapping, 260extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
261 loff_t pos, unsigned len, unsigned flags,
262 struct page **pagep, void **fsdata);
263 261
264/* 262/*
265 * These functions manipulate ufs buffers 263 * These functions manipulate ufs buffers
@@ -271,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
271extern void ubh_brelse_uspi (struct ufs_sb_private_info *); 269extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
272extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); 270extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
273extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); 271extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
274extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); 272extern void ubh_sync_block(struct ufs_buffer_head *);
275extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
276extern void ubh_bforget (struct ufs_buffer_head *); 273extern void ubh_bforget (struct ufs_buffer_head *);
277extern int ubh_buffer_dirty (struct ufs_buffer_head *); 274extern int ubh_buffer_dirty (struct ufs_buffer_head *);
278#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) 275#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db5d373..179b58690657 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -126,7 +126,8 @@ out:
126 * must be owner or have write permission. 126 * must be owner or have write permission.
127 * Else, update from *times, must be owner or super user. 127 * Else, update from *times, must be owner or super user.
128 */ 128 */
129long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) 129long do_utimes(int dfd, const char __user *filename, struct timespec *times,
130 int flags)
130{ 131{
131 int error = -EINVAL; 132 int error = -EINVAL;
132 133
@@ -170,7 +171,7 @@ out:
170 return error; 171 return error;
171} 172}
172 173
173SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename, 174SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
174 struct timespec __user *, utimes, int, flags) 175 struct timespec __user *, utimes, int, flags)
175{ 176{
176 struct timespec tstimes[2]; 177 struct timespec tstimes[2];
@@ -188,7 +189,7 @@ SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
188 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); 189 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
189} 190}
190 191
191SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename, 192SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
192 struct timeval __user *, utimes) 193 struct timeval __user *, utimes)
193{ 194{
194 struct timeval times[2]; 195 struct timeval times[2];
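
The utimes.c changes are pure const-correctness: the syscall prototypes now declare filename as const char __user *, matching how the string is actually used. Nothing changes for user space; a small runnable illustration of the utimensat() entry point being constified, touching a hypothetical ./example.txt:

#define _POSIX_C_SOURCE 200809L
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
	struct timespec ts[2] = {
		{ .tv_nsec = UTIME_OMIT },	/* [0] atime: leave untouched */
		{ .tv_nsec = UTIME_NOW },	/* [1] mtime: set to now */
	};

	if (utimensat(AT_FDCWD, "example.txt", ts, 0) != 0) {
		perror("utimensat");
		return 1;
	}
	return 0;
}
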
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
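
The xattr.c hunks constify the handler-resolution path: xattr_resolve_name() and the four generic_*xattr() entry points now take const struct xattr_handler pointers, so per-filesystem handler tables can live in read-only data. The matching table shape on the filesystem side, with hypothetical example_get/example_set callbacks:

#include <linux/errno.h>
#include <linux/xattr.h>

static int example_get(struct dentry *dentry, const char *name,
		       void *buffer, size_t size, int handler_flags)
{
	return -EOPNOTSUPP;	/* stub */
}

static int example_set(struct dentry *dentry, const char *name,
		       const void *value, size_t size, int flags,
		       int handler_flags)
{
	return -EOPNOTSUPP;	/* stub */
}

static const struct xattr_handler example_user_handler = {
	.prefix	= XATTR_USER_PREFIX,	/* "user." */
	.get	= example_get,
	.set	= example_set,
};

/* NULL-terminated, fully const table, assignable to sb->s_xattr. */
const struct xattr_handler *example_xattr_handlers[] = {
	&example_user_handler,
	NULL,
};
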
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..0dce969d6cad 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
@@ -86,11 +87,9 @@ xfs-y += xfs_alloc.o \
86 xfs_trans_buf.o \ 87 xfs_trans_buf.o \
87 xfs_trans_extfree.o \ 88 xfs_trans_extfree.o \
88 xfs_trans_inode.o \ 89 xfs_trans_inode.o \
89 xfs_trans_item.o \
90 xfs_utils.o \ 90 xfs_utils.o \
91 xfs_vnodeops.o \ 91 xfs_vnodeops.o \
92 xfs_rw.o \ 92 xfs_rw.o
93 xfs_dmops.o
94 93
95xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o 94xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
96 95
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..b2771862fd3d 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask)
225 struct posix_acl *acl; 225 struct posix_acl *acl;
226 int error = -EAGAIN; 226 int error = -EAGAIN;
227 227
228 xfs_itrace_entry(ip); 228 trace_xfs_check_acl(ip);
229 229
230 /* 230 /*
231 * If there is no attribute fork no ACL exists on this inode and 231 * If there is no attribute fork no ACL exists on this inode and
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0f8b9968a803..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,19 +21,12 @@
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_trans.h" 24#include "xfs_trans.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 25#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 27#include "xfs_dinode.h"
34#include "xfs_inode.h" 28#include "xfs_inode.h"
35#include "xfs_alloc.h" 29#include "xfs_alloc.h"
36#include "xfs_btree.h"
37#include "xfs_error.h" 30#include "xfs_error.h"
38#include "xfs_rw.h" 31#include "xfs_rw.h"
39#include "xfs_iomap.h" 32#include "xfs_iomap.h"
@@ -45,6 +38,15 @@
45#include <linux/pagevec.h> 38#include <linux/pagevec.h>
46#include <linux/writeback.h> 39#include <linux/writeback.h>
47 40
41/*
42 * Types of I/O for bmap clustering and I/O completion tracking.
43 */
44enum {
45 IO_READ, /* mapping for a read */
46 IO_DELAY, /* mapping covers delalloc region */
47 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
48 IO_NEW /* just allocated */
49};
48 50
49/* 51/*
50 * Prime number of hash buckets since address is used as the key. 52 * Prime number of hash buckets since address is used as the key.
@@ -83,18 +85,15 @@ void
83xfs_count_page_state( 85xfs_count_page_state(
84 struct page *page, 86 struct page *page,
85 int *delalloc, 87 int *delalloc,
86 int *unmapped,
87 int *unwritten) 88 int *unwritten)
88{ 89{
89 struct buffer_head *bh, *head; 90 struct buffer_head *bh, *head;
90 91
91 *delalloc = *unmapped = *unwritten = 0; 92 *delalloc = *unwritten = 0;
92 93
93 bh = head = page_buffers(page); 94 bh = head = page_buffers(page);
94 do { 95 do {
95 if (buffer_uptodate(bh) && !buffer_mapped(bh)) 96 if (buffer_unwritten(bh))
96 (*unmapped) = 1;
97 else if (buffer_unwritten(bh))
98 (*unwritten) = 1; 97 (*unwritten) = 1;
99 else if (buffer_delay(bh)) 98 else if (buffer_delay(bh))
100 (*delalloc) = 1; 99 (*delalloc) = 1;
@@ -103,8 +102,9 @@ xfs_count_page_state(
103 102
104STATIC struct block_device * 103STATIC struct block_device *
105xfs_find_bdev_for_inode( 104xfs_find_bdev_for_inode(
106 struct xfs_inode *ip) 105 struct inode *inode)
107{ 106{
107 struct xfs_inode *ip = XFS_I(inode);
108 struct xfs_mount *mp = ip->i_mount; 108 struct xfs_mount *mp = ip->i_mount;
109 109
110 if (XFS_IS_REALTIME_INODE(ip)) 110 if (XFS_IS_REALTIME_INODE(ip))
@@ -183,7 +183,7 @@ xfs_setfilesize(
183 xfs_fsize_t isize; 183 xfs_fsize_t isize;
184 184
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IOMAP_READ); 186 ASSERT(ioend->io_type != IO_READ);
187 187
188 if (unlikely(ioend->io_error)) 188 if (unlikely(ioend->io_error))
189 return 0; 189 return 0;
@@ -202,23 +202,17 @@ xfs_setfilesize(
202} 202}
203 203
204/* 204/*
205 * Schedule IO completion handling on a xfsdatad if this was 205 * Schedule IO completion handling on the final put of an ioend.
206 * the final hold on this ioend. If we are asked to wait,
207 * flush the workqueue.
208 */ 206 */
209STATIC void 207STATIC void
210xfs_finish_ioend( 208xfs_finish_ioend(
211 xfs_ioend_t *ioend, 209 struct xfs_ioend *ioend)
212 int wait)
213{ 210{
214 if (atomic_dec_and_test(&ioend->io_remaining)) { 211 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq; 212 if (ioend->io_type == IO_UNWRITTEN)
216 213 queue_work(xfsconvertd_workqueue, &ioend->io_work);
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 214 else
218 xfsconvertd_workqueue : xfsdatad_workqueue; 215 queue_work(xfsdatad_workqueue, &ioend->io_work);
219 queue_work(wq, &ioend->io_work);
220 if (wait)
221 flush_workqueue(wq);
222 } 216 }
223} 217}
224 218
@@ -237,7 +231,7 @@ xfs_end_io(
237 * For unwritten extents we need to issue transactions to convert a 231 * For unwritten extents we need to issue transactions to convert a
238 * range to normal written extents after the data I/O has finished. 232 * range to normal written extents after the data I/O has finished.
239 */ 233 */
240 if (ioend->io_type == IOMAP_UNWRITTEN && 234 if (ioend->io_type == IO_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 235 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242 236
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 237 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -250,7 +244,7 @@ xfs_end_io(
250 * We might have to update the on-disk file size after extending 244 * We might have to update the on-disk file size after extending
251 * writes. 245 * writes.
252 */ 246 */
253 if (ioend->io_type != IOMAP_READ) { 247 if (ioend->io_type != IO_READ) {
254 error = xfs_setfilesize(ioend); 248 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN); 249 ASSERT(!error || error == EAGAIN);
256 } 250 }
@@ -262,11 +256,25 @@ xfs_end_io(
262 */ 256 */
263 if (error == EAGAIN) { 257 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining); 258 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0); 259 xfs_finish_ioend(ioend);
266 /* ensure we don't spin on blocked ioends */ 260 /* ensure we don't spin on blocked ioends */
267 delay(1); 261 delay(1);
268 } else 262 } else {
263 if (ioend->io_iocb)
264 aio_complete(ioend->io_iocb, ioend->io_result, 0);
269 xfs_destroy_ioend(ioend); 265 xfs_destroy_ioend(ioend);
266 }
267}
268
269/*
270 * Call IO completion handling in caller context on the final put of an ioend.
271 */
272STATIC void
273xfs_finish_ioend_sync(
274 struct xfs_ioend *ioend)
275{
276 if (atomic_dec_and_test(&ioend->io_remaining))
277 xfs_end_io(&ioend->io_work);
270} 278}
271 279
272/* 280/*
@@ -299,6 +307,8 @@ xfs_alloc_ioend(
299 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); 307 atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
300 ioend->io_offset = 0; 308 ioend->io_offset = 0;
301 ioend->io_size = 0; 309 ioend->io_size = 0;
310 ioend->io_iocb = NULL;
311 ioend->io_result = 0;
302 312
303 INIT_WORK(&ioend->io_work, xfs_end_io); 313 INIT_WORK(&ioend->io_work, xfs_end_io);
304 return ioend; 314 return ioend;
@@ -309,21 +319,25 @@ xfs_map_blocks(
309 struct inode *inode, 319 struct inode *inode,
310 loff_t offset, 320 loff_t offset,
311 ssize_t count, 321 ssize_t count,
312 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
313 int flags) 323 int flags)
314{ 324{
315 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
316 327
317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
318} 329}
319 330
320STATIC int 331STATIC int
321xfs_iomap_valid( 332xfs_imap_valid(
322 xfs_iomap_t *iomapp, 333 struct inode *inode,
323 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
324{ 336{
325 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
326 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
327} 341}
328 342
329/* 343/*
@@ -344,7 +358,7 @@ xfs_end_bio(
344 bio->bi_end_io = NULL; 358 bio->bi_end_io = NULL;
345 bio_put(bio); 359 bio_put(bio);
346 360
347 xfs_finish_ioend(ioend, 0); 361 xfs_finish_ioend(ioend);
348} 362}
349 363
350STATIC void 364STATIC void
@@ -486,7 +500,7 @@ xfs_submit_ioend(
486 } 500 }
487 if (bio) 501 if (bio)
488 xfs_submit_ioend_bio(wbc, ioend, bio); 502 xfs_submit_ioend_bio(wbc, ioend, bio);
489 xfs_finish_ioend(ioend, 0); 503 xfs_finish_ioend(ioend);
490 } while ((ioend = next) != NULL); 504 } while ((ioend = next) != NULL);
491} 505}
492 506
@@ -554,19 +568,23 @@ xfs_add_to_ioend(
554 568
555STATIC void 569STATIC void
556xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
557 struct buffer_head *bh, 572 struct buffer_head *bh,
558 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
559 xfs_off_t offset, 574 xfs_off_t offset)
560 uint block_bits)
561{ 575{
562 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
563 580
564 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
565 583
566 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
567 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
568 586
569 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
570 588
571 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
572 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
@@ -574,17 +592,17 @@ xfs_map_buffer(
574 592
575STATIC void 593STATIC void
576xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
577 struct buffer_head *bh, 596 struct buffer_head *bh,
578 loff_t offset, 597 struct xfs_bmbt_irec *imap,
579 int block_bits, 598 xfs_off_t offset)
580 xfs_iomap_t *iomapp)
581{ 599{
582 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
583 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
584 602
585 lock_buffer(bh); 603 lock_buffer(bh);
586 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
587 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
588 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
589 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
590 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -596,31 +614,30 @@ xfs_map_at_offset(
596STATIC unsigned int 614STATIC unsigned int
597xfs_probe_page( 615xfs_probe_page(
598 struct page *page, 616 struct page *page,
599 unsigned int pg_offset, 617 unsigned int pg_offset)
600 int mapped)
601{ 618{
619 struct buffer_head *bh, *head;
602 int ret = 0; 620 int ret = 0;
603 621
604 if (PageWriteback(page)) 622 if (PageWriteback(page))
605 return 0; 623 return 0;
624 if (!PageDirty(page))
625 return 0;
626 if (!page->mapping)
627 return 0;
628 if (!page_has_buffers(page))
629 return 0;
606 630
607 if (page->mapping && PageDirty(page)) { 631 bh = head = page_buffers(page);
608 if (page_has_buffers(page)) { 632 do {
609 struct buffer_head *bh, *head; 633 if (!buffer_uptodate(bh))
610 634 break;
611 bh = head = page_buffers(page); 635 if (!buffer_mapped(bh))
612 do { 636 break;
613 if (!buffer_uptodate(bh)) 637 ret += bh->b_size;
614 break; 638 if (ret >= pg_offset)
615 if (mapped != buffer_mapped(bh)) 639 break;
616 break; 640 } while ((bh = bh->b_this_page) != head);
617 ret += bh->b_size;
618 if (ret >= pg_offset)
619 break;
620 } while ((bh = bh->b_this_page) != head);
621 } else
622 ret = mapped ? 0 : PAGE_CACHE_SIZE;
623 }
624 641
625 return ret; 642 return ret;
626} 643}
@@ -630,8 +647,7 @@ xfs_probe_cluster(
630 struct inode *inode, 647 struct inode *inode,
631 struct page *startpage, 648 struct page *startpage,
632 struct buffer_head *bh, 649 struct buffer_head *bh,
633 struct buffer_head *head, 650 struct buffer_head *head)
634 int mapped)
635{ 651{
636 struct pagevec pvec; 652 struct pagevec pvec;
637 pgoff_t tindex, tlast, tloff; 653 pgoff_t tindex, tlast, tloff;
@@ -640,7 +656,7 @@ xfs_probe_cluster(
640 656
641 /* First sum forwards in this page */ 657 /* First sum forwards in this page */
642 do { 658 do {
643 if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) 659 if (!buffer_uptodate(bh) || !buffer_mapped(bh))
644 return total; 660 return total;
645 total += bh->b_size; 661 total += bh->b_size;
646 } while ((bh = bh->b_this_page) != head); 662 } while ((bh = bh->b_this_page) != head);
@@ -674,7 +690,7 @@ xfs_probe_cluster(
674 pg_offset = PAGE_CACHE_SIZE; 690 pg_offset = PAGE_CACHE_SIZE;
675 691
676 if (page->index == tindex && trylock_page(page)) { 692 if (page->index == tindex && trylock_page(page)) {
677 pg_len = xfs_probe_page(page, pg_offset, mapped); 693 pg_len = xfs_probe_page(page, pg_offset);
678 unlock_page(page); 694 unlock_page(page);
679 } 695 }
680 696
@@ -713,11 +729,11 @@ xfs_is_delayed_page(
713 bh = head = page_buffers(page); 729 bh = head = page_buffers(page);
714 do { 730 do {
715 if (buffer_unwritten(bh)) 731 if (buffer_unwritten(bh))
716 acceptable = (type == IOMAP_UNWRITTEN); 732 acceptable = (type == IO_UNWRITTEN);
717 else if (buffer_delay(bh)) 733 else if (buffer_delay(bh))
718 acceptable = (type == IOMAP_DELAY); 734 acceptable = (type == IO_DELAY);
719 else if (buffer_dirty(bh) && buffer_mapped(bh)) 735 else if (buffer_dirty(bh) && buffer_mapped(bh))
720 acceptable = (type == IOMAP_NEW); 736 acceptable = (type == IO_NEW);
721 else 737 else
722 break; 738 break;
723 } while ((bh = bh->b_this_page) != head); 739 } while ((bh = bh->b_this_page) != head);
@@ -740,17 +756,15 @@ xfs_convert_page(
740 struct inode *inode, 756 struct inode *inode,
741 struct page *page, 757 struct page *page,
742 loff_t tindex, 758 loff_t tindex,
743 xfs_iomap_t *mp, 759 struct xfs_bmbt_irec *imap,
744 xfs_ioend_t **ioendp, 760 xfs_ioend_t **ioendp,
745 struct writeback_control *wbc, 761 struct writeback_control *wbc,
746 int startio,
747 int all_bh) 762 int all_bh)
748{ 763{
749 struct buffer_head *bh, *head; 764 struct buffer_head *bh, *head;
750 xfs_off_t end_offset; 765 xfs_off_t end_offset;
751 unsigned long p_offset; 766 unsigned long p_offset;
752 unsigned int type; 767 unsigned int type;
753 int bbits = inode->i_blkbits;
754 int len, page_dirty; 768 int len, page_dirty;
755 int count = 0, done = 0, uptodate = 1; 769 int count = 0, done = 0, uptodate = 1;
756 xfs_off_t offset = page_offset(page); 770 xfs_off_t offset = page_offset(page);
@@ -802,32 +816,27 @@ xfs_convert_page(
802 816
803 if (buffer_unwritten(bh) || buffer_delay(bh)) { 817 if (buffer_unwritten(bh) || buffer_delay(bh)) {
804 if (buffer_unwritten(bh)) 818 if (buffer_unwritten(bh))
805 type = IOMAP_UNWRITTEN; 819 type = IO_UNWRITTEN;
806 else 820 else
807 type = IOMAP_DELAY; 821 type = IO_DELAY;
808 822
809 if (!xfs_iomap_valid(mp, offset)) { 823 if (!xfs_imap_valid(inode, imap, offset)) {
810 done = 1; 824 done = 1;
811 continue; 825 continue;
812 } 826 }
813 827
814 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 828 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
815 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 829 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
830
831 xfs_map_at_offset(inode, bh, imap, offset);
832 xfs_add_to_ioend(inode, bh, offset, type,
833 ioendp, done);
816 834
817 xfs_map_at_offset(bh, offset, bbits, mp);
818 if (startio) {
819 xfs_add_to_ioend(inode, bh, offset,
820 type, ioendp, done);
821 } else {
822 set_buffer_dirty(bh);
823 unlock_buffer(bh);
824 mark_buffer_dirty(bh);
825 }
826 page_dirty--; 835 page_dirty--;
827 count++; 836 count++;
828 } else { 837 } else {
829 type = IOMAP_NEW; 838 type = IO_NEW;
830 if (buffer_mapped(bh) && all_bh && startio) { 839 if (buffer_mapped(bh) && all_bh) {
831 lock_buffer(bh); 840 lock_buffer(bh);
832 xfs_add_to_ioend(inode, bh, offset, 841 xfs_add_to_ioend(inode, bh, offset,
833 type, ioendp, done); 842 type, ioendp, done);
@@ -842,14 +851,12 @@ xfs_convert_page(
842 if (uptodate && bh == head) 851 if (uptodate && bh == head)
843 SetPageUptodate(page); 852 SetPageUptodate(page);
844 853
845 if (startio) { 854 if (count) {
846 if (count) { 855 if (--wbc->nr_to_write <= 0 &&
847 wbc->nr_to_write--; 856 wbc->sync_mode == WB_SYNC_NONE)
848 if (wbc->nr_to_write <= 0) 857 done = 1;
849 done = 1;
850 }
851 xfs_start_page_writeback(page, !page_dirty, count);
852 } 858 }
859 xfs_start_page_writeback(page, !page_dirty, count);
853 860
854 return done; 861 return done;
855 fail_unlock_page: 862 fail_unlock_page:
@@ -866,10 +873,9 @@ STATIC void
866xfs_cluster_write( 873xfs_cluster_write(
867 struct inode *inode, 874 struct inode *inode,
868 pgoff_t tindex, 875 pgoff_t tindex,
869 xfs_iomap_t *iomapp, 876 struct xfs_bmbt_irec *imap,
870 xfs_ioend_t **ioendp, 877 xfs_ioend_t **ioendp,
871 struct writeback_control *wbc, 878 struct writeback_control *wbc,
872 int startio,
873 int all_bh, 879 int all_bh,
874 pgoff_t tlast) 880 pgoff_t tlast)
875{ 881{
@@ -885,7 +891,7 @@ xfs_cluster_write(
885 891
886 for (i = 0; i < pagevec_count(&pvec); i++) { 892 for (i = 0; i < pagevec_count(&pvec); i++) {
887 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 893 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
888 iomapp, ioendp, wbc, startio, all_bh); 894 imap, ioendp, wbc, all_bh);
889 if (done) 895 if (done)
890 break; 896 break;
891 } 897 }
@@ -930,7 +936,7 @@ xfs_aops_discard_page(
930 loff_t offset = page_offset(page); 936 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits; 937 ssize_t len = 1 << inode->i_blkbits;
932 938
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 939 if (!xfs_is_delayed_page(page, IO_DELAY))
934 goto out_invalidate; 940 goto out_invalidate;
935 941
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 942 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -964,7 +970,7 @@ xfs_aops_discard_page(
964 */ 970 */
965 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 971 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
966 XFS_BMAPI_ENTIRE, NULL, 0, &imap, 972 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
967 &nimaps, NULL, NULL); 973 &nimaps, NULL);
968 974
969 if (error) { 975 if (error) {
970 /* something screwed, just bail */ 976 /* something screwed, just bail */
@@ -992,7 +998,7 @@ xfs_aops_discard_page(
992 */ 998 */
993 xfs_bmap_init(&flist, &firstblock); 999 xfs_bmap_init(&flist, &firstblock);
994 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, 1000 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
995 &flist, NULL, &done); 1001 &flist, &done);
996 1002
997 ASSERT(!flist.xbf_count && !flist.xbf_first); 1003 ASSERT(!flist.xbf_count && !flist.xbf_first);
998 if (error) { 1004 if (error) {
@@ -1015,50 +1021,66 @@ out_invalidate:
1015} 1021}
1016 1022
1017/* 1023/*
1018 * Calling this without startio set means we are being asked to make a dirty 1024 * Write out a dirty page.
1019 * page ready for freeing its buffers. When called with startio set then
1020 * we are coming from writepage.
1021 * 1025 *
1022 * When called with startio set it is important that we write the WHOLE 1026 * For delalloc space on the page we need to allocate space and flush it.
1023 * page if possible. 1027 * For unwritten space on the page we need to start the conversion to
1024 * The bh->b_state's cannot know if any of the blocks or which block for 1028 * regular allocated space.
1025 * that matter are dirty due to mmap writes, and therefore bh uptodate is 1029 * For any other dirty buffer heads on the page we should flush them.
1026 * only valid if the page itself isn't completely uptodate. Some layers 1030 *
1027 * may clear the page dirty flag prior to calling write page, under the 1031 * If we detect that a transaction would be required to flush the page, we
1028 * assumption the entire page will be written out; by not writing out the 1032 * have to check the process flags first, if we are already in a transaction
1029 * whole page the page can be reused before all valid dirty data is 1033 * or disk I/O during allocations is off, we need to fail the writepage and
1030 * written out. Note: in the case of a page that has been dirtied by 1034 * redirty the page.
1031 * mapwrite but only partially set up by block_prepare_write, the
1032 * bh->b_state flags will not agree, and only the ones set up by BPW/BCW will have
1033 * valid state; thus the whole page must be written out.
1034 */ 1035 */
1035
1036STATIC int 1036STATIC int
1037xfs_page_state_convert( 1037xfs_vm_writepage(
1038 struct inode *inode, 1038 struct page *page,
1039 struct page *page, 1039 struct writeback_control *wbc)
1040 struct writeback_control *wbc,
1041 int startio,
1042 int unmapped) /* also implies page uptodate */
1043{ 1040{
1041 struct inode *inode = page->mapping->host;
1042 int delalloc, unwritten;
1044 struct buffer_head *bh, *head; 1043 struct buffer_head *bh, *head;
1045 xfs_iomap_t iomap; 1044 struct xfs_bmbt_irec imap;
1046 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1045 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1047 loff_t offset; 1046 loff_t offset;
1048 unsigned long p_offset = 0;
1049 unsigned int type; 1047 unsigned int type;
1050 __uint64_t end_offset; 1048 __uint64_t end_offset;
1051 pgoff_t end_index, last_index, tlast; 1049 pgoff_t end_index, last_index;
1052 ssize_t size, len; 1050 ssize_t size, len;
1053 int flags, err, iomap_valid = 0, uptodate = 1; 1051 int flags, err, imap_valid = 0, uptodate = 1;
1054 int page_dirty, count = 0; 1052 int count = 0;
1055 int trylock = 0; 1053 int all_bh = 0;
1056 int all_bh = unmapped; 1054
1057 1055 trace_xfs_writepage(inode, page, 0);
1058 if (startio) { 1056
1059 if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) 1057 ASSERT(page_has_buffers(page));
1060 trylock |= BMAPI_TRYLOCK; 1058
1061 } 1059 /*
1060 * Refuse to write the page out if we are called from reclaim context.
1061 *
1062 * This avoids stack overflows when called from deeply used stacks in
1063 * random callers for direct reclaim or memcg reclaim. We explicitly
1064 * allow reclaim from kswapd as the stack usage there is relatively low.
1065 *
1066 * This should really be done by the core VM, but until that happens
1067 * filesystems like XFS, btrfs and ext4 have to take care of this
1068 * by themselves.
1069 */
1070 if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
1071 goto redirty;
1072
1073 /*
1074 * We need a transaction if there are delalloc or unwritten buffers
1075 * on the page.
1076 *
1077 * If we need a transaction and the process flags say we are already
1078 * in a transaction, or no IO is allowed then mark the page dirty
1079 * again and leave the page as is.
1080 */
1081 xfs_count_page_state(page, &delalloc, &unwritten);
1082 if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
1083 goto redirty;
1062 1084
1063 /* Is this page beyond the end of the file? */ 1085 /* Is this page beyond the end of the file? */
1064 offset = i_size_read(inode); 1086 offset = i_size_read(inode);
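/*
 * Illustrative sketch (not part of the patch): the reclaim guard added
 * above reduces to this flag test. PF_MEMALLOC is set for every task
 * doing memory reclaim; kswapd additionally runs with PF_KSWAPD set, so
 * the test is true only for direct or memcg reclaim, and kswapd is
 * still allowed to issue writeback. The helper name is hypothetical.
 */
static inline bool xfs_called_from_direct_reclaim(void)
{
	return (current->flags & (PF_MEMALLOC | PF_KSWAPD)) == PF_MEMALLOC;
}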
@@ -1067,92 +1089,64 @@ xfs_page_state_convert(
1067 if (page->index >= end_index) { 1089 if (page->index >= end_index) {
1068 if ((page->index >= end_index + 1) || 1090 if ((page->index >= end_index + 1) ||
1069 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { 1091 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
1070 if (startio) 1092 unlock_page(page);
1071 unlock_page(page);
1072 return 0; 1093 return 0;
1073 } 1094 }
1074 } 1095 }
1075 1096
1076 /*
1077 * page_dirty is initially a count of buffers on the page before
1078 * EOF and is decremented as we move each into a cleanable state.
1079 *
1080 * Derivation:
1081 *
1082 * End offset is the highest offset that this page should represent.
1083 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
1084 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
1085 * hence give us the correct page_dirty count. On any other page,
1086 * it will be zero and in that case we need page_dirty to be the
1087 * count of buffers on the page.
1088 */
1089 end_offset = min_t(unsigned long long, 1097 end_offset = min_t(unsigned long long,
1090 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); 1098 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
1099 offset);
1091 len = 1 << inode->i_blkbits; 1100 len = 1 << inode->i_blkbits;
1092 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
1093 PAGE_CACHE_SIZE);
1094 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
1095 page_dirty = p_offset / len;
1096 1101
1097 bh = head = page_buffers(page); 1102 bh = head = page_buffers(page);
1098 offset = page_offset(page); 1103 offset = page_offset(page);
1099 flags = BMAPI_READ; 1104 flags = BMAPI_READ;
1100 type = IOMAP_NEW; 1105 type = IO_NEW;
1101
1102 /* TODO: cleanup count and page_dirty */
1103 1106
1104 do { 1107 do {
1105 if (offset >= end_offset) 1108 if (offset >= end_offset)
1106 break; 1109 break;
1107 if (!buffer_uptodate(bh)) 1110 if (!buffer_uptodate(bh))
1108 uptodate = 0; 1111 uptodate = 0;
1109 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { 1112
1110 /* 1113 /*
1111 * the iomap is actually still valid, but the ioend 1114 * A hole may still be marked uptodate because discard_buffer
1112 * isn't. Shouldn't happen too often. 1115 * leaves the flag set.
1113 */ 1116 */
1114 iomap_valid = 0; 1117 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1118 ASSERT(!buffer_dirty(bh));
1119 imap_valid = 0;
1115 continue; 1120 continue;
1116 } 1121 }
1117 1122
1118 if (iomap_valid) 1123 if (imap_valid)
1119 iomap_valid = xfs_iomap_valid(&iomap, offset); 1124 imap_valid = xfs_imap_valid(inode, &imap, offset);
1120 1125
1121 /* 1126 if (buffer_unwritten(bh) || buffer_delay(bh)) {
1122 * First case, map an unwritten extent and prepare for
1123 * extent state conversion transaction on completion.
1124 *
1125 * Second case, allocate space for a delalloc buffer.
1126 * We can return EAGAIN here in the release page case.
1127 *
1128 * Third case, an unmapped buffer was found, and we are
1129 * in a path where we need to write the whole page out.
1130 */
1131 if (buffer_unwritten(bh) || buffer_delay(bh) ||
1132 ((buffer_uptodate(bh) || PageUptodate(page)) &&
1133 !buffer_mapped(bh) && (unmapped || startio))) {
1134 int new_ioend = 0; 1127 int new_ioend = 0;
1135 1128
1136 /* 1129 /*
1137 * Make sure we don't use a read-only iomap 1130 * Make sure we don't use a read-only iomap
1138 */ 1131 */
1139 if (flags == BMAPI_READ) 1132 if (flags == BMAPI_READ)
1140 iomap_valid = 0; 1133 imap_valid = 0;
1141 1134
1142 if (buffer_unwritten(bh)) { 1135 if (buffer_unwritten(bh)) {
1143 type = IOMAP_UNWRITTEN; 1136 type = IO_UNWRITTEN;
1144 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1137 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1145 } else if (buffer_delay(bh)) { 1138 } else if (buffer_delay(bh)) {
1146 type = IOMAP_DELAY; 1139 type = IO_DELAY;
1147 flags = BMAPI_ALLOCATE | trylock; 1140 flags = BMAPI_ALLOCATE;
1148 } else { 1141
1149 type = IOMAP_NEW; 1142 if (wbc->sync_mode == WB_SYNC_NONE &&
1150 flags = BMAPI_WRITE | BMAPI_MMAP; 1143 wbc->nonblocking)
1144 flags |= BMAPI_TRYLOCK;
1151 } 1145 }
1152 1146
1153 if (!iomap_valid) { 1147 if (!imap_valid) {
1154 /* 1148 /*
1155 * if we didn't have a valid mapping then we 1149 * If we didn't have a valid mapping then we
1156 * need to ensure that we put the new mapping 1150 * need to ensure that we put the new mapping
1157 * in a new ioend structure. This needs to be 1151 * in a new ioend structure. This needs to be
1158 * done to ensure that the ioends correctly 1152 * done to ensure that the ioends correctly
@@ -1160,74 +1154,57 @@ xfs_page_state_convert(
1160 * for unwritten extent conversion. 1154 * for unwritten extent conversion.
1161 */ 1155 */
1162 new_ioend = 1; 1156 new_ioend = 1;
1163 if (type == IOMAP_NEW) { 1157 err = xfs_map_blocks(inode, offset, len,
1164 size = xfs_probe_cluster(inode, 1158 &imap, flags);
1165 page, bh, head, 0);
1166 } else {
1167 size = len;
1168 }
1169
1170 err = xfs_map_blocks(inode, offset, size,
1171 &iomap, flags);
1172 if (err) 1159 if (err)
1173 goto error; 1160 goto error;
1174 iomap_valid = xfs_iomap_valid(&iomap, offset); 1161 imap_valid = xfs_imap_valid(inode, &imap,
1162 offset);
1175 } 1163 }
1176 if (iomap_valid) { 1164 if (imap_valid) {
1177 xfs_map_at_offset(bh, offset, 1165 xfs_map_at_offset(inode, bh, &imap, offset);
1178 inode->i_blkbits, &iomap); 1166 xfs_add_to_ioend(inode, bh, offset, type,
1179 if (startio) { 1167 &ioend, new_ioend);
1180 xfs_add_to_ioend(inode, bh, offset,
1181 type, &ioend,
1182 new_ioend);
1183 } else {
1184 set_buffer_dirty(bh);
1185 unlock_buffer(bh);
1186 mark_buffer_dirty(bh);
1187 }
1188 page_dirty--;
1189 count++; 1168 count++;
1190 } 1169 }
1191 } else if (buffer_uptodate(bh) && startio) { 1170 } else if (buffer_uptodate(bh)) {
1192 /* 1171 /*
1193 * we got here because the buffer is already mapped. 1172 * we got here because the buffer is already mapped.
1194 * That means it must already have extents allocated 1173 * That means it must already have extents allocated
1195 * underneath it. Map the extent by reading it. 1174 * underneath it. Map the extent by reading it.
1196 */ 1175 */
1197 if (!iomap_valid || flags != BMAPI_READ) { 1176 if (!imap_valid || flags != BMAPI_READ) {
1198 flags = BMAPI_READ; 1177 flags = BMAPI_READ;
1199 size = xfs_probe_cluster(inode, page, bh, 1178 size = xfs_probe_cluster(inode, page, bh, head);
1200 head, 1);
1201 err = xfs_map_blocks(inode, offset, size, 1179 err = xfs_map_blocks(inode, offset, size,
1202 &iomap, flags); 1180 &imap, flags);
1203 if (err) 1181 if (err)
1204 goto error; 1182 goto error;
1205 iomap_valid = xfs_iomap_valid(&iomap, offset); 1183 imap_valid = xfs_imap_valid(inode, &imap,
1184 offset);
1206 } 1185 }
1207 1186
1208 /* 1187 /*
1209 * We set the type to IOMAP_NEW in case we are doing a 1188 * We set the type to IO_NEW in case we are doing a
1210 * small write at EOF that is extending the file but 1189 * small write at EOF that is extending the file but
1211 * without needing an allocation. We need to update the 1190 * without needing an allocation. We need to update the
1212 * file size on I/O completion in this case so it is 1191 * file size on I/O completion in this case so it is
1213 * the same case as having just allocated a new extent 1192 * the same case as having just allocated a new extent
1214 * that we are writing into for the first time. 1193 * that we are writing into for the first time.
1215 */ 1194 */
1216 type = IOMAP_NEW; 1195 type = IO_NEW;
1217 if (trylock_buffer(bh)) { 1196 if (trylock_buffer(bh)) {
1218 ASSERT(buffer_mapped(bh)); 1197 if (imap_valid)
1219 if (iomap_valid)
1220 all_bh = 1; 1198 all_bh = 1;
1221 xfs_add_to_ioend(inode, bh, offset, type, 1199 xfs_add_to_ioend(inode, bh, offset, type,
1222 &ioend, !iomap_valid); 1200 &ioend, !imap_valid);
1223 page_dirty--;
1224 count++; 1201 count++;
1225 } else { 1202 } else {
1226 iomap_valid = 0; 1203 imap_valid = 0;
1227 } 1204 }
1228 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1205 } else if (PageUptodate(page)) {
1229 (unmapped || startio)) { 1206 ASSERT(buffer_mapped(bh));
1230 iomap_valid = 0; 1207 imap_valid = 0;
1231 } 1208 }
1232 1209
1233 if (!iohead) 1210 if (!iohead)
@@ -1238,132 +1215,48 @@ xfs_page_state_convert(
1238 if (uptodate && bh == head) 1215 if (uptodate && bh == head)
1239 SetPageUptodate(page); 1216 SetPageUptodate(page);
1240 1217
1241 if (startio) 1218 xfs_start_page_writeback(page, 1, count);
1242 xfs_start_page_writeback(page, 1, count);
1243
1244 if (ioend && iomap_valid) {
1245 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
1246 PAGE_CACHE_SHIFT;
1247 tlast = min_t(pgoff_t, offset, last_index);
1248 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
1249 wbc, startio, all_bh, tlast);
1250 }
1251
1252 if (iohead)
1253 xfs_submit_ioend(wbc, iohead);
1254
1255 return page_dirty;
1256
1257error:
1258 if (iohead)
1259 xfs_cancel_ioend(iohead);
1260 1219
1261 /* 1220 if (ioend && imap_valid) {
1262 * If it's delalloc and we have nowhere to put it, 1221 xfs_off_t end_index;
1263 * throw it away, unless the lower layers told
1264 * us to try again.
1265 */
1266 if (err != -EAGAIN) {
1267 if (!unmapped)
1268 xfs_aops_discard_page(page);
1269 ClearPageUptodate(page);
1270 }
1271 return err;
1272}
1273 1222
1274/* 1223 end_index = imap.br_startoff + imap.br_blockcount;
1275 * writepage: Called from one of two places:
1276 *
1277 * 1. we are flushing a delalloc buffer head.
1278 *
1279 * 2. we are writing out a dirty page. Typically the page dirty
1280 * state is cleared before we get here. In this case it is
1281 * conceivable we have no buffer heads.
1282 *
1283 * For delalloc space on the page we need to allocate space and
1284 * flush it. For unmapped buffer heads on the page we should
1285 * allocate space if the page is uptodate. For any other dirty
1286 * buffer heads on the page we should flush them.
1287 *
1288 * If we detect that a transaction would be required to flush
1289 * the page, we have to check the process flags first, if we
1290 * are already in a transaction or disk I/O during allocations
1291 * is off, we need to fail the writepage and redirty the page.
1292 */
1293 1224
1294STATIC int 1225 /* to bytes */
1295xfs_vm_writepage( 1226 end_index <<= inode->i_blkbits;
1296 struct page *page,
1297 struct writeback_control *wbc)
1298{
1299 int error;
1300 int need_trans;
1301 int delalloc, unmapped, unwritten;
1302 struct inode *inode = page->mapping->host;
1303 1227
1304 trace_xfs_writepage(inode, page, 0); 1228 /* to pages */
1229 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1305 1230
1306 /* 1231 /* check against file size */
1307 * We need a transaction if: 1232 if (end_index > last_index)
1308 * 1. There are delalloc buffers on the page 1233 end_index = last_index;
1309 * 2. The page is uptodate and we have unmapped buffers
1310 * 3. The page is uptodate and we have no buffers
1311 * 4. There are unwritten buffers on the page
1312 */
1313 1234
1314 if (!page_has_buffers(page)) { 1235 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1315 unmapped = 1; 1236 wbc, all_bh, end_index);
1316 need_trans = 1;
1317 } else {
1318 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1319 if (!PageUptodate(page))
1320 unmapped = 0;
1321 need_trans = delalloc + unmapped + unwritten;
1322 } 1237 }
1323 1238
1324 /* 1239 if (iohead)
1325 * If we need a transaction and the process flags say 1240 xfs_submit_ioend(wbc, iohead);
1326 * we are already in a transaction, or no IO is allowed
1327 * then mark the page dirty again and leave the page
1328 * as is.
1329 */
1330 if (current_test_flags(PF_FSTRANS) && need_trans)
1331 goto out_fail;
1332
1333 /*
1334 * Delay hooking up buffer heads until we have
1335 * made our go/no-go decision.
1336 */
1337 if (!page_has_buffers(page))
1338 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1339 1241
1242 return 0;
1340 1243
1341 /* 1244error:
1342 * VM calculation for nr_to_write seems off. Bump it way 1245 if (iohead)
1343 * up, this gets simple streaming writes zippy again. 1246 xfs_cancel_ioend(iohead);
1344 * To be reviewed again after Jens' writeback changes.
1345 */
1346 wbc->nr_to_write *= 4;
1347 1247
1348 /* 1248 if (err == -EAGAIN)
1349 * Convert delayed allocate, unwritten or unmapped space 1249 goto redirty;
1350 * to real space and flush out to disk.
1351 */
1352 error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1353 if (error == -EAGAIN)
1354 goto out_fail;
1355 if (unlikely(error < 0))
1356 goto out_unlock;
1357 1250
1358 return 0; 1251 xfs_aops_discard_page(page);
1252 ClearPageUptodate(page);
1253 unlock_page(page);
1254 return err;
1359 1255
1360out_fail: 1256redirty:
1361 redirty_page_for_writepage(wbc, page); 1257 redirty_page_for_writepage(wbc, page);
1362 unlock_page(page); 1258 unlock_page(page);
1363 return 0; 1259 return 0;
1364out_unlock:
1365 unlock_page(page);
1366 return error;
1367} 1260}
1368 1261
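/*
 * Worked example for the cluster bounds above (illustrative numbers,
 * not from the patch): with 4 KiB blocks and pages (i_blkbits =
 * PAGE_CACHE_SHIFT = 12), an extent with br_startoff = 10 and
 * br_blockcount = 16 ends at block 26, i.e. byte 26 << 12 = 106496.
 * The last page index it covers is (106496 - 1) >> 12 = 25, which is
 * then clamped to last_index so the additional writeback started by
 * xfs_cluster_write() never runs past EOF.
 */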
1369STATIC int 1262STATIC int
@@ -1377,65 +1270,27 @@ xfs_vm_writepages(
1377 1270
1378/* 1271/*
1379 * Called to move a page into cleanable state - and from there 1272 * Called to move a page into cleanable state - and from there
1380 * to be released. Possibly the page is already clean. We always 1273 * to be released. The page should already be clean. We always
1381 * have buffer heads in this call. 1274 * have buffer heads in this call.
1382 * 1275 *
1383 * Returns 0 if the page is ok to release, 1 otherwise. 1276 * Returns 1 if the page is ok to release, 0 otherwise.
1384 *
1385 * Possible scenarios are:
1386 *
1387 * 1. We are being called to release a page which has been written
1388 * to via regular I/O. buffer heads will be dirty and possibly
1389 * delalloc. If no delalloc buffer heads in this case then we
1390 * can just return zero.
1391 *
1392 * 2. We are called to release a page which has been written via
1393 * mmap, all we need to do is ensure there is no delalloc
1394 * state in the buffer heads, if not we can let the caller
1395 * free them and we should come back later via writepage.
1396 */ 1277 */
1397STATIC int 1278STATIC int
1398xfs_vm_releasepage( 1279xfs_vm_releasepage(
1399 struct page *page, 1280 struct page *page,
1400 gfp_t gfp_mask) 1281 gfp_t gfp_mask)
1401{ 1282{
1402 struct inode *inode = page->mapping->host; 1283 int delalloc, unwritten;
1403 int dirty, delalloc, unmapped, unwritten;
1404 struct writeback_control wbc = {
1405 .sync_mode = WB_SYNC_ALL,
1406 .nr_to_write = 1,
1407 };
1408 1284
1409 trace_xfs_releasepage(inode, page, 0); 1285 trace_xfs_releasepage(page->mapping->host, page, 0);
1410
1411 if (!page_has_buffers(page))
1412 return 0;
1413 1286
1414 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); 1287 xfs_count_page_state(page, &delalloc, &unwritten);
1415 if (!delalloc && !unwritten)
1416 goto free_buffers;
1417 1288
1418 if (!(gfp_mask & __GFP_FS)) 1289 if (WARN_ON(delalloc))
1419 return 0; 1290 return 0;
1420 1291 if (WARN_ON(unwritten))
1421 /* If we are already inside a transaction or the thread cannot
1422 * do I/O, we cannot release this page.
1423 */
1424 if (current_test_flags(PF_FSTRANS))
1425 return 0; 1292 return 0;
1426 1293
1427 /*
1428 * Convert delalloc space to real space, do not flush the
1429 * data out to disk, that will be done by the caller.
1430 * Never need to allocate space here - we will always
1431 * come back to writepage in that case.
1432 */
1433 dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1434 if (dirty == 0 && !unwritten)
1435 goto free_buffers;
1436 return 0;
1437
1438free_buffers:
1439 return try_to_free_buffers(page); 1294 return try_to_free_buffers(page);
1440} 1295}
1441 1296
@@ -1445,13 +1300,14 @@ __xfs_get_blocks(
1445 sector_t iblock, 1300 sector_t iblock,
1446 struct buffer_head *bh_result, 1301 struct buffer_head *bh_result,
1447 int create, 1302 int create,
1448 int direct, 1303 int direct)
1449 bmapi_flags_t flags)
1450{ 1304{
1451 xfs_iomap_t iomap; 1305 int flags = create ? BMAPI_WRITE : BMAPI_READ;
1306 struct xfs_bmbt_irec imap;
1452 xfs_off_t offset; 1307 xfs_off_t offset;
1453 ssize_t size; 1308 ssize_t size;
1454 int niomap = 1; 1309 int nimap = 1;
1310 int new = 0;
1455 int error; 1311 int error;
1456 1312
1457 offset = (xfs_off_t)iblock << inode->i_blkbits; 1313 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1461,23 +1317,25 @@ __xfs_get_blocks(
1461 if (!create && direct && offset >= i_size_read(inode)) 1317 if (!create && direct && offset >= i_size_read(inode))
1462 return 0; 1318 return 0;
1463 1319
1464 error = xfs_iomap(XFS_I(inode), offset, size, 1320 if (direct && create)
1465 create ? flags : BMAPI_READ, &iomap, &niomap); 1321 flags |= BMAPI_DIRECT;
1322
1323 error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap,
1324 &new);
1466 if (error) 1325 if (error)
1467 return -error; 1326 return -error;
1468 if (niomap == 0) 1327 if (nimap == 0)
1469 return 0; 1328 return 0;
1470 1329
1471 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1330 if (imap.br_startblock != HOLESTARTBLOCK &&
1331 imap.br_startblock != DELAYSTARTBLOCK) {
1472 /* 1332 /*
1473 * For unwritten extents do not report a disk address on 1333 * For unwritten extents do not report a disk address on
1474 * the read case (treat as if we're reading into a hole). 1334 * the read case (treat as if we're reading into a hole).
1475 */ 1335 */
1476 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1336 if (create || !ISUNWRITTEN(&imap))
1477 xfs_map_buffer(bh_result, &iomap, offset, 1337 xfs_map_buffer(inode, bh_result, &imap, offset);
1478 inode->i_blkbits); 1338 if (create && ISUNWRITTEN(&imap)) {
1479 }
1480 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1481 if (direct) 1339 if (direct)
1482 bh_result->b_private = inode; 1340 bh_result->b_private = inode;
1483 set_buffer_unwritten(bh_result); 1341 set_buffer_unwritten(bh_result);
@@ -1488,7 +1346,7 @@ __xfs_get_blocks(
1488 * If this is a realtime file, data may be on a different device 1346 * If this is a realtime file, data may be on a different device
1489 * to that pointed to from the buffer_head b_bdev currently. 1347 * to that pointed to from the buffer_head b_bdev currently.
1490 */ 1348 */
1491 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1349 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1492 1350
1493 /* 1351 /*
1494 * If we previously allocated a block out beyond eof and we are now 1352 * If we previously allocated a block out beyond eof and we are now
@@ -1502,10 +1360,10 @@ __xfs_get_blocks(
1502 if (create && 1360 if (create &&
1503 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1361 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1504 (offset >= i_size_read(inode)) || 1362 (offset >= i_size_read(inode)) ||
1505 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1363 (new || ISUNWRITTEN(&imap))))
1506 set_buffer_new(bh_result); 1364 set_buffer_new(bh_result);
1507 1365
1508 if (iomap.iomap_flags & IOMAP_DELAY) { 1366 if (imap.br_startblock == DELAYSTARTBLOCK) {
1509 BUG_ON(direct); 1367 BUG_ON(direct);
1510 if (create) { 1368 if (create) {
1511 set_buffer_uptodate(bh_result); 1369 set_buffer_uptodate(bh_result);
@@ -1514,11 +1372,23 @@ __xfs_get_blocks(
1514 } 1372 }
1515 } 1373 }
1516 1374
1375 /*
1376 * If this is O_DIRECT or the mpage code calling, tell them how large
1377 * the mapping is, so that we can avoid repeated get_blocks calls.
1378 */
1517 if (direct || size > (1 << inode->i_blkbits)) { 1379 if (direct || size > (1 << inode->i_blkbits)) {
1518 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1380 xfs_off_t mapping_size;
1519 offset = min_t(xfs_off_t, 1381
1520 iomap.iomap_bsize - iomap.iomap_delta, size); 1382 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1521 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1383 mapping_size <<= inode->i_blkbits;
1384
1385 ASSERT(mapping_size > 0);
1386 if (mapping_size > size)
1387 mapping_size = size;
1388 if (mapping_size > LONG_MAX)
1389 mapping_size = LONG_MAX;
1390
1391 bh_result->b_size = mapping_size;
1522 } 1392 }
1523 1393
1524 return 0; 1394 return 0;
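/*
 * Worked example for the mapping_size calculation above (illustrative
 * numbers, not from the patch): with 4 KiB blocks (i_blkbits = 12), an
 * extent with br_startoff = 100 and br_blockcount = 8, queried at
 * iblock = 102, gives
 *
 *	mapping_size = (100 + 8 - 102) << 12 = 6 << 12 = 24576 bytes
 *
 * so a single get_blocks call maps the six remaining blocks of the
 * extent, subject to the clamps against the requested size and LONG_MAX.
 */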
@@ -1531,8 +1401,7 @@ xfs_get_blocks(
1531 struct buffer_head *bh_result, 1401 struct buffer_head *bh_result,
1532 int create) 1402 int create)
1533{ 1403{
1534 return __xfs_get_blocks(inode, iblock, 1404 return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
1535 bh_result, create, 0, BMAPI_WRITE);
1536} 1405}
1537 1406
1538STATIC int 1407STATIC int
@@ -1542,61 +1411,59 @@ xfs_get_blocks_direct(
1542 struct buffer_head *bh_result, 1411 struct buffer_head *bh_result,
1543 int create) 1412 int create)
1544{ 1413{
1545 return __xfs_get_blocks(inode, iblock, 1414 return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1546 bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1547} 1415}
1548 1416
1417/*
1418 * Complete a direct I/O write request.
1419 *
1420 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1421 * need to issue a transaction to convert the range from unwritten to written
1422 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1423 * to do this and we are done. But in case this was a successful AIO
1424 * request this handler is called from interrupt context, from which we
1425 * can't start transactions. In that case offload the I/O completion to
1426 * the workqueues we also use for buffered I/O completion.
1427 */
1549STATIC void 1428STATIC void
1550xfs_end_io_direct( 1429xfs_end_io_direct_write(
1551 struct kiocb *iocb, 1430 struct kiocb *iocb,
1552 loff_t offset, 1431 loff_t offset,
1553 ssize_t size, 1432 ssize_t size,
1554 void *private) 1433 void *private,
1434 int ret,
1435 bool is_async)
1555{ 1436{
1556 xfs_ioend_t *ioend = iocb->private; 1437 struct xfs_ioend *ioend = iocb->private;
1557 1438
1558 /* 1439 /*
1559 * Non-NULL private data means we need to issue a transaction to 1440 * blockdev_direct_IO can return an error even after the I/O
1560 * convert a range from unwritten to written extents. This needs 1441 * completion handler was called. Thus we need to protect
1561 * to happen from process context but aio+dio I/O completion 1442 * against double-freeing.
1562 * happens from irq context so we need to defer it to a workqueue.
1563 * This is not necessary for synchronous direct I/O, but we do
1564 * it anyway to keep the code uniform and simpler.
1565 *
1566 * Well, if only it were that simple. Because synchronous direct I/O
1567 * requires extent conversion to occur *before* we return to userspace,
1568 * we have to wait for extent conversion to complete. Look at the
1569 * iocb that has been passed to us to determine if this is AIO or
1570 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1571 * workqueue and wait for it to complete.
1572 *
1573 * The core direct I/O code might be changed to always call the
1574 * completion handler in the future, in which case all this can
1575 * go away.
1576 */ 1443 */
1444 iocb->private = NULL;
1445
1577 ioend->io_offset = offset; 1446 ioend->io_offset = offset;
1578 ioend->io_size = size; 1447 ioend->io_size = size;
1579 if (ioend->io_type == IOMAP_READ) { 1448 if (private && size > 0)
1580 xfs_finish_ioend(ioend, 0); 1449 ioend->io_type = IO_UNWRITTEN;
1581 } else if (private && size > 0) { 1450
1582 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1451 if (is_async) {
1583 } else {
1584 /* 1452 /*
1585 * A direct I/O write ioend starts its life in unwritten 1453 * If we are converting an unwritten extent we need to delay
1586 * state in case they map an unwritten extent. This write 1454 * the AIO completion until after the unwritten extent
1587 * didn't map an unwritten extent so switch its completion 1455 * conversion has completed, otherwise do it ASAP.
1588 * handler.
1589 */ 1456 */
1590 ioend->io_type = IOMAP_NEW; 1457 if (ioend->io_type == IO_UNWRITTEN) {
1591 xfs_finish_ioend(ioend, 0); 1458 ioend->io_iocb = iocb;
1459 ioend->io_result = ret;
1460 } else {
1461 aio_complete(iocb, ret, 0);
1462 }
1463 xfs_finish_ioend(ioend);
1464 } else {
1465 xfs_finish_ioend_sync(ioend);
1592 } 1466 }
1593
1594 /*
1595 * blockdev_direct_IO can return an error even after the I/O
1596 * completion handler was called. Thus we need to protect
1597 * against double-freeing.
1598 */
1599 iocb->private = NULL;
1600} 1467}
1601 1468
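/*
 * Illustrative summary (not from the patch) of the completion paths
 * chosen by xfs_end_io_direct_write() above:
 *
 *	synchronous I/O         -> xfs_finish_ioend_sync(); the caller
 *	                           waits for any extent conversion
 *	async, written extent   -> aio_complete() immediately, then
 *	                           xfs_finish_ioend()
 *	async, unwritten extent -> stash the iocb and result in the
 *	                           ioend; aio_complete() runs only after
 *	                           the conversion transaction completes
 */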
1602STATIC ssize_t 1469STATIC ssize_t
@@ -1607,26 +1474,45 @@ xfs_vm_direct_IO(
1607 loff_t offset, 1474 loff_t offset,
1608 unsigned long nr_segs) 1475 unsigned long nr_segs)
1609{ 1476{
1610 struct file *file = iocb->ki_filp; 1477 struct inode *inode = iocb->ki_filp->f_mapping->host;
1611 struct inode *inode = file->f_mapping->host; 1478 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1612 struct block_device *bdev; 1479 ssize_t ret;
1613 ssize_t ret;
1614
1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode));
1616 1480
1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1481 if (rw & WRITE) {
1618 IOMAP_UNWRITTEN : IOMAP_READ); 1482 iocb->private = xfs_alloc_ioend(inode, IO_NEW);
1619 1483
1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1484 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1485 offset, nr_segs,
1486 xfs_get_blocks_direct,
1487 xfs_end_io_direct_write, NULL, 0);
1488 if (ret != -EIOCBQUEUED && iocb->private)
1489 xfs_destroy_ioend(iocb->private);
1490 } else {
1491 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1621 offset, nr_segs, 1492 offset, nr_segs,
1622 xfs_get_blocks_direct, 1493 xfs_get_blocks_direct,
1623 xfs_end_io_direct); 1494 NULL, NULL, 0);
1495 }
1624 1496
1625 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1626 xfs_destroy_ioend(iocb->private);
1627 return ret; 1497 return ret;
1628} 1498}
1629 1499
1500STATIC void
1501xfs_vm_write_failed(
1502 struct address_space *mapping,
1503 loff_t to)
1504{
1505 struct inode *inode = mapping->host;
1506
1507 if (to > inode->i_size) {
1508 struct iattr ia = {
1509 .ia_valid = ATTR_SIZE | ATTR_FORCE,
1510 .ia_size = inode->i_size,
1511 };
1512 xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
1513 }
1514}
1515
1630STATIC int 1516STATIC int
1631xfs_vm_write_begin( 1517xfs_vm_write_begin(
1632 struct file *file, 1518 struct file *file,
@@ -1637,9 +1523,31 @@ xfs_vm_write_begin(
1637 struct page **pagep, 1523 struct page **pagep,
1638 void **fsdata) 1524 void **fsdata)
1639{ 1525{
1640 *pagep = NULL; 1526 int ret;
1641 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1527
1642 xfs_get_blocks); 1528 ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
1529 pagep, xfs_get_blocks);
1530 if (unlikely(ret))
1531 xfs_vm_write_failed(mapping, pos + len);
1532 return ret;
1533}
1534
1535STATIC int
1536xfs_vm_write_end(
1537 struct file *file,
1538 struct address_space *mapping,
1539 loff_t pos,
1540 unsigned len,
1541 unsigned copied,
1542 struct page *page,
1543 void *fsdata)
1544{
1545 int ret;
1546
1547 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1548 if (unlikely(ret < len))
1549 xfs_vm_write_failed(mapping, pos + len);
1550 return ret;
1643} 1551}
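/*
 * Worked failure example (illustrative numbers, not from the patch):
 * with i_size = 4096, a write at pos = 4096 for len = 8192 that copies
 * only 4096 bytes makes generic_write_end() return 4096 < len and
 * update i_size to 8192. xfs_vm_write_failed(mapping, 12288) then sees
 * to > i_size and issues the ATTR_SIZE|ATTR_FORCE setattr, trimming
 * any blocks speculatively allocated between 8192 and 12288 so they
 * are not leaked past EOF.
 */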
1644 1552
1645STATIC sector_t 1553STATIC sector_t
@@ -1650,7 +1558,7 @@ xfs_vm_bmap(
1650 struct inode *inode = (struct inode *)mapping->host; 1558 struct inode *inode = (struct inode *)mapping->host;
1651 struct xfs_inode *ip = XFS_I(inode); 1559 struct xfs_inode *ip = XFS_I(inode);
1652 1560
1653 xfs_itrace_entry(XFS_I(inode)); 1561 trace_xfs_vm_bmap(XFS_I(inode));
1654 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1562 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1655 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1563 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1656 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1564 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -1684,7 +1592,7 @@ const struct address_space_operations xfs_address_space_operations = {
1684 .releasepage = xfs_vm_releasepage, 1592 .releasepage = xfs_vm_releasepage,
1685 .invalidatepage = xfs_vm_invalidatepage, 1593 .invalidatepage = xfs_vm_invalidatepage,
1686 .write_begin = xfs_vm_write_begin, 1594 .write_begin = xfs_vm_write_begin,
1687 .write_end = generic_write_end, 1595 .write_end = xfs_vm_write_end,
1688 .bmap = xfs_vm_bmap, 1596 .bmap = xfs_vm_bmap,
1689 .direct_IO = xfs_vm_direct_IO, 1597 .direct_IO = xfs_vm_direct_IO,
1690 .migratepage = buffer_migrate_page, 1598 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 4cfc6ea87df8..c5057fb6237a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -37,6 +37,8 @@ typedef struct xfs_ioend {
37 size_t io_size; /* size of the extent */ 37 size_t io_size; /* size of the extent */
38 xfs_off_t io_offset; /* offset in the file */ 38 xfs_off_t io_offset; /* offset in the file */
39 struct work_struct io_work; /* xfsdatad work queue */ 39 struct work_struct io_work; /* xfsdatad work queue */
40 struct kiocb *io_iocb;
41 int io_result;
40} xfs_ioend_t; 42} xfs_ioend_t;
41 43
42extern const struct address_space_operations xfs_address_space_operations; 44extern const struct address_space_operations xfs_address_space_operations;
@@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45extern void xfs_ioend_init(void); 47extern void xfs_ioend_init(void);
46extern void xfs_ioend_wait(struct xfs_inode *); 48extern void xfs_ioend_wait(struct xfs_inode *);
47 49
48extern void xfs_count_page_state(struct page *, int *, int *, int *); 50extern void xfs_count_page_state(struct page *, int *, int *);
49 51
50#endif /* __XFS_AOPS_H__ */ 52#endif /* __XFS_AOPS_H__ */
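/*
 * Hypothetical sketch (the real call site is outside the hunks shown
 * here): once the unwritten-extent conversion has completed in process
 * context, the iocb stashed in the new io_iocb/io_result fields above
 * can be finished. The helper name is illustrative only.
 */
static void xfs_example_finish_deferred_aio(struct xfs_ioend *ioend)
{
	if (ioend->io_iocb)
		aio_complete(ioend->io_iocb, ioend->io_result, 0);
}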
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44c2b0ef9a41..286e36e21dae 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,14 +37,14 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h" 42#include "xfs_mount.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44 44
45static kmem_zone_t *xfs_buf_zone; 45static kmem_zone_t *xfs_buf_zone;
46STATIC int xfsbufd(void *); 46STATIC int xfsbufd(void *);
47STATIC int xfsbufd_wakeup(int, gfp_t); 47STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 48STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
49static struct shrinker xfs_buf_shake = { 49static struct shrinker xfs_buf_shake = {
50 .shrink = xfsbufd_wakeup, 50 .shrink = xfsbufd_wakeup,
@@ -339,7 +339,7 @@ _xfs_buf_lookup_pages(
339 __func__, gfp_mask); 339 __func__, gfp_mask);
340 340
341 XFS_STATS_INC(xb_page_retries); 341 XFS_STATS_INC(xb_page_retries);
342 xfsbufd_wakeup(0, gfp_mask); 342 xfsbufd_wakeup(NULL, 0, gfp_mask);
343 congestion_wait(BLK_RW_ASYNC, HZ/50); 343 congestion_wait(BLK_RW_ASYNC, HZ/50);
344 goto retry; 344 goto retry;
345 } 345 }
@@ -440,12 +440,7 @@ _xfs_buf_find(
440 ASSERT(btp == bp->b_target); 440 ASSERT(btp == bp->b_target);
441 if (bp->b_file_offset == range_base && 441 if (bp->b_file_offset == range_base &&
442 bp->b_buffer_length == range_length) { 442 bp->b_buffer_length == range_length) {
443 /*
444 * If we look at something, bring it to the
445 * front of the list for next time.
446 */
447 atomic_inc(&bp->b_hold); 443 atomic_inc(&bp->b_hold);
448 list_move(&bp->b_hash_list, &hash->bh_list);
449 goto found; 444 goto found;
450 } 445 }
451 } 446 }
@@ -578,9 +573,9 @@ _xfs_buf_read(
578 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 573 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
579 574
580 status = xfs_buf_iorequest(bp); 575 status = xfs_buf_iorequest(bp);
581 if (!status && !(flags & XBF_ASYNC)) 576 if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
582 status = xfs_buf_iowait(bp); 577 return status;
583 return status; 578 return xfs_buf_iowait(bp);
584} 579}
585 580
586xfs_buf_t * 581xfs_buf_t *
@@ -850,6 +845,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 845 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 846 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 847 * synchronizing independent access to the underlying pages.
848 *
849 * If we come across a stale, pinned, locked buffer, we know that we
850 * are being asked to lock a buffer that has been reallocated. Because
851 * it is pinned, we know that the log has not been pushed to disk and
852 * hence it will still be locked. Rather than sleeping until someone
853 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 854 */
854void 855void
855xfs_buf_lock( 856xfs_buf_lock(
@@ -857,6 +858,8 @@ xfs_buf_lock(
857{ 858{
858 trace_xfs_buf_lock(bp, _RET_IP_); 859 trace_xfs_buf_lock(bp, _RET_IP_);
859 860
861 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
862 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 863 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 864 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 865 down(&bp->b_sema);
@@ -888,36 +891,6 @@ xfs_buf_unlock(
888 trace_xfs_buf_unlock(bp, _RET_IP_); 891 trace_xfs_buf_unlock(bp, _RET_IP_);
889} 892}
890 893
891
892/*
893 * Pinning Buffer Storage in Memory
894 * Ensure that no attempt to force a buffer to disk will succeed.
895 */
896void
897xfs_buf_pin(
898 xfs_buf_t *bp)
899{
900 trace_xfs_buf_pin(bp, _RET_IP_);
901 atomic_inc(&bp->b_pin_count);
902}
903
904void
905xfs_buf_unpin(
906 xfs_buf_t *bp)
907{
908 trace_xfs_buf_unpin(bp, _RET_IP_);
909
910 if (atomic_dec_and_test(&bp->b_pin_count))
911 wake_up_all(&bp->b_waiters);
912}
913
914int
915xfs_buf_ispin(
916 xfs_buf_t *bp)
917{
918 return atomic_read(&bp->b_pin_count);
919}
920
921STATIC void 894STATIC void
922xfs_buf_wait_unpin( 895xfs_buf_wait_unpin(
923 xfs_buf_t *bp) 896 xfs_buf_t *bp)
@@ -1007,25 +980,19 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 980 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 981 struct xfs_buf *bp)
1009{ 982{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 983 int error;
1011 int error = 0;
1012 984
1013 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 985 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 986 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 987 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 988
1019 xfs_buf_delwri_dequeue(bp); 989 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 990 xfs_bdstrat_cb(bp);
1021
1022 if (iowait) {
1023 error = xfs_buf_iowait(bp);
1024 if (error)
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1026 xfs_buf_relse(bp);
1027 }
1028 991
992 error = xfs_buf_iowait(bp);
993 if (error)
994 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
995 xfs_buf_relse(bp);
1029 return error; 996 return error;
1030} 997}
1031 998
@@ -1036,7 +1003,6 @@ xfs_bdwrite(
1036{ 1003{
1037 trace_xfs_buf_bdwrite(bp, _RET_IP_); 1004 trace_xfs_buf_bdwrite(bp, _RET_IP_);
1038 1005
1039 bp->b_strat = xfs_bdstrat_cb;
1040 bp->b_mount = mp; 1006 bp->b_mount = mp;
1041 1007
1042 bp->b_flags &= ~XBF_READ; 1008 bp->b_flags &= ~XBF_READ;
@@ -1071,7 +1037,6 @@ xfs_bioerror(
1071 XFS_BUF_UNDONE(bp); 1037 XFS_BUF_UNDONE(bp);
1072 XFS_BUF_STALE(bp); 1038 XFS_BUF_STALE(bp);
1073 1039
1074 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1075 xfs_biodone(bp); 1040 xfs_biodone(bp);
1076 1041
1077 return EIO; 1042 return EIO;
@@ -1101,7 +1066,6 @@ xfs_bioerror_relse(
1101 XFS_BUF_DONE(bp); 1066 XFS_BUF_DONE(bp);
1102 XFS_BUF_STALE(bp); 1067 XFS_BUF_STALE(bp);
1103 XFS_BUF_CLR_IODONE_FUNC(bp); 1068 XFS_BUF_CLR_IODONE_FUNC(bp);
1104 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
1105 if (!(fl & XBF_ASYNC)) { 1069 if (!(fl & XBF_ASYNC)) {
1106 /* 1070 /*
1107 * Mark b_error and B_ERROR _both_. 1071 * Mark b_error and B_ERROR _both_.
@@ -1307,8 +1271,19 @@ submit_io:
1307 if (size) 1271 if (size)
1308 goto next_chunk; 1272 goto next_chunk;
1309 } else { 1273 } else {
1310 bio_put(bio); 1274 /*
1275 * if we get here, no pages were added to the bio. However,
1276 * we can't just error out here - if the pages are locked then
1277 * we have to unlock them otherwise we can hang on a later
1278 * access to the page.
1279 */
1311 xfs_buf_ioerror(bp, EIO); 1280 xfs_buf_ioerror(bp, EIO);
1281 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1282 int i;
1283 for (i = 0; i < bp->b_page_count; i++)
1284 unlock_page(bp->b_pages[i]);
1285 }
1286 bio_put(bio);
1312 } 1287 }
1313} 1288}
1314 1289
@@ -1463,8 +1438,7 @@ xfs_alloc_bufhash(
1463{ 1438{
1464 unsigned int i; 1439 unsigned int i;
1465 1440
1466 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1441 btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */
1467 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1468 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * 1442 btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
1469 sizeof(xfs_bufhash_t)); 1443 sizeof(xfs_bufhash_t));
1470 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1444 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
@@ -1614,7 +1588,8 @@ xfs_mapping_buftarg(
1614 1588
1615STATIC int 1589STATIC int
1616xfs_alloc_delwrite_queue( 1590xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1591 xfs_buftarg_t *btp,
1592 const char *fsname)
1618{ 1593{
1619 int error = 0; 1594 int error = 0;
1620 1595
@@ -1622,7 +1597,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1597 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1598 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1599 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1600 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1601 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1602 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1603 goto out_error;
@@ -1635,7 +1610,8 @@ out_error:
1635xfs_buftarg_t * 1610xfs_buftarg_t *
1636xfs_alloc_buftarg( 1611xfs_alloc_buftarg(
1637 struct block_device *bdev, 1612 struct block_device *bdev,
1638 int external) 1613 int external,
1614 const char *fsname)
1639{ 1615{
1640 xfs_buftarg_t *btp; 1616 xfs_buftarg_t *btp;
1641 1617
@@ -1647,7 +1623,7 @@ xfs_alloc_buftarg(
1647 goto error; 1623 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1624 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1625 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1626 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1627 goto error;
1652 xfs_alloc_bufhash(btp, external); 1628 xfs_alloc_bufhash(btp, external);
1653 return btp; 1629 return btp;
@@ -1756,6 +1732,7 @@ xfs_buf_runall_queues(
1756 1732
1757STATIC int 1733STATIC int
1758xfsbufd_wakeup( 1734xfsbufd_wakeup(
1735 struct shrinker *shrink,
1759 int priority, 1736 int priority,
1760 gfp_t mask) 1737 gfp_t mask)
1761{ 1738{
@@ -1797,7 +1774,7 @@ xfs_buf_delwri_split(
1797 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1774 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1798 ASSERT(bp->b_flags & XBF_DELWRI); 1775 ASSERT(bp->b_flags & XBF_DELWRI);
1799 1776
1800 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1777 if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
1801 if (!force && 1778 if (!force &&
1802 time_before(jiffies, bp->b_queuetime + age)) { 1779 time_before(jiffies, bp->b_queuetime + age)) {
1803 xfs_buf_unlock(bp); 1780 xfs_buf_unlock(bp);
@@ -1882,7 +1859,7 @@ xfsbufd(
1882 struct xfs_buf *bp; 1859 struct xfs_buf *bp;
1883 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1860 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1884 list_del_init(&bp->b_list); 1861 list_del_init(&bp->b_list);
1885 xfs_buf_iostrategy(bp); 1862 xfs_bdstrat_cb(bp);
1886 count++; 1863 count++;
1887 } 1864 }
1888 if (count) 1865 if (count)
@@ -1929,7 +1906,7 @@ xfs_flush_buftarg(
1929 bp->b_flags &= ~XBF_ASYNC; 1906 bp->b_flags &= ~XBF_ASYNC;
1930 list_add(&bp->b_list, &wait_list); 1907 list_add(&bp->b_list, &wait_list);
1931 } 1908 }
1932 xfs_buf_iostrategy(bp); 1909 xfs_bdstrat_cb(bp);
1933 } 1910 }
1934 1911
1935 if (wait) { 1912 if (wait) {
@@ -1955,7 +1932,8 @@ xfs_buf_init(void)
1955 if (!xfs_buf_zone) 1932 if (!xfs_buf_zone)
1956 goto out; 1933 goto out;
1957 1934
1958 xfslogd_workqueue = create_workqueue("xfslogd"); 1935 xfslogd_workqueue = alloc_workqueue("xfslogd",
1936 WQ_RESCUER | WQ_HIGHPRI, 1);
1959 if (!xfslogd_workqueue) 1937 if (!xfslogd_workqueue)
1960 goto out_free_buf_zone; 1938 goto out_free_buf_zone;
1961 1939
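/*
 * Illustrative sketch (not from the patch): the extra argument to
 * xfsbufd_wakeup() above follows the 2.6.35+ shrinker API, which hands
 * the callback its own struct shrinker so per-instance state can be
 * recovered. A minimal registration looks like this:
 */
static int example_shrink(struct shrinker *shrink, int nr_to_scan,
			  gfp_t gfp_mask)
{
	/* nr_to_scan == 0 is a query for the cache size */
	if (!nr_to_scan)
		return 0;
	return -1;		/* nothing reclaimable in this example */
}

static struct shrinker example_shrinker = {
	.shrink	= example_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* register_shrinker(&example_shrinker) at init,
 * unregister_shrinker(&example_shrinker) at teardown. */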
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..2a05614f0b92 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -44,57 +44,57 @@ typedef enum {
44 XBRW_ZERO = 3, /* Zero target memory */ 44 XBRW_ZERO = 3, /* Zero target memory */
45} xfs_buf_rw_t; 45} xfs_buf_rw_t;
46 46
47typedef enum { 47#define XBF_READ (1 << 0) /* buffer intended for reading from device */
48 XBF_READ = (1 << 0), /* buffer intended for reading from device */ 48#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 49#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */
50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ 50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 54#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55#define XBF_ORDERED (1 << 11)/* use ordered writes */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
58 XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ 58
59 59/* flags used only as arguments to access routines */
60 /* flags used only as arguments to access routines */ 60#define XBF_LOCK (1 << 14)/* lock requested */
61 XBF_LOCK = (1 << 14), /* lock requested */ 61#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */
62 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 62#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
63 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 63
64 64/* flags used only internally */
65 /* flags used only internally */ 65#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
66 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 66#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
67 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 67#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
68 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 68#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
69 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 69
70 70/*
71 /* 71 * Special flag for supporting metadata blocks smaller than a FSB.
72 * Special flag for supporting metadata blocks smaller than a FSB. 72 *
73 * 73 * In this case we can have multiple xfs_buf_t on a single page and
74 * In this case we can have multiple xfs_buf_t on a single page and 74 * need to lock out concurrent xfs_buf_t readers as they only
75 * need to lock out concurrent xfs_buf_t readers as they only 75 * serialise access to the buffer.
76 * serialise access to the buffer. 76 *
77 * 77 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 78 * between reads of the page. Hence we can have one thread read the
79 * between reads of the page. Hence we can have one thread read the 79 * page and modify it, but then race with another thread that thinks
80 * page and modify it, but then race with another thread that thinks 80 * the page is not up-to-date and hence reads it again.
81 * the page is not up-to-date and hence reads it again. 81 *
82 * 82 * The result is that the first modifcation to the page is lost.
83 * The result is that the first modifcation to the page is lost. 83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * This sort of AGF/AGI reading race can happen when unlinking inodes 84 * that require truncation and results in the AGI unlinked list
85 * that require truncation and results in the AGI unlinked list 85 * modifications being lost.
86 * modifications being lost. 86 */
87 */ 87#define _XBF_PAGE_LOCKED (1 << 22)
88 _XBF_PAGE_LOCKED = (1 << 22), 88
89 89/*
90 /* 90 * If we try a barrier write, but it fails we have to communicate
91 * If we try a barrier write, but it fails we have to communicate 91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * this to the upper layers. Unfortunately b_error gets overwritten 92 * when the buffer is re-issued so we have to add another flag to
93 * when the buffer is re-issued so we have to add another flag to 93 * keep this information.
94 * keep this information. 94 */
95 */ 95#define _XFS_BARRIER_FAILED (1 << 23)
96 _XFS_BARRIER_FAILED = (1 << 23), 96
97} xfs_buf_flags_t; 97typedef unsigned int xfs_buf_flags_t;
98 98
99#define XFS_BUF_FLAGS \ 99#define XFS_BUF_FLAGS \
100 { XBF_READ, "READ" }, \ 100 { XBF_READ, "READ" }, \
@@ -137,7 +137,6 @@ typedef struct xfs_buftarg {
137 size_t bt_smask; 137 size_t bt_smask;
138 138
139 /* per device buffer hash table */ 139 /* per device buffer hash table */
140 uint bt_hashmask;
141 uint bt_hashshift; 140 uint bt_hashshift;
142 xfs_bufhash_t *bt_hash; 141 xfs_bufhash_t *bt_hash;
143 142
@@ -187,7 +186,6 @@ typedef struct xfs_buf {
187 atomic_t b_io_remaining; /* #outstanding I/O requests */ 186 atomic_t b_io_remaining; /* #outstanding I/O requests */
188 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 187 xfs_buf_iodone_t b_iodone; /* I/O completion function */
189 xfs_buf_relse_t b_relse; /* releasing function */ 188 xfs_buf_relse_t b_relse; /* releasing function */
190 xfs_buf_bdstrat_t b_strat; /* pre-write function */
191 struct completion b_iowait; /* queue for I/O waiters */ 189 struct completion b_iowait; /* queue for I/O waiters */
192 void *b_fspriv; 190 void *b_fspriv;
193 void *b_fspriv2; 191 void *b_fspriv2;
@@ -245,11 +243,6 @@ extern int xfs_buf_iowait(xfs_buf_t *);
245extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 243extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
246 xfs_buf_rw_t); 244 xfs_buf_rw_t);
247 245
248static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
249{
250 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
251}
252
253static inline int xfs_buf_geterror(xfs_buf_t *bp) 246static inline int xfs_buf_geterror(xfs_buf_t *bp)
254{ 247{
255 return bp ? bp->b_error : ENOMEM; 248 return bp ? bp->b_error : ENOMEM;
@@ -258,11 +251,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
258/* Buffer Utility Routines */ 251/* Buffer Utility Routines */
259extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 252extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
260 253
261/* Pinning Buffer Storage in Memory */
262extern void xfs_buf_pin(xfs_buf_t *);
263extern void xfs_buf_unpin(xfs_buf_t *);
264extern int xfs_buf_ispin(xfs_buf_t *);
265
266/* Delayed Write Buffer Routines */ 254/* Delayed Write Buffer Routines */
267extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 255extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
268extern void xfs_buf_delwri_promote(xfs_buf_t *); 256extern void xfs_buf_delwri_promote(xfs_buf_t *);
@@ -326,8 +314,6 @@ extern void xfs_buf_terminate(void);
326#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 314#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
327#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 315#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
328#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 316#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
329#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
330#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
331 317
332#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) 318#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
333#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 319#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
@@ -351,7 +337,7 @@ extern void xfs_buf_terminate(void);
351#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 337#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
352#define XFS_BUF_SET_REF(bp, ref) do { } while (0) 338#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
353 339
354#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) 340#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
355 341
356#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) 342#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
357#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 343#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
@@ -370,8 +356,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
370 xfs_buf_rele(bp); 356 xfs_buf_rele(bp);
371} 357}
372 358
373#define xfs_bpin(bp) xfs_buf_pin(bp)
374#define xfs_bunpin(bp) xfs_buf_unpin(bp)
375#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 359#define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
376 360
377#define xfs_biomove(bp, off, len, data, rw) \ 361#define xfs_biomove(bp, off, len, data, rw) \
@@ -390,7 +374,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 374/*
391 * Handling of buftargs. 375 * Handling of buftargs.
392 */ 376 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 377extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 378extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 379extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 380extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
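/*
 * Illustrative note (rationale assumed, not stated in the patch): with
 * the old enum, the result of OR-ing two flag values has plain integer
 * type rather than xfs_buf_flags_t, which tools like sparse flag as a
 * type mismatch. Defines over a typedef'd unsigned int keep flag
 * combinations warning-free:
 */
static inline xfs_buf_flags_t example_read_flags(void)
{
	return XBF_READ | XBF_LOCK | XBF_DONT_BLOCK;
}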
diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h
deleted file mode 100644
index a8b0b1685eed..000000000000
--- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DMAPI_PRIV_H__
19#define __XFS_DMAPI_PRIV_H__
20
21/*
22 * Based on IO_ISDIRECT, decide which i_ flag is set.
23 */
24#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
25 DM_FLAGS_IMUX : 0)
26#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX)
27
28#endif /*__XFS_DMAPI_PRIV_H__*/
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 846b75aeb2ab..3764d74790ec 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -23,13 +23,13 @@
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h" 25#include "xfs_dir2.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_export.h" 27#include "xfs_export.h"
29#include "xfs_vnodeops.h" 28#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 30#include "xfs_inode.h"
32#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_trace.h"
33 33
34/* 34/*
35 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
@@ -128,13 +128,11 @@ xfs_nfs_get_inode(
128 return ERR_PTR(-ESTALE); 128 return ERR_PTR(-ESTALE);
129 129
130 /* 130 /*
131 * The XFS_IGET_BULKSTAT means that an invalid inode number is just 131 * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
132 * fine and not an indication of a corrupted filesystem. Because 132 * fine and not an indication of a corrupted filesystem as clients can
133 * clients can send any kind of invalid file handle, e.g. after 133 * send invalid file handles and we have to handle it gracefully.
134 * a restore on the server we have to deal with this case gracefully.
135 */ 134 */
136 error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT, 135 error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
137 XFS_ILOCK_SHARED, &ip, 0);
138 if (error) { 136 if (error) {
139 /* 137 /*
140 * EINVAL means the inode cluster doesn't exist anymore. 138 * EINVAL means the inode cluster doesn't exist anymore.
@@ -149,11 +147,10 @@ xfs_nfs_get_inode(
149 } 147 }
150 148
151 if (ip->i_d.di_gen != generation) { 149 if (ip->i_d.di_gen != generation) {
152 xfs_iput_new(ip, XFS_ILOCK_SHARED); 150 IRELE(ip);
153 return ERR_PTR(-ENOENT); 151 return ERR_PTR(-ENOENT);
154 } 152 }
155 153
156 xfs_iunlock(ip, XFS_ILOCK_SHARED);
157 return VFS_I(ip); 154 return VFS_I(ip);
158} 155}
159 156
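The export-op rework above replaces the old seven-argument xfs_iget() call (bulkstat flag, shared ilock, trailing bno) with a single untrusted lookup. A minimal sketch of the resulting pattern, assuming the 2.6.36-era signature shown in the hunk:

	/*
	 * XFS_IGET_UNTRUSTED makes xfs_iget() cross-check the inode number
	 * against the inode allocation btree, so a forged or stale NFS file
	 * handle fails with EINVAL/ENOENT instead of tripping corruption
	 * checks. lock_flags is 0: no ilock is taken, which is why the
	 * xfs_iunlock()/xfs_iput_new() calls above could be dropped.
	 */
	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);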
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..ba8ad422a165 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -22,23 +22,15 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 26#include "xfs_mount.h"
29#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h" 28#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_dinode.h" 29#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
39#include "xfs_bmap.h" 32#include "xfs_bmap.h"
40#include "xfs_error.h" 33#include "xfs_error.h"
41#include "xfs_rw.h"
42#include "xfs_vnodeops.h" 34#include "xfs_vnodeops.h"
43#include "xfs_da_btree.h" 35#include "xfs_da_btree.h"
44#include "xfs_ioctl.h" 36#include "xfs_ioctl.h"
@@ -100,21 +92,23 @@ xfs_iozero(
100STATIC int 92STATIC int
101xfs_file_fsync( 93xfs_file_fsync(
102 struct file *file, 94 struct file *file,
103 struct dentry *dentry,
104 int datasync) 95 int datasync)
105{ 96{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 97 struct inode *inode = file->f_mapping->host;
98 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 99 struct xfs_trans *tp;
108 int error = 0; 100 int error = 0;
109 int log_flushed = 0; 101 int log_flushed = 0;
110 102
111 xfs_itrace_entry(ip); 103 trace_xfs_file_fsync(ip);
112 104
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 105 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO); 106 return -XFS_ERROR(EIO);
115 107
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 108 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 109
110 xfs_ioend_wait(ip);
111
118 /* 112 /*
119 * We always need to make sure that the required inode state is safe on 113 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 114 * disk. The inode might be clean but we still might need to force the
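The signature change tracks the 2.6.35 VFS update that dropped the unused dentry argument from ->fsync(); the inode is now reached through the file's mapping. The shape, as a sketch (example_fsync is a hypothetical name):

	static int example_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;

		/* flush file data, then force inode metadata to the log;
		 * with datasync set, pure timestamp updates may be skipped */
		return 0;
	}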
@@ -138,8 +132,8 @@ xfs_file_fsync(
138 * might get cleared when the inode gets written out via the AIL 132 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster. 133 * or xfs_iflush_cluster.
140 */ 134 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 135 if (((inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 136 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) { 137 ip->i_update_core) {
144 /* 138 /*
145 * Kick off a transaction to log the inode core to get the 139 * Kick off a transaction to log the inode core to get the
@@ -164,8 +158,7 @@ xfs_file_fsync(
164 * transaction. So we play it safe and fire off the 158 * transaction. So we play it safe and fire off the
165 * transaction anyway. 159 * transaction anyway.
166 */ 160 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 161 xfs_trans_ijoin(tp, ip);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 162 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp); 163 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed); 164 error = _xfs_trans_commit(tp, 0, &log_flushed);
@@ -273,20 +266,6 @@ xfs_file_aio_read(
273 mutex_lock(&inode->i_mutex); 266 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED); 267 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275 268
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) { 269 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) { 270 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip, 271 ret = -xfs_flushinval_pages(ip,
@@ -319,7 +298,6 @@ xfs_file_splice_read(
319 unsigned int flags) 298 unsigned int flags)
320{ 299{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); 300 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
323 int ioflags = 0; 301 int ioflags = 0;
324 ssize_t ret; 302 ssize_t ret;
325 303
@@ -333,18 +311,6 @@ xfs_file_splice_read(
333 311
334 xfs_ilock(ip, XFS_IOLOCK_SHARED); 312 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335 313
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 314 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349 315
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 316 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
@@ -365,7 +331,6 @@ xfs_file_splice_write(
365{ 331{
366 struct inode *inode = outfilp->f_mapping->host; 332 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode); 333 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size; 334 xfs_fsize_t isize, new_size;
370 int ioflags = 0; 335 int ioflags = 0;
371 ssize_t ret; 336 ssize_t ret;
@@ -380,18 +345,6 @@ xfs_file_splice_write(
380 345
381 xfs_ilock(ip, XFS_IOLOCK_EXCL); 346 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382 347
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count; 348 new_size = *ppos + count;
396 349
397 xfs_ilock(ip, XFS_ILOCK_EXCL); 350 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -461,7 +414,7 @@ xfs_zero_last_block(
461 last_fsb = XFS_B_TO_FSBT(mp, isize); 414 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1; 415 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 416 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL); 417 &nimaps, NULL);
465 if (error) { 418 if (error) {
466 return error; 419 return error;
467 } 420 }
@@ -556,7 +509,7 @@ xfs_zero_eof(
556 nimaps = 1; 509 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 510 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 511 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL); 512 0, NULL, 0, &imap, &nimaps, NULL);
560 if (error) { 513 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error; 515 return error;
@@ -625,7 +578,6 @@ xfs_file_aio_write(
625 int ioflags = 0; 578 int ioflags = 0;
626 xfs_fsize_t isize, new_size; 579 xfs_fsize_t isize, new_size;
627 int iolock; 580 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count; 581 size_t ocount = 0, count;
630 int need_i_mutex; 582 int need_i_mutex;
631 583
@@ -671,33 +623,6 @@ start:
671 goto out_unlock_mutex; 623 goto out_unlock_mutex;
672 } 624 }
673 625
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) { 626 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target = 627 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ? 628 XFS_IS_REALTIME_INODE(ip) ?
@@ -828,22 +753,6 @@ write_retry:
828 xfs_iunlock(ip, XFS_ILOCK_EXCL); 753 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 } 754 }
830 755
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret; 756 error = -ret;
848 if (ret <= 0) 757 if (ret <= 0)
849 goto out_unlock_internal; 758 goto out_unlock_internal;
@@ -866,7 +775,7 @@ write_retry:
866 mutex_lock(&inode->i_mutex); 775 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock); 776 xfs_ilock(ip, iolock);
868 777
869 error2 = -xfs_file_fsync(file, file->f_path.dentry, 778 error2 = -xfs_file_fsync(file,
870 (file->f_flags & __O_SYNC) ? 0 : 1); 779 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error) 780 if (!error)
872 error = error2; 781 error = error2;
@@ -1012,9 +921,6 @@ const struct file_operations xfs_file_operations = {
1012 .open = xfs_file_open, 921 .open = xfs_file_open,
1013 .release = xfs_file_release, 922 .release = xfs_file_release,
1014 .fsync = xfs_file_fsync, 923 .fsync = xfs_file_fsync,
1015#ifdef HAVE_FOP_OPEN_EXEC
1016 .open_exec = xfs_file_open_exec,
1017#endif
1018}; 924};
1019 925
1020const struct file_operations xfs_dir_file_operations = { 926const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index b6918d76bc7b..1f279b012f94 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -21,10 +21,6 @@
21#include "xfs_inode.h" 21#include "xfs_inode.h"
22#include "xfs_trace.h" 22#include "xfs_trace.h"
23 23
24int fs_noerr(void) { return 0; }
25int fs_nosys(void) { return ENOSYS; }
26void fs_noval(void) { return; }
27
28/* 24/*
29 * note: all filemap functions return negative error codes. These 25 * note: all filemap functions return negative error codes. These
30 * need to be inverted before returning to the xfs core functions. 26 * need to be inverted before returning to the xfs core functions.
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index 82bb19b2599e..000000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FS_SUBR_H__
19#define __XFS_FS_SUBR_H__
20
21extern int fs_noerr(void);
22extern int fs_nosys(void);
23extern void fs_noval(void);
24
25#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 7b26cc2fd284..3b9e626f7cd1 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,24 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_ioctl.h" 31#include "xfs_ioctl.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
41#include "xfs_itable.h" 33#include "xfs_itable.h"
42#include "xfs_error.h" 34#include "xfs_error.h"
43#include "xfs_rw.h"
44#include "xfs_attr.h" 35#include "xfs_attr.h"
45#include "xfs_bmap.h" 36#include "xfs_bmap.h"
46#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
@@ -527,6 +518,10 @@ xfs_attrmulti_by_handle(
527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 518 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
528 return -XFS_ERROR(EFAULT); 519 return -XFS_ERROR(EFAULT);
529 520
521 /* overflow check */
522 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
523 return -E2BIG;
524
530 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 525 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
531 if (IS_ERR(dentry)) 526 if (IS_ERR(dentry))
532 return PTR_ERR(dentry); 527 return PTR_ERR(dentry);
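The new opcount bound exists because the byte count for the per-op array is computed by multiplication; a huge opcount can wrap that arithmetic and under-allocate. A sketch of the failure mode the check prevents (size, ops and do_one_op are illustrative names, not from this patch):

	/* without the INT_MAX / sizeof() guard: */
	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);	/* may overflow */
	ops = kmalloc(size, GFP_KERNEL);	/* buffer comes up short... */
	for (i = 0; i < am_hreq.opcount; i++)
		do_one_op(&ops[i]);	/* ...and this loop runs past its end */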
@@ -675,10 +670,9 @@ xfs_ioc_bulkstat(
675 error = xfs_bulkstat_single(mp, &inlast, 670 error = xfs_bulkstat_single(mp, &inlast,
676 bulkreq.ubuffer, &done); 671 bulkreq.ubuffer, &done);
677 else /* XFS_IOC_FSBULKSTAT */ 672 else /* XFS_IOC_FSBULKSTAT */
678 error = xfs_bulkstat(mp, &inlast, &count, 673 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
679 (bulkstat_one_pf)xfs_bulkstat_one, NULL, 674 sizeof(xfs_bstat_t), bulkreq.ubuffer,
680 sizeof(xfs_bstat_t), bulkreq.ubuffer, 675 &done);
681 BULKSTAT_FG_QUICK, &done);
682 676
683 if (error) 677 if (error)
684 return -error; 678 return -error;
@@ -791,6 +785,8 @@ xfs_ioc_fsgetxattr(
791{ 785{
792 struct fsxattr fa; 786 struct fsxattr fa;
793 787
788 memset(&fa, 0, sizeof(struct fsxattr));
789
794 xfs_ilock(ip, XFS_ILOCK_SHARED); 790 xfs_ilock(ip, XFS_ILOCK_SHARED);
795 fa.fsx_xflags = xfs_ip2xflags(ip); 791 fa.fsx_xflags = xfs_ip2xflags(ip);
796 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; 792 fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
@@ -905,7 +901,7 @@ xfs_ioctl_setattr(
905 struct xfs_dquot *olddquot = NULL; 901 struct xfs_dquot *olddquot = NULL;
906 int code; 902 int code;
907 903
908 xfs_itrace_entry(ip); 904 trace_xfs_ioctl_setattr(ip);
909 905
910 if (mp->m_flags & XFS_MOUNT_RDONLY) 906 if (mp->m_flags & XFS_MOUNT_RDONLY)
911 return XFS_ERROR(EROFS); 907 return XFS_ERROR(EROFS);
@@ -913,6 +909,13 @@ xfs_ioctl_setattr(
913 return XFS_ERROR(EIO); 909 return XFS_ERROR(EIO);
914 910
915 /* 911 /*
912 * Disallow 32bit project ids because on-disk structure
913 * is 16bit only.
914 */
915 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
916 return XFS_ERROR(EINVAL);
917
918 /*
916 * If disk quotas is on, we make sure that the dquots do exist on disk, 919 * If disk quotas is on, we make sure that the dquots do exist on disk,
917 * before we start any other transactions. Trying to do this later 920 * before we start any other transactions. Trying to do this later
918 * is messy. We don't care to take a readlock to look at the ids 921 * is messy. We don't care to take a readlock to look at the ids
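(__uint16_t)-1 is simply the widest value the 16-bit on-disk project-id field can hold, so the check above reads more plainly as:

	if ((mask & FSX_PROJID) && (fa->fsx_projid > 0xffff))
		return XFS_ERROR(EINVAL);	/* id would truncate on disk */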
@@ -1040,8 +1043,7 @@ xfs_ioctl_setattr(
1040 } 1043 }
1041 } 1044 }
1042 1045
1043 xfs_trans_ijoin(tp, ip, lock_flags); 1046 xfs_trans_ijoin(tp, ip);
1044 xfs_trans_ihold(tp, ip);
1045 1047
1046 /* 1048 /*
1047 * Change file ownership. Must be the owner or privileged. 1049 * Change file ownership. Must be the owner or privileged.
@@ -1113,16 +1115,7 @@ xfs_ioctl_setattr(
1113 xfs_qm_dqrele(udqp); 1115 xfs_qm_dqrele(udqp);
1114 xfs_qm_dqrele(gdqp); 1116 xfs_qm_dqrele(gdqp);
1115 1117
1116 if (code) 1118 return code;
1117 return code;
1118
1119 if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
1120 XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
1121 NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
1122 (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
1123 }
1124
1125 return 0;
1126 1119
1127 error_return: 1120 error_return:
1128 xfs_qm_dqrele(udqp); 1121 xfs_qm_dqrele(udqp);
@@ -1298,7 +1291,7 @@ xfs_file_ioctl(
1298 if (filp->f_mode & FMODE_NOCMTIME) 1291 if (filp->f_mode & FMODE_NOCMTIME)
1299 ioflags |= IO_INVIS; 1292 ioflags |= IO_INVIS;
1300 1293
1301 xfs_itrace_entry(ip); 1294 trace_xfs_file_ioctl(ip);
1302 1295
1303 switch (cmd) { 1296 switch (cmd) {
1304 case XFS_IOC_ALLOCSP: 1297 case XFS_IOC_ALLOCSP:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 593c05b4df8d..6c83f7f62dc9 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -28,12 +28,8 @@
28#include "xfs_trans.h" 28#include "xfs_trans.h"
29#include "xfs_sb.h" 29#include "xfs_sb.h"
30#include "xfs_ag.h" 30#include "xfs_ag.h"
31#include "xfs_dir2.h"
32#include "xfs_dmapi.h"
33#include "xfs_mount.h" 31#include "xfs_mount.h"
34#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_vnode.h" 33#include "xfs_vnode.h"
38#include "xfs_dinode.h" 34#include "xfs_dinode.h"
39#include "xfs_inode.h" 35#include "xfs_inode.h"
@@ -237,15 +233,12 @@ xfs_bulkstat_one_compat(
237 xfs_ino_t ino, /* inode number to get data for */ 233 xfs_ino_t ino, /* inode number to get data for */
238 void __user *buffer, /* buffer to place output in */ 234 void __user *buffer, /* buffer to place output in */
239 int ubsize, /* size of buffer */ 235 int ubsize, /* size of buffer */
240 void *private_data, /* my private data */
241 xfs_daddr_t bno, /* starting bno of inode cluster */
242 int *ubused, /* bytes used by me */ 236 int *ubused, /* bytes used by me */
243 void *dibuff, /* on-disk inode buffer */
244 int *stat) /* BULKSTAT_RV_... */ 237 int *stat) /* BULKSTAT_RV_... */
245{ 238{
246 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, 239 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
247 xfs_bulkstat_one_fmt_compat, bno, 240 xfs_bulkstat_one_fmt_compat,
248 ubused, dibuff, stat); 241 ubused, stat);
249} 242}
250 243
251/* copied from xfs_ioctl.c */ 244/* copied from xfs_ioctl.c */
@@ -298,13 +291,11 @@ xfs_compat_ioc_bulkstat(
298 int res; 291 int res;
299 292
300 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, 293 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
301 sizeof(compat_xfs_bstat_t), 294 sizeof(compat_xfs_bstat_t), 0, &res);
302 NULL, 0, NULL, NULL, &res);
303 } else if (cmd == XFS_IOC_FSBULKSTAT_32) { 295 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 296 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one_compat, NULL, 297 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 298 bulkreq.ubuffer, &done);
307 BULKSTAT_FG_QUICK, &done);
308 } else 299 } else
309 error = XFS_ERROR(EINVAL); 300 error = XFS_ERROR(EINVAL);
310 if (error) 301 if (error)
@@ -420,6 +411,10 @@ xfs_compat_attrmulti_by_handle(
420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 411 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 412 return -XFS_ERROR(EFAULT);
422 413
414 /* overflow check */
415 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
416 return -E2BIG;
417
423 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 418 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
424 if (IS_ERR(dentry)) 419 if (IS_ERR(dentry))
425 return PTR_ERR(dentry); 420 return PTR_ERR(dentry);
@@ -545,7 +540,7 @@ xfs_file_compat_ioctl(
545 if (filp->f_mode & FMODE_NOCMTIME) 540 if (filp->f_mode & FMODE_NOCMTIME)
546 ioflags |= IO_INVIS; 541 ioflags |= IO_INVIS;
547 542
548 xfs_itrace_entry(ip); 543 trace_xfs_file_compat_ioctl(ip);
549 544
550 switch (cmd) { 545 switch (cmd) {
551 /* No size or alignment issues on any arch */ 546 /* No size or alignment issues on any arch */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e65a7937f3a4..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -24,21 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_alloc.h" 27#include "xfs_alloc.h"
29#include "xfs_dmapi.h"
30#include "xfs_quota.h" 28#include "xfs_quota.h"
31#include "xfs_mount.h" 29#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 31#include "xfs_dinode.h"
38#include "xfs_inode.h" 32#include "xfs_inode.h"
39#include "xfs_bmap.h" 33#include "xfs_bmap.h"
40#include "xfs_btree.h"
41#include "xfs_ialloc.h"
42#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
43#include "xfs_error.h" 35#include "xfs_error.h"
44#include "xfs_itable.h" 36#include "xfs_itable.h"
@@ -88,7 +80,7 @@ xfs_mark_inode_dirty_sync(
88{ 80{
89 struct inode *inode = VFS_I(ip); 81 struct inode *inode = VFS_I(ip);
90 82
91 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) 83 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
92 mark_inode_dirty_sync(inode); 84 mark_inode_dirty_sync(inode);
93} 85}
94 86
@@ -98,7 +90,7 @@ xfs_mark_inode_dirty(
98{ 90{
99 struct inode *inode = VFS_I(ip); 91 struct inode *inode = VFS_I(ip);
100 92
101 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) 93 if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
102 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
103} 95}
104 96
@@ -496,7 +488,7 @@ xfs_vn_getattr(
496 struct xfs_inode *ip = XFS_I(inode); 488 struct xfs_inode *ip = XFS_I(inode);
497 struct xfs_mount *mp = ip->i_mount; 489 struct xfs_mount *mp = ip->i_mount;
498 490
499 xfs_itrace_entry(ip); 491 trace_xfs_getattr(ip);
500 492
501 if (XFS_FORCED_SHUTDOWN(mp)) 493 if (XFS_FORCED_SHUTDOWN(mp))
502 return XFS_ERROR(EIO); 494 return XFS_ERROR(EIO);
@@ -548,21 +540,6 @@ xfs_vn_setattr(
548 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); 540 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
549} 541}
550 542
551/*
552 * block_truncate_page can return an error, but we can't propagate it
553 * at all here. Leave a complaint + stack trace in the syslog because
554 * this could be bad. If it is bad, we need to propagate the error further.
555 */
556STATIC void
557xfs_vn_truncate(
558 struct inode *inode)
559{
560 int error;
561 error = block_truncate_page(inode->i_mapping, inode->i_size,
562 xfs_get_blocks);
563 WARN_ON(error);
564}
565
566STATIC long 543STATIC long
567xfs_vn_fallocate( 544xfs_vn_fallocate(
568 struct inode *inode, 545 struct inode *inode,
@@ -585,11 +562,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 562 bf.l_len = len;
586 563
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 564 xfs_ilock(ip, XFS_IOLOCK_EXCL);
565
566 /* check the new inode size is valid before allocating */
567 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
568 offset + len > i_size_read(inode)) {
569 new_size = offset + len;
570 error = inode_newsize_ok(inode, new_size);
571 if (error)
572 goto out_unlock;
573 }
574
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 575 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 576 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 577 if (error)
591 offset + len > i_size_read(inode)) 578 goto out_unlock;
592 new_size = offset + len;
593 579
594 /* Change file size if needed */ 580 /* Change file size if needed */
595 if (new_size) { 581 if (new_size) {
@@ -600,6 +586,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 586 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 587 }
602 588
589out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 590 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 591out_error:
605 return error; 592 return error;
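inode_newsize_ok() is the generic VFS helper behind the new up-front check: it validates a prospective size against RLIMIT_FSIZE and sb->s_maxbytes (returning -EFBIG, and raising SIGXFSZ in the rlimit case) before any blocks are reserved. The pattern in isolation:

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + len > i_size_read(inode)) {
		error = inode_newsize_ok(inode, offset + len);
		if (error)
			goto out_unlock;	/* fail before allocating anything */
	}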
@@ -673,8 +660,11 @@ xfs_vn_fiemap(
673 bm.bmv_length = BTOBB(length); 660 bm.bmv_length = BTOBB(length);
674 661
675 /* We add one because in getbmap world count includes the header */ 662 /* We add one because in getbmap world count includes the header */
676 bm.bmv_count = fieinfo->fi_extents_max + 1; 663 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 bm.bmv_iflags = BMV_IF_PREALLOC; 664 fieinfo->fi_extents_max + 1;
665 bm.bmv_count = min_t(__s32, bm.bmv_count,
666 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
667 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
678 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 668 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
679 bm.bmv_iflags |= BMV_IF_ATTRFORK; 669 bm.bmv_iflags |= BMV_IF_ATTRFORK;
680 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) 670 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
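The min_t() clamp bounds the buffer a single xfs_getbmap() pass can require. Assuming 4 KiB pages and the 48-byte struct getbmapx of this era, the worked arithmetic is:

	/* bm.bmv_count <= PAGE_SIZE * 16 / sizeof(struct getbmapx)
	 *              =  4096 * 16 / 48
	 *              =  1365 map entries per call,
	 * so an unbounded FIEMAP request (fi_extents_max == 0, mapped to
	 * MAXEXTNUM above) cannot pin an arbitrarily large allocation. */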
@@ -689,7 +679,6 @@ xfs_vn_fiemap(
689 679
690static const struct inode_operations xfs_inode_operations = { 680static const struct inode_operations xfs_inode_operations = {
691 .check_acl = xfs_check_acl, 681 .check_acl = xfs_check_acl,
692 .truncate = xfs_vn_truncate,
693 .getattr = xfs_vn_getattr, 682 .getattr = xfs_vn_getattr,
694 .setattr = xfs_vn_setattr, 683 .setattr = xfs_vn_setattr,
695 .setxattr = generic_setxattr, 684 .setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index facfb323a706..2fa0bd9ebc7f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -87,7 +87,6 @@
87#include <xfs_aops.h> 87#include <xfs_aops.h>
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h>
91#include <xfs_buf.h> 90#include <xfs_buf.h>
92 91
93/* 92/*
@@ -157,8 +156,6 @@
157 */ 156 */
158#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) 157#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
159#define xfs_stack_trace() dump_stack() 158#define xfs_stack_trace() dump_stack()
160#define xfs_itruncate_data(ip, off) \
161 (-vmtruncate(VFS_I(ip), (off)))
162 159
163 160
164/* Move the kernel do_div definition off to one side */ 161/* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..29b9d642e93d 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -16,13 +16,12 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 19#include "xfs_sb.h"
21#include "xfs_inum.h" 20#include "xfs_inum.h"
21#include "xfs_log.h"
22#include "xfs_ag.h" 22#include "xfs_ag.h"
23#include "xfs_mount.h" 23#include "xfs_mount.h"
24#include "xfs_quota.h" 24#include "xfs_quota.h"
25#include "xfs_log.h"
26#include "xfs_trans.h" 25#include "xfs_trans.h"
27#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
28#include "xfs_inode.h" 27#include "xfs_inode.h"
@@ -69,15 +68,15 @@ xfs_fs_set_xstate(
69 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 68 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
70 return -ENOSYS; 69 return -ENOSYS;
71 70
72 if (uflags & XFS_QUOTA_UDQ_ACCT) 71 if (uflags & FS_QUOTA_UDQ_ACCT)
73 flags |= XFS_UQUOTA_ACCT; 72 flags |= XFS_UQUOTA_ACCT;
74 if (uflags & XFS_QUOTA_PDQ_ACCT) 73 if (uflags & FS_QUOTA_PDQ_ACCT)
75 flags |= XFS_PQUOTA_ACCT; 74 flags |= XFS_PQUOTA_ACCT;
76 if (uflags & XFS_QUOTA_GDQ_ACCT) 75 if (uflags & FS_QUOTA_GDQ_ACCT)
77 flags |= XFS_GQUOTA_ACCT; 76 flags |= XFS_GQUOTA_ACCT;
78 if (uflags & XFS_QUOTA_UDQ_ENFD) 77 if (uflags & FS_QUOTA_UDQ_ENFD)
79 flags |= XFS_UQUOTA_ENFD; 78 flags |= XFS_UQUOTA_ENFD;
80 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) 79 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
81 flags |= XFS_OQUOTA_ENFD; 80 flags |= XFS_OQUOTA_ENFD;
82 81
83 switch (op) { 82 switch (op) {
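Only the names change in this hunk: the quota-control flag constants moved from the XFS-private XFS_QUOTA_* spellings to the filesystem-neutral FS_QUOTA_* ones exported to userspace; it is a straight rename. For reference:

	/* XFS_QUOTA_UDQ_ACCT -> FS_QUOTA_UDQ_ACCT	user accounting
	 * XFS_QUOTA_UDQ_ENFD -> FS_QUOTA_UDQ_ENFD	user enforcement
	 * XFS_QUOTA_GDQ_ACCT -> FS_QUOTA_GDQ_ACCT	group accounting
	 * XFS_QUOTA_GDQ_ENFD -> FS_QUOTA_GDQ_ENFD	group enforcement
	 * XFS_QUOTA_PDQ_ACCT -> FS_QUOTA_PDQ_ACCT	project accounting
	 * XFS_QUOTA_PDQ_ENFD -> FS_QUOTA_PDQ_ENFD	project enforcement */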
@@ -97,7 +96,7 @@ xfs_fs_set_xstate(
97} 96}
98 97
99STATIC int 98STATIC int
100xfs_fs_get_xquota( 99xfs_fs_get_dqblk(
101 struct super_block *sb, 100 struct super_block *sb,
102 int type, 101 int type,
103 qid_t id, 102 qid_t id,
@@ -114,7 +113,7 @@ xfs_fs_get_xquota(
114} 113}
115 114
116STATIC int 115STATIC int
117xfs_fs_set_xquota( 116xfs_fs_set_dqblk(
118 struct super_block *sb, 117 struct super_block *sb,
119 int type, 118 int type,
120 qid_t id, 119 qid_t id,
@@ -135,6 +134,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 134const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 135 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 136 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 137 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 138 .set_dqblk = xfs_fs_set_dqblk,
140}; 139};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 29f1edca76de..a4e07974955b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -25,14 +25,11 @@
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 28#include "xfs_quota.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 31#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 33#include "xfs_dinode.h"
37#include "xfs_inode.h" 34#include "xfs_inode.h"
38#include "xfs_btree.h" 35#include "xfs_btree.h"
@@ -43,7 +40,6 @@
43#include "xfs_error.h" 40#include "xfs_error.h"
44#include "xfs_itable.h" 41#include "xfs_itable.h"
45#include "xfs_fsops.h" 42#include "xfs_fsops.h"
46#include "xfs_rw.h"
47#include "xfs_attr.h" 43#include "xfs_attr.h"
48#include "xfs_buf_item.h" 44#include "xfs_buf_item.h"
49#include "xfs_utils.h" 45#include "xfs_utils.h"
@@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool;
94#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and 90#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
95 * unwritten extent conversion */ 91 * unwritten extent conversion */
96#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ 92#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
97#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
98#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ 93#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
99#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ 94#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
100#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ 95#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
@@ -116,9 +111,8 @@ mempool_t *xfs_ioend_pool;
116#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 111#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
117#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 112#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
118#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 113#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 114#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 115#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122 116
123/* 117/*
124 * Table driven mount option parser. 118 * Table driven mount option parser.
@@ -170,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
170STATIC int 164STATIC int
171xfs_parseargs( 165xfs_parseargs(
172 struct xfs_mount *mp, 166 struct xfs_mount *mp,
173 char *options, 167 char *options)
174 char **mtpt)
175{ 168{
176 struct super_block *sb = mp->m_super; 169 struct super_block *sb = mp->m_super;
177 char *this_char, *value, *eov; 170 char *this_char, *value, *eov;
178 int dsunit = 0; 171 int dsunit = 0;
179 int dswidth = 0; 172 int dswidth = 0;
180 int iosize = 0; 173 int iosize = 0;
181 int dmapi_implies_ikeep = 1;
182 __uint8_t iosizelog = 0; 174 __uint8_t iosizelog = 0;
183 175
184 /* 176 /*
@@ -241,15 +233,10 @@ xfs_parseargs(
241 if (!mp->m_logname) 233 if (!mp->m_logname)
242 return ENOMEM; 234 return ENOMEM;
243 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 235 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
244 if (!value || !*value) { 236 cmn_err(CE_WARN,
245 cmn_err(CE_WARN, 237 "XFS: %s option not allowed on this system",
246 "XFS: %s option requires an argument", 238 this_char);
247 this_char); 239 return EINVAL;
248 return EINVAL;
249 }
250 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
251 if (!*mtpt)
252 return ENOMEM;
253 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 240 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
254 if (!value || !*value) { 241 if (!value || !*value) {
255 cmn_err(CE_WARN, 242 cmn_err(CE_WARN,
@@ -286,8 +273,6 @@ xfs_parseargs(
286 mp->m_flags &= ~XFS_MOUNT_GRPID; 273 mp->m_flags &= ~XFS_MOUNT_GRPID;
287 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 274 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
288 mp->m_flags |= XFS_MOUNT_WSYNC; 275 mp->m_flags |= XFS_MOUNT_WSYNC;
289 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
290 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
291 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 276 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
292 mp->m_flags |= XFS_MOUNT_NORECOVERY; 277 mp->m_flags |= XFS_MOUNT_NORECOVERY;
293 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 278 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
@@ -327,7 +312,6 @@ xfs_parseargs(
327 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 312 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
328 mp->m_flags |= XFS_MOUNT_IKEEP; 313 mp->m_flags |= XFS_MOUNT_IKEEP;
329 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 314 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
330 dmapi_implies_ikeep = 0;
331 mp->m_flags &= ~XFS_MOUNT_IKEEP; 315 mp->m_flags &= ~XFS_MOUNT_IKEEP;
332 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 316 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
333 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; 317 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
@@ -368,19 +352,22 @@ xfs_parseargs(
368 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 352 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 353 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
370 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 354 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
371 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 355 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
372 mp->m_flags |= XFS_MOUNT_DMAPI; 356 mp->m_flags |= XFS_MOUNT_DELAYLOG;
373 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 357 cmn_err(CE_WARN,
374 mp->m_flags |= XFS_MOUNT_DMAPI; 358 "Enabling EXPERIMENTAL delayed logging feature "
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 359 "- use at your own risk.\n");
376 mp->m_flags |= XFS_MOUNT_DMAPI; 360 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
361 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 362 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 363 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 364 "XFS: ihashsize no longer used, option is deprecated.");
380 } else if (!strcmp(this_char, "osyncisdsync")) { 365 } else if (!strcmp(this_char, "osyncisdsync")) {
381 /* no-op, this is now the default */
382 cmn_err(CE_WARN, 366 cmn_err(CE_WARN,
383 "XFS: osyncisdsync is now the default, option is deprecated."); 367 "XFS: osyncisdsync has no effect, option is deprecated.");
368 } else if (!strcmp(this_char, "osyncisosync")) {
369 cmn_err(CE_WARN,
370 "XFS: osyncisosync has no effect, option is deprecated.");
384 } else if (!strcmp(this_char, "irixsgid")) { 371 } else if (!strcmp(this_char, "irixsgid")) {
385 cmn_err(CE_WARN, 372 cmn_err(CE_WARN,
386 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 373 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
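delaylog/nodelaylog toggle the delayed logging mode that was still experimental in this release: metadata changes are aggregated in memory and committed to the on-disk log in larger batches, hence the loud warning at mount time. Shell usage, sketched as a comment (device path hypothetical):

	/* # mount -t xfs -o delaylog /dev/sdb1 /mnt
	 * (prints the EXPERIMENTAL warning added above; "nodelaylog"
	 * restores the traditional logging path) */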
@@ -421,12 +408,6 @@ xfs_parseargs(
421 return EINVAL; 408 return EINVAL;
422 } 409 }
423 410
424 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
425 printk("XFS: %s option needs the mount point option as well\n",
426 MNTOPT_DMAPI);
427 return EINVAL;
428 }
429
430 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 411 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
431 cmn_err(CE_WARN, 412 cmn_err(CE_WARN,
432 "XFS: sunit and swidth must be specified together"); 413 "XFS: sunit and swidth must be specified together");
@@ -440,18 +421,6 @@ xfs_parseargs(
440 return EINVAL; 421 return EINVAL;
441 } 422 }
442 423
443 /*
444 * Applications using DMI filesystems often expect the
445 * inode generation number to be monotonically increasing.
446 * If we delete inode chunks we break this assumption, so
447 * keep unused inode chunks on disk for DMI filesystems
448 * until we come up with a better solution.
449 * Note that if "ikeep" or "noikeep" mount options are
450 * supplied, then they are honored.
451 */
452 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
453 mp->m_flags |= XFS_MOUNT_IKEEP;
454
455done: 424done:
456 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 425 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
457 /* 426 /*
@@ -530,11 +499,10 @@ xfs_showargs(
530 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, 499 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
531 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, 500 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
532 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, 501 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
533 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
534 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, 502 { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 503 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 504 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
505 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 506 { 0, NULL }
539 }; 507 };
540 static struct proc_xfs_info xfs_info_unset[] = { 508 static struct proc_xfs_info xfs_info_unset[] = {
@@ -725,7 +693,8 @@ void
725xfs_blkdev_issue_flush( 693xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 694 xfs_buftarg_t *buftarg)
727{ 695{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 696 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
697 BLKDEV_IFL_WAIT);
729} 698}
730 699
731STATIC void 700STATIC void
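The extra arguments follow the 2.6.35 block-layer change to blkdev_issue_flush(); BLKDEV_IFL_WAIT keeps the call synchronous, matching the old two-argument behaviour. The signature assumed here:

	/* int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
	 *			  sector_t *error_sector, unsigned long flags); */
	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);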
@@ -789,18 +758,18 @@ xfs_open_devices(
789 * Setup xfs_mount buffer target pointers 758 * Setup xfs_mount buffer target pointers
790 */ 759 */
791 error = ENOMEM; 760 error = ENOMEM;
792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 761 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
793 if (!mp->m_ddev_targp) 762 if (!mp->m_ddev_targp)
794 goto out_close_rtdev; 763 goto out_close_rtdev;
795 764
796 if (rtdev) { 765 if (rtdev) {
797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 766 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
798 if (!mp->m_rtdev_targp) 767 if (!mp->m_rtdev_targp)
799 goto out_free_ddev_targ; 768 goto out_free_ddev_targ;
800 } 769 }
801 770
802 if (logdev && logdev != ddev) { 771 if (logdev && logdev != ddev) {
803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 772 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
804 if (!mp->m_logdev_targp) 773 if (!mp->m_logdev_targp)
805 goto out_free_rtdev_targ; 774 goto out_free_rtdev_targ;
806 } else { 775 } else {
@@ -902,7 +871,8 @@ xfsaild_start(
902 struct xfs_ail *ailp) 871 struct xfs_ail *ailp)
903{ 872{
904 ailp->xa_target = 0; 873 ailp->xa_target = 0;
905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 874 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
875 ailp->xa_mount->m_fsname);
906 if (IS_ERR(ailp->xa_task)) 876 if (IS_ERR(ailp->xa_task))
907 return -PTR_ERR(ailp->xa_task); 877 return -PTR_ERR(ailp->xa_task);
908 return 0; 878 return 0;
@@ -935,7 +905,7 @@ xfs_fs_destroy_inode(
935{ 905{
936 struct xfs_inode *ip = XFS_I(inode); 906 struct xfs_inode *ip = XFS_I(inode);
937 907
938 xfs_itrace_entry(ip); 908 trace_xfs_destroy_inode(ip);
939 909
940 XFS_STATS_INC(vn_reclaim); 910 XFS_STATS_INC(vn_reclaim);
941 911
@@ -1051,10 +1021,8 @@ xfs_log_inode(
1051 * an inode in another recent transaction. So we play it safe and 1021 * an inode in another recent transaction. So we play it safe and
1052 * fire off the transaction anyway. 1022 * fire off the transaction anyway.
1053 */ 1023 */
1054 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 1024 xfs_trans_ijoin(tp, ip);
1055 xfs_trans_ihold(tp, ip);
1056 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1025 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1057 xfs_trans_set_sync(tp);
1058 error = xfs_trans_commit(tp, 0); 1026 error = xfs_trans_commit(tp, 0);
1059 xfs_ilock_demote(ip, XFS_ILOCK_EXCL); 1027 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1060 1028
@@ -1070,28 +1038,20 @@ xfs_fs_write_inode(
1070 struct xfs_mount *mp = ip->i_mount; 1038 struct xfs_mount *mp = ip->i_mount;
1071 int error = EAGAIN; 1039 int error = EAGAIN;
1072 1040
1073 xfs_itrace_entry(ip); 1041 trace_xfs_write_inode(ip);
1074 1042
1075 if (XFS_FORCED_SHUTDOWN(mp)) 1043 if (XFS_FORCED_SHUTDOWN(mp))
1076 return XFS_ERROR(EIO); 1044 return XFS_ERROR(EIO);
1077 1045
1078 if (wbc->sync_mode == WB_SYNC_ALL) { 1046 if (wbc->sync_mode == WB_SYNC_ALL) {
1079 /* 1047 /*
1080 * Make sure the inode has hit stable storage. By using the 1048 * Make sure the inode has made it into the log. Instead
1081 * log and the fsync transactions we reduce the IOs we have 1049 * of forcing it all the way to stable storage using a
1082 * to do here from two (log and inode) to just the log. 1050 * synchronous transaction we let the log force inside the
1083 * 1051 * ->sync_fs call do that for us, which reduces the number
1084 * Note: We still need to do a delwri write of the inode after 1052 * of synchronous log forces dramatically.
1085 * this to flush it to the backing buffer so that bulkstat
1086 * works properly if this is the first time the inode has been
1087 * written. Because we hold the ilock atomically over the
1088 * transaction commit and the inode flush we are guaranteed
1089 * that the inode is not pinned when it returns. If the flush
1090 * lock is already held, then the inode has already been
1091 * flushed once and we don't need to flush it again. Hence
1092 * the code will only flush the inode if it isn't already
1093 * being flushed.
1094 */ 1053 */
1054 xfs_ioend_wait(ip);
1095 xfs_ilock(ip, XFS_ILOCK_SHARED); 1055 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) { 1056 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip); 1057 error = xfs_log_inode(ip);
@@ -1103,27 +1063,29 @@ xfs_fs_write_inode(
1103 * We make this non-blocking if the inode is contended, return 1063 * We make this non-blocking if the inode is contended, return
1104 * EAGAIN to indicate to the caller that they did not succeed. 1064 * EAGAIN to indicate to the caller that they did not succeed.
1105 * This prevents the flush path from blocking on inodes inside 1065 * This prevents the flush path from blocking on inodes inside
1106 * another operation right now, they get caught later by xfs_sync. 1066 * another operation right now, they get caught later by
1067 * xfs_sync.
1107 */ 1068 */
1108 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 1069 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1109 goto out; 1070 goto out;
1110 }
1111 1071
1112 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) 1072 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
1113 goto out_unlock; 1073 goto out_unlock;
1114 1074
1115 /* 1075 /*
1116 * Now we have the flush lock and the inode is not pinned, we can check 1076 * Now we have the flush lock and the inode is not pinned, we
1117 * if the inode is really clean as we know that there are no pending 1077 * can check if the inode is really clean as we know that
1118 * transaction completions, it is not waiting on the delayed write 1078 * there are no pending transaction completions, it is not
1119 * queue and there is no IO in progress. 1079 * waiting on the delayed write queue and there is no IO in
1120 */ 1080 * progress.
1121 if (xfs_inode_clean(ip)) { 1081 */
1122 xfs_ifunlock(ip); 1082 if (xfs_inode_clean(ip)) {
1123 error = 0; 1083 xfs_ifunlock(ip);
1124 goto out_unlock; 1084 error = 0;
1085 goto out_unlock;
1086 }
1087 error = xfs_iflush(ip, 0);
1125 } 1088 }
1126 error = xfs_iflush(ip, 0);
1127 1089
1128 out_unlock: 1090 out_unlock:
1129 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1091 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -1138,12 +1100,15 @@ xfs_fs_write_inode(
1138} 1100}
1139 1101
1140STATIC void 1102STATIC void
1141xfs_fs_clear_inode( 1103xfs_fs_evict_inode(
1142 struct inode *inode) 1104 struct inode *inode)
1143{ 1105{
1144 xfs_inode_t *ip = XFS_I(inode); 1106 xfs_inode_t *ip = XFS_I(inode);
1145 1107
1146 xfs_itrace_entry(ip); 1108 trace_xfs_evict_inode(ip);
1109
1110 truncate_inode_pages(&inode->i_data, 0);
1111 end_writeback(inode);
1147 XFS_STATS_INC(vn_rele); 1112 XFS_STATS_INC(vn_rele);
1148 XFS_STATS_INC(vn_remove); 1113 XFS_STATS_INC(vn_remove);
1149 XFS_STATS_DEC(vn_active); 1114 XFS_STATS_DEC(vn_active);
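The rename tracks the 2.6.36 VFS change that folded ->clear_inode()/->delete_inode() into a single ->evict_inode(), with the filesystem now responsible for dropping the page cache and completing writeback itself. The minimal skeleton such an implementation follows (example_evict_inode is a hypothetical name):

	static void example_evict_inode(struct inode *inode)
	{
		truncate_inode_pages(&inode->i_data, 0);	/* drop page cache */
		end_writeback(inode);		/* replaces clear_inode() */
		/* filesystem-specific teardown follows */
	}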
@@ -1180,22 +1145,13 @@ xfs_fs_put_super(
1180{ 1145{
1181 struct xfs_mount *mp = XFS_M(sb); 1146 struct xfs_mount *mp = XFS_M(sb);
1182 1147
1148 /*
1149 * Unregister the memory shrinker before we tear down the mount
1150 * structure so we don't have memory reclaim racing with us here.
1151 */
1152 xfs_inode_shrinker_unregister(mp);
1183 xfs_syncd_stop(mp); 1153 xfs_syncd_stop(mp);
1184 1154
1185 if (!(sb->s_flags & MS_RDONLY)) {
1186 /*
1187 * XXX(hch): this should be SYNC_WAIT.
1188 *
1189 * Or more likely not needed at all because the VFS is already
1190 * calling ->sync_fs after shutting down all filestem
1191 * operations and just before calling ->put_super.
1192 */
1193 xfs_sync_data(mp, 0);
1194 xfs_sync_attr(mp, 0);
1195 }
1196
1197 XFS_SEND_PREUNMOUNT(mp);
1198
1199 /* 1155 /*
1200 * Blow away any referenced inode in the filestreams cache. 1156 * Blow away any referenced inode in the filestreams cache.
1201 * This can and will cause log traffic as inodes go inactive 1157 * This can and will cause log traffic as inodes go inactive
@@ -1205,14 +1161,10 @@ xfs_fs_put_super(
1205 1161
1206 XFS_bflush(mp->m_ddev_targp); 1162 XFS_bflush(mp->m_ddev_targp);
1207 1163
1208 XFS_SEND_UNMOUNT(mp);
1209
1210 xfs_unmountfs(mp); 1164 xfs_unmountfs(mp);
1211 xfs_freesb(mp); 1165 xfs_freesb(mp);
1212 xfs_inode_shrinker_unregister(mp);
1213 xfs_icsb_destroy_counters(mp); 1166 xfs_icsb_destroy_counters(mp);
1214 xfs_close_devices(mp); 1167 xfs_close_devices(mp);
1215 xfs_dmops_put(mp);
1216 xfs_free_fsname(mp); 1168 xfs_free_fsname(mp);
1217 kfree(mp); 1169 kfree(mp);
1218} 1170}
@@ -1274,6 +1226,7 @@ xfs_fs_statfs(
1274 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1226 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1275 __uint64_t fakeinos, id; 1227 __uint64_t fakeinos, id;
1276 xfs_extlen_t lsize; 1228 xfs_extlen_t lsize;
1229 __int64_t ffree;
1277 1230
1278 statp->f_type = XFS_SB_MAGIC; 1231 statp->f_type = XFS_SB_MAGIC;
1279 statp->f_namelen = MAXNAMELEN - 1; 1232 statp->f_namelen = MAXNAMELEN - 1;
@@ -1297,7 +1250,11 @@ xfs_fs_statfs(
1297 statp->f_files = min_t(typeof(statp->f_files), 1250 statp->f_files = min_t(typeof(statp->f_files),
1298 statp->f_files, 1251 statp->f_files,
1299 mp->m_maxicount); 1252 mp->m_maxicount);
1300 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1253
1254 /* make sure statp->f_ffree does not underflow */
1255 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1256 statp->f_ffree = max_t(__int64_t, ffree, 0);
1257
1301 spin_unlock(&mp->m_sb_lock); 1258 spin_unlock(&mp->m_sb_lock);
1302 1259
1303 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || 1260 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
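The wrap is real: f_files is clamped to m_maxicount just above, while the used-inode count (sb_icount - sb_ifree) is not, so lowering imaxpct on a filesystem that has already allocated more inodes makes the unsigned subtraction go negative. A worked example:

	/* Say f_files is clamped to 1000 while 1200 inodes are in use:
	 *   before: statp->f_ffree = 1000 - 1200	wraps to ~1.8e19
	 *   after:  ffree = -200 in a signed 64-bit temp, clamped to 0 */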
@@ -1450,7 +1407,7 @@ xfs_fs_freeze(
1450 1407
1451 xfs_save_resvblks(mp); 1408 xfs_save_resvblks(mp);
1452 xfs_quiesce_attr(mp); 1409 xfs_quiesce_attr(mp);
1453 return -xfs_fs_log_dummy(mp); 1410 return -xfs_fs_log_dummy(mp, SYNC_WAIT);
1454} 1411}
1455 1412
1456STATIC int 1413STATIC int
@@ -1530,7 +1487,6 @@ xfs_fs_fill_super(
1530 struct inode *root; 1487 struct inode *root;
1531 struct xfs_mount *mp = NULL; 1488 struct xfs_mount *mp = NULL;
1532 int flags = 0, error = ENOMEM; 1489 int flags = 0, error = ENOMEM;
1533 char *mtpt = NULL;
1534 1490
1535 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1491 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1536 if (!mp) 1492 if (!mp)
@@ -1546,7 +1502,7 @@ xfs_fs_fill_super(
1546 mp->m_super = sb; 1502 mp->m_super = sb;
1547 sb->s_fs_info = mp; 1503 sb->s_fs_info = mp;
1548 1504
1549 error = xfs_parseargs(mp, (char *)data, &mtpt); 1505 error = xfs_parseargs(mp, (char *)data);
1550 if (error) 1506 if (error)
1551 goto out_free_fsname; 1507 goto out_free_fsname;
1552 1508
@@ -1558,16 +1514,12 @@ xfs_fs_fill_super(
1558#endif 1514#endif
1559 sb->s_op = &xfs_super_operations; 1515 sb->s_op = &xfs_super_operations;
1560 1516
1561 error = xfs_dmops_get(mp);
1562 if (error)
1563 goto out_free_fsname;
1564
1565 if (silent) 1517 if (silent)
1566 flags |= XFS_MFSI_QUIET; 1518 flags |= XFS_MFSI_QUIET;
1567 1519
1568 error = xfs_open_devices(mp); 1520 error = xfs_open_devices(mp);
1569 if (error) 1521 if (error)
1570 goto out_put_dmops; 1522 goto out_free_fsname;
1571 1523
1572 if (xfs_icsb_init_counters(mp)) 1524 if (xfs_icsb_init_counters(mp))
1573 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1525 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1595,8 +1547,6 @@ xfs_fs_fill_super(
1595 if (error) 1547 if (error)
1596 goto out_filestream_unmount; 1548 goto out_filestream_unmount;
1597 1549
-	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1625,7 +1575,6 @@ xfs_fs_fill_super(
 
 	xfs_inode_shrinker_register(mp);
 
-	kfree(mtpt);
 	return 0;
 
 out_filestream_unmount:
@@ -1635,11 +1584,8 @@ xfs_fs_fill_super(
 out_destroy_counters:
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
-out_put_dmops:
-	xfs_dmops_put(mp);
 out_free_fsname:
 	xfs_free_fsname(mp);
-	kfree(mtpt);
 	kfree(mp);
 out:
 	return -error;
@@ -1683,7 +1629,7 @@ static const struct super_operations xfs_super_operations = {
 	.destroy_inode		= xfs_fs_destroy_inode,
 	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
-	.clear_inode		= xfs_fs_clear_inode,
+	.evict_inode		= xfs_fs_evict_inode,
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
@@ -1746,16 +1692,22 @@ xfs_init_zones(void)
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
+	xfs_log_item_desc_zone =
+		kmem_zone_init(sizeof(struct xfs_log_item_desc),
+			       "xfs_log_item_desc");
+	if (!xfs_log_item_desc_zone)
+		goto out_destroy_trans_zone;
+
 	/*
 	 * The size of the zone allocated buf log item is the maximum
 	 * size possible under XFS. This wastes a little bit of memory,
 	 * but it is much faster.
 	 */
 	xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				(((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
 				  NBWORD) * sizeof(int))), "xfs_buf_item");
 	if (!xfs_buf_item_zone)
-		goto out_destroy_trans_zone;
+		goto out_destroy_log_item_desc_zone;
 
 	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1792,6 +1744,8 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_efd_zone);
 out_destroy_buf_item_zone:
 	kmem_zone_destroy(xfs_buf_item_zone);
+out_destroy_log_item_desc_zone:
+	kmem_zone_destroy(xfs_log_item_desc_zone);
 out_destroy_trans_zone:
 	kmem_zone_destroy(xfs_trans_zone);
 out_destroy_ifork_zone:
@@ -1822,6 +1776,7 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_efd_zone);
 	kmem_zone_destroy(xfs_buf_item_zone);
+	kmem_zone_destroy(xfs_log_item_desc_zone);
 	kmem_zone_destroy(xfs_trans_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_dabuf_zone);
@@ -1870,7 +1825,6 @@ init_xfs_fs(void)
 		goto out_cleanup_procfs;
 
 	vfs_initquota();
-	xfs_inode_shrinker_init();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1898,7 +1852,6 @@ exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
-	xfs_inode_shrinker_destroy();
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..1ef4a4d2d997 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -56,12 +56,6 @@ extern void xfs_qm_exit(void);
 # define XFS_BIGFS_STRING
 #endif
 
-#ifdef CONFIG_XFS_DMAPI
-# define XFS_DMAPI_STRING	"dmapi support, "
-#else
-# define XFS_DMAPI_STRING
-#endif
-
 #ifdef DEBUG
 # define XFS_DBG_STRING		"debug"
 #else
@@ -72,7 +66,6 @@ extern void xfs_qm_exit(void);
 		XFS_SECURITY_STRING \
 		XFS_REALTIME_STRING \
 		XFS_BIGFS_STRING \
-		XFS_DMAPI_STRING \
 		XFS_DBG_STRING /* DBG must be last */
 
 struct xfs_inode;
@@ -85,7 +78,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
-extern struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler *xfs_xattr_handlers[];
 extern const struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a427c638d909..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -24,27 +24,17 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h" 29#include "xfs_inode.h"
37#include "xfs_dinode.h" 30#include "xfs_dinode.h"
38#include "xfs_error.h" 31#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h" 32#include "xfs_filestream.h"
41#include "xfs_vnodeops.h" 33#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46#include "xfs_quota.h" 35#include "xfs_quota.h"
47#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
48 38
49#include <linux/kthread.h> 39#include <linux/kthread.h>
50#include <linux/freezer.h> 40#include <linux/freezer.h>
@@ -144,6 +134,41 @@ restart:
144 return last_error; 134 return last_error;
145} 135}
146 136
137/*
138 * Select the next per-ag structure to iterate during the walk. The reclaim
139 * walk is optimised only to walk AGs with reclaimable inodes in them.
140 */
141static struct xfs_perag *
142xfs_inode_ag_iter_next_pag(
143 struct xfs_mount *mp,
144 xfs_agnumber_t *first,
145 int tag)
146{
147 struct xfs_perag *pag = NULL;
148
149 if (tag == XFS_ICI_RECLAIM_TAG) {
150 int found;
151 int ref;
152
153 spin_lock(&mp->m_perag_lock);
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, *first, 1, tag);
156 if (found <= 0) {
157 spin_unlock(&mp->m_perag_lock);
158 return NULL;
159 }
160 *first = pag->pag_agno + 1;
161 /* open coded pag reference increment */
162 ref = atomic_inc_return(&pag->pag_ref);
163 spin_unlock(&mp->m_perag_lock);
164 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
165 } else {
166 pag = xfs_perag_get(mp, *first);
167 (*first)++;
168 }
169 return pag;
170}
171
147int 172int
148xfs_inode_ag_iterator( 173xfs_inode_ag_iterator(
149 struct xfs_mount *mp, 174 struct xfs_mount *mp,
@@ -154,20 +179,15 @@ xfs_inode_ag_iterator(
154 int exclusive, 179 int exclusive,
155 int *nr_to_scan) 180 int *nr_to_scan)
156{ 181{
182 struct xfs_perag *pag;
157 int error = 0; 183 int error = 0;
158 int last_error = 0; 184 int last_error = 0;
159 xfs_agnumber_t ag; 185 xfs_agnumber_t ag;
160 int nr; 186 int nr;
161 187
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX; 188 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 189 ag = 0;
164 struct xfs_perag *pag; 190 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
165
166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 191 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 192 exclusive, &nr);
173 xfs_perag_put(pag); 193 xfs_perag_put(pag);
@@ -289,7 +309,7 @@ xfs_sync_inode_attr(
289/* 309/*
290 * Write out pagecache data for the whole filesystem. 310 * Write out pagecache data for the whole filesystem.
291 */ 311 */
292int 312STATIC int
293xfs_sync_data( 313xfs_sync_data(
294 struct xfs_mount *mp, 314 struct xfs_mount *mp,
295 int flags) 315 int flags)
@@ -310,7 +330,7 @@ xfs_sync_data(
310/* 330/*
311 * Write out inode metadata (attributes) for the whole filesystem. 331 * Write out inode metadata (attributes) for the whole filesystem.
312 */ 332 */
313int 333STATIC int
314xfs_sync_attr( 334xfs_sync_attr(
315 struct xfs_mount *mp, 335 struct xfs_mount *mp,
316 int flags) 336 int flags)
@@ -322,102 +342,24 @@ xfs_sync_attr(
322} 342}
323 343
324STATIC int 344STATIC int
325xfs_commit_dummy_trans(
326 struct xfs_mount *mp,
327 uint flags)
328{
329 struct xfs_inode *ip = mp->m_rootip;
330 struct xfs_trans *tp;
331 int error;
332
333 /*
334 * Put a dummy transaction in the log to tell recovery
335 * that all others are OK.
336 */
337 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
338 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
339 if (error) {
340 xfs_trans_cancel(tp, 0);
341 return error;
342 }
343
344 xfs_ilock(ip, XFS_ILOCK_EXCL);
345
346 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
347 xfs_trans_ihold(tp, ip);
348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
349 error = xfs_trans_commit(tp, 0);
350 xfs_iunlock(ip, XFS_ILOCK_EXCL);
351
352 /* the log force ensures this transaction is pushed to disk */
353 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
354 return error;
355}
356
357STATIC int
358xfs_sync_fsdata( 345xfs_sync_fsdata(
359 struct xfs_mount *mp, 346 struct xfs_mount *mp)
360 int flags)
361{ 347{
362 struct xfs_buf *bp; 348 struct xfs_buf *bp;
363 struct xfs_buf_log_item *bip;
364 int error = 0;
365
366 /*
367 * If this is xfssyncd() then only sync the superblock if we can
368 * lock it without sleeping and it is not pinned.
369 */
370 if (flags & SYNC_TRYLOCK) {
371 ASSERT(!(flags & SYNC_WAIT));
372
373 bp = xfs_getsb(mp, XBF_TRYLOCK);
374 if (!bp)
375 goto out;
376
377 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
378 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
379 goto out_brelse;
380 } else {
381 bp = xfs_getsb(mp, 0);
382
383 /*
384 * If the buffer is pinned then push on the log so we won't
385 * get stuck waiting in the write for someone, maybe
386 * ourselves, to flush the log.
387 *
388 * Even though we just pushed the log above, we did not have
389 * the superblock buffer locked at that point so it can
390 * become pinned in between there and here.
391 */
392 if (XFS_BUF_ISPINNED(bp))
393 xfs_log_force(mp, 0);
394 }
395
396
397 if (flags & SYNC_WAIT)
398 XFS_BUF_UNASYNC(bp);
399 else
400 XFS_BUF_ASYNC(bp);
401
402 error = xfs_bwrite(mp, bp);
403 if (error)
404 return error;
405 349
406 /* 350 /*
407 * If this is a data integrity sync make sure all pending buffers 351 * If the buffer is pinned then push on the log so we won't get stuck
408 * are flushed out for the log coverage check below. 352 * waiting in the write for someone, maybe ourselves, to flush the log.
353 *
354 * Even though we just pushed the log above, we did not have the
355 * superblock buffer locked at that point so it can become pinned in
356 * between there and here.
409 */ 357 */
410 if (flags & SYNC_WAIT) 358 bp = xfs_getsb(mp, 0);
411 xfs_flush_buftarg(mp->m_ddev_targp, 1); 359 if (XFS_BUF_ISPINNED(bp))
412 360 xfs_log_force(mp, 0);
413 if (xfs_log_need_covered(mp))
414 error = xfs_commit_dummy_trans(mp, flags);
415 return error;
416 361
417 out_brelse: 362 return xfs_bwrite(mp, bp);
418 xfs_buf_relse(bp);
419 out:
420 return error;
421} 363}
422 364
423/* 365/*
@@ -441,7 +383,7 @@ int
441xfs_quiesce_data( 383xfs_quiesce_data(
442 struct xfs_mount *mp) 384 struct xfs_mount *mp)
443{ 385{
444 int error; 386 int error, error2 = 0;
445 387
446 /* push non-blocking */ 388 /* push non-blocking */
447 xfs_sync_data(mp, 0); 389 xfs_sync_data(mp, 0);
@@ -452,13 +394,20 @@ xfs_quiesce_data(
452 xfs_qm_sync(mp, SYNC_WAIT); 394 xfs_qm_sync(mp, SYNC_WAIT);
453 395
454 /* write superblock and hoover up shutdown errors */ 396 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 397 error = xfs_sync_fsdata(mp);
398
399 /* make sure all delwri buffers are written out */
400 xfs_flush_buftarg(mp->m_ddev_targp, 1);
401
402 /* mark the log as covered if needed */
403 if (xfs_log_need_covered(mp))
404 error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
456 405
457 /* flush data-only devices */ 406 /* flush data-only devices */
458 if (mp->m_rtdev_targp) 407 if (mp->m_rtdev_targp)
459 XFS_bflush(mp->m_rtdev_targp); 408 XFS_bflush(mp->m_rtdev_targp);
460 409
461 return error; 410 return error ? error : error2;
462} 411}
463 412
464STATIC void 413STATIC void
@@ -581,9 +530,9 @@ xfs_flush_inodes(
581} 530}
582 531
583/* 532/*
584 * Every sync period we need to unpin all items, reclaim inodes, sync 533 * Every sync period we need to unpin all items, reclaim inodes and sync
585 * quota and write out the superblock. We might need to cover the log 534 * disk quotas. We might need to cover the log to indicate that the
586 * to indicate it is idle. 535 * filesystem is idle and not frozen.
587 */ 536 */
588STATIC void 537STATIC void
589xfs_sync_worker( 538xfs_sync_worker(
@@ -597,7 +546,9 @@ xfs_sync_worker(
597 xfs_reclaim_inodes(mp, 0); 546 xfs_reclaim_inodes(mp, 0);
598 /* dgc: errors ignored here */ 547 /* dgc: errors ignored here */
599 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 548 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
600 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 549 if (mp->m_super->s_frozen == SB_UNFROZEN &&
550 xfs_log_need_covered(mp))
551 error = xfs_fs_log_dummy(mp, 0);
601 } 552 }
602 mp->m_sync_seq++; 553 mp->m_sync_seq++;
603 wake_up(&mp->m_wait_single_sync_task); 554 wake_up(&mp->m_wait_single_sync_task);
@@ -660,7 +611,7 @@ xfs_syncd_init(
660 mp->m_sync_work.w_syncer = xfs_sync_worker; 611 mp->m_sync_work.w_syncer = xfs_sync_worker;
661 mp->m_sync_work.w_mount = mp; 612 mp->m_sync_work.w_mount = mp;
662 mp->m_sync_work.w_completion = NULL; 613 mp->m_sync_work.w_completion = NULL;
663 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 614 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
664 if (IS_ERR(mp->m_sync_task)) 615 if (IS_ERR(mp->m_sync_task))
665 return -PTR_ERR(mp->m_sync_task); 616 return -PTR_ERR(mp->m_sync_task);
666 return 0; 617 return 0;
@@ -681,6 +632,17 @@ __xfs_inode_set_reclaim_tag(
681 radix_tree_tag_set(&pag->pag_ici_root, 632 radix_tree_tag_set(&pag->pag_ici_root,
682 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 633 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
683 XFS_ICI_RECLAIM_TAG); 634 XFS_ICI_RECLAIM_TAG);
635
636 if (!pag->pag_ici_reclaimable) {
637 /* propagate the reclaim tag up into the perag radix tree */
638 spin_lock(&ip->i_mount->m_perag_lock);
639 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
640 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
641 XFS_ICI_RECLAIM_TAG);
642 spin_unlock(&ip->i_mount->m_perag_lock);
643 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
644 -1, _RET_IP_);
645 }
684 pag->pag_ici_reclaimable++; 646 pag->pag_ici_reclaimable++;
685} 647}
686 648
@@ -706,6 +668,24 @@ xfs_inode_set_reclaim_tag(
706 xfs_perag_put(pag); 668 xfs_perag_put(pag);
707} 669}
708 670
671STATIC void
672__xfs_inode_clear_reclaim(
673 xfs_perag_t *pag,
674 xfs_inode_t *ip)
675{
676 pag->pag_ici_reclaimable--;
677 if (!pag->pag_ici_reclaimable) {
678 /* clear the reclaim tag from the perag radix tree */
679 spin_lock(&ip->i_mount->m_perag_lock);
680 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
681 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
682 XFS_ICI_RECLAIM_TAG);
683 spin_unlock(&ip->i_mount->m_perag_lock);
684 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
685 -1, _RET_IP_);
686 }
687}
688
709void 689void
710__xfs_inode_clear_reclaim_tag( 690__xfs_inode_clear_reclaim_tag(
711 xfs_mount_t *mp, 691 xfs_mount_t *mp,
@@ -714,7 +694,7 @@ __xfs_inode_clear_reclaim_tag(
714{ 694{
715 radix_tree_tag_clear(&pag->pag_ici_root, 695 radix_tree_tag_clear(&pag->pag_ici_root,
716 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 696 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
717 pag->pag_ici_reclaimable--; 697 __xfs_inode_clear_reclaim(pag, ip);
718} 698}
719 699
720/* 700/*
@@ -853,7 +833,37 @@ out:
853reclaim: 833reclaim:
854 xfs_ifunlock(ip); 834 xfs_ifunlock(ip);
855 xfs_iunlock(ip, XFS_ILOCK_EXCL); 835 xfs_iunlock(ip, XFS_ILOCK_EXCL);
856 xfs_ireclaim(ip); 836
837 XFS_STATS_INC(xs_ig_reclaims);
838 /*
839 * Remove the inode from the per-AG radix tree.
840 *
841 * Because radix_tree_delete won't complain even if the item was never
842 * added to the tree assert that it's been there before to catch
843 * problems with the inode life time early on.
844 */
845 write_lock(&pag->pag_ici_lock);
846 if (!radix_tree_delete(&pag->pag_ici_root,
847 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
848 ASSERT(0);
849 __xfs_inode_clear_reclaim(pag, ip);
850 write_unlock(&pag->pag_ici_lock);
851
852 /*
853 * Here we do an (almost) spurious inode lock in order to coordinate
854 * with inode cache radix tree lookups. This is because the lookup
855 * can reference the inodes in the cache without taking references.
856 *
857 * We make that OK here by ensuring that we wait until the inode is
858 * unlocked after the lookup before we go ahead and free it. We get
859 * both the ilock and the iolock because the code may need to drop the
860 * ilock one but will still hold the iolock.
861 */
862 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
863 xfs_qm_dqdetach(ip);
864 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
865
866 xfs_inode_free(ip);
857 return error; 867 return error;
858 868
859} 869}
@@ -869,88 +879,52 @@ xfs_reclaim_inodes(
869 879
870/* 880/*
871 * Shrinker infrastructure. 881 * Shrinker infrastructure.
872 *
873 * This is all far more complex than it needs to be. It adds a global list of
874 * mounts because the shrinkers can only call a global context. We need to make
875 * the shrinkers pass a context to avoid the need for global state.
876 */ 882 */
877static LIST_HEAD(xfs_mount_list);
878static struct rw_semaphore xfs_mount_list_lock;
879
880static int 883static int
881xfs_reclaim_inode_shrink( 884xfs_reclaim_inode_shrink(
885 struct shrinker *shrink,
882 int nr_to_scan, 886 int nr_to_scan,
883 gfp_t gfp_mask) 887 gfp_t gfp_mask)
884{ 888{
885 struct xfs_mount *mp; 889 struct xfs_mount *mp;
886 struct xfs_perag *pag; 890 struct xfs_perag *pag;
887 xfs_agnumber_t ag; 891 xfs_agnumber_t ag;
888 int reclaimable = 0; 892 int reclaimable;
889 893
894 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
890 if (nr_to_scan) { 895 if (nr_to_scan) {
891 if (!(gfp_mask & __GFP_FS)) 896 if (!(gfp_mask & __GFP_FS))
892 return -1; 897 return -1;
893 898
894 down_read(&xfs_mount_list_lock); 899 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
895 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
896 xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
897 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); 900 XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
898 if (nr_to_scan <= 0) 901 /* if we don't exhaust the scan, don't bother coming back */
899 break; 902 if (nr_to_scan > 0)
900 } 903 return -1;
901 up_read(&xfs_mount_list_lock); 904 }
902 }
903
904 down_read(&xfs_mount_list_lock);
905 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
906 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
907 905
908 pag = xfs_perag_get(mp, ag); 906 reclaimable = 0;
909 if (!pag->pag_ici_init) { 907 ag = 0;
910 xfs_perag_put(pag); 908 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
911 continue; 909 XFS_ICI_RECLAIM_TAG))) {
912 } 910 reclaimable += pag->pag_ici_reclaimable;
913 reclaimable += pag->pag_ici_reclaimable; 911 xfs_perag_put(pag);
914 xfs_perag_put(pag);
915 }
916 } 912 }
917 up_read(&xfs_mount_list_lock);
918 return reclaimable; 913 return reclaimable;
919} 914}
920 915
921static struct shrinker xfs_inode_shrinker = {
922 .shrink = xfs_reclaim_inode_shrink,
923 .seeks = DEFAULT_SEEKS,
924};
925
926void __init
927xfs_inode_shrinker_init(void)
928{
929 init_rwsem(&xfs_mount_list_lock);
930 register_shrinker(&xfs_inode_shrinker);
931}
932
933void
934xfs_inode_shrinker_destroy(void)
935{
936 ASSERT(list_empty(&xfs_mount_list));
937 unregister_shrinker(&xfs_inode_shrinker);
938}
939
940void 916void
941xfs_inode_shrinker_register( 917xfs_inode_shrinker_register(
942 struct xfs_mount *mp) 918 struct xfs_mount *mp)
943{ 919{
944 down_write(&xfs_mount_list_lock); 920 mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
945 list_add_tail(&mp->m_mplist, &xfs_mount_list); 921 mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
946 up_write(&xfs_mount_list_lock); 922 register_shrinker(&mp->m_inode_shrink);
947} 923}
948 924
949void 925void
950xfs_inode_shrinker_unregister( 926xfs_inode_shrinker_unregister(
951 struct xfs_mount *mp) 927 struct xfs_mount *mp)
952{ 928{
953 down_write(&xfs_mount_list_lock); 929 unregister_shrinker(&mp->m_inode_shrink);
954 list_del(&mp->m_mplist);
955 up_write(&xfs_mount_list_lock);
956} 930}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..fe78726196f8 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,9 +35,6 @@ typedef struct xfs_sync_work {
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
 
-int xfs_sync_attr(struct xfs_mount *mp, int flags);
-int xfs_sync_data(struct xfs_mount *mp, int flags);
-
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
@@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
 	int flags, int tag, int write_lock, int *nr_to_scan);
 
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..88d25d4aa56e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -24,24 +24,19 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_da_btree.h" 27#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
32#include "xfs_dir2_sf.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h" 31#include "xfs_dinode.h"
35#include "xfs_inode.h" 32#include "xfs_inode.h"
36#include "xfs_btree.h" 33#include "xfs_btree.h"
37#include "xfs_dmapi.h"
38#include "xfs_mount.h" 34#include "xfs_mount.h"
39#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_alloc.h" 37#include "xfs_alloc.h"
42#include "xfs_bmap.h" 38#include "xfs_bmap.h"
43#include "xfs_attr.h" 39#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 40#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 42#include "xfs_buf_item.h"
@@ -50,6 +45,8 @@
50#include "xfs_aops.h" 45#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 46#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 47#include "quota/xfs_dquot.h"
48#include "xfs_log_recover.h"
49#include "xfs_inode_item.h"
53 50
54/* 51/*
55 * We include this last to have the helpers above available for the trace 52 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..be5dffd282a1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
 struct xfs_dquot;
 struct xlog_ticket;
 struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -78,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	)
 )
 
-#define DEFINE_PERAG_REF_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
-		 unsigned long caller_ip), \
-	TP_ARGS(mp, agno, refcount, caller_ip), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_agnumber_t, agno) \
-		__field(int, refcount) \
-		__field(unsigned long, caller_ip) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = mp->m_super->s_dev; \
-		__entry->agno = agno; \
-		__entry->refcount = refcount; \
-		__entry->caller_ip = caller_ip; \
-	), \
-	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->agno, \
-		  __entry->refcount, \
-		  (char *)__entry->caller_ip) \
-);
-
-DEFINE_PERAG_REF_EVENT(xfs_perag_get)
-DEFINE_PERAG_REF_EVENT(xfs_perag_put)
-
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -118,6 +95,40 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
 
+DECLARE_EVENT_CLASS(xfs_perag_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, agno, refcount, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, refcount)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->refcount = refcount;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->refcount,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
 		 struct xfs_da_node_entry *btree),
@@ -306,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init);
 DEFINE_BUF_EVENT(xfs_buf_free);
 DEFINE_BUF_EVENT(xfs_buf_hold);
 DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_pin);
-DEFINE_BUF_EVENT(xfs_buf_unpin);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_iorequest);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
@@ -530,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait);
 DEFINE_LOCK_EVENT(xfs_ilock_demote);
 DEFINE_LOCK_EVENT(xfs_iunlock);
 
-DECLARE_EVENT_CLASS(xfs_iget_class,
+DECLARE_EVENT_CLASS(xfs_inode_class,
 	TP_PROTO(struct xfs_inode *ip),
 	TP_ARGS(ip),
 	TP_STRUCT__entry(
@@ -546,47 +555,127 @@ DECLARE_EVENT_CLASS(xfs_iget_class,
 		  __entry->ino)
 )
 
-#define DEFINE_IGET_EVENT(name) \
-DEFINE_EVENT(xfs_iget_class, name, \
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
 	TP_PROTO(struct xfs_inode *ip), \
 	TP_ARGS(ip))
-DEFINE_IGET_EVENT(xfs_iget_skip);
-DEFINE_IGET_EVENT(xfs_iget_reclaim);
-DEFINE_IGET_EVENT(xfs_iget_found);
-DEFINE_IGET_EVENT(xfs_iget_alloc);
-
-DECLARE_EVENT_CLASS(xfs_inode_class,
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(int, count)
+		__field(int, pincount)
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
 		__entry->count = atomic_read(&VFS_I(ip)->i_count);
+		__entry->pincount = atomic_read(&ip->i_pincount);
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx count %d caller %pf",
+	TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->count,
+		  __entry->pincount,
 		  (char *)__entry->caller_ip)
 )
 
-#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_ARGS(ip, caller_ip))
-DEFINE_INODE_EVENT(xfs_ihold);
-DEFINE_INODE_EVENT(xfs_irele);
-/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
-DEFINE_INODE_EVENT(xfs_inode);
-#define xfs_itrace_entry(ip) \
-	trace_xfs_inode(ip, _THIS_IP_)
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+	TP_ARGS(dp, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dp_ino)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(dp)->i_sb->s_dev;
+		__entry->dp_ino = dp->i_ino;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dp ino 0x%llx name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dp_ino,
+		  __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+	TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+	TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+	TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+		 struct xfs_name *src_name, struct xfs_name *target_name),
+	TP_ARGS(src_dp, target_dp, src_name, target_name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_dp_ino)
+		__field(xfs_ino_t, target_dp_ino)
+		__dynamic_array(char, src_name, src_name->len)
+		__dynamic_array(char, target_name, target_name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+		__entry->src_dp_ino = src_dp->i_ino;
+		__entry->target_dp_ino = target_dp->i_ino;
+		memcpy(__get_str(src_name), src_name->name, src_name->len);
+		memcpy(__get_str(target_name), target_name->name, target_name->len);
+	),
+	TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+		  " src name %s target name %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_dp_ino,
+		  __entry->target_dp_ino,
+		  __get_str(src_name),
+		  __get_str(target_name))
+)
 
 DECLARE_EVENT_CLASS(xfs_dquot_class,
 	TP_PROTO(struct xfs_dquot *dqp),
@@ -642,8 +731,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
 	TP_PROTO(struct xfs_dquot *dqp), \
 	TP_ARGS(dqp))
 DEFINE_DQUOT_EVENT(xfs_dqadjust);
-DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
 DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +745,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
 DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
 DEFINE_DQUOT_EVENT(xfs_dqget_hit);
 DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -669,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
 DEFINE_DQUOT_EVENT(xfs_dqflush_force);
 DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-/* not really iget events, but we re-use the format */
-DEFINE_IGET_EVENT(xfs_dquot_dqalloc);
-DEFINE_IGET_EVENT(xfs_dquot_dqdetach);
 
 DECLARE_EVENT_CLASS(xfs_loggrant_class,
 	TP_PROTO(struct log *log, struct xlog_ticket *tic),
@@ -767,165 +850,177 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
 
-#define DEFINE_RW_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-	TP_ARGS(ip, count, offset, flags), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(xfs_fsize_t, size) \
-		__field(xfs_fsize_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count 0x%zx ioflags %s", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
+DECLARE_EVENT_CLASS(xfs_file_class,
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+	TP_ARGS(ip, count, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsize_t, size)
+		__field(xfs_fsize_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count 0x%zx ioflags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
 )
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+	TP_ARGS(ip, count, offset, flags))
 DEFINE_RW_EVENT(xfs_file_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
-
-#define DEFINE_PAGE_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
-	TP_ARGS(inode, page, off), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(pgoff_t, pgoff) \
-		__field(loff_t, size) \
-		__field(unsigned long, offset) \
-		__field(int, delalloc) \
-		__field(int, unmapped) \
-		__field(int, unwritten) \
-	), \
-	TP_fast_assign( \
-		int delalloc = -1, unmapped = -1, unwritten = -1; \
-		\
-		if (page_has_buffers(page)) \
-			xfs_count_page_state(page, &delalloc, \
-					     &unmapped, &unwritten); \
-		__entry->dev = inode->i_sb->s_dev; \
-		__entry->ino = XFS_I(inode)->i_ino; \
-		__entry->pgoff = page_offset(page); \
-		__entry->size = i_size_read(inode); \
-		__entry->offset = off; \
-		__entry->delalloc = delalloc; \
-		__entry->unmapped = unmapped; \
-		__entry->unwritten = unwritten; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \
-		  "delalloc %d unmapped %d unwritten %d", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->pgoff, \
-		  __entry->size, \
-		  __entry->offset, \
-		  __entry->delalloc, \
-		  __entry->unmapped, \
-		  __entry->unwritten) \
+DECLARE_EVENT_CLASS(xfs_page_class,
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+	TP_ARGS(inode, page, off),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(pgoff_t, pgoff)
+		__field(loff_t, size)
+		__field(unsigned long, offset)
+		__field(int, delalloc)
+		__field(int, unwritten)
+	),
+	TP_fast_assign(
+		int delalloc = -1, unwritten = -1;
+
+		if (page_has_buffers(page))
+			xfs_count_page_state(page, &delalloc, &unwritten);
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = XFS_I(inode)->i_ino;
+		__entry->pgoff = page_offset(page);
+		__entry->size = i_size_read(inode);
+		__entry->offset = off;
+		__entry->delalloc = delalloc;
+		__entry->unwritten = unwritten;
+	),
+	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+		  "delalloc %d unwritten %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pgoff,
+		  __entry->size,
+		  __entry->offset,
+		  __entry->delalloc,
+		  __entry->unwritten)
 )
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(xfs_page_class, name, \
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+	TP_ARGS(inode, page, off))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
-#define DEFINE_IOMAP_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-		 int flags, struct xfs_bmbt_irec *irec), \
-	TP_ARGS(ip, offset, count, flags, irec), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-		__field(int, flags) \
-		__field(xfs_fileoff_t, startoff) \
-		__field(xfs_fsblock_t, startblock) \
-		__field(xfs_filblks_t, blockcount) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-		__entry->flags = flags; \
-		__entry->startoff = irec ? irec->br_startoff : 0; \
-		__entry->startblock = irec ? irec->br_startblock : 0; \
-		__entry->blockcount = irec ? irec->br_blockcount : 0; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd flags %s " \
-		  "startoff 0x%llx startblock %lld blockcount 0x%llx", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count, \
-		  __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
-		  __entry->startoff, \
-		  (__int64_t)__entry->startblock, \
-		  __entry->blockcount) \
+DECLARE_EVENT_CLASS(xfs_iomap_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+		 int flags, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, offset, count, flags, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+		__field(int, flags)
+		__field(xfs_fileoff_t, startoff)
+		__field(xfs_fsblock_t, startblock)
+		__field(xfs_filblks_t, blockcount)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+		__entry->flags = flags;
+		__entry->startoff = irec ? irec->br_startoff : 0;
+		__entry->startblock = irec ? irec->br_startblock : 0;
+		__entry->blockcount = irec ? irec->br_blockcount : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd flags %s "
+		  "startoff 0x%llx startblock %lld blockcount 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count,
+		  __print_flags(__entry->flags, "|", BMAPI_FLAGS),
+		  __entry->startoff,
+		  (__int64_t)__entry->startblock,
+		  __entry->blockcount)
 )
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+		 int flags, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, offset, count, flags, irec))
 DEFINE_IOMAP_EVENT(xfs_iomap_enter);
 DEFINE_IOMAP_EVENT(xfs_iomap_found);
 DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
 
-#define DEFINE_SIMPLE_IO_EVENT(name) \
-TRACE_EVENT(name, \
-	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
-	TP_ARGS(ip, offset, count), \
-	TP_STRUCT__entry( \
-		__field(dev_t, dev) \
-		__field(xfs_ino_t, ino) \
-		__field(loff_t, size) \
-		__field(loff_t, new_size) \
-		__field(loff_t, offset) \
-		__field(size_t, count) \
-	), \
-	TP_fast_assign( \
-		__entry->dev = VFS_I(ip)->i_sb->s_dev; \
-		__entry->ino = ip->i_ino; \
-		__entry->size = ip->i_d.di_size; \
-		__entry->new_size = ip->i_new_size; \
-		__entry->offset = offset; \
-		__entry->count = count; \
-	), \
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
-		  "offset 0x%llx count %zd", \
-		  MAJOR(__entry->dev), MINOR(__entry->dev), \
-		  __entry->ino, \
-		  __entry->size, \
-		  __entry->new_size, \
-		  __entry->offset, \
-		  __entry->count) \
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+	TP_ARGS(ip, offset, count),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(loff_t, size)
+		__field(loff_t, new_size)
+		__field(loff_t, offset)
+		__field(size_t, count)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->size = ip->i_d.di_size;
+		__entry->new_size = ip->i_new_size;
+		__entry->offset = offset;
+		__entry->count = count;
+	),
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+		  "offset 0x%llx count %zd",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->new_size,
+		  __entry->offset,
+		  __entry->count)
 );
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+	TP_ARGS(ip, offset, count))
 DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
 DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
 
@@ -1051,83 +1146,112 @@ TRACE_EVENT(xfs_bunmap,
 
 );
 
+#define XFS_BUSY_SYNC \
+	{ 0,	"async" }, \
+	{ 1,	"sync" }
+
 TRACE_EVENT(xfs_alloc_busy,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, int slot),
-	TP_ARGS(mp, agno, agbno, len, slot),
+	TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+	TP_ARGS(trans, agno, agbno, len, sync),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(int, tid)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(int, slot)
+		__field(int, sync)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->tid = trans->t_ticket->t_tid;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->slot = slot;
+		__entry->sync = sync;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+	TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
+		  __entry->tid,
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
-		  __entry->slot)
+		  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
 
 );
 
-#define XFS_BUSY_STATES \
-	{ 0,	"found" }, \
-	{ 1,	"missing" }
-
 TRACE_EVENT(xfs_alloc_unbusy,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int slot, int found),
-	TP_ARGS(mp, agno, slot, found),
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
-		__field(int, slot)
-		__field(int, found)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
-		__entry->slot = slot;
-		__entry->found = found;
+		__entry->agbno = agbno;
+		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u slot %d %s",
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
-		  __entry->slot,
-		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+		  __entry->agbno,
+		  __entry->len)
 );
 
+#define XFS_BUSY_STATES \
+	{ 0,	"missing" }, \
+	{ 1,	"found" }
+
 TRACE_EVENT(xfs_alloc_busysearch,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-		 xfs_extlen_t len, xfs_lsn_t lsn),
-	TP_ARGS(mp, agno, agbno, len, lsn),
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len, int found),
+	TP_ARGS(mp, agno, agbno, len, found),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(xfs_extlen_t, len)
-		__field(xfs_lsn_t, lsn)
+		__field(int, found)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->agno = agno;
 		__entry->agbno = agbno;
 		__entry->len = len;
-		__entry->lsn = lsn;
+		__entry->found = found;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+	TP_printk("dev %d:%d agno %u agbno %u len %u %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
 		  __entry->len,
+		  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+	TP_PROTO(struct xfs_trans *trans),
+	TP_ARGS(trans),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(struct xfs_trans *, tp)
+		__field(xfs_lsn_t, lsn)
+	),
+	TP_fast_assign(
+		__entry->dev = trans->t_mountp->m_super->s_dev;
+		__entry->tp = trans;
+		__entry->lsn = trans->t_commit_lsn;
+	),
+	TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tp,
 		  __entry->lsn)
 );
 
@@ -1495,6 +1619,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+	TP_PROTO(struct log *log, struct xlog_recover *trans,
+		struct xlog_recover_item *item, int pass),
+	TP_ARGS(log, trans, item, pass),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, item)
+		__field(xlog_tid_t, tid)
+		__field(int, type)
+		__field(int, pass)
+		__field(int, count)
+		__field(int, total)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->item = (unsigned long)item;
+		__entry->tid = trans->r_log_tid;
+		__entry->type = ITEM_TYPE(item);
+		__entry->pass = pass;
+		__entry->count = item->ri_cnt;
+		__entry->total = item->ri_total;
+	),
+	TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+		  "item region count/total %d/%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tid,
+		  __entry->pass,
+		  (void *)__entry->item,
+		  __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+		  __entry->count,
+		  __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+	TP_PROTO(struct log *log, struct xlog_recover *trans, \
+		struct xlog_recover_item *item, int pass), \
+	TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+	TP_ARGS(log, buf_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(__int64_t, blkno)
+		__field(unsigned short, len)
+		__field(unsigned short, flags)
+		__field(unsigned short, size)
+		__field(unsigned int, map_size)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->blkno = buf_f->blf_blkno;
+		__entry->len = buf_f->blf_len;
+		__entry->flags = buf_f->blf_flags;
+		__entry->size = buf_f->blf_size;
+		__entry->map_size = buf_f->blf_map_size;
+	),
+	TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+			"map_size %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->flags,
+		  __entry->size,
+		  __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+	TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+	TP_ARGS(log, in_f),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(unsigned short, size)
+		__field(int, fields)
+		__field(unsigned short, asize)
+		__field(unsigned short, dsize)
+		__field(__int64_t, blkno)
+		__field(int, len)
+		__field(int, boffset)
+	),
+	TP_fast_assign(
+		__entry->dev = log->l_mp->m_super->s_dev;
+		__entry->ino = in_f->ilf_ino;
+		__entry->size = in_f->ilf_size;
+		__entry->fields = in_f->ilf_fields;
+		__entry->asize = in_f->ilf_asize;
+		__entry->dsize = in_f->ilf_dsize;
+		__entry->blkno = in_f->ilf_blkno;
+		__entry->len = in_f->ilf_len;
+		__entry->boffset = in_f->ilf_boffset;
+	),
+	TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+			"dsize %d, blkno 0x%llx, len %d, boffset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->size,
+		  __entry->fields,
+		  __entry->asize,
+		  __entry->dsize,
+		  __entry->blkno,
+		  __entry->len,
+		  __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+	TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
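The change above is pure const-ification: the handlers and the table that points at them are never modified after initialization, so declaring them const lets them live in read-only data and turns any accidental store into a compile error. A sketch of the pattern with a hypothetical handler type:

#include <stdio.h>
#include <string.h>

/* Hypothetical handler, mirroring the shape of struct xattr_handler. */
struct handler {
	const char *prefix;
	int (*get)(const char *name);
};

static int demo_get(const char *name)
{
	printf("get %s\n", name);
	return 0;
}

/* const handlers and a const table: both can be placed in .rodata. */
static const struct handler user_handler = {
	.prefix = "user.",
	.get = demo_get,
};

static const struct handler *const handlers[] = {
	&user_handler,
	NULL,
};

int main(void)
{
	const char *name = "user.comment";

	for (const struct handler *const *h = handlers; *h; h++) {
		if (!strncmp(name, (*h)->prefix, strlen((*h)->prefix)))
			(*h)->get(name);
	}
	return 0;
}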
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..e1a2f6800e01 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 31#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 37#include "xfs_trans_space.h"
@@ -64,8 +54,6 @@
64 flush lock - ditto. 54 flush lock - ditto.
65*/ 55*/
66 56
67STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
68
69#ifdef DEBUG 57#ifdef DEBUG
70xfs_buftarg_t *xfs_dqerror_target; 58xfs_buftarg_t *xfs_dqerror_target;
71int xfs_do_dqerror; 59int xfs_do_dqerror;
@@ -101,7 +89,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 89 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 90 */
103 if (brandnewdquot) { 91 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 92 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 93 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 94 init_waitqueue_head(&dqp->q_pinwait);
107 95
@@ -119,20 +107,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 107 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 108 * So, we need to reset others.
121 */ 109 */
122 dqp->q_nrefs = 0; 110 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 111 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 112 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 113 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 114 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 115 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 116 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 117 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 118 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 119 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 120 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 121 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 122 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 123 ASSERT(list_empty(&dqp->q_freelist));
136 124
137 trace_xfs_dqreuse(dqp); 125 trace_xfs_dqreuse(dqp);
138 } 126 }
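This hunk swaps the hand-rolled circular freelist pointers (dq_flnext/dq_flprev pointing at the dquot itself when off-list) for a standard struct list_head, so the emptiness test becomes list_empty() rather than a pointer comparison. A self-contained sketch of the primitives involved (a simplified re-implementation, not <linux/list.h> itself):

#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;		/* an empty node points at itself */
}

static int list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);		/* self-linked again, i.e. off-list */
}

struct dquot {
	int id;
	struct list_head freelist;	/* replaces dq_flnext/dq_flprev */
};

int main(void)
{
	struct list_head frlist;
	struct dquot dq = { .id = 1 };

	INIT_LIST_HEAD(&frlist);
	INIT_LIST_HEAD(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 0 */

	list_add_tail(&dq.freelist, &frlist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 1 */

	list_del_init(&dq.freelist);
	printf("on freelist: %d\n", !list_empty(&dq.freelist));	/* 0 */
	return 0;
}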
@@ -158,7 +146,7 @@ void
158xfs_qm_dqdestroy( 146xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 147 xfs_dquot_t *dqp)
160{ 148{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 149 ASSERT(list_empty(&dqp->q_freelist));
162 150
163 mutex_destroy(&dqp->q_qlock); 151 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 152 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +240,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 240 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 241 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 242 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 243 mp->m_quotainfo->qi_btimelimit);
256 } else { 244 } else {
257 d->d_bwarns = 0; 245 d->d_bwarns = 0;
258 } 246 }
@@ -275,7 +263,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 263 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 264 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 265 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 266 mp->m_quotainfo->qi_itimelimit);
279 } else { 267 } else {
280 d->d_iwarns = 0; 268 d->d_iwarns = 0;
281 } 269 }
@@ -298,7 +286,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 286 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 287 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 288 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 289 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 290 } else {
303 d->d_rtbwarns = 0; 291 d->d_rtbwarns = 0;
304 } 292 }
@@ -325,6 +313,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 313 uint type,
326 xfs_buf_t *bp) 314 xfs_buf_t *bp)
327{ 315{
316 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 317 xfs_dqblk_t *d;
329 int curid, i; 318 int curid, i;
330 319
@@ -337,16 +326,16 @@ xfs_qm_init_dquot_blk(
337 /* 326 /*
338 * ID of the first dquot in the block - id's are zero based. 327 * ID of the first dquot in the block - id's are zero based.
339 */ 328 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 329 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 330 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 331 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 332 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 333 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 334 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 335 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 336 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 337 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 338 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 339}
351 340
352 341
@@ -389,21 +378,14 @@ xfs_qm_dqalloc(
389 return (ESRCH); 378 return (ESRCH);
390 } 379 }
391 380
392 /* 381 xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
393 * xfs_trans_commit normally decrements the vnode ref count
394 * when it unlocks the inode. Since we want to keep the quota
395 * inode around, we bump the vnode ref count now.
396 */
397 IHOLD(quotip);
398
399 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
400 nmaps = 1; 382 nmaps = 1;
401 if ((error = xfs_bmapi(tp, quotip, 383 if ((error = xfs_bmapi(tp, quotip,
402 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, 384 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
403 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, 385 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
404 &firstblock, 386 &firstblock,
405 XFS_QM_DQALLOC_SPACE_RES(mp), 387 XFS_QM_DQALLOC_SPACE_RES(mp),
406 &map, &nmaps, &flist, NULL))) { 388 &map, &nmaps, &flist))) {
407 goto error0; 389 goto error0;
408 } 390 }
409 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); 391 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -419,7 +401,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 401 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 402 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 403 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 404 mp->m_quotainfo->qi_dqchunklen,
423 0); 405 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 406 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 407 goto error1;
@@ -500,7 +482,8 @@ xfs_qm_dqtobp(
500 */ 482 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 483 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 484 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 485 dqp->q_fileoffset = (xfs_fileoff_t)id /
486 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 487 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 488 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 489 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -518,7 +501,7 @@ xfs_qm_dqtobp(
518 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, 501 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
519 XFS_DQUOT_CLUSTER_SIZE_FSB, 502 XFS_DQUOT_CLUSTER_SIZE_FSB,
520 XFS_BMAPI_METADATA, 503 XFS_BMAPI_METADATA,
521 NULL, 0, &map, &nmaps, NULL, NULL); 504 NULL, 0, &map, &nmaps, NULL);
522 505
523 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 506 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
524 if (error) 507 if (error)
@@ -529,7 +512,7 @@ xfs_qm_dqtobp(
529 /* 512 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 513 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 514 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 515 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 516 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 517 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 518 /*
@@ -559,15 +542,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 542 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 543 * (in which case we already have the buf).
561 */ 544 */
562 if (! newdquot) { 545 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 546 trace_xfs_dqtobp_read(dqp);
564 547
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 548 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 549 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 550 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 551 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 552 if (error || !bp)
572 return XFS_ERROR(error); 553 return XFS_ERROR(error);
573 } 554 }
@@ -689,14 +670,14 @@ xfs_qm_idtodq(
689 tp = NULL; 670 tp = NULL;
690 if (flags & XFS_QMOPT_DQALLOC) { 671 if (flags & XFS_QMOPT_DQALLOC) {
691 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 672 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
692 if ((error = xfs_trans_reserve(tp, 673 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
693 XFS_QM_DQALLOC_SPACE_RES(mp), 674 XFS_WRITE_LOG_RES(mp) +
694 XFS_WRITE_LOG_RES(mp) + 675 BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
695 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + 676 128,
696 128, 677 0,
697 0, 678 XFS_TRANS_PERM_LOG_RES,
698 XFS_TRANS_PERM_LOG_RES, 679 XFS_WRITE_LOG_COUNT);
699 XFS_WRITE_LOG_COUNT))) { 680 if (error) {
700 cancelflags = 0; 681 cancelflags = 0;
701 goto error0; 682 goto error0;
702 } 683 }
@@ -751,7 +732,6 @@ xfs_qm_dqlookup(
751{ 732{
752 xfs_dquot_t *dqp; 733 xfs_dquot_t *dqp;
753 uint flist_locked; 734 uint flist_locked;
754 xfs_dquot_t *d;
755 735
756 ASSERT(mutex_is_locked(&qh->qh_lock)); 736 ASSERT(mutex_is_locked(&qh->qh_lock));
757 737
@@ -760,7 +740,7 @@ xfs_qm_dqlookup(
760 /* 740 /*
761 * Traverse the hashchain looking for a match 741 * Traverse the hashchain looking for a match
762 */ 742 */
763 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { 743 list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
764 /* 744 /*
765 * We already have the hashlock. We don't need the 745 * We already have the hashlock. We don't need the
766 * dqlock to look at the id field of the dquot, since the 746 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +752,12 @@ xfs_qm_dqlookup(
772 /* 752 /*
773 * All in core dquots must be on the dqlist of mp 753 * All in core dquots must be on the dqlist of mp
774 */ 754 */
775 ASSERT(dqp->MPL_PREVP != NULL); 755 ASSERT(!list_empty(&dqp->q_mplist));
776 756
777 xfs_dqlock(dqp); 757 xfs_dqlock(dqp);
778 if (dqp->q_nrefs == 0) { 758 if (dqp->q_nrefs == 0) {
779 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); 759 ASSERT(!list_empty(&dqp->q_freelist));
780 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 760 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
781 trace_xfs_dqlookup_want(dqp); 761 trace_xfs_dqlookup_want(dqp);
782 762
783 /* 763 /*
@@ -787,7 +767,7 @@ xfs_qm_dqlookup(
787 */ 767 */
788 dqp->dq_flags |= XFS_DQ_WANT; 768 dqp->dq_flags |= XFS_DQ_WANT;
789 xfs_dqunlock(dqp); 769 xfs_dqunlock(dqp);
790 xfs_qm_freelist_lock(xfs_Gqm); 770 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
791 xfs_dqlock(dqp); 771 xfs_dqlock(dqp);
792 dqp->dq_flags &= ~(XFS_DQ_WANT); 772 dqp->dq_flags &= ~(XFS_DQ_WANT);
793 } 773 }
@@ -802,46 +782,28 @@ xfs_qm_dqlookup(
802 782
803 if (flist_locked) { 783 if (flist_locked) {
804 if (dqp->q_nrefs != 0) { 784 if (dqp->q_nrefs != 0) {
805 xfs_qm_freelist_unlock(xfs_Gqm); 785 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
806 flist_locked = B_FALSE; 786 flist_locked = B_FALSE;
807 } else { 787 } else {
808 /* 788 /* take it off the freelist */
809 * take it off the freelist
810 */
811 trace_xfs_dqlookup_freelist(dqp); 789 trace_xfs_dqlookup_freelist(dqp);
812 XQM_FREELIST_REMOVE(dqp); 790 list_del_init(&dqp->q_freelist);
813 /* xfs_qm_freelist_print(&(xfs_Gqm-> 791 xfs_Gqm->qm_dqfrlist_cnt--;
814 qm_dqfreelist),
815 "after removal"); */
816 } 792 }
817 } 793 }
818 794
819 /*
820 * grab a reference
821 */
822 XFS_DQHOLD(dqp); 795 XFS_DQHOLD(dqp);
823 796
824 if (flist_locked) 797 if (flist_locked)
825 xfs_qm_freelist_unlock(xfs_Gqm); 798 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
826 /* 799 /*
827 * move the dquot to the front of the hashchain 800 * move the dquot to the front of the hashchain
828 */ 801 */
829 ASSERT(mutex_is_locked(&qh->qh_lock)); 802 ASSERT(mutex_is_locked(&qh->qh_lock));
830 if (dqp->HL_PREVP != &qh->qh_next) { 803 list_move(&dqp->q_hashlist, &qh->qh_list);
831 trace_xfs_dqlookup_move(dqp);
832 if ((d = dqp->HL_NEXT))
833 d->HL_PREVP = dqp->HL_PREVP;
834 *(dqp->HL_PREVP) = d;
835 d = qh->qh_next;
836 d->HL_PREVP = &dqp->HL_NEXT;
837 dqp->HL_NEXT = d;
838 dqp->HL_PREVP = &qh->qh_next;
839 qh->qh_next = dqp;
840 }
841 trace_xfs_dqlookup_done(dqp); 804 trace_xfs_dqlookup_done(dqp);
842 *O_dqpp = dqp; 805 *O_dqpp = dqp;
843 ASSERT(mutex_is_locked(&qh->qh_lock)); 806 return 0;
844 return (0);
845 } 807 }
846 } 808 }
847 809
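The lookup rewrite above replaces the open-coded HL_NEXT/HL_PREVP hash-chain walk and the eight-line move-to-front splice with list_for_each_entry() and a single list_move(). A compilable sketch of both (uses the GCC typeof extension, as the kernel macros do; simplified, hypothetical types):

#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Walk a chain of containing structures through their embedded node. */
#define list_for_each_entry(pos, head, member)				\
	for (pos = container_of((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, typeof(*pos), member))

static void list_del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

/* The one-liner that replaces the pointer surgery: unlink, then
 * reinsert at the front. */
static void list_move(struct list_head *n, struct list_head *h)
{
	list_del(n);
	list_add(n, h);
}

struct dquot {
	int id;
	struct list_head hashlist;
};

int main(void)
{
	struct list_head chain = LIST_HEAD_INIT(chain);
	struct dquot a = { .id = 1 }, b = { .id = 2 };
	struct dquot *dqp;

	list_add(&a.hashlist, &chain);
	list_add(&b.hashlist, &chain);		/* chain: 2, 1 */

	list_for_each_entry(dqp, &chain, hashlist)
		if (dqp->id == 1)
			break;
	list_move(&dqp->hashlist, &chain);	/* chain: 1, 2 */

	list_for_each_entry(dqp, &chain, hashlist)
		printf("id %d\n", dqp->id);	/* 1, then 2 */
	return 0;
}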
@@ -975,16 +937,17 @@ xfs_qm_dqget(
975 */ 937 */
976 if (ip) { 938 if (ip) {
977 xfs_ilock(ip, XFS_ILOCK_EXCL); 939 xfs_ilock(ip, XFS_ILOCK_EXCL);
978 if (! XFS_IS_DQTYPE_ON(mp, type)) { 940
979 /* inode stays locked on return */
980 xfs_qm_dqdestroy(dqp);
981 return XFS_ERROR(ESRCH);
982 }
983 /* 941 /*
984 * A dquot could be attached to this inode by now, since 942 * A dquot could be attached to this inode by now, since
985 * we had dropped the ilock. 943 * we had dropped the ilock.
986 */ 944 */
987 if (type == XFS_DQ_USER) { 945 if (type == XFS_DQ_USER) {
946 if (!XFS_IS_UQUOTA_ON(mp)) {
947 /* inode stays locked on return */
948 xfs_qm_dqdestroy(dqp);
949 return XFS_ERROR(ESRCH);
950 }
988 if (ip->i_udquot) { 951 if (ip->i_udquot) {
989 xfs_qm_dqdestroy(dqp); 952 xfs_qm_dqdestroy(dqp);
990 dqp = ip->i_udquot; 953 dqp = ip->i_udquot;
@@ -992,6 +955,11 @@ xfs_qm_dqget(
992 goto dqret; 955 goto dqret;
993 } 956 }
994 } else { 957 } else {
958 if (!XFS_IS_OQUOTA_ON(mp)) {
959 /* inode stays locked on return */
960 xfs_qm_dqdestroy(dqp);
961 return XFS_ERROR(ESRCH);
962 }
995 if (ip->i_gdquot) { 963 if (ip->i_gdquot) {
996 xfs_qm_dqdestroy(dqp); 964 xfs_qm_dqdestroy(dqp);
997 dqp = ip->i_gdquot; 965 dqp = ip->i_gdquot;
@@ -1033,13 +1001,14 @@ xfs_qm_dqget(
1033 */ 1001 */
1034 ASSERT(mutex_is_locked(&h->qh_lock)); 1002 ASSERT(mutex_is_locked(&h->qh_lock));
1035 dqp->q_hash = h; 1003 dqp->q_hash = h;
1036 XQM_HASHLIST_INSERT(h, dqp); 1004 list_add(&dqp->q_hashlist, &h->qh_list);
1005 h->qh_version++;
1037 1006
1038 /* 1007 /*
1039 * Attach this dquot to this filesystem's list of all dquots, 1008 * Attach this dquot to this filesystem's list of all dquots,
1040 * kept inside the mount structure in m_quotainfo field 1009 * kept inside the mount structure in m_quotainfo field
1041 */ 1010 */
1042 xfs_qm_mplist_lock(mp); 1011 mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
1043 1012
1044 /* 1013 /*
1045 * We return a locked dquot to the caller, with a reference taken 1014 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1016,9 @@ xfs_qm_dqget(
1047 xfs_dqlock(dqp); 1016 xfs_dqlock(dqp);
1048 dqp->q_nrefs = 1; 1017 dqp->q_nrefs = 1;
1049 1018
1050 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); 1019 list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
1051 1020 mp->m_quotainfo->qi_dquots++;
1052 xfs_qm_mplist_unlock(mp); 1021 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
1053 mutex_unlock(&h->qh_lock); 1022 mutex_unlock(&h->qh_lock);
1054 dqret: 1023 dqret:
1055 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1024 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
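Note the h->qh_version++ that now accompanies the list_add(): the hash chain carries a version stamp so a scanner that had to drop the bucket lock can detect that the chain changed underneath it and restart, instead of trusting a stale cursor. The idea in miniature (hypothetical names, locking elided):

#include <stdio.h>

struct hash_bucket {
	unsigned int version;	/* bumped on every insert/remove */
};

int main(void)
{
	struct hash_bucket b = { .version = 0 };

	/* Scanner: sample the version, then drop the bucket lock to do
	 * blocking work. */
	unsigned int seen = b.version;

	b.version++;		/* meanwhile, another thread inserts */

	/* Back under the lock: a mismatch means any remembered chain
	 * position is stale and the walk must restart from the head. */
	if (b.version != seen)
		printf("chain changed, restart scan\n");
	return 0;
}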
@@ -1086,10 +1055,10 @@ xfs_qm_dqput(
1086 * drop the dqlock and acquire the freelist and dqlock 1055 * drop the dqlock and acquire the freelist and dqlock
1087 * in the right order; but try to get it out-of-order first 1056 * in the right order; but try to get it out-of-order first
1088 */ 1057 */
1089 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { 1058 if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
1090 trace_xfs_dqput_wait(dqp); 1059 trace_xfs_dqput_wait(dqp);
1091 xfs_dqunlock(dqp); 1060 xfs_dqunlock(dqp);
1092 xfs_qm_freelist_lock(xfs_Gqm); 1061 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1093 xfs_dqlock(dqp); 1062 xfs_dqlock(dqp);
1094 } 1063 }
1095 1064
@@ -1100,10 +1069,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1069 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1070 trace_xfs_dqput_free(dqp);
1102 1071
1103 /* 1072 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1073 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1074
1108 /* 1075 /*
1109 * If we just added a udquot to the freelist, then 1076 * If we just added a udquot to the freelist, then
@@ -1118,10 +1085,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1085 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1086 dqp->q_gdquot = NULL;
1120 } 1087 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1088 }
1126 xfs_dqunlock(dqp); 1089 xfs_dqunlock(dqp);
1127 1090
@@ -1133,7 +1096,7 @@ xfs_qm_dqput(
1133 break; 1096 break;
1134 dqp = gdqp; 1097 dqp = gdqp;
1135 } 1098 }
1136 xfs_qm_freelist_unlock(xfs_Gqm); 1099 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1137} 1100}
1138 1101
1139/* 1102/*
@@ -1159,6 +1122,46 @@ xfs_qm_dqrele(
1159 xfs_qm_dqput(dqp); 1122 xfs_qm_dqput(dqp);
1160} 1123}
1161 1124
1125/*
1126 * This is the dquot flushing I/O completion routine. It is called
1127 * from interrupt level when the buffer containing the dquot is
1128 * flushed to disk. It is responsible for removing the dquot logitem
1129 * from the AIL if it has not been re-logged, and unlocking the dquot's
1130 * flush lock. This behavior is very similar to that of inodes..
 1130 * flush lock. This behavior is very similar to that of inodes.
1131 */
1132STATIC void
1133xfs_qm_dqflush_done(
1134 struct xfs_buf *bp,
1135 struct xfs_log_item *lip)
1136{
1137 xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
1138 xfs_dquot_t *dqp = qip->qli_dquot;
1139 struct xfs_ail *ailp = lip->li_ailp;
1140
1141 /*
1142 * We only want to pull the item from the AIL if its
1143 * location in the log has not changed since we started the flush.
1144 * Thus, we only bother if the dquot's lsn has
1145 * not changed. First we check the lsn outside the lock
1146 * since it's cheaper, and then we recheck while
1147 * holding the lock before removing the dquot from the AIL.
1148 */
1149 if ((lip->li_flags & XFS_LI_IN_AIL) &&
1150 lip->li_lsn == qip->qli_flush_lsn) {
1151
1152 /* xfs_trans_ail_delete() drops the AIL lock. */
1153 spin_lock(&ailp->xa_lock);
1154 if (lip->li_lsn == qip->qli_flush_lsn)
1155 xfs_trans_ail_delete(ailp, lip);
1156 else
1157 spin_unlock(&ailp->xa_lock);
1158 }
1159
1160 /*
1161 * Release the dq's flush lock since we're done with it.
1162 */
1163 xfs_dqfunlock(dqp);
1164}
1162 1165
1163/* 1166/*
1164 * Write a modified dquot to disk. 1167 * Write a modified dquot to disk.
@@ -1240,8 +1243,9 @@ xfs_qm_dqflush(
1240 * Attach an iodone routine so that we can remove this dquot from the 1243 * Attach an iodone routine so that we can remove this dquot from the
1241 * AIL and release the flush lock once the dquot is synced to disk. 1244 * AIL and release the flush lock once the dquot is synced to disk.
1242 */ 1245 */
1243 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) 1246 xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
1244 xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); 1247 &dqp->q_logitem.qli_item);
1248
1245 /* 1249 /*
1246 * If the buffer is pinned then push on the log so we won't 1250 * If the buffer is pinned then push on the log so we won't
1247 * get stuck waiting in the write for too long. 1251 * get stuck waiting in the write for too long.
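The relocated xfs_qm_dqflush_done() above keeps the classic two-step AIL removal: peek at the lsn without the AIL lock, since that is cheap and usually disqualifying, then retest under the lock before deleting. A minimal sketch of that check/recheck shape using pthreads (illustrative only, not the XFS locking rules):

#include <stdio.h>
#include <pthread.h>

struct item {
	pthread_mutex_t lock;
	long lsn;
	int in_ail;
};

/* Test the condition outside the lock first, then re-test under the
 * lock before acting on it. */
static void maybe_remove(struct item *it, long flush_lsn)
{
	if (it->in_ail && it->lsn == flush_lsn) {	/* unlocked peek */
		pthread_mutex_lock(&it->lock);
		if (it->lsn == flush_lsn) {		/* locked recheck */
			it->in_ail = 0;
			printf("removed at lsn %ld\n", it->lsn);
		}
		pthread_mutex_unlock(&it->lock);
	}
}

int main(void)
{
	struct item it = { PTHREAD_MUTEX_INITIALIZER, 42, 1 };

	maybe_remove(&it, 42);
	return 0;
}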
@@ -1265,50 +1269,6 @@ xfs_qm_dqflush(
1265 1269
1266} 1270}
1267 1271
1268/*
1269 * This is the dquot flushing I/O completion routine. It is called
1270 * from interrupt level when the buffer containing the dquot is
1271 * flushed to disk. It is responsible for removing the dquot logitem
1272 * from the AIL if it has not been re-logged, and unlocking the dquot's
1273 * flush lock. This behavior is very similar to that of inodes..
1274 */
1275/*ARGSUSED*/
1276STATIC void
1277xfs_qm_dqflush_done(
1278 xfs_buf_t *bp,
1279 xfs_dq_logitem_t *qip)
1280{
1281 xfs_dquot_t *dqp;
1282 struct xfs_ail *ailp;
1283
1284 dqp = qip->qli_dquot;
1285 ailp = qip->qli_item.li_ailp;
1286
1287 /*
1288 * We only want to pull the item from the AIL if its
1289 * location in the log has not changed since we started the flush.
1290 * Thus, we only bother if the dquot's lsn has
1291 * not changed. First we check the lsn outside the lock
1292 * since it's cheaper, and then we recheck while
1293 * holding the lock before removing the dquot from the AIL.
1294 */
1295 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1296 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1297
1298 /* xfs_trans_ail_delete() drops the AIL lock. */
1299 spin_lock(&ailp->xa_lock);
1300 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1301 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1302 else
1303 spin_unlock(&ailp->xa_lock);
1304 }
1305
1306 /*
1307 * Release the dq's flush lock since we're done with it.
1308 */
1309 xfs_dqfunlock(dqp);
1310}
1311
1312int 1272int
1313xfs_qm_dqlock_nowait( 1273xfs_qm_dqlock_nowait(
1314 xfs_dquot_t *dqp) 1274 xfs_dquot_t *dqp)
@@ -1386,10 +1346,10 @@ int
1386xfs_qm_dqpurge( 1346xfs_qm_dqpurge(
1387 xfs_dquot_t *dqp) 1347 xfs_dquot_t *dqp)
1388{ 1348{
1389 xfs_dqhash_t *thishash; 1349 xfs_dqhash_t *qh = dqp->q_hash;
1390 xfs_mount_t *mp = dqp->q_mount; 1350 xfs_mount_t *mp = dqp->q_mount;
1391 1351
1392 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1352 ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
1393 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); 1353 ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
1394 1354
1395 xfs_dqlock(dqp); 1355 xfs_dqlock(dqp);
@@ -1407,7 +1367,7 @@ xfs_qm_dqpurge(
1407 return (1); 1367 return (1);
1408 } 1368 }
1409 1369
1410 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1370 ASSERT(!list_empty(&dqp->q_freelist));
1411 1371
1412 /* 1372 /*
1413 * If we're turning off quotas, we have to make sure that, for 1373 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1412,16 @@ xfs_qm_dqpurge(
1452 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1412 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1453 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1413 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1454 1414
1455 thishash = dqp->q_hash; 1415 list_del_init(&dqp->q_hashlist);
1456 XQM_HASHLIST_REMOVE(thishash, dqp); 1416 qh->qh_version++;
1457 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); 1417 list_del_init(&dqp->q_mplist);
1418 mp->m_quotainfo->qi_dqreclaims++;
1419 mp->m_quotainfo->qi_dquots--;
1458 /* 1420 /*
1459 * XXX Move this to the front of the freelist, if we can get the 1421 * XXX Move this to the front of the freelist, if we can get the
1460 * freelist lock. 1422 * freelist lock.
1461 */ 1423 */
1462 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); 1424 ASSERT(!list_empty(&dqp->q_freelist));
1463 1425
1464 dqp->q_mount = NULL; 1426 dqp->q_mount = NULL;
1465 dqp->q_hash = NULL; 1427 dqp->q_hash = NULL;
@@ -1467,7 +1429,7 @@ xfs_qm_dqpurge(
1467 memset(&dqp->q_core, 0, sizeof(dqp->q_core)); 1429 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1468 xfs_dqfunlock(dqp); 1430 xfs_dqfunlock(dqp);
1469 xfs_dqunlock(dqp); 1431 xfs_dqunlock(dqp);
1470 mutex_unlock(&thishash->qh_lock); 1432 mutex_unlock(&qh->qh_lock);
1471 return (0); 1433 return (0);
1472} 1434}
1473 1435
@@ -1517,6 +1479,7 @@ void
1517xfs_qm_dqflock_pushbuf_wait( 1479xfs_qm_dqflock_pushbuf_wait(
1518 xfs_dquot_t *dqp) 1480 xfs_dquot_t *dqp)
1519{ 1481{
1482 xfs_mount_t *mp = dqp->q_mount;
1520 xfs_buf_t *bp; 1483 xfs_buf_t *bp;
1521 1484
1522 /* 1485 /*
@@ -1525,14 +1488,14 @@ xfs_qm_dqflock_pushbuf_wait(
1525 * out immediately. We'll be able to acquire 1488 * out immediately. We'll be able to acquire
1526 * the flush lock when the I/O completes. 1489 * the flush lock when the I/O completes.
1527 */ 1490 */
1528 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, 1491 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1529 XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); 1492 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1530 if (!bp) 1493 if (!bp)
1531 goto out_lock; 1494 goto out_lock;
1532 1495
1533 if (XFS_BUF_ISDELAYWRITE(bp)) { 1496 if (XFS_BUF_ISDELAYWRITE(bp)) {
1534 if (XFS_BUF_ISPINNED(bp)) 1497 if (XFS_BUF_ISPINNED(bp))
1535 xfs_log_force(dqp->q_mount, 0); 1498 xfs_log_force(mp, 0);
1536 xfs_buf_delwri_promote(bp); 1499 xfs_buf_delwri_promote(bp);
1537 wake_up_process(bp->b_target->bt_task); 1500 wake_up_process(bp->b_target->bt_task);
1538 } 1501 }
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
33 * The hash chain headers (hash buckets) 33 * The hash chain headers (hash buckets)
34 */ 34 */
35typedef struct xfs_dqhash { 35typedef struct xfs_dqhash {
36 struct xfs_dquot *qh_next; 36 struct list_head qh_list;
37 struct mutex qh_lock; 37 struct mutex qh_lock;
38 uint qh_version; /* ever increasing version */ 38 uint qh_version; /* ever increasing version */
39 uint qh_nelems; /* number of dquots on the list */ 39 uint qh_nelems; /* number of dquots on the list */
40} xfs_dqhash_t; 40} xfs_dqhash_t;
41 41
42typedef struct xfs_dqlink {
43 struct xfs_dquot *ql_next; /* forward link */
44 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
45} xfs_dqlink_t;
46
47struct xfs_mount; 42struct xfs_mount;
48struct xfs_trans; 43struct xfs_trans;
49 44
50/* 45/*
51 * This is the marker which is designed to occupy the first few
52 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
53 * must come first.
54 * This serves as the marker ("sentinel") when we have to restart list
55 * iterations because of locking considerations.
56 */
57typedef struct xfs_dqmarker {
58 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
59 struct xfs_dquot*dqm_flprev;
60 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
61 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
62 uint dqm_flags; /* various flags (XFS_DQ_*) */
63} xfs_dqmarker_t;
64
65/*
66 * The incore dquot structure 46 * The incore dquot structure
67 */ 47 */
68typedef struct xfs_dquot { 48typedef struct xfs_dquot {
69 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ 49 uint dq_flags; /* various flags (XFS_DQ_*) */
50 struct list_head q_freelist; /* global free list of dquots */
51 struct list_head q_mplist; /* mount's list of dquots */
 52 struct list_head q_hashlist; /* global hash list of dquots */
70 xfs_dqhash_t *q_hash; /* the hashchain header */ 53 xfs_dqhash_t *q_hash; /* the hashchain header */
71 struct xfs_mount*q_mount; /* filesystem this relates to */ 54 struct xfs_mount*q_mount; /* filesystem this relates to */
72 struct xfs_trans*q_transp; /* trans this belongs to currently */ 55 struct xfs_trans*q_transp; /* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ 70 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88} xfs_dquot_t; 71} xfs_dquot_t;
89 72
90
91#define dq_flnext q_lists.dqm_flnext
92#define dq_flprev q_lists.dqm_flprev
93#define dq_mplist q_lists.dqm_mplist
94#define dq_hashlist q_lists.dqm_hashlist
95#define dq_flags q_lists.dqm_flags
96
97/* 73/*
98 * Lock hierarchy for q_qlock: 74 * Lock hierarchy for q_qlock:
99 * XFS_QLOCK_NORMAL is the implicit default, 75 * XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
127} 103}
128 104
129#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 105#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
130#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
131#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 106#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
132#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 107#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
133#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 108#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
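Besides embedding the three list_heads directly, the header change deletes the #define aliases (dq_flnext, dq_flags, ...) that used to paper over the xfs_dqmarker indirection. Such aliases are preprocessor-wide: they rewrite every occurrence of the identifier in every file that includes the header, not just accesses through an xfs_dquot. A small demonstration of the hazard:

#include <stdio.h>

struct marker {
	unsigned int flags;
};

struct dquot {
	struct marker q_lists;
};

/* The old header's approach: alias a nested field via the preprocessor. */
#define dq_flags q_lists.flags

int main(void)
{
	struct dquot dq = { .q_lists = { .flags = 3 } };

	/* Reads like a direct member, is really dq.q_lists.flags: */
	printf("%u\n", dq.dq_flags);

	/* The hazard: the macro rewrites every occurrence of the
	 * identifier in every translation unit that includes the
	 * header, so a local variable or an unrelated struct member
	 * named dq_flags would be silently mangled too.  Promoting
	 * the fields to real members removes the trap. */
	return 0;
}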
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..2a1f3dc10a02 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,42 +23,36 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_bmap.h" 31#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h" 32#include "xfs_rtalloc.h"
42#include "xfs_error.h" 33#include "xfs_error.h"
43#include "xfs_itable.h" 34#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
48#include "xfs_qm.h" 38#include "xfs_qm.h"
49 39
40static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
41{
42 return container_of(lip, struct xfs_dq_logitem, qli_item);
43}
44
50/* 45/*
51 * returns the number of iovecs needed to log the given dquot item. 46 * returns the number of iovecs needed to log the given dquot item.
52 */ 47 */
53/* ARGSUSED */
54STATIC uint 48STATIC uint
55xfs_qm_dquot_logitem_size( 49xfs_qm_dquot_logitem_size(
56 xfs_dq_logitem_t *logitem) 50 struct xfs_log_item *lip)
57{ 51{
58 /* 52 /*
59 * we need only two iovecs, one for the format, one for the real thing 53 * we need only two iovecs, one for the format, one for the real thing
60 */ 54 */
61 return (2); 55 return 2;
62} 56}
63 57
64/* 58/*
@@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size(
66 */ 60 */
67STATIC void 61STATIC void
68xfs_qm_dquot_logitem_format( 62xfs_qm_dquot_logitem_format(
69 xfs_dq_logitem_t *logitem, 63 struct xfs_log_item *lip,
70 xfs_log_iovec_t *logvec) 64 struct xfs_log_iovec *logvec)
71{ 65{
72 ASSERT(logitem); 66 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
73 ASSERT(logitem->qli_dquot);
74 67
75 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; 68 logvec->i_addr = &qlip->qli_format;
76 logvec->i_len = sizeof(xfs_dq_logformat_t); 69 logvec->i_len = sizeof(xfs_dq_logformat_t);
77 logvec->i_type = XLOG_REG_TYPE_QFORMAT; 70 logvec->i_type = XLOG_REG_TYPE_QFORMAT;
78 logvec++; 71 logvec++;
79 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; 72 logvec->i_addr = &qlip->qli_dquot->q_core;
80 logvec->i_len = sizeof(xfs_disk_dquot_t); 73 logvec->i_len = sizeof(xfs_disk_dquot_t);
81 logvec->i_type = XLOG_REG_TYPE_DQUOT; 74 logvec->i_type = XLOG_REG_TYPE_DQUOT;
82 75
83 ASSERT(2 == logitem->qli_item.li_desc->lid_size); 76 ASSERT(2 == lip->li_desc->lid_size);
84 logitem->qli_format.qlf_size = 2; 77 qlip->qli_format.qlf_size = 2;
85 78
86} 79}
87 80
@@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format(
90 */ 83 */
91STATIC void 84STATIC void
92xfs_qm_dquot_logitem_pin( 85xfs_qm_dquot_logitem_pin(
93 xfs_dq_logitem_t *logitem) 86 struct xfs_log_item *lip)
94{ 87{
95 xfs_dquot_t *dqp = logitem->qli_dquot; 88 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
96 89
97 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 90 ASSERT(XFS_DQ_IS_LOCKED(dqp));
98 atomic_inc(&dqp->q_pincount); 91 atomic_inc(&dqp->q_pincount);
@@ -104,28 +97,18 @@ xfs_qm_dquot_logitem_pin(
104 * dquot must have been previously pinned with a call to 97 * dquot must have been previously pinned with a call to
105 * xfs_qm_dquot_logitem_pin(). 98 * xfs_qm_dquot_logitem_pin().
106 */ 99 */
107/* ARGSUSED */
108STATIC void 100STATIC void
109xfs_qm_dquot_logitem_unpin( 101xfs_qm_dquot_logitem_unpin(
110 xfs_dq_logitem_t *logitem, 102 struct xfs_log_item *lip,
111 int stale) 103 int remove)
112{ 104{
113 xfs_dquot_t *dqp = logitem->qli_dquot; 105 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
114 106
115 ASSERT(atomic_read(&dqp->q_pincount) > 0); 107 ASSERT(atomic_read(&dqp->q_pincount) > 0);
116 if (atomic_dec_and_test(&dqp->q_pincount)) 108 if (atomic_dec_and_test(&dqp->q_pincount))
117 wake_up(&dqp->q_pinwait); 109 wake_up(&dqp->q_pinwait);
118} 110}
119 111
120/* ARGSUSED */
121STATIC void
122xfs_qm_dquot_logitem_unpin_remove(
123 xfs_dq_logitem_t *logitem,
124 xfs_trans_t *tp)
125{
126 xfs_qm_dquot_logitem_unpin(logitem, 0);
127}
128
129/* 112/*
130 * Given the logitem, this writes the corresponding dquot entry to disk 113 * Given the logitem, this writes the corresponding dquot entry to disk
131 * asynchronously. This is called with the dquot entry securely locked; 114 * asynchronously. This is called with the dquot entry securely locked;
@@ -134,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove(
134 */ 117 */
135STATIC void 118STATIC void
136xfs_qm_dquot_logitem_push( 119xfs_qm_dquot_logitem_push(
137 xfs_dq_logitem_t *logitem) 120 struct xfs_log_item *lip)
138{ 121{
139 xfs_dquot_t *dqp; 122 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
140 int error; 123 int error;
141
142 dqp = logitem->qli_dquot;
143 124
144 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 125 ASSERT(XFS_DQ_IS_LOCKED(dqp));
145 ASSERT(!completion_done(&dqp->q_flush)); 126 ASSERT(!completion_done(&dqp->q_flush));
@@ -161,27 +142,25 @@ xfs_qm_dquot_logitem_push(
161 xfs_dqunlock(dqp); 142 xfs_dqunlock(dqp);
162} 143}
163 144
164/*ARGSUSED*/
165STATIC xfs_lsn_t 145STATIC xfs_lsn_t
166xfs_qm_dquot_logitem_committed( 146xfs_qm_dquot_logitem_committed(
167 xfs_dq_logitem_t *l, 147 struct xfs_log_item *lip,
168 xfs_lsn_t lsn) 148 xfs_lsn_t lsn)
169{ 149{
170 /* 150 /*
171 * We always re-log the entire dquot when it becomes dirty, 151 * We always re-log the entire dquot when it becomes dirty,
172 * so, the latest copy _is_ the only one that matters. 152 * so, the latest copy _is_ the only one that matters.
173 */ 153 */
174 return (lsn); 154 return lsn;
175} 155}
176 156
177
178/* 157/*
179 * This is called to wait for the given dquot to be unpinned. 158 * This is called to wait for the given dquot to be unpinned.
180 * Most of these pin/unpin routines are plagiarized from inode code. 159 * Most of these pin/unpin routines are plagiarized from inode code.
181 */ 160 */
182void 161void
183xfs_qm_dqunpin_wait( 162xfs_qm_dqunpin_wait(
184 xfs_dquot_t *dqp) 163 struct xfs_dquot *dqp)
185{ 164{
186 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 165 ASSERT(XFS_DQ_IS_LOCKED(dqp));
187 if (atomic_read(&dqp->q_pincount) == 0) 166 if (atomic_read(&dqp->q_pincount) == 0)
@@ -207,13 +186,12 @@ xfs_qm_dqunpin_wait(
207 */ 186 */
208STATIC void 187STATIC void
209xfs_qm_dquot_logitem_pushbuf( 188xfs_qm_dquot_logitem_pushbuf(
210 xfs_dq_logitem_t *qip) 189 struct xfs_log_item *lip)
211{ 190{
212 xfs_dquot_t *dqp; 191 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
213 xfs_mount_t *mp; 192 struct xfs_dquot *dqp = qlip->qli_dquot;
214 xfs_buf_t *bp; 193 struct xfs_buf *bp;
215 194
216 dqp = qip->qli_dquot;
217 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
218 196
219 /* 197 /*
@@ -221,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf(
221 * inode flush completed and the inode was taken off the AIL. 199 * inode flush completed and the inode was taken off the AIL.
222 * So, just get out. 200 * So, just get out.
223 */ 201 */
224 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
226 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
227 return; 205 return;
228 } 206 }
229 mp = dqp->q_mount; 207
230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
232 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
233 if (!bp) 211 if (!bp)
234 return; 212 return;
235 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp); 215 xfs_buf_relse(bp);
238 return;
239
240} 216}
241 217
242/* 218/*
@@ -251,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf(
251 */ 227 */
252STATIC uint 228STATIC uint
253xfs_qm_dquot_logitem_trylock( 229xfs_qm_dquot_logitem_trylock(
254 xfs_dq_logitem_t *qip) 230 struct xfs_log_item *lip)
255{ 231{
256 xfs_dquot_t *dqp; 232 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
257 233
258 dqp = qip->qli_dquot;
259 if (atomic_read(&dqp->q_pincount) > 0) 234 if (atomic_read(&dqp->q_pincount) > 0)
260 return XFS_ITEM_PINNED; 235 return XFS_ITEM_PINNED;
261 236
262 if (! xfs_qm_dqlock_nowait(dqp)) 237 if (!xfs_qm_dqlock_nowait(dqp))
263 return XFS_ITEM_LOCKED; 238 return XFS_ITEM_LOCKED;
264 239
265 if (!xfs_dqflock_nowait(dqp)) { 240 if (!xfs_dqflock_nowait(dqp)) {
@@ -270,11 +245,10 @@ xfs_qm_dquot_logitem_trylock(
270 return XFS_ITEM_PUSHBUF; 245 return XFS_ITEM_PUSHBUF;
271 } 246 }
272 247
273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 248 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
274 return XFS_ITEM_SUCCESS; 249 return XFS_ITEM_SUCCESS;
275} 250}
276 251
277
278/* 252/*
279 * Unlock the dquot associated with the log item. 253 * Unlock the dquot associated with the log item.
280 * Clear the fields of the dquot and dquot log item that 254 * Clear the fields of the dquot and dquot log item that
@@ -283,12 +257,10 @@ xfs_qm_dquot_logitem_trylock(
283 */ 257 */
284STATIC void 258STATIC void
285xfs_qm_dquot_logitem_unlock( 259xfs_qm_dquot_logitem_unlock(
286 xfs_dq_logitem_t *ql) 260 struct xfs_log_item *lip)
287{ 261{
288 xfs_dquot_t *dqp; 262 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
289 263
290 ASSERT(ql != NULL);
291 dqp = ql->qli_dquot;
292 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 264 ASSERT(XFS_DQ_IS_LOCKED(dqp));
293 265
294 /* 266 /*
@@ -305,44 +277,32 @@ xfs_qm_dquot_logitem_unlock(
305 xfs_dqunlock(dqp); 277 xfs_dqunlock(dqp);
306} 278}
307 279
308
309/* 280/*
310 * this needs to stamp an lsn into the dquot, I think. 281 * this needs to stamp an lsn into the dquot, I think.
311 * rpc's that look at user dquot's would then have to 282 * rpc's that look at user dquot's would then have to
312 * push on the dependency recorded in the dquot 283 * push on the dependency recorded in the dquot
313 */ 284 */
314/* ARGSUSED */
315STATIC void 285STATIC void
316xfs_qm_dquot_logitem_committing( 286xfs_qm_dquot_logitem_committing(
317 xfs_dq_logitem_t *l, 287 struct xfs_log_item *lip,
318 xfs_lsn_t lsn) 288 xfs_lsn_t lsn)
319{ 289{
320 return;
321} 290}
322 291
323
324/* 292/*
325 * This is the ops vector for dquots 293 * This is the ops vector for dquots
326 */ 294 */
327static struct xfs_item_ops xfs_dquot_item_ops = { 295static struct xfs_item_ops xfs_dquot_item_ops = {
328 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, 296 .iop_size = xfs_qm_dquot_logitem_size,
329 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 297 .iop_format = xfs_qm_dquot_logitem_format,
330 xfs_qm_dquot_logitem_format, 298 .iop_pin = xfs_qm_dquot_logitem_pin,
331 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, 299 .iop_unpin = xfs_qm_dquot_logitem_unpin,
332 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 300 .iop_trylock = xfs_qm_dquot_logitem_trylock,
333 xfs_qm_dquot_logitem_unpin, 301 .iop_unlock = xfs_qm_dquot_logitem_unlock,
334 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 302 .iop_committed = xfs_qm_dquot_logitem_committed,
335 xfs_qm_dquot_logitem_unpin_remove, 303 .iop_push = xfs_qm_dquot_logitem_push,
336 .iop_trylock = (uint(*)(xfs_log_item_t*)) 304 .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
337 xfs_qm_dquot_logitem_trylock, 305 .iop_committing = xfs_qm_dquot_logitem_committing
338 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
339 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
340 xfs_qm_dquot_logitem_committed,
341 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
342 .iop_pushbuf = (void(*)(xfs_log_item_t*))
343 xfs_qm_dquot_logitem_pushbuf,
344 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
345 xfs_qm_dquot_logitem_committing
346}; 306};
347 307
348/* 308/*
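The ops-vector rewrite above is the payoff of the new signatures: every iop_* method now takes struct xfs_log_item *, so the table no longer needs the function-pointer casts (calling a function through an incompatibly-typed pointer is undefined behavior in C), and each method recovers its container via DQUOT_ITEM()/container_of(). A minimal sketch of the pattern (hypothetical types):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct item;

/* Every method takes the generic item; no casts needed in the table. */
struct item_ops {
	unsigned int (*size)(struct item *);
	void (*push)(struct item *);
};

struct item {
	const struct item_ops *ops;
};

struct dquot_item {
	struct item base;
	int id;
};

static unsigned int dquot_item_size(struct item *it)
{
	return 2;	/* one iovec for the format, one for the core */
}

static void dquot_item_push(struct item *it)
{
	/* Recover the container, as DQUOT_ITEM() does. */
	struct dquot_item *dqi = container_of(it, struct dquot_item, base);

	printf("pushing dquot item %d\n", dqi->id);
}

static const struct item_ops dquot_item_ops = {
	.size = dquot_item_size,
	.push = dquot_item_push,
};

int main(void)
{
	struct dquot_item dqi = { .base.ops = &dquot_item_ops, .id = 5 };
	struct item *it = &dqi.base;

	printf("iovecs: %u\n", it->ops->size(it));
	it->ops->push(it);
	return 0;
}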
@@ -352,14 +312,12 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
352 */ 312 */
353void 313void
354xfs_qm_dquot_logitem_init( 314xfs_qm_dquot_logitem_init(
355 struct xfs_dquot *dqp) 315 struct xfs_dquot *dqp)
356{ 316{
357 xfs_dq_logitem_t *lp; 317 struct xfs_dq_logitem *lp = &dqp->q_logitem;
358 lp = &dqp->q_logitem;
359 318
360 lp->qli_item.li_type = XFS_LI_DQUOT; 319 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
361 lp->qli_item.li_ops = &xfs_dquot_item_ops; 320 &xfs_dquot_item_ops);
362 lp->qli_item.li_mountp = dqp->q_mount;
363 lp->qli_dquot = dqp; 321 lp->qli_dquot = dqp;
364 lp->qli_format.qlf_type = XFS_LI_DQUOT; 322 lp->qli_format.qlf_type = XFS_LI_DQUOT;
365 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); 323 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -377,16 +335,22 @@ xfs_qm_dquot_logitem_init(
377 335
378/*------------------ QUOTAOFF LOG ITEMS -------------------*/ 336/*------------------ QUOTAOFF LOG ITEMS -------------------*/
379 337
338static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
339{
340 return container_of(lip, struct xfs_qoff_logitem, qql_item);
341}
342
343
380/* 344/*
381 * This returns the number of iovecs needed to log the given quotaoff item. 345 * This returns the number of iovecs needed to log the given quotaoff item.
382 * We only need 1 iovec for an quotaoff item. It just logs the 346 * We only need 1 iovec for an quotaoff item. It just logs the
383 * quotaoff_log_format structure. 347 * quotaoff_log_format structure.
384 */ 348 */
385/*ARGSUSED*/
386STATIC uint 349STATIC uint
387xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) 350xfs_qm_qoff_logitem_size(
351 struct xfs_log_item *lip)
388{ 352{
389 return (1); 353 return 1;
390} 354}
391 355
392/* 356/*
@@ -397,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
397 * slots in the quotaoff item have been filled. 361 * slots in the quotaoff item have been filled.
398 */ 362 */
399STATIC void 363STATIC void
400xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, 364xfs_qm_qoff_logitem_format(
401 xfs_log_iovec_t *log_vector) 365 struct xfs_log_item *lip,
366 struct xfs_log_iovec *log_vector)
402{ 367{
403 ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); 368 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
369
370 ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
404 371
405 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); 372 log_vector->i_addr = &qflip->qql_format;
406 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 373 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
407 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; 374 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
408 qf->qql_format.qf_size = 1; 375 qflip->qql_format.qf_size = 1;
409} 376}
410 377
411
412/* 378/*
413 * Pinning has no meaning for an quotaoff item, so just return. 379 * Pinning has no meaning for an quotaoff item, so just return.
414 */ 380 */
415/*ARGSUSED*/
416STATIC void 381STATIC void
417xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) 382xfs_qm_qoff_logitem_pin(
383 struct xfs_log_item *lip)
418{ 384{
419 return;
420} 385}
421 386
422
423/* 387/*
424 * Since pinning has no meaning for an quotaoff item, unpinning does 388 * Since pinning has no meaning for an quotaoff item, unpinning does
425 * not either. 389 * not either.
426 */ 390 */
427/*ARGSUSED*/
428STATIC void 391STATIC void
429xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) 392xfs_qm_qoff_logitem_unpin(
393 struct xfs_log_item *lip,
394 int remove)
430{ 395{
431 return;
432}
433
434/*ARGSUSED*/
435STATIC void
436xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
437{
438 return;
439} 396}
440 397
441/* 398/*
442 * Quotaoff items have no locking, so just return success. 399 * Quotaoff items have no locking, so just return success.
443 */ 400 */
444/*ARGSUSED*/
445STATIC uint 401STATIC uint
446xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) 402xfs_qm_qoff_logitem_trylock(
403 struct xfs_log_item *lip)
447{ 404{
448 return XFS_ITEM_LOCKED; 405 return XFS_ITEM_LOCKED;
449} 406}
@@ -452,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
452 * Quotaoff items have no locking or pushing, so return failure 409 * Quotaoff items have no locking or pushing, so return failure
453 * so that the caller doesn't bother with us. 410 * so that the caller doesn't bother with us.
454 */ 411 */
455/*ARGSUSED*/
456STATIC void 412STATIC void
457xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) 413xfs_qm_qoff_logitem_unlock(
414 struct xfs_log_item *lip)
458{ 415{
459 return;
460} 416}
461 417
462/* 418/*
463 * The quotaoff-start-item is logged only once and cannot be moved in the log, 419 * The quotaoff-start-item is logged only once and cannot be moved in the log,
464 * so simply return the lsn at which it's been logged. 420 * so simply return the lsn at which it's been logged.
465 */ 421 */
466/*ARGSUSED*/
467STATIC xfs_lsn_t 422STATIC xfs_lsn_t
468xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) 423xfs_qm_qoff_logitem_committed(
424 struct xfs_log_item *lip,
425 xfs_lsn_t lsn)
469{ 426{
470 return (lsn); 427 return lsn;
471} 428}
472 429
473/* 430/*
474 * There isn't much you can do to push on an quotaoff item. It is simply 431 * There isn't much you can do to push on an quotaoff item. It is simply
475 * stuck waiting for the log to be flushed to disk. 432 * stuck waiting for the log to be flushed to disk.
476 */ 433 */
477/*ARGSUSED*/
478STATIC void 434STATIC void
479xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) 435xfs_qm_qoff_logitem_push(
436 struct xfs_log_item *lip)
480{ 437{
481 return;
482} 438}
483 439
484 440
485/*ARGSUSED*/
486STATIC xfs_lsn_t 441STATIC xfs_lsn_t
487xfs_qm_qoffend_logitem_committed( 442xfs_qm_qoffend_logitem_committed(
488 xfs_qoff_logitem_t *qfe, 443 struct xfs_log_item *lip,
489 xfs_lsn_t lsn) 444 xfs_lsn_t lsn)
490{ 445{
491 xfs_qoff_logitem_t *qfs; 446 struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
492 struct xfs_ail *ailp; 447 struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
448 struct xfs_ail *ailp = qfs->qql_item.li_ailp;
493 449
494 qfs = qfe->qql_start_lip;
495 ailp = qfs->qql_item.li_ailp;
496 spin_lock(&ailp->xa_lock);
497 /* 450 /*
498 * Delete the qoff-start logitem from the AIL. 451 * Delete the qoff-start logitem from the AIL.
499 * xfs_trans_ail_delete() drops the AIL lock. 452 * xfs_trans_ail_delete() drops the AIL lock.
500 */ 453 */
454 spin_lock(&ailp->xa_lock);
501 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); 455 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
456
502 kmem_free(qfs); 457 kmem_free(qfs);
503 kmem_free(qfe); 458 kmem_free(qfe);
504 return (xfs_lsn_t)-1; 459 return (xfs_lsn_t)-1;
@@ -518,82 +473,58 @@ xfs_qm_qoffend_logitem_committed(
518 * (truly makes the quotaoff irrevocable). If we do something else, 473 * (truly makes the quotaoff irrevocable). If we do something else,
519 * then maybe we don't need two. 474 * then maybe we don't need two.
520 */ 475 */
521/* ARGSUSED */
522STATIC void
523xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
524{
525 return;
526}
527
528/* ARGSUSED */
529STATIC void 476STATIC void
530xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) 477xfs_qm_qoff_logitem_committing(
478 struct xfs_log_item *lip,
479 xfs_lsn_t commit_lsn)
531{ 480{
532 return;
533} 481}
534 482
535static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { 483static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
536 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 484 .iop_size = xfs_qm_qoff_logitem_size,
537 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 485 .iop_format = xfs_qm_qoff_logitem_format,
538 xfs_qm_qoff_logitem_format, 486 .iop_pin = xfs_qm_qoff_logitem_pin,
539 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 487 .iop_unpin = xfs_qm_qoff_logitem_unpin,
540 .iop_unpin = (void(*)(xfs_log_item_t* ,int)) 488 .iop_trylock = xfs_qm_qoff_logitem_trylock,
541 xfs_qm_qoff_logitem_unpin, 489 .iop_unlock = xfs_qm_qoff_logitem_unlock,
542 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 490 .iop_committed = xfs_qm_qoffend_logitem_committed,
543 xfs_qm_qoff_logitem_unpin_remove, 491 .iop_push = xfs_qm_qoff_logitem_push,
544 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 492 .iop_committing = xfs_qm_qoff_logitem_committing
545 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
546 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
547 xfs_qm_qoffend_logitem_committed,
548 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
549 .iop_pushbuf = NULL,
550 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
551 xfs_qm_qoffend_logitem_committing
552}; 493};
553 494
554/* 495/*
555 * This is the ops vector shared by all quotaoff-start log items. 496 * This is the ops vector shared by all quotaoff-start log items.
556 */ 497 */
557static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { 498static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
558 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, 499 .iop_size = xfs_qm_qoff_logitem_size,
559 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 500 .iop_format = xfs_qm_qoff_logitem_format,
560 xfs_qm_qoff_logitem_format, 501 .iop_pin = xfs_qm_qoff_logitem_pin,
561 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, 502 .iop_unpin = xfs_qm_qoff_logitem_unpin,
562 .iop_unpin = (void(*)(xfs_log_item_t*, int)) 503 .iop_trylock = xfs_qm_qoff_logitem_trylock,
563 xfs_qm_qoff_logitem_unpin, 504 .iop_unlock = xfs_qm_qoff_logitem_unlock,
564 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) 505 .iop_committed = xfs_qm_qoff_logitem_committed,
565 xfs_qm_qoff_logitem_unpin_remove, 506 .iop_push = xfs_qm_qoff_logitem_push,
566 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, 507 .iop_committing = xfs_qm_qoff_logitem_committing
567 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
568 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
569 xfs_qm_qoff_logitem_committed,
570 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
571 .iop_pushbuf = NULL,
572 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
573 xfs_qm_qoff_logitem_committing
574}; 508};
575 509
576/* 510/*
577 * Allocate and initialize an quotaoff item of the correct quota type(s). 511 * Allocate and initialize an quotaoff item of the correct quota type(s).
578 */ 512 */
579xfs_qoff_logitem_t * 513struct xfs_qoff_logitem *
580xfs_qm_qoff_logitem_init( 514xfs_qm_qoff_logitem_init(
581 struct xfs_mount *mp, 515 struct xfs_mount *mp,
582 xfs_qoff_logitem_t *start, 516 struct xfs_qoff_logitem *start,
583 uint flags) 517 uint flags)
584{ 518{
585 xfs_qoff_logitem_t *qf; 519 struct xfs_qoff_logitem *qf;
586 520
587 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); 521 qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
588 522
589 qf->qql_item.li_type = XFS_LI_QUOTAOFF; 523 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
590 if (start) 524 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
591 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
592 else
593 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
594 qf->qql_item.li_mountp = mp; 525 qf->qql_item.li_mountp = mp;
595 qf->qql_format.qf_type = XFS_LI_QUOTAOFF; 526 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
596 qf->qql_format.qf_flags = flags; 527 qf->qql_format.qf_flags = flags;
597 qf->qql_start_lip = start; 528 qf->qql_start_lip = start;
598 return (qf); 529 return qf;
599} 530}
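xfs_qm_qoff_logitem_init() now picks the ops vector with a single conditional, and the qoffend committed handler above shows why the pairing exists: the start item stays in the AIL until the matching end item commits, at which point both are deleted and freed together. A sketch of that lifetime (plain C, hypothetical names, error handling elided):

#include <stdio.h>
#include <stdlib.h>

struct qoff_item {
	struct qoff_item *start;	/* end item points back at start */
	const char *name;
};

static struct qoff_item *qoff_item_init(struct qoff_item *start,
					const char *name)
{
	struct qoff_item *qf = calloc(1, sizeof(*qf));

	qf->start = start;		/* NULL when this *is* the start */
	qf->name = name;
	return qf;
}

/* Committing the end item retires the whole pair, mirroring
 * xfs_qm_qoffend_logitem_committed() freeing qfs and qfe. */
static void qoff_end_committed(struct qoff_item *end)
{
	printf("retiring %s and %s\n", end->start->name, end->name);
	free(end->start);
	free(end);
}

int main(void)
{
	struct qoff_item *start = qoff_item_init(NULL, "quotaoff-start");
	struct qoff_item *end = qoff_item_init(start, "quotaoff-end");

	qoff_end_committed(end);
	return 0;
}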
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..9a92407109a1 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -23,25 +23,18 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 31#include "xfs_dinode.h"
37#include "xfs_inode.h" 32#include "xfs_inode.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
40#include "xfs_itable.h" 34#include "xfs_itable.h"
41#include "xfs_rtalloc.h" 35#include "xfs_rtalloc.h"
42#include "xfs_error.h" 36#include "xfs_error.h"
43#include "xfs_bmap.h" 37#include "xfs_bmap.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 38#include "xfs_attr.h"
46#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
47#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
@@ -67,12 +60,9 @@ static cred_t xfs_zerocr;
67STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); 60STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
68STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); 61STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
69 62
70STATIC void xfs_qm_freelist_init(xfs_frlist_t *);
71STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *);
72
73STATIC int xfs_qm_init_quotainos(xfs_mount_t *); 63STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
74STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); 64STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
75STATIC int xfs_qm_shake(int, gfp_t); 65STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
76 66
77static struct shrinker xfs_qm_shaker = { 67static struct shrinker xfs_qm_shaker = {
78 .shrink = xfs_qm_shake, 68 .shrink = xfs_qm_shake,
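
The extra struct shrinker * argument added to the callback signature lets a shrinker recover per-instance state from its registration struct; xfs_qm_shake() itself keeps using the global xfs_Gqm, so the parameter goes unused there. A user-space sketch of the container_of() idiom the new signature enables (mock types, hypothetical names):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct shrinker {
        int (*shrink)(struct shrinker *, int nr_to_scan);
};

struct qm_state {
        int free_dquots;
        struct shrinker shaker;         /* embedded registration */
};

static int qm_shake(struct shrinker *shrink, int nr_to_scan)
{
        struct qm_state *qm = container_of(shrink, struct qm_state, shaker);

        if (nr_to_scan > qm->free_dquots)
                nr_to_scan = qm->free_dquots;
        qm->free_dquots -= nr_to_scan;
        return qm->free_dquots;         /* remaining reclaimable objects */
}

int main(void)
{
        struct qm_state qm = { .free_dquots = 32,
                               .shaker = { .shrink = qm_shake } };

        printf("left after shake: %d\n", qm.shaker.shrink(&qm.shaker, 10));
        return 0;
}
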
@@ -84,21 +74,25 @@ extern struct mutex qcheck_lock;
84#endif 74#endif
85 75
86#ifdef QUOTADEBUG 76#ifdef QUOTADEBUG
87#define XQM_LIST_PRINT(l, NXT, title) \ 77static void
88{ \ 78xfs_qm_dquot_list_print(
89 xfs_dquot_t *dqp; int i = 0; \ 79 struct xfs_mount *mp)
90 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 80{
91 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ 81 xfs_dquot_t *dqp;
92 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ 82 int i = 0;
93 "bcnt = %d, icnt = %d, refs = %d", \ 83
94 ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ 84 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
95 DQFLAGTO_TYPESTR(dqp), \ 85 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" "
96 (int) be64_to_cpu(dqp->q_core.d_bcount), \ 86 "bcnt = %lld, icnt = %lld, refs = %d",
97 (int) be64_to_cpu(dqp->q_core.d_icount), \ 87 i++, be32_to_cpu(dqp->q_core.d_id),
98 (int) dqp->q_nrefs); } \ 88 DQFLAGTO_TYPESTR(dqp),
89 (long long)be64_to_cpu(dqp->q_core.d_bcount),
90 (long long)be64_to_cpu(dqp->q_core.d_icount),
91 dqp->q_nrefs);
92 }
99} 93}
100#else 94#else
101#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) 95static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
102#endif 96#endif
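
Turning the XQM_LIST_PRINT macro into a real function also turns the no-debug fallback from an empty do/while macro into an empty static function, so call sites stay unconditional and keep being type-checked when QUOTADEBUG is off. The same pattern in miniature (compile with -DQUOTADEBUG to get output):

#include <stdio.h>

struct mount { int ndquots; };

#ifdef QUOTADEBUG
static void dquot_list_print(struct mount *mp)
{
        printf("%d dquots on the mount list\n", mp->ndquots);
}
#else
static void dquot_list_print(struct mount *mp) { }
#endif

int main(void)
{
        struct mount m = { .ndquots = 3 };

        dquot_list_print(&m);   /* compiles either way */
        return 0;
}
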
103 97
104/* 98/*
@@ -144,7 +138,9 @@ xfs_Gqm_init(void)
144 /* 138 /*
145 * Freelist of all dquots of all file systems 139 * Freelist of all dquots of all file systems
146 */ 140 */
147 xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); 141 INIT_LIST_HEAD(&xqm->qm_dqfrlist);
142 xqm->qm_dqfrlist_cnt = 0;
143 mutex_init(&xqm->qm_dqfrlist_lock);
148 144
149 /* 145 /*
150 * dquot zone. we register our own low-memory callback. 146 * dquot zone. we register our own low-memory callback.
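
This INIT_LIST_HEAD/mutex/counter triple is the template for the whole patch: the hand-rolled xfs_frlist_t (qh_next/qh_prev plus a version and element count) gives way to a standard struct list_head guarded by a separate mutex, with the count kept by hand. A compact user-space mock of the list primitives the converted code relies on, simplified from the include/linux/list.h behavior they assume:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);      /* node now looks "not on any list" */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

struct dquot { int id; struct list_head q_freelist; };

int main(void)
{
        struct list_head frlist;
        struct dquot a = { .id = 1 }, b = { .id = 2 };
        struct list_head *pos, *n;
        int cnt = 0;

        INIT_LIST_HEAD(&frlist);
        list_add(&a.q_freelist, &frlist); cnt++;
        list_add(&b.q_freelist, &frlist); cnt++;

        /* safe iteration: 'n' caches ->next so the entry may be unlinked */
        for (pos = frlist.next, n = pos->next; pos != &frlist;
             pos = n, n = pos->next) {
                struct dquot *dq = list_entry(pos, struct dquot, q_freelist);

                printf("reclaim dquot %d\n", dq->id);
                list_del_init(&dq->q_freelist); cnt--;
        }
        printf("cnt=%d empty=%d\n", cnt, list_empty(&frlist));
        return 0;
}

The cached-next form is exactly what list_for_each_entry_safe expands to, which is why the destroy loop below can free entries while walking.
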
@@ -189,6 +185,7 @@ STATIC void
189xfs_qm_destroy( 185xfs_qm_destroy(
190 struct xfs_qm *xqm) 186 struct xfs_qm *xqm)
191{ 187{
188 struct xfs_dquot *dqp, *n;
192 int hsize, i; 189 int hsize, i;
193 190
194 ASSERT(xqm != NULL); 191 ASSERT(xqm != NULL);
@@ -204,7 +201,21 @@ xfs_qm_destroy(
204 xqm->qm_usr_dqhtable = NULL; 201 xqm->qm_usr_dqhtable = NULL;
205 xqm->qm_grp_dqhtable = NULL; 202 xqm->qm_grp_dqhtable = NULL;
206 xqm->qm_dqhashmask = 0; 203 xqm->qm_dqhashmask = 0;
207 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); 204
205 /* frlist cleanup */
206 mutex_lock(&xqm->qm_dqfrlist_lock);
207 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
208 xfs_dqlock(dqp);
209#ifdef QUOTADEBUG
210 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
211#endif
212 list_del_init(&dqp->q_freelist);
213 xfs_Gqm->qm_dqfrlist_cnt--;
214 xfs_dqunlock(dqp);
215 xfs_qm_dqdestroy(dqp);
216 }
217 mutex_unlock(&xqm->qm_dqfrlist_lock);
218 mutex_destroy(&xqm->qm_dqfrlist_lock);
208#ifdef DEBUG 219#ifdef DEBUG
209 mutex_destroy(&qcheck_lock); 220 mutex_destroy(&qcheck_lock);
210#endif 221#endif
@@ -231,8 +242,10 @@ xfs_qm_hold_quotafs_ref(
231 242
232 if (!xfs_Gqm) { 243 if (!xfs_Gqm) {
233 xfs_Gqm = xfs_Gqm_init(); 244 xfs_Gqm = xfs_Gqm_init();
234 if (!xfs_Gqm) 245 if (!xfs_Gqm) {
246 mutex_unlock(&xfs_Gqm_lock);
235 return ENOMEM; 247 return ENOMEM;
248 }
236 } 249 }
237 250
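
The two lines added above close a lock leak in xfs_qm_hold_quotafs_ref(): on the ENOMEM path, xfs_Gqm_lock was previously left held. The shape of the fix, sketched with a pthread mutex standing in for the kernel mutex (all names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static void *g_state;

static void *init_state(void) { return NULL; }  /* pretend allocation fails */

static int hold_ref(void)
{
        pthread_mutex_lock(&g_lock);
        if (!g_state) {
                g_state = init_state();
                if (!g_state) {
                        pthread_mutex_unlock(&g_lock);  /* the added unlock */
                        return ENOMEM;
                }
        }
        /* ... take the reference ... */
        pthread_mutex_unlock(&g_lock);
        return 0;
}

int main(void)
{
        printf("hold_ref() = %d\n", hold_ref());
        return 0;
}
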
238 /* 251 /*
@@ -256,7 +269,7 @@ STATIC void
256xfs_qm_rele_quotafs_ref( 269xfs_qm_rele_quotafs_ref(
257 struct xfs_mount *mp) 270 struct xfs_mount *mp)
258{ 271{
259 xfs_dquot_t *dqp, *nextdqp; 272 xfs_dquot_t *dqp, *n;
260 273
261 ASSERT(xfs_Gqm); 274 ASSERT(xfs_Gqm);
262 ASSERT(xfs_Gqm->qm_nrefs > 0); 275 ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +277,24 @@ xfs_qm_rele_quotafs_ref(
264 /* 277 /*
265 * Go thru the freelist and destroy all inactive dquots. 278 * Go thru the freelist and destroy all inactive dquots.
266 */ 279 */
267 xfs_qm_freelist_lock(xfs_Gqm); 280 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
268 281
269 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 282 list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
270 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
271 xfs_dqlock(dqp); 283 xfs_dqlock(dqp);
272 nextdqp = dqp->dq_flnext;
273 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 284 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
274 ASSERT(dqp->q_mount == NULL); 285 ASSERT(dqp->q_mount == NULL);
275 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 286 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
276 ASSERT(dqp->HL_PREVP == NULL); 287 ASSERT(list_empty(&dqp->q_hashlist));
277 ASSERT(dqp->MPL_PREVP == NULL); 288 ASSERT(list_empty(&dqp->q_mplist));
278 XQM_FREELIST_REMOVE(dqp); 289 list_del_init(&dqp->q_freelist);
290 xfs_Gqm->qm_dqfrlist_cnt--;
279 xfs_dqunlock(dqp); 291 xfs_dqunlock(dqp);
280 xfs_qm_dqdestroy(dqp); 292 xfs_qm_dqdestroy(dqp);
281 } else { 293 } else {
282 xfs_dqunlock(dqp); 294 xfs_dqunlock(dqp);
283 } 295 }
284 dqp = nextdqp;
285 } 296 }
286 xfs_qm_freelist_unlock(xfs_Gqm); 297 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
287 298
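
Note how the membership checks translate in this hunk: "not on a list" used to be HL_PREVP == NULL or MPL_PREVP == NULL, and becomes list_empty(&dqp->q_hashlist) / list_empty(&dqp->q_mplist). That only works because removals use list_del_init(), which points the node back at itself; a plain unlink would leave list_empty() false. A small sketch of the distinction (list ops simplified from include/linux/list.h):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *e, struct list_head *head)
{
        e->next = head->next;
        e->prev = head;
        head->next->prev = e;
        head->next = e;
}

static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        INIT_LIST_HEAD(e);      /* list_empty(e) is now true */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
        struct list_head head, node;

        INIT_LIST_HEAD(&head);
        list_add(&node, &head);
        printf("on list:  empty(&node)=%d\n", list_empty(&node));  /* 0 */
        list_del_init(&node);
        printf("off list: empty(&node)=%d\n", list_empty(&node));  /* 1 */
        return 0;
}
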
288 /* 299 /*
289 * Destroy the entire XQM. If somebody mounts with quotaon, this'll 300 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +316,7 @@ xfs_qm_unmount(
305 struct xfs_mount *mp) 316 struct xfs_mount *mp)
306{ 317{
307 if (mp->m_quotainfo) { 318 if (mp->m_quotainfo) {
308 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 319 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
309 xfs_qm_destroy_quotainfo(mp); 320 xfs_qm_destroy_quotainfo(mp);
310 } 321 }
311} 322}
@@ -449,20 +460,21 @@ xfs_qm_unmount_quotas(
449 */ 460 */
450STATIC int 461STATIC int
451xfs_qm_dqflush_all( 462xfs_qm_dqflush_all(
452 xfs_mount_t *mp, 463 struct xfs_mount *mp,
453 int sync_mode) 464 int sync_mode)
454{ 465{
455 int recl; 466 struct xfs_quotainfo *q = mp->m_quotainfo;
456 xfs_dquot_t *dqp; 467 int recl;
457 int niters; 468 struct xfs_dquot *dqp;
458 int error; 469 int niters;
470 int error;
459 471
460 if (mp->m_quotainfo == NULL) 472 if (!q)
461 return 0; 473 return 0;
462 niters = 0; 474 niters = 0;
463again: 475again:
464 xfs_qm_mplist_lock(mp); 476 mutex_lock(&q->qi_dqlist_lock);
465 FOREACH_DQUOT_IN_MP(dqp, mp) { 477 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
466 xfs_dqlock(dqp); 478 xfs_dqlock(dqp);
467 if (! XFS_DQ_IS_DIRTY(dqp)) { 479 if (! XFS_DQ_IS_DIRTY(dqp)) {
468 xfs_dqunlock(dqp); 480 xfs_dqunlock(dqp);
@@ -470,7 +482,7 @@ again:
470 } 482 }
471 483
472 /* XXX a sentinel would be better */ 484 /* XXX a sentinel would be better */
473 recl = XFS_QI_MPLRECLAIMS(mp); 485 recl = q->qi_dqreclaims;
474 if (!xfs_dqflock_nowait(dqp)) { 486 if (!xfs_dqflock_nowait(dqp)) {
475 /* 487 /*
476 * If we can't grab the flush lock then check 488 * If we can't grab the flush lock then check
@@ -485,21 +497,21 @@ again:
485 * Let go of the mplist lock. We don't want to hold it 497 * Let go of the mplist lock. We don't want to hold it
486 * across a disk write. 498 * across a disk write.
487 */ 499 */
488 xfs_qm_mplist_unlock(mp); 500 mutex_unlock(&q->qi_dqlist_lock);
489 error = xfs_qm_dqflush(dqp, sync_mode); 501 error = xfs_qm_dqflush(dqp, sync_mode);
490 xfs_dqunlock(dqp); 502 xfs_dqunlock(dqp);
491 if (error) 503 if (error)
492 return error; 504 return error;
493 505
494 xfs_qm_mplist_lock(mp); 506 mutex_lock(&q->qi_dqlist_lock);
495 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 507 if (recl != q->qi_dqreclaims) {
496 xfs_qm_mplist_unlock(mp); 508 mutex_unlock(&q->qi_dqlist_lock);
497 /* XXX restart limit */ 509 /* XXX restart limit */
498 goto again; 510 goto again;
499 } 511 }
500 } 512 }
501 513
502 xfs_qm_mplist_unlock(mp); 514 mutex_unlock(&q->qi_dqlist_lock);
503 /* return ! busy */ 515 /* return ! busy */
504 return 0; 516 return 0;
505} 517}
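
The control flow of xfs_qm_dqflush_all() survives the lock conversion: qi_dqreclaims is a generation counter ("a change here indicates a removal in the dqlist", per the header), sampled into recl before the list lock is dropped around the disk write and compared afterwards, restarting the walk if the list may have changed under us. The restart discipline in a single-threaded user-space sketch (no real locking or I/O):

#include <stdio.h>

#define NITEMS 4

static int dirty[NITEMS] = { 1, 0, 1, 1 };
static int generation;          /* bumped on any list removal */

/* Pretend something raced with us and removed an entry mid-flush. */
static void flush_to_disk(int i)
{
        if (i == 2)
                generation++;
}

static int flush_all(void)
{
        int i, recl, restarts = 0;

again:
        /* "lock" held from here ... */
        for (i = 0; i < NITEMS; i++) {
                if (!dirty[i])
                        continue;
                recl = generation;      /* sample before dropping the lock */
                /* ... "unlock", do I/O, "relock" ... */
                flush_to_disk(i);
                dirty[i] = 0;
                if (recl != generation) {
                        if (++restarts > 8)
                                return -1;      /* XXX restart limit */
                        goto again;             /* iterator may be stale */
                }
        }
        return restarts;
}

int main(void)
{
        printf("restarts = %d\n", flush_all());
        return 0;
}

Restarting from the head is the simple, conservative answer to iterating without a stable cursor; the "XXX a sentinel would be better" comments above acknowledge the cost.
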
@@ -509,15 +521,15 @@ again:
509 */ 521 */
510STATIC void 522STATIC void
511xfs_qm_detach_gdquots( 523xfs_qm_detach_gdquots(
512 xfs_mount_t *mp) 524 struct xfs_mount *mp)
513{ 525{
514 xfs_dquot_t *dqp, *gdqp; 526 struct xfs_quotainfo *q = mp->m_quotainfo;
515 int nrecl; 527 struct xfs_dquot *dqp, *gdqp;
528 int nrecl;
516 529
517 again: 530 again:
518 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 531 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
519 dqp = XFS_QI_MPLNEXT(mp); 532 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
520 while (dqp) {
521 xfs_dqlock(dqp); 533 xfs_dqlock(dqp);
522 if ((gdqp = dqp->q_gdquot)) { 534 if ((gdqp = dqp->q_gdquot)) {
523 xfs_dqlock(gdqp); 535 xfs_dqlock(gdqp);
@@ -530,15 +542,14 @@ xfs_qm_detach_gdquots(
530 * Can't hold the mplist lock across a dqput. 542 * Can't hold the mplist lock across a dqput.
531 * XXX must convert to marker-based iterations here. 543 * XXX must convert to marker-based iterations here.
532 */ 544 */
533 nrecl = XFS_QI_MPLRECLAIMS(mp); 545 nrecl = q->qi_dqreclaims;
534 xfs_qm_mplist_unlock(mp); 546 mutex_unlock(&q->qi_dqlist_lock);
535 xfs_qm_dqput(gdqp); 547 xfs_qm_dqput(gdqp);
536 548
537 xfs_qm_mplist_lock(mp); 549 mutex_lock(&q->qi_dqlist_lock);
538 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) 550 if (nrecl != q->qi_dqreclaims)
539 goto again; 551 goto again;
540 } 552 }
541 dqp = dqp->MPL_NEXT;
542 } 553 }
543} 554}
544 555
@@ -550,23 +561,23 @@ xfs_qm_detach_gdquots(
550 */ 561 */
551STATIC int 562STATIC int
552xfs_qm_dqpurge_int( 563xfs_qm_dqpurge_int(
553 xfs_mount_t *mp, 564 struct xfs_mount *mp,
554 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ 565 uint flags)
555{ 566{
556 xfs_dquot_t *dqp; 567 struct xfs_quotainfo *q = mp->m_quotainfo;
557 uint dqtype; 568 struct xfs_dquot *dqp, *n;
558 int nrecl; 569 uint dqtype;
559 xfs_dquot_t *nextdqp; 570 int nrecl;
560 int nmisses; 571 int nmisses;
561 572
562 if (mp->m_quotainfo == NULL) 573 if (!q)
563 return 0; 574 return 0;
564 575
565 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; 576 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
566 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; 577 dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
567 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; 578 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
568 579
569 xfs_qm_mplist_lock(mp); 580 mutex_lock(&q->qi_dqlist_lock);
570 581
571 /* 582 /*
572 * In the first pass through all incore dquots of this filesystem, 583 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +589,25 @@ xfs_qm_dqpurge_int(
578 589
579 again: 590 again:
580 nmisses = 0; 591 nmisses = 0;
581 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 592 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
582 /* 593 /*
583 * Try to get rid of all of the unwanted dquots. The idea is to 594 * Try to get rid of all of the unwanted dquots. The idea is to
584 * get them off mplist and hashlist, but leave them on freelist. 595 * get them off mplist and hashlist, but leave them on freelist.
585 */ 596 */
586 dqp = XFS_QI_MPLNEXT(mp); 597 list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
587 while (dqp) {
588 /* 598 /*
589 * It's OK to look at the type without taking dqlock here. 599 * It's OK to look at the type without taking dqlock here.
590 * We're holding the mplist lock here, and that's needed for 600 * We're holding the mplist lock here, and that's needed for
591 * a dqreclaim. 601 * a dqreclaim.
592 */ 602 */
593 if ((dqp->dq_flags & dqtype) == 0) { 603 if ((dqp->dq_flags & dqtype) == 0)
594 dqp = dqp->MPL_NEXT;
595 continue; 604 continue;
596 }
597 605
598 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 606 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
599 nrecl = XFS_QI_MPLRECLAIMS(mp); 607 nrecl = q->qi_dqreclaims;
600 xfs_qm_mplist_unlock(mp); 608 mutex_unlock(&q->qi_dqlist_lock);
601 mutex_lock(&dqp->q_hash->qh_lock); 609 mutex_lock(&dqp->q_hash->qh_lock);
602 xfs_qm_mplist_lock(mp); 610 mutex_lock(&q->qi_dqlist_lock);
603 611
604 /* 612 /*
605 * XXXTheoretically, we can get into a very long 613 * XXXTheoretically, we can get into a very long
@@ -607,7 +615,7 @@ xfs_qm_dqpurge_int(
607 * No one can be adding dquots to the mplist at 615 * No one can be adding dquots to the mplist at
608 * this point, but somebody might be taking things off. 616 * this point, but somebody might be taking things off.
609 */ 617 */
610 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { 618 if (nrecl != q->qi_dqreclaims) {
611 mutex_unlock(&dqp->q_hash->qh_lock); 619 mutex_unlock(&dqp->q_hash->qh_lock);
612 goto again; 620 goto again;
613 } 621 }
@@ -617,11 +625,9 @@ xfs_qm_dqpurge_int(
617 * Take the dquot off the mplist and hashlist. It may remain on 625 * Take the dquot off the mplist and hashlist. It may remain on
618 * freelist in INACTIVE state. 626 * freelist in INACTIVE state.
619 */ 627 */
620 nextdqp = dqp->MPL_NEXT;
621 nmisses += xfs_qm_dqpurge(dqp); 628 nmisses += xfs_qm_dqpurge(dqp);
622 dqp = nextdqp;
623 } 629 }
624 xfs_qm_mplist_unlock(mp); 630 mutex_unlock(&q->qi_dqlist_lock);
625 return nmisses; 631 return nmisses;
626} 632}
627 633
@@ -921,12 +927,13 @@ xfs_qm_dqdetach(
921 927
922int 928int
923xfs_qm_sync( 929xfs_qm_sync(
924 xfs_mount_t *mp, 930 struct xfs_mount *mp,
925 int flags) 931 int flags)
926{ 932{
927 int recl, restarts; 933 struct xfs_quotainfo *q = mp->m_quotainfo;
928 xfs_dquot_t *dqp; 934 int recl, restarts;
929 int error; 935 struct xfs_dquot *dqp;
936 int error;
930 937
931 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) 938 if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
932 return 0; 939 return 0;
@@ -934,18 +941,19 @@ xfs_qm_sync(
934 restarts = 0; 941 restarts = 0;
935 942
936 again: 943 again:
937 xfs_qm_mplist_lock(mp); 944 mutex_lock(&q->qi_dqlist_lock);
938 /* 945 /*
939 * dqpurge_all() also takes the mplist lock and iterates through all dquots 946 * dqpurge_all() also takes the mplist lock and iterates through all dquots
940 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared 947 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
941 * when we have the mplist lock, we know that dquots will be consistent 948 * when we have the mplist lock, we know that dquots will be consistent
942 * as long as we have it locked. 949 * as long as we have it locked.
943 */ 950 */
944 if (! XFS_IS_QUOTA_ON(mp)) { 951 if (!XFS_IS_QUOTA_ON(mp)) {
945 xfs_qm_mplist_unlock(mp); 952 mutex_unlock(&q->qi_dqlist_lock);
946 return 0; 953 return 0;
947 } 954 }
948 FOREACH_DQUOT_IN_MP(dqp, mp) { 955 ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
956 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
949 /* 957 /*
950 * If this is vfs_sync calling, then skip the dquots that 958 * If this is vfs_sync calling, then skip the dquots that
951 * don't 'seem' to be dirty. i.e. don't acquire dqlock. 959 * don't 'seem' to be dirty. i.e. don't acquire dqlock.
@@ -969,7 +977,7 @@ xfs_qm_sync(
969 } 977 }
970 978
971 /* XXX a sentinel would be better */ 979 /* XXX a sentinel would be better */
972 recl = XFS_QI_MPLRECLAIMS(mp); 980 recl = q->qi_dqreclaims;
973 if (!xfs_dqflock_nowait(dqp)) { 981 if (!xfs_dqflock_nowait(dqp)) {
974 if (flags & SYNC_TRYLOCK) { 982 if (flags & SYNC_TRYLOCK) {
975 xfs_dqunlock(dqp); 983 xfs_dqunlock(dqp);
@@ -989,7 +997,7 @@ xfs_qm_sync(
989 * Let go of the mplist lock. We don't want to hold it 997 * Let go of the mplist lock. We don't want to hold it
990 * across a disk write 998 * across a disk write
991 */ 999 */
992 xfs_qm_mplist_unlock(mp); 1000 mutex_unlock(&q->qi_dqlist_lock);
993 error = xfs_qm_dqflush(dqp, flags); 1001 error = xfs_qm_dqflush(dqp, flags);
994 xfs_dqunlock(dqp); 1002 xfs_dqunlock(dqp);
995 if (error && XFS_FORCED_SHUTDOWN(mp)) 1003 if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1005,17 @@ xfs_qm_sync(
997 else if (error) 1005 else if (error)
998 return error; 1006 return error;
999 1007
1000 xfs_qm_mplist_lock(mp); 1008 mutex_lock(&q->qi_dqlist_lock);
1001 if (recl != XFS_QI_MPLRECLAIMS(mp)) { 1009 if (recl != q->qi_dqreclaims) {
1002 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) 1010 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1003 break; 1011 break;
1004 1012
1005 xfs_qm_mplist_unlock(mp); 1013 mutex_unlock(&q->qi_dqlist_lock);
1006 goto again; 1014 goto again;
1007 } 1015 }
1008 } 1016 }
1009 1017
1010 xfs_qm_mplist_unlock(mp); 1018 mutex_unlock(&q->qi_dqlist_lock);
1011 return 0; 1019 return 0;
1012} 1020}
1013 1021
@@ -1052,8 +1060,9 @@ xfs_qm_init_quotainfo(
1052 return error; 1060 return error;
1053 } 1061 }
1054 1062
1055 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1063 INIT_LIST_HEAD(&qinf->qi_dqlist);
1056 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); 1064 mutex_init(&qinf->qi_dqlist_lock);
1065 lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
1057 1066
1058 qinf->qi_dqreclaims = 0; 1067 qinf->qi_dqreclaims = 0;
1059 1068
@@ -1150,7 +1159,8 @@ xfs_qm_destroy_quotainfo(
1150 */ 1159 */
1151 xfs_qm_rele_quotafs_ref(mp); 1160 xfs_qm_rele_quotafs_ref(mp);
1152 1161
1153 xfs_qm_list_destroy(&qi->qi_dqlist); 1162 ASSERT(list_empty(&qi->qi_dqlist));
1163 mutex_destroy(&qi->qi_dqlist_lock);
1154 1164
1155 if (qi->qi_uquotaip) { 1165 if (qi->qi_uquotaip) {
1156 IRELE(qi->qi_uquotaip); 1166 IRELE(qi->qi_uquotaip);
@@ -1177,7 +1187,7 @@ xfs_qm_list_init(
1177 int n) 1187 int n)
1178{ 1188{
1179 mutex_init(&list->qh_lock); 1189 mutex_init(&list->qh_lock);
1180 list->qh_next = NULL; 1190 INIT_LIST_HEAD(&list->qh_list);
1181 list->qh_version = 0; 1191 list->qh_version = 0;
1182 list->qh_nelems = 0; 1192 list->qh_nelems = 0;
1183} 1193}
@@ -1316,9 +1326,6 @@ xfs_qm_qino_alloc(
1316 */ 1326 */
1317 spin_lock(&mp->m_sb_lock); 1327 spin_lock(&mp->m_sb_lock);
1318 if (flags & XFS_QMOPT_SBVERSION) { 1328 if (flags & XFS_QMOPT_SBVERSION) {
1319#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1320 unsigned oldv = mp->m_sb.sb_versionnum;
1321#endif
1322 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 1329 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
1323 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1330 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1324 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 1331 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1338,6 @@ xfs_qm_qino_alloc(
1331 1338
1332 /* qflags will get updated _after_ quotacheck */ 1339 /* qflags will get updated _after_ quotacheck */
1333 mp->m_sb.sb_qflags = 0; 1340 mp->m_sb.sb_qflags = 0;
1334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1335 cmn_err(CE_NOTE,
1336 "Old superblock version %x, converting to %x.",
1337 oldv, mp->m_sb.sb_versionnum);
1338#endif
1339 } 1341 }
1340 if (flags & XFS_QMOPT_UQUOTA) 1342 if (flags & XFS_QMOPT_UQUOTA)
1341 mp->m_sb.sb_uquotino = (*ip)->i_ino; 1343 mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1373,10 @@ xfs_qm_reset_dqcounts(
1371#ifdef DEBUG 1373#ifdef DEBUG
1372 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); 1374 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1373 do_div(j, sizeof(xfs_dqblk_t)); 1375 do_div(j, sizeof(xfs_dqblk_t));
1374 ASSERT(XFS_QM_DQPERBLK(mp) == j); 1376 ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
1375#endif 1377#endif
1376 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); 1378 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1377 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { 1379 for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
1378 /* 1380 /*
1379 * Do a sanity check, and if needed, repair the dqblk. Don't 1381 * Do a sanity check, and if needed, repair the dqblk. Don't
1380 * output any warnings because it's perfectly possible to 1382 * output any warnings because it's perfectly possible to
@@ -1429,7 +1431,7 @@ xfs_qm_dqiter_bufs(
1429 while (blkcnt--) { 1431 while (blkcnt--) {
1430 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1432 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1431 XFS_FSB_TO_DADDR(mp, bno), 1433 XFS_FSB_TO_DADDR(mp, bno),
1432 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); 1434 mp->m_quotainfo->qi_dqchunklen, 0, &bp);
1433 if (error) 1435 if (error)
1434 break; 1436 break;
1435 1437
@@ -1439,7 +1441,7 @@ xfs_qm_dqiter_bufs(
1439 * goto the next block. 1441 * goto the next block.
1440 */ 1442 */
1441 bno++; 1443 bno++;
1442 firstid += XFS_QM_DQPERBLK(mp); 1444 firstid += mp->m_quotainfo->qi_dqperchunk;
1443 } 1445 }
1444 return error; 1446 return error;
1445} 1447}
@@ -1488,7 +1490,7 @@ xfs_qm_dqiterate(
1488 maxlblkcnt - lblkno, 1490 maxlblkcnt - lblkno,
1489 XFS_BMAPI_METADATA, 1491 XFS_BMAPI_METADATA,
1490 NULL, 1492 NULL,
1491 0, map, &nmaps, NULL, NULL); 1493 0, map, &nmaps, NULL);
1492 xfs_iunlock(qip, XFS_ILOCK_SHARED); 1494 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1493 if (error) 1495 if (error)
1494 break; 1496 break;
@@ -1505,7 +1507,7 @@ xfs_qm_dqiterate(
1505 continue; 1507 continue;
1506 1508
1507 firstid = (xfs_dqid_t) map[i].br_startoff * 1509 firstid = (xfs_dqid_t) map[i].br_startoff *
1508 XFS_QM_DQPERBLK(mp); 1510 mp->m_quotainfo->qi_dqperchunk;
1509 /* 1511 /*
1510 * Do a read-ahead on the next extent. 1512 * Do a read-ahead on the next extent.
1511 */ 1513 */
@@ -1516,7 +1518,7 @@ xfs_qm_dqiterate(
1516 while (rablkcnt--) { 1518 while (rablkcnt--) {
1517 xfs_baread(mp->m_ddev_targp, 1519 xfs_baread(mp->m_ddev_targp,
1518 XFS_FSB_TO_DADDR(mp, rablkno), 1520 XFS_FSB_TO_DADDR(mp, rablkno),
1519 (int)XFS_QI_DQCHUNKLEN(mp)); 1521 mp->m_quotainfo->qi_dqchunklen);
1520 rablkno++; 1522 rablkno++;
1521 } 1523 }
1522 } 1524 }
@@ -1576,8 +1578,10 @@ xfs_qm_quotacheck_dqadjust(
1576 1578
1577 /* 1579 /*
1578 * Set default limits, adjust timers (since we changed usages) 1580 * Set default limits, adjust timers (since we changed usages)
1581 *
1582 * There are no timers for the default values set in the root dquot.
1579 */ 1583 */
1580 if (! XFS_IS_SUSER_DQUOT(dqp)) { 1584 if (dqp->q_core.d_id) {
1581 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); 1585 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1582 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); 1586 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1583 } 1587 }
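
The new test above open-codes the old XFS_IS_SUSER_DQUOT() check: id 0 is the root dquot, which stores the filesystem-wide default limits and, as the added comment says, has no timers, so limit and timer adjustment are skipped for it. In outline (abbreviated fields, hypothetical helpers):

#include <stdio.h>
#include <stdint.h>

struct dquot { uint32_t d_id; long btimer; };

static void adjust_timers(struct dquot *dq) { dq->btimer = 42; }

static void quotacheck_adjust(struct dquot *dq)
{
        /*
         * id 0 is the root dquot holding the default limits;
         * it has no timers to adjust.
         */
        if (dq->d_id)
                adjust_timers(dq);
}

int main(void)
{
        struct dquot root = { 0, 0 }, user = { 501, 0 };

        quotacheck_adjust(&root);
        quotacheck_adjust(&user);
        printf("root timer=%ld, user timer=%ld\n", root.btimer, user.btimer);
        return 0;
}
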
@@ -1621,10 +1625,7 @@ xfs_qm_dqusage_adjust(
1621 xfs_ino_t ino, /* inode number to get data for */ 1625 xfs_ino_t ino, /* inode number to get data for */
1622 void __user *buffer, /* not used */ 1626 void __user *buffer, /* not used */
1623 int ubsize, /* not used */ 1627 int ubsize, /* not used */
1624 void *private_data, /* not used */
1625 xfs_daddr_t bno, /* starting block of inode cluster */
1626 int *ubused, /* not used */ 1628 int *ubused, /* not used */
1627 void *dip, /* on-disk inode pointer (not used) */
1628 int *res) /* result code value */ 1629 int *res) /* result code value */
1629{ 1630{
1630 xfs_inode_t *ip; 1631 xfs_inode_t *ip;
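
The bulkstat callback loses three arguments here: private_data, the starting-block hint bno, and the on-disk inode pointer dip, leaving only what xfs_qm_dqusage_adjust() actually consumes. The underlying shape, an iterator driving a fixed-signature per-item callback that reports back through a result code, in a toy version (hypothetical names):

#include <stdio.h>

enum { RV_DIDONE, RV_NOTHING, RV_GIVEUP };

typedef int (*bulk_fn)(int ino, int *res);

/* Iterate inode numbers, stopping early if the callback gives up. */
static int bulkstat(int first, int last, bulk_fn fn)
{
        int ino, res, error;

        for (ino = first; ino <= last; ino++) {
                error = fn(ino, &res);
                if (error || res == RV_GIVEUP)
                        return error ? error : -1;
        }
        return 0;
}

static int usage_adjust(int ino, int *res)
{
        if (ino % 2 == 0) {             /* toy rule: only even inodes count */
                printf("adjusted usage for inode %d\n", ino);
                *res = RV_DIDONE;
        } else {
                *res = RV_NOTHING;      /* skipped; iteration continues */
        }
        return 0;
}

int main(void)
{
        return bulkstat(1, 6, usage_adjust);
}
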
@@ -1649,7 +1650,7 @@ xfs_qm_dqusage_adjust(
1649 * the case in all other instances. It's OK that we do this because 1650 * the case in all other instances. It's OK that we do this because
1650 * quotacheck is done only at mount time. 1651 * quotacheck is done only at mount time.
1651 */ 1652 */
1652 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { 1653 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) {
1653 *res = BULKSTAT_RV_NOTHING; 1654 *res = BULKSTAT_RV_NOTHING;
1654 return error; 1655 return error;
1655 } 1656 }
@@ -1661,7 +1662,8 @@ xfs_qm_dqusage_adjust(
1661 * making us disable quotas for the file system. 1662 * making us disable quotas for the file system.
1662 */ 1663 */
1663 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { 1664 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1664 xfs_iput(ip, XFS_ILOCK_EXCL); 1665 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1666 IRELE(ip);
1665 *res = BULKSTAT_RV_GIVEUP; 1667 *res = BULKSTAT_RV_GIVEUP;
1666 return error; 1668 return error;
1667 } 1669 }
@@ -1674,7 +1676,8 @@ xfs_qm_dqusage_adjust(
1674 * Walk thru the extent list and count the realtime blocks. 1676 * Walk thru the extent list and count the realtime blocks.
1675 */ 1677 */
1676 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { 1678 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
1677 xfs_iput(ip, XFS_ILOCK_EXCL); 1679 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1680 IRELE(ip);
1678 if (udqp) 1681 if (udqp)
1679 xfs_qm_dqput(udqp); 1682 xfs_qm_dqput(udqp);
1680 if (gdqp) 1683 if (gdqp)
@@ -1747,14 +1750,14 @@ xfs_qm_quotacheck(
1747 lastino = 0; 1750 lastino = 0;
1748 flags = 0; 1751 flags = 0;
1749 1752
1750 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); 1753 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
1751 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1754 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1752 1755
1753 /* 1756 /*
1754 * There should be no cached dquots. The (simplistic) quotacheck 1757 * There should be no cached dquots. The (simplistic) quotacheck
1755 * algorithm doesn't like that. 1758 * algorithm doesn't like that.
1756 */ 1759 */
1757 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); 1760 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1758 1761
1759 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1762 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1760 1763
@@ -1763,15 +1766,19 @@ xfs_qm_quotacheck(
1763 * their counters to zero. We need a clean slate. 1766 * their counters to zero. We need a clean slate.
1764 * We don't log our changes till later. 1767 * We don't log our changes till later.
1765 */ 1768 */
1766 if ((uip = XFS_QI_UQIP(mp))) { 1769 uip = mp->m_quotainfo->qi_uquotaip;
1767 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) 1770 if (uip) {
1771 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
1772 if (error)
1768 goto error_return; 1773 goto error_return;
1769 flags |= XFS_UQUOTA_CHKD; 1774 flags |= XFS_UQUOTA_CHKD;
1770 } 1775 }
1771 1776
1772 if ((gip = XFS_QI_GQIP(mp))) { 1777 gip = mp->m_quotainfo->qi_gquotaip;
1773 if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1778 if (gip) {
1774 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) 1779 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1780 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1781 if (error)
1775 goto error_return; 1782 goto error_return;
1776 flags |= XFS_OQUOTA_CHKD; 1783 flags |= XFS_OQUOTA_CHKD;
1777 } 1784 }
@@ -1781,12 +1788,13 @@ xfs_qm_quotacheck(
1781 * Iterate thru all the inodes in the file system, 1788 * Iterate thru all the inodes in the file system,
1782 * adjusting the corresponding dquot counters in core. 1789 * adjusting the corresponding dquot counters in core.
1783 */ 1790 */
1784 if ((error = xfs_bulkstat(mp, &lastino, &count, 1791 error = xfs_bulkstat(mp, &lastino, &count,
1785 xfs_qm_dqusage_adjust, NULL, 1792 xfs_qm_dqusage_adjust,
1786 structsz, NULL, BULKSTAT_FG_IGET, &done))) 1793 structsz, NULL, &done);
1794 if (error)
1787 break; 1795 break;
1788 1796
1789 } while (! done); 1797 } while (!done);
1790 1798
1791 /* 1799 /*
1792 * We've made all the changes that we need to make incore. 1800 * We've made all the changes that we need to make incore.
@@ -1804,7 +1812,7 @@ xfs_qm_quotacheck(
1804 * at this point (because we intentionally didn't in dqget_noattach). 1812 * at this point (because we intentionally didn't in dqget_noattach).
1805 */ 1813 */
1806 if (error) { 1814 if (error) {
1807 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1815 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
1808 goto error_return; 1816 goto error_return;
1809 } 1817 }
1810 1818
@@ -1825,7 +1833,7 @@ xfs_qm_quotacheck(
1825 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); 1833 mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
1826 mp->m_qflags |= flags; 1834 mp->m_qflags |= flags;
1827 1835
1828 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); 1836 xfs_qm_dquot_list_print(mp);
1829 1837
1830 error_return: 1838 error_return:
1831 if (error) { 1839 if (error) {
@@ -1874,14 +1882,14 @@ xfs_qm_init_quotainos(
1874 mp->m_sb.sb_uquotino != NULLFSINO) { 1882 mp->m_sb.sb_uquotino != NULLFSINO) {
1875 ASSERT(mp->m_sb.sb_uquotino > 0); 1883 ASSERT(mp->m_sb.sb_uquotino > 0);
1876 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1884 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1877 0, 0, &uip, 0))) 1885 0, 0, &uip)))
1878 return XFS_ERROR(error); 1886 return XFS_ERROR(error);
1879 } 1887 }
1880 if (XFS_IS_OQUOTA_ON(mp) && 1888 if (XFS_IS_OQUOTA_ON(mp) &&
1881 mp->m_sb.sb_gquotino != NULLFSINO) { 1889 mp->m_sb.sb_gquotino != NULLFSINO) {
1882 ASSERT(mp->m_sb.sb_gquotino > 0); 1890 ASSERT(mp->m_sb.sb_gquotino > 0);
1883 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1891 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1884 0, 0, &gip, 0))) { 1892 0, 0, &gip))) {
1885 if (uip) 1893 if (uip)
1886 IRELE(uip); 1894 IRELE(uip);
1887 return XFS_ERROR(error); 1895 return XFS_ERROR(error);
@@ -1920,59 +1928,53 @@ xfs_qm_init_quotainos(
1920 } 1928 }
1921 } 1929 }
1922 1930
1923 XFS_QI_UQIP(mp) = uip; 1931 mp->m_quotainfo->qi_uquotaip = uip;
1924 XFS_QI_GQIP(mp) = gip; 1932 mp->m_quotainfo->qi_gquotaip = gip;
1925 1933
1926 return 0; 1934 return 0;
1927} 1935}
1928 1936
1929 1937
1938
1930/* 1939/*
1931 * Traverse the freelist of dquots and attempt to reclaim a maximum of 1940 * Just pop the least recently used dquot off the freelist and
1932 * 'howmany' dquots. This operation races with dqlookup(), and attempts to 1941 * recycle it. The returned dquot is locked.
1933 * favor the lookup function ...
1934 * XXXsup merge this with qm_reclaim_one().
1935 */ 1942 */
1936STATIC int 1943STATIC xfs_dquot_t *
1937xfs_qm_shake_freelist( 1944xfs_qm_dqreclaim_one(void)
1938 int howmany)
1939{ 1945{
1940 int nreclaimed; 1946 xfs_dquot_t *dqpout;
1941 xfs_dqhash_t *hash; 1947 xfs_dquot_t *dqp;
1942 xfs_dquot_t *dqp, *nextdqp;
1943 int restarts; 1948 int restarts;
1944 int nflushes;
1945
1946 if (howmany <= 0)
1947 return 0;
1948 1949
1949 nreclaimed = 0;
1950 restarts = 0; 1950 restarts = 0;
1951 nflushes = 0; 1951 dqpout = NULL;
1952 1952
1953#ifdef QUOTADEBUG 1953 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
1954 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); 1954startagain:
1955#endif 1955 mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
1956 /* lock order is : hashchainlock, freelistlock, mplistlock */
1957 tryagain:
1958 xfs_qm_freelist_lock(xfs_Gqm);
1959 1956
1960 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; 1957 list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
1961 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && 1958 struct xfs_mount *mp = dqp->q_mount;
1962 nreclaimed < howmany); ) {
1963 xfs_dqlock(dqp); 1959 xfs_dqlock(dqp);
1964 1960
1965 /* 1961 /*
1966 * We are racing with dqlookup here. Naturally we don't 1962 * We are racing with dqlookup here. Naturally we don't
1967 * want to reclaim a dquot that lookup wants. 1963 * want to reclaim a dquot that lookup wants. We release the
1964 * freelist lock and start over, so that lookup will grab
1965 * both the dquot and the freelistlock.
1968 */ 1966 */
1969 if (dqp->dq_flags & XFS_DQ_WANT) { 1967 if (dqp->dq_flags & XFS_DQ_WANT) {
1968 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
1969
1970 trace_xfs_dqreclaim_want(dqp);
1971
1970 xfs_dqunlock(dqp); 1972 xfs_dqunlock(dqp);
1971 xfs_qm_freelist_unlock(xfs_Gqm); 1973 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
1972 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 1974 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
1973 return nreclaimed; 1975 return NULL;
1974 XQM_STATS_INC(xqmstats.xs_qm_dqwants); 1976 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
1975 goto tryagain; 1977 goto startagain;
1976 } 1978 }
1977 1979
1978 /* 1980 /*
@@ -1981,23 +1983,27 @@ xfs_qm_shake_freelist(
1981 * life easier. 1983 * life easier.
1982 */ 1984 */
1983 if (dqp->dq_flags & XFS_DQ_INACTIVE) { 1985 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
1984 ASSERT(dqp->q_mount == NULL); 1986 ASSERT(mp == NULL);
1985 ASSERT(! XFS_DQ_IS_DIRTY(dqp)); 1987 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
1986 ASSERT(dqp->HL_PREVP == NULL); 1988 ASSERT(list_empty(&dqp->q_hashlist));
1987 ASSERT(dqp->MPL_PREVP == NULL); 1989 ASSERT(list_empty(&dqp->q_mplist));
1990 list_del_init(&dqp->q_freelist);
1991 xfs_Gqm->qm_dqfrlist_cnt--;
1992 xfs_dqunlock(dqp);
1993 dqpout = dqp;
1988 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); 1994 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
1989 nextdqp = dqp->dq_flnext; 1995 break;
1990 goto off_freelist;
1991 } 1996 }
1992 1997
1993 ASSERT(dqp->MPL_PREVP); 1998 ASSERT(dqp->q_hash);
1999 ASSERT(!list_empty(&dqp->q_mplist));
2000
1994 /* 2001 /*
1995 * Try to grab the flush lock. If this dquot is in the process of 2002 * Try to grab the flush lock. If this dquot is in the process of
1996 * getting flushed to disk, we don't want to reclaim it. 2003 * getting flushed to disk, we don't want to reclaim it.
1997 */ 2004 */
1998 if (!xfs_dqflock_nowait(dqp)) { 2005 if (!xfs_dqflock_nowait(dqp)) {
1999 xfs_dqunlock(dqp); 2006 xfs_dqunlock(dqp);
2000 dqp = dqp->dq_flnext;
2001 continue; 2007 continue;
2002 } 2008 }
2003 2009
@@ -2010,21 +2016,21 @@ xfs_qm_shake_freelist(
2010 if (XFS_DQ_IS_DIRTY(dqp)) { 2016 if (XFS_DQ_IS_DIRTY(dqp)) {
2011 int error; 2017 int error;
2012 2018
2013 trace_xfs_dqshake_dirty(dqp); 2019 trace_xfs_dqreclaim_dirty(dqp);
2014 2020
2015 /* 2021 /*
2016 * We flush it delayed write, so don't bother 2022 * We flush it delayed write, so don't bother
2017 * releasing the mplock. 2023 * releasing the freelist lock.
2018 */ 2024 */
2019 error = xfs_qm_dqflush(dqp, 0); 2025 error = xfs_qm_dqflush(dqp, 0);
2020 if (error) { 2026 if (error) {
2021 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 2027 xfs_fs_cmn_err(CE_WARN, mp,
2022 "xfs_qm_dqflush_all: dquot %p flush failed", dqp); 2028 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2023 } 2029 }
2024 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2030 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2025 dqp = dqp->dq_flnext;
2026 continue; 2031 continue;
2027 } 2032 }
2033
2028 /* 2034 /*
2029 * We're trying to get the hashlock out of order. This races 2035 * We're trying to get the hashlock out of order. This races
2030 * with dqlookup; so, we give up and go to the next dquot if 2036 * with dqlookup; so, we give up and go to the next dquot if
@@ -2033,62 +2039,83 @@ xfs_qm_shake_freelist(
2033 * waiting for the freelist lock. 2039 * waiting for the freelist lock.
2034 */ 2040 */
2035 if (!mutex_trylock(&dqp->q_hash->qh_lock)) { 2041 if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
2036 xfs_dqfunlock(dqp); 2042 restarts++;
2037 xfs_dqunlock(dqp); 2043 goto dqfunlock;
2038 dqp = dqp->dq_flnext;
2039 continue;
2040 } 2044 }
2045
2041 /* 2046 /*
2042 * This races with dquot allocation code as well as dqflush_all 2047 * This races with dquot allocation code as well as dqflush_all
2043 * and reclaim code. So, if we failed to grab the mplist lock, 2048 * and reclaim code. So, if we failed to grab the mplist lock,
2044 * give up everything and start over. 2049 * give up everything and start over.
2045 */ 2050 */
2046 hash = dqp->q_hash; 2051 if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
2047 ASSERT(hash); 2052 restarts++;
2048 if (! xfs_qm_mplist_nowait(dqp->q_mount)) { 2053 mutex_unlock(&dqp->q_hash->qh_lock);
2049 /* XXX put a sentinel so that we can come back here */
2050 xfs_dqfunlock(dqp); 2054 xfs_dqfunlock(dqp);
2051 xfs_dqunlock(dqp); 2055 xfs_dqunlock(dqp);
2052 mutex_unlock(&hash->qh_lock); 2056 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2053 xfs_qm_freelist_unlock(xfs_Gqm); 2057 if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
2054 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) 2058 return NULL;
2055 return nreclaimed; 2059 goto startagain;
2056 goto tryagain;
2057 } 2060 }
2058 2061
2059 trace_xfs_dqshake_unlink(dqp);
2060
2061#ifdef QUOTADEBUG
2062 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2063 dqp, be32_to_cpu(dqp->q_core.d_id));
2064#endif
2065 ASSERT(dqp->q_nrefs == 0); 2062 ASSERT(dqp->q_nrefs == 0);
2066 nextdqp = dqp->dq_flnext; 2063 list_del_init(&dqp->q_mplist);
2067 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); 2064 mp->m_quotainfo->qi_dquots--;
2068 XQM_HASHLIST_REMOVE(hash, dqp); 2065 mp->m_quotainfo->qi_dqreclaims++;
2066 list_del_init(&dqp->q_hashlist);
2067 dqp->q_hash->qh_version++;
2068 list_del_init(&dqp->q_freelist);
2069 xfs_Gqm->qm_dqfrlist_cnt--;
2070 dqpout = dqp;
2071 mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
2072 mutex_unlock(&dqp->q_hash->qh_lock);
2073dqfunlock:
2069 xfs_dqfunlock(dqp); 2074 xfs_dqfunlock(dqp);
2070 xfs_qm_mplist_unlock(dqp->q_mount);
2071 mutex_unlock(&hash->qh_lock);
2072
2073 off_freelist:
2074 XQM_FREELIST_REMOVE(dqp);
2075 xfs_dqunlock(dqp); 2075 xfs_dqunlock(dqp);
2076 nreclaimed++; 2076 if (dqpout)
2077 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); 2077 break;
2078 if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2079 return NULL;
2080 }
2081 mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
2082 return dqpout;
2083}
2084
2085/*
2086 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2087 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2088 * favor the lookup function ...
2089 */
2090STATIC int
2091xfs_qm_shake_freelist(
2092 int howmany)
2093{
2094 int nreclaimed = 0;
2095 xfs_dquot_t *dqp;
2096
2097 if (howmany <= 0)
2098 return 0;
2099
2100 while (nreclaimed < howmany) {
2101 dqp = xfs_qm_dqreclaim_one();
2102 if (!dqp)
2103 return nreclaimed;
2078 xfs_qm_dqdestroy(dqp); 2104 xfs_qm_dqdestroy(dqp);
2079 dqp = nextdqp; 2105 nreclaimed++;
2080 } 2106 }
2081 xfs_qm_freelist_unlock(xfs_Gqm);
2082 return nreclaimed; 2107 return nreclaimed;
2083} 2108}
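
After the restructuring, xfs_qm_shake_freelist() reduces to the loop above: repeatedly pop one reclaimable dquot via xfs_qm_dqreclaim_one() and destroy it, rather than duplicating the whole freelist walk. The division of labor in outline (stub types, user-space stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct dquot { int id; };

static int remaining = 3;       /* stand-in for the freelist */

/* Pop one reclaimable entry, or NULL when nothing can be reclaimed. */
static struct dquot *dqreclaim_one(void)
{
        struct dquot *dq;

        if (remaining == 0)
                return NULL;
        dq = malloc(sizeof(*dq));
        if (!dq)
                return NULL;
        dq->id = remaining--;
        return dq;
}

static int shake_freelist(int howmany)
{
        int nreclaimed = 0;

        if (howmany <= 0)
                return 0;
        while (nreclaimed < howmany) {
                struct dquot *dq = dqreclaim_one();

                if (!dq)
                        break;
                free(dq);       /* plays the role of xfs_qm_dqdestroy() */
                nreclaimed++;
        }
        return nreclaimed;
}

int main(void)
{
        printf("reclaimed %d\n", shake_freelist(8));
        return 0;
}
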
2084 2109
2085
2086/* 2110/*
2087 * The kmem_shake interface is invoked when memory is running low. 2111 * The kmem_shake interface is invoked when memory is running low.
2088 */ 2112 */
2089/* ARGSUSED */ 2113/* ARGSUSED */
2090STATIC int 2114STATIC int
2091xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) 2115xfs_qm_shake(
2116 struct shrinker *shrink,
2117 int nr_to_scan,
2118 gfp_t gfp_mask)
2092{ 2119{
2093 int ndqused, nfree, n; 2120 int ndqused, nfree, n;
2094 2121
@@ -2097,7 +2124,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2097 if (!xfs_Gqm) 2124 if (!xfs_Gqm)
2098 return 0; 2125 return 0;
2099 2126
2100 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ 2127 nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
2101 /* incore dquots in all f/s's */ 2128 /* incore dquots in all f/s's */
2102 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; 2129 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2103 2130
@@ -2113,131 +2140,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
2113} 2140}
2114 2141
2115 2142
2116/*
2117 * Just pop the least recently used dquot off the freelist and
2118 * recycle it. The returned dquot is locked.
2119 */
2120STATIC xfs_dquot_t *
2121xfs_qm_dqreclaim_one(void)
2122{
2123 xfs_dquot_t *dqpout;
2124 xfs_dquot_t *dqp;
2125 int restarts;
2126 int nflushes;
2127
2128 restarts = 0;
2129 dqpout = NULL;
2130 nflushes = 0;
2131
2132 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2133 startagain:
2134 xfs_qm_freelist_lock(xfs_Gqm);
2135
2136 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2137 xfs_dqlock(dqp);
2138
2139 /*
2140 * We are racing with dqlookup here. Naturally we don't
2141 * want to reclaim a dquot that lookup wants. We release the
2142 * freelist lock and start over, so that lookup will grab
2143 * both the dquot and the freelistlock.
2144 */
2145 if (dqp->dq_flags & XFS_DQ_WANT) {
2146 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2147
2148 trace_xfs_dqreclaim_want(dqp);
2149
2150 xfs_dqunlock(dqp);
2151 xfs_qm_freelist_unlock(xfs_Gqm);
2152 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2153 return NULL;
2154 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2155 goto startagain;
2156 }
2157
2158 /*
2159 * If the dquot is inactive, we are assured that it is
2160 * not on the mplist or the hashlist, and that makes our
2161 * life easier.
2162 */
2163 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2164 ASSERT(dqp->q_mount == NULL);
2165 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2166 ASSERT(dqp->HL_PREVP == NULL);
2167 ASSERT(dqp->MPL_PREVP == NULL);
2168 XQM_FREELIST_REMOVE(dqp);
2169 xfs_dqunlock(dqp);
2170 dqpout = dqp;
2171 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2172 break;
2173 }
2174
2175 ASSERT(dqp->q_hash);
2176 ASSERT(dqp->MPL_PREVP);
2177
2178 /*
2179 * Try to grab the flush lock. If this dquot is in the process of
2180 * getting flushed to disk, we don't want to reclaim it.
2181 */
2182 if (!xfs_dqflock_nowait(dqp)) {
2183 xfs_dqunlock(dqp);
2184 continue;
2185 }
2186
2187 /*
2188 * We have the flush lock so we know that this is not in the
2189 * process of being flushed. So, if this is dirty, flush it
2190 * DELWRI so that we don't get a freelist infested with
2191 * dirty dquots.
2192 */
2193 if (XFS_DQ_IS_DIRTY(dqp)) {
2194 int error;
2195
2196 trace_xfs_dqreclaim_dirty(dqp);
2197
2198 /*
2199 * We flush it delayed write, so don't bother
2200 * releasing the freelist lock.
2201 */
2202 error = xfs_qm_dqflush(dqp, 0);
2203 if (error) {
2204 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2205 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2206 }
2207 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2208 continue;
2209 }
2210
2211 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2212 xfs_dqfunlock(dqp);
2213 xfs_dqunlock(dqp);
2214 continue;
2215 }
2216
2217 if (!mutex_trylock(&dqp->q_hash->qh_lock))
2218 goto mplistunlock;
2219
2220 trace_xfs_dqreclaim_unlink(dqp);
2221
2222 ASSERT(dqp->q_nrefs == 0);
2223 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2224 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2225 XQM_FREELIST_REMOVE(dqp);
2226 dqpout = dqp;
2227 mutex_unlock(&dqp->q_hash->qh_lock);
2228 mplistunlock:
2229 xfs_qm_mplist_unlock(dqp->q_mount);
2230 xfs_dqfunlock(dqp);
2231 xfs_dqunlock(dqp);
2232 if (dqpout)
2233 break;
2234 }
2235
2236 xfs_qm_freelist_unlock(xfs_Gqm);
2237 return dqpout;
2238}
2239
2240
2241/*------------------------------------------------------------------*/ 2143/*------------------------------------------------------------------*/
2242 2144
2243/* 2145/*
@@ -2662,66 +2564,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2564 }
2663} 2565}
2664 2566
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 97b410c12794..bea02d786c5d 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..8671a0b32644 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_bmap.h" 32#include "xfs_bmap.h"
41#include "xfs_btree.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_qm.h" 37#include "xfs_qm.h"
@@ -55,7 +45,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 45 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 46 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 47 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 48 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 49 return 0;
60} 50}
61 51
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 50bee07d6b0e..45e5849df238 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,25 +26,15 @@
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_sb.h" 27#include "xfs_sb.h"
28#include "xfs_ag.h" 28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_alloc.h" 29#include "xfs_alloc.h"
31#include "xfs_dmapi.h"
32#include "xfs_quota.h" 30#include "xfs_quota.h"
33#include "xfs_mount.h" 31#include "xfs_mount.h"
34#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
35#include "xfs_alloc_btree.h"
36#include "xfs_ialloc_btree.h"
37#include "xfs_dir2_sf.h"
38#include "xfs_attr_sf.h"
39#include "xfs_dinode.h"
40#include "xfs_inode.h" 33#include "xfs_inode.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 34#include "xfs_itable.h"
43#include "xfs_bmap.h" 35#include "xfs_bmap.h"
44#include "xfs_btree.h"
45#include "xfs_rtalloc.h" 36#include "xfs_rtalloc.h"
46#include "xfs_error.h" 37#include "xfs_error.h"
47#include "xfs_rw.h"
48#include "xfs_attr.h" 38#include "xfs_attr.h"
49#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
50#include "xfs_utils.h" 40#include "xfs_utils.h"
@@ -79,6 +69,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 69 xfs_mount_t *mp,
80 uint flags) 70 uint flags)
81{ 71{
72 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 73 uint dqtype;
83 int error; 74 int error;
84 uint inactivate_flags; 75 uint inactivate_flags;
@@ -102,11 +93,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 93 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 94 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 95 */
105 ASSERT(mp->m_quotainfo); 96 ASSERT(q);
106 if (mp->m_quotainfo) 97 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 98
111 /* 99 /*
112 * If we're just turning off quota enforcement, change mp and go. 100 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +105,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 105 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 106 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 107 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 108 mutex_unlock(&q->qi_quotaofflock);
121 109
122 /* XXX what to do if error? Revert back to old vals incore? */ 110 /* XXX what to do if error? Revert back to old vals incore? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 111 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +138,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 138 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 139 * turning off quota enforcement.
152 */ 140 */
153 if ((mp->m_qflags & flags) == 0) { 141 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 142 goto out_unlock;
155 return (0);
156 }
157 143
158 /* 144 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 145 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +148,7 @@ xfs_qm_scall_quotaoff(
162 */ 148 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 149 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 150 if (error)
165 goto out_error; 151 goto out_unlock;
166 152
167 /* 153 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 154 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +190,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 190 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 191 * we can't get rid of the incore data structures.
206 */ 192 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 193 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 194 delay(10 * nculprits);
209 195
210 /* 196 /*
@@ -222,7 +208,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 208 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 209 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 210 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 211 goto out_unlock;
226 } 212 }
227 213
228 /* 214 /*
@@ -230,27 +216,74 @@ xfs_qm_scall_quotaoff(
230 */ 216 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 217 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 218 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 219 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 220 xfs_qm_destroy_quotainfo(mp);
235 return (0); 221 return (0);
236 } 222 }
237 223
238 /* 224 /*
239 * Release our quotainode references, and vn_purge them, 225 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 226 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 227 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 228 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 229 q->qi_uquotaip = NULL;
245 } 230 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 231 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 232 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 233 q->qi_gquotaip = NULL;
249 } 234 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 235
253 return (error); 236out_unlock:
237 mutex_unlock(&q->qi_quotaofflock);
238 return error;
239}
240
241STATIC int
242xfs_qm_scall_trunc_qfile(
243 struct xfs_mount *mp,
244 xfs_ino_t ino)
245{
246 struct xfs_inode *ip;
247 struct xfs_trans *tp;
248 int error;
249
250 if (ino == NULLFSINO)
251 return 0;
252
253 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
254 if (error)
255 return error;
256
257 xfs_ilock(ip, XFS_IOLOCK_EXCL);
258
259 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
260 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
261 XFS_TRANS_PERM_LOG_RES,
262 XFS_ITRUNCATE_LOG_COUNT);
263 if (error) {
264 xfs_trans_cancel(tp, 0);
265 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
266 goto out_put;
267 }
268
269 xfs_ilock(ip, XFS_ILOCK_EXCL);
270 xfs_trans_ijoin(tp, ip);
271
272 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
273 if (error) {
274 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
275 XFS_TRANS_ABORT);
276 goto out_unlock;
277 }
278
279 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
281
282out_unlock:
283 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
284out_put:
285 IRELE(ip);
286 return error;
254} 287}
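
xfs_qm_scall_trunc_qfile() is new in this patch, and its tail is the canonical kernel unwind: each failure point jumps to the label that releases exactly what is still held, in reverse acquisition order. The same shape in isolation (hypothetical resources in place of the inode and transaction):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int do_truncate(void *ip, void *tp) { (void)ip; (void)tp; return 0; }

static int trunc_qfile(int fail_at)
{
        void *ip, *tp;
        int error = 0;

        ip = malloc(16);                        /* plays xfs_iget() */
        if (!ip)
                return ENOMEM;

        tp = fail_at == 1 ? NULL : malloc(16);  /* plays trans alloc+reserve */
        if (!tp) {
                error = ENOMEM;
                goto out_put;                   /* only the inode to release */
        }

        error = do_truncate(ip, tp);
        if (error)
                goto out_unlock;                /* transaction, then inode */

        /* commit... then fall through the full cleanup */
out_unlock:
        free(tp);
out_put:
        free(ip);
        return error;
}

int main(void)
{
        printf("ok path: %d, alloc-fail path: %d\n",
               trunc_qfile(0), trunc_qfile(1));
        return 0;
}
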
255 288
256int 289int
@@ -259,34 +292,20 @@ xfs_qm_scall_trunc_qfiles(
259 uint flags) 292 uint flags)
260{ 293{
261 int error = 0, error2 = 0; 294 int error = 0, error2 = 0;
262 xfs_inode_t *qip;
263 295
264 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
265 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
266 return XFS_ERROR(EINVAL); 298 return XFS_ERROR(EINVAL);
267 } 299 }
268 300
269 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { 301 if (flags & XFS_DQ_USER)
270 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); 302 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
271 if (!error) { 303 if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
272 error = xfs_truncate_file(mp, qip); 304 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
273 IRELE(qip);
274 }
275 }
276
277 if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
278 mp->m_sb.sb_gquotino != NULLFSINO) {
279 error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
280 if (!error2) {
281 error2 = xfs_truncate_file(mp, qip);
282 IRELE(qip);
283 }
284 }
285 305
286 return error ? error : error2; 306 return error ? error : error2;
287} 307}
288 308
289
290/* 309/*
291 * Switch on (a given) quota enforcement for a filesystem. This takes 310 * Switch on (a given) quota enforcement for a filesystem. This takes
292 * effect immediately. 311 * effect immediately.
@@ -379,9 +398,9 @@ xfs_qm_scall_quotaon(
379 /* 398 /*
380 * Switch on quota enforcement in core. 399 * Switch on quota enforcement in core.
381 */ 400 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 401 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 402 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 403 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 404
386 return (0); 405 return (0);
387} 406}
@@ -392,11 +411,12 @@ xfs_qm_scall_quotaon(
392 */ 411 */
393int 412int
394xfs_qm_scall_getqstat( 413xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 414 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 415 struct fs_quota_stat *out)
397{ 416{
398 xfs_inode_t *uip, *gip; 417 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 418 struct xfs_inode *uip, *gip;
419 boolean_t tempuqip, tempgqip;
400 420
401 uip = gip = NULL; 421 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 422 tempuqip = tempgqip = B_FALSE;
@@ -415,18 +435,18 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 435 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 436 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 437
418 if (mp->m_quotainfo) { 438 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 439 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 440 gip = q->qi_gquotaip;
421 } 441 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 442 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 443 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
424 0, 0, &uip, 0) == 0) 444 0, 0, &uip) == 0)
425 tempuqip = B_TRUE; 445 tempuqip = B_TRUE;
426 } 446 }
427 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { 447 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
428 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 448 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
429 0, 0, &gip, 0) == 0) 449 0, 0, &gip) == 0)
430 tempgqip = B_TRUE; 450 tempgqip = B_TRUE;
431 } 451 }
432 if (uip) { 452 if (uip) {
@@ -441,17 +461,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 461 if (tempgqip)
442 IRELE(gip); 462 IRELE(gip);
443 } 463 }
444 if (mp->m_quotainfo) { 464 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 465 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 466 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 467 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 468 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 469 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 470 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 471 }
452 return (0); 472 return 0;
453} 473}
454 474
475#define XFS_DQ_MASK \
476 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
477
455/* 478/*
456 * Adjust quota limits, and start/stop timers accordingly. 479 * Adjust quota limits, and start/stop timers accordingly.
457 */ 480 */
@@ -462,15 +485,17 @@ xfs_qm_scall_setqlim(
462 uint type, 485 uint type,
463 fs_disk_quota_t *newlim) 486 fs_disk_quota_t *newlim)
464{ 487{
488 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 489 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 490 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 491 xfs_trans_t *tp;
468 int error; 492 int error;
469 xfs_qcnt_t hard, soft; 493 xfs_qcnt_t hard, soft;
470 494
471 if ((newlim->d_fieldmask & 495 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 496 return EINVAL;
473 return (0); 497 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
498 return 0;
474 499
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 500 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 501 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
@@ -485,7 +510,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 510 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 511 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 512 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 513 mutex_lock(&q->qi_quotaofflock);
489 514
490 /* 515 /*
491 * Get the dquot (locked), and join it to the transaction. 516 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +518,8 @@ xfs_qm_scall_setqlim(
493 */ 518 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 519 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 520 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 521 ASSERT(error != ENOENT);
498 return (error); 522 goto out_unlock;
499 } 523 }
500 xfs_trans_dqjoin(tp, dqp); 524 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 525 ddq = &dqp->q_core;
@@ -513,8 +537,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 537 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 538 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 539 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 540 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 541 q->qi_bsoftlimit = soft;
518 } 542 }
519 } else { 543 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +553,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 553 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 554 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 555 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 556 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 557 q->qi_rtbsoftlimit = soft;
534 } 558 }
535 } else { 559 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +570,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 570 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 571 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 572 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 573 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 574 q->qi_isoftlimit = soft;
551 } 575 }
552 } else { 576 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +596,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 596 * for warnings.
573 */ 597 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 598 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 599 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 600 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 601 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 602 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 603 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 604 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 605 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 606 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 607 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 608 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 609 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 610 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 611 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 612 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 613 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 614 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 615 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 616 } else {
593 /* 617 /*
594 * If the user is now over quota, start the timelimit. 618 * If the user is now over quota, start the timelimit.
@@ -605,8 +629,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 629 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 630 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 631 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 632
633 out_unlock:
634 mutex_unlock(&q->qi_quotaofflock);
610 return error; 635 return error;
611} 636}
612 637
@@ -785,9 +810,9 @@ xfs_qm_export_dquot(
785 } 810 }
786 811
787#ifdef DEBUG 812#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || 813 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 814 (XFS_IS_OQUOTA_ENFORCED(mp) &&
790 (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && 815 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
791 dst->d_id != 0) { 816 dst->d_id != 0) {
792 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && 817 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 818 (dst->d_blk_softlimit > 0)) {
@@ -808,17 +833,17 @@ xfs_qm_export_qtype_flags(
808 /* 833 /*
809 * Can't be more than one, or none. 834 * Can't be more than one, or none.
810 */ 835 */
811 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != 836 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
812 (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); 837 (FS_PROJ_QUOTA | FS_USER_QUOTA));
813 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != 838 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
814 (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); 839 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
815 ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != 840 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
816 (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); 841 (FS_USER_QUOTA | FS_GROUP_QUOTA));
817 ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); 842 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
818 843
819 return (flags & XFS_DQ_USER) ? 844 return (flags & XFS_DQ_USER) ?
820 XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? 845 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
821 XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; 846 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
822} 847}
823 848
824STATIC uint 849STATIC uint
@@ -829,16 +854,16 @@ xfs_qm_export_flags(
829 854
830 uflags = 0; 855 uflags = 0;
831 if (flags & XFS_UQUOTA_ACCT) 856 if (flags & XFS_UQUOTA_ACCT)
832 uflags |= XFS_QUOTA_UDQ_ACCT; 857 uflags |= FS_QUOTA_UDQ_ACCT;
833 if (flags & XFS_PQUOTA_ACCT) 858 if (flags & XFS_PQUOTA_ACCT)
834 uflags |= XFS_QUOTA_PDQ_ACCT; 859 uflags |= FS_QUOTA_PDQ_ACCT;
835 if (flags & XFS_GQUOTA_ACCT) 860 if (flags & XFS_GQUOTA_ACCT)
836 uflags |= XFS_QUOTA_GDQ_ACCT; 861 uflags |= FS_QUOTA_GDQ_ACCT;
837 if (flags & XFS_UQUOTA_ENFD) 862 if (flags & XFS_UQUOTA_ENFD)
838 uflags |= XFS_QUOTA_UDQ_ENFD; 863 uflags |= FS_QUOTA_UDQ_ENFD;
839 if (flags & (XFS_OQUOTA_ENFD)) { 864 if (flags & (XFS_OQUOTA_ENFD)) {
840 uflags |= (flags & XFS_GQUOTA_ACCT) ? 865 uflags |= (flags & XFS_GQUOTA_ACCT) ?
841 XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; 866 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
842 } 867 }
843 return (uflags); 868 return (uflags);
844} 869}
@@ -853,7 +878,8 @@ xfs_dqrele_inode(
853 int error; 878 int error;
854 879
855 /* skip quota inodes */ 880 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 881 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
882 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 883 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 884 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 885 read_unlock(&pag->pag_ici_lock);
@@ -873,8 +899,9 @@ xfs_dqrele_inode(
873 xfs_qm_dqrele(ip->i_gdquot); 899 xfs_qm_dqrele(ip->i_gdquot);
874 ip->i_gdquot = NULL; 900 ip->i_gdquot = NULL;
875 } 901 }
876 xfs_iput(ip, XFS_ILOCK_EXCL); 902 xfs_iunlock(ip, XFS_ILOCK_EXCL);
877 903
904 IRELE(ip);
878 return 0; 905 return 0;
879} 906}
880 907
@@ -931,7 +958,8 @@ struct mutex qcheck_lock;
931} 958}
932 959
933typedef struct dqtest { 960typedef struct dqtest {
934 xfs_dqmarker_t q_lists; 961 uint dq_flags; /* various flags (XFS_DQ_*) */
962 struct list_head q_hashlist;
935 xfs_dqhash_t *q_hash; /* the hashchain header */ 963 xfs_dqhash_t *q_hash; /* the hashchain header */
936 xfs_mount_t *q_mount; /* filesystem this relates to */ 964 xfs_mount_t *q_mount; /* filesystem this relates to */
937 xfs_dqid_t d_id; /* user id or group id */ 965 xfs_dqid_t d_id; /* user id or group id */
@@ -942,14 +970,9 @@ typedef struct dqtest {
942STATIC void 970STATIC void
943xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 971xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
944{ 972{
945 xfs_dquot_t *d; 973 list_add(&dqp->q_hashlist, &h->qh_list);
946 if (((d) = (h)->qh_next)) 974 h->qh_version++;
947 (d)->HL_PREVP = &((dqp)->HL_NEXT); 975 h->qh_nelems++;
948 (dqp)->HL_NEXT = d;
949 (dqp)->HL_PREVP = &((h)->qh_next);
950 (h)->qh_next = (xfs_dquot_t *)dqp;
951 (h)->qh_version++;
952 (h)->qh_nelems++;
953} 976}
954STATIC void 977STATIC void
955xfs_qm_dqtest_print( 978xfs_qm_dqtest_print(
@@ -1061,9 +1084,7 @@ xfs_qm_internalqcheck_dqget(
1061 xfs_dqhash_t *h; 1084 xfs_dqhash_t *h;
1062 1085
1063 h = DQTEST_HASH(mp, id, type); 1086 h = DQTEST_HASH(mp, id, type);
1064 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1087 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1065 d = (xfs_dqtest_t *) d->HL_NEXT) {
1066 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1067 if (d->d_id == id && mp == d->q_mount) { 1088 if (d->d_id == id && mp == d->q_mount) {
1068 *O_dq = d; 1089 *O_dq = d;
1069 return (0); 1090 return (0);
@@ -1074,6 +1095,7 @@ xfs_qm_internalqcheck_dqget(
1074 d->d_id = id; 1095 d->d_id = id;
1075 d->q_mount = mp; 1096 d->q_mount = mp;
1076 d->q_hash = h; 1097 d->q_hash = h;
1098 INIT_LIST_HEAD(&d->q_hashlist);
1077 xfs_qm_hashinsert(h, d); 1099 xfs_qm_hashinsert(h, d);
1078 *O_dq = d; 1100 *O_dq = d;
1079 return (0); 1101 return (0);
@@ -1112,10 +1134,7 @@ xfs_qm_internalqcheck_adjust(
1112 xfs_ino_t ino, /* inode number to get data for */ 1134 xfs_ino_t ino, /* inode number to get data for */
1113 void __user *buffer, /* not used */ 1135 void __user *buffer, /* not used */
1114 int ubsize, /* not used */ 1136 int ubsize, /* not used */
1115 void *private_data, /* not used */
1116 xfs_daddr_t bno, /* starting block of inode cluster */
1117 int *ubused, /* not used */ 1137 int *ubused, /* not used */
1118 void *dip, /* not used */
1119 int *res) /* bulkstat result code */ 1138 int *res) /* bulkstat result code */
1120{ 1139{
1121 xfs_inode_t *ip; 1140 xfs_inode_t *ip;
@@ -1137,7 +1156,7 @@ xfs_qm_internalqcheck_adjust(
1137 ipreleased = B_FALSE; 1156 ipreleased = B_FALSE;
1138 again: 1157 again:
1139 lock_flags = XFS_ILOCK_SHARED; 1158 lock_flags = XFS_ILOCK_SHARED;
1140 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { 1159 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) {
1141 *res = BULKSTAT_RV_NOTHING; 1160 *res = BULKSTAT_RV_NOTHING;
1142 return (error); 1161 return (error);
1143 } 1162 }
@@ -1149,7 +1168,8 @@ xfs_qm_internalqcheck_adjust(
1149 * of those now. 1168 * of those now.
1150 */ 1169 */
1151 if (! ipreleased) { 1170 if (! ipreleased) {
1152 xfs_iput(ip, lock_flags); 1171 xfs_iunlock(ip, lock_flags);
1172 IRELE(ip);
1153 ipreleased = B_TRUE; 1173 ipreleased = B_TRUE;
1154 goto again; 1174 goto again;
1155 } 1175 }
@@ -1166,7 +1186,8 @@ xfs_qm_internalqcheck_adjust(
1166 ASSERT(gd); 1186 ASSERT(gd);
1167 xfs_qm_internalqcheck_dqadjust(ip, gd); 1187 xfs_qm_internalqcheck_dqadjust(ip, gd);
1168 } 1188 }
1169 xfs_iput(ip, lock_flags); 1189 xfs_iunlock(ip, lock_flags);
1190 IRELE(ip);
1170 *res = BULKSTAT_RV_DIDONE; 1191 *res = BULKSTAT_RV_DIDONE;
1171 return (0); 1192 return (0);
1172} 1193}
@@ -1180,8 +1201,6 @@ xfs_qm_internalqcheck(
1180 xfs_ino_t lastino; 1201 xfs_ino_t lastino;
1181 int done, count; 1202 int done, count;
1182 int i; 1203 int i;
1183 xfs_dqtest_t *d, *e;
1184 xfs_dqhash_t *h1;
1185 int error; 1204 int error;
1186 1205
1187 lastino = 0; 1206 lastino = 0;
@@ -1210,30 +1229,29 @@ xfs_qm_internalqcheck(
1210 * Iterate through all the inodes in the file system, 1229 * Iterate through all the inodes in the file system,
1211 * adjusting the corresponding dquot counters 1230 * adjusting the corresponding dquot counters
1212 */ 1231 */
1213 if ((error = xfs_bulkstat(mp, &lastino, &count, 1232 error = xfs_bulkstat(mp, &lastino, &count,
1214 xfs_qm_internalqcheck_adjust, NULL, 1233 xfs_qm_internalqcheck_adjust,
1215 0, NULL, BULKSTAT_FG_IGET, &done))) { 1234 0, NULL, &done);
1235 if (error) {
1236 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1216 break; 1237 break;
1217 } 1238 }
1218 } while (! done); 1239 } while (!done);
1219 if (error) { 1240
1220 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1221 }
1222 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1241 cmn_err(CE_DEBUG, "Checking results against system dquots");
1223 for (i = 0; i < qmtest_hashmask; i++) { 1242 for (i = 0; i < qmtest_hashmask; i++) {
1224 h1 = &qmtest_udqtab[i]; 1243 xfs_dqtest_t *d, *n;
1225 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1244 xfs_dqhash_t *h;
1245
1246 h = &qmtest_udqtab[i];
1247 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1226 xfs_dqtest_cmp(d); 1248 xfs_dqtest_cmp(d);
1227 e = (xfs_dqtest_t *) d->HL_NEXT;
1228 kmem_free(d); 1249 kmem_free(d);
1229 d = e;
1230 } 1250 }
1231 h1 = &qmtest_gdqtab[i]; 1251 h = &qmtest_gdqtab[i];
1232 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1252 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1233 xfs_dqtest_cmp(d); 1253 xfs_dqtest_cmp(d);
1234 e = (xfs_dqtest_t *) d->HL_NEXT;
1235 kmem_free(d); 1254 kmem_free(d);
1236 d = e;
1237 } 1255 }
1238 } 1256 }
1239 1257
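
Aside: the new xfs_qm_scall_trunc_qfile() above, like the reworked quotaoff path, follows the kernel's goto-unwind error-handling idiom: acquire resources in order, release them in reverse, with one label per partially-acquired state. A minimal sketch of the pattern, with hypothetical acquire_a()/acquire_b()/do_work() helpers standing in for the real calls (not from the patch):

int do_op(struct resource *r)
{
	int error;

	error = acquire_a(r);		/* cf. xfs_iget() */
	if (error)
		return error;		/* nothing to undo yet */

	error = acquire_b(r);		/* cf. xfs_trans_reserve() */
	if (error)
		goto out_release_a;	/* undo only what succeeded */

	error = do_work(r);		/* cf. xfs_itruncate_finish() */

	release_b(r);			/* reached on success or do_work() failure */
out_release_a:
	release_a(r);			/* cf. IRELE() */
	return error;
}
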
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
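
Aside: the hand-rolled HL_PREVP/HL_NEXT list macros deleted above are replaced throughout by the <linux/list.h> primitives, as the xfs_qm_syscalls.c hunks show. A minimal sketch of the same insert/drain operations on struct list_head (struct and helper names are illustrative, not from the patch):

#include <linux/list.h>
#include <linux/slab.h>

struct dq_node {
	struct list_head q_hashlist;	/* replaces HL_PREVP/HL_NEXT */
	int		 d_id;
};

static void hash_insert(struct list_head *bucket, struct dq_node *dqp)
{
	list_add(&dqp->q_hashlist, bucket);	/* O(1) head insert */
}

static void hash_drain(struct list_head *bucket)
{
	struct dq_node *d, *n;

	/* _safe variant: 'n' holds the next entry so 'd' may be freed */
	list_for_each_entry_safe(d, n, bucket, q_hashlist) {
		list_del(&d->q_hashlist);
		kfree(d);
	}
}
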
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..7de91d1b75c0 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,25 +23,15 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h" 30#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_itable.h" 31#include "xfs_itable.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 32#include "xfs_bmap.h"
42#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
43#include "xfs_error.h" 34#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h" 35#include "xfs_attr.h"
46#include "xfs_buf_item.h" 36#include "xfs_buf_item.h"
47#include "xfs_trans_priv.h" 37#include "xfs_trans_priv.h"
@@ -59,17 +49,14 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 49 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 50 xfs_dquot_t *dqp)
61{ 51{
62 xfs_dq_logitem_t *lp; 52 ASSERT(dqp->q_transp != tp);
63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 53 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 54 ASSERT(dqp->q_logitem.qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 55
69 /* 56 /*
70 * Get a log_item_desc to point at the new item. 57 * Get a log_item_desc to point at the new item.
71 */ 58 */
72 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); 59 xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
73 60
74 /* 61 /*
75 * Initialize i_transp so we can later determine if this dquot is 62 * Initialize i_transp so we can later determine if this dquot is
@@ -94,16 +81,11 @@ xfs_trans_log_dquot(
94 xfs_trans_t *tp, 81 xfs_trans_t *tp,
95 xfs_dquot_t *dqp) 82 xfs_dquot_t *dqp)
96{ 83{
97 xfs_log_item_desc_t *lidp; 84 ASSERT(dqp->q_transp == tp);
98
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 85 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 86
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
103 ASSERT(lidp != NULL);
104
105 tp->t_flags |= XFS_TRANS_DIRTY; 87 tp->t_flags |= XFS_TRANS_DIRTY;
106 lidp->lid_flags |= XFS_LID_DIRTY; 88 dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
107} 89}
108 90
109/* 91/*
@@ -198,16 +180,16 @@ xfs_trans_get_dqtrx(
198 int i; 180 int i;
199 xfs_dqtrx_t *qa; 181 xfs_dqtrx_t *qa;
200 182
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 183 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 184 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 185
186 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 187 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 188 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 189 return &qa[i];
207 }
208 } 190 }
209 191
210 return (NULL); 192 return NULL;
211} 193}
212 194
213/* 195/*
@@ -381,7 +363,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 363 break;
382 364
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 365 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 366 ASSERT(dqp->q_transp == tp);
385 367
386 /* 368 /*
387 * adjust the actual number of blocks used 369 * adjust the actual number of blocks used
@@ -639,7 +621,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 621 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 622 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 623 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 624 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 625 resbcountp = &dqp->q_res_bcount;
644 } else { 626 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 627 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +633,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 633 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 634 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 635 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 636 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 637 resbcountp = &dqp->q_res_rtbcount;
656 } 638 }
657 639
@@ -691,7 +673,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 673 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 674 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 675 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 676 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 677 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 678 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 679 hardlimit = q->qi_ihardlimit;
@@ -875,9 +857,8 @@ xfs_trans_get_qoff_item(
875 /* 857 /*
876 * Get a log_item_desc to point at the new item. 858 * Get a log_item_desc to point at the new item.
877 */ 859 */
878 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); 860 xfs_trans_add_item(tp, &q->qql_item);
879 861 return q;
880 return (q);
881} 862}
882 863
883 864
@@ -891,13 +872,8 @@ xfs_trans_log_quotaoff_item(
891 xfs_trans_t *tp, 872 xfs_trans_t *tp,
892 xfs_qoff_logitem_t *qlp) 873 xfs_qoff_logitem_t *qlp)
893{ 874{
894 xfs_log_item_desc_t *lidp;
895
896 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
897 ASSERT(lidp != NULL);
898
899 tp->t_flags |= XFS_TRANS_DIRTY; 875 tp->t_flags |= XFS_TRANS_DIRTY;
900 lidp->lid_flags |= XFS_LID_DIRTY; 876 qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
901} 877}
902 878
903STATIC void 879STATIC void
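
Aside: xfs_trans_log_dquot() and xfs_trans_log_quotaoff_item() above no longer call xfs_trans_find_item() because a log item now caches a pointer to its descriptor when it is joined to the transaction. A simplified sketch of that back-pointer pattern (illustrative types and names, not the real xfs_log_item definitions):

#define LID_DIRTY	0x1

struct item_desc {
	unsigned int	  lid_flags;
};

struct log_item {
	struct item_desc *li_desc;	/* set once, when joined to the trans */
};

static void trans_add_item(struct log_item *lip, struct item_desc *desc)
{
	lip->li_desc = desc;		/* remember instead of searching later */
}

static void trans_log_item(struct log_item *lip)
{
	/* O(1) via the cached pointer; no transaction list walk needed */
	lip->li_desc->lid_flags |= LID_DIRTY;
}
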
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 3f3610a7ee05..975aa10e1a47 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -22,7 +22,6 @@
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h" 25#include "xfs_mount.h"
27#include "xfs_error.h" 26#include "xfs_error.h"
28 27
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by an rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,17 +222,16 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
223 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
226 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 233#endif
228 int pagb_count; /* pagb slots in use */ 234 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 235} xfs_perag_t;
231 236
232/* 237/*
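
Aside: the new struct xfs_busy_extent is linked into two containers at once: the per-AG rbtree (for overlap searches keyed by start block) and the owning transaction's busy list (for cheap removal at commit time). rb_entry() and list_entry() are both container_of() wrappers, so either link recovers the same object. A minimal sketch with simplified field types:

#include <linux/rbtree.h>
#include <linux/list.h>

struct busy_extent {
	struct rb_node	 rb_node;	/* per-AG tree, keyed by start block */
	struct list_head list;		/* per-transaction busy list */
	unsigned int	 bno;
	unsigned int	 length;
};

static struct busy_extent *from_tree(struct rb_node *n)
{
	return rb_entry(n, struct busy_extent, rb_node);
}

static struct busy_extent *from_list(struct list_head *l)
{
	return list_entry(l, struct busy_extent, list);
}
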
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..af168faccc7a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,18 +24,13 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 34#include "xfs_alloc.h"
40#include "xfs_error.h" 35#include "xfs_error.h"
41#include "xfs_trace.h" 36#include "xfs_trace.h"
@@ -46,11 +41,9 @@
46#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
48 43
49STATIC void 44static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 45xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 46 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 47
55/* 48/*
56 * Prototypes for per-ag allocation routines 49 * Prototypes for per-ag allocation routines
@@ -540,9 +533,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 533 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 534 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 535 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 536 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 537 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 538 * transaction as synchronous if blocks are found. This
539 * avoids the need to block due to a synchronous log
540 * force to ensure correct ordering as the synchronous
541 * transaction will guarantee that for us.
542 */
543 if (xfs_alloc_busy_search(args->mp, args->agno,
544 args->agbno, args->len))
545 xfs_trans_set_sync(args->tp);
546 } 546 }
547 if (!args->isfl) 547 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 548 xfs_trans_mod_sb(args->tp,
@@ -683,8 +683,6 @@ xfs_alloc_ag_vextent_near(
683 xfs_agblock_t ltbno; /* start bno of left side entry */ 683 xfs_agblock_t ltbno; /* start bno of left side entry */
684 xfs_agblock_t ltbnoa; /* aligned ... */ 684 xfs_agblock_t ltbnoa; /* aligned ... */
685 xfs_extlen_t ltdiff; /* difference to left side entry */ 685 xfs_extlen_t ltdiff; /* difference to left side entry */
686 /*REFERENCED*/
687 xfs_agblock_t ltend; /* end bno of left side entry */
688 xfs_extlen_t ltlen; /* length of left side entry */ 686 xfs_extlen_t ltlen; /* length of left side entry */
689 xfs_extlen_t ltlena; /* aligned ... */ 687 xfs_extlen_t ltlena; /* aligned ... */
690 xfs_agblock_t ltnew; /* useful start bno of left side */ 688 xfs_agblock_t ltnew; /* useful start bno of left side */
@@ -809,8 +807,7 @@ xfs_alloc_ag_vextent_near(
809 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 807 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
810 goto error0; 808 goto error0;
811 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 809 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
812 ltend = ltbno + ltlen; 810 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
813 ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
814 args->len = blen; 811 args->len = blen;
815 if (!xfs_alloc_fix_minleft(args)) { 812 if (!xfs_alloc_fix_minleft(args)) {
816 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 813 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -823,7 +820,7 @@ xfs_alloc_ag_vextent_near(
823 */ 820 */
824 args->agbno = bnew; 821 args->agbno = bnew;
825 ASSERT(bnew >= ltbno); 822 ASSERT(bnew >= ltbno);
826 ASSERT(bnew + blen <= ltend); 823 ASSERT(bnew + blen <= ltbno + ltlen);
827 /* 824 /*
828 * Set up a cursor for the by-bno tree. 825 * Set up a cursor for the by-bno tree.
829 */ 826 */
@@ -1152,7 +1149,6 @@ xfs_alloc_ag_vextent_near(
1152 /* 1149 /*
1153 * Fix up the length and compute the useful address. 1150 * Fix up the length and compute the useful address.
1154 */ 1151 */
1155 ltend = ltbno + ltlen;
1156 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1152 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1157 xfs_alloc_fix_len(args); 1153 xfs_alloc_fix_len(args);
1158 if (!xfs_alloc_fix_minleft(args)) { 1154 if (!xfs_alloc_fix_minleft(args)) {
@@ -1165,7 +1161,7 @@ xfs_alloc_ag_vextent_near(
1165 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1161 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
1166 ltlen, &ltnew); 1162 ltlen, &ltnew);
1167 ASSERT(ltnew >= ltbno); 1163 ASSERT(ltnew >= ltbno);
1168 ASSERT(ltnew + rlen <= ltend); 1164 ASSERT(ltnew + rlen <= ltbno + ltlen);
1169 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1165 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1170 args->agbno = ltnew; 1166 args->agbno = ltnew;
1171 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1167 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1693,7 +1689,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1689 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1690 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1691 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1692 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1693 return 0;
1698 1694
1699 error0: 1695 error0:
@@ -1989,14 +1985,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1985 *bnop = bno;
1990 1986
1991 /* 1987 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1988 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1989 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1990 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 1991 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 1992 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 1993 *
1994 * We do this by setting the current transaction to a sync transaction
1995 * which guarantees that the freeing transaction is on disk before this
1996 * transaction. This is done instead of a synchronous log force here so
1997 * that we don't sit and wait with the AGF locked in the transaction
1998 * during the log force.
1998 */ 1999 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2000 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2001 xfs_trans_set_sync(tp);
2000 return 0; 2002 return 0;
2001} 2003}
2002 2004
@@ -2201,7 +2203,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2203 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2204 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2205 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2206 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2207 pag->pagf_init = 1;
2206 } 2208 }
2207#ifdef DEBUG 2209#ifdef DEBUG
@@ -2479,127 +2481,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2481 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2482 * before continuing to use the block.
2481 * 2483 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2484 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2485 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2486 * xfs_alloc_busy_search - search for a busy extent
2487 */
2488
2489/*
2490 * Insert a new extent into the busy tree.
2491 *
2492 * The busy extent tree is indexed by the start block of the busy extent.
2493 * there can be multiple overlapping ranges in the busy extent tree but only
2494 * ever one entry at a given start block. The reason for this is that
2495 * multi-block extents can be freed, then smaller chunks of that extent
2496 * allocated and freed again before the first transaction commit is on disk.
2497 * If the exact same start block is freed a second time, we have to wait for
2498 * that busy extent to pass out of the tree before the new extent is inserted.
2499 * There are two main cases we have to handle here.
2500 *
2501 * The first case is a transaction that triggers a "free - allocate - free"
2502 * cycle. This can occur during btree manipulations as a btree block is freed
2503 * to the freelist, then allocated from the free list, then freed again. In
2504 * this case, the second extent free is what triggers the duplicate and as
2505 * such the transaction IDs should match. Because the extent was allocated in
2506 * this transaction, the transaction must be marked as synchronous. This is
2507 * true for all cases where the free/alloc/free occurs in the one transaction,
2508 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2509 * This serves to catch violations of the second case quite effectively.
2510 *
2511 * The second case is where the free/alloc/free occur in different
2512 * transactions. In this case, the thread freeing the extent the second time
2513 * can't mark the extent busy immediately because it is already tracked in a
2514 * transaction that may be committing. When the log commit for the existing
2515 * busy extent completes, the busy extent will be removed from the tree. If we
2516 * allow the second busy insert to continue using that busy extent structure,
2517 * it can be freed before this transaction is safely in the log. Hence our
2518 * only option in this case is to force the log to remove the existing busy
2519 * extent from the list before we insert the new one with the current
2520 * transaction ID.
2521 *
2522 * The problem we are trying to avoid in the free-alloc-free in separate
2523 * transactions is most easily described with a timeline:
2524 *
2525 * Thread 1 Thread 2 Thread 3 xfslogd
2526 * xact alloc
2527 * free X
2528 * mark busy
2529 * commit xact
2530 * free xact
2531 * xact alloc
2532 * alloc X
2533 * busy search
2534 * mark xact sync
2535 * commit xact
2536 * free xact
2537 * force log
2538 * checkpoint starts
2539 * ....
2540 * xact alloc
2541 * free X
2542 * mark busy
2543 * finds match
2544 * *** KABOOM! ***
2545 * ....
2546 * log IO completes
2547 * unbusy X
2548 * checkpoint completes
2549 *
2550 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2551 * the checkpoint completes, and the busy extent it matched will have been
2552 * removed from the tree when it is woken. Hence it can then continue safely.
2553 *
2554 * However, to ensure this matching process is robust, we need to use the
2555 * transaction ID for identifying the transaction, as delayed logging results in
2556 * the busy extent and transaction lifecycles being different. i.e. the busy
2557 * extent is active for a lot longer than the transaction. Hence the
2558 * transaction structure can be freed and reallocated, then used to mark the same
2559 * extent busy again in the new transaction. In this case the new transaction
2560 * will have a different tid but can have the same address, and hence we need
2561 * to check against the tid.
2562 *
2563 * Future: for delayed logging, we could avoid the log force if the extent was
2564 * first freed in the current checkpoint sequence. This, however, requires the
2565 * ability to pin the current checkpoint in memory until this transaction
2566 * commits to ensure that both the original free and the current one combine
2567 * logically into the one checkpoint. If the checkpoint sequences are
2568 * different, however, we still need to wait on a log force.
2484 */ 2569 */
2485void 2570void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2571xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2572 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2573 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2574 xfs_agblock_t bno,
2575 xfs_extlen_t len)
2490{ 2576{
2491 xfs_perag_busy_t *bsy; 2577 struct xfs_busy_extent *new;
2578 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2579 struct xfs_perag *pag;
2493 int n; 2580 struct rb_node **rbp;
2581 struct rb_node *parent;
2582 int match;
2494 2583
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2584
2498 /* search pagb_list for an open slot */ 2585 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2586 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2587 /*
2501 bsy++, n++) { 2588 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2589 * block, make this a synchronous transaction to ensure that
2503 break; 2590 * the block is not reused before this transaction commits.
2504 } 2591 */
2592 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2593 xfs_trans_set_sync(tp);
2594 return;
2505 } 2595 }
2506 2596
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2597 new->agno = agno;
2598 new->bno = bno;
2599 new->length = len;
2600 new->tid = xfs_log_get_trans_ident(tp);
2508 2601
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2602 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2603
2511 pag->pagb_count++; 2604 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2605 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2606
2514 bsy->busy_tp = tp; 2607 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2608restart:
2516 } else { 2609 spin_lock(&pag->pagb_lock);
2610 rbp = &pag->pagb_tree.rb_node;
2611 parent = NULL;
2612 busyp = NULL;
2613 match = 0;
2614 while (*rbp && match >= 0) {
2615 parent = *rbp;
2616 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2617
2618 if (new->bno < busyp->bno) {
2619 /* may overlap, but exact start block is lower */
2620 rbp = &(*rbp)->rb_left;
2621 if (new->bno + new->length > busyp->bno)
2622 match = busyp->tid == new->tid ? 1 : -1;
2623 } else if (new->bno > busyp->bno) {
2624 /* may overlap, but exact start block is higher */
2625 rbp = &(*rbp)->rb_right;
2626 if (bno < busyp->bno + busyp->length)
2627 match = busyp->tid == new->tid ? 1 : -1;
2628 } else {
2629 match = busyp->tid == new->tid ? 1 : -1;
2630 break;
2631 }
2632 }
2633 if (match < 0) {
2634 /* overlap marked busy in different transaction */
2635 spin_unlock(&pag->pagb_lock);
2636 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2637 goto restart;
2638 }
2639 if (match > 0) {
2517 /* 2640 /*
2518 * The busy list is full! Since it is now not possible to 2641 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2642 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2643 * a single range.
2521 * transaction commits.
2522 */ 2644 */
2523 xfs_trans_set_sync(tp); 2645 if (busyp->bno == new->bno) {
2524 } 2646 busyp->length = max(busyp->length, new->length);
2647 spin_unlock(&pag->pagb_lock);
2648 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2649 xfs_perag_put(pag);
2650 kmem_free(new);
2651 return;
2652 }
2653 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2654 new->length = max(busyp->bno + busyp->length,
2655 new->bno + new->length) -
2656 min(busyp->bno, new->bno);
2657 new->bno = min(busyp->bno, new->bno);
2658 } else
2659 busyp = NULL;
2525 2660
2661 rb_link_node(&new->rb_node, parent, rbp);
2662 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2663
2664 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2665 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2666 xfs_perag_put(pag);
2667 kmem_free(busyp);
2528} 2668}
2529 2669
2530void 2670/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2671 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2672 * allocate. Note that xfs_alloc_busy_search() takes the busy extent tree
2533 int idx) 2673 * lock internally, so callers need not hold it. The function returns 0 for no
2674 * overlapping busy extent, -1 for an overlapping but not exact busy extent,
2675 * and 1 for an exact match. This is done so that a non-zero return indicates
2676 * an overlap that will require a synchronous transaction, but it can still be
2677 * used to distinguish between a partial and an exact match.
2678 */
2679static int
2680xfs_alloc_busy_search(
2681 struct xfs_mount *mp,
2682 xfs_agnumber_t agno,
2683 xfs_agblock_t bno,
2684 xfs_extlen_t len)
2534{ 2685{
2535 struct xfs_perag *pag; 2686 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2687 struct rb_node *rbp;
2688 struct xfs_busy_extent *busyp;
2689 int match = 0;
2537 2690
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2691 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2692 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2693
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2694 rbp = pag->pagb_tree.rb_node;
2544 2695
2545 if (list[idx].busy_tp == tp) { 2696 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2697 while (rbp) {
2547 pag->pagb_count--; 2698 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2699 if (bno < busyp->bno) {
2700 /* may overlap, but exact start block is lower */
2701 if (bno + len > busyp->bno)
2702 match = -1;
2703 rbp = rbp->rb_left;
2704 } else if (bno > busyp->bno) {
2705 /* may overlap, but exact start block is higher */
2706 if (bno < busyp->bno + busyp->length)
2707 match = -1;
2708 rbp = rbp->rb_right;
2709 } else {
2710 /* bno matches busyp, length determines exact match */
2711 match = (busyp->length == len) ? 1 : -1;
2712 break;
2713 }
2548 } 2714 }
2549
2550 spin_unlock(&pag->pagb_lock); 2715 spin_unlock(&pag->pagb_lock);
2716 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2717 xfs_perag_put(pag);
2718 return match;
2552} 2719}
2553 2720
2554 2721void
2555/* 2722xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2723 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2724 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2725{
2565 struct xfs_perag *pag; 2726 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2727
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2728 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2729 busyp->length);
2573 cnt = pag->pagb_count;
2574 2730
2575 /* 2731 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2732 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2733
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2734 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2735
2591 /* (start1,length1) within (start2, length2) */ 2736 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2737 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2738 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2739 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2740 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2741
2599 /* 2742 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2743}
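
Aside: the descent in xfs_alloc_busy_search() relies on the tree invariant that there is at most one busy extent per start block, so a plain binary walk can classify the range [bno, bno + len) against the tree. A condensed, self-contained version of the same logic (simplified types; struct busy_extent as in the sketch after the xfs_ag.h hunk):

/* struct busy_extent: { struct rb_node rb_node; unsigned int bno, length; } */
static int busy_search(struct rb_root *root, unsigned int bno,
		       unsigned int len)
{
	struct rb_node *rbp = root->rb_node;
	int match = 0;		/* 0: no overlap, -1: partial, 1: exact */

	while (rbp) {
		struct busy_extent *busyp =
			rb_entry(rbp, struct busy_extent, rb_node);

		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* start blocks match; length decides exactness */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	return match;
}

A non-zero result is what makes the callers above mark the transaction synchronous; the 1 vs -1 distinction only matters for the exact-match ASSERT in xfs_alloc_busy_clear().
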
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..895009a97271 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,20 +22,21 @@ struct xfs_buf;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
+struct xfs_busy_extent;
 
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-typedef enum xfs_alloctype
-{
-	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
-	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
-	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
-	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
-	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
-	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
-	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO	0x40	/* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
 
 #define XFS_ALLOC_TYPES \
 	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
 	xfs_agnumber_t agno,
 	xfs_agblock_t bno,
 	xfs_extlen_t len);
 
 void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-	xfs_agnumber_t ag,
-	int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
 #endif	/* __KERNEL__ */
 
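The enum is flattened into power-of-two defines because, per the new in-line comment, the tracing code cannot yet handle enum values; the event macros decode these values through name tables like XFS_ALLOC_TYPES. A small user-space sketch of that table-driven symbolic decoding follows; the table mirrors the header above, but alloctype_name() is a hypothetical helper, not a kernel function.

/* Sketch: decode an xfs_alloctype_t-style value through a name table,
 * the same shape the tracing code consumes via XFS_ALLOC_TYPES. */
#include <stddef.h>
#include <stdio.h>

#define XFS_ALLOCTYPE_ANY_AG	0x01
#define XFS_ALLOCTYPE_FIRST_AG	0x02
#define XFS_ALLOCTYPE_START_AG	0x04
#define XFS_ALLOCTYPE_THIS_AG	0x08
#define XFS_ALLOCTYPE_START_BNO	0x10
#define XFS_ALLOCTYPE_NEAR_BNO	0x20
#define XFS_ALLOCTYPE_THIS_BNO	0x40

static const struct { unsigned int value; const char *name; } alloc_types[] = {
	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" },
	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" },
	{ XFS_ALLOCTYPE_START_AG,	"START_AG" },
	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" },
	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" },
	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" },
	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" },
};

/* Hypothetical helper: return the symbolic name for one type value. */
static const char *alloctype_name(unsigned int type)
{
	size_t i;

	for (i = 0; i < sizeof(alloc_types) / sizeof(alloc_types[0]); i++)
		if (alloc_types[i].value == type)
			return alloc_types[i].name;
	return "?";
}

int main(void)
{
	printf("0x10 -> %s\n", alloctype_name(XFS_ALLOCTYPE_START_BNO));
	return 0;
}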
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..97f7328967fd 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,19 +24,14 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_btree_trace.h"
-#include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -134,7 +129,7 @@ xfs_allocbt_free_block(
 	 * disk. If a busy block is allocated, the iclog is pushed up to the
 	 * LSN that freed the block.
 	 */
-	xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b9c196a53c42..c2568242a901 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -25,19 +25,13 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
-#include "xfs_btree.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
@@ -325,8 +319,7 @@ xfs_attr_set_int(
 		return (error);
 	}
 
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * If the attribute list is non-existent or a shortform list,
@@ -396,10 +389,8 @@ xfs_attr_set_int(
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args.trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args.trans, dp);
 
 		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
@@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks not allocate in the common case.
 	 */
-	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(args.trans, dp);
+	xfs_trans_ijoin(args.trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	 * No need to make quota reservations here. We expect to release some
 	 * blocks, not allocate, in the common case.
 	 */
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
@@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the current trans (including the inode) and start
@@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_buf_done(bp);
 
@@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else
 		xfs_da_buf_done(bp);
 	return(0);
@@ -1317,10 +1300,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 
 			/*
 			 * Commit the node conversion and start the next
@@ -1356,10 +1337,8 @@ restart:
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 	} else {
 		/*
 		 * Addition succeeded, update Btree hashvals.
@@ -1470,10 +1449,8 @@ restart:
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		}
 
 	/*
@@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		/*
 		 * Commit the Btree join operation and start a new trans.
@@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 			 * and started a new one.  We need the inode to be
 			 * in all transactions.
 			 */
-			if (committed) {
-				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(args->trans, dp);
-			}
+			if (committed)
+				xfs_trans_ijoin(args->trans, dp);
 		} else
 			xfs_da_brelse(args->trans, bp);
 	}
@@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL, NULL);
+				  NULL, 0, map, &nmap, NULL);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
 				  XFS_BMAPI_WRITE,
 				  args->firstblock, args->total, &map, &nmap,
-				  args->flist, NULL);
+				  args->flist);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp);
 
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, 0, &map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error) {
 			return(error);
 		}
@@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				  args->firstblock, 0, &map, &nmap,
-				  args->flist, NULL);
+				  args->flist);
 		if (error) {
 			return(error);
 		}
@@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				    1, args->firstblock, args->flist,
-				    NULL, &done);
+				    &done);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		 * bmap_finish() may have committed the last trans and started
 		 * a new one.  We need the inode to be in all transactions.
 		 */
-		if (committed) {
-			xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(args->trans, args->dp);
-		}
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp);
 
 		/*
 		 * Close out trans and start the next one in the chain.
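Every hunk in this file makes the same substitution: the two-step xfs_trans_ijoin() + xfs_trans_ihold() after a transaction roll collapses into a single xfs_trans_ijoin() call that no longer takes lock flags. A compilable user-space sketch of the underlying "re-join the inode after the chain commits" pattern follows; the struct types and bmap_finish() here are stand-ins, not the kernel API.

/* Sketch: the "re-join the inode after a transaction roll" pattern.
 * A real xfs_trans_ijoin() attaches the locked inode to the
 * transaction's log item list; this just models the control flow. */
#include <stdbool.h>
#include <stdio.h>

struct inode { int ino; };
struct trans { struct inode *joined; };

/* One call now does what ijoin + ihold used to do together. */
static void trans_ijoin(struct trans *tp, struct inode *ip)
{
	tp->joined = ip;
}

/* Stand-in bmap_finish(): may commit and start a new transaction,
 * in which case the inode is no longer part of the new transaction. */
static int bmap_finish(struct trans *tp, bool *committed)
{
	*committed = true;	/* pretend the trans chain rolled */
	tp->joined = NULL;	/* new trans starts without the inode */
	return 0;
}

int main(void)
{
	struct inode ip = { 42 };
	struct trans tp = { &ip };
	bool committed = false;

	if (bmap_finish(&tp, &committed))
		return 1;
	/*
	 * The pattern from the hunks above: if the chain committed,
	 * re-join the inode so it is part of every transaction.
	 */
	if (committed)
		trans_ijoin(&tp, &ip);
	printf("inode %d joined: %s\n", ip.ino, tp.joined ? "yes" : "no");
	return 0;
}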
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index a90ce74fc256..a6cff8edcdb6 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,8 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
@@ -33,7 +31,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t *trans, xfs_inode_t *dp,
 		nmap = 1;
 		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmap, NULL, NULL);
+				  NULL, 0, &map, &nmap, NULL);
 		if (error) {
 			return(error);
 		}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -30,13 +30,10 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
@@ -104,7 +101,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork); /* data or attr fork */
 
@@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta); /* Change made to incore extents */
+	int			*logflagsp); /* inode logging flags */
 
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -200,7 +192,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -489,7 +480,6 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to use reserved data blocks */
 {
@@ -524,15 +514,6 @@ xfs_bmap_add_extent(
 			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else
 			logflags = 0;
-		/* DELTA: single new extent */
-		if (delta) {
-			if (delta->xed_startoff > new->br_startoff)
-				delta->xed_startoff = new->br_startoff;
-			if (delta->xed_blockcount <
-					new->br_startoff + new->br_blockcount)
-				delta->xed_blockcount = new->br_startoff +
-					new->br_blockcount;
-		}
 	}
 	/*
 	 * Any kind of new delayed allocation goes here.
@@ -542,7 +523,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
-				&logflags, delta, rsvd)))
+				&logflags, rsvd)))
 			goto done;
 	}
 	/*
@@ -553,7 +534,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, delta, whichfork)))
+				&logflags, whichfork)))
 			goto done;
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
@@ -578,17 +559,17 @@ xfs_bmap_add_extent(
 					XFS_BTCUR_BPRV_WASDEL);
 			if ((error = xfs_bmap_add_extent_delay_real(ip,
 				idx, &cur, new, &da_new, first, flist,
-				&logflags, delta, rsvd)))
+				&logflags, rsvd)))
 				goto done;
 		} else if (new->br_state == XFS_EXT_NORM) {
 			ASSERT(new->br_state == XFS_EXT_NORM);
 			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags, delta)))
+				ip, idx, &cur, new, &logflags)))
 				goto done;
 		} else {
 			ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
 			if ((error = xfs_bmap_add_extent_unwritten_real(
-				ip, idx, &cur, new, &logflags, delta)))
+				ip, idx, &cur, new, &logflags)))
 				goto done;
 		}
 		ASSERT(*curp == cur || *curp == NULL);
@@ -601,7 +582,7 @@ xfs_bmap_add_extent(
 		ASSERT((cur->bc_private.b.flags &
 			XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-			new, &logflags, delta, whichfork)))
+			new, &logflags, whichfork)))
 			goto done;
 	}
 	}
@@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to use reserved data block allocation */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		*dnew = 0;
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 		*dnew = temp;
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real(
 			nullstartblock((int)temp2));
 		trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
 		*dnew = temp + temp2;
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta) /* Change made to incore extents */
+	int			*logflagsp) /* inode logging flags */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	xfs_bmbt_rec_host_t	*ep;	/* extent entry for idx */
@@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real(
 				/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 #define	LEFT		r[0]
 #define	RIGHT		r[1]
@@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real(
 				RIGHT.br_blockcount, LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Three in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents are replaced by one. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The in-core extent described by new changed type. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = LEFT.br_startoff;
-		temp2 = LEFT.br_blockcount +
-			PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
-		/* DELTA: The boundary between two in-core extents moved. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount +
-			RIGHT.br_blockcount;
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in two. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: One in-core extent is split in three. */
-		temp = PREV.br_startoff;
-		temp2 = PREV.br_blockcount;
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
@@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real(
 		ASSERT(0);
 	}
 	*curp = cur;
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* extent record for idx */
@@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
@@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay(
 
 		xfs_iext_remove(ip, idx, 1, state);
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx - 1;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = left.br_startoff;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay(
 		trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
 
 		ip->i_df.if_lastex = idx;
-		/* DELTA: One in-core extent grew into a hole. */
-		temp2 = temp;
-		temp = new->br_startoff;
 		break;
 
 	case 0:
@@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay(
 		oldlen = newlen = 0;
 		xfs_iext_insert(ip, idx, 1, new, state);
 		ip->i_df.if_lastex = idx;
-		/* DELTA: A new in-core extent was added in a hole. */
-		temp2 = new->br_blockcount;
-		temp = new->br_startoff;
 		break;
 	}
 	if (oldlen != newlen) {
@@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay(
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 	*logflagsp = 0;
 	return 0;
 }
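All of the DELTA bookkeeping deleted above implemented one idea: widen a (startoff, end) window to cover every extent the call touched, then convert the end offset back into a block count before returning, exactly as the removed xfs_extdelta comment described. A stand-alone sketch of that range-union arithmetic follows; the field names mirror the removed struct, but the helper is illustrative only.

/* Sketch of what the removed DELTA code computed: the union of all
 * touched ranges, with xed_blockcount temporarily holding an end
 * offset rather than a length. */
#include <stdint.h>
#include <stdio.h>

#define NULLFILEOFF	UINT64_MAX

struct extdelta {
	uint64_t xed_startoff;		/* offset of range */
	uint64_t xed_blockcount;	/* end offset, fixed up to a length below */
};

static void delta_track(struct extdelta *delta, uint64_t startoff,
			uint64_t blockcount)
{
	if (delta->xed_startoff > startoff)
		delta->xed_startoff = startoff;
	if (delta->xed_blockcount < startoff + blockcount)
		delta->xed_blockcount = startoff + blockcount;
}

int main(void)
{
	struct extdelta delta = { NULLFILEOFF, 0 };

	delta_track(&delta, 100, 8);	/* one modified extent */
	delta_track(&delta, 96, 2);	/* another, overlapping to the left */

	if (delta.xed_startoff != NULLFILEOFF)	/* a change was made */
		delta.xed_blockcount -= delta.xed_startoff; /* end -> length */

	printf("delta: startoff=%llu blocks=%llu\n",	/* 96 and 12 */
	       (unsigned long long)delta.xed_startoff,
	       (unsigned long long)delta.xed_blockcount);
	return 0;
}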
@@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork) /* data or attr fork */
 {
 	xfs_bmbt_rec_host_t	*ep;	/* pointer to extent entry ins. point */
@@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real(
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
-	xfs_filblks_t		temp=0;
-	xfs_filblks_t		temp2=0;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
@@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real(
 				left.br_state)))
 				goto done;
 		}
-		/* DELTA: Two in-core extents were replaced by one. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case BMAP_LEFT_CONTIG:
@@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real(
 				left.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = left.br_startoff;
-		temp2 = left.br_blockcount +
-			new->br_blockcount;
 		break;
 
 	case BMAP_RIGHT_CONTIG:
@@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real(
 				right.br_state)))
 				goto done;
 		}
-		/* DELTA: One in-core extent grew. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount +
-			right.br_blockcount;
 		break;
 
 	case 0:
@@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
-		/* DELTA: A new extent was added in a hole. */
-		temp = new->br_startoff;
-		temp2 = new->br_blockcount;
 		break;
 	}
-	if (delta) {
-		temp2 += temp;
-		if (delta->xed_startoff > temp)
-			delta->xed_startoff = temp;
-		if (delta->xed_blockcount < temp2)
-			delta->xed_blockcount = temp2;
-	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -2959,7 +2809,6 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp,/* inode logging flags */
-	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
@@ -3265,14 +3114,6 @@ xfs_bmap_del_extent(
 	if (da_old > da_new)
 		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new),
 			rsvd);
-	if (delta) {
-		/* DELTA: report the original extent. */
-		if (delta->xed_startoff > got.br_startoff)
-			delta->xed_startoff = got.br_startoff;
-		if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
-			delta->xed_blockcount = got.br_startoff +
-				got.br_blockcount;
-	}
 done:
 	*logflagsp = flags;
 	return error;
@@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork(
 		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_DEV:
 		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -3829,7 +3671,7 @@ xfs_bmap_add_attrfork(
 	}
 	if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
 		goto error2;
-	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES);
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	ASSERT(ip->i_df.if_ext_max ==
 	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 	return error;
@@ -4483,8 +4325,7 @@ xfs_bmapi(
 	xfs_extlen_t	total,		/* total blocks needed */
 	xfs_bmbt_irec_t	*mval,		/* output: map values */
 	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t	*delta)		/* o: change made to incore extents */
+	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
 {
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
@@ -4596,10 +4437,7 @@ xfs_bmapi(
 	end = bno + len;
 	obno = bno;
 	bma.ip = NULL;
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	while (bno < end && n < *nmap) {
 		/*
 		 * Reading past eof, act as though there's a hole
@@ -4620,19 +4458,13 @@ xfs_bmapi(
 			 * allocate the stuff asked for in this bmap call
 			 * but that wouldn't be as good.
 			 */
-			if (wasdelay && !(flags & XFS_BMAPI_EXACT)) {
+			if (wasdelay) {
 				alen = (xfs_extlen_t)got.br_blockcount;
 				aoff = got.br_startoff;
 				if (lastx != NULLEXTNUM && lastx) {
 					ep = xfs_iext_get_ext(ifp, lastx - 1);
 					xfs_bmbt_get_all(ep, &prev);
 				}
-			} else if (wasdelay) {
-				alen = (xfs_extlen_t)
-					XFS_FILBLKS_MIN(len,
-						(got.br_startoff +
-						 got.br_blockcount) - bno);
-				aoff = bno;
 			} else {
 				alen = (xfs_extlen_t)
 					XFS_FILBLKS_MIN(len, MAXEXTLEN);
@@ -4831,7 +4663,7 @@ xfs_bmapi(
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -4927,7 +4759,7 @@ xfs_bmapi(
 			}
 			mval->br_state = XFS_EXT_NORM;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags, delta,
+				firstblock, flist, &tmp_logflags,
 				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
@@ -5017,14 +4849,6 @@ xfs_bmapi(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5136,8 +4960,6 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	xfs_extdelta_t		*delta,		/* o: change made to incore
-						   extents */
 	int			*done)		/* set if not done yet */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
@@ -5196,10 +5018,7 @@ xfs_bunmapi(
 	bno = start + len - 1;
 	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
 		&prev);
-	if (delta) {
-		delta->xed_startoff = NULLFILEOFF;
-		delta->xed_blockcount = 0;
-	}
+
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
@@ -5297,7 +5116,7 @@ xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-				firstblock, flist, &logflags, delta,
+				firstblock, flist, &logflags,
 				XFS_DATA_FORK, 0);
 			if (error)
 				goto error0;
@@ -5352,7 +5171,7 @@ xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
 					&prev, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5361,7 +5180,7 @@ xfs_bunmapi(
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					delta, XFS_DATA_FORK, 0);
+					XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5414,7 +5233,7 @@ xfs_bunmapi(
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-				&tmp_logflags, delta, whichfork, rsvd);
+				&tmp_logflags, whichfork, rsvd);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5471,14 +5290,6 @@ nodelete:
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
-	if (delta && delta->xed_startoff != NULLFILEOFF) {
-		/* A change was actually made.
-		 * Note that delta->xed_blockount is an offset at this
-		 * point and needs to be converted to a block count.
-		 */
-		ASSERT(delta->xed_blockcount > delta->xed_startoff);
-		delta->xed_blockcount -= delta->xed_startoff;
-	}
error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5605,28 +5416,6 @@ xfs_getbmap(
 		prealloced = 0;
 		fixlen = 1LL << 32;
 	} else {
-		/*
-		 * If the BMV_IF_NO_DMAPI_READ interface bit specified, do
-		 * not generate a DMAPI read event.  Otherwise, if the
-		 * DM_EVENT_READ bit is set for the file, generate a read
-		 * event in order that the DMAPI application may do its thing
-		 * before we return the extents.  Usually this means restoring
-		 * user file data to regions of the file that look like holes.
-		 *
-		 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
-		 * BMV_IF_NO_DMAPI_READ so that read events are generated.
-		 * If this were not true, callers of ioctl(XFS_IOC_GETBMAP)
-		 * could misinterpret holes in a DMAPI file as true holes,
-		 * when in fact they may represent offline user data.
-		 */
-		if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
-		    !(iflags & BMV_IF_NO_DMAPI_READ)) {
-			error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip,
-					      0, 0, 0, NULL);
-			if (error)
-				return XFS_ERROR(error);
-		}
-
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -5713,7 +5502,7 @@ xfs_getbmap(
 		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
 				  bmapi_flags, NULL, 0, map, &nmap,
-				  NULL, NULL);
+				  NULL);
 		if (error)
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
@@ -5744,12 +5533,24 @@ xfs_getbmap(
 					map[i].br_startblock))
 				goto out_free_map;
 
-			nexleft--;
 			bmv->bmv_offset =
 				out[cur_ext].bmv_offset +
 				out[cur_ext].bmv_length;
 			bmv->bmv_length =
 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
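The added BMV_IF_NO_HOLES branch works by not advancing cur_ext when the mapping is a hole, so the next extent simply overwrites the zeroed slot. A small stand-alone sketch of that slot-reuse loop follows; the mapping array and the hole flag are stand-ins for the bmv machinery.

/* Sketch: the "don't advance the output slot for holes" loop shape
 * used by the BMV_IF_NO_HOLES hunk above. The input array is made up. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct mapping { long start; long len; bool hole; };

int main(void)
{
	const struct mapping in[] = {
		{ 0, 4, false }, { 4, 8, true }, { 12, 4, false },
	};
	struct mapping out[3];
	int i, cur_ext = 0;
	bool skip_holes = true;	/* plays the role of BMV_IF_NO_HOLES */

	for (i = 0; i < 3; i++) {
		out[cur_ext] = in[i];
		if (skip_holes && in[i].hole) {
			/* reuse this slot for the next mapping */
			memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
			continue;
		}
		cur_ext++;
	}
	printf("returned %d of 3 mappings\n", cur_ext);	/* prints 2 */
	return 0;
}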
@@ -5859,66 +5660,34 @@ xfs_bmap_eof(
5859} 5660}
5860 5661
5861#ifdef DEBUG 5662#ifdef DEBUG
5862STATIC 5663STATIC struct xfs_buf *
5863xfs_buf_t *
5864xfs_bmap_get_bp( 5664xfs_bmap_get_bp(
5865 xfs_btree_cur_t *cur, 5665 struct xfs_btree_cur *cur,
5866 xfs_fsblock_t bno) 5666 xfs_fsblock_t bno)
5867{ 5667{
5868 int i; 5668 struct xfs_log_item_desc *lidp;
5869 xfs_buf_t *bp; 5669 int i;
5870 5670
5871 if (!cur) 5671 if (!cur)
5872 return(NULL); 5672 return NULL;
5873
5874 bp = NULL;
5875 for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5876 bp = cur->bc_bufs[i];
5877 if (!bp) break;
5878 if (XFS_BUF_ADDR(bp) == bno)
5879 break; /* Found it */
5880 }
5881 if (i == XFS_BTREE_MAXLEVELS)
5882 bp = NULL;
5883
5884 if (!bp) { /* Chase down all the log items to see if the bp is there */
5885 xfs_log_item_chunk_t *licp;
5886 xfs_trans_t *tp;
5887
5888 tp = cur->bc_tp;
5889 licp = &tp->t_items;
5890 while (!bp && licp != NULL) {
5891 if (xfs_lic_are_all_free(licp)) {
5892 licp = licp->lic_next;
5893 continue;
5894 }
5895 for (i = 0; i < licp->lic_unused; i++) {
5896 xfs_log_item_desc_t *lidp;
5897 xfs_log_item_t *lip;
5898 xfs_buf_log_item_t *bip;
5899 xfs_buf_t *lbp;
5900
5901 if (xfs_lic_isfree(licp, i)) {
5902 continue;
5903 }
5904
5905 lidp = xfs_lic_slot(licp, i);
5906 lip = lidp->lid_item;
5907 if (lip->li_type != XFS_LI_BUF)
5908 continue;
5909 5673
5910 bip = (xfs_buf_log_item_t *)lip; 5674 for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5911 lbp = bip->bli_buf; 5675 if (!cur->bc_bufs[i])
5676 break;
5677 if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
5678 return cur->bc_bufs[i];
5679 }
5912 5680
5913 if (XFS_BUF_ADDR(lbp) == bno) { 5681 /* Chase down all the log items to see if the bp is there */
5914 bp = lbp; 5682 list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
5915 break; /* Found it */ 5683 struct xfs_buf_log_item *bip;
5916 } 5684 bip = (struct xfs_buf_log_item *)lidp->lid_item;
5917 } 5685 if (bip->bli_item.li_type == XFS_LI_BUF &&
5918 licp = licp->lic_next; 5686 XFS_BUF_ADDR(bip->bli_buf) == bno)
5919 } 5687 return bip->bli_buf;
5920 } 5688 }
5921 return(bp); 5689
5690 return NULL;
5922} 5691}
5923 5692
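
The rewritten xfs_bmap_get_bp() above drops the hand-rolled walk over log item chunks in favor of a single list_for_each_entry() pass over the transaction's t_items list. A self-contained sketch of what that intrusive-list idiom expands to (simplified types, not the kernel's list.h):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

struct item_desc {
        int type;
        struct list_head lid_trans;     /* embedded links into the list */
};

/* list_for_each_entry(lidp, head, lid_trans) boils down to: follow the
 * embedded links and recover each containing item_desc. */
static void walk_items(struct list_head *head)
{
        struct list_head *pos;

        for (pos = head->next; pos != head; pos = pos->next) {
                struct item_desc *lidp =
                        container_of(pos, struct item_desc, lid_trans);
                printf("item type %d\n", lidp->type);
        }
}
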
5924STATIC void 5693STATIC void
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 419dafb9d87d..b13569a6179b 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -28,20 +28,6 @@ struct xfs_trans;
28extern kmem_zone_t *xfs_bmap_free_item_zone; 28extern kmem_zone_t *xfs_bmap_free_item_zone;
29 29
30/* 30/*
31 * DELTA: describe a change to the in-core extent list.
32 *
 33 * Internally the use of xed_blockcount is somewhat funky.
34 * xed_blockcount contains an offset much of the time because this
35 * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are
36 * the same underlying type).
37 */
38typedef struct xfs_extdelta
39{
40 xfs_fileoff_t xed_startoff; /* offset of range */
41 xfs_filblks_t xed_blockcount; /* blocks in range */
42} xfs_extdelta_t;
43
44/*
45 * List of extents to be free "later". 31 * List of extents to be free "later".
46 * The list is kept sorted on xbf_startblock. 32 * The list is kept sorted on xbf_startblock.
47 */ 33 */
@@ -82,16 +68,13 @@ typedef struct xfs_bmap_free
82#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ 68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
83#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
84#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
85#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
86#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ 72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
87#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ 73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
88#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ 74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
89#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
90#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */
91 /* combine contig. space */ 75 /* combine contig. space */
92#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ 76#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
93/* XFS_BMAPI_DIRECT_IO 0x800 */ 77#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */
94#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */
95 /* need write cache flushing and no */ 78 /* need write cache flushing and no */
96 /* additional allocation alignments */ 79 /* additional allocation alignments */
97 80
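
Dropping XFS_BMAPI_EXACT, XFS_BMAPI_ASYNC, and the reserved DIRECT_IO bit lets the remaining flags above be repacked into consecutive bit values. That renumbering is safe only because these flags are in-memory and never persisted; every bitwise test keeps working as long as callers use the macro names. A tiny illustration (the 0x001 value of XFS_BMAPI_WRITE is assumed, as it falls outside the hunk):

#include <assert.h>

#define XFS_BMAPI_WRITE    0x001        /* assumed; not shown in the hunk */
#define XFS_BMAPI_ATTRFORK 0x010        /* was 0x020 before repacking */
#define XFS_BMAPI_CONTIG   0x100        /* was 0x400 before repacking */

int main(void)
{
        int flags = XFS_BMAPI_WRITE | XFS_BMAPI_CONTIG;

        assert(flags & XFS_BMAPI_CONTIG);       /* set */
        assert(!(flags & XFS_BMAPI_ATTRFORK));  /* not set */
        return 0;
}
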
@@ -100,9 +83,7 @@ typedef struct xfs_bmap_free
100 { XFS_BMAPI_DELAY, "DELAY" }, \ 83 { XFS_BMAPI_DELAY, "DELAY" }, \
101 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 84 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
102 { XFS_BMAPI_METADATA, "METADATA" }, \ 85 { XFS_BMAPI_METADATA, "METADATA" }, \
103 { XFS_BMAPI_EXACT, "EXACT" }, \
104 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 86 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
105 { XFS_BMAPI_ASYNC, "ASYNC" }, \
106 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ 87 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
107 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 88 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
108 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 89 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
@@ -310,9 +291,7 @@ xfs_bmapi(
310 xfs_extlen_t total, /* total blocks needed */ 291 xfs_extlen_t total, /* total blocks needed */
311 struct xfs_bmbt_irec *mval, /* output: map values */ 292 struct xfs_bmbt_irec *mval, /* output: map values */
312 int *nmap, /* i/o: mval size/count */ 293 int *nmap, /* i/o: mval size/count */
313 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 294 xfs_bmap_free_t *flist); /* i/o: list extents to free */
314 xfs_extdelta_t *delta); /* o: change made to incore
315 extents */
316 295
317/* 296/*
318 * Map file blocks to filesystem blocks, simple version. 297 * Map file blocks to filesystem blocks, simple version.
@@ -346,8 +325,6 @@ xfs_bunmapi(
346 xfs_fsblock_t *firstblock, /* first allocated block 325 xfs_fsblock_t *firstblock, /* first allocated block
347 controls a.g. for allocs */ 326 controls a.g. for allocs */
348 xfs_bmap_free_t *flist, /* i/o: list extents to free */ 327 xfs_bmap_free_t *flist, /* i/o: list extents to free */
349 xfs_extdelta_t *delta, /* o: change made to incore
350 extents */
351 int *done); /* set if not done yet */ 328 int *done); /* set if not done yet */
352 329
353/* 330/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 416e47e54b83..87d3c10b6954 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,21 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 34#include "xfs_alloc.h"
39#include "xfs_btree.h" 35#include "xfs_btree.h"
40#include "xfs_btree_trace.h" 36#include "xfs_btree_trace.h"
41#include "xfs_ialloc.h"
42#include "xfs_itable.h" 37#include "xfs_itable.h"
43#include "xfs_bmap.h" 38#include "xfs_bmap.h"
44#include "xfs_error.h" 39#include "xfs_error.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 96be4b0f2496..829af92f0fba 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,20 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
38#include "xfs_btree.h" 34#include "xfs_btree.h"
39#include "xfs_btree_trace.h" 35#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h"
41#include "xfs_error.h" 36#include "xfs_error.h"
42#include "xfs_trace.h" 37#include "xfs_trace.h"
43 38
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..1b09d7a280df 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_buf_item.h" 28#include "xfs_buf_item.h"
30#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
@@ -34,6 +33,12 @@
34 33
35kmem_zone_t *xfs_buf_item_zone; 34kmem_zone_t *xfs_buf_item_zone;
36 35
36static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
37{
38 return container_of(lip, struct xfs_buf_log_item, bli_item);
39}
40
41
37#ifdef XFS_TRANS_DEBUG 42#ifdef XFS_TRANS_DEBUG
38/* 43/*
39 * This function uses an alternate strategy for tracking the bytes 44 * This function uses an alternate strategy for tracking the bytes
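
The new BUF_ITEM() helper above is the usual container_of() downcast: each iop_* callback now receives the generic struct xfs_log_item * embedded in the buf log item and recovers its container from it. A self-contained sketch of the pattern with illustrative types:

#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct log_item { int li_type; };

struct buf_log_item {
        struct log_item bli_item;       /* embedded generic base object */
        int             bli_flags;
};

/* Recover the containing structure from a pointer to its embedded base. */
static inline struct buf_log_item *BUF_ITEM_SKETCH(struct log_item *lip)
{
        return container_of(lip, struct buf_log_item, bli_item);
}
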
@@ -64,7 +69,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 69 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 70 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 71 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 72 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 73 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 74 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 75 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
151 */ 156 */
152STATIC uint 157STATIC uint
153xfs_buf_item_size( 158xfs_buf_item_size(
154 xfs_buf_log_item_t *bip) 159 struct xfs_log_item *lip)
155{ 160{
156 uint nvecs; 161 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
157 int next_bit; 162 struct xfs_buf *bp = bip->bli_buf;
158 int last_bit; 163 uint nvecs;
159 xfs_buf_t *bp; 164 int next_bit;
165 int last_bit;
160 166
161 ASSERT(atomic_read(&bip->bli_refcount) > 0); 167 ASSERT(atomic_read(&bip->bli_refcount) > 0);
162 if (bip->bli_flags & XFS_BLI_STALE) { 168 if (bip->bli_flags & XFS_BLI_STALE) {
@@ -166,11 +172,10 @@ xfs_buf_item_size(
166 * cancel flag in it. 172 * cancel flag in it.
167 */ 173 */
168 trace_xfs_buf_item_size_stale(bip); 174 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 175 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 176 return 1;
171 } 177 }
172 178
173 bp = bip->bli_buf;
174 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 179 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
175 nvecs = 1; 180 nvecs = 1;
176 last_bit = xfs_next_bit(bip->bli_format.blf_data_map, 181 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
@@ -197,9 +202,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 202 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 203 last_bit = next_bit;
199 nvecs++; 204 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 205 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 206 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 207 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 208 last_bit = next_bit;
204 nvecs++; 209 nvecs++;
205 } else { 210 } else {
@@ -219,13 +224,13 @@ xfs_buf_item_size(
219 */ 224 */
220STATIC void 225STATIC void
221xfs_buf_item_format( 226xfs_buf_item_format(
222 xfs_buf_log_item_t *bip, 227 struct xfs_log_item *lip,
223 xfs_log_iovec_t *log_vector) 228 struct xfs_log_iovec *vecp)
224{ 229{
230 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
231 struct xfs_buf *bp = bip->bli_buf;
225 uint base_size; 232 uint base_size;
226 uint nvecs; 233 uint nvecs;
227 xfs_log_iovec_t *vecp;
228 xfs_buf_t *bp;
229 int first_bit; 234 int first_bit;
230 int last_bit; 235 int last_bit;
231 int next_bit; 236 int next_bit;
@@ -235,8 +240,6 @@ xfs_buf_item_format(
235 ASSERT(atomic_read(&bip->bli_refcount) > 0); 240 ASSERT(atomic_read(&bip->bli_refcount) > 0);
236 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 241 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
237 (bip->bli_flags & XFS_BLI_STALE)); 242 (bip->bli_flags & XFS_BLI_STALE));
238 bp = bip->bli_buf;
239 vecp = log_vector;
240 243
241 /* 244 /*
242 * The size of the base structure is the size of the 245 * The size of the base structure is the size of the
@@ -248,12 +251,26 @@ xfs_buf_item_format(
248 base_size = 251 base_size =
249 (uint)(sizeof(xfs_buf_log_format_t) + 252 (uint)(sizeof(xfs_buf_log_format_t) +
250 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 253 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
251 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 254 vecp->i_addr = &bip->bli_format;
252 vecp->i_len = base_size; 255 vecp->i_len = base_size;
253 vecp->i_type = XLOG_REG_TYPE_BFORMAT; 256 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
254 vecp++; 257 vecp++;
255 nvecs = 1; 258 nvecs = 1;
256 259
260 /*
261 * If it is an inode buffer, transfer the in-memory state to the
262 * format flags and clear the in-memory state. We do not transfer
263 * this state if the inode buffer allocation has not yet been committed
 264 * to the log, as setting the XFS_BLF_INODE_BUF flag will prevent
265 * correct replay of the inode allocation.
266 */
267 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
268 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
269 xfs_log_item_in_current_chkpt(lip)))
270 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
271 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
272 }
273
257 if (bip->bli_flags & XFS_BLI_STALE) { 274 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 275 /*
259 * The buffer is stale, so all we need to log 276 * The buffer is stale, so all we need to log
@@ -261,7 +278,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 278 * cancel flag in it.
262 */ 279 */
263 trace_xfs_buf_item_format_stale(bip); 280 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 281 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 282 bip->bli_format.blf_size = nvecs;
266 return; 283 return;
267 } 284 }
@@ -294,28 +311,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 311 * keep counting and scanning.
295 */ 312 */
296 if (next_bit == -1) { 313 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 314 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 315 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 316 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 317 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 318 nvecs++;
302 break; 319 break;
303 } else if (next_bit != last_bit + 1) { 320 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 321 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 322 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 323 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 324 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 325 nvecs++;
309 vecp++; 326 vecp++;
310 first_bit = next_bit; 327 first_bit = next_bit;
311 last_bit = next_bit; 328 last_bit = next_bit;
312 nbits = 1; 329 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 330 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 331 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 332 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 333 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 334 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 335 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 336 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 320/* You would think we need to bump the nvecs here too, but we do not: 337/* You would think we need to bump the nvecs here too, but we do not:
321 * this number is used by recovery, and it gets confused by the boundary 338 * this number is used by recovery, and it gets confused by the boundary
@@ -341,61 +358,91 @@ xfs_buf_item_format(
341} 358}
342 359
343/* 360/*
344 * This is called to pin the buffer associated with the buf log 361 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 362 * so it cannot be written out.
346 * on the buffer to do this. 363 *
364 * We also always take a reference to the buffer log item here so that the bli
365 * is held while the item is pinned in memory. This means that we can
366 * unconditionally drop the reference count a transaction holds when the
367 * transaction is completed.
347 */ 368 */
348STATIC void 369STATIC void
349xfs_buf_item_pin( 370xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 371 struct xfs_log_item *lip)
351{ 372{
352 xfs_buf_t *bp; 373 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
353 374
354 bp = bip->bli_buf; 375 ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
355 ASSERT(XFS_BUF_ISBUSY(bp));
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 377 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 378 (bip->bli_flags & XFS_BLI_STALE));
379
359 trace_xfs_buf_item_pin(bip); 380 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp);
361}
362 381
382 atomic_inc(&bip->bli_refcount);
383 atomic_inc(&bip->bli_buf->b_pin_count);
384}
363 385
364/* 386/*
365 * This is called to unpin the buffer associated with the buf log 387 * This is called to unpin the buffer associated with the buf log
366 * item which was previously pinned with a call to xfs_buf_item_pin(). 388 * item which was previously pinned with a call to xfs_buf_item_pin().
367 * Just call bunpin() on the buffer to do this.
368 * 389 *
369 * Also drop the reference to the buf item for the current transaction. 390 * Also drop the reference to the buf item for the current transaction.
370 * If the XFS_BLI_STALE flag is set and we are the last reference, 391 * If the XFS_BLI_STALE flag is set and we are the last reference,
371 * then free up the buf log item and unlock the buffer. 392 * then free up the buf log item and unlock the buffer.
393 *
394 * If the remove flag is set we are called from uncommit in the
395 * forced-shutdown path. If that is true and the reference count on
396 * the log item is going to drop to zero we need to free the item's
397 * descriptor in the transaction.
372 */ 398 */
373STATIC void 399STATIC void
374xfs_buf_item_unpin( 400xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 401 struct xfs_log_item *lip,
376 int stale) 402 int remove)
377{ 403{
378 struct xfs_ail *ailp; 404 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
379 xfs_buf_t *bp; 405 xfs_buf_t *bp = bip->bli_buf;
406 struct xfs_ail *ailp = lip->li_ailp;
407 int stale = bip->bli_flags & XFS_BLI_STALE;
380 int freed; 408 int freed;
381 409
382 bp = bip->bli_buf;
383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 410 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 411 ASSERT(atomic_read(&bip->bli_refcount) > 0);
412
386 trace_xfs_buf_item_unpin(bip); 413 trace_xfs_buf_item_unpin(bip);
387 414
388 freed = atomic_dec_and_test(&bip->bli_refcount); 415 freed = atomic_dec_and_test(&bip->bli_refcount);
389 ailp = bip->bli_item.li_ailp; 416
390 xfs_bunpin(bp); 417 if (atomic_dec_and_test(&bp->b_pin_count))
418 wake_up_all(&bp->b_waiters);
419
391 if (freed && stale) { 420 if (freed && stale) {
392 ASSERT(bip->bli_flags & XFS_BLI_STALE); 421 ASSERT(bip->bli_flags & XFS_BLI_STALE);
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 422 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 423 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 424 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 425 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
426
397 trace_xfs_buf_item_unpin_stale(bip); 427 trace_xfs_buf_item_unpin_stale(bip);
398 428
429 if (remove) {
430 /*
431 * We have to remove the log item from the transaction
432 * as we are about to release our reference to the
433 * buffer. If we don't, the unlock that occurs later
 434 * in xfs_trans_uncommit() will try to reference the
435 * buffer which we no longer have a hold on.
436 */
437 xfs_trans_del_item(lip);
438
439 /*
440 * Since the transaction no longer refers to the buffer,
441 * the buffer should no longer refer to the transaction.
442 */
443 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
444 }
445
399 /* 446 /*
400 * If we get called here because of an IO error, we may 447 * If we get called here because of an IO error, we may
401 * or may not have the item on the AIL. xfs_trans_ail_delete() 448 * or may not have the item on the AIL. xfs_trans_ail_delete()
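
The pin/unpin pair above now does its own reference accounting: pinning bumps both the bli refcount and the buffer's b_pin_count (replacing xfs_bpin()), and unpinning drops both, waking waiters when the last pin goes away. A hedged sketch using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdbool.h>

struct buf { atomic_int pin_count; };   /* stands in for b_pin_count */

struct buf_log_item {
        atomic_int  refcount;           /* stands in for bli_refcount */
        struct buf *buf;
};

static void item_pin(struct buf_log_item *bip)
{
        atomic_fetch_add(&bip->refcount, 1);
        atomic_fetch_add(&bip->buf->pin_count, 1);
}

/* Returns true when the caller dropped the last item reference and must
 * handle the stale-buffer teardown, as the kernel code above does. */
static bool item_unpin(struct buf_log_item *bip)
{
        bool freed = atomic_fetch_sub(&bip->refcount, 1) == 1;

        if (atomic_fetch_sub(&bip->buf->pin_count, 1) == 1)
                ;                       /* wake_up_all(&bp->b_waiters) here */
        return freed;
}
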
@@ -417,54 +464,6 @@ xfs_buf_item_unpin(
417} 464}
418 465
419/* 466/*
420 * this is called from uncommit in the forced-shutdown path.
421 * we need to check to see if the reference count on the log item
422 * is going to drop to zero. If so, unpin will free the log item
423 * so we need to free the item's descriptor (that points to the item)
424 * in the transaction.
425 */
426STATIC void
427xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp)
430{
431 xfs_buf_t *bp;
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) {
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip);
443
444 /*
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp);
455 /*
456 * Since the transaction no longer refers to the buffer,
457 * the buffer should no longer refer to the transaction.
458 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
460 }
461
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465}
466
467/*
468 * This is called to attempt to lock the buffer associated with this 467 * This is called to attempt to lock the buffer associated with this
469 * buf log item. Don't sleep on the buffer lock. If we can't get 468 * buf log item. Don't sleep on the buffer lock. If we can't get
470 * the lock right away, return 0. If we can get the lock, take a 469 * the lock right away, return 0. If we can get the lock, take a
@@ -474,11 +473,11 @@ xfs_buf_item_unpin_remove(
474 */ 473 */
475STATIC uint 474STATIC uint
476xfs_buf_item_trylock( 475xfs_buf_item_trylock(
477 xfs_buf_log_item_t *bip) 476 struct xfs_log_item *lip)
478{ 477{
479 xfs_buf_t *bp; 478 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
479 struct xfs_buf *bp = bip->bli_buf;
480 480
481 bp = bip->bli_buf;
482 if (XFS_BUF_ISPINNED(bp)) 481 if (XFS_BUF_ISPINNED(bp))
483 return XFS_ITEM_PINNED; 482 return XFS_ITEM_PINNED;
484 if (!XFS_BUF_CPSEMA(bp)) 483 if (!XFS_BUF_CPSEMA(bp))
@@ -495,98 +494,81 @@ xfs_buf_item_trylock(
495} 494}
496 495
497/* 496/*
498 * Release the buffer associated with the buf log item. 497 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 498 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 499 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 500 *
502 * buffer. 501 * This call ignores the recursion count. It is only called when the buffer
502 * should REALLY be unlocked, regardless of the recursion count.
503 * 503 *
504 * This call ignores the recursion count. It is only called 504 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 505 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 506 * can safely drop the transaction reference now. This also allows us to avoid
507 * potential races with the unpin code freeing the bli by not referencing the
508 * bli after we've dropped the reference count.
507 * 509 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 510 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 511 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 512 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 513 * free the item.
512 */ 514 */
513STATIC void 515STATIC void
514xfs_buf_item_unlock( 516xfs_buf_item_unlock(
515 xfs_buf_log_item_t *bip) 517 struct xfs_log_item *lip)
516{ 518{
517 int aborted; 519 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
518 xfs_buf_t *bp; 520 struct xfs_buf *bp = bip->bli_buf;
519 uint hold; 521 int aborted;
522 uint hold;
520 523
521 bp = bip->bli_buf; 524 /* Clear the buffer's association with this transaction. */
525 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
522 526
523 /* 527 /*
524 * Clear the buffer's association with this transaction. 528 * If this is a transaction abort, don't return early. Instead, allow
529 * the brelse to happen. Normally it would be done for stale
530 * (cancelled) buffers at unpin time, but we'll never go through the
531 * pin/unpin cycle if we abort inside commit.
525 */ 532 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 533 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
527 534
528 /* 535 /*
529 * If this is a transaction abort, don't return early. 536 * Before possibly freeing the buf item, determine if we should
530 * Instead, allow the brelse to happen. 537 * release the buffer at the end of this routine.
531 * Normally it would be done for stale (cancelled) buffers
532 * at unpin time, but we'll never go through the pin/unpin
533 * cycle if we abort inside commit.
534 */ 538 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 539 hold = bip->bli_flags & XFS_BLI_HOLD;
540
541 /* Clear the per transaction state. */
542 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
536 543
537 /* 544 /*
538 * If the buf item is marked stale, then don't do anything. 545 * If the buf item is marked stale, then don't do anything. We'll
539 * We'll unlock the buffer and free the buf item when the 546 * unlock the buffer and free the buf item when the buffer is unpinned
540 * buffer is unpinned for the last time. 547 * for the last time.
541 */ 548 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 549 if (bip->bli_flags & XFS_BLI_STALE) {
543 bip->bli_flags &= ~XFS_BLI_LOGGED;
544 trace_xfs_buf_item_unlock_stale(bip); 550 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 551 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) 552 if (!aborted) {
553 atomic_dec(&bip->bli_refcount);
547 return; 554 return;
555 }
548 } 556 }
549 557
550 /*
551 * Drop the transaction's reference to the log item if
552 * it was not logged as part of the transaction. Otherwise
553 * we'll drop the reference in xfs_buf_item_unpin() when
554 * the transaction is really through with the buffer.
555 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
557 atomic_dec(&bip->bli_refcount);
558 } else {
559 /*
560 * Clear the logged flag since this is per
561 * transaction state.
562 */
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 }
565
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 558 trace_xfs_buf_item_unlock(bip);
572 559
573 /* 560 /*
574 * If the buf item isn't tracking any data, free it. 561 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 562 * reference we hold to it.
576 */ 563 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 564 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 565 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 566 xfs_buf_item_relse(bp);
580 } else if (hold) { 567 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 568 atomic_dec(&bip->bli_refcount);
582 }
583 569
584 /* 570 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 571 xfs_buf_relse(bp);
589 }
590} 572}
591 573
592/* 574/*
@@ -609,16 +591,16 @@ xfs_buf_item_unlock(
609 */ 591 */
610STATIC xfs_lsn_t 592STATIC xfs_lsn_t
611xfs_buf_item_committed( 593xfs_buf_item_committed(
612 xfs_buf_log_item_t *bip, 594 struct xfs_log_item *lip,
613 xfs_lsn_t lsn) 595 xfs_lsn_t lsn)
614{ 596{
597 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
598
615 trace_xfs_buf_item_committed(bip); 599 trace_xfs_buf_item_committed(bip);
616 600
617 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 601 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
618 (bip->bli_item.li_lsn != 0)) { 602 return lip->li_lsn;
619 return bip->bli_item.li_lsn; 603 return lsn;
620 }
621 return (lsn);
622} 604}
623 605
624/* 606/*
@@ -628,15 +610,16 @@ xfs_buf_item_committed(
628 */ 610 */
629STATIC void 611STATIC void
630xfs_buf_item_push( 612xfs_buf_item_push(
631 xfs_buf_log_item_t *bip) 613 struct xfs_log_item *lip)
632{ 614{
633 xfs_buf_t *bp; 615 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
616 struct xfs_buf *bp = bip->bli_buf;
634 617
635 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 618 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
619 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
620
636 trace_xfs_buf_item_push(bip); 621 trace_xfs_buf_item_push(bip);
637 622
638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp); 623 xfs_buf_relse(bp);
641} 624}
642 625
@@ -648,22 +631,24 @@ xfs_buf_item_push(
648 */ 631 */
649STATIC void 632STATIC void
650xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
651 xfs_buf_log_item_t *bip) 634 struct xfs_log_item *lip)
652{ 635{
653 xfs_buf_t *bp; 636 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
637 struct xfs_buf *bp = bip->bli_buf;
654 638
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 639 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
640 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
641
656 trace_xfs_buf_item_pushbuf(bip); 642 trace_xfs_buf_item_pushbuf(bip);
657 643
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
662} 646}
663 647
664/* ARGSUSED */
665STATIC void 648STATIC void
666xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) 649xfs_buf_item_committing(
650 struct xfs_log_item *lip,
651 xfs_lsn_t commit_lsn)
667{ 652{
668} 653}
669 654
@@ -671,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
671 * This is the ops vector shared by all buf log items. 656 * This is the ops vector shared by all buf log items.
672 */ 657 */
673static struct xfs_item_ops xfs_buf_item_ops = { 658static struct xfs_item_ops xfs_buf_item_ops = {
674 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, 659 .iop_size = xfs_buf_item_size,
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 660 .iop_format = xfs_buf_item_format,
676 xfs_buf_item_format, 661 .iop_pin = xfs_buf_item_pin,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 662 .iop_unpin = xfs_buf_item_unpin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 663 .iop_trylock = xfs_buf_item_trylock,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 664 .iop_unlock = xfs_buf_item_unlock,
680 xfs_buf_item_unpin_remove, 665 .iop_committed = xfs_buf_item_committed,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 666 .iop_push = xfs_buf_item_push,
682 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, 667 .iop_pushbuf = xfs_buf_item_pushbuf,
683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 668 .iop_committing = xfs_buf_item_committing
684 xfs_buf_item_committed,
685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
688 xfs_buf_item_committing
689}; 669};
690 670
691 671
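
The old table filled each slot by casting functions taking xfs_buf_log_item_t * to the xfs_log_item_t * signature; calling through such a converted pointer is undefined behavior in C. With every handler now taking struct xfs_log_item * directly (and downcasting via BUF_ITEM() internally), the assignments need no casts. In miniature:

struct log_item;

struct item_ops {
        unsigned (*iop_size)(struct log_item *lip);
};

static unsigned buf_item_size(struct log_item *lip)
{
        (void)lip;
        return 1;                       /* placeholder */
}

/* old:  .iop_size = (unsigned (*)(struct log_item *))old_size   -- UB */
static const struct item_ops buf_item_ops_sketch = {
        .iop_size = buf_item_size,      /* matching type, no cast needed */
};
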
@@ -714,7 +694,6 @@ xfs_buf_item_init(
714 */ 694 */
715 if (bp->b_mount != mp) 695 if (bp->b_mount != mp)
716 bp->b_mount = mp; 696 bp->b_mount = mp;
717 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
718 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 697 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
719 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 698 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
720 if (lip->li_type == XFS_LI_BUF) { 699 if (lip->li_type == XFS_LI_BUF) {
@@ -723,20 +702,17 @@ xfs_buf_item_init(
723 } 702 }
724 703
725 /* 704 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 705 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 706 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 707 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 708 * bitmap needed to describe the chunks of the buffer.
730 */ 709 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 710 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 711 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 712
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 713 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 714 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 715 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 716 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 717 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 718 bip->bli_format.blf_type = XFS_LI_BUF;
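
The chunks/map_size arithmetic above sizes the dirty-region bitmap: the buffer is split into 128-byte (XFS_BLF_CHUNK) pieces, with one bit per piece packed NBWORD bits to a word. A worked example for a hypothetical 4096-byte buffer:

#include <stdio.h>

#define XFS_BLF_CHUNK     128
#define XFS_BLF_SHIFT     7
#define BIT_TO_WORD_SHIFT 5
#define NBWORD            (8 * (int)sizeof(unsigned int))

int main(void)
{
        int count = 4096;       /* hypothetical XFS_BUF_COUNT(bp) */
        int chunks = (count + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT;
        int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

        /* 4096 bytes -> 32 chunks -> 2 words of map bits on 32-bit int. */
        printf("chunks=%d map_size=%d\n", chunks, map_size);
        return 0;
}
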
@@ -799,8 +775,8 @@ xfs_buf_item_log(
799 /* 775 /*
800 * Convert byte offsets to bit numbers. 776 * Convert byte offsets to bit numbers.
801 */ 777 */
802 first_bit = first >> XFS_BLI_SHIFT; 778 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 779 last_bit = last >> XFS_BLF_SHIFT;
804 780
805 /* 781 /*
806 * Calculate the total number of bits to be set. 782 * Calculate the total number of bits to be set.
@@ -1103,15 +1079,14 @@ xfs_buf_error_relse(
1103 * It is called by xfs_buf_iodone_callbacks() above which will take 1079 * It is called by xfs_buf_iodone_callbacks() above which will take
1104 * care of cleaning up the buffer itself. 1080 * care of cleaning up the buffer itself.
1105 */ 1081 */
1106/* ARGSUSED */
1107void 1082void
1108xfs_buf_iodone( 1083xfs_buf_iodone(
1109 xfs_buf_t *bp, 1084 struct xfs_buf *bp,
1110 xfs_buf_log_item_t *bip) 1085 struct xfs_log_item *lip)
1111{ 1086{
1112 struct xfs_ail *ailp = bip->bli_item.li_ailp; 1087 struct xfs_ail *ailp = lip->li_ailp;
1113 1088
1114 ASSERT(bip->bli_buf == bp); 1089 ASSERT(BUF_ITEM(lip)->bli_buf == bp);
1115 1090
1116 xfs_buf_rele(bp); 1091 xfs_buf_rele(bp);
1117 1092
@@ -1125,6 +1100,6 @@ xfs_buf_iodone(
1125 * Either way, AIL is useless if we're forcing a shutdown. 1100 * Either way, AIL is useless if we're forcing a shutdown.
1126 */ 1101 */
1127 spin_lock(&ailp->xa_lock); 1102 spin_lock(&ailp->xa_lock);
1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1103 xfs_trans_ail_delete(ailp, lip);
1129 xfs_buf_item_free(bip); 1104 xfs_buf_item_free(BUF_ITEM(lip));
1130} 1105}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..0e2ed43f16c7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
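
The rename splits the two flag namespaces cleanly: XFS_BLF_* values belong to the on-disk blf_flags word of the log format, while XFS_BLI_* values, including the new XFS_BLI_INODE_BUF (0x40), are purely in-memory state in bli_flags. A simplified sketch of the handoff between the two (the log-checkpoint guard from the xfs_buf_item_format() hunk earlier in this patch is omitted):

#define XFS_BLF_INODE_BUF 0x1   /* on-disk: goes into blf_flags */
#define XFS_BLI_INODE_BUF 0x40  /* in-memory only: lives in bli_flags */

struct buf_log_format { unsigned short blf_flags; };
struct buf_log_item {
        unsigned              bli_flags;
        struct buf_log_format bli_format;
};

/* At format time, translate the in-memory flag to its on-disk
 * counterpart, then clear the in-memory state. */
static void transfer_inode_buf_state(struct buf_log_item *bip)
{
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
                bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }
}
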
@@ -122,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
122 void(*)(struct xfs_buf *, xfs_log_item_t *), 124 void(*)(struct xfs_buf *, xfs_log_item_t *),
123 xfs_log_item_t *); 125 xfs_log_item_t *);
124void xfs_buf_iodone_callbacks(struct xfs_buf *); 126void xfs_buf_iodone_callbacks(struct xfs_buf *);
125void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); 127void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
126 128
127#ifdef XFS_TRANS_DEBUG 129#ifdef XFS_TRANS_DEBUG
128void 130void
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0ca556b4bf31..30fa0e206fba 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -25,19 +25,14 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h" 31#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
39#include "xfs_alloc.h" 35#include "xfs_alloc.h"
40#include "xfs_btree.h"
41#include "xfs_bmap.h" 36#include "xfs_bmap.h"
42#include "xfs_attr.h" 37#include "xfs_attr.h"
43#include "xfs_attr_leaf.h" 38#include "xfs_attr_leaf.h"
@@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
581 xfs_da_intnode_t *node; 576 xfs_da_intnode_t *node;
582 xfs_da_node_entry_t *btree; 577 xfs_da_node_entry_t *btree;
583 int tmp; 578 int tmp;
584 xfs_mount_t *mp;
585 579
586 node = oldblk->bp->data; 580 node = oldblk->bp->data;
587 mp = state->mp;
588 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); 581 ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
589 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); 582 ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
590 ASSERT(newblk->blkno != 0); 583 ASSERT(newblk->blkno != 0);
591 if (state->args->whichfork == XFS_DATA_FORK) 584 if (state->args->whichfork == XFS_DATA_FORK)
592 ASSERT(newblk->blkno >= mp->m_dirleafblk && 585 ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
593 newblk->blkno < mp->m_dirfreeblk); 586 newblk->blkno < state->mp->m_dirfreeblk);
594 587
595 /* 588 /*
596 * We may need to make some room before we insert the new node. 589 * We may need to make some room before we insert the new node.
@@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1601 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1594 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1602 XFS_BMAPI_CONTIG, 1595 XFS_BMAPI_CONTIG,
1603 args->firstblock, args->total, &map, &nmap, 1596 args->firstblock, args->total, &map, &nmap,
1604 args->flist, NULL))) { 1597 args->flist))) {
1605 return error; 1598 return error;
1606 } 1599 }
1607 ASSERT(nmap <= 1); 1600 ASSERT(nmap <= 1);
@@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1622 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| 1615 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
1623 XFS_BMAPI_METADATA, 1616 XFS_BMAPI_METADATA,
1624 args->firstblock, args->total, 1617 args->firstblock, args->total,
1625 &mapp[mapi], &nmap, args->flist, 1618 &mapp[mapi], &nmap, args->flist))) {
1626 NULL))) {
1627 kmem_free(mapp); 1619 kmem_free(mapp);
1628 return error; 1620 return error;
1629 } 1621 }
@@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1884 */ 1876 */
1885 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, 1877 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1886 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 1878 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1887 0, args->firstblock, args->flist, NULL, 1879 0, args->firstblock, args->flist,
1888 &done)) == ENOSPC) { 1880 &done)) == ENOSPC) {
1889 if (w != XFS_DATA_FORK) 1881 if (w != XFS_DATA_FORK)
1890 break; 1882 break;
@@ -1989,7 +1981,7 @@ xfs_da_do_buf(
1989 nfsb, 1981 nfsb,
1990 XFS_BMAPI_METADATA | 1982 XFS_BMAPI_METADATA |
1991 xfs_bmapi_aflag(whichfork), 1983 xfs_bmapi_aflag(whichfork),
1992 NULL, 0, mapp, &nmap, NULL, NULL))) 1984 NULL, 0, mapp, &nmap, NULL)))
1993 goto exit0; 1985 goto exit0;
1994 } 1986 }
1995 } else { 1987 } else {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 5bba29a07812..3b9582c60a22 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,24 +24,15 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
38#include "xfs_bmap.h" 32#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_itable.h" 33#include "xfs_itable.h"
42#include "xfs_dfrag.h" 34#include "xfs_dfrag.h"
43#include "xfs_error.h" 35#include "xfs_error.h"
44#include "xfs_rw.h"
45#include "xfs_vnodeops.h" 36#include "xfs_vnodeops.h"
46#include "xfs_trace.h" 37#include "xfs_trace.h"
47 38
@@ -69,7 +60,9 @@ xfs_swapext(
69 goto out; 60 goto out;
70 } 61 }
71 62
72 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 63 if (!(file->f_mode & FMODE_WRITE) ||
64 !(file->f_mode & FMODE_READ) ||
65 (file->f_flags & O_APPEND)) {
73 error = XFS_ERROR(EBADF); 66 error = XFS_ERROR(EBADF);
74 goto out_put_file; 67 goto out_put_file;
75 } 68 }
@@ -81,6 +74,7 @@ xfs_swapext(
81 } 74 }
82 75
83 if (!(tmp_file->f_mode & FMODE_WRITE) || 76 if (!(tmp_file->f_mode & FMODE_WRITE) ||
77 !(tmp_file->f_mode & FMODE_READ) ||
84 (tmp_file->f_flags & O_APPEND)) { 78 (tmp_file->f_flags & O_APPEND)) {
85 error = XFS_ERROR(EBADF); 79 error = XFS_ERROR(EBADF);
86 goto out_put_tmp_file; 80 goto out_put_tmp_file;
@@ -422,11 +416,8 @@ xfs_swap_extents(
422 } 416 }
423 417
424 418
425 IHOLD(ip); 419 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
426 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 420 xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
427
428 IHOLD(tip);
429 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430 421
431 xfs_trans_log_inode(tp, ip, ilf_fields); 422 xfs_trans_log_inode(tp, ip, ilf_fields);
432 xfs_trans_log_inode(tp, tip, tilf_fields); 423 xfs_trans_log_inode(tp, tip, tilf_fields);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 42520f041265..a1321bc7f192 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -25,13 +25,11 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 31#include "xfs_alloc_btree.h"
33#include "xfs_dir2_sf.h" 32#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 33#include "xfs_dinode.h"
36#include "xfs_inode.h" 34#include "xfs_inode.h"
37#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
@@ -382,7 +380,7 @@ xfs_readdir(
382 int rval; /* return value */ 380 int rval; /* return value */
383 int v; /* type-checking value */ 381 int v; /* type-checking value */
384 382
385 xfs_itrace_entry(dp); 383 trace_xfs_readdir(dp);
386 384
387 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 385 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
388 return XFS_ERROR(EIO); 386 return XFS_ERROR(EIO);
@@ -549,7 +547,7 @@ xfs_dir2_grow_inode(
549 if ((error = xfs_bmapi(tp, dp, bno, count, 547 if ((error = xfs_bmapi(tp, dp, bno, count,
550 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, 548 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
551 args->firstblock, args->total, &map, &nmap, 549 args->firstblock, args->total, &map, &nmap,
552 args->flist, NULL))) 550 args->flist)))
553 return error; 551 return error;
554 ASSERT(nmap <= 1); 552 ASSERT(nmap <= 1);
555 if (nmap == 1) { 553 if (nmap == 1) {
@@ -581,8 +579,7 @@ xfs_dir2_grow_inode(
581 if ((error = xfs_bmapi(tp, dp, b, c, 579 if ((error = xfs_bmapi(tp, dp, b, c,
582 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, 580 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
583 args->firstblock, args->total, 581 args->firstblock, args->total,
584 &mapp[mapi], &nmap, args->flist, 582 &mapp[mapi], &nmap, args->flist))) {
585 NULL))) {
586 kmem_free(mapp); 583 kmem_free(mapp);
587 return error; 584 return error;
588 } 585 }
@@ -715,7 +712,7 @@ xfs_dir2_shrink_inode(
715 */ 712 */
716 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, 713 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
717 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, 714 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
718 NULL, &done))) { 715 &done))) {
719 /* 716 /*
720 * ENOSPC actually can happen if we're in a removename with 717 * ENOSPC actually can happen if we're in a removename with
721 * no space reservation, and the resulting block removal 718 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 779a267b0a84..580d99cef9e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
@@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block(
1073 */ 1071 */
1074 1072
1075 buf_len = dp->i_df.if_bytes; 1073 buf_len = dp->i_df.if_bytes;
1076 buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); 1074 buf = kmem_alloc(buf_len, KM_SLEEP);
1077 1075
1078 memcpy(buf, sfp, dp->i_df.if_bytes); 1076 memcpy(buf, sfp, buf_len);
1079 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); 1077 xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
1080 dp->i_d.di_size = 0; 1078 dp->i_d.di_size = 0;
1081 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1079 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1082 /* 1080 /*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 498f8d694330..921595b84f5b 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_dir2_data.h" 33#include "xfs_dir2_data.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index e2d89854ec9e..504be8640e91 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -25,11 +25,9 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_da_btree.h" 29#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dir2_sf.h" 31#include "xfs_dir2_sf.h"
34#include "xfs_dinode.h" 32#include "xfs_dinode.h"
35#include "xfs_inode.h" 33#include "xfs_inode.h"
@@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents(
875 xfs_dir2_byte_to_da(mp, 873 xfs_dir2_byte_to_da(mp,
876 XFS_DIR2_LEAF_OFFSET) - map_off, 874 XFS_DIR2_LEAF_OFFSET) - map_off,
877 XFS_BMAPI_METADATA, NULL, 0, 875 XFS_BMAPI_METADATA, NULL, 0,
878 &map[map_valid], &nmap, NULL, NULL); 876 &map[map_valid], &nmap, NULL);
879 /* 877 /*
880 * Don't know if we should ignore this or 878 * Don't know if we should ignore this or
881 * try to return an error. 879 * try to return an error.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 78fc4d9ae756..f9a0864b696a 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_bmap.h" 33#include "xfs_bmap.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index c1a5945d463a..b1bae6b1eed9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -24,12 +24,10 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h" 30#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 31#include "xfs_dinode.h"
34#include "xfs_inode.h" 32#include "xfs_inode.h"
35#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
deleted file mode 100644
index 2813cdd72375..000000000000
--- a/fs/xfs/xfs_dmapi.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DMAPI_H__
-#define __XFS_DMAPI_H__
-
-/* Values used to define the on-disk version of dm_attrname_t. All
- * on-disk attribute names start with the 8-byte string "SGI_DMI_".
- *
- * In the on-disk inode, DMAPI attribute names consist of the user-provided
- * name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
- * changed.
- */
-
-#define DMATTR_PREFIXLEN	8
-#define DMATTR_PREFIXSTRING	"SGI_DMI_"
-
-typedef enum {
-	DM_EVENT_INVALID	= -1,
-	DM_EVENT_CANCEL		= 0,		/* not supported */
-	DM_EVENT_MOUNT		= 1,
-	DM_EVENT_PREUNMOUNT	= 2,
-	DM_EVENT_UNMOUNT	= 3,
-	DM_EVENT_DEBUT		= 4,		/* not supported */
-	DM_EVENT_CREATE		= 5,
-	DM_EVENT_CLOSE		= 6,		/* not supported */
-	DM_EVENT_POSTCREATE	= 7,
-	DM_EVENT_REMOVE		= 8,
-	DM_EVENT_POSTREMOVE	= 9,
-	DM_EVENT_RENAME		= 10,
-	DM_EVENT_POSTRENAME	= 11,
-	DM_EVENT_LINK		= 12,
-	DM_EVENT_POSTLINK	= 13,
-	DM_EVENT_SYMLINK	= 14,
-	DM_EVENT_POSTSYMLINK	= 15,
-	DM_EVENT_READ		= 16,
-	DM_EVENT_WRITE		= 17,
-	DM_EVENT_TRUNCATE	= 18,
-	DM_EVENT_ATTRIBUTE	= 19,
-	DM_EVENT_DESTROY	= 20,
-	DM_EVENT_NOSPACE	= 21,
-	DM_EVENT_USER		= 22,
-	DM_EVENT_MAX		= 23
-} dm_eventtype_t;
-#define HAVE_DM_EVENTTYPE_T
-
-typedef enum {
-	DM_RIGHT_NULL,
-	DM_RIGHT_SHARED,
-	DM_RIGHT_EXCL
-} dm_right_t;
-#define HAVE_DM_RIGHT_T
-
-/* Defines for determining if an event message should be sent. */
-#ifdef HAVE_DMAPI
-#define	DM_EVENT_ENABLED(ip, event) ( \
-	unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \
-		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
-		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
-	)
-#else
-#define DM_EVENT_ENABLED(ip, event)	(0)
-#endif
-
-#define DM_XFS_VALID_FS_EVENTS		( \
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_DEBUT) |		\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a regular file or a symlink.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_FILE_EVENTS	( \
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events valid in dm_set_eventlist() when called with a file handle for
-   a directory.  These events are persistent.
-*/
-
-#define	DM_XFS_VALID_DIRECTORY_EVENTS	( \
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-/* Events supported by the XFS filesystem. */
-#define	DM_XFS_SUPPORTED_EVENTS		( \
-	(1 << DM_EVENT_MOUNT) |		\
-	(1 << DM_EVENT_PREUNMOUNT) |	\
-	(1 << DM_EVENT_UNMOUNT) |	\
-	(1 << DM_EVENT_NOSPACE) |	\
-	(1 << DM_EVENT_CREATE) |	\
-	(1 << DM_EVENT_POSTCREATE) |	\
-	(1 << DM_EVENT_REMOVE) |	\
-	(1 << DM_EVENT_POSTREMOVE) |	\
-	(1 << DM_EVENT_RENAME) |	\
-	(1 << DM_EVENT_POSTRENAME) |	\
-	(1 << DM_EVENT_LINK) |		\
-	(1 << DM_EVENT_POSTLINK) |	\
-	(1 << DM_EVENT_SYMLINK) |	\
-	(1 << DM_EVENT_POSTSYMLINK) |	\
-	(1 << DM_EVENT_READ) |		\
-	(1 << DM_EVENT_WRITE) |		\
-	(1 << DM_EVENT_TRUNCATE) |	\
-	(1 << DM_EVENT_ATTRIBUTE) |	\
-	(1 << DM_EVENT_DESTROY) )
-
-
-/*
- * Definitions used for the flags field on dm_send_*_event().
- */
-
-#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
-#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
-#define DM_FLAGS_IMUX		0x004	/* thread holds i_mutex */
-#define DM_FLAGS_IALLOCSEM_RD	0x010	/* thread holds i_alloc_sem rd */
-#define DM_FLAGS_IALLOCSEM_WR	0x020	/* thread holds i_alloc_sem wr */
-
-/*
- * Pull in platform specific event flags defines
- */
-#include "xfs_dmapi_priv.h"
-
-/*
- * Macros to turn caller specified delay/block flags into
- * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
- */
-
-#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
-		DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
-
-#endif  /* __XFS_DMAPI_H__ */
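
The deleted DM_EVENT_ENABLED() macro above reduces to a pair of bitmask tests: an event fires if its bit is set in either the per-inode or the per-mount event mask. A minimal userspace sketch of that test follows; the struct names are illustrative stand-ins, not the kernel's types:

    /*
     * Sketch of the event-mask test the removed DM_EVENT_ENABLED() macro
     * performed: an event is enabled if its bit is set in either the
     * per-inode or the per-mount event mask.
     */
    #include <stdio.h>

    enum dm_event { DM_EVENT_CREATE = 5, DM_EVENT_DESTROY = 20 };

    struct demo_mount { unsigned int dmevmask; };
    struct demo_inode { unsigned int dmevmask; struct demo_mount *mount; };

    static int event_enabled(const struct demo_inode *ip, enum dm_event ev)
    {
        return (ip->dmevmask & (1u << ev)) ||
               (ip->mount->dmevmask & (1u << ev));
    }

    int main(void)
    {
        struct demo_mount m = { .dmevmask = 1u << DM_EVENT_DESTROY };
        struct demo_inode i = { .dmevmask = 0, .mount = &m };

        printf("CREATE enabled:  %d\n", event_enabled(&i, DM_EVENT_CREATE));
        printf("DESTROY enabled: %d\n", event_enabled(&i, DM_EVENT_DESTROY));
        return 0;
    }
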
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
deleted file mode 100644
index e71e2581c0c3..000000000000
--- a/fs/xfs/xfs_dmops.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-static struct xfs_dmops xfs_dmcore_stub = {
-	.xfs_send_data		= (xfs_send_data_t)fs_nosys,
-	.xfs_send_mmap		= (xfs_send_mmap_t)fs_noerr,
-	.xfs_send_destroy	= (xfs_send_destroy_t)fs_nosys,
-	.xfs_send_namesp	= (xfs_send_namesp_t)fs_nosys,
-	.xfs_send_mount		= (xfs_send_mount_t)fs_nosys,
-	.xfs_send_unmount	= (xfs_send_unmount_t)fs_noerr,
-};
-
-int
-xfs_dmops_get(struct xfs_mount *mp)
-{
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		cmn_err(CE_WARN,
-			"XFS: dmapi support not available in this kernel.");
-		return EINVAL;
-	}
-
-	mp->m_dm_ops = &xfs_dmcore_stub;
-	return 0;
-}
-
-void
-xfs_dmops_put(struct xfs_mount *mp)
-{
-}
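
The deleted xfs_dmops.c shows the stub ops-vector pattern: every operation points at a generic "not implemented" or "silent success" handler, so callers can invoke the table unconditionally. A self-contained userspace sketch of the same pattern, with illustrative names:

    /*
     * Sketch of the stub ops-vector pattern: hard operations fail with
     * ENOSYS, operations that must never fail are wired to a no-op.
     */
    #include <errno.h>
    #include <stdio.h>

    typedef int (*dm_send_op_t)(void *obj);

    static int op_nosys(void *obj) { (void)obj; return ENOSYS; }
    static int op_noerr(void *obj) { (void)obj; return 0; }

    struct demo_dmops {
        dm_send_op_t send_data;
        dm_send_op_t send_mount;
        dm_send_op_t send_unmount;
    };

    static const struct demo_dmops demo_stub = {
        .send_data    = op_nosys,   /* data events need real DMAPI support */
        .send_mount   = op_nosys,
        .send_unmount = op_noerr,   /* unmount must not fail */
    };

    int main(void)
    {
        printf("send_data -> %d\n", demo_stub.send_data(NULL));
        printf("send_unmount -> %d\n", demo_stub.send_unmount(NULL));
        return 0;
    }
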
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..ed9990267661 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -23,12 +23,8 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_utils.h"
@@ -170,7 +166,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 	va_list ap;
 
 #ifdef DEBUG
-	xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+	xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
 #endif
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +182,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
 
 void
 xfs_error_report(
-	char		*tag,
+	const char	*tag,
 	int		level,
-	xfs_mount_t	*mp,
-	char		*fname,
+	struct xfs_mount *mp,
+	const char	*filename,
 	int		linenum,
 	inst_t		*ra)
 {
 	if (level <= xfs_error_level) {
 		xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
 			CE_ALERT, mp,
 	"XFS internal error %s at line %d of file %s.  Caller 0x%p\n",
-			tag, linenum, fname, ra);
+			tag, linenum, filename, ra);
 
 		xfs_stack_trace();
 	}
@@ -205,15 +201,15 @@ xfs_error_report(
 
 void
 xfs_corruption_error(
-	char		*tag,
+	const char	*tag,
 	int		level,
-	xfs_mount_t	*mp,
+	struct xfs_mount *mp,
 	void		*p,
-	char		*fname,
+	const char	*filename,
 	int		linenum,
 	inst_t		*ra)
 {
 	if (level <= xfs_error_level)
 		xfs_hex_dump(p, 16);
-	xfs_error_report(tag, level, mp, fname, linenum, ra);
+	xfs_error_report(tag, level, mp, filename, linenum, ra);
 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
 
 struct xfs_mount;
 
-extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp,
-			char *fname, int linenum, inst_t *ra);
-extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
-			void *p, char *fname, int linenum, inst_t *ra);
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+			const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+			struct xfs_mount *mp, void *p, const char *filename,
+			int linenum, inst_t *ra);
 
 #define XFS_ERROR_REPORT(e, lvl, mp)	\
 	xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
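
The constification above works because XFS_ERROR_REPORT() always passes string literals: the macro expands __FILE__ and __LINE__ at the call site, so the report function receives the caller's location for free. A compilable sketch of the same trick, with demo_error_report() as a stand-in for the kernel's xfs_error_report():

    /*
     * Sketch of call-site capture via a macro: __FILE__ and __LINE__
     * expand where the macro is used, not where the function is defined.
     */
    #include <stdio.h>

    static void demo_error_report(const char *tag, const char *filename,
                                  int linenum)
    {
        fprintf(stderr, "internal error %s at line %d of file %s\n",
                tag, linenum, filename);
    }

    #define DEMO_ERROR_REPORT(tag) \
        demo_error_report((tag), __FILE__, __LINE__)

    int main(void)
    {
        DEMO_ERROR_REPORT("example");   /* reports this exact line */
        return 0;
    }
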
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..a55e687bf562 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -24,7 +24,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
@@ -33,18 +32,19 @@
 kmem_zone_t	*xfs_efi_zone;
 kmem_zone_t	*xfs_efd_zone;
 
-STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
 
 void
-xfs_efi_item_free(xfs_efi_log_item_t *efip)
+xfs_efi_item_free(
+	struct xfs_efi_log_item	*efip)
 {
-	int nexts = efip->efi_format.efi_nextents;
-
-	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
-	} else {
+	else
 		kmem_zone_free(xfs_efi_zone, efip);
-	}
 }
 
 /*
@@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
  * structure.
  */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_size(xfs_efi_log_item_t *efip)
+xfs_efi_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip)
  * slots in the efi item have been filled.
 */
 STATIC void
-xfs_efi_item_format(xfs_efi_log_item_t *efip,
-		    xfs_log_iovec_t *log_vector)
+xfs_efi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	uint			size;
 
 	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
 
@@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
 	efip->efi_format.efi_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_addr = &efip->efi_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
@@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip,
 /*
  * Pinning has no meaning for an efi item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+xfs_efi_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * While EFIs cannot really be pinned, the unpin operation is the
  * last place at which the EFI is manipulated during a transaction.
  * Here we coordinate with xfs_efi_cancel() to determine who gets to
  * free the EFI.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
-{
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-
-	spin_lock(&ailp->xa_lock);
-	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
-		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_COMMITTED;
-		spin_unlock(&ailp->xa_lock);
-	}
-}
-
-/*
- * like unpin only we have to also clear the xaction descriptor
- * pointing the log item if we free the item.  This routine duplicates
- * unpin because efi_flags is protected by the AIL lock.  Freeing
- * the descriptor and then calling unpin would force us to drop the AIL
- * lock which would open up a race condition.
- */
 STATIC void
-xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+xfs_efi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+	struct xfs_ail		*ailp = lip->li_ailp;
 
 	spin_lock(&ailp->xa_lock);
 	if (efip->efi_flags & XFS_EFI_CANCELED) {
-		/*
-		 * free the xaction descriptor pointing to this item
-		 */
-		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
-		xfs_trans_free_item(tp, lidp);
+		if (remove)
+			xfs_trans_del_item(lip);
 
 		/* xfs_trans_ail_delete() drops the AIL lock. */
-		xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
+		xfs_trans_ail_delete(ailp, lip);
 		xfs_efi_item_free(efip);
 	} else {
 		efip->efi_flags |= XFS_EFI_COMMITTED;
@@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
  * XFS_ITEM_PINNED so that the caller will eventually flush the log.
  * This should help in getting the EFI out of the AIL.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+xfs_efi_item_trylock(
+	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_PINNED;
 }
@@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
 /*
  * Efi items have no locking, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+xfs_efi_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
-		xfs_efi_item_free(efip);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_free(EFI_ITEM(lip));
 }
 
 /*
@@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
  * flag is not paid any attention here.  Checking for that is delayed
  * until the EFI is unpinned.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
 	return lsn;
 }
@@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
  * stuck waiting for all of its corresponding efd items to be
 * committed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_push(xfs_efi_log_item_t *efip)
+xfs_efi_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -209,64 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efi_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efi log items.
 */
 static struct xfs_item_ops xfs_efi_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efi_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
-					xfs_efi_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efi_item_committing
+	.iop_size	= xfs_efi_item_size,
+	.iop_format	= xfs_efi_item_format,
+	.iop_pin	= xfs_efi_item_pin,
+	.iop_unpin	= xfs_efi_item_unpin,
+	.iop_trylock	= xfs_efi_item_trylock,
+	.iop_unlock	= xfs_efi_item_unlock,
+	.iop_committed	= xfs_efi_item_committed,
+	.iop_push	= xfs_efi_item_push,
+	.iop_committing = xfs_efi_item_committing
 };
 
 
 /*
  * Allocate and initialize an efi item with the given number of extents.
 */
-xfs_efi_log_item_t *
-xfs_efi_init(xfs_mount_t	*mp,
-	     uint		nextents)
+struct xfs_efi_log_item *
+xfs_efi_init(
+	struct xfs_mount	*mp,
+	uint			nextents)
 
 {
-	xfs_efi_log_item_t	*efip;
+	struct xfs_efi_log_item	*efip;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efi_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efip = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
-							KM_SLEEP);
+		efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
 	}
 
-	efip->efi_item.li_type = XFS_LI_EFI;
-	efip->efi_item.li_ops = &xfs_efi_item_ops;
-	efip->efi_item.li_mountp = mp;
-	efip->efi_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
-	return (efip);
+	return efip;
 }
 
 /*
@@ -279,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp,
 int
 xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 {
-	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
 	uint i;
 	uint len = sizeof(xfs_efi_log_format_t) +
 		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
@@ -292,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
 		return 0;
 	} else if (buf->i_len == len32) {
-		xfs_efi_log_format_32_t *src_efi_fmt_32 =
-			(xfs_efi_log_format_32_t *)buf->i_addr;
+		xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -307,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	} else if (buf->i_len == len64) {
-		xfs_efi_log_format_64_t *src_efi_fmt_64 =
-			(xfs_efi_log_format_64_t *)buf->i_addr;
+		xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
 
 		dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
 		dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -359,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 	}
 }
 
-STATIC void
-xfs_efd_item_free(xfs_efd_log_item_t *efdp)
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 {
-	int nexts = efdp->efd_format.efd_nextents;
+	return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
 
-	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
-	} else {
+	else
 		kmem_zone_free(xfs_efd_zone, efdp);
-	}
 }
 
 /*
@@ -376,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
  * We only need 1 iovec for an efd item.  It just logs the efd_log_format
  * structure.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+xfs_efd_item_size(
+	struct xfs_log_item	*lip)
 {
 	return 1;
 }
@@ -391,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp)
  * slots in the efd item have been filled.
 */
 STATIC void
-xfs_efd_item_format(xfs_efd_log_item_t *efdp,
-		    xfs_log_iovec_t *log_vector)
+xfs_efd_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
 {
-	uint	size;
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	uint			size;
 
 	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
@@ -404,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp,
 	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
 	efdp->efd_format.efd_size = 1;
 
-	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_addr = &efdp->efd_format;
 	log_vector->i_len = size;
 	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
-
 /*
  * Pinning has no meaning for an efd item, so just return.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+xfs_efd_item_pin(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
-
 /*
  * Since pinning has no meaning for an efd item, unpinning does
  * not either.
 */
-/*ARGSUSED*/
-STATIC void
-xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
-{
-	return;
-}
-
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+xfs_efd_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
 {
-	return;
 }
 
 /*
  * Efd items have no locking, so just return success.
 */
-/*ARGSUSED*/
 STATIC uint
-xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_trylock(
	struct xfs_log_item	*lip)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -454,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
  * Efd items have no locking or pushing, so return failure
  * so that the caller doesn't bother with us.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+xfs_efd_item_unlock(
+	struct xfs_log_item	*lip)
 {
-	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
-		xfs_efd_item_free(efdp);
-	return;
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_free(EFD_ITEM(lip));
 }
 
 /*
@@ -470,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
  * return -1 to keep the transaction code from further referencing
  * this item.
 */
-/*ARGSUSED*/
 STATIC xfs_lsn_t
-xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+xfs_efd_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
+	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+
 	/*
 	 * If we got a log I/O error, it's always the case that the LR with the
 	 * EFI got unpinned and freed before the EFD got aborted.
 	 */
-	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+	if (!(lip->li_flags & XFS_LI_ABORTED))
 		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
 
 	xfs_efd_item_free(efdp);
@@ -489,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
  * There isn't much you can do to push on an efd item.  It is simply
  * stuck waiting for the log to be flushed to disk.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+xfs_efd_item_push(
+	struct xfs_log_item	*lip)
 {
-	return;
 }
 
 /*
@@ -503,64 +461,54 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp)
  * example, for inodes, the inode is locked throughout the extent freeing
  * so the dependency should be recorded there.
 */
-/*ARGSUSED*/
 STATIC void
-xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+xfs_efd_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
 {
-	return;
 }
 
 /*
  * This is the ops vector shared by all efd log items.
 */
 static struct xfs_item_ops xfs_efd_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_efd_item_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_efd_item_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_efd_item_committing
+	.iop_size	= xfs_efd_item_size,
+	.iop_format	= xfs_efd_item_format,
+	.iop_pin	= xfs_efd_item_pin,
+	.iop_unpin	= xfs_efd_item_unpin,
+	.iop_trylock	= xfs_efd_item_trylock,
+	.iop_unlock	= xfs_efd_item_unlock,
+	.iop_committed	= xfs_efd_item_committed,
+	.iop_push	= xfs_efd_item_push,
+	.iop_committing = xfs_efd_item_committing
 };
 
 
 /*
  * Allocate and initialize an efd item with the given number of extents.
 */
-xfs_efd_log_item_t *
-xfs_efd_init(xfs_mount_t	*mp,
-	     xfs_efi_log_item_t	*efip,
-	     uint		nextents)
+struct xfs_efd_log_item *
+xfs_efd_init(
+	struct xfs_mount	*mp,
+	struct xfs_efi_log_item	*efip,
+	uint			nextents)
 
 {
-	xfs_efd_log_item_t	*efdp;
+	struct xfs_efd_log_item	*efdp;
 	uint			size;
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
 		size = (uint)(sizeof(xfs_efd_log_item_t) +
 			((nextents - 1) * sizeof(xfs_extent_t)));
-		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+		efdp = kmem_zalloc(size, KM_SLEEP);
 	} else {
-		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
-				KM_SLEEP);
+		efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
 	}
 
-	efdp->efd_item.li_type = XFS_LI_EFD;
-	efdp->efd_item.li_ops = &xfs_efd_item_ops;
-	efdp->efd_item.li_mountp = mp;
-	efdp->efd_item.li_ailp = mp->m_ail;
+	xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
 	efdp->efd_efip = efip;
 	efdp->efd_format.efd_nextents = nextents;
 	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
 
-	return (efdp);
+	return efdp;
 }
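
The EFI/EFD rework above replaces function-pointer casts in the ops vector with callbacks that take the generic struct xfs_log_item and recover the containing item via container_of(). A self-contained userspace sketch of that pattern, with illustrative type names:

    /*
     * Sketch of the container_of() pattern: embed the generic item in the
     * specific one, and recover the outer struct from the embedded pointer
     * instead of casting the callback's function type.
     */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct log_item { int li_type; };

    struct efi_log_item {
        struct log_item efi_item;      /* embedded generic item */
        unsigned int    efi_nextents;
    };

    static struct efi_log_item *EFI_ITEM(struct log_item *lip)
    {
        return container_of(lip, struct efi_log_item, efi_item);
    }

    /* generic signature, as an ops vector would require */
    static unsigned int item_size(struct log_item *lip)
    {
        return EFI_ITEM(lip)->efi_nextents;
    }

    int main(void)
    {
        struct efi_log_item efi = { .efi_item = { .li_type = 1 },
                                    .efi_nextents = 4 };
        printf("nextents via generic pointer: %u\n",
               item_size(&efi.efi_item));
        return 0;
    }
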
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 390850ee6603..9b715dce5699 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,13 +18,9 @@
 #include "xfs.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_dmapi.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -127,6 +123,82 @@ typedef struct fstrm_item
 	xfs_inode_t	*pip;	/* Parent directory inode pointer. */
 } fstrm_item_t;
 
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all.  The race condition this addresses is the following:
+ *
+ *  - The work queue scheduler fires and pulls a filestream directory cache
+ *    element off the LRU end of the cache for deletion, then gets pre-empted.
+ *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ *    remaining items from the cache and reallocates the mount point's per-ag
+ *    array, resetting all the counters to zero.
+ *  - The work queue thread resumes and calls the free function for the element
+ *    it started cleaning up earlier.  In the process it decrements the
+ *    filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
 
 /*
  * Scan the AGs starting at startag looking for an AG that isn't in use and has
@@ -355,16 +427,14 @@ xfs_fstrm_free_func(
 {
 	fstrm_item_t	*item = (fstrm_item_t *)data;
 	xfs_inode_t	*ip = item->ip;
-	int	ref;
 
 	ASSERT(ip->i_ino == ino);
 
 	xfs_iflags_clear(ip, XFS_IFILESTREAM);
 
 	/* Drop the reference taken on the AG when the item was added. */
-	ref = xfs_filestream_put_ag(ip->i_mount, item->ag);
+	xfs_filestream_put_ag(ip->i_mount, item->ag);
 
-	ASSERT(ref >= 0);
 	TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 		xfs_filestream_peek_ag(ip->i_mount, item->ag));
 
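
The counters moved into xfs_filestream.c above follow a small peek/get/put discipline over per-AG atomics. A compilable userspace sketch of the same shape, using C11 atomics in place of the kernel's atomic_t (array size and names are illustrative):

    /*
     * Sketch of per-AG filestream reference counting: one atomic counter
     * per allocation group, with peek/get/put mirroring
     * xfs_filestream_{peek,get,put}_ag().
     */
    #include <stdatomic.h>
    #include <stdio.h>

    #define NUM_AGS 4

    static atomic_int ag_fstrms[NUM_AGS];   /* one counter per AG */

    static int peek_ag(int agno) { return atomic_load(&ag_fstrms[agno]); }
    static int get_ag(int agno)
    {
        /* fetch_add returns the old value; +1 gives the new count */
        return atomic_fetch_add(&ag_fstrms[agno], 1) + 1;
    }
    static void put_ag(int agno) { atomic_fetch_sub(&ag_fstrms[agno], 1); }

    int main(void)
    {
        get_ag(2);      /* associate two streams with AG 2 */
        get_ag(2);
        put_ag(2);      /* drop one association */
        printf("AG 2 active filestreams: %d\n", peek_ag(2));
        return 0;
    }
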
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 260f757bbc5d..09dd9af45434 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf;
 
 #endif
 
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all.  The race condition this addresses is the following:
- *
- *  - The work queue scheduler fires and pulls a filestream directory cache
- *    element off the LRU end of the cache for deletion, then gets pre-empted.
- *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
- *    remaining items from the cache and reallocates the mount point's per-ag
- *    array, resetting all the counters to zero.
- *  - The work queue thread resumes and calls the free function for the element
- *    it started cleaning up earlier.  In the process it decrements the
- *    filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
- */
-/*
- * xfs_filestream_peek_ag is only used in tracing code
- */
-static inline int
-xfs_filestream_peek_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_read(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_get_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_inc_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
-static inline int
-xfs_filestream_put_ag(
-	xfs_mount_t	*mp,
-	xfs_agnumber_t	agno)
-{
-	struct xfs_perag *pag;
-	int		ret;
-
-	pag = xfs_perag_get(mp, agno);
-	ret = atomic_dec_return(&pag->pagf_fstrms);
-	xfs_perag_put(pag);
-	return ret;
-}
-
 /* allocation selection flags */
 typedef enum xfs_fstrm_alloc {
 	XFS_PICK_USERDATA = 1,
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event */
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
 #define BMV_IF_VALID	\
-	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 
 /* bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
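
A BMV_IF_VALID-style mask rejects any input word that carries a bit outside the known set; extending the interface is just adding the new bit to the mask, as the hunk above does for BMV_IF_NO_HOLES. A compilable sketch of the validation check itself:

    /*
     * Sketch of flags validation against a valid-bits mask: any bit
     * outside the mask fails. Values mirror the header above.
     */
    #include <stdio.h>

    #define BMV_IF_ATTRFORK       0x1
    #define BMV_IF_NO_DMAPI_READ  0x2
    #define BMV_IF_PREALLOC       0x4
    #define BMV_IF_DELALLOC       0x8
    #define BMV_IF_NO_HOLES       0x10
    #define BMV_IF_VALID \
        (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
         BMV_IF_DELALLOC|BMV_IF_NO_HOLES)

    static int bmv_flags_valid(unsigned int flags)
    {
        return (flags & ~BMV_IF_VALID) == 0;
    }

    int main(void)
    {
        printf("0x18 valid: %d\n", bmv_flags_valid(0x18)); /* DELALLOC|NO_HOLES */
        printf("0x40 valid: %d\n", bmv_flags_valid(0x40)); /* unknown bit */
        return 0;
    }
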
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 37a6f62c57b6..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -608,32 +604,36 @@ out:
 	return 0;
 }
 
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
+	xfs_mount_t	*mp,
+	int		flags)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
 
-	ip = mp->m_rootip;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	/* log the UUID because it is an unchanging field */
+	xfs_mod_sb(tp, XFS_SB_UUID);
+	if (flags & SYNC_WAIT)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
 
 int
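
The rework of xfs_fs_log_dummy() above keeps only the control flow that matters: dirty the log by logging a field whose value never changes, and make the commit synchronous only when the caller passed SYNC_WAIT. A minimal userspace sketch of that flow; every function below is an illustrative stand-in, not the kernel's API:

    /*
     * Sketch of the dummy-transaction flow: what is logged is irrelevant,
     * only that the transaction is dirty; sync is opt-in per caller.
     */
    #include <stdio.h>

    #define SYNC_WAIT 0x1

    struct demo_trans { int sync; int dirty; };

    static void demo_log_unchanging_field(struct demo_trans *tp)
    {
        tp->dirty = 1;  /* dirtiness is the point, not the value */
    }

    static int demo_commit(struct demo_trans *tp)
    {
        printf("commit: dirty=%d sync=%d\n", tp->dirty, tp->sync);
        return 0;
    }

    static int demo_log_dummy(int flags)
    {
        struct demo_trans tp = { 0, 0 };

        demo_log_unchanging_field(&tp);
        if (flags & SYNC_WAIT)
            tp.sync = 1;    /* wait for the log write to complete */
        return demo_commit(&tp);
    }

    int main(void)
    {
        demo_log_dummy(0);
        demo_log_dummy(SYNC_WAIT);
        return 0;
    }
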
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 9d884c127bb9..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
@@ -1203,6 +1199,67 @@ error0:
 	return error;
 }
 
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+				"xfs_ialloc_read_agi() returned "
+				"error %d, agno %d",
+				error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+		return EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
 /*
  * Return the location of the inode in imap, for mapping it into a buffer.
 */
@@ -1235,8 +1292,11 @@ xfs_imap(
 	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
 	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
-		/* no diagnostics for bulkstat, ino comes from userspace */
-		if (flags & XFS_IGET_BULKSTAT)
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
 			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
@@ -1263,6 +1323,23 @@ xfs_imap(
 		return XFS_ERROR(EINVAL);
 	}
 
+	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
 	/*
 	 * If the inode cluster size is the same as the blocksize or
 	 * smaller we get to the buffer by simple arithmetics.
@@ -1277,24 +1354,6 @@ xfs_imap(
 		return 0;
 	}
 
-	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
-
-	/*
-	 * If we get a block number passed from bulkstat we can use it to
-	 * find the buffer easily.
-	 */
-	if (imap->im_blkno) {
-		offset = XFS_INO_TO_OFFSET(mp, ino);
-		ASSERT(offset < mp->m_sb.sb_inopblock);
-
-		cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
-		offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
-
-		imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
-		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
-		return 0;
-	}
-
 	/*
	 * If the inode chunks are aligned then use simple maths to
 	 * find the location. Otherwise we have to do a btree
@@ -1304,50 +1363,13 @@ xfs_imap(
 		offset_agbno = agbno & mp->m_inoalign_mask;
 		chunk_agbno = agbno - offset_agbno;
 	} else {
-		xfs_btree_cur_t	*cur;	/* inode btree cursor */
-		xfs_inobt_rec_incore_t chunk_rec;
-		xfs_buf_t	*agbp;	/* agi buffer */
-		int		i;	/* temp state */
-
-		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_ialloc_read_agi() returned "
-					"error %d, agno %d",
-					error, agno);
-			return error;
-		}
-
-		cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_lookup() failed");
-			goto error0;
-		}
-
-		error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-			goto error0;
-		}
-		if (i == 0) {
-#ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-					"xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
-			error = XFS_ERROR(EINVAL);
-		}
- error0:
-		xfs_trans_brelse(tp, agbp);
-		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
 		if (error)
 			return error;
-		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
-		offset_agbno = agbno - chunk_agbno;
 	}
 
+out_map:
 	ASSERT(agbno >= chunk_agbno);
 	cluster_agbno = chunk_agbno +
 		((offset_agbno / blks_per_cluster) * blks_per_cluster);
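
The validation that xfs_imap_lookup() applies to an untrusted inode number comes down to two tests on the record found by the <= btree lookup: the chunk must actually contain the inode, and for untrusted callers the inode must not be marked free in the chunk's free mask. A compilable sketch of just those checks, with illustrative constants in place of XFS_IALLOC_INODES() and XFS_INOBT_MASK():

    /*
     * Sketch of the untrusted-inode range and free-mask checks from
     * xfs_imap_lookup(); -1 stands in for the kernel's EINVAL return.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define INODES_PER_CHUNK 64

    struct inobt_rec { uint32_t ir_startino; uint64_t ir_free; };

    static int imap_check(const struct inobt_rec *rec, uint32_t agino,
                          int untrusted)
    {
        if (rec->ir_startino > agino ||
            rec->ir_startino + INODES_PER_CHUNK <= agino)
            return -1;          /* inode not in this chunk */
        if (untrusted &&
            (rec->ir_free & (1ULL << (agino - rec->ir_startino))))
            return -1;          /* inode is unallocated */
        return 0;
    }

    int main(void)
    {
        struct inobt_rec rec = { .ir_startino = 128, .ir_free = 1ULL << 3 };

        printf("ino 130: %d\n", imap_check(&rec, 130, 1)); /* ok */
        printf("ino 131: %d\n", imap_check(&rec, 131, 1)); /* free */
        printf("ino 300: %d\n", imap_check(&rec, 300, 1)); /* out of range */
        return 0;
    }
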
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c282a9af5393..d352862cefa0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..b1ecc6f97ade 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -25,14 +25,10 @@
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_btree.h" 34#include "xfs_btree.h"
@@ -95,7 +91,7 @@ xfs_inode_alloc(
95 return ip; 91 return ip;
96} 92}
97 93
98STATIC void 94void
99xfs_inode_free( 95xfs_inode_free(
100 struct xfs_inode *ip) 96 struct xfs_inode *ip)
101{ 97{
@@ -212,7 +208,7 @@ xfs_iget_cache_hit(
212 ip->i_flags &= ~XFS_INEW; 208 ip->i_flags &= ~XFS_INEW;
213 ip->i_flags |= XFS_IRECLAIMABLE; 209 ip->i_flags |= XFS_IRECLAIMABLE;
214 __xfs_inode_set_reclaim_tag(pag, ip); 210 __xfs_inode_set_reclaim_tag(pag, ip);
215 trace_xfs_iget_reclaim(ip); 211 trace_xfs_iget_reclaim_fail(ip);
216 goto out_error; 212 goto out_error;
217 } 213 }
218 214
@@ -227,6 +223,7 @@ xfs_iget_cache_hit(
227 } else { 223 } else {
228 /* If the VFS inode is being torn down, pause and try again. */ 224 /* If the VFS inode is being torn down, pause and try again. */
229 if (!igrab(inode)) { 225 if (!igrab(inode)) {
226 trace_xfs_iget_skip(ip);
230 error = EAGAIN; 227 error = EAGAIN;
231 goto out_error; 228 goto out_error;
232 } 229 }
@@ -234,6 +231,7 @@ xfs_iget_cache_hit(
234 /* We've got a live one. */ 231 /* We've got a live one. */
235 spin_unlock(&ip->i_flags_lock); 232 spin_unlock(&ip->i_flags_lock);
236 read_unlock(&pag->pag_ici_lock); 233 read_unlock(&pag->pag_ici_lock);
234 trace_xfs_iget_hit(ip);
237 } 235 }
238 236
239 if (lock_flags != 0) 237 if (lock_flags != 0)
@@ -242,7 +240,6 @@ xfs_iget_cache_hit(
242 xfs_iflags_clear(ip, XFS_ISTALE); 240 xfs_iflags_clear(ip, XFS_ISTALE);
243 XFS_STATS_INC(xs_ig_found); 241 XFS_STATS_INC(xs_ig_found);
244 242
245 trace_xfs_iget_found(ip);
246 return 0; 243 return 0;
247 244
248out_error: 245out_error:
@@ -259,24 +256,22 @@ xfs_iget_cache_miss(
259 xfs_trans_t *tp, 256 xfs_trans_t *tp,
260 xfs_ino_t ino, 257 xfs_ino_t ino,
261 struct xfs_inode **ipp, 258 struct xfs_inode **ipp,
262 xfs_daddr_t bno,
263 int flags, 259 int flags,
264 int lock_flags) 260 int lock_flags)
265{ 261{
266 struct xfs_inode *ip; 262 struct xfs_inode *ip;
267 int error; 263 int error;
268 unsigned long first_index, mask;
269 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 264 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
270 265
271 ip = xfs_inode_alloc(mp, ino); 266 ip = xfs_inode_alloc(mp, ino);
272 if (!ip) 267 if (!ip)
273 return ENOMEM; 268 return ENOMEM;
274 269
275 error = xfs_iread(mp, tp, ip, bno, flags); 270 error = xfs_iread(mp, tp, ip, flags);
276 if (error) 271 if (error)
277 goto out_destroy; 272 goto out_destroy;
278 273
279 xfs_itrace_entry(ip); 274 trace_xfs_iget_miss(ip);
280 275
281 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 276 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
282 error = ENOENT; 277 error = ENOENT;
@@ -302,8 +297,6 @@ xfs_iget_cache_miss(
302 BUG(); 297 BUG();
303 } 298 }
304 299
305 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
306 first_index = agino & mask;
307 write_lock(&pag->pag_ici_lock); 300 write_lock(&pag->pag_ici_lock);
308 301
309 /* insert the new inode */ 302 /* insert the new inode */
@@ -322,7 +315,6 @@ xfs_iget_cache_miss(
322 write_unlock(&pag->pag_ici_lock); 315 write_unlock(&pag->pag_ici_lock);
323 radix_tree_preload_end(); 316 radix_tree_preload_end();
324 317
325 trace_xfs_iget_alloc(ip);
326 *ipp = ip; 318 *ipp = ip;
327 return 0; 319 return 0;
328 320
@@ -358,8 +350,6 @@ out_destroy:
358 * within the file system for the inode being requested. 350 * within the file system for the inode being requested.
359 * lock_flags -- flags indicating how to lock the inode. See the comment 351 * lock_flags -- flags indicating how to lock the inode. See the comment
360 * for xfs_ilock() for a list of valid values. 352 * for xfs_ilock() for a list of valid values.
361 * bno -- the block number starting the buffer containing the inode,
362 * if known (as by bulkstat), else 0.
363 */ 353 */
364int 354int
365xfs_iget( 355xfs_iget(
@@ -368,8 +358,7 @@ xfs_iget(
368 xfs_ino_t ino, 358 xfs_ino_t ino,
369 uint flags, 359 uint flags,
370 uint lock_flags, 360 uint lock_flags,
371 xfs_inode_t **ipp, 361 xfs_inode_t **ipp)
372 xfs_daddr_t bno)
373{ 362{
374 xfs_inode_t *ip; 363 xfs_inode_t *ip;
375 int error; 364 int error;
@@ -382,9 +371,6 @@ xfs_iget(
382 371
383 /* get the perag structure and ensure that it's inode capable */ 372 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 373 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 374 agino = XFS_INO_TO_AGINO(mp, ino);
389 375
390again: 376again:
@@ -400,7 +386,7 @@ again:
400 read_unlock(&pag->pag_ici_lock); 386 read_unlock(&pag->pag_ici_lock);
401 XFS_STATS_INC(xs_ig_missed); 387 XFS_STATS_INC(xs_ig_missed);
402 388
403 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno, 389 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
404 flags, lock_flags); 390 flags, lock_flags);
405 if (error) 391 if (error)
406 goto out_error_or_again; 392 goto out_error_or_again;
@@ -429,97 +415,6 @@ out_error_or_again:
429} 415}
430 416
431/* 417/*
432 * Decrement reference count of an inode structure and unlock it.
433 *
434 * ip -- the inode being released
435 * lock_flags -- this parameter indicates the inode's locks to be
436 * released. See the comment on xfs_iunlock() for a list
437 * of valid values.
438 */
439void
440xfs_iput(xfs_inode_t *ip,
441 uint lock_flags)
442{
443 xfs_itrace_entry(ip);
444 xfs_iunlock(ip, lock_flags);
445 IRELE(ip);
446}
447
448/*
449 * Special iput for brand-new inodes that are still locked
450 */
451void
452xfs_iput_new(
453 xfs_inode_t *ip,
454 uint lock_flags)
455{
456 struct inode *inode = VFS_I(ip);
457
458 xfs_itrace_entry(ip);
459
460 if ((ip->i_d.di_mode == 0)) {
461 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
462 make_bad_inode(inode);
463 }
464 if (inode->i_state & I_NEW)
465 unlock_new_inode(inode);
466 if (lock_flags)
467 xfs_iunlock(ip, lock_flags);
468 IRELE(ip);
469}
470
471/*
472 * This is called to free all the memory associated with an inode.
473 * It must free the inode itself and any buffers allocated for
474 * if_extents/if_data and if_broot. It must also free the lock
475 * associated with the inode.
476 *
477 * Note: because we don't initialise everything on reallocation out
478 * of the zone, we must ensure we nullify everything correctly before
479 * freeing the structure.
480 */
481void
482xfs_ireclaim(
483 struct xfs_inode *ip)
484{
485 struct xfs_mount *mp = ip->i_mount;
486 struct xfs_perag *pag;
487 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
488
489 XFS_STATS_INC(xs_ig_reclaims);
490
491 /*
492 * Remove the inode from the per-AG radix tree.
493 *
494 * Because radix_tree_delete won't complain even if the item was never
495 * added to the tree, assert that it's been there before to catch
496 * problems with the inode lifetime early on.
497 */
498 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
499 write_lock(&pag->pag_ici_lock);
500 if (!radix_tree_delete(&pag->pag_ici_root, agino))
501 ASSERT(0);
502 write_unlock(&pag->pag_ici_lock);
503 xfs_perag_put(pag);
504
505 /*
506 * Here we do an (almost) spurious inode lock in order to coordinate
507 * with inode cache radix tree lookups. This is because the lookup
508 * can reference the inodes in the cache without taking references.
509 *
510 * We make that OK here by ensuring that we wait until the inode is
511 * unlocked after the lookup before we go ahead and free it. We get
512 * both the ilock and the iolock because the code may need to drop the
513 * ilock but will still hold the iolock.
514 */
515 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
516 xfs_qm_dqdetach(ip);
517 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
518
519 xfs_inode_free(ip);
520}
521
522/*
523 * This is a wrapper routine around the xfs_ilock() routine 418 * This is a wrapper routine around the xfs_ilock() routine
524 * used to centralize some grungy code. It is used in places 419 * used to centralize some grungy code. It is used in places
525 * that wish to lock the inode solely for reading the extents. 420 * that wish to lock the inode solely for reading the extents.
@@ -744,30 +639,24 @@ xfs_ilock_demote(
744} 639}
745 640
746#ifdef DEBUG 641#ifdef DEBUG
747/*
748 * Debug-only routine: without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 642int
755xfs_isilocked( 643xfs_isilocked(
756 xfs_inode_t *ip, 644 xfs_inode_t *ip,
757 uint lock_flags) 645 uint lock_flags)
758{ 646{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 647 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 648 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 649 return !!ip->i_lock.mr_writer;
762 return 0; 650 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 651 }
764 652
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 653 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 654 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 655 return !!ip->i_iolock.mr_writer;
768 return 0; 656 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 657 }
770 658
771 return 1; 659 ASSERT(0);
660 return 0;
772} 661}
773#endif 662#endif
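Net effect of the fs/xfs/xfs_iget.c hunks above: xfs_iget() loses its trailing xfs_daddr_t bno argument (xfs_imap() now always locates the inode buffer on its own), the xfs_iput()/xfs_iput_new() wrappers are gone in favour of open-coded unlock-and-release, and xfs_isilocked() can now answer shared-lock queries through rwsem_is_locked() instead of tracking only writer state. A minimal before/after sketch for a hypothetical caller (mp, tp and ino are assumed to be in scope; error handling is abbreviated):

	struct xfs_inode	*ip;
	int			error;

	/* before: trailing bno argument, 0 unless bulkstat knew the block */
	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_SHARED, &ip, 0);

	/* after: no bno argument */
	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;

	/* before: xfs_iput(ip, XFS_ILOCK_SHARED); after: open-coded */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);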
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -27,13 +27,10 @@
27#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
28#include "xfs_sb.h" 28#include "xfs_sb.h"
29#include "xfs_ag.h" 29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h" 30#include "xfs_mount.h"
33#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
34#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
35#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
36#include "xfs_dir2_sf.h"
37#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
38#include "xfs_dinode.h" 35#include "xfs_dinode.h"
39#include "xfs_inode.h" 36#include "xfs_inode.h"
@@ -44,7 +41,6 @@
44#include "xfs_alloc.h" 41#include "xfs_alloc.h"
45#include "xfs_ialloc.h" 42#include "xfs_ialloc.h"
46#include "xfs_bmap.h" 43#include "xfs_bmap.h"
47#include "xfs_rw.h"
48#include "xfs_error.h" 44#include "xfs_error.h"
49#include "xfs_utils.h" 45#include "xfs_utils.h"
50#include "xfs_quota.h" 46#include "xfs_quota.h"
@@ -177,7 +173,7 @@ xfs_imap_to_bp(
177 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 173 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
178 XFS_ERRTAG_ITOBP_INOTOBP, 174 XFS_ERRTAG_ITOBP_INOTOBP,
179 XFS_RANDOM_ITOBP_INOTOBP))) { 175 XFS_RANDOM_ITOBP_INOTOBP))) {
180 if (iget_flags & XFS_IGET_BULKSTAT) { 176 if (iget_flags & XFS_IGET_UNTRUSTED) {
181 xfs_trans_brelse(tp, bp); 177 xfs_trans_brelse(tp, bp);
182 return XFS_ERROR(EINVAL); 178 return XFS_ERROR(EINVAL);
183 } 179 }
@@ -426,7 +422,7 @@ xfs_iformat(
426 if (!XFS_DFORK_Q(dip)) 422 if (!XFS_DFORK_Q(dip))
427 return 0; 423 return 0;
428 ASSERT(ip->i_afp == NULL); 424 ASSERT(ip->i_afp == NULL);
429 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 425 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
430 ip->i_afp->if_ext_max = 426 ip->i_afp->if_ext_max =
431 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 427 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
432 switch (dip->di_aformat) { 428 switch (dip->di_aformat) {
@@ -509,7 +505,7 @@ xfs_iformat_local(
509 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 505 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
510 else { 506 else {
511 real_size = roundup(size, 4); 507 real_size = roundup(size, 4);
512 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 508 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
513 } 509 }
514 ifp->if_bytes = size; 510 ifp->if_bytes = size;
515 ifp->if_real_bytes = real_size; 511 ifp->if_real_bytes = real_size;
@@ -636,7 +632,7 @@ xfs_iformat_btree(
636 } 632 }
637 633
638 ifp->if_broot_bytes = size; 634 ifp->if_broot_bytes = size;
639 ifp->if_broot = kmem_alloc(size, KM_SLEEP); 635 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
640 ASSERT(ifp->if_broot != NULL); 636 ASSERT(ifp->if_broot != NULL);
641 /* 637 /*
642 * Copy and convert from the on-disk structure 638 * Copy and convert from the on-disk structure
@@ -787,7 +783,6 @@ xfs_iread(
787 xfs_mount_t *mp, 783 xfs_mount_t *mp,
788 xfs_trans_t *tp, 784 xfs_trans_t *tp,
789 xfs_inode_t *ip, 785 xfs_inode_t *ip,
790 xfs_daddr_t bno,
791 uint iget_flags) 786 uint iget_flags)
792{ 787{
793 xfs_buf_t *bp; 788 xfs_buf_t *bp;
@@ -797,11 +792,9 @@ xfs_iread(
797 /* 792 /*
798 * Fill in the location information in the in-core inode. 793 * Fill in the location information in the in-core inode.
799 */ 794 */
800 ip->i_imap.im_blkno = bno;
801 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); 795 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
802 if (error) 796 if (error)
803 return error; 797 return error;
804 ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
805 798
806 /* 799 /*
807 * Get pointers to the on-disk inode and the buffer containing it. 800 * Get pointers to the on-disk inode and the buffer containing it.
@@ -925,7 +918,6 @@ xfs_iread_extents(
925 int error; 918 int error;
926 xfs_ifork_t *ifp; 919 xfs_ifork_t *ifp;
927 xfs_extnum_t nextents; 920 xfs_extnum_t nextents;
928 size_t size;
929 921
930 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 922 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
931 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 923 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
@@ -933,7 +925,6 @@ xfs_iread_extents(
933 return XFS_ERROR(EFSCORRUPTED); 925 return XFS_ERROR(EFSCORRUPTED);
934 } 926 }
935 nextents = XFS_IFORK_NEXTENTS(ip, whichfork); 927 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
936 size = nextents * sizeof(xfs_bmbt_rec_t);
937 ifp = XFS_IFORK_PTR(ip, whichfork); 928 ifp = XFS_IFORK_PTR(ip, whichfork);
938 929
939 /* 930 /*
@@ -1229,7 +1220,7 @@ xfs_isize_check(
1229 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1220 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
1230 map_first), 1221 map_first),
1231 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1222 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
1232 NULL, NULL)) 1223 NULL))
1233 return; 1224 return;
1234 ASSERT(nimaps == 1); 1225 ASSERT(nimaps == 1);
1235 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1226 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1463,7 +1454,7 @@ xfs_itruncate_finish(
1463 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1454 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1464 ASSERT(ip->i_transp == *tp); 1455 ASSERT(ip->i_transp == *tp);
1465 ASSERT(ip->i_itemp != NULL); 1456 ASSERT(ip->i_itemp != NULL);
1466 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); 1457 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1467 1458
1468 1459
1469 ntp = *tp; 1460 ntp = *tp;
@@ -1592,11 +1583,10 @@ xfs_itruncate_finish(
1592 xfs_bmap_init(&free_list, &first_block); 1583 xfs_bmap_init(&free_list, &first_block);
1593 error = xfs_bunmapi(ntp, ip, 1584 error = xfs_bunmapi(ntp, ip,
1594 first_unmap_block, unmap_len, 1585 first_unmap_block, unmap_len,
1595 xfs_bmapi_aflag(fork) | 1586 xfs_bmapi_aflag(fork),
1596 (sync ? 0 : XFS_BMAPI_ASYNC),
1597 XFS_ITRUNC_MAX_EXTENTS, 1587 XFS_ITRUNC_MAX_EXTENTS,
1598 &first_block, &free_list, 1588 &first_block, &free_list,
1599 NULL, &done); 1589 &done);
1600 if (error) { 1590 if (error) {
1601 /* 1591 /*
1602 * If the bunmapi call encounters an error, 1592 * If the bunmapi call encounters an error,
@@ -1615,12 +1605,8 @@ xfs_itruncate_finish(
1615 */ 1605 */
1616 error = xfs_bmap_finish(tp, &free_list, &committed); 1606 error = xfs_bmap_finish(tp, &free_list, &committed);
1617 ntp = *tp; 1607 ntp = *tp;
1618 if (committed) { 1608 if (committed)
1619 /* link the inode into the next xact in the chain */ 1609 xfs_trans_ijoin(ntp, ip);
1620 xfs_trans_ijoin(ntp, ip,
1621 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1622 xfs_trans_ihold(ntp, ip);
1623 }
1624 1610
1625 if (error) { 1611 if (error) {
1626 /* 1612 /*
@@ -1649,9 +1635,7 @@ xfs_itruncate_finish(
1649 error = xfs_trans_commit(*tp, 0); 1635 error = xfs_trans_commit(*tp, 0);
1650 *tp = ntp; 1636 *tp = ntp;
1651 1637
1652 /* link the inode into the next transaction in the chain */ 1638 xfs_trans_ijoin(ntp, ip);
1653 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1654 xfs_trans_ihold(ntp, ip);
1655 1639
1656 if (error) 1640 if (error)
1657 return error; 1641 return error;
@@ -1930,6 +1914,11 @@ xfs_iunlink_remove(
1930 return 0; 1914 return 0;
1931} 1915}
1932 1916
1917/*
1918 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1919 * inodes that are in memory - they all must be marked stale and attached to
1920 * the cluster buffer.
1921 */
1933STATIC void 1922STATIC void
1934xfs_ifree_cluster( 1923xfs_ifree_cluster(
1935 xfs_inode_t *free_ip, 1924 xfs_inode_t *free_ip,
@@ -1940,10 +1929,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1929 int blks_per_cluster;
1941 int nbufs; 1930 int nbufs;
1942 int ninodes; 1931 int ninodes;
1943 int i, j, found, pre_flushed; 1932 int i, j;
1944 xfs_daddr_t blkno; 1933 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1934 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1935 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1936 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1937 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1938 struct xfs_perag *pag;
@@ -1960,109 +1949,91 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1949 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1950 }
1962 1951
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1952 for (j = 0; j < nbufs; j++, inum += ninodes) {
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1953 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1954 XFS_INO_TO_AGBNO(mp, inum));
1968 1955
1956 /*
1957 * We obtain and lock the backing buffer first in the process
1958 * here, as we have to ensure that any dirty inode that we
1959 * can't get the flush lock on is attached to the buffer.
1960 * If we scan the in-memory inodes first, then buffer IO can
1961 * complete before we get a lock on it, and hence we may fail
1962 * to mark all the active inodes on the buffer stale.
1963 */
1964 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1965 mp->m_bsize * blks_per_cluster,
1966 XBF_LOCK);
1967
1968 /*
1969 * Walk the inodes already attached to the buffer and mark them
1970 * stale. These will all have the flush locks held, so an
1971 * in-memory inode walk can't lock them. By marking them all
1972 * stale first, we will not attempt to lock them in the loop
1973 * below as the XFS_ISTALE flag will be set.
1974 */
1975 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1976 while (lip) {
1977 if (lip->li_type == XFS_LI_INODE) {
1978 iip = (xfs_inode_log_item_t *)lip;
1979 ASSERT(iip->ili_logged == 1);
1980 lip->li_cb = xfs_istale_done;
1981 xfs_trans_ail_copy_lsn(mp->m_ail,
1982 &iip->ili_flush_lsn,
1983 &iip->ili_item.li_lsn);
1984 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1985 }
1986 lip = lip->li_bio_list;
1987 }
1988
1969 1989
1970 /* 1990 /*
1971 * Look for each inode in memory and attempt to lock it, 1991 * For each inode in memory attempt to add it to the inode
1972 * we can be racing with flush and tail pushing here. 1992 * buffer and set it up for being staled on buffer IO
1973 * Any inode we get the locks on, add to an array of 1993 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 1994 * and flushing by locking the buffer.
1975 * 1995 *
1976 * To get the buffer lock, we could beat a flush 1996 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 1997 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 1998 * even trying to lock them.
1979 * and fail, so we need some other form of interlock
1980 * here.
1981 */ 1999 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2000 for (i = 0; i < ninodes; i++) {
2001retry:
1984 read_lock(&pag->pag_ici_lock); 2002 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2003 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2004 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2005
1988 /* Inode not in memory or we found it already, 2006 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2007 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2008 read_unlock(&pag->pag_ici_lock);
1993 continue; 2009 continue;
1994 } 2010 }
1995 2011
1996 if (xfs_inode_clean(ip)) { 2012 /*
1997 read_unlock(&pag->pag_ici_lock); 2013 * Don't try to lock/unlock the current inode, but we
1998 continue; 2014 * _cannot_ skip the other inodes that we did not find
1999 } 2015 * in the list attached to the buffer and are not
2000 2016 * already marked stale. If we can't lock it, back off
2001 /* If we can get the locks then add it to the 2017 * and retry.
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, let's
2008 * keep it that way.
2009 */ 2018 */
2010 2019 if (ip != free_ip &&
2011 if (ip == free_ip) { 2020 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2021 read_unlock(&pag->pag_ici_lock);
2021 continue; 2022 delay(1);
2022 } 2023 goto retry;
2023
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2025 if (xfs_iflock_nowait(ip)) {
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 }
2037 } 2024 }
2038 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2026
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2027 xfs_iflock(ip);
2042 mp->m_bsize * blks_per_cluster, 2028 xfs_iflags_set(ip, XFS_ISTALE);
2043 XBF_LOCK);
2044 2029
2045 pre_flushed = 0; 2030 /*
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2031 * we don't need to attach clean inodes or those only
2047 while (lip) { 2032 * with unlogged changes (which we throw away, anyway).
2048 if (lip->li_type == XFS_LI_INODE) { 2033 */
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 }
2058 lip = lip->li_bio_list;
2059 }
2060
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2034 iip = ip->i_itemp;
2064 2035 if (!iip || xfs_inode_clean(ip)) {
2065 if (!iip) { 2036 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2037 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2038 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2039 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -2075,20 +2046,17 @@ xfs_ifree_cluster(
2075 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2046 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2076 &iip->ili_item.li_lsn); 2047 &iip->ili_item.li_lsn);
2077 2048
2078 xfs_buf_attach_iodone(bp, 2049 xfs_buf_attach_iodone(bp, xfs_istale_done,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2050 &iip->ili_item);
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2051
2081 if (ip != free_ip) { 2052 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2053 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2054 }
2085 2055
2086 if (found || pre_flushed) 2056 xfs_trans_stale_inode_buf(tp, bp);
2087 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2057 xfs_trans_binval(tp, bp);
2089 } 2058 }
2090 2059
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2060 xfs_perag_put(pag);
2093} 2061}
2094 2062
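The xfs_ifree_cluster() rework above fixes an ordering problem: the old code scanned the in-memory inodes before taking the cluster buffer lock, so buffer I/O could complete in that window and active inodes could escape being marked stale. The new ordering, as a condensed outline of the code above (not a drop-in function; steps 2 and 3 paraphrase the new comments rather than name real helpers):

	/* 1: lock the cluster buffer first to shut out flush/tail pushing */
	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
			       mp->m_bsize * blks_per_cluster, XBF_LOCK);

	/* 2: walk the log items already attached to bp and mark those
	 *    inodes XFS_ISTALE; they hold flush locks, so step 3 skips them */

	/* 3: for every other in-memory inode of the cluster: trylock with
	 *    xfs_ilock_nowait(), backing off via delay(1) and retrying on
	 *    failure, then xfs_iflock() and xfs_iflags_set(ip, XFS_ISTALE) */

	/* 4: finally stale and invalidate the buffer itself */
	xfs_trans_stale_inode_buf(tp, bp);
	xfs_trans_binval(tp, bp);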
@@ -2224,7 +2192,7 @@ xfs_iroot_realloc(
2224 */ 2192 */
2225 if (ifp->if_broot_bytes == 0) { 2193 if (ifp->if_broot_bytes == 0) {
2226 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2194 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2227 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP); 2195 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2228 ifp->if_broot_bytes = (int)new_size; 2196 ifp->if_broot_bytes = (int)new_size;
2229 return; 2197 return;
2230 } 2198 }
@@ -2240,7 +2208,7 @@ xfs_iroot_realloc(
2240 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2208 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2241 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 2209 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2242 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2210 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2243 KM_SLEEP); 2211 KM_SLEEP | KM_NOFS);
2244 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2212 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2245 ifp->if_broot_bytes); 2213 ifp->if_broot_bytes);
2246 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2214 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -2266,7 +2234,7 @@ xfs_iroot_realloc(
2266 else 2234 else
2267 new_size = 0; 2235 new_size = 0;
2268 if (new_size > 0) { 2236 if (new_size > 0) {
2269 new_broot = kmem_alloc(new_size, KM_SLEEP); 2237 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2270 /* 2238 /*
2271 * First copy over the btree block header. 2239 * First copy over the btree block header.
2272 */ 2240 */
@@ -2370,7 +2338,8 @@ xfs_idata_realloc(
2370 real_size = roundup(new_size, 4); 2338 real_size = roundup(new_size, 4);
2371 if (ifp->if_u1.if_data == NULL) { 2339 if (ifp->if_u1.if_data == NULL) {
2372 ASSERT(ifp->if_real_bytes == 0); 2340 ASSERT(ifp->if_real_bytes == 0);
2373 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2341 ifp->if_u1.if_data = kmem_alloc(real_size,
2342 KM_SLEEP | KM_NOFS);
2374 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2343 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2375 /* 2344 /*
2376 * Only do the realloc if the underlying size 2345 * Only do the realloc if the underlying size
@@ -2381,11 +2350,12 @@ xfs_idata_realloc(
2381 kmem_realloc(ifp->if_u1.if_data, 2350 kmem_realloc(ifp->if_u1.if_data,
2382 real_size, 2351 real_size,
2383 ifp->if_real_bytes, 2352 ifp->if_real_bytes,
2384 KM_SLEEP); 2353 KM_SLEEP | KM_NOFS);
2385 } 2354 }
2386 } else { 2355 } else {
2387 ASSERT(ifp->if_real_bytes == 0); 2356 ASSERT(ifp->if_real_bytes == 0);
2388 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2357 ifp->if_u1.if_data = kmem_alloc(real_size,
2358 KM_SLEEP | KM_NOFS);
2389 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2359 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2390 ifp->if_bytes); 2360 ifp->if_bytes);
2391 } 2361 }
@@ -2449,6 +2419,8 @@ xfs_iunpin_nowait(
2449{ 2419{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2420 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2421
2422 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2423
2452 /* Give the log a push to start the unpinning I/O */ 2424 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2425 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2426
@@ -2647,8 +2619,6 @@ xfs_iflush_cluster(
2647 int i; 2619 int i;
2648 2620
2649 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2621 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2650 ASSERT(pag->pagi_inodeok);
2651 ASSERT(pag->pag_ici_init);
2652 2622
2653 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2623 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2654 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2624 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
@@ -2752,7 +2722,6 @@ cluster_corrupt_out:
2752 * mark it as stale and brelse. 2722 * mark it as stale and brelse.
2753 */ 2723 */
2754 if (XFS_BUF_IODONE_FUNC(bp)) { 2724 if (XFS_BUF_IODONE_FUNC(bp)) {
2755 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
2756 XFS_BUF_UNDONE(bp); 2725 XFS_BUF_UNDONE(bp);
2757 XFS_BUF_STALE(bp); 2726 XFS_BUF_STALE(bp);
2758 XFS_BUF_ERROR(bp,EIO); 2727 XFS_BUF_ERROR(bp,EIO);
@@ -3090,8 +3059,7 @@ xfs_iflush_int(
3090 * and unlock the inode's flush lock when the inode is 3059 * and unlock the inode's flush lock when the inode is
3091 * completely written to disk. 3060 * completely written to disk.
3092 */ 3061 */
3093 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3062 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3094 xfs_iflush_done, (xfs_log_item_t *)iip);
3095 3063
3096 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3064 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3097 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3065 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
@@ -3535,13 +3503,11 @@ xfs_iext_remove_indirect(
3535 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3503 xfs_extnum_t ext_diff; /* extents to remove in current list */
3536 xfs_extnum_t nex1; /* number of extents before idx */ 3504 xfs_extnum_t nex1; /* number of extents before idx */
3537 xfs_extnum_t nex2; /* extents after idx + count */ 3505 xfs_extnum_t nex2; /* extents after idx + count */
3538 int nlists; /* entries in indirection array */
3539 int page_idx = idx; /* index in target extent list */ 3506 int page_idx = idx; /* index in target extent list */
3540 3507
3541 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3508 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3542 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3509 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3543 ASSERT(erp != NULL); 3510 ASSERT(erp != NULL);
3544 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3545 nex1 = page_idx; 3511 nex1 = page_idx;
3546 ext_cnt = count; 3512 ext_cnt = count;
3547 while (ext_cnt) { 3513 while (ext_cnt) {
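One recurring change in the fs/xfs/xfs_inode.c hunks above is the switch from KM_SLEEP to KM_SLEEP | KM_NOFS for the inode fork allocations in xfs_iformat(), xfs_iformat_local(), xfs_iformat_btree(), xfs_iroot_realloc() and xfs_idata_realloc(). These allocations run with inode locks held, and KM_NOFS (XFS's equivalent of a GFP_NOFS allocation) keeps direct reclaim from re-entering the filesystem and deadlocking on those same locks. The pattern, as it now reads:

	/*
	 * Allocating fork data with the inode locked: memory reclaim
	 * must not recurse back into the filesystem here.
	 */
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);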
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9965e40a4615..0898c5417d12 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -442,9 +442,7 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
442 * xfs_iget.c prototypes. 442 * xfs_iget.c prototypes.
443 */ 443 */
444int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 444int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
445 uint, uint, xfs_inode_t **, xfs_daddr_t); 445 uint, uint, xfs_inode_t **);
446void xfs_iput(xfs_inode_t *, uint);
447void xfs_iput_new(xfs_inode_t *, uint);
448void xfs_ilock(xfs_inode_t *, uint); 446void xfs_ilock(xfs_inode_t *, uint);
449int xfs_ilock_nowait(xfs_inode_t *, uint); 447int xfs_ilock_nowait(xfs_inode_t *, uint);
450void xfs_iunlock(xfs_inode_t *, uint); 448void xfs_iunlock(xfs_inode_t *, uint);
@@ -452,7 +450,7 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
452int xfs_isilocked(xfs_inode_t *, uint); 450int xfs_isilocked(xfs_inode_t *, uint);
453uint xfs_ilock_map_shared(xfs_inode_t *); 451uint xfs_ilock_map_shared(xfs_inode_t *);
454void xfs_iunlock_map_shared(xfs_inode_t *, uint); 452void xfs_iunlock_map_shared(xfs_inode_t *, uint);
455void xfs_ireclaim(xfs_inode_t *); 453void xfs_inode_free(struct xfs_inode *ip);
456 454
457/* 455/*
458 * xfs_inode.c prototypes. 456 * xfs_inode.c prototypes.
@@ -500,7 +498,7 @@ do { \
500 * Flags for xfs_iget() 498 * Flags for xfs_iget()
501 */ 499 */
502#define XFS_IGET_CREATE 0x1 500#define XFS_IGET_CREATE 0x1
503#define XFS_IGET_BULKSTAT 0x2 501#define XFS_IGET_UNTRUSTED 0x2
504 502
505int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, 503int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
506 xfs_ino_t, struct xfs_dinode **, 504 xfs_ino_t, struct xfs_dinode **,
@@ -509,7 +507,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
509 struct xfs_inode *, struct xfs_dinode **, 507 struct xfs_inode *, struct xfs_dinode **,
510 struct xfs_buf **, uint); 508 struct xfs_buf **, uint);
511int xfs_iread(struct xfs_mount *, struct xfs_trans *, 509int xfs_iread(struct xfs_mount *, struct xfs_trans *,
512 struct xfs_inode *, xfs_daddr_t, uint); 510 struct xfs_inode *, uint);
513void xfs_dinode_to_disk(struct xfs_dinode *, 511void xfs_dinode_to_disk(struct xfs_dinode *,
514 struct xfs_icdinode *); 512 struct xfs_icdinode *);
515void xfs_idestroy_fork(struct xfs_inode *, int); 513void xfs_idestroy_fork(struct xfs_inode *, int);
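The XFS_IGET_BULKSTAT to XFS_IGET_UNTRUSTED rename above matches the xfs_imap_to_bp() hunk earlier in this diff: the flag marks an inode number that came from an untrusted source (bulkstat is just one such caller), so the lookup must validate it against the on-disk metadata instead of assuming it is allocated. A hypothetical lookup of a user-supplied inode number might read as follows (sketch only; the caller context is illustrative, and note that this code base returns positive error codes):

	struct xfs_inode	*ip;
	int			error;

	/* ino came from userspace: force full validation in xfs_imap() */
	error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED,
			 XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;	/* EINVAL/ENOENT: not a valid inode number */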
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..fe00777e2796 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -22,30 +22,26 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h" 25#include "xfs_sb.h"
27#include "xfs_ag.h" 26#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 27#include "xfs_mount.h"
31#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
32#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 30#include "xfs_dinode.h"
38#include "xfs_inode.h" 31#include "xfs_inode.h"
39#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
40#include "xfs_btree.h"
41#include "xfs_ialloc.h"
42#include "xfs_rw.h"
43#include "xfs_error.h" 33#include "xfs_error.h"
44#include "xfs_trace.h" 34#include "xfs_trace.h"
45 35
46 36
47kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 37kmem_zone_t *xfs_ili_zone; /* inode log item zone */
48 38
39static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
40{
41 return container_of(lip, struct xfs_inode_log_item, ili_item);
42}
43
44
49/* 45/*
50 * This returns the number of iovecs needed to log the given inode item. 46 * This returns the number of iovecs needed to log the given inode item.
51 * 47 *
@@ -55,13 +51,11 @@ kmem_zone_t *xfs_ili_zone; /* inode log item zone */
55 */ 51 */
56STATIC uint 52STATIC uint
57xfs_inode_item_size( 53xfs_inode_item_size(
58 xfs_inode_log_item_t *iip) 54 struct xfs_log_item *lip)
59{ 55{
60 uint nvecs; 56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
61 xfs_inode_t *ip; 57 struct xfs_inode *ip = iip->ili_inode;
62 58 uint nvecs = 2;
63 ip = iip->ili_inode;
64 nvecs = 2;
65 59
66 /* 60 /*
67 * Only log the data/extents/b-tree root if there is something 61 * Only log the data/extents/b-tree root if there is something
@@ -212,21 +206,17 @@ xfs_inode_item_size(
212 */ 206 */
213STATIC void 207STATIC void
214xfs_inode_item_format( 208xfs_inode_item_format(
215 xfs_inode_log_item_t *iip, 209 struct xfs_log_item *lip,
216 xfs_log_iovec_t *log_vector) 210 struct xfs_log_iovec *vecp)
217{ 211{
212 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
213 struct xfs_inode *ip = iip->ili_inode;
218 uint nvecs; 214 uint nvecs;
219 xfs_log_iovec_t *vecp;
220 xfs_inode_t *ip;
221 size_t data_bytes; 215 size_t data_bytes;
222 xfs_bmbt_rec_t *ext_buffer; 216 xfs_bmbt_rec_t *ext_buffer;
223 int nrecs;
224 xfs_mount_t *mp; 217 xfs_mount_t *mp;
225 218
226 ip = iip->ili_inode; 219 vecp->i_addr = &iip->ili_format;
227 vecp = log_vector;
228
229 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
230 vecp->i_len = sizeof(xfs_inode_log_format_t); 220 vecp->i_len = sizeof(xfs_inode_log_format_t);
231 vecp->i_type = XLOG_REG_TYPE_IFORMAT; 221 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
232 vecp++; 222 vecp++;
@@ -277,7 +267,7 @@ xfs_inode_item_format(
277 */ 267 */
278 xfs_synchronize_times(ip); 268 xfs_synchronize_times(ip);
279 269
280 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 270 vecp->i_addr = &ip->i_d;
281 vecp->i_len = sizeof(struct xfs_icdinode); 271 vecp->i_len = sizeof(struct xfs_icdinode);
282 vecp->i_type = XLOG_REG_TYPE_ICORE; 272 vecp->i_type = XLOG_REG_TYPE_ICORE;
283 vecp++; 273 vecp++;
@@ -323,18 +313,17 @@ xfs_inode_item_format(
323 ASSERT(ip->i_df.if_u1.if_extents != NULL); 313 ASSERT(ip->i_df.if_u1.if_extents != NULL);
324 ASSERT(ip->i_d.di_nextents > 0); 314 ASSERT(ip->i_d.di_nextents > 0);
325 ASSERT(iip->ili_extents_buf == NULL); 315 ASSERT(iip->ili_extents_buf == NULL);
326 nrecs = ip->i_df.if_bytes / 316 ASSERT((ip->i_df.if_bytes /
327 (uint)sizeof(xfs_bmbt_rec_t); 317 (uint)sizeof(xfs_bmbt_rec_t)) > 0);
328 ASSERT(nrecs > 0);
329#ifdef XFS_NATIVE_HOST 318#ifdef XFS_NATIVE_HOST
330 if (nrecs == ip->i_d.di_nextents) { 319 if (ip->i_d.di_nextents == ip->i_df.if_bytes /
320 (uint)sizeof(xfs_bmbt_rec_t)) {
331 /* 321 /*
332 * There are no delayed allocation 322 * There are no delayed allocation
333 * extents, so just point to the 323 * extents, so just point to the
334 * real extents array. 324 * real extents array.
335 */ 325 */
336 vecp->i_addr = 326 vecp->i_addr = ip->i_df.if_u1.if_extents;
337 (char *)(ip->i_df.if_u1.if_extents);
338 vecp->i_len = ip->i_df.if_bytes; 327 vecp->i_len = ip->i_df.if_bytes;
339 vecp->i_type = XLOG_REG_TYPE_IEXT; 328 vecp->i_type = XLOG_REG_TYPE_IEXT;
340 } else 329 } else
@@ -352,7 +341,7 @@ xfs_inode_item_format(
352 ext_buffer = kmem_alloc(ip->i_df.if_bytes, 341 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
353 KM_SLEEP); 342 KM_SLEEP);
354 iip->ili_extents_buf = ext_buffer; 343 iip->ili_extents_buf = ext_buffer;
355 vecp->i_addr = (xfs_caddr_t)ext_buffer; 344 vecp->i_addr = ext_buffer;
356 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 345 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
357 XFS_DATA_FORK); 346 XFS_DATA_FORK);
358 vecp->i_type = XLOG_REG_TYPE_IEXT; 347 vecp->i_type = XLOG_REG_TYPE_IEXT;
@@ -371,7 +360,7 @@ xfs_inode_item_format(
371 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { 360 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
372 ASSERT(ip->i_df.if_broot_bytes > 0); 361 ASSERT(ip->i_df.if_broot_bytes > 0);
373 ASSERT(ip->i_df.if_broot != NULL); 362 ASSERT(ip->i_df.if_broot != NULL);
374 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; 363 vecp->i_addr = ip->i_df.if_broot;
375 vecp->i_len = ip->i_df.if_broot_bytes; 364 vecp->i_len = ip->i_df.if_broot_bytes;
376 vecp->i_type = XLOG_REG_TYPE_IBROOT; 365 vecp->i_type = XLOG_REG_TYPE_IBROOT;
377 vecp++; 366 vecp++;
@@ -389,7 +378,7 @@ xfs_inode_item_format(
389 ASSERT(ip->i_df.if_u1.if_data != NULL); 378 ASSERT(ip->i_df.if_u1.if_data != NULL);
390 ASSERT(ip->i_d.di_size > 0); 379 ASSERT(ip->i_d.di_size > 0);
391 380
392 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; 381 vecp->i_addr = ip->i_df.if_u1.if_data;
393 /* 382 /*
394 * Round i_bytes up to a word boundary. 383 * Round i_bytes up to a word boundary.
395 * The underlying memory is guaranteed to 384 * The underlying memory is guaranteed to
@@ -437,7 +426,7 @@ xfs_inode_item_format(
437 * Assert that no attribute-related log flags are set. 426 * Assert that no attribute-related log flags are set.
438 */ 427 */
439 if (!XFS_IFORK_Q(ip)) { 428 if (!XFS_IFORK_Q(ip)) {
440 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 429 ASSERT(nvecs == lip->li_desc->lid_size);
441 iip->ili_format.ilf_size = nvecs; 430 iip->ili_format.ilf_size = nvecs;
442 ASSERT(!(iip->ili_format.ilf_fields & 431 ASSERT(!(iip->ili_format.ilf_fields &
443 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); 432 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
@@ -449,21 +438,21 @@ xfs_inode_item_format(
449 ASSERT(!(iip->ili_format.ilf_fields & 438 ASSERT(!(iip->ili_format.ilf_fields &
450 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); 439 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
451 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { 440 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
452 ASSERT(ip->i_afp->if_bytes > 0);
453 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
454 ASSERT(ip->i_d.di_anextents > 0);
455#ifdef DEBUG 441#ifdef DEBUG
456 nrecs = ip->i_afp->if_bytes / 442 int nrecs = ip->i_afp->if_bytes /
457 (uint)sizeof(xfs_bmbt_rec_t); 443 (uint)sizeof(xfs_bmbt_rec_t);
458#endif
459 ASSERT(nrecs > 0); 444 ASSERT(nrecs > 0);
460 ASSERT(nrecs == ip->i_d.di_anextents); 445 ASSERT(nrecs == ip->i_d.di_anextents);
446 ASSERT(ip->i_afp->if_bytes > 0);
447 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
448 ASSERT(ip->i_d.di_anextents > 0);
449#endif
461#ifdef XFS_NATIVE_HOST 450#ifdef XFS_NATIVE_HOST
462 /* 451 /*
463 * There are not delayed allocation extents 452 * There are not delayed allocation extents
464 * for attributes, so just point at the array. 453 * for attributes, so just point at the array.
465 */ 454 */
466 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); 455 vecp->i_addr = ip->i_afp->if_u1.if_extents;
467 vecp->i_len = ip->i_afp->if_bytes; 456 vecp->i_len = ip->i_afp->if_bytes;
468#else 457#else
469 ASSERT(iip->ili_aextents_buf == NULL); 458 ASSERT(iip->ili_aextents_buf == NULL);
@@ -473,7 +462,7 @@ xfs_inode_item_format(
473 ext_buffer = kmem_alloc(ip->i_afp->if_bytes, 462 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
474 KM_SLEEP); 463 KM_SLEEP);
475 iip->ili_aextents_buf = ext_buffer; 464 iip->ili_aextents_buf = ext_buffer;
476 vecp->i_addr = (xfs_caddr_t)ext_buffer; 465 vecp->i_addr = ext_buffer;
477 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, 466 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
478 XFS_ATTR_FORK); 467 XFS_ATTR_FORK);
479#endif 468#endif
@@ -490,7 +479,7 @@ xfs_inode_item_format(
490 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { 479 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
491 ASSERT(ip->i_afp->if_broot_bytes > 0); 480 ASSERT(ip->i_afp->if_broot_bytes > 0);
492 ASSERT(ip->i_afp->if_broot != NULL); 481 ASSERT(ip->i_afp->if_broot != NULL);
493 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; 482 vecp->i_addr = ip->i_afp->if_broot;
494 vecp->i_len = ip->i_afp->if_broot_bytes; 483 vecp->i_len = ip->i_afp->if_broot_bytes;
495 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 484 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
496 vecp++; 485 vecp++;
@@ -506,7 +495,7 @@ xfs_inode_item_format(
506 ASSERT(ip->i_afp->if_bytes > 0); 495 ASSERT(ip->i_afp->if_bytes > 0);
507 ASSERT(ip->i_afp->if_u1.if_data != NULL); 496 ASSERT(ip->i_afp->if_u1.if_data != NULL);
508 497
509 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; 498 vecp->i_addr = ip->i_afp->if_u1.if_data;
510 /* 499 /*
511 * Round i_bytes up to a word boundary. 500 * Round i_bytes up to a word boundary.
512 * The underlying memory is guaranteed to 501 * The underlying memory is guaranteed to
@@ -528,7 +517,7 @@ xfs_inode_item_format(
528 break; 517 break;
529 } 518 }
530 519
531 ASSERT(nvecs == iip->ili_item.li_desc->lid_size); 520 ASSERT(nvecs == lip->li_desc->lid_size);
532 iip->ili_format.ilf_size = nvecs; 521 iip->ili_format.ilf_size = nvecs;
533} 522}
534 523
@@ -539,11 +528,14 @@ xfs_inode_item_format(
539 */ 528 */
540STATIC void 529STATIC void
541xfs_inode_item_pin( 530xfs_inode_item_pin(
542 xfs_inode_log_item_t *iip) 531 struct xfs_log_item *lip)
543{ 532{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 533 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
545 534
546 atomic_inc(&iip->ili_inode->i_pincount); 535 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
536
537 trace_xfs_inode_pin(ip, _RET_IP_);
538 atomic_inc(&ip->i_pincount);
547} 539}
548 540
549 541
@@ -553,28 +545,19 @@ xfs_inode_item_pin(
553 * 545 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. 546 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
555 */ 547 */
556/* ARGSUSED */
557STATIC void 548STATIC void
558xfs_inode_item_unpin( 549xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 550 struct xfs_log_item *lip,
560 int stale) 551 int remove)
561{ 552{
562 struct xfs_inode *ip = iip->ili_inode; 553 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
563 554
555 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 558 wake_up(&ip->i_ipin_wait);
567} 559}
568 560
569/* ARGSUSED */
570STATIC void
571xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp)
574{
575 xfs_inode_item_unpin(iip, 0);
576}
577
578/* 561/*
579 * This is called to attempt to lock the inode associated with this 562 * This is called to attempt to lock the inode associated with this
580 * inode log item, in preparation for the push routine which does the actual 563 * inode log item, in preparation for the push routine which does the actual
@@ -590,19 +573,16 @@ xfs_inode_item_unpin_remove(
590 */ 573 */
591STATIC uint 574STATIC uint
592xfs_inode_item_trylock( 575xfs_inode_item_trylock(
593 xfs_inode_log_item_t *iip) 576 struct xfs_log_item *lip)
594{ 577{
595 register xfs_inode_t *ip; 578 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
596 579 struct xfs_inode *ip = iip->ili_inode;
597 ip = iip->ili_inode;
598 580
599 if (xfs_ipincount(ip) > 0) { 581 if (xfs_ipincount(ip) > 0)
600 return XFS_ITEM_PINNED; 582 return XFS_ITEM_PINNED;
601 }
602 583
603 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 584 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
604 return XFS_ITEM_LOCKED; 585 return XFS_ITEM_LOCKED;
605 }
606 586
607 if (!xfs_iflock_nowait(ip)) { 587 if (!xfs_iflock_nowait(ip)) {
608 /* 588 /*
@@ -628,7 +608,7 @@ xfs_inode_item_trylock(
628 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 608 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
629 ASSERT(iip->ili_format.ilf_fields != 0); 609 ASSERT(iip->ili_format.ilf_fields != 0);
630 ASSERT(iip->ili_logged == 0); 610 ASSERT(iip->ili_logged == 0);
631 ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); 611 ASSERT(lip->li_flags & XFS_LI_IN_AIL);
632 } 612 }
633#endif 613#endif
634 return XFS_ITEM_SUCCESS; 614 return XFS_ITEM_SUCCESS;
@@ -642,26 +622,18 @@ xfs_inode_item_trylock(
642 */ 622 */
643STATIC void 623STATIC void
644xfs_inode_item_unlock( 624xfs_inode_item_unlock(
645 xfs_inode_log_item_t *iip) 625 struct xfs_log_item *lip)
646{ 626{
647 uint hold; 627 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
648 uint iolocked; 628 struct xfs_inode *ip = iip->ili_inode;
649 uint lock_flags; 629 unsigned short lock_flags;
650 xfs_inode_t *ip;
651 630
652 ASSERT(iip != NULL);
653 ASSERT(iip->ili_inode->i_itemp != NULL); 631 ASSERT(iip->ili_inode->i_itemp != NULL);
654 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 632 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
655 ASSERT((!(iip->ili_inode->i_itemp->ili_flags & 633
656 XFS_ILI_IOLOCKED_EXCL)) ||
657 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL));
658 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
659 XFS_ILI_IOLOCKED_SHARED)) ||
660 xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED));
661 /* 634 /*
662 * Clear the transaction pointer in the inode. 635 * Clear the transaction pointer in the inode.
663 */ 636 */
664 ip = iip->ili_inode;
665 ip->i_transp = NULL; 637 ip->i_transp = NULL;
666 638
667 /* 639 /*
@@ -685,34 +657,11 @@ xfs_inode_item_unlock(
685 iip->ili_aextents_buf = NULL; 657 iip->ili_aextents_buf = NULL;
686 } 658 }
687 659
688 /* 660 lock_flags = iip->ili_lock_flags;
689 * Figure out if we should unlock the inode or not. 661 iip->ili_lock_flags = 0;
690 */ 662 if (lock_flags) {
691 hold = iip->ili_flags & XFS_ILI_HOLD; 663 xfs_iunlock(iip->ili_inode, lock_flags);
692 664 IRELE(iip->ili_inode);
693 /*
694 * Before clearing out the flags, remember whether we
695 * are holding the inode's IO lock.
696 */
697 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
698
699 /*
700 * Clear out the fields of the inode log item particular
701 * to the current transaction.
702 */
703 iip->ili_flags = 0;
704
705 /*
706 * Unlock the inode if XFS_ILI_HOLD was not set.
707 */
708 if (!hold) {
709 lock_flags = XFS_ILOCK_EXCL;
710 if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
711 lock_flags |= XFS_IOLOCK_EXCL;
712 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
713 lock_flags |= XFS_IOLOCK_SHARED;
714 }
715 xfs_iput(iip->ili_inode, lock_flags);
716 } 665 }
717} 666}
718 667
@@ -724,13 +673,12 @@ xfs_inode_item_unlock(
724 * is the only one that matters. Therefore, simply return the 673 * is the only one that matters. Therefore, simply return the
725 * given lsn. 674 * given lsn.
726 */ 675 */
727/*ARGSUSED*/
728STATIC xfs_lsn_t 676STATIC xfs_lsn_t
729xfs_inode_item_committed( 677xfs_inode_item_committed(
730 xfs_inode_log_item_t *iip, 678 struct xfs_log_item *lip,
731 xfs_lsn_t lsn) 679 xfs_lsn_t lsn)
732{ 680{
733 return (lsn); 681 return lsn;
734} 682}
735 683
736/* 684/*
@@ -742,13 +690,12 @@ xfs_inode_item_committed(
742 */ 690 */
743STATIC void 691STATIC void
744xfs_inode_item_pushbuf( 692xfs_inode_item_pushbuf(
745 xfs_inode_log_item_t *iip) 693 struct xfs_log_item *lip)
746{ 694{
747 xfs_inode_t *ip; 695 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
748 xfs_mount_t *mp; 696 struct xfs_inode *ip = iip->ili_inode;
749 xfs_buf_t *bp; 697 struct xfs_buf *bp;
750 698
751 ip = iip->ili_inode;
752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 699 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
753 700
754 /* 701 /*
@@ -756,14 +703,13 @@ xfs_inode_item_pushbuf(
756 * inode was taken off the AIL. So, just get out. 703 * inode was taken off the AIL. So, just get out.
757 */ 704 */
758 if (completion_done(&ip->i_flush) || 705 if (completion_done(&ip->i_flush) ||
759 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 706 !(lip->li_flags & XFS_LI_IN_AIL)) {
760 xfs_iunlock(ip, XFS_ILOCK_SHARED); 707 xfs_iunlock(ip, XFS_ILOCK_SHARED);
761 return; 708 return;
762 } 709 }
763 710
764 mp = ip->i_mount; 711 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
765 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 712 iip->ili_format.ilf_len, XBF_TRYLOCK);
766 iip->ili_format.ilf_len, XBF_TRYLOCK);
767 713
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 714 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769 if (!bp) 715 if (!bp)
@@ -771,10 +717,8 @@ xfs_inode_item_pushbuf(
771 if (XFS_BUF_ISDELAYWRITE(bp)) 717 if (XFS_BUF_ISDELAYWRITE(bp))
772 xfs_buf_delwri_promote(bp); 718 xfs_buf_delwri_promote(bp);
773 xfs_buf_relse(bp); 719 xfs_buf_relse(bp);
774 return;
775} 720}
776 721
777
778/* 722/*
779 * This is called to asynchronously write the inode associated with this 723 * This is called to asynchronously write the inode associated with this
780 * inode log item out to disk. The inode will already have been locked by 724 * inode log item out to disk. The inode will already have been locked by
@@ -782,14 +726,14 @@ xfs_inode_item_pushbuf(
782 */ 726 */
783STATIC void 727STATIC void
784xfs_inode_item_push( 728xfs_inode_item_push(
785 xfs_inode_log_item_t *iip) 729 struct xfs_log_item *lip)
786{ 730{
787 xfs_inode_t *ip; 731 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
788 732 struct xfs_inode *ip = iip->ili_inode;
789 ip = iip->ili_inode;
790 733
791 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 734 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
792 ASSERT(!completion_done(&ip->i_flush)); 735 ASSERT(!completion_done(&ip->i_flush));
736
793 /* 737 /*
794 * Since we were able to lock the inode's flush lock and 738 * Since we were able to lock the inode's flush lock and
795 * we found it on the AIL, the inode must be dirty. This 739 * we found it on the AIL, the inode must be dirty. This
@@ -812,43 +756,34 @@ xfs_inode_item_push(
812 */ 756 */
813 (void) xfs_iflush(ip, 0); 757 (void) xfs_iflush(ip, 0);
814 xfs_iunlock(ip, XFS_ILOCK_SHARED); 758 xfs_iunlock(ip, XFS_ILOCK_SHARED);
815
816 return;
817} 759}
818 760
819/* 761/*
820 * XXX rcc - this one really has to do something. Probably needs 762 * XXX rcc - this one really has to do something. Probably needs
821 * to stamp in a new field in the incore inode. 763 * to stamp in a new field in the incore inode.
822 */ 764 */
823/* ARGSUSED */
824STATIC void 765STATIC void
825xfs_inode_item_committing( 766xfs_inode_item_committing(
826 xfs_inode_log_item_t *iip, 767 struct xfs_log_item *lip,
827 xfs_lsn_t lsn) 768 xfs_lsn_t lsn)
828{ 769{
829 iip->ili_last_lsn = lsn; 770 INODE_ITEM(lip)->ili_last_lsn = lsn;
830 return;
831} 771}
832 772
833/* 773/*
834 * This is the ops vector shared by all inode log items. 774 * This is the ops vector shared by all inode log items.
835 */ 775 */
836static struct xfs_item_ops xfs_inode_item_ops = { 776static struct xfs_item_ops xfs_inode_item_ops = {
837 .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, 777 .iop_size = xfs_inode_item_size,
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 778 .iop_format = xfs_inode_item_format,
839 xfs_inode_item_format, 779 .iop_pin = xfs_inode_item_pin,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 780 .iop_unpin = xfs_inode_item_unpin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 781 .iop_trylock = xfs_inode_item_trylock,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 782 .iop_unlock = xfs_inode_item_unlock,
843 xfs_inode_item_unpin_remove, 783 .iop_committed = xfs_inode_item_committed,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 784 .iop_push = xfs_inode_item_push,
845 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, 785 .iop_pushbuf = xfs_inode_item_pushbuf,
846 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 786 .iop_committing = xfs_inode_item_committing
847 xfs_inode_item_committed,
848 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
849 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
850 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
851 xfs_inode_item_committing
852}; 787};
853 788
854 789
@@ -857,25 +792,17 @@ static struct xfs_item_ops xfs_inode_item_ops = {
857 */ 792 */
858void 793void
859xfs_inode_item_init( 794xfs_inode_item_init(
860 xfs_inode_t *ip, 795 struct xfs_inode *ip,
861 xfs_mount_t *mp) 796 struct xfs_mount *mp)
862{ 797{
863 xfs_inode_log_item_t *iip; 798 struct xfs_inode_log_item *iip;
864 799
865 ASSERT(ip->i_itemp == NULL); 800 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 801 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 802
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 803 iip->ili_inode = ip;
873 804 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 805 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 806 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 807 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 808 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
@@ -906,14 +833,14 @@ xfs_inode_item_destroy(
906 * from the AIL if it has not been re-logged, and unlocking the inode's 833 * from the AIL if it has not been re-logged, and unlocking the inode's
907 * flush lock. 834 * flush lock.
908 */ 835 */
909/*ARGSUSED*/
910void 836void
911xfs_iflush_done( 837xfs_iflush_done(
912 xfs_buf_t *bp, 838 struct xfs_buf *bp,
913 xfs_inode_log_item_t *iip) 839 struct xfs_log_item *lip)
914{ 840{
841 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
915 xfs_inode_t *ip = iip->ili_inode; 842 xfs_inode_t *ip = iip->ili_inode;
916 struct xfs_ail *ailp = iip->ili_item.li_ailp; 843 struct xfs_ail *ailp = lip->li_ailp;
917 844
918 /* 845 /*
919 * We only want to pull the item from the AIL if it is 846 * We only want to pull the item from the AIL if it is
@@ -924,12 +851,11 @@ xfs_iflush_done(
924 * the lock since it's cheaper, and then we recheck while 851 * the lock since it's cheaper, and then we recheck while
925 * holding the lock before removing the inode from the AIL. 852 * holding the lock before removing the inode from the AIL.
926 */ 853 */
927 if (iip->ili_logged && 854 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) {
928 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
929 spin_lock(&ailp->xa_lock); 855 spin_lock(&ailp->xa_lock);
930 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 856 if (lip->li_lsn == iip->ili_flush_lsn) {
931 /* xfs_trans_ail_delete() drops the AIL lock. */ 857 /* xfs_trans_ail_delete() drops the AIL lock. */
932 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip); 858 xfs_trans_ail_delete(ailp, lip);
933 } else { 859 } else {
934 spin_unlock(&ailp->xa_lock); 860 spin_unlock(&ailp->xa_lock);
935 } 861 }
@@ -947,8 +873,6 @@ xfs_iflush_done(
947 * Release the inode's flush lock since we're done with it. 873 * Release the inode's flush lock since we're done with it.
948 */ 874 */
949 xfs_ifunlock(ip); 875 xfs_ifunlock(ip);
950
951 return;
952} 876}
953 877
954/* 878/*
@@ -964,10 +888,8 @@ xfs_iflush_abort(
964 xfs_inode_t *ip) 888 xfs_inode_t *ip)
965{ 889{
966 xfs_inode_log_item_t *iip = ip->i_itemp; 890 xfs_inode_log_item_t *iip = ip->i_itemp;
967 xfs_mount_t *mp;
968 891
969 iip = ip->i_itemp; 892 iip = ip->i_itemp;
970 mp = ip->i_mount;
971 if (iip) { 893 if (iip) {
972 struct xfs_ail *ailp = iip->ili_item.li_ailp; 894 struct xfs_ail *ailp = iip->ili_item.li_ailp;
973 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 895 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
@@ -998,10 +920,10 @@ xfs_iflush_abort(
998 920
999void 921void
1000xfs_istale_done( 922xfs_istale_done(
1001 xfs_buf_t *bp, 923 struct xfs_buf *bp,
1002 xfs_inode_log_item_t *iip) 924 struct xfs_log_item *lip)
1003{ 925{
1004 xfs_iflush_abort(iip->ili_inode); 926 xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
1005} 927}
1006 928
1007/* 929/*
@@ -1014,9 +936,8 @@ xfs_inode_item_format_convert(
1014 xfs_inode_log_format_t *in_f) 936 xfs_inode_log_format_t *in_f)
1015{ 937{
1016 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { 938 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
1017 xfs_inode_log_format_32_t *in_f32; 939 xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
1018 940
1019 in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
1020 in_f->ilf_type = in_f32->ilf_type; 941 in_f->ilf_type = in_f32->ilf_type;
1021 in_f->ilf_size = in_f32->ilf_size; 942 in_f->ilf_size = in_f32->ilf_size;
1022 in_f->ilf_fields = in_f32->ilf_fields; 943 in_f->ilf_fields = in_f32->ilf_fields;
@@ -1032,9 +953,8 @@ xfs_inode_item_format_convert(
1032 in_f->ilf_boffset = in_f32->ilf_boffset; 953 in_f->ilf_boffset = in_f32->ilf_boffset;
1033 return 0; 954 return 0;
1034 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ 955 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
1035 xfs_inode_log_format_64_t *in_f64; 956 xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
1036 957
1037 in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
1038 in_f->ilf_type = in_f64->ilf_type; 958 in_f->ilf_type = in_f64->ilf_type;
1039 in_f->ilf_size = in_f64->ilf_size; 959 in_f->ilf_size = in_f64->ilf_size;
1040 in_f->ilf_fields = in_f64->ilf_fields; 960 in_f->ilf_fields = in_f64->ilf_fields;
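Most of the fs/xfs/xfs_inode_item.c churn above is one mechanical change: each iop_* handler now takes the generic struct xfs_log_item * that the ops table is declared with, and recovers its own item via the new INODE_ITEM() helper, rather than being wedged into the table through function-pointer casts. Calling a function through a pointer of a different type is undefined behaviour in C, so this is a correctness fix as well as a readability one. The core pattern, lifted from the hunks above:

static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_inode_log_item, ili_item);
}

STATIC void
xfs_inode_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;

	/* body otherwise unchanged */
	atomic_inc(&ip->i_pincount);
}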
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 9a467958ecdd..d3dee61e6d91 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 {
103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 103 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
104 XFS_ILOG_ABROOT) 104 XFS_ILOG_ABROOT)
105 105
106#define XFS_ILI_HOLD 0x1
107#define XFS_ILI_IOLOCKED_EXCL 0x2
108#define XFS_ILI_IOLOCKED_SHARED 0x4
109
110#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
111
112static inline int xfs_ilog_fbroot(int w) 106static inline int xfs_ilog_fbroot(int w)
113{ 107{
114 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); 108 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
@@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item {
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
 	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
-	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_lock_flags;	   /* lock flags */
 	unsigned short		ili_logged;	   /* flushed logged data */
 	unsigned int		ili_last_fields;   /* fields when flushed */
 	struct xfs_bmbt_rec	*ili_extents_buf;  /* array of logged
@@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
-extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
 extern void xfs_iflush_abort(struct xfs_inode *);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
 					 xfs_inode_log_format_t *);
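
xfs_iflush_done() and xfs_istale_done() now take the generic struct xfs_log_item * and recover the containing inode log item via INODE_ITEM(), the usual container-of idiom. A minimal user-space sketch of that idiom (types and names are simplified stand-ins, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct log_item {                       /* stand-in for struct xfs_log_item */
            int             li_type;
    };

    struct inode_log_item {                 /* stand-in for xfs_inode_log_item_t */
            struct log_item ili_item;       /* generic part, embedded first */
            int             ili_fields;
    };

    /* stand-in for the INODE_ITEM() accessor */
    static struct inode_log_item *INODE_ITEM(struct log_item *lip)
    {
            return container_of(lip, struct inode_log_item, ili_item);
    }

    /* callback takes the generic item, like the new xfs_istale_done() */
    static void istale_done(struct log_item *lip)
    {
            printf("fields = %d\n", INODE_ITEM(lip)->ili_fields);
    }

    int main(void)
    {
            struct inode_log_item iip = { .ili_item = { .li_type = 1 },
                                          .ili_fields = 42 };

            istale_done(&iip.ili_item);     /* pass the embedded generic item */
            return 0;
    }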
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..20576146369f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,19 +23,14 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h" 26#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h" 27#include "xfs_quota.h"
30#include "xfs_mount.h" 28#include "xfs_mount.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_ialloc.h"
39#include "xfs_btree.h" 34#include "xfs_btree.h"
40#include "xfs_bmap.h" 35#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 36#include "xfs_rtalloc.h"
@@ -55,71 +50,33 @@
 #define XFS_STRAT_WRITE_IMAPS	2
 #define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
 
-STATIC int
-xfs_imap_to_bmap(
-	xfs_inode_t	*ip,
-	xfs_off_t	offset,
-	xfs_bmbt_irec_t *imap,
-	xfs_iomap_t	*iomapp,
-	int		imaps,			/* Number of imap entries */
-	int		iomaps,			/* Number of iomap entries */
-	int		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		pbm;
-	xfs_fsblock_t	start_block;
-
-
-	for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
-		iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-		iomapp->iomap_delta = offset - iomapp->iomap_offset;
-		iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
-		iomapp->iomap_flags = flags;
-
-		if (XFS_IS_REALTIME_INODE(ip)) {
-			iomapp->iomap_flags |= IOMAP_REALTIME;
-			iomapp->iomap_target = mp->m_rtdev_targp;
-		} else {
-			iomapp->iomap_target = mp->m_ddev_targp;
-		}
-		start_block = imap->br_startblock;
-		if (start_block == HOLESTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_HOLE;
-		} else if (start_block == DELAYSTARTBLOCK) {
-			iomapp->iomap_bn = IOMAP_DADDR_NULL;
-			iomapp->iomap_flags |= IOMAP_DELAY;
-		} else {
-			iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
-			if (ISUNWRITTEN(imap))
-				iomapp->iomap_flags |= IOMAP_UNWRITTEN;
-		}
-
-		offset += iomapp->iomap_bsize - iomapp->iomap_delta;
-	}
-	return pbm;	/* Return the number filled */
-}
+STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+				  int, struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
+				 struct xfs_bmbt_irec *, int *);
+STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+				    struct xfs_bmbt_irec *, int *);
 
 int
 xfs_iomap(
-	xfs_inode_t	*ip,
+	struct xfs_inode *ip,
 	xfs_off_t	offset,
 	ssize_t		count,
 	int		flags,
-	xfs_iomap_t	*iomapp,
-	int		*niomaps)
+	struct xfs_bmbt_irec *imap,
+	int		*nimaps,
+	int		*new)
 {
-	xfs_mount_t	*mp = ip->i_mount;
+	struct xfs_mount *mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb, end_fsb;
 	int		error = 0;
 	int		lockmode = 0;
-	xfs_bmbt_irec_t	imap;
-	int		nimaps = 1;
-	int		bmapi_flags = 0;
-	int		iomap_flags = 0;
+	int		bmapi_flags = 0;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 
+	*new = 0;
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
@@ -160,8 +117,8 @@ xfs_iomap(
 
 	error = xfs_bmapi(NULL, ip, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
-			bmapi_flags,  NULL, 0, &imap,
-			&nimaps, NULL, NULL);
+			bmapi_flags,  NULL, 0, imap,
+			nimaps, NULL);
 
 	if (error)
 		goto out;
@@ -169,46 +126,41 @@ xfs_iomap(
 	switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
 	case BMAPI_WRITE:
 		/* If we found an extent, return it */
-		if (nimaps &&
-		    (imap.br_startblock != HOLESTARTBLOCK) &&
-		    (imap.br_startblock != DELAYSTARTBLOCK)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps &&
+		    (imap->br_startblock != HOLESTARTBLOCK) &&
+		    (imap->br_startblock != DELAYSTARTBLOCK)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
-		if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+		if (flags & BMAPI_DIRECT) {
 			error = xfs_iomap_write_direct(ip, offset, count, flags,
-						       &imap, &nimaps, nimaps);
+						       imap, nimaps);
 		} else {
 			error = xfs_iomap_write_delay(ip, offset, count, flags,
-						      &imap, &nimaps);
+						      imap, nimaps);
 		}
 		if (!error) {
-			trace_xfs_iomap_alloc(ip, offset, count, flags, &imap);
+			trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
 		}
-		iomap_flags = IOMAP_NEW;
+		*new = 1;
 		break;
 	case BMAPI_ALLOCATE:
 		/* If we found an extent, return it */
 		xfs_iunlock(ip, lockmode);
 		lockmode = 0;
 
-		if (nimaps && !isnullstartblock(imap.br_startblock)) {
-			trace_xfs_iomap_found(ip, offset, count, flags, &imap);
+		if (*nimaps && !isnullstartblock(imap->br_startblock)) {
+			trace_xfs_iomap_found(ip, offset, count, flags, imap);
 			break;
 		}
 
 		error = xfs_iomap_write_allocate(ip, offset, count,
-						 &imap, &nimaps);
+						 imap, nimaps);
 		break;
 	}
 
-	if (nimaps) {
-		*niomaps = xfs_imap_to_bmap(ip, offset, &imap,
-					    iomapp, nimaps, *niomaps, iomap_flags);
-	} else if (niomaps) {
-		*niomaps = 0;
-	}
+	ASSERT(*nimaps <= 1);
 
 out:
 	if (lockmode)
@@ -216,7 +168,6 @@ out:
 	return XFS_ERROR(error);
 }
 
-
 STATIC int
 xfs_iomap_eof_align_last_fsb(
 	xfs_mount_t	*mp,
@@ -285,15 +236,14 @@ xfs_cmn_err_fsblock_zero(
 	return EFSCORRUPTED;
 }
 
-int
+STATIC int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
 	int		flags,
-	xfs_bmbt_irec_t *ret_imap,
-	int		*nmaps,
-	int		found)
+	xfs_bmbt_irec_t *imap,
+	int		*nmaps)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -306,7 +256,6 @@ xfs_iomap_write_direct(
 	int		quota_flag;
 	int		rt;
 	xfs_trans_t	*tp;
-	xfs_bmbt_irec_t imap;
 	xfs_bmap_free_t free_list;
 	uint		qblocks, resblks, resrtextents;
 	int		committed;
@@ -330,10 +279,10 @@ xfs_iomap_write_direct(
 		if (error)
 			goto error_out;
 	} else {
-		if (found && (ret_imap->br_startblock == HOLESTARTBLOCK))
+		if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
-					ret_imap->br_blockcount +
-					ret_imap->br_startoff);
+					imap->br_blockcount +
+					imap->br_startoff);
 	}
 	count_fsb = last_fsb - offset_fsb;
 	ASSERT(count_fsb > 0);
@@ -379,20 +328,22 @@ xfs_iomap_write_direct(
 	if (error)
 		goto error1;
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	bmapi_flag = XFS_BMAPI_WRITE;
 	if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz))
 		bmapi_flag |= XFS_BMAPI_PREALLOC;
 
 	/*
-	 * Issue the xfs_bmapi() call to allocate the blocks
+	 * Issue the xfs_bmapi() call to allocate the blocks.
+	 *
+	 * From this point onwards we overwrite the imap pointer that the
+	 * caller gave to us.
 	 */
 	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
-		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
+		&firstfsb, 0, imap, &nimaps, &free_list);
 	if (error)
 		goto error0;
 
@@ -414,12 +365,11 @@ xfs_iomap_write_direct(
 		goto error_out;
 	}
 
-	if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-		error = xfs_cmn_err_fsblock_zero(ip, &imap);
+	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+		error = xfs_cmn_err_fsblock_zero(ip, imap);
 		goto error_out;
 	}
 
-	*ret_imap = imap;
 	*nmaps = 1;
 	return 0;
 
@@ -470,7 +420,7 @@ xfs_iomap_eof_want_preallocate(
 	imaps = nimaps;
 	firstblock = NULLFSBLOCK;
 	error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
-			  &firstblock, 0, imap, &imaps, NULL, NULL);
+			  &firstblock, 0, imap, &imaps, NULL);
 	if (error)
 		return error;
 	for (n = 0; n < imaps; n++) {
@@ -485,7 +435,7 @@ xfs_iomap_eof_want_preallocate(
 	return 0;
 }
 
-int
+STATIC int
 xfs_iomap_write_delay(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
@@ -545,7 +495,7 @@ retry:
 			  (xfs_filblks_t)(last_fsb - offset_fsb),
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL, NULL);
+			  &nimaps, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -588,12 +538,12 @@ retry:
  * We no longer bother to look at the incoming map - all we have to
  * guarantee is that whatever we allocate fills the required range.
  */
-int
+STATIC int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
 	size_t		count,
-	xfs_bmbt_irec_t *map,
+	xfs_bmbt_irec_t *imap,
 	int		*retmap)
 {
 	xfs_mount_t	*mp = ip->i_mount;
@@ -602,7 +552,6 @@ xfs_iomap_write_allocate(
 	xfs_fsblock_t	first_block;
 	xfs_bmap_free_t	free_list;
 	xfs_filblks_t	count_fsb;
-	xfs_bmbt_irec_t	imap;
 	xfs_trans_t	*tp;
 	int		nimaps, committed;
 	int		error = 0;
@@ -618,8 +567,8 @@ xfs_iomap_write_allocate(
 		return XFS_ERROR(error);
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	count_fsb = map->br_blockcount;
-	map_start_fsb = map->br_startoff;
+	count_fsb = imap->br_blockcount;
+	map_start_fsb = imap->br_startoff;
 
 	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
 
@@ -647,8 +596,7 @@ xfs_iomap_write_allocate(
 			return XFS_ERROR(error);
 		}
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		xfs_bmap_init(&free_list, &first_block);
 
@@ -699,10 +647,15 @@ xfs_iomap_write_allocate(
 			}
 		}
 
-		/* Go get the actual blocks */
+		/*
+		 * Go get the actual blocks.
+		 *
+		 * From this point onwards we overwrite the imap
+		 * pointer that the caller gave to us.
+		 */
 		error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
 				  XFS_BMAPI_WRITE, &first_block, 1,
-				  &imap, &nimaps, &free_list, NULL);
+				  imap, &nimaps, &free_list);
 		if (error)
 			goto trans_cancel;
 
@@ -721,13 +674,12 @@ xfs_iomap_write_allocate(
 		 * See if we were able to allocate an extent that
 		 * covers at least part of the callers request
 		 */
-		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, &imap);
+		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+			return xfs_cmn_err_fsblock_zero(ip, imap);
 
-		if ((offset_fsb >= imap.br_startoff) &&
-		    (offset_fsb < (imap.br_startoff +
-				   imap.br_blockcount))) {
-			*map = imap;
+		if ((offset_fsb >= imap->br_startoff) &&
+		    (offset_fsb < (imap->br_startoff +
+				   imap->br_blockcount))) {
 			*retmap = 1;
 			XFS_STATS_INC(xs_xstrat_quick);
 			return 0;
@@ -737,8 +689,8 @@ xfs_iomap_write_allocate(
 		 * So far we have not mapped the requested part of the
 		 * file, just surrounding data, try again.
 		 */
-		count_fsb -= imap.br_blockcount;
-		map_start_fsb = imap.br_startoff + imap.br_blockcount;
+		count_fsb -= imap->br_blockcount;
+		map_start_fsb = imap->br_startoff + imap->br_blockcount;
 	}
 
 trans_cancel:
@@ -811,8 +763,7 @@ xfs_iomap_write_unwritten(
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	/*
 	 * Modify the unwritten extent state of the buffer.
@@ -821,7 +772,7 @@ xfs_iomap_write_unwritten(
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 			  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-			  1, &imap, &nimaps, &free_list, NULL);
+			  1, &imap, &nimaps, &free_list);
 	if (error)
 		goto error_on_bmapi_transaction;
 
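
The rewrite above drops the intermediate xfs_iomap_t: callers now pass in their own struct xfs_bmbt_irec plus an nimaps count and a "new" flag, and the write paths fill that storage in place instead of copying through a translation step. A compact sketch of the out-parameter shape (simplified stand-in types, not the kernel API):

    #include <stdio.h>

    struct bmbt_irec {              /* stand-in for struct xfs_bmbt_irec */
            long    br_startoff;
            long    br_startblock;
            long    br_blockcount;
    };

    /*
     * Mirrors the new xfs_iomap() contract: the caller owns imap/nimaps/new
     * and the function overwrites them, rather than returning a second,
     * translated mapping structure.
     */
    static int iomap(long offset, long count,
                     struct bmbt_irec *imap, int *nimaps, int *new)
    {
            *new = 0;

            if (*nimaps < 1)
                    return -1;

            /* pretend we had to allocate: fill the caller's mapping in place */
            imap->br_startoff = offset;
            imap->br_startblock = 1000 + offset;
            imap->br_blockcount = count;
            *nimaps = 1;
            *new = 1;               /* replaces the old IOMAP_NEW flag */
            return 0;
    }

    int main(void)
    {
            struct bmbt_irec imap;
            int nimaps = 1, new;

            if (iomap(8, 16, &imap, &nimaps, &new) == 0)
                    printf("maps=%d new=%d startblock=%ld\n",
                           nimaps, new, imap.br_startblock);
            return 0;
    }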
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..7748a430f50d 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,30 +18,16 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
-#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
+/* base extent manipulation calls */
+#define BMAPI_READ	(1 << 0)	/* read extents */
+#define BMAPI_WRITE	(1 << 1)	/* create extents */
+#define BMAPI_ALLOCATE	(1 << 2)	/* delayed allocate to real extents */
 
-
-typedef enum {			/* iomap_flags values */
-	IOMAP_READ =		0,	/* mapping for a read */
-	IOMAP_HOLE =		0x02,	/* mapping covers a hole */
-	IOMAP_DELAY =		0x04,	/* mapping covers delalloc region */
-	IOMAP_REALTIME =	0x10,	/* mapping on the realtime device */
-	IOMAP_UNWRITTEN =	0x20,	/* mapping covers allocated */
-					/* but uninitialized file data */
-	IOMAP_NEW =		0x40	/* just allocate */
-} iomap_flags_t;
-
-typedef enum {
-	/* base extent manipulation calls */
-	BMAPI_READ = (1 << 0),		/* read extents */
-	BMAPI_WRITE = (1 << 1),		/* create extents */
-	BMAPI_ALLOCATE = (1 << 2),	/* delayed allocate to real extents */
-	/* modifiers */
-	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
-	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
-	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
-} bmapi_flags_t;
+/* modifiers */
+#define BMAPI_IGNSTATE	(1 << 4)	/* ignore unwritten state on read */
+#define BMAPI_DIRECT	(1 << 5)	/* direct instead of buffered write */
+#define BMAPI_MMAP	(1 << 6)	/* allocate for mmap write */
+#define BMAPI_TRYLOCK	(1 << 7)	/* non-blocking request */
 
 #define BMAPI_FLAGS \
 	{ BMAPI_READ,	"READ" }, \
@@ -49,46 +35,13 @@ typedef enum {
 	{ BMAPI_ALLOCATE,	"ALLOCATE" }, \
 	{ BMAPI_IGNSTATE,	"IGNSTATE" }, \
 	{ BMAPI_DIRECT,		"DIRECT" }, \
-	{ BMAPI_MMAP,		"MMAP" }, \
 	{ BMAPI_TRYLOCK,	"TRYLOCK" }
 
-/*
- * xfs_iomap_t:  File system I/O map
- *
- * The iomap_bn field is expressed in 512-byte blocks, and is where the
- * mapping starts on disk.
- *
- * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
- * iomap_offset is the offset of the mapping in the file itself.
- * iomap_bsize is the size of the mapping,  iomap_delta is the
- * desired data's offset into the mapping, given the offset supplied
- * to the file I/O map routine.
- *
- * When a request is made to read beyond the logical end of the object,
- * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
- * to the actual amount of underlying storage that has been allocated, if any.
- */
-
-typedef struct xfs_iomap {
-	xfs_daddr_t		iomap_bn;	/* first 512B blk of mapping */
-	xfs_buftarg_t		*iomap_target;
-	xfs_off_t		iomap_offset;	/* offset of mapping, bytes */
-	xfs_off_t		iomap_bsize;	/* size of mapping, bytes */
-	xfs_off_t		iomap_delta;	/* offset into mapping, bytes */
-	iomap_flags_t		iomap_flags;
-} xfs_iomap_t;
-
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
 extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
-		     struct xfs_iomap *, int *);
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
-				  int, struct xfs_bmbt_irec *, int *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
-				 struct xfs_bmbt_irec *, int *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
-				    struct xfs_bmbt_irec *, int *);
+		     struct xfs_bmbt_irec *, int *, int *);
 extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
 
 #endif	/* __XFS_IOMAP_H__*/
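
With the iomap_flags_t/bmapi_flags_t enums gone, the BMAPI_* values are plain bit defines that combine and test with ordinary mask operations, as the switch on (BMAPI_WRITE|BMAPI_ALLOCATE) in xfs_iomap() does. A small standalone sketch of that style (flag values copied from the header above; the dispatch logic is illustrative only):

    #include <stdio.h>

    /* base extent manipulation calls (values as in the new xfs_iomap.h) */
    #define BMAPI_READ      (1 << 0)
    #define BMAPI_WRITE     (1 << 1)
    #define BMAPI_ALLOCATE  (1 << 2)

    /* modifiers */
    #define BMAPI_IGNSTATE  (1 << 4)
    #define BMAPI_DIRECT    (1 << 5)
    #define BMAPI_TRYLOCK   (1 << 7)

    static void dispatch(int flags)
    {
            /* same shape as the switch in xfs_iomap() */
            switch (flags & (BMAPI_WRITE | BMAPI_ALLOCATE)) {
            case BMAPI_WRITE:
                    if (flags & BMAPI_DIRECT)
                            printf("direct write path\n");
                    else
                            printf("buffered (delayed) write path\n");
                    break;
            case BMAPI_ALLOCATE:
                    printf("convert delayed allocation\n");
                    break;
            default:
                    printf("read path\n");
            }
    }

    int main(void)
    {
            dispatch(BMAPI_WRITE | BMAPI_DIRECT);
            dispatch(BMAPI_ALLOCATE);
            dispatch(BMAPI_READ | BMAPI_IGNSTATE);
            return 0;
    }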
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b1b801e4a28e..7e3626e5925c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,20 +24,17 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
38#include "xfs_itable.h" 34#include "xfs_itable.h"
39#include "xfs_error.h" 35#include "xfs_error.h"
40#include "xfs_btree.h" 36#include "xfs_btree.h"
37#include "xfs_trace.h"
41 38
42STATIC int 39STATIC int
43xfs_internal_inum( 40xfs_internal_inum(
@@ -49,24 +46,40 @@ xfs_internal_inum(
 	       (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
 }
 
-STATIC int
-xfs_bulkstat_one_iget(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	xfs_bstat_t	*buf,		/* return buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+	struct xfs_mount	*mp,		/* mount point for filesystem */
+	xfs_ino_t		ino,		/* inode to get data for */
+	void __user		*buffer,	/* buffer to place output in */
+	int			ubsize,		/* size of buffer */
+	bulkstat_one_fmt_pf	formatter,	/* formatter, copy to user */
+	int			*ubused,	/* bytes used by me */
+	int			*stat)		/* BULKSTAT_RV_... */
 {
-	xfs_icdinode_t	*dic;		/* dinode core info pointer */
-	xfs_inode_t	*ip;		/* incore inode pointer */
+	struct xfs_icdinode	*dic;		/* dinode core info pointer */
+	struct xfs_inode	*ip;		/* incore inode pointer */
 	struct inode	*inode;
-	int		error;
+	struct xfs_bstat	*buf;		/* return buffer */
+	int			error = 0;	/* error value */
+
+	*stat = BULKSTAT_RV_NOTHING;
+
+	if (!buffer || xfs_internal_inum(mp, ino))
+		return XFS_ERROR(EINVAL);
+
+	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	if (!buf)
+		return XFS_ERROR(ENOMEM);
 
 	error = xfs_iget(mp, NULL, ino,
-			 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
+			 XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
 	if (error) {
 		*stat = BULKSTAT_RV_NOTHING;
-		return error;
+		goto out_free;
 	}
 
 	ASSERT(ip != NULL);
@@ -127,77 +140,17 @@ xfs_bulkstat_one_iget(
 		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
 		break;
 	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	IRELE(ip);
 
-	xfs_iput(ip, XFS_ILOCK_SHARED);
-	return error;
-}
-
-STATIC void
-xfs_bulkstat_one_dinode(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	xfs_dinode_t	*dic,		/* dinode inode pointer */
-	xfs_bstat_t	*buf)		/* return buffer */
-{
-	/*
-	 * The inode format changed when we moved the link count and
-	 * made it 32 bits long.  If this is an old format inode,
-	 * convert it in memory to look like a new one.  If it gets
-	 * flushed to disk we will convert back before flushing or
-	 * logging it.  We zero out the new projid field and the old link
-	 * count field.  We'll handle clearing the pad field (the remains
-	 * of the old uuid field) when we actually convert the inode to
-	 * the new format. We don't change the version number so that we
-	 * can distinguish this from a real new format inode.
-	 */
-	if (dic->di_version == 1) {
-		buf->bs_nlink = be16_to_cpu(dic->di_onlink);
-		buf->bs_projid = 0;
-	} else {
-		buf->bs_nlink = be32_to_cpu(dic->di_nlink);
-		buf->bs_projid = be16_to_cpu(dic->di_projid);
-	}
+	error = formatter(buffer, ubsize, ubused, buf);
 
-	buf->bs_ino = ino;
-	buf->bs_mode = be16_to_cpu(dic->di_mode);
-	buf->bs_uid = be32_to_cpu(dic->di_uid);
-	buf->bs_gid = be32_to_cpu(dic->di_gid);
-	buf->bs_size = be64_to_cpu(dic->di_size);
-	buf->bs_atime.tv_sec = be32_to_cpu(dic->di_atime.t_sec);
-	buf->bs_atime.tv_nsec = be32_to_cpu(dic->di_atime.t_nsec);
-	buf->bs_mtime.tv_sec = be32_to_cpu(dic->di_mtime.t_sec);
-	buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
-	buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
-	buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
-	buf->bs_xflags = xfs_dic2xflags(dic);
-	buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
-	buf->bs_extents = be32_to_cpu(dic->di_nextents);
-	buf->bs_gen = be32_to_cpu(dic->di_gen);
-	memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
-	buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
-	buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
-	buf->bs_aextents = be16_to_cpu(dic->di_anextents);
-	buf->bs_forkoff = XFS_DFORK_BOFF(dic);
+	if (!error)
+		*stat = BULKSTAT_RV_DIDONE;
 
-	switch (dic->di_format) {
-	case XFS_DINODE_FMT_DEV:
-		buf->bs_rdev = xfs_dinode_get_rdev(dic);
-		buf->bs_blksize = BLKDEV_IOSIZE;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_UUID:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = 0;
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		buf->bs_rdev = 0;
-		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
-		break;
-	}
+ out_free:
+	kmem_free(buf);
+	return error;
 }
 
 /* Return 0 on success or positive error */
@@ -217,118 +170,17 @@ xfs_bulkstat_one_fmt(
 	return 0;
 }
 
-/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
- */
-int				/* error status */
-xfs_bulkstat_one_int(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* buffer to place output in */
-	int		ubsize,		/* size of buffer */
-	bulkstat_one_fmt_pf formatter,	/* formatter, copy to user */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
-	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
-	int		*stat)		/* BULKSTAT_RV_... */
-{
-	xfs_bstat_t	*buf;		/* return buffer */
-	int		error = 0;	/* error value */
-	xfs_dinode_t	*dip;		/* dinode inode pointer */
-
-	dip = (xfs_dinode_t *)dibuff;
-	*stat = BULKSTAT_RV_NOTHING;
-
-	if (!buffer || xfs_internal_inum(mp, ino))
-		return XFS_ERROR(EINVAL);
-
-	buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
-
-	if (dip == NULL) {
-		/* We're not being passed a pointer to a dinode.  This happens
-		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
-		 */
-		error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
-		if (error)
-			goto out_free;
-	} else {
-		xfs_bulkstat_one_dinode(mp, ino, dip, buf);
-	}
-
-	error = formatter(buffer, ubsize, ubused, buf);
-	if (error)
-		goto out_free;
-
-	*stat = BULKSTAT_RV_DIDONE;
-
- out_free:
-	kmem_free(buf);
-	return error;
-}
-
 int
 xfs_bulkstat_one(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
 	xfs_ino_t	ino,		/* inode number to get data for */
 	void		__user *buffer,	/* buffer to place output in */
 	int		ubsize,		/* size of buffer */
-	void		*private_data,	/* my private data */
-	xfs_daddr_t	bno,		/* starting bno of inode cluster */
 	int		*ubused,	/* bytes used by me */
-	void		*dibuff,	/* on-disk inode buffer */
 	int		*stat)		/* BULKSTAT_RV_... */
 {
 	return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-				    xfs_bulkstat_one_fmt, bno,
-				    ubused, dibuff, stat);
-}
-
-/*
- * Test to see whether we can use the ondisk inode directly, based
- * on the given bulkstat flags, filling in dipp accordingly.
- * Returns zero if the inode is dodgey.
- */
-STATIC int
-xfs_bulkstat_use_dinode(
-	xfs_mount_t	*mp,
-	int		flags,
-	xfs_buf_t	*bp,
-	int		clustidx,
-	xfs_dinode_t	**dipp)
-{
-	xfs_dinode_t	*dip;
-	unsigned int	aformat;
-
-	*dipp = NULL;
-	if (!bp || (flags & BULKSTAT_FG_IGET))
-		return 1;
-	dip = (xfs_dinode_t *)
-			xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
-	/*
-	 * Check the buffer containing the on-disk inode for di_mode == 0.
-	 * This is to prevent xfs_bulkstat from picking up just reclaimed
-	 * inodes that have their in-core state initialized but not flushed
-	 * to disk yet. This is a temporary hack that would require a proper
-	 * fix in the future.
-	 */
-	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
-	    !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
-	    !dip->di_mode)
-		return 0;
-	if (flags & BULKSTAT_FG_QUICK) {
-		*dipp = dip;
-		return 1;
-	}
-	/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
-	aformat = dip->di_aformat;
-	if ((XFS_DFORK_Q(dip) == 0) ||
-	    (aformat == XFS_DINODE_FMT_LOCAL) ||
-	    (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
-		*dipp = dip;
-		return 1;
-	}
-	return 1;
+				    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
@@ -342,10 +194,8 @@ xfs_bulkstat(
 	xfs_ino_t		*lastinop, /* last inode returned */
 	int			*ubcountp, /* size of buffer/count returned */
 	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
-	void			*private_data,/* private data for formatter */
 	size_t			statstruct_size, /* sizeof struct filling */
 	char			__user *ubuffer, /* buffer with inode stats */
-	int			flags,	/* defined in xfs_itable.h */
 	int			*done)	/* 1 if there are more stats to get */
 {
 	xfs_agblock_t		agbno=0;/* allocation group block number */
@@ -380,14 +230,12 @@ xfs_bulkstat(
 	int			ubelem;	/* spaces used in user's buffer */
 	int			ubused;	/* bytes used by formatter */
 	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
-	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
 	ino = (xfs_ino_t)*lastinop;
 	lastino = ino;
-	dip = NULL;
 	agno = XFS_INO_TO_AGNO(mp, ino);
 	agino = XFS_INO_TO_AGINO(mp, ino);
 	if (agno >= mp->m_sb.sb_agcount ||
@@ -612,37 +460,6 @@ xfs_bulkstat(
 						irbp->ir_startino) +
 						((chunkidx & nimask) >>
 						 mp->m_sb.sb_inopblog);
-
-					if (flags & (BULKSTAT_FG_QUICK |
-						     BULKSTAT_FG_INLINE)) {
-						int offset;
-
-						ino = XFS_AGINO_TO_INO(mp, agno,
-								       agino);
-						bno = XFS_AGB_TO_DADDR(mp, agno,
-								       agbno);
-
-						/*
-						 * Get the inode cluster buffer
-						 */
-						if (bp)
-							xfs_buf_relse(bp);
-
-						error = xfs_inotobp(mp, NULL, ino, &dip,
-								    &bp, &offset,
-								    XFS_IGET_BULKSTAT);
-
-						if (!error)
-							clustidx = offset / mp->m_sb.sb_inodesize;
-						if (XFS_TEST_ERROR(error != 0,
-								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
-								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
-							bp = NULL;
-							ubleft = 0;
-							rval = error;
-							break;
-						}
-					}
 				}
 				ino = XFS_AGINO_TO_INO(mp, agno, agino);
 				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
@@ -658,35 +475,13 @@ xfs_bulkstat(
 				 * when the chunk is used up.
 				 */
 				irbp->ir_freecount++;
-				if (!xfs_bulkstat_use_dinode(mp, flags, bp,
-							     clustidx, &dip)) {
-					lastino = ino;
-					continue;
-				}
-				/*
-				 * If we need to do an iget, cannot hold bp.
-				 * Drop it, until starting the next cluster.
-				 */
-				if ((flags & BULKSTAT_FG_INLINE) && !dip) {
-					if (bp)
-						xfs_buf_relse(bp);
-					bp = NULL;
-				}
 
 				/*
 				 * Get the inode and fill in a single buffer.
-				 * BULKSTAT_FG_QUICK uses dip to fill it in.
-				 * BULKSTAT_FG_IGET uses igets.
-				 * BULKSTAT_FG_INLINE uses dip if we have an
-				 * inline attr fork, else igets.
-				 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
-				 * This is also used to count inodes/blks, etc
-				 * in xfs_qm_quotacheck.
 				 */
 				ubused = statstruct_size;
-				error = formatter(mp, ino, ubufp,
-						  ubleft, private_data,
-						  bno, &ubused, dip, &fmterror);
+				error = formatter(mp, ino, ubufp, ubleft,
+						  &ubused, &fmterror);
 				if (fmterror == BULKSTAT_RV_NOTHING) {
 					if (error && error != ENOENT &&
 					    error != EINVAL) {
@@ -778,8 +573,7 @@ xfs_bulkstat_single(
 	 */
 
 	ino = (xfs_ino_t)*lastinop;
-	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
-				 NULL, 0, NULL, NULL, &res);
+	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
 	if (error) {
 		/*
 		 * Special case way failed, do it the "long" way
@@ -788,8 +582,7 @@ xfs_bulkstat_single(
 		(*lastinop)--;
 		count = 1;
 		if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
-				 NULL, sizeof(xfs_bstat_t), buffer,
-				 BULKSTAT_FG_IGET, done))
+				 sizeof(xfs_bstat_t), buffer, done))
 			return error;
 		if (count == 0 || (xfs_ino_t)*lastinop != ino)
 			return error == EFSCORRUPTED ?
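
The consolidated xfs_bulkstat_one_int() above allocates its staging buffer with KM_MAYFAIL and funnels every exit through a single out_free label. A stripped-down sketch of that allocate/look-up/format/free flow (user-space stand-ins, not the kernel helpers):

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    struct bstat { long ino; long size; };   /* stand-in for xfs_bstat_t */

    typedef int (*fmt_pf)(void *ubuf, int ubsize, int *ubused,
                          const struct bstat *buf);

    static int bulkstat_one_int(long ino, void *ubuf, int ubsize,
                                fmt_pf formatter, int *ubused, int *done)
    {
            struct bstat    *buf;
            int             error = 0;

            *done = 0;

            buf = malloc(sizeof(*buf));     /* like kmem_alloc(KM_MAYFAIL) */
            if (!buf)
                    return -1;

            /* stand-in for the untrusted inode lookup + stat fill */
            buf->ino = ino;
            buf->size = 4096;

            error = formatter(ubuf, ubsize, ubused, buf);
            if (!error)
                    *done = 1;              /* like BULKSTAT_RV_DIDONE */

            free(buf);                      /* single exit frees the staging buffer */
            return error;
    }

    static int fmt(void *ubuf, int ubsize, int *ubused, const struct bstat *buf)
    {
            if (ubsize < (int)sizeof(*buf))
                    return -1;
            memcpy(ubuf, buf, sizeof(*buf));
            *ubused = sizeof(*buf);
            return 0;
    }

    int main(void)
    {
            struct bstat out; int used, done;

            if (!bulkstat_one_int(128, &out, sizeof(out), fmt, &used, &done))
                    printf("ino=%ld used=%d done=%d\n", out.ino, used, done);
            return 0;
    }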
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 20792bf45946..97295d91d170 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -27,10 +27,7 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
 	xfs_ino_t	ino,
 	void		__user *buffer,
 	int		ubsize,
-	void		*private_data,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dip,
 	int		*stat);
 
 /*
@@ -41,13 +38,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
 #define BULKSTAT_RV_GIVEUP	2
 
 /*
- * Values for bulkstat flag argument.
- */
-#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
-#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_INLINE	0x4	/* No iget if inline attrs */
-
-/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
@@ -56,10 +46,8 @@ xfs_bulkstat(
 	xfs_ino_t	*lastino,	/* last inode returned */
 	int		*count,		/* size of buffer/count returned */
 	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
-	void		*private_data,	/* private data for formatter */
 	size_t		statstruct_size,/* sizeof struct that we're filling */
 	char		__user *ubuffer,/* buffer with inode stats */
-	int		flags,		/* flag to control access method */
 	int		*done);		/* 1 if there are more stats to get */
 
 int
@@ -82,9 +70,7 @@ xfs_bulkstat_one_int(
 	void		__user *buffer,
 	int		ubsize,
 	bulkstat_one_fmt_pf formatter,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dibuff,
 	int		*stat);
 
 int
@@ -93,10 +79,7 @@ xfs_bulkstat_one(
 	xfs_ino_t	ino,
 	void		__user *buffer,
 	int		ubsize,
-	void		*private_data,
-	xfs_daddr_t	bno,
 	int		*ubused,
-	void		*dibuff,
 	int		*stat);
 
 typedef int (*inumbers_fmt_pf)(
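
The trimmed bulkstat_one_pf typedef above loses the private_data, bno and dip parameters, so every formatter now matches one lean shape. A sketch of declaring and calling through such a callback type (simplified user-space types, mp and __user dropped for brevity):

    #include <stdio.h>

    /* shape of the trimmed callback, simplified */
    typedef int (*bulkstat_one_pf)(long ino, void *buffer, int ubsize,
                                   int *ubused, int *stat);

    static int bulkstat_one(long ino, void *buffer, int ubsize,
                            int *ubused, int *stat)
    {
            (void)buffer; (void)ubsize;
            *ubused = 64;
            *stat = 1;                      /* like BULKSTAT_RV_DIDONE */
            printf("stat ino %ld\n", ino);
            return 0;
    }

    /* the walker only needs the function pointer, no private_data cookie */
    static int walk(long first, long last, bulkstat_one_pf formatter)
    {
            char buf[64];
            int used, stat;

            for (long ino = first; ino <= last; ino++)
                    if (formatter(ino, buf, sizeof(buf), &used, &stat))
                            return -1;
            return 0;
    }

    int main(void)
    {
            return walk(100, 102, bulkstat_one);
    }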
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2be019136287..33f718f92a48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,8 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_log_priv.h" 29#include "xfs_log_priv.h"
@@ -35,8 +33,6 @@
35#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
36#include "xfs_log_recover.h" 34#include "xfs_log_recover.h"
37#include "xfs_trans_priv.h" 35#include "xfs_trans_priv.h"
38#include "xfs_dir2_sf.h"
39#include "xfs_attr_sf.h"
40#include "xfs_dinode.h" 36#include "xfs_dinode.h"
41#include "xfs_inode.h" 37#include "xfs_inode.h"
42#include "xfs_rw.h" 38#include "xfs_rw.h"
@@ -44,13 +40,8 @@
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
-#define xlog_write_adv_cnt(ptr, len, off, bytes) \
-	{ (ptr) += (bytes); \
-	  (len) -= (bytes); \
-	  (off) += (bytes);}
-
 /* Local miscellaneous function prototypes */
-STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+STATIC int	 xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
 				    xlog_in_core_t **, xfs_lsn_t *);
 STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 				xfs_buftarg_t	*log_target,
@@ -59,11 +50,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
 STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
 STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void	 xlog_dealloc_log(xlog_t *log);
-STATIC int	 xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
-		    int nentries, struct xlog_ticket *tic,
-		    xfs_lsn_t *start_lsn,
-		    xlog_in_core_t **commit_iclog,
-		    uint flags);
 
 /* local state machine functions */
 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +79,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
 STATIC void xlog_ungrant_log_space(xlog_t	 *log,
 				   xlog_ticket_t *ticket);
 
-
-/* local ticket functions */
-STATIC xlog_ticket_t	*xlog_ticket_alloc(xlog_t *log,
-					 int	unit_bytes,
-					 int	count,
-					 char	clientid,
-					 uint	flags);
-
 #if defined(DEBUG)
-STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+STATIC void	xlog_verify_dest_ptr(xlog_t *log, char *ptr);
 STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
 STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
 				  int count, boolean_t syncing);
@@ -258,7 +236,7 @@ xfs_log_done(
 	 * If we get an error, just continue and give back the log ticket.
 	 */
 	(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
-	 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
+	 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
 		lsn = (xfs_lsn_t) -1;
 		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
 			flags |= XFS_LOG_REL_PERM_RESERV;
@@ -355,7 +333,6 @@ xfs_log_reserve(
 	int		retval = 0;
 
 	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
@@ -367,6 +344,15 @@ xfs_log_reserve(
 		ASSERT(flags & XFS_LOG_PERM_RESERV);
 		internal_ticket = *ticket;
 
+		/*
+		 * this is a new transaction on the ticket, so we need to
+		 * change the transaction ID so that the next transaction has a
+		 * different TID in the log. Just add one to the existing tid
+		 * so that we can see chains of rolling transactions in the log
+		 * easily.
+		 */
+		internal_ticket->t_tid++;
+
 		trace_xfs_log_reserve(log, internal_ticket);
 
 		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +360,8 @@ xfs_log_reserve(
 	} else {
 		/* may sleep if need to allocate more tickets */
 		internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-						  client, flags);
+						  client, flags,
+						  KM_SLEEP|KM_MAYFAIL);
 		if (!internal_ticket)
 			return XFS_ERROR(ENOMEM);
 		internal_ticket->t_trans_type = t_type;
@@ -459,6 +446,13 @@ xfs_log_mount(
 	/* Normal transactions can now occur */
 	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
+	/*
+	 * Now the log has been fully initialised and we know were our
+	 * space grant counters are, we can initialise the permanent ticket
+	 * needed for delayed logging to work.
+	 */
+	xlog_cil_init_post_recovery(mp->m_log);
+
 	return 0;
 
 out_destroy_ail:
@@ -516,18 +510,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 #ifdef DEBUG
 	xlog_in_core_t	 *first_iclog;
 #endif
-	xfs_log_iovec_t  reg[1];
 	xlog_ticket_t	*tic = NULL;
 	xfs_lsn_t	 lsn;
 	int		 error;
 
-	/* the data section must be 32 bit size aligned */
-	struct {
-	    __uint16_t magic;
-	    __uint16_t pad1;
-	    __uint32_t pad2; /* may as well make it 64 bits */
-	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
-
 	/*
 	 * Don't write out unmount record on read-only mounts.
 	 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +535,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	} while (iclog != first_iclog);
 #endif
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
-		reg[0].i_addr = (void*)&magic;
-		reg[0].i_len  = sizeof(magic);
-		reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
-
 		error = xfs_log_reserve(mp, 600, 1, &tic,
 					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
 		if (!error) {
+			/* the data section must be 32 bit size aligned */
+			struct {
+			    __uint16_t magic;
+			    __uint16_t pad1;
+			    __uint32_t pad2; /* may as well make it 64 bits */
+			} magic = {
+				.magic = XLOG_UNMOUNT_TYPE,
+			};
+			struct xfs_log_iovec reg = {
+				.i_addr = &magic,
+				.i_len = sizeof(magic),
+				.i_type = XLOG_REG_TYPE_UNMOUNT,
+			};
+			struct xfs_log_vec vec = {
+				.lv_niovecs = 1,
+				.lv_iovecp = &reg,
+			};
+
 			/* remove inited flag */
-			((xlog_ticket_t *)tic)->t_flags = 0;
-			error = xlog_write(mp, reg, 1, tic, &lsn,
+			tic->t_flags = 0;
+			error = xlog_write(log, &vec, tic, &lsn,
 					   NULL, XLOG_UNMOUNT_TRANS);
 			/*
 			 * At this point, we're umounting anyway,
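
The hunk above swaps field-by-field assignment of a reg[1] array for on-stack structs with designated initializers, chaining an xfs_log_iovec into an xfs_log_vec. A tiny standalone illustration of that initializer style (local stand-in types, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>

    struct log_iovec {              /* stand-in for struct xfs_log_iovec */
            void            *i_addr;
            int             i_len;
            int             i_type;
    };

    struct log_vec {                /* stand-in for struct xfs_log_vec */
            int                     lv_niovecs;
            struct log_iovec        *lv_iovecp;
    };

    int main(void)
    {
            /* the data section must be 32 bit size aligned */
            struct {
                    uint16_t magic;
                    uint16_t pad1;
                    uint32_t pad2;
            } magic = {
                    .magic = 0x554e,        /* stand-in for XLOG_UNMOUNT_TYPE */
            };                              /* unnamed fields are zeroed implicitly */
            struct log_iovec reg = {
                    .i_addr = &magic,
                    .i_len = sizeof(magic),
                    .i_type = 1,
            };
            struct log_vec vec = {
                    .lv_niovecs = 1,
                    .lv_iovecp = &reg,
            };

            printf("%d iovec(s), first is %d bytes\n",
                   vec.lv_niovecs, vec.lv_iovecp[0].i_len);
            return 0;
    }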
@@ -648,10 +648,30 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xlog_dealloc_log(mp->m_log);
 }
 
+void
+xfs_log_item_init(
+	struct xfs_mount	*mp,
+	struct xfs_log_item	*item,
+	int			type,
+	struct xfs_item_ops	*ops)
+{
+	item->li_mountp = mp;
+	item->li_ailp = mp->m_ail;
+	item->li_type = type;
+	item->li_ops = ops;
+	item->li_lv = NULL;
+
+	INIT_LIST_HEAD(&item->li_ail);
+	INIT_LIST_HEAD(&item->li_cil);
+}
+
 /*
  * Write region vectors to log. The write happens using the space reservation
  * of the ticket (tic).  It is not a requirement that all writes for a given
- * transaction occur with one call to xfs_log_write().
+ * transaction occur with one call to xfs_log_write(). However, it is important
+ * to note that the transaction reservation code makes an assumption about the
+ * number of log headers a transaction requires that may be violated if you
+ * don't pass all the transaction vectors in one call....
  */
 int
 xfs_log_write(
@@ -663,11 +683,15 @@ xfs_log_write(
 {
 	struct log		*log = mp->m_log;
 	int			error;
+	struct xfs_log_vec	vec = {
+		.lv_niovecs = nentries,
+		.lv_iovecp = reg,
+	};
 
 	if (XLOG_FORCED_SHUTDOWN(log))
 		return XFS_ERROR(EIO);
 
-	error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
+	error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
 	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	return error;
@@ -1018,8 +1042,8 @@ xlog_alloc_log(xfs_mount_t *mp,
 	xlog_in_core_t		*iclog, *prev_iclog=NULL;
 	xfs_buf_t		*bp;
 	int			i;
-	int			iclogsize;
 	int			error = ENOMEM;
+	uint			log2_size = 0;
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
 	if (!log) {
@@ -1045,29 +1069,30 @@ xlog_alloc_log(xfs_mount_t *mp,
 
 	error = EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
-		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-		if (log->l_sectbb_log < 0 ||
-		    log->l_sectbb_log > mp->m_sectbb_log) {
-			xlog_warn("XFS: Log sector size (0x%x) out of range.",
-						log->l_sectbb_log);
+		log2_size = mp->m_sb.sb_logsectlog;
+		if (log2_size < BBSHIFT) {
+			xlog_warn("XFS: Log sector size too small "
+				"(0x%x < 0x%x)", log2_size, BBSHIFT);
 			goto out_free_log;
 		}
 
-		/* for larger sector sizes, must have v2 or external log */
-		if (log->l_sectbb_log != 0 &&
-		    (log->l_logBBstart != 0 &&
-		     !xfs_sb_version_haslogv2(&mp->m_sb))) {
-			xlog_warn("XFS: log sector size (0x%x) invalid "
-				  "for configuration.", log->l_sectbb_log);
+		log2_size -= BBSHIFT;
+		if (log2_size > mp->m_sectbb_log) {
+			xlog_warn("XFS: Log sector size too large "
+				"(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
 			goto out_free_log;
 		}
-		if (mp->m_sb.sb_logsectlog < BBSHIFT) {
-			xlog_warn("XFS: Log sector log (0x%x) too small.",
-						mp->m_sb.sb_logsectlog);
+
+		/* for larger sector sizes, must have v2 or external log */
+		if (log2_size && log->l_logBBstart > 0 &&
+		    !xfs_sb_version_haslogv2(&mp->m_sb)) {
+
+			xlog_warn("XFS: log sector size (0x%x) invalid "
+				  "for configuration.", log2_size);
 			goto out_free_log;
 		}
 	}
-	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
+	log->l_sectBBsize = 1 << log2_size;
 
 	xlog_get_iclog_buffer_size(mp, log);
 
@@ -1096,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t *mp,
 	 * with different amounts of memory.  See the definition of
 	 * xlog_in_core_t in xfs_log_priv.h for details.
 	 */
-	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
 		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
@@ -1147,6 +1171,9 @@ xlog_alloc_log(xfs_mount_t *mp,
 	*iclogp = log->l_iclog;			/* complete ring */
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
+	error = xlog_cil_init(log);
+	if (error)
+		goto out_free_iclog;
 	return log;
 
 out_free_iclog:
@@ -1174,26 +1201,31 @@ out:
  * ticket. Return the lsn of the commit record.
  */
 STATIC int
-xlog_commit_record(xfs_mount_t *mp,
-		   xlog_ticket_t *ticket,
-		   xlog_in_core_t **iclog,
-		   xfs_lsn_t *commitlsnp)
+xlog_commit_record(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xlog_in_core	**iclog,
+	xfs_lsn_t		*commitlsnp)
 {
-	int		error;
-	xfs_log_iovec_t	reg[1];
-
-	reg[0].i_addr = NULL;
-	reg[0].i_len = 0;
-	reg[0].i_type = XLOG_REG_TYPE_COMMIT;
+	struct xfs_mount *mp = log->l_mp;
+	int	error;
+	struct xfs_log_iovec reg = {
+		.i_addr = NULL,
+		.i_len = 0,
+		.i_type = XLOG_REG_TYPE_COMMIT,
+	};
+	struct xfs_log_vec vec = {
+		.lv_niovecs = 1,
+		.lv_iovecp = &reg,
+	};
 
 	ASSERT_ALWAYS(iclog);
-	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
-			       iclog, XLOG_COMMIT_TRANS))) {
+	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+			   XLOG_COMMIT_TRANS);
+	if (error)
 		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-	}
 	return error;
-} /* xlog_commit_record */
-
+}
 
 /*
  * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1389,11 +1421,8 @@ xlog_sync(xlog_t *log,
 	XFS_BUF_BUSY(bp);
 	XFS_BUF_ASYNC(bp);
 	bp->b_flags |= XBF_LOG_BUFFER;
-	/*
-	 * Do an ordered write for the log block.
-	 * Its unnecessary to flush the first split block in the log wrap case.
-	 */
-	if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER))
+
+	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 		XFS_BUF_ORDERED(bp);
 
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
@@ -1468,6 +1497,8 @@ xlog_dealloc_log(xlog_t *log)
 	xlog_in_core_t	*iclog, *next_iclog;
 	int		i;
 
+	xlog_cil_destroy(log);
+
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
 		sv_destroy(&iclog->ic_force_wait);
1473 sv_destroy(&iclog->ic_force_wait); 1504 sv_destroy(&iclog->ic_force_wait);
@@ -1510,8 +1541,10 @@ xlog_state_finish_copy(xlog_t *log,
1510 * print out info relating to regions written which consume 1541 * print out info relating to regions written which consume
1511 * the reservation 1542 * the reservation
1512 */ 1543 */
1513STATIC void 1544void
1514xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1545xlog_print_tic_res(
1546 struct xfs_mount *mp,
1547 struct xlog_ticket *ticket)
1515{ 1548{
1516 uint i; 1549 uint i;
1517 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1550 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1611,6 +1644,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1611 "bad-rtype" : res_type_str[r_type-1]), 1644 "bad-rtype" : res_type_str[r_type-1]),
1612 ticket->t_res_arr[i].r_len); 1645 ticket->t_res_arr[i].r_len);
1613 } 1646 }
1647
1648 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1649 "xfs_log_write: reservation ran out. Need to up reservation");
1650 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1651}
1652
1653/*
1654 * Calculate the potential space needed by the log vector. Each region gets
1655 * its own xlog_op_header_t and may need to be double word aligned.
1656 */
1657static int
1658xlog_write_calc_vec_length(
1659 struct xlog_ticket *ticket,
1660 struct xfs_log_vec *log_vector)
1661{
1662 struct xfs_log_vec *lv;
1663 int headers = 0;
1664 int len = 0;
1665 int i;
1666
1667 /* acct for start rec of xact */
1668 if (ticket->t_flags & XLOG_TIC_INITED)
1669 headers++;
1670
1671 for (lv = log_vector; lv; lv = lv->lv_next) {
1672 headers += lv->lv_niovecs;
1673
1674 for (i = 0; i < lv->lv_niovecs; i++) {
1675 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
1676
1677 len += vecp->i_len;
1678 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
1679 }
1680 }
1681
1682 ticket->t_res_num_ophdrs += headers;
1683 len += headers * sizeof(struct xlog_op_header);
1684
1685 return len;
1686}
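
The helper above replaces an open-coded loop in xlog_write(). A minimal user-space sketch of the same accounting, using simplified stand-in structures and an assumed 12-byte op header (the real sizes come from the private log headers):

#include <stdio.h>

/* simplified stand-ins for the kernel structures */
struct iovec_s { int i_len; };
struct logvec_s {
	struct logvec_s	*lv_next;
	int		lv_niovecs;
	struct iovec_s	*lv_iovecp;
};

#define OPHDR_SIZE 12	/* assumed size of the on-disk op header */

static int calc_vec_length(struct logvec_s *log_vector, int need_start_rec)
{
	int headers = need_start_rec ? 1 : 0;	/* start record ophdr */
	int len = 0;

	for (struct logvec_s *lv = log_vector; lv; lv = lv->lv_next) {
		headers += lv->lv_niovecs;	/* one ophdr per region */
		for (int i = 0; i < lv->lv_niovecs; i++)
			len += lv->lv_iovecp[i].i_len;
	}
	return len + headers * OPHDR_SIZE;
}

int main(void)
{
	struct iovec_s regs[2] = { { .i_len = 128 }, { .i_len = 64 } };
	struct logvec_s lv = { .lv_niovecs = 2, .lv_iovecp = regs };

	/* 192 payload bytes plus 3 op headers (start rec + 2 regions) */
	printf("%d\n", calc_vec_length(&lv, 1));	/* prints 228 */
	return 0;
}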
1687
1688/*
1689 * If this is the first write for the transaction, insert the start record. We
1690 * can't be trying to commit if we are inited. We can't have any "partial_copy" if we are inited.
1691 */
1692static int
1693xlog_write_start_rec(
1694 struct xlog_op_header *ophdr,
1695 struct xlog_ticket *ticket)
1696{
1697 if (!(ticket->t_flags & XLOG_TIC_INITED))
1698 return 0;
1699
1700 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1701 ophdr->oh_clientid = ticket->t_clientid;
1702 ophdr->oh_len = 0;
1703 ophdr->oh_flags = XLOG_START_TRANS;
1704 ophdr->oh_res2 = 0;
1705
1706 ticket->t_flags &= ~XLOG_TIC_INITED;
1707
1708 return sizeof(struct xlog_op_header);
1709}
1710
1711static xlog_op_header_t *
1712xlog_write_setup_ophdr(
1713 struct log *log,
1714 struct xlog_op_header *ophdr,
1715 struct xlog_ticket *ticket,
1716 uint flags)
1717{
1718 ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
1719 ophdr->oh_clientid = ticket->t_clientid;
1720 ophdr->oh_res2 = 0;
1721
1722 /* are we copying a commit or unmount record? */
1723 ophdr->oh_flags = flags;
1724
1725 /*
1726 * We've seen logs corrupted with bad transaction client ids. This
1727 * makes sure that XFS doesn't generate them on disk. If we spot one,
1728 * turn it into an EIO and shut down the filesystem.
1729 */
1730 switch (ophdr->oh_clientid) {
1731 case XFS_TRANSACTION:
1732 case XFS_VOLUME:
1733 case XFS_LOG:
1734 break;
1735 default:
1736 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1737 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1738 ophdr->oh_clientid, ticket);
1739 return NULL;
1740 }
1741
1742 return ophdr;
1743}
1744
1745/*
1746 * Set up the parameters of the region copy into the log. This has
1747 * to handle writes of a region split across multiple log buffers - this
1748 * state is kept external to this function so that the code can
1749 * be written in an obvious, self-documenting manner.
1750 */
1751static int
1752xlog_write_setup_copy(
1753 struct xlog_ticket *ticket,
1754 struct xlog_op_header *ophdr,
1755 int space_available,
1756 int space_required,
1757 int *copy_off,
1758 int *copy_len,
1759 int *last_was_partial_copy,
1760 int *bytes_consumed)
1761{
1762 int still_to_copy;
1763
1764 still_to_copy = space_required - *bytes_consumed;
1765 *copy_off = *bytes_consumed;
1766
1767 if (still_to_copy <= space_available) {
1768 /* write of region completes here */
1769 *copy_len = still_to_copy;
1770 ophdr->oh_len = cpu_to_be32(*copy_len);
1771 if (*last_was_partial_copy)
1772 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1773 *last_was_partial_copy = 0;
1774 *bytes_consumed = 0;
1775 return 0;
1776 }
1777
1778 /* partial write of region, needs extra log op header reservation */
1779 *copy_len = space_available;
1780 ophdr->oh_len = cpu_to_be32(*copy_len);
1781 ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
1782 if (*last_was_partial_copy)
1783 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
1784 *bytes_consumed += *copy_len;
1785 (*last_was_partial_copy)++;
1786
1787 /* account for new log op header */
1788 ticket->t_curr_res -= sizeof(struct xlog_op_header);
1789 ticket->t_res_num_ophdrs++;
1790
1791 return sizeof(struct xlog_op_header);
1792}
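
Because the split state lives in the caller, the helper above behaves as a small state machine. A hypothetical user-space model of one region driven through two undersized buffers, with the XLOG_* flags replaced by plain bits and a 12-byte op header assumed:

#include <stdio.h>

#define F_CONT 0x1	/* stands in for XLOG_CONTINUE_TRANS */
#define F_WAS  0x2	/* stands in for XLOG_WAS_CONT_TRANS */
#define F_END  0x4	/* stands in for XLOG_END_TRANS */

/* returns extra ophdr bytes consumed, 0 when the region completes */
static int setup_copy(int space_avail, int space_reqd,
		      int *copy_off, int *copy_len,
		      int *was_partial, int *consumed, int *flags)
{
	int still_to_copy = space_reqd - *consumed;

	*copy_off = *consumed;
	if (still_to_copy <= space_avail) {	/* write completes here */
		*copy_len = still_to_copy;
		if (*was_partial)
			*flags |= F_END | F_WAS;
		*was_partial = 0;
		*consumed = 0;
		return 0;
	}
	*copy_len = space_avail;		/* partial write of region */
	*flags |= F_CONT;
	if (*was_partial)
		*flags |= F_WAS;
	*consumed += *copy_len;
	(*was_partial)++;
	return 12;	/* modelled extra op header reservation */
}

int main(void)
{
	int off, len, partial = 0, consumed = 0;

	for (int buf = 0; buf < 2; buf++) {
		int flags = 0;
		setup_copy(100, 150, &off, &len, &partial, &consumed, &flags);
		printf("buf %d: off=%d len=%d flags=%#x\n", buf, off, len, flags);
	}
	/* buf 0: off=0 len=100 flags=0x1 ; buf 1: off=100 len=50 flags=0x6 */
	return 0;
}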
1793
1794static int
1795xlog_write_copy_finish(
1796 struct log *log,
1797 struct xlog_in_core *iclog,
1798 uint flags,
1799 int *record_cnt,
1800 int *data_cnt,
1801 int *partial_copy,
1802 int *partial_copy_len,
1803 int log_offset,
1804 struct xlog_in_core **commit_iclog)
1805{
1806 if (*partial_copy) {
1807 /*
1808 * This iclog has already been marked WANT_SYNC by
1809 * xlog_state_get_iclog_space.
1810 */
1811 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1812 *record_cnt = 0;
1813 *data_cnt = 0;
1814 return xlog_state_release_iclog(log, iclog);
1815 }
1816
1817 *partial_copy = 0;
1818 *partial_copy_len = 0;
1819
1820 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1821 /* no more space in this iclog - push it. */
1822 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
1823 *record_cnt = 0;
1824 *data_cnt = 0;
1825
1826 spin_lock(&log->l_icloglock);
1827 xlog_state_want_sync(log, iclog);
1828 spin_unlock(&log->l_icloglock);
1829
1830 if (!commit_iclog)
1831 return xlog_state_release_iclog(log, iclog);
1832 ASSERT(flags & XLOG_COMMIT_TRANS);
1833 *commit_iclog = iclog;
1834 }
1835
1836 return 0;
1614} 1837}
1615 1838
1616/* 1839/*
@@ -1653,211 +1876,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1653 * we don't update ic_offset until the end when we know exactly how many 1876 * we don't update ic_offset until the end when we know exactly how many
1654 * bytes have been written out. 1877 * bytes have been written out.
1655 */ 1878 */
1656STATIC int 1879int
1657xlog_write( 1880xlog_write(
1658 struct xfs_mount *mp, 1881 struct log *log,
1659 struct xfs_log_iovec reg[], 1882 struct xfs_log_vec *log_vector,
1660 int nentries,
1661 struct xlog_ticket *ticket, 1883 struct xlog_ticket *ticket,
1662 xfs_lsn_t *start_lsn, 1884 xfs_lsn_t *start_lsn,
1663 struct xlog_in_core **commit_iclog, 1885 struct xlog_in_core **commit_iclog,
1664 uint flags) 1886 uint flags)
1665{ 1887{
1666 xlog_t *log = mp->m_log; 1888 struct xlog_in_core *iclog = NULL;
1667 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1889 struct xfs_log_iovec *vecp;
1668 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1890 struct xfs_log_vec *lv;
1669 __psint_t ptr; /* copy address into data region */ 1891 int len;
1670 int len; /* # xlog_write() bytes 2 still copy */ 1892 int index;
1671 int index; /* region index currently copying */ 1893 int partial_copy = 0;
1672 int log_offset; /* offset (from 0) into data region */ 1894 int partial_copy_len = 0;
1673 int start_rec_copy; /* # bytes to copy for start record */ 1895 int contwr = 0;
1674 int partial_copy; /* did we split a region? */ 1896 int record_cnt = 0;
1675 int partial_copy_len;/* # bytes copied if split region */ 1897 int data_cnt = 0;
1676 int need_copy; /* # bytes need to memcpy this region */ 1898 int error;
1677 int copy_len; /* # bytes actually memcpy'ing */
1678 int copy_off; /* # bytes from entry start */
1679 int contwr; /* continued write of in-core log? */
1680 int error;
1681 int record_cnt = 0, data_cnt = 0;
1682
1683 partial_copy_len = partial_copy = 0;
1684
1685 /* Calculate potential maximum space. Each region gets its own
1686 * xlog_op_header_t and may need to be double word aligned.
1687 */
1688 len = 0;
1689 if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */
1690 len += sizeof(xlog_op_header_t);
1691 ticket->t_res_num_ophdrs++;
1692 }
1693
1694 for (index = 0; index < nentries; index++) {
1695 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1696 ticket->t_res_num_ophdrs++;
1697 len += reg[index].i_len;
1698 xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
1699 }
1700 contwr = *start_lsn = 0;
1701 1899
1702 if (ticket->t_curr_res < len) { 1900 *start_lsn = 0;
1703 xlog_print_tic_res(mp, ticket);
1704#ifdef DEBUG
1705 xlog_panic(
1706 "xfs_log_write: reservation ran out. Need to up reservation");
1707#else
1708 /* Customer configurable panic */
1709 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1710 "xfs_log_write: reservation ran out. Need to up reservation");
1711 /* If we did not panic, shutdown the filesystem */
1712 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1713#endif
1714 } else
1715 ticket->t_curr_res -= len;
1716 1901
1717 for (index = 0; index < nentries; ) { 1902 len = xlog_write_calc_vec_length(ticket, log_vector);
1718 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 1903 if (log->l_cilp) {
1719 &contwr, &log_offset))) 1904 /*
1720 return error; 1905 * Region headers and bytes are already accounted for.
1906 * We only need to take into account start records and
1907 * split regions in this function.
1908 */
1909 if (ticket->t_flags & XLOG_TIC_INITED)
1910 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1721 1911
1722 ASSERT(log_offset <= iclog->ic_size - 1); 1912 /*
1723 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); 1913 * Commit record headers need to be accounted for. These
1914 * come in as separate writes so are easy to detect.
1915 */
1916 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1917 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1918 } else
1919 ticket->t_curr_res -= len;
1920
1921 if (ticket->t_curr_res < 0)
1922 xlog_print_tic_res(log->l_mp, ticket);
1923
1924 index = 0;
1925 lv = log_vector;
1926 vecp = lv->lv_iovecp;
1927 while (lv && index < lv->lv_niovecs) {
1928 void *ptr;
1929 int log_offset;
1930
1931 error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1932 &contwr, &log_offset);
1933 if (error)
1934 return error;
1724 1935
1725 /* start_lsn is the first lsn written to. That's all we need. */ 1936 ASSERT(log_offset <= iclog->ic_size - 1);
1726 if (! *start_lsn) 1937 ptr = iclog->ic_datap + log_offset;
1727 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1728 1938
1729 /* This loop writes out as many regions as can fit in the amount 1939 /* start_lsn is the first lsn written to. That's all we need. */
1730 * of space which was allocated by xlog_state_get_iclog_space(). 1940 if (!*start_lsn)
1731 */ 1941 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
1732 while (index < nentries) {
1733 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1734 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1735 start_rec_copy = 0;
1736
1737 /* If first write for transaction, insert start record.
1738 * We can't be trying to commit if we are inited. We can't
1739 * have any "partial_copy" if we are inited.
1740 */
1741 if (ticket->t_flags & XLOG_TIC_INITED) {
1742 logop_head = (xlog_op_header_t *)ptr;
1743 logop_head->oh_tid = cpu_to_be32(ticket->t_tid);
1744 logop_head->oh_clientid = ticket->t_clientid;
1745 logop_head->oh_len = 0;
1746 logop_head->oh_flags = XLOG_START_TRANS;
1747 logop_head->oh_res2 = 0;
1748 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1749 record_cnt++;
1750
1751 start_rec_copy = sizeof(xlog_op_header_t);
1752 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1753 }
1754 1942
1755 /* Copy log operation header directly into data section */ 1943 /*
1756 logop_head = (xlog_op_header_t *)ptr; 1944 * This loop writes out as many regions as can fit in the amount
1757 logop_head->oh_tid = cpu_to_be32(ticket->t_tid); 1945 * of space which was allocated by xlog_state_get_iclog_space().
1758 logop_head->oh_clientid = ticket->t_clientid; 1946 */
1759 logop_head->oh_res2 = 0; 1947 while (lv && index < lv->lv_niovecs) {
1948 struct xfs_log_iovec *reg = &vecp[index];
1949 struct xlog_op_header *ophdr;
1950 int start_rec_copy;
1951 int copy_len;
1952 int copy_off;
1953
1954 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
1955 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
1956
1957 start_rec_copy = xlog_write_start_rec(ptr, ticket);
1958 if (start_rec_copy) {
1959 record_cnt++;
1960 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1961 start_rec_copy);
1962 }
1760 1963
1761 /* header copied directly */ 1964 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
1762 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); 1965 if (!ophdr)
1966 return XFS_ERROR(EIO);
1763 1967
1764 /* are we copying a commit or unmount record? */ 1968 xlog_write_adv_cnt(&ptr, &len, &log_offset,
1765 logop_head->oh_flags = flags; 1969 sizeof(struct xlog_op_header));
1970
1971 len += xlog_write_setup_copy(ticket, ophdr,
1972 iclog->ic_size-log_offset,
1973 reg->i_len,
1974 &copy_off, &copy_len,
1975 &partial_copy,
1976 &partial_copy_len);
1977 xlog_verify_dest_ptr(log, ptr);
1978
1979 /* copy region */
1980 ASSERT(copy_len >= 0);
1981 memcpy(ptr, reg->i_addr + copy_off, copy_len);
1982 xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
1983
1984 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1985 record_cnt++;
1986 data_cnt += contwr ? copy_len : 0;
1987
1988 error = xlog_write_copy_finish(log, iclog, flags,
1989 &record_cnt, &data_cnt,
1990 &partial_copy,
1991 &partial_copy_len,
1992 log_offset,
1993 commit_iclog);
1994 if (error)
1995 return error;
1766 1996
1767 /* 1997 /*
1768 * We've seen logs corrupted with bad transaction client 1998 * if we had a partial copy, we need to get more iclog
1769 * ids. This makes sure that XFS doesn't generate them on. 1999 * space but we don't want to increment the region
1770 * Turn this into an EIO and shut down the filesystem. 2000 * index because there is still more in this region to
1771 */ 2001 * write.
1772 switch (logop_head->oh_clientid) { 2002 *
1773 case XFS_TRANSACTION: 2003 * If we completed writing this region, and we flushed
1774 case XFS_VOLUME: 2004 * the iclog (indicated by resetting of the record
1775 case XFS_LOG: 2005 * count), then we also need to get more log space. If
1776 break; 2006 * this was the last record, though, we are done and
1777 default: 2007 * can just return.
1778 xfs_fs_cmn_err(CE_WARN, mp, 2008 */
1779 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 2009 if (partial_copy)
1780 logop_head->oh_clientid, ticket); 2010 break;
1781 return XFS_ERROR(EIO);
1782 }
1783 2011
1784 /* Partial write last time? => (partial_copy != 0) 2012 if (++index == lv->lv_niovecs) {
1785 * need_copy is the amount we'd like to copy if everything could 2013 lv = lv->lv_next;
1786 * fit in the current memcpy. 2014 index = 0;
1787 */ 2015 if (lv)
1788 need_copy = reg[index].i_len - partial_copy_len; 2016 vecp = lv->lv_iovecp;
1789 2017 }
1790 copy_off = partial_copy_len; 2018 if (record_cnt == 0) {
1791 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ 2019 if (!lv)
1792 copy_len = need_copy; 2020 return 0;
1793 logop_head->oh_len = cpu_to_be32(copy_len); 2021 break;
1794 if (partial_copy) 2022 }
1795 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1796 partial_copy_len = partial_copy = 0;
1797 } else { /* partial write */
1798 copy_len = iclog->ic_size - log_offset;
1799 logop_head->oh_len = cpu_to_be32(copy_len);
1800 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1801 if (partial_copy)
1802 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1803 partial_copy_len += copy_len;
1804 partial_copy++;
1805 len += sizeof(xlog_op_header_t); /* from splitting of region */
1806 /* account for new log op header */
1807 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1808 ticket->t_res_num_ophdrs++;
1809 }
1810 xlog_verify_dest_ptr(log, ptr);
1811
1812 /* copy region */
1813 ASSERT(copy_len >= 0);
1814 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1815 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1816
1817 /* make copy_len total bytes copied, including headers */
1818 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1819 record_cnt++;
1820 data_cnt += contwr ? copy_len : 0;
1821 if (partial_copy) { /* copied partial region */
1822 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1823 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1824 record_cnt = data_cnt = 0;
1825 if ((error = xlog_state_release_iclog(log, iclog)))
1826 return error;
1827 break; /* don't increment index */
1828 } else { /* copied entire region */
1829 index++;
1830 partial_copy_len = partial_copy = 0;
1831
1832 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1833 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1834 record_cnt = data_cnt = 0;
1835 spin_lock(&log->l_icloglock);
1836 xlog_state_want_sync(log, iclog);
1837 spin_unlock(&log->l_icloglock);
1838 if (commit_iclog) {
1839 ASSERT(flags & XLOG_COMMIT_TRANS);
1840 *commit_iclog = iclog;
1841 } else if ((error = xlog_state_release_iclog(log, iclog)))
1842 return error;
1843 if (index == nentries)
1844 return 0; /* we are done */
1845 else
1846 break;
1847 } 2023 }
1848 } /* if (partial_copy) */ 2024 }
1849 } /* while (index < nentries) */ 2025
1850 } /* for (index = 0; index < nentries; ) */ 2026 ASSERT(len == 0);
1851 ASSERT(len == 0); 2027
2028 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2029 if (!commit_iclog)
2030 return xlog_state_release_iclog(log, iclog);
1852 2031
1853 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1854 if (commit_iclog) {
1855 ASSERT(flags & XLOG_COMMIT_TRANS); 2032 ASSERT(flags & XLOG_COMMIT_TRANS);
1856 *commit_iclog = iclog; 2033 *commit_iclog = iclog;
1857 return 0; 2034 return 0;
1858 } 2035}
1859 return xlog_state_release_iclog(log, iclog);
1860} /* xlog_write */
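
For callers, the visible change is that xlog_write() now takes a chain of struct xfs_log_vec rather than a flat iovec array plus a count. A sketch of the new calling convention, mirroring the converted xlog_commit_record() at the top of this hunk (not standalone; it relies on the declarations this patch adds to xfs_log.h):

	/* sketch only - mirrors the converted xlog_commit_record() above */
	struct xfs_log_iovec reg = {
		.i_addr = NULL,
		.i_len  = 0,
		.i_type = XLOG_REG_TYPE_COMMIT,
	};
	struct xfs_log_vec vec = {
		.lv_niovecs = 1,
		.lv_iovecp  = &reg,
	};

	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
			   XLOG_COMMIT_TRANS);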
1861 2036
1862 2037
1863/***************************************************************************** 2038/*****************************************************************************
@@ -2840,6 +3015,9 @@ _xfs_log_force(
2840 3015
2841 XFS_STATS_INC(xs_log_force); 3016 XFS_STATS_INC(xs_log_force);
2842 3017
3018 if (log->l_cilp)
3019 xlog_cil_force(log);
3020
2843 spin_lock(&log->l_icloglock); 3021 spin_lock(&log->l_icloglock);
2844 3022
2845 iclog = log->l_iclog; 3023 iclog = log->l_iclog;
@@ -2989,6 +3167,12 @@ _xfs_log_force_lsn(
2989 3167
2990 XFS_STATS_INC(xs_log_force); 3168 XFS_STATS_INC(xs_log_force);
2991 3169
3170 if (log->l_cilp) {
3171 lsn = xlog_cil_force_lsn(log, lsn);
3172 if (lsn == NULLCOMMITLSN)
3173 return 0;
3174 }
3175
2992try_again: 3176try_again:
2993 spin_lock(&log->l_icloglock); 3177 spin_lock(&log->l_icloglock);
2994 iclog = log->l_iclog; 3178 iclog = log->l_iclog;
@@ -3153,20 +3337,30 @@ xfs_log_ticket_get(
3153 return ticket; 3337 return ticket;
3154} 3338}
3155 3339
3340xlog_tid_t
3341xfs_log_get_trans_ident(
3342 struct xfs_trans *tp)
3343{
3344 return tp->t_ticket->t_tid;
3345}
3346
3156/* 3347/*
3157 * Allocate and initialise a new log ticket. 3348 * Allocate and initialise a new log ticket.
3158 */ 3349 */
3159STATIC xlog_ticket_t * 3350xlog_ticket_t *
3160xlog_ticket_alloc(xlog_t *log, 3351xlog_ticket_alloc(
3161 int unit_bytes, 3352 struct log *log,
3162 int cnt, 3353 int unit_bytes,
3163 char client, 3354 int cnt,
3164 uint xflags) 3355 char client,
3356 uint xflags,
3357 int alloc_flags)
3165{ 3358{
3166 xlog_ticket_t *tic; 3359 struct xlog_ticket *tic;
3167 uint num_headers; 3360 uint num_headers;
3361 int iclog_space;
3168 3362
3169 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3363 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3170 if (!tic) 3364 if (!tic)
3171 return NULL; 3365 return NULL;
3172 3366
@@ -3208,16 +3402,40 @@ xlog_ticket_alloc(xlog_t *log,
3208 /* for start-rec */ 3402 /* for start-rec */
3209 unit_bytes += sizeof(xlog_op_header_t); 3403 unit_bytes += sizeof(xlog_op_header_t);
3210 3404
3211 /* for LR headers */ 3405 /*
3212 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); 3406 * for LR headers - the space for data in an iclog is the size minus
3407 * the space used for the headers. If we use the iclog size, then we
3408 * undercalculate the number of headers required.
3409 *
3410 * Furthermore - the addition of op headers for split-recs might
3411 * increase the space required enough to require more log and op
3412 * headers, so take that into account too.
3413 *
3414 * IMPORTANT: This reservation makes the assumption that if this
3415 * transaction is the first in an iclog and hence has the LR headers
3416 * accounted to it, then the remaining space in the iclog is
3417 * exclusively for this transaction. i.e. if the transaction is larger
3418 * than the iclog, it will be the only thing in that iclog.
3419 * Fundamentally, this means we must pass the entire log vector to
3420 * xlog_write to guarantee this.
3421 */
3422 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3423 num_headers = howmany(unit_bytes, iclog_space);
3424
3425 /* for split-recs - ophdrs added when data split over LRs */
3426 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3427
3428 /* add extra header reservations if we overrun */
3429 while (!num_headers ||
3430 howmany(unit_bytes, iclog_space) > num_headers) {
3431 unit_bytes += sizeof(xlog_op_header_t);
3432 num_headers++;
3433 }
3213 unit_bytes += log->l_iclog_hsize * num_headers; 3434 unit_bytes += log->l_iclog_hsize * num_headers;
3214 3435
3215 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3436 /* for commit-rec LR header - note: padding will subsume the ophdr */
3216 unit_bytes += log->l_iclog_hsize; 3437 unit_bytes += log->l_iclog_hsize;
3217 3438
3218 /* for split-recs - ophdrs added when data split over LRs */
3219 unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3220
3221 /* for roundoff padding for transaction data and one for commit record */ 3439 /* for roundoff padding for transaction data and one for commit record */
3222 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3440 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3223 log->l_mp->m_sb.sb_logsunit > 1) { 3441 log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3233,13 +3451,13 @@ xlog_ticket_alloc(xlog_t *log,
3233 tic->t_curr_res = unit_bytes; 3451 tic->t_curr_res = unit_bytes;
3234 tic->t_cnt = cnt; 3452 tic->t_cnt = cnt;
3235 tic->t_ocnt = cnt; 3453 tic->t_ocnt = cnt;
3236 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); 3454 tic->t_tid = random32();
3237 tic->t_clientid = client; 3455 tic->t_clientid = client;
3238 tic->t_flags = XLOG_TIC_INITED; 3456 tic->t_flags = XLOG_TIC_INITED;
3239 tic->t_trans_type = 0; 3457 tic->t_trans_type = 0;
3240 if (xflags & XFS_LOG_PERM_RESERV) 3458 if (xflags & XFS_LOG_PERM_RESERV)
3241 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3459 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3242 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); 3460 sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
3243 3461
3244 xlog_tic_reset_res(tic); 3462 xlog_tic_reset_res(tic);
3245 3463
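
The reworked LR-header estimate above is easier to follow with numbers. A standalone model of just that calculation, for a hypothetical 256 KB reservation against 32 KB iclogs with 512-byte headers and an assumed 12-byte op header:

#include <stdio.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int main(void)
{
	int iclog_size = 32768, hsize = 512, ophdr = 12;
	int unit_bytes = 262144;		/* transaction reservation */
	int iclog_space = iclog_size - hsize;	/* usable bytes per iclog */
	int num_headers = howmany(unit_bytes, iclog_space);

	/* split-rec ophdrs, then re-check in case they overflowed an iclog */
	unit_bytes += ophdr * num_headers;
	while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) {
		unit_bytes += ophdr;
		num_headers++;
	}
	unit_bytes += hsize * num_headers;	/* the LR headers themselves */

	printf("num_headers=%d unit_bytes=%d\n", num_headers, unit_bytes);
	return 0;
}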
@@ -3260,20 +3478,22 @@ xlog_ticket_alloc(xlog_t *log,
3260 * part of the log in case we trash the log structure. 3478 * part of the log in case we trash the log structure.
3261 */ 3479 */
3262void 3480void
3263xlog_verify_dest_ptr(xlog_t *log, 3481xlog_verify_dest_ptr(
3264 __psint_t ptr) 3482 struct log *log,
3483 char *ptr)
3265{ 3484{
3266 int i; 3485 int i;
3267 int good_ptr = 0; 3486 int good_ptr = 0;
3268 3487
3269 for (i=0; i < log->l_iclog_bufs; i++) { 3488 for (i = 0; i < log->l_iclog_bufs; i++) {
3270 if (ptr >= (__psint_t)log->l_iclog_bak[i] && 3489 if (ptr >= log->l_iclog_bak[i] &&
3271 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) 3490 ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3272 good_ptr++; 3491 good_ptr++;
3273 } 3492 }
3274 if (! good_ptr) 3493
3494 if (!good_ptr)
3275 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3495 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3276} /* xlog_verify_dest_ptr */ 3496}
3277 3497
3278STATIC void 3498STATIC void
3279xlog_verify_grant_head(xlog_t *log, int equals) 3499xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3459,6 +3679,11 @@ xlog_state_ioerror(
3459 * c. nothing new gets queued up after (a) and (b) are done. 3679 * c. nothing new gets queued up after (a) and (b) are done.
3460 * d. if !logerror, flush the iclogs to disk, then seal them off 3680 * d. if !logerror, flush the iclogs to disk, then seal them off
3461 * for business. 3681 * for business.
3682 *
3683 * Note: for delayed logging the !logerror case needs to flush the regions
3684 * held in memory out to the iclogs before flushing them to disk. This needs
3685 * to be done before the log is marked as shutdown, otherwise the flush to the
3686 * iclogs will fail.
3462 */ 3687 */
3463int 3688int
3464xfs_log_force_umount( 3689xfs_log_force_umount(
@@ -3492,6 +3717,16 @@ xfs_log_force_umount(
3492 return 1; 3717 return 1;
3493 } 3718 }
3494 retval = 0; 3719 retval = 0;
3720
3721 /*
3722 * Flush the in memory commit item list before marking the log as
3723 * being shut down. We need to do it in this order to ensure all the
3724 * completed transactions are flushed to disk with the xfs_log_force()
3725 * call below.
3726 */
3727 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3728 xlog_cil_force(log);
3729
3495 /* 3730 /*
3496 * We must hold both the GRANT lock and the LOG lock, 3731 * We must hold both the GRANT lock and the LOG lock,
3497 * before we mark the filesystem SHUTDOWN and wake 3732 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..916eb7db14d9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -56,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
56/* 55/*
57 * Flags to xfs_log_reserve() 56 * Flags to xfs_log_reserve()
58 * 57 *
59 * XFS_LOG_SLEEP: If space is not available, sleep (default)
60 * XFS_LOG_NOSLEEP: If space is not available, return error
61 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are 58 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
62 * performed against this type of reservation, the reservation 59 * performed against this type of reservation, the reservation
63 * is not decreased. Long running transactions should use this. 60 * is not decreased. Long running transactions should use this.
64 */ 61 */
65#define XFS_LOG_SLEEP 0x0
66#define XFS_LOG_NOSLEEP 0x1
67#define XFS_LOG_PERM_RESERV 0x2 62#define XFS_LOG_PERM_RESERV 0x2
68 63
69/* 64/*
@@ -105,11 +100,20 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
105#define XLOG_REG_TYPE_MAX 19 100#define XLOG_REG_TYPE_MAX 19
106 101
107typedef struct xfs_log_iovec { 102typedef struct xfs_log_iovec {
108 xfs_caddr_t i_addr; /* beginning address of region */ 103 void *i_addr; /* beginning address of region */
109 int i_len; /* length in bytes of region */ 104 int i_len; /* length in bytes of region */
110 uint i_type; /* type of region */ 105 uint i_type; /* type of region */
111} xfs_log_iovec_t; 106} xfs_log_iovec_t;
112 107
108struct xfs_log_vec {
109 struct xfs_log_vec *lv_next; /* next lv in build list */
110 int lv_niovecs; /* number of iovecs in lv */
111 struct xfs_log_iovec *lv_iovecp; /* iovec array */
112 struct xfs_log_item *lv_item; /* owner */
113 char *lv_buf; /* formatted buffer */
114 int lv_buf_len; /* size of formatted buffer */
115};
116
113/* 117/*
114 * Structure used to pass callback function and the function's argument 118 * Structure used to pass callback function and the function's argument
115 * to the log manager. 119 * to the log manager.
@@ -126,6 +130,14 @@ typedef struct xfs_log_callback {
126struct xfs_mount; 130struct xfs_mount;
127struct xlog_in_core; 131struct xlog_in_core;
128struct xlog_ticket; 132struct xlog_ticket;
133struct xfs_log_item;
134struct xfs_item_ops;
135struct xfs_trans;
136
137void xfs_log_item_init(struct xfs_mount *mp,
138 struct xfs_log_item *item,
139 int type,
140 struct xfs_item_ops *ops);
129 141
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 142xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 struct xlog_ticket *ticket, 143 struct xlog_ticket *ticket,
@@ -174,13 +186,15 @@ int xfs_log_need_covered(struct xfs_mount *mp);
174 186
175void xlog_iodone(struct xfs_buf *); 187void xlog_iodone(struct xfs_buf *);
176 188
177struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 189struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
178void xfs_log_ticket_put(struct xlog_ticket *ticket); 190void xfs_log_ticket_put(struct xlog_ticket *ticket);
179 191
180#endif 192xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
181
182
183extern int xlog_debug; /* set to 1 to enable real log */
184 193
194int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
195 struct xfs_log_vec *log_vector,
196 xfs_lsn_t *commit_lsn, int flags);
197bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
185 198
199#endif
186#endif /* __XFS_LOG_H__ */ 200#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..7e206fc1fa36
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,780 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h"
30#include "xfs_error.h"
31#include "xfs_alloc.h"
32
33/*
34 * Perform initial CIL structure initialisation. If the CIL is not
35 * enabled in this filesystem, ensure the log->l_cilp is null so
36 * we can check this conditional to determine if we are doing delayed
37 * logging or not.
38 */
39int
40xlog_cil_init(
41 struct log *log)
42{
43 struct xfs_cil *cil;
44 struct xfs_cil_ctx *ctx;
45
46 log->l_cilp = NULL;
47 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
48 return 0;
49
50 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
51 if (!cil)
52 return ENOMEM;
53
54 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
55 if (!ctx) {
56 kmem_free(cil);
57 return ENOMEM;
58 }
59
60 INIT_LIST_HEAD(&cil->xc_cil);
61 INIT_LIST_HEAD(&cil->xc_committing);
62 spin_lock_init(&cil->xc_cil_lock);
63 init_rwsem(&cil->xc_ctx_lock);
64 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
65
66 INIT_LIST_HEAD(&ctx->committing);
67 INIT_LIST_HEAD(&ctx->busy_extents);
68 ctx->sequence = 1;
69 ctx->cil = cil;
70 cil->xc_ctx = ctx;
71 cil->xc_current_sequence = ctx->sequence;
72
73 cil->xc_log = log;
74 log->l_cilp = cil;
75 return 0;
76}
77
78void
79xlog_cil_destroy(
80 struct log *log)
81{
82 if (!log->l_cilp)
83 return;
84
85 if (log->l_cilp->xc_ctx) {
86 if (log->l_cilp->xc_ctx->ticket)
87 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
88 kmem_free(log->l_cilp->xc_ctx);
89 }
90
91 ASSERT(list_empty(&log->l_cilp->xc_cil));
92 kmem_free(log->l_cilp);
93}
94
95/*
96 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
97 * recover, so we don't allow failure here. Also, we allocate in a context that
98 * we don't want to be issuing transactions from, so we need to tell the
99 * allocation code this as well.
100 *
101 * We don't reserve any space for the ticket - we are going to steal whatever
102 * space we require from transactions as they commit. To ensure we reserve all
103 * the space required, we need to set the current reservation of the ticket to
104 * zero so that we know to steal the initial transaction overhead from the
105 * first transaction commit.
106 */
107static struct xlog_ticket *
108xlog_cil_ticket_alloc(
109 struct log *log)
110{
111 struct xlog_ticket *tic;
112
113 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
114 KM_SLEEP|KM_NOFS);
115 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
116
117 /*
118 * set the current reservation to zero so we know to steal the basic
119 * transaction overhead reservation from the first transaction commit.
120 */
121 tic->t_curr_res = 0;
122 return tic;
123}
124
125/*
126 * After the first stage of log recovery is done, we know where the head and
127 * tail of the log are. We need this log initialisation done before we can
128 * initialise the first CIL checkpoint context.
129 *
130 * Here we allocate a log ticket to track space usage during a CIL push. This
131 * ticket is passed to xlog_write() directly so that we don't slowly leak log
132 * space by failing to account for space used by log headers and additional
133 * region headers for split regions.
134 */
135void
136xlog_cil_init_post_recovery(
137 struct log *log)
138{
139 if (!log->l_cilp)
140 return;
141
142 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
143 log->l_cilp->xc_ctx->sequence = 1;
144 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
145 log->l_curr_block);
146}
147
148/*
149 * Insert the log item into the CIL and calculate the difference in space
150 * consumed by the item. Add the space to the checkpoint ticket and calculate
151 * if the change requires additional log metadata. If it does, take that space
152 * as well. Remove the amount of space we added to the checkpoint ticket from
153 * the current transaction ticket so that the accounting works out correctly.
154 *
155 * If this is the first time the item is being placed into the CIL in this
156 * context, pin it so it can't be written to disk until the CIL is flushed to
157 * the iclog and the iclog written to disk.
158 */
159static void
160xlog_cil_insert(
161 struct log *log,
162 struct xlog_ticket *ticket,
163 struct xfs_log_item *item,
164 struct xfs_log_vec *lv)
165{
166 struct xfs_cil *cil = log->l_cilp;
167 struct xfs_log_vec *old = lv->lv_item->li_lv;
168 struct xfs_cil_ctx *ctx = cil->xc_ctx;
169 int len;
170 int diff_iovecs;
171 int iclog_space;
172
173 if (old) {
174 /* existing lv on log item, space used is a delta */
175 ASSERT(!list_empty(&item->li_cil));
176 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
177
178 len = lv->lv_buf_len - old->lv_buf_len;
179 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
180 kmem_free(old->lv_buf);
181 kmem_free(old);
182 } else {
183 /* new lv, must pin the log item */
184 ASSERT(!lv->lv_item->li_lv);
185 ASSERT(list_empty(&item->li_cil));
186
187 len = lv->lv_buf_len;
188 diff_iovecs = lv->lv_niovecs;
189 IOP_PIN(lv->lv_item);
190
191 }
192 len += diff_iovecs * sizeof(xlog_op_header_t);
193
194 /* attach new log vector to log item */
195 lv->lv_item->li_lv = lv;
196
197 spin_lock(&cil->xc_cil_lock);
198 list_move_tail(&item->li_cil, &cil->xc_cil);
199 ctx->nvecs += diff_iovecs;
200
201 /*
202 * If this is the first time the item is being committed to the CIL,
203 * store the sequence number on the log item so we can tell
204 * in future commits whether this is the first checkpoint the item is
205 * being committed into.
206 */
207 if (!item->li_seq)
208 item->li_seq = ctx->sequence;
209
210 /*
211 * Now transfer enough transaction reservation to the context ticket
212 * for the checkpoint. The context ticket is special - the unit
213 * reservation has to grow as well as the current reservation as we
214 * steal from tickets so we can correctly determine the space used
215 * during the transaction commit.
216 */
217 if (ctx->ticket->t_curr_res == 0) {
218 /* first commit in checkpoint, steal the header reservation */
219 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
220 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
221 ticket->t_curr_res -= ctx->ticket->t_unit_res;
222 }
223
224 /* do we need space for more log record headers? */
225 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
226 if (len > 0 && (ctx->space_used / iclog_space !=
227 (ctx->space_used + len) / iclog_space)) {
228 int hdrs;
229
230 hdrs = (len + iclog_space - 1) / iclog_space;
231 /* need to take into account split region headers, too */
232 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
233 ctx->ticket->t_unit_res += hdrs;
234 ctx->ticket->t_curr_res += hdrs;
235 ticket->t_curr_res -= hdrs;
236 ASSERT(ticket->t_curr_res >= len);
237 }
238 ticket->t_curr_res -= len;
239 ctx->space_used += len;
240
241 spin_unlock(&cil->xc_cil_lock);
242}
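
Reservation stealing is the heart of delayed logging: the checkpoint ticket is allocated with t_curr_res == 0 (see xlog_cil_ticket_alloc() above), and the first transaction to commit donates the checkpoint's unit reservation before paying for its own bytes. A standalone model of that transfer with hypothetical values; the split-header top-ups are omitted:

#include <stdio.h>
#include <assert.h>

struct ticket { int unit_res, curr_res; };

/* move enough reservation from a committing transaction to the context */
static void steal(struct ticket *ctx_tic, struct ticket *trans_tic, int len)
{
	if (ctx_tic->curr_res == 0) {
		/* first commit in checkpoint: steal the header overhead */
		assert(trans_tic->curr_res >= ctx_tic->unit_res + len);
		ctx_tic->curr_res = ctx_tic->unit_res;
		trans_tic->curr_res -= ctx_tic->unit_res;
	}
	trans_tic->curr_res -= len;	/* every commit pays for its bytes */
}

int main(void)
{
	struct ticket ctx = { .unit_res = 4096, .curr_res = 0 };
	struct ticket t1 = { .curr_res = 8192 }, t2 = { .curr_res = 8192 };

	steal(&ctx, &t1, 1024);	/* pays 4096 overhead + 1024 */
	steal(&ctx, &t2, 1024);	/* pays only its own 1024 */
	printf("ctx=%d t1=%d t2=%d\n", ctx.curr_res, t1.curr_res, t2.curr_res);
	/* ctx=4096 t1=3072 t2=7168 */
	return 0;
}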
243
244/*
245 * Format log items into flat buffers
246 *
247 * For delayed logging, we need to hold a formatted buffer containing all the
248 * changes on the log item. This enables us to relog the item in memory and
249 * write it out asynchronously without needing to relock the object that was
250 * modified at the time it gets written into the iclog.
251 *
252 * This function builds a vector for the changes in each log item in the
253 * transaction. It then works out the length of the buffer needed for each log
254 * item, allocates them and formats the vector for the item into the buffer.
255 * The buffer is then attached to the log item and inserted into the
256 * Committed Item List for tracking until the next checkpoint is written out.
257 *
258 * We don't set up region headers during this process; we simply copy the
259 * regions into the flat buffer. We can do this because we still have to do a
260 * formatting step to write the regions into the iclog buffer. Writing the
261 * ophdrs during the iclog write means that we can support splitting large
262 * regions across iclog boundaries without needing a change in the format of the
263 * item/region encapsulation.
264 *
265 * Hence what we need to do now is rewrite the vector array to point
266 * to the copied region inside the buffer we just allocated. This allows us to
267 * format the regions into the iclog as though they are being formatted
268 * directly out of the objects themselves.
269 */
270static void
271xlog_cil_format_items(
272 struct log *log,
273 struct xfs_log_vec *log_vector)
274{
275 struct xfs_log_vec *lv;
276
277 ASSERT(log_vector);
278 for (lv = log_vector; lv; lv = lv->lv_next) {
279 void *ptr;
280 int index;
281 int len = 0;
282
283 /* build the vector array and calculate its length */
284 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
285 for (index = 0; index < lv->lv_niovecs; index++)
286 len += lv->lv_iovecp[index].i_len;
287
288 lv->lv_buf_len = len;
289 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
290 ptr = lv->lv_buf;
291
292 for (index = 0; index < lv->lv_niovecs; index++) {
293 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
294
295 memcpy(ptr, vec->i_addr, vec->i_len);
296 vec->i_addr = ptr;
297 ptr += vec->i_len;
298 }
299 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
300 }
301}
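
The formatting step models cleanly in user space: copy each region into one flat allocation and repoint the iovec at its copy, after which the original objects can be relogged without holding their locks. A hedged sketch with malloc standing in for kmem_zalloc and two hypothetical regions:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct iovec_s { void *i_addr; int i_len; };

/* repoint each iovec at its copy inside one flat buffer */
static char *flatten(struct iovec_s *vec, int nvecs, int *buf_len)
{
	int len = 0;
	for (int i = 0; i < nvecs; i++)
		len += vec[i].i_len;

	char *buf = calloc(1, len);
	char *ptr = buf;
	for (int i = 0; i < nvecs; i++) {
		memcpy(ptr, vec[i].i_addr, vec[i].i_len);
		vec[i].i_addr = ptr;		/* now points into buf */
		ptr += vec[i].i_len;
	}
	*buf_len = len;
	return buf;
}

int main(void)
{
	char a[] = "inode-core", b[] = "data-fork";
	struct iovec_s vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	int len;

	char *buf = flatten(vec, 2, &len);
	/* the originals can now be modified/relogged safely */
	printf("%d bytes, region 1 = \"%s\"\n", len, (char *)vec[1].i_addr);
	free(buf);
	return 0;
}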
302
303static void
304xlog_cil_insert_items(
305 struct log *log,
306 struct xfs_log_vec *log_vector,
307 struct xlog_ticket *ticket,
308 xfs_lsn_t *start_lsn)
309{
310 struct xfs_log_vec *lv;
311
312 if (start_lsn)
313 *start_lsn = log->l_cilp->xc_ctx->sequence;
314
315 ASSERT(log_vector);
316 for (lv = log_vector; lv; lv = lv->lv_next)
317 xlog_cil_insert(log, ticket, lv->lv_item, lv);
318}
319
320static void
321xlog_cil_free_logvec(
322 struct xfs_log_vec *log_vector)
323{
324 struct xfs_log_vec *lv;
325
326 for (lv = log_vector; lv; ) {
327 struct xfs_log_vec *next = lv->lv_next;
328 kmem_free(lv->lv_buf);
329 kmem_free(lv);
330 lv = next;
331 }
332}
333
334/*
335 * Mark all items committed and clear busy extents. We free the log vector
336 * chains in a separate pass so that we unpin the log items as quickly as
337 * possible.
338 */
339static void
340xlog_cil_committed(
341 void *args,
342 int abort)
343{
344 struct xfs_cil_ctx *ctx = args;
345 struct xfs_log_vec *lv;
346 int abortflag = abort ? XFS_LI_ABORTED : 0;
347 struct xfs_busy_extent *busyp, *n;
348
349 /* unpin all the log items */
350 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
351 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
352 abortflag);
353 }
354
355 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
356 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
357
358 spin_lock(&ctx->cil->xc_cil_lock);
359 list_del(&ctx->committing);
360 spin_unlock(&ctx->cil->xc_cil_lock);
361
362 xlog_cil_free_logvec(ctx->lv_chain);
363 kmem_free(ctx);
364}
365
366/*
367 * Push the Committed Item List to the log. If @push_seq is zero, then it
368 * is a background flush and so we can choose to ignore it. Otherwise, if the
369 * current sequence is the same as @push_seq we need to do a flush. If
370 * @push_seq is less than the current sequence, then it has already been
371 * flushed and we don't need to do anything - the caller will wait for it to
372 * complete if necessary.
373 *
374 * @push_seq is a value rather than a flag because that allows us to do an
375 * unlocked check of the sequence number for a match. Hence we can allow log
376 * forces to run racily and not issue pushes for the same sequence twice. If we
377 * get a race between multiple pushes for the same sequence they will block on
378 * the first one and then abort, hence avoiding needless pushes.
379 */
380STATIC int
381xlog_cil_push(
382 struct log *log,
383 xfs_lsn_t push_seq)
384{
385 struct xfs_cil *cil = log->l_cilp;
386 struct xfs_log_vec *lv;
387 struct xfs_cil_ctx *ctx;
388 struct xfs_cil_ctx *new_ctx;
389 struct xlog_in_core *commit_iclog;
390 struct xlog_ticket *tic;
391 int num_lv;
392 int num_iovecs;
393 int len;
394 int error = 0;
395 struct xfs_trans_header thdr;
396 struct xfs_log_iovec lhdr;
397 struct xfs_log_vec lvhdr = { NULL };
398 xfs_lsn_t commit_lsn;
399
400 if (!cil)
401 return 0;
402
403 ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
404
405 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
406 new_ctx->ticket = xlog_cil_ticket_alloc(log);
407
408 /*
409 * Lock out transaction commit, but don't block for background pushes
410 * unless we are well over the CIL space limit. See the definition of
411 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
412 * used here.
413 */
414 if (!down_write_trylock(&cil->xc_ctx_lock)) {
415 if (!push_seq &&
416 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
417 goto out_free_ticket;
418 down_write(&cil->xc_ctx_lock);
419 }
420 ctx = cil->xc_ctx;
421
422 /* check if we have anything to push */
423 if (list_empty(&cil->xc_cil))
424 goto out_skip;
425
426 /* check for spurious background flush */
427 if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
428 goto out_skip;
429
430 /* check for a previously pushed sequence */
431 if (push_seq && push_seq < cil->xc_ctx->sequence)
432 goto out_skip;
433
434 /*
435 * pull all the log vectors off the items in the CIL, and
436 * remove the items from the CIL. We don't need the CIL lock
437 * here because it's only needed on the transaction commit
438 * side which is currently locked out by the flush lock.
439 */
440 lv = NULL;
441 num_lv = 0;
442 num_iovecs = 0;
443 len = 0;
444 while (!list_empty(&cil->xc_cil)) {
445 struct xfs_log_item *item;
446 int i;
447
448 item = list_first_entry(&cil->xc_cil,
449 struct xfs_log_item, li_cil);
450 list_del_init(&item->li_cil);
451 if (!ctx->lv_chain)
452 ctx->lv_chain = item->li_lv;
453 else
454 lv->lv_next = item->li_lv;
455 lv = item->li_lv;
456 item->li_lv = NULL;
457
458 num_lv++;
459 num_iovecs += lv->lv_niovecs;
460 for (i = 0; i < lv->lv_niovecs; i++)
461 len += lv->lv_iovecp[i].i_len;
462 }
463
464 /*
465 * initialise the new context and attach it to the CIL. Then attach
466 * the current context to the CIL committing list so it can be found
467 * during log forces to extract the commit lsn of the sequence that
468 * needs to be forced.
469 */
470 INIT_LIST_HEAD(&new_ctx->committing);
471 INIT_LIST_HEAD(&new_ctx->busy_extents);
472 new_ctx->sequence = ctx->sequence + 1;
473 new_ctx->cil = cil;
474 cil->xc_ctx = new_ctx;
475
476 /*
477 * mirror the new sequence into the cil structure so that we can do
478 * unlocked checks against the current sequence in log forces without
479 * risking dereferencing a freed context pointer.
480 */
481 cil->xc_current_sequence = new_ctx->sequence;
482
483 /*
484 * The switch is now done, so we can drop the context lock and move out
485 * of a shared context. We can't just go straight to the commit record,
486 * though - we need to synchronise with previous and future commits so
487 * that the commit records are correctly ordered in the log to ensure
488 * that we process items during log IO completion in the correct order.
489 *
490 * For example, if we get an EFI in one checkpoint and the EFD in the
491 * next (e.g. due to log forces), we do not want the checkpoint with
492 * the EFD to be committed before the checkpoint with the EFI. Hence
493 * we must strictly order the commit records of the checkpoints so
494 * that: a) the checkpoint callbacks are attached to the iclogs in the
495 * correct order; and b) the checkpoints are replayed in correct order
496 * in log recovery.
497 *
498 * Hence we need to add this context to the committing context list so
499 * that higher sequences will wait for us to write out a commit record
500 * before they do.
501 */
502 spin_lock(&cil->xc_cil_lock);
503 list_add(&ctx->committing, &cil->xc_committing);
504 spin_unlock(&cil->xc_cil_lock);
505 up_write(&cil->xc_ctx_lock);
506
507 /*
508 * Build a checkpoint transaction header and write it to the log to
509 * begin the transaction. We need to account for the space used by the
510 * transaction header here as it is not accounted for in xlog_write().
511 *
512 * The LSN we need to pass to the log items on transaction commit is
513 * the LSN reported by the first log vector write. If we use the commit
514 * record lsn then we can move the tail beyond the grant write head.
515 */
516 tic = ctx->ticket;
517 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
518 thdr.th_type = XFS_TRANS_CHECKPOINT;
519 thdr.th_tid = tic->t_tid;
520 thdr.th_num_items = num_iovecs;
521 lhdr.i_addr = &thdr;
522 lhdr.i_len = sizeof(xfs_trans_header_t);
523 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
524 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
525
526 lvhdr.lv_niovecs = 1;
527 lvhdr.lv_iovecp = &lhdr;
528 lvhdr.lv_next = ctx->lv_chain;
529
530 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
531 if (error)
532 goto out_abort;
533
534 /*
535 * now that we've written the checkpoint into the log, strictly
536 * order the commit records so replay will get them in the right order.
537 */
538restart:
539 spin_lock(&cil->xc_cil_lock);
540 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
541 /*
542 * Higher sequences will wait for this one so skip them.
543 * Don't wait for our own sequence, either.
544 */
545 if (new_ctx->sequence >= ctx->sequence)
546 continue;
547 if (!new_ctx->commit_lsn) {
548 /*
549 * It is still being pushed! Wait for the push to
550 * complete, then start again from the beginning.
551 */
552 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
553 goto restart;
554 }
555 }
556 spin_unlock(&cil->xc_cil_lock);
557
558 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
559 if (error || commit_lsn == -1)
560 goto out_abort;
561
562 /* attach all the transactions w/ busy extents to iclog */
563 ctx->log_cb.cb_func = xlog_cil_committed;
564 ctx->log_cb.cb_arg = ctx;
565 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
566 if (error)
567 goto out_abort;
568
569 /*
570 * now the checkpoint commit is complete and we've attached the
571 * callbacks to the iclog we can assign the commit LSN to the context
572 * and wake up anyone who is waiting for the commit to complete.
573 */
574 spin_lock(&cil->xc_cil_lock);
575 ctx->commit_lsn = commit_lsn;
576 sv_broadcast(&cil->xc_commit_wait);
577 spin_unlock(&cil->xc_cil_lock);
578
579 /* release the hounds! */
580 return xfs_log_release_iclog(log->l_mp, commit_iclog);
581
582out_skip:
583 up_write(&cil->xc_ctx_lock);
584out_free_ticket:
585 xfs_log_ticket_put(new_ctx->ticket);
586 kmem_free(new_ctx);
587 return 0;
588
589out_abort:
590 xlog_cil_committed(ctx, XFS_LI_ABORTED);
591 return XFS_ERROR(EIO);
592}
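
The early-exit checks at the top of xlog_cil_push() encode the @push_seq semantics described in the comment above the function. A standalone model of just that decision logic, with hypothetical sequence numbers and byte counts:

#include <stdio.h>

/* model of the push_seq checks at the top of xlog_cil_push() */
static const char *push_decision(long push_seq, long current_seq,
				 int cil_empty, int space_used, int limit)
{
	if (cil_empty)
		return "skip: nothing to push";
	if (!push_seq && space_used < limit)
		return "skip: spurious background flush";
	if (push_seq && push_seq < current_seq)
		return "skip: sequence already pushed";
	return "push";
}

int main(void)
{
	printf("%s\n", push_decision(0, 5, 0, 100, 4096)); /* small background */
	printf("%s\n", push_decision(4, 5, 0, 100, 4096)); /* already pushed */
	printf("%s\n", push_decision(5, 5, 0, 100, 4096)); /* forced: push */
	return 0;
}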
593
594/*
595 * Commit a transaction with the given vector to the Committed Item List.
596 *
597 * To do this, we need to format the item, pin it in memory if required and
598 * account for the space used by the transaction. Once we have done that we
599 * need to release the unused reservation for the transaction, attach the
600 * transaction to the checkpoint context so we carry the busy extents through
601 * to checkpoint completion, and then unlock all the items in the transaction.
602 *
603 * For more specific information about the order of operations in
604 * xfs_log_commit_cil() please refer to the comments in
605 * xfs_trans_commit_iclog().
606 *
607 * Called with the context lock already held in read mode to lock out
608 * background commit, returns without it held once background commits are
609 * allowed again.
610 */
611int
612xfs_log_commit_cil(
613 struct xfs_mount *mp,
614 struct xfs_trans *tp,
615 struct xfs_log_vec *log_vector,
616 xfs_lsn_t *commit_lsn,
617 int flags)
618{
619 struct log *log = mp->m_log;
620 int log_flags = 0;
621 int push = 0;
622
623 if (flags & XFS_TRANS_RELEASE_LOG_RES)
624 log_flags = XFS_LOG_REL_PERM_RESERV;
625
626 if (XLOG_FORCED_SHUTDOWN(log)) {
627 xlog_cil_free_logvec(log_vector);
628 return XFS_ERROR(EIO);
629 }
630
631 /*
632 * do all the hard work of formatting items (including memory
633 * allocation) outside the CIL context lock. This prevents stalling CIL
634 * pushes when we are low on memory and a transaction commit spends a
635 * lot of time in memory reclaim.
636 */
637 xlog_cil_format_items(log, log_vector);
638
639 /* lock out background commit */
640 down_read(&log->l_cilp->xc_ctx_lock);
641 xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
642
643 /* check we didn't blow the reservation */
644 if (tp->t_ticket->t_curr_res < 0)
645 xlog_print_tic_res(log->l_mp, tp->t_ticket);
646
647 /* attach the transaction to the CIL if it has any busy extents */
648 if (!list_empty(&tp->t_busy)) {
649 spin_lock(&log->l_cilp->xc_cil_lock);
650 list_splice_init(&tp->t_busy,
651 &log->l_cilp->xc_ctx->busy_extents);
652 spin_unlock(&log->l_cilp->xc_cil_lock);
653 }
654
655 tp->t_commit_lsn = *commit_lsn;
656 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
657 xfs_trans_unreserve_and_mod_sb(tp);
658
659 /*
660 * Once all the items of the transaction have been copied to the CIL,
661 * the items can be unlocked and freed.
662 *
663 * This needs to be done before we drop the CIL context lock because we
664 * have to update state in the log items and unlock them before they go
665 * to disk. If we don't, then the CIL checkpoint can race with us and
666 * we can run checkpoint completion before we've updated and unlocked
667 * the log items. This affects (at least) processing of stale buffers,
668 * inodes and EFIs.
669 */
670 xfs_trans_free_items(tp, *commit_lsn, 0);
671
672 /* check for background commit before unlock */
673 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
674 push = 1;
675
676 up_read(&log->l_cilp->xc_ctx_lock);
677
678 /*
679 * We need to push CIL every so often so we don't cache more than we
680 * can fit in the log. The limit really is that a checkpoint can't be
681 * more than half the log (the current checkpoint is not allowed to
682 * overwrite the previous checkpoint), but commit latency and memory
683 * usage limit this to a smaller size in most cases.
684 */
685 if (push)
686 xlog_cil_push(log, 0);
687 return 0;
688}
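
The ordering in xfs_log_commit_cil() (format unlocked, insert under the shared context lock, push only after dropping it) maps directly onto a reader/writer lock. A user-space skeleton of that ordering, with a pthread rwlock standing in for the kernel rwsem and byte counts standing in for real log items:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ctx_lock = PTHREAD_RWLOCK_INITIALIZER;
static int space_used, space_limit = 4096;

static void commit_cil(int len)
{
	/* 1. format items: slow, allocates memory - done unlocked */
	int push = 0;

	pthread_rwlock_rdlock(&ctx_lock);	/* lock out background push */
	space_used += len;			/* 2. insert into the CIL */
	if (space_used > space_limit)		/* 3. check before unlock */
		push = 1;
	pthread_rwlock_unlock(&ctx_lock);

	if (push)				/* 4. push takes the lock itself */
		printf("background push triggered at %d bytes\n", space_used);
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		commit_cil(1024);
	return 0;
}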
689
690/*
691 * Conditionally push the CIL based on the sequence passed in.
692 *
693 * We only need to push if we haven't already pushed the sequence
694 * number given. Hence the only time we will trigger a push here is
695 * if the push sequence is the same as the current context.
696 *
697 * We return the current commit lsn to allow the callers to determine if an
698 * iclog flush is necessary following this call.
699 *
700 * XXX: Initially, just push the CIL unconditionally and return whatever
701 * commit lsn is there. It'll be empty, so this is broken for now.
702 */
703xfs_lsn_t
704xlog_cil_force_lsn(
705 struct log *log,
706 xfs_lsn_t sequence)
707{
708 struct xfs_cil *cil = log->l_cilp;
709 struct xfs_cil_ctx *ctx;
710 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
711
712 ASSERT(sequence <= cil->xc_current_sequence);
713
714 /*
715 * check to see if we need to force out the current context.
716 * xlog_cil_push() handles racing pushes for the same sequence,
717 * so no need to deal with it here.
718 */
719 if (sequence == cil->xc_current_sequence)
720 xlog_cil_push(log, sequence);
721
722 /*
723 * See if we can find a previous sequence still committing.
724 * We need to wait for all previous sequence commits to complete
725 * before allowing the force of push_seq to go ahead. Hence block
726 * on commits for those as well.
727 */
728restart:
729 spin_lock(&cil->xc_cil_lock);
730 list_for_each_entry(ctx, &cil->xc_committing, committing) {
731 if (ctx->sequence > sequence)
732 continue;
733 if (!ctx->commit_lsn) {
734 /*
735 * It is still being pushed! Wait for the push to
736 * complete, then start again from the beginning.
737 */
738 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
739 goto restart;
740 }
741 if (ctx->sequence != sequence)
742 continue;
743 /* found it! */
744 commit_lsn = ctx->commit_lsn;
745 }
746 spin_unlock(&cil->xc_cil_lock);
747 return commit_lsn;
748}
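
The lookup loop above reduces to a scan of the committing list for a matching sequence. A simplified single-threaded model in which every context already has its commit LSN, so the sv_wait() retry never triggers (NULLCOMMITLSN is given a stand-in value here):

#include <stdio.h>

#define NULLCOMMITLSN 0	/* stand-in; the kernel uses its own sentinel */

struct ctx { long sequence; long commit_lsn; };

static long force_lsn(struct ctx *committing, int n, long sequence)
{
	long commit_lsn = NULLCOMMITLSN;

	for (int i = 0; i < n; i++) {
		if (committing[i].sequence != sequence)
			continue;
		commit_lsn = committing[i].commit_lsn;	/* found it */
	}
	return commit_lsn;
}

int main(void)
{
	struct ctx list[] = { { 3, 0x300 }, { 4, 0x400 } };

	printf("%#lx\n", force_lsn(list, 2, 4));	/* 0x400 */
	printf("%#lx\n", force_lsn(list, 2, 9));	/* 0: already on disk */
	return 0;
}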
749
750/*
751 * Check if the current log item was first committed in this sequence.
752 * We can't rely on just the log item being in the CIL, we have to check
753 * the recorded commit sequence number.
754 *
755 * Note: for this to be used in a non-racy manner, it has to be called with
756 * CIL flushing locked out. As a result, it should only be used during the
757 * transaction commit process when deciding what to format into the item.
758 */
759bool
760xfs_log_item_in_current_chkpt(
761 struct xfs_log_item *lip)
762{
763 struct xfs_cil_ctx *ctx;
764
765 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
766 return false;
767 if (list_empty(&lip->li_cil))
768 return false;
769
770 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
771
772 /*
773 * li_seq is written on the first commit of a log item to record the
774 * first checkpoint it is written to. Hence if it is different to the
775 * current sequence, we're in a new checkpoint.
776 */
777 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
778 return false;
779 return true;
780}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,105 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well as to be
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_force_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425 xfs_lsn_t xc_current_sequence;
426};
427
428/*
429 * The amount of log space we allow the CIL to aggregate is difficult to size.
430 * Whatever we choose, we have to make sure we can get a reservation for the
431 * log space effectively, that it is large enough to capture sufficient
432 * relogging to reduce log buffer IO significantly, but it is not too large for
433 * the log or induces too much latency when writing out through the iclogs. We
434 * track both space consumed and the number of vectors in the checkpoint
435 * context, so we need to decide which to use for limiting.
436 *
437 * Every log buffer we write out during a push needs a header reserved, which
438 * is at least one sector and more for v2 logs. Hence we need a reservation of
439 * at least 512 bytes per 32k of log space just for the LR headers. That means
440 * 16KB of reservation per megabyte of delayed logging space we will consume,
 441 * plus various headers. The number of headers will vary based on the number
 442 * of I/O vectors, so limiting on a specific number of vectors is going to result
443 * in transactions of varying size. IOWs, it is more consistent to track and
444 * limit space consumed in the log rather than by the number of objects being
445 * logged in order to prevent checkpoint ticket overruns.
446 *
447 * Further, use of static reservations through the log grant mechanism is
448 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
449 * grant) and a significant deadlock potential because regranting write space
450 * can block on log pushes. Hence if we have to regrant log space during a log
451 * push, we can deadlock.
452 *
453 * However, we can avoid this by use of a dynamic "reservation stealing"
454 * technique during transaction commit whereby unused reservation space in the
455 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
456 * space needed by the checkpoint transaction. This means that we never need to
457 * specifically reserve space for the CIL checkpoint transaction, nor do we
458 * need to regrant space once the checkpoint completes. This also means the
459 * checkpoint transaction ticket is specific to the checkpoint context, rather
460 * than the CIL itself.
461 *
462 * With dynamic reservations, we can effectively make up arbitrary limits for
463 * the checkpoint size so long as they don't violate any other size rules.
464 * Recovery imposes a rule that no transaction exceed half the log, so we are
465 * limited by that. Furthermore, the log transaction reservation subsystem
466 * tries to keep 25% of the log free, so we need to keep below that limit or we
467 * risk running out of free log space to start any new transactions.
468 *
469 * In order to keep background CIL push efficient, we will set a lower
470 * threshold at which background pushing is attempted without blocking current
 471 * transaction commits. A separate, higher bound defines when CIL pushes are
 472 * enforced to ensure we stay within our maximum checkpoint size bounds,
 473 * yet gives us plenty of space for aggregation on large logs.
474 */
475#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
476#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
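
A quick evaluation of these two limits for a hypothetical 128MB log (the size is an assumption; the ratios are what matter): the soft limit is 1/8 of the log and the hard limit 3/16, keeping the largest checkpoint well below the half-log recovery rule and under the 25% free-space target.

#include <stdio.h>

/* Same arithmetic as the macros above, on an assumed log size. */
int main(void)
{
        long l_logsize = 128L << 20;        /* assume a 128MB log */
        long soft = l_logsize >> 3;         /* XLOG_CIL_SPACE_LIMIT */
        long hard = 3 * (l_logsize >> 4);   /* XLOG_CIL_HARD_SPACE_LIMIT */

        /* 16MB triggers a background push; 24MB blocks committers. */
        printf("soft=%ldMB hard=%ldMB\n", soft >> 20, hard >> 20);
        return 0;
}
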
477
478/*
382 * The reservation head lsn is not made up of a cycle number and block number. 479 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 480 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 481 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +485,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 485 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 486 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 487 struct xfs_ail *l_ailp; /* AIL log is working with */
488 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 489 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 490 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 491 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -396,9 +494,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 494 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 495 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 496 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 497 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 498 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 499 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 500 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +536,48 @@ typedef struct log {
440 536
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 537#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 538
443
444/* common routines */ 539/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 540extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 541extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 542extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 543extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 544
450extern kmem_zone_t *xfs_log_ticket_zone; 545extern kmem_zone_t *xfs_log_ticket_zone;
546struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
547 int count, char client, uint xflags,
548 int alloc_flags);
549
550
551static inline void
552xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
553{
554 *ptr += bytes;
555 *len -= bytes;
556 *off += bytes;
557}
558
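
xlog_write_adv_cnt() just keeps three cursors in lock-step while regions are copied into an iclog: the write pointer, the space remaining, and the byte count charged to the ticket. A small usage sketch with plain char buffers standing in for real log vectors:

#include <stdio.h>
#include <string.h>

/* Same bookkeeping as xlog_write_adv_cnt(), on a char cursor. */
static void adv_cnt(char **ptr, int *len, int *off, size_t bytes)
{
        *ptr += bytes;          /* advance the write cursor */
        *len -= bytes;          /* shrink the space remaining */
        *off += bytes;          /* grow the ticket's byte count */
}

int main(void)
{
        char iclog[64], *ptr = iclog;
        int len = sizeof(iclog), off = 0;
        const char *regions[] = { "hdr", "payload" };

        for (int i = 0; i < 2; i++) {
                size_t n = strlen(regions[i]);
                memcpy(ptr, regions[i], n);
                adv_cnt(&ptr, &len, &off, n);
        }
        printf("wrote %d bytes, %d left\n", off, len);  /* 10, 54 */
        return 0;
}
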
559void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
560int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
561 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
562 xlog_in_core_t **commit_iclog, uint flags);
563
564/*
565 * Committed Item List interfaces
566 */
567int xlog_cil_init(struct log *log);
568void xlog_cil_init_post_recovery(struct log *log);
569void xlog_cil_destroy(struct log *log);
570
571/*
572 * CIL force routines
573 */
574xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
575
576static inline void
577xlog_cil_force(struct log *log)
578{
579 xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
580}
451 581
452/* 582/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 583 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..6f3f5fa37acf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,15 +24,11 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_error.h" 28#include "xfs_error.h"
31#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h" 32#include "xfs_dinode.h"
37#include "xfs_inode.h" 33#include "xfs_inode.h"
38#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
@@ -56,33 +52,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 52#define xlog_recover_check_summary(log)
57#endif 53#endif
58 54
59
60/* 55/*
61 * Sector aligned buffer routines for buffer create/read/write/access 56 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 57 */
63 58
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 59/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 60 * Verify the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 61 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 62 * Returns nonzero if the count is valid, 0 otherwise.
63 */
68 64
65static inline int
66xlog_buf_bbcount_valid(
67 xlog_t *log,
68 int bbcount)
69{
70 return bbcount > 0 && bbcount <= log->l_logBBsize;
71}
72
73/*
74 * Allocate a buffer to hold log data. The buffer needs to be able
75 * to map to a range of nbblks basic blocks at any valid (basic
76 * block) offset within the log.
77 */
69STATIC xfs_buf_t * 78STATIC xfs_buf_t *
70xlog_get_bp( 79xlog_get_bp(
71 xlog_t *log, 80 xlog_t *log,
72 int nbblks) 81 int nbblks)
73{ 82{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 83 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 84 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 85 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 86 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 87 return NULL;
79 } 88 }
80 89
81 if (log->l_sectbb_log) { 90 /*
82 if (nbblks > 1) 91 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 92 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 93 * requested size to accommodate the basic blocks required
85 } 94 * for complete log sectors.
95 *
96 * In addition, the buffer may be used for a non-sector-
97 * aligned block offset, in which case an I/O of the
98 * requested size could extend beyond the end of the
99 * buffer. If the requested size is only 1 basic block it
100 * will never straddle a sector boundary, so this won't be
101 * an issue. Nor will this be a problem if the log I/O is
102 * done in basic blocks (sector size 1). But otherwise we
103 * extend the buffer by one extra log sector to ensure
 104 * there's space to accommodate this possibility.
105 */
106 if (nbblks > 1 && log->l_sectBBsize > 1)
107 nbblks += log->l_sectBBsize;
108 nbblks = round_up(nbblks, log->l_sectBBsize);
109
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 110 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 111}
88 112
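
The rounding above, traced with assumed numbers (a log sector of 8 basic blocks, i.e. 4k sectors over 512-byte basic blocks): a 5-block request grows by one sector to cover a possibly misaligned start, then rounds up to whole sectors.

#include <stdio.h>

/* round_up() as in the kernel, for power-of-two y. */
#define round_up(x, y)   ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
        int sectBBsize = 8;     /* assumed sector size in basic blocks */
        int nbblks = 5;         /* requested basic blocks */

        if (nbblks > 1 && sectBBsize > 1)
                nbblks += sectBBsize;           /* 5 -> 13: misaligned start */
        nbblks = round_up(nbblks, sectBBsize);  /* 13 -> 16: whole sectors */

        printf("allocate %d basic blocks\n", nbblks);
        return 0;
}
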
@@ -93,6 +117,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 117 xfs_buf_free(bp);
94} 118}
95 119
120/*
121 * Return the address of the start of the given block number's data
122 * in a log buffer. The buffer covers a log sector-aligned region.
123 */
96STATIC xfs_caddr_t 124STATIC xfs_caddr_t
97xlog_align( 125xlog_align(
98 xlog_t *log, 126 xlog_t *log,
@@ -100,15 +128,10 @@ xlog_align(
100 int nbblks, 128 int nbblks,
101 xfs_buf_t *bp) 129 xfs_buf_t *bp)
102{ 130{
103 xfs_caddr_t ptr; 131 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
104 132
105 if (!log->l_sectbb_log) 133 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
106 return XFS_BUF_PTR(bp); 134 return XFS_BUF_PTR(bp) + BBTOB(offset);
107
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr;
112} 135}
113 136
114 137
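
The matching offset arithmetic in xlog_align(), under the same assumed 8-basic-block sector: a read of block 13 comes back in a buffer that starts at block 8, so the caller's data sits 5 basic blocks (2560 bytes) in.

#include <stdio.h>

#define BBSHIFT 9                       /* 512-byte basic blocks */
#define BBTOB(bbs) ((bbs) << BBSHIFT)

int main(void)
{
        long sectBBsize = 8;            /* assumed sector size in BBs */
        long blk_no = 13;               /* block the caller asked for */

        /* Buffer starts at round_down(13, 8) == block 8, so the
         * requested block sits (13 & 7) == 5 basic blocks in. */
        long offset = blk_no & (sectBBsize - 1);
        printf("data at buffer + %ld bytes\n", BBTOB(offset)); /* 2560 */
        return 0;
}
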
@@ -124,21 +147,18 @@ xlog_bread_noalign(
124{ 147{
125 int error; 148 int error;
126 149
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 150 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 151 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 152 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 153 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 154 return EFSCORRUPTED;
132 } 155 }
133 156
134 if (log->l_sectbb_log) { 157 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 158 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 159
139 ASSERT(nbblks > 0); 160 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 161 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 162
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 163 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 164 XFS_BUF_READ(bp);
@@ -186,17 +206,15 @@ xlog_bwrite(
186{ 206{
187 int error; 207 int error;
188 208
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 209 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 210 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 211 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 212 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 213 return EFSCORRUPTED;
194 } 214 }
195 215
196 if (log->l_sectbb_log) { 216 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 217 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 218
201 ASSERT(nbblks > 0); 219 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 220 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +345,38 @@ xlog_find_cycle_start(
327{ 345{
328 xfs_caddr_t offset; 346 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 347 xfs_daddr_t mid_blk;
348 xfs_daddr_t end_blk;
330 uint mid_cycle; 349 uint mid_cycle;
331 int error; 350 int error;
332 351
333 mid_blk = BLK_AVG(first_blk, *last_blk); 352 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 353 mid_blk = BLK_AVG(first_blk, end_blk);
354 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 355 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 356 if (error)
337 return error; 357 return error;
338 mid_cycle = xlog_get_cycle(offset); 358 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 359 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 360 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 361 else
342 } else { 362 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 363 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 364 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 365 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 366 (mid_blk == end_blk && mid_blk-1 == first_blk));
367
368 *last_blk = end_blk;
350 369
351 return 0; 370 return 0;
352} 371}
353 372
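
The loop above is a classic binary search over a single cycle-number transition. A standalone sketch with an array standing in for xlog_bread(), illustrative only:

#include <stdio.h>

/* Find the first index in [first, end] whose cycle number equals
 * 'cycle', assuming the values switch over exactly once - the same
 * invariant xlog_find_cycle_start() relies on. */
static int find_cycle_start(const unsigned *cycles, int first, int end,
                            unsigned cycle)
{
        int mid = (first + end) / 2;

        while (mid != first && mid != end) {
                if (cycles[mid] == cycle)
                        end = mid;      /* last_half_cycle == mid_cycle */
                else
                        first = mid;    /* first_half_cycle == mid_cycle */
                mid = (first + end) / 2;
        }
        return end;
}

int main(void)
{
        unsigned cycles[] = { 3, 3, 3, 3, 2, 2, 2, 2 };

        printf("%d\n", find_cycle_start(cycles, 0, 7, 2));      /* 4 */
        return 0;
}
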
354/* 373/*
355 * Check that the range of blocks does not contain the cycle number 374 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 375 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 376 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 377 * block in the range. The scan needs to occur from front to back
359 * last block number. 378 * and the pointer into the region must be updated since a later
360 * 379 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 380 */
364STATIC int 381STATIC int
365xlog_find_verify_cycle( 382xlog_find_verify_cycle(
@@ -376,12 +393,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 393 xfs_caddr_t buf = NULL;
377 int error = 0; 394 int error = 0;
378 395
396 /*
397 * Greedily allocate a buffer big enough to handle the full
398 * range of basic blocks we'll be examining. If that fails,
399 * try a smaller size. We need to be able to read at least
400 * a log sector, or we're out of luck.
401 */
379 bufblks = 1 << ffs(nbblks); 402 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 403 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 404 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 405 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 406 return ENOMEM;
386 } 407 }
387 408
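
The allocation pattern here (and again in xlog_write_log_records() below) is a greedy retry: ask for the whole range, halve on failure, and give up once a single log sector no longer fits. A sketch with malloc standing in for xlog_get_bp() and the initial size picked by the caller:

#include <stdio.h>
#include <stdlib.h>

/* Halve the request until the allocation succeeds; give up once a
 * single log sector no longer fits (the kernel returns ENOMEM). */
static void *get_log_buf(int *bufblks, int sectBBsize)
{
        void *bp;

        while (!(bp = malloc((size_t)*bufblks * 512))) {
                *bufblks >>= 1;
                if (*bufblks < sectBBsize)
                        return NULL;
        }
        return bp;
}

int main(void)
{
        int bufblks = 1024;             /* start with the full range */
        void *bp = get_log_buf(&bufblks, 8);

        if (bp) {
                printf("got %d basic blocks\n", bufblks);
                free(bp);
        }
        return 0;
}
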
@@ -629,7 +650,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 650 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 651 * number matching last_half_cycle. We expect the log to be
631 * some variation on 652 * some variation on
632 * x + 1 ... | x ... 653 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 654 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 655 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 656 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +660,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 660 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 661 * at the end of the log. The cases we're looking for look
641 * like 662 * like
642 * x + 1 ... | x | x + 1 | x ... 663 * v binary search stopped here
643 * ^ binary search stopped here 664 * x + 1 ... | x | x + 1 | x ... | x
665 * ^ but we want to locate this spot
644 * or 666 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 667 * <---------> less than scan distance
668 * x + 1 ... | x ... | x - 1 | x
669 * ^ we want to locate this spot
647 */ 670 */
648 stop_on_cycle = last_half_cycle; 671 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 672 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +722,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 722 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 723 * last_half_cycle-1 we accomplish that.
701 */ 724 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 725 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 726 (xfs_daddr_t) num_scan_bblks >= head_blk);
727 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 728 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 729 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 730 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 731 goto bp_err;
709 if (new_blk != -1) { 732 if (new_blk != -1) {
710 head_blk = new_blk; 733 head_blk = new_blk;
711 goto bad_blk; 734 goto validate_head;
712 } 735 }
713 736
714 /* 737 /*
@@ -726,7 +749,7 @@ xlog_find_head(
726 head_blk = new_blk; 749 head_blk = new_blk;
727 } 750 }
728 751
729 bad_blk: 752validate_head:
730 /* 753 /*
731 * Now we need to make sure head_blk is not pointing to a block in 754 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 755 * the middle of a log record.
@@ -748,7 +771,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 771 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 772 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 773 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 774 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 775 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 776 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 777 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +856,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 856 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 857 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 858 if (error)
836 goto bread_err; 859 goto done;
837 860
838 if (xlog_get_cycle(offset) == 0) { 861 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 862 *tail_blk = 0;
840 /* leave all other log inited values alone */ 863 /* leave all other log inited values alone */
841 goto exit; 864 goto done;
842 } 865 }
843 } 866 }
844 867
@@ -849,7 +872,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 872 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 873 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 874 if (error)
852 goto bread_err; 875 goto done;
853 876
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 877 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 878 found = 1;
@@ -866,7 +889,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 889 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 890 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 891 if (error)
869 goto bread_err; 892 goto done;
870 893
871 if (XLOG_HEADER_MAGIC_NUM == 894 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 895 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +964,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 964 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 965 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 966 if (error)
944 goto bread_err; 967 goto done;
945 968
946 op_head = (xlog_op_header_t *)offset; 969 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 970 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1010,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1010 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1011 * We can't recover this device anyway, so it won't matter.
989 */ 1012 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1013 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1014 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1015
994bread_err: 1016done:
995exit:
996 xlog_put_bp(bp); 1017 xlog_put_bp(bp);
997 1018
998 if (error) 1019 if (error)
@@ -1152,16 +1173,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1173 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1174 xfs_buf_t *bp;
1154 int balign, ealign; 1175 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1176 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1177 int end_block = start_block + blocks;
1157 int bufblks; 1178 int bufblks;
1158 int error = 0; 1179 int error = 0;
1159 int i, j = 0; 1180 int i, j = 0;
1160 1181
1182 /*
1183 * Greedily allocate a buffer big enough to handle the full
1184 * range of basic blocks to be written. If that fails, try
1185 * a smaller size. We need to be able to write at least a
1186 * log sector, or we're out of luck.
1187 */
1161 bufblks = 1 << ffs(blocks); 1188 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1189 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1190 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1191 if (bufblks < sectbb)
1165 return ENOMEM; 1192 return ENOMEM;
1166 } 1193 }
1167 1194
@@ -1169,7 +1196,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1196 * the buffer in the starting sector not covered by the first
1170 * write below. 1197 * write below.
1171 */ 1198 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1199 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1200 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1201 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1202 if (error)
@@ -1188,7 +1215,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1215 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1216 * If this is the same sector as the above read, skip it.
1190 */ 1217 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1218 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1219 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1220 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1221 balign = BBTOB(ealign - start_block);
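
The balign/ealign arithmetic above, worked through with assumed numbers (sector of 8 basic blocks, simplified to a single write): writing blocks 13..22 must not clobber the unlogged parts of the boundary sectors, so both are pre-read before the sector-aligned write.

#include <stdio.h>

#define round_down(x, y) ((x) & ~((y) - 1))     /* power-of-two y */

int main(void)
{
        int sectbb = 8;                         /* assumed sector size in BBs */
        int start_block = 13, blocks = 10;
        int end_block = start_block + blocks;   /* 23 */

        int balign = round_down(start_block, sectbb);   /* 8 */
        int ealign = round_down(end_block, sectbb);     /* 16 */

        if (balign != start_block)
                printf("pre-read head sector at block %d\n", balign);
        if (start_block + blocks > ealign)
                printf("pre-read tail sector at block %d\n", ealign);
        return 0;
}
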
@@ -1408,6 +1435,7 @@ xlog_recover_add_item(
1408 1435
1409STATIC int 1436STATIC int
1410xlog_recover_add_to_cont_trans( 1437xlog_recover_add_to_cont_trans(
1438 struct log *log,
1411 xlog_recover_t *trans, 1439 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1440 xfs_caddr_t dp,
1413 int len) 1441 int len)
@@ -1434,6 +1462,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1462 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1463 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1464 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1465 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1466 return 0;
1438} 1467}
1439 1468
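
Continuation records are reassembled by growing the item's last region and appending, the same realloc-and-memcpy pattern as above. A standalone sketch with a hypothetical region struct:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct region { char *addr; int len; };

/* Append 'len' bytes of continuation data to the last region, as
 * xlog_recover_add_to_cont_trans() does with its reallocation. */
static int add_cont(struct region *r, const char *dp, int len)
{
        char *ptr = realloc(r->addr, r->len + len);

        if (!ptr)
                return -1;
        memcpy(ptr + r->len, dp, len);  /* splice new bytes on the end */
        r->addr = ptr;
        r->len += len;
        return 0;
}

int main(void)
{
        struct region r = { 0 };

        add_cont(&r, "first-", 6);
        add_cont(&r, "second", 6);
        printf("%.*s (%d bytes)\n", r.len, r.addr, r.len);
        free(r.addr);
        return 0;
}
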
@@ -1452,6 +1481,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1481 */
1453STATIC int 1482STATIC int
1454xlog_recover_add_to_trans( 1483xlog_recover_add_to_trans(
1484 struct log *log,
1455 xlog_recover_t *trans, 1485 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1486 xfs_caddr_t dp,
1457 int len) 1487 int len)
@@ -1510,6 +1540,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1540 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1541 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1542 item->ri_cnt++;
1543 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1544 return 0;
1514} 1545}
1515 1546
@@ -1521,20 +1552,22 @@ xlog_recover_add_to_trans(
1521 */ 1552 */
1522STATIC int 1553STATIC int
1523xlog_recover_reorder_trans( 1554xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1555 struct log *log,
1556 xlog_recover_t *trans,
1557 int pass)
1525{ 1558{
1526 xlog_recover_item_t *item, *n; 1559 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1560 LIST_HEAD(sort_list);
1528 1561
1529 list_splice_init(&trans->r_itemq, &sort_list); 1562 list_splice_init(&trans->r_itemq, &sort_list);
1530 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1563 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1531 xfs_buf_log_format_t *buf_f; 1564 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1532
1533 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
1534 1565
1535 switch (ITEM_TYPE(item)) { 1566 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1567 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1568 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1569 trace_xfs_log_recover_item_reorder_head(log,
1570 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1571 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1572 break;
1540 } 1573 }
@@ -1543,6 +1576,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1576 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1577 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1578 case XFS_LI_EFI:
1579 trace_xfs_log_recover_item_reorder_tail(log,
1580 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1581 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1582 break;
1548 default: 1583 default:
@@ -1592,8 +1627,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1627 /*
1593 * If this isn't a cancel buffer item, then just return. 1628 * If this isn't a cancel buffer item, then just return.
1594 */ 1629 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1630 if (!(flags & XFS_BLF_CANCEL)) {
1631 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1632 return;
1633 }
1597 1634
1598 /* 1635 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1636 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1664,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1664 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1665 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1666 nextp->bc_refcount++;
1667 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1668 return;
1631 } 1669 }
1632 prevp = nextp; 1670 prevp = nextp;
@@ -1640,13 +1678,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1678 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1679 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1680 prevp->bc_next = bcp;
1681 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1682}
1644 1683
1645/* 1684/*
1646 * Check to see whether the buffer being recovered has a corresponding 1685 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1686 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1687 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1688 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1689 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1690 * if this is the last reference.
1652 * 1691 *
@@ -1671,7 +1710,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1710 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1711 * so this buffer must not be cancelled.
1673 */ 1712 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1713 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1714 return 0;
1676 } 1715 }
1677 1716
@@ -1683,7 +1722,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1722 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1723 * in pass one, so this buffer has not been cancelled.
1685 */ 1724 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1725 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1726 return 0;
1688 } 1727 }
1689 1728
@@ -1702,7 +1741,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1741 * one in the table and remove it if this is the
1703 * last reference. 1742 * last reference.
1704 */ 1743 */
1705 if (flags & XFS_BLI_CANCEL) { 1744 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1745 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1746 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1747 if (prevp == NULL) {
@@ -1722,7 +1761,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1761 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1762 * return 0 so that the buffer is NOT cancelled.
1724 */ 1763 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1764 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1765 return 0;
1727} 1766}
1728 1767
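
Pass 1 and pass 2 cooperate through a refcounted table keyed by (blkno, len). A compact sketch of that protocol, with a flat array standing in for the kernel's hash chains:

#include <stdio.h>

struct cancel { long blkno; int len; int refcount; };

static struct cancel table[64];
static int ncancel;

/* Pass 1: record every XFS_BLF_CANCEL buffer, bumping refcounts. */
static void cancel_add(long blkno, int len)
{
        for (int i = 0; i < ncancel; i++) {
                if (table[i].blkno == blkno && table[i].len == len) {
                        table[i].refcount++;
                        return;
                }
        }
        table[ncancel++] = (struct cancel){ blkno, len, 1 };
}

/* Pass 2: a matching entry means "do not replay"; a cancel item
 * itself also drops one reference as it streams past. */
static int buffer_cancelled(long blkno, int len, int is_cancel_item)
{
        for (int i = 0; i < ncancel; i++) {
                if (table[i].blkno == blkno && table[i].len == len) {
                        if (is_cancel_item && --table[i].refcount == 0)
                                table[i] = table[--ncancel];
                        return 1;
                }
        }
        return 0;
}

int main(void)
{
        cancel_add(100, 8);
        printf("%d\n", buffer_cancelled(100, 8, 0));    /* 1: skip replay */
        printf("%d\n", buffer_cancelled(200, 8, 0));    /* 0: replay */
        return 0;
}
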
@@ -1779,6 +1818,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1818 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1819 unsigned int map_size = 0;
1781 1820
1821 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1822
1782 switch (buf_f->blf_type) { 1823 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1824 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1825 data_map = buf_f->blf_data_map;
@@ -1822,8 +1863,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1863 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1864 bit);
1824 ASSERT(nbits > 0); 1865 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1866 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1867 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1868 item_index++;
1828 } 1869 }
1829 1870
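
The shift arithmetic above maps the dirty bitmap back to byte ranges: XFS_BLF_CHUNK is 128 bytes, so XFS_BLF_SHIFT is 7, and a run of 2 set bits starting at bit 3 describes bytes 384..639 of the buffer. A quick check of that conversion:

#include <stdio.h>

#define XFS_BLF_SHIFT 7         /* log2 of the 128-byte logging chunk */

int main(void)
{
        int bit = 3, nbits = 2;         /* a run found by xfs_contig_bits() */

        int reg_buf_offset = bit << XFS_BLF_SHIFT;      /* 384 */
        int reg_buf_bytes  = nbits << XFS_BLF_SHIFT;    /* 256 */

        printf("bytes [%d, %d)\n", reg_buf_offset,
               reg_buf_offset + reg_buf_bytes);         /* [384, 640) */
        return 0;
}
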
@@ -1837,7 +1878,7 @@ xlog_recover_do_inode_buffer(
1837 } 1878 }
1838 1879
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1880 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1881 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1882 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1883
1843 /* 1884 /*
@@ -1845,9 +1886,8 @@ xlog_recover_do_inode_buffer(
1845 * current di_next_unlinked field. Extract its value 1886 * current di_next_unlinked field. Extract its value
1846 * and copy it to the buffer copy. 1887 * and copy it to the buffer copy.
1847 */ 1888 */
1848 logged_nextp = (xfs_agino_t *) 1889 logged_nextp = item->ri_buf[item_index].i_addr +
1849 ((char *)(item->ri_buf[item_index].i_addr) + 1890 next_unlinked_offset - reg_buf_offset;
1850 (next_unlinked_offset - reg_buf_offset));
1851 if (unlikely(*logged_nextp == 0)) { 1891 if (unlikely(*logged_nextp == 0)) {
1852 xfs_fs_cmn_err(CE_ALERT, mp, 1892 xfs_fs_cmn_err(CE_ALERT, mp,
1853 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1893 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
@@ -1874,6 +1914,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1914/*ARGSUSED*/
1875STATIC void 1915STATIC void
1876xlog_recover_do_reg_buffer( 1916xlog_recover_do_reg_buffer(
1917 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1918 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1919 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1920 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1926,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1926 unsigned int map_size = 0;
1886 int error; 1927 int error;
1887 1928
1929 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1930
1888 switch (buf_f->blf_type) { 1931 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1932 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1933 data_map = buf_f->blf_data_map;
@@ -1900,9 +1943,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1943 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1944 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1945 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1946 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1947 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1948 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1949
1907 /* 1950 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1951 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1954,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1954 */
1912 error = 0; 1955 error = 0;
1913 if (buf_f->blf_flags & 1956 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1957 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1958 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1959 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1960 "XFS: NULL dquot in %s.", __func__);
@@ -1923,8 +1966,7 @@ xlog_recover_do_reg_buffer(
1923 item->ri_buf[i].i_len, __func__); 1966 item->ri_buf[i].i_len, __func__);
1924 goto next; 1967 goto next;
1925 } 1968 }
1926 error = xfs_qm_dqcheck((xfs_disk_dquot_t *) 1969 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
1927 item->ri_buf[i].i_addr,
1928 -1, 0, XFS_QMOPT_DOWARN, 1970 -1, 0, XFS_QMOPT_DOWARN,
1929 "dquot_buf_recover"); 1971 "dquot_buf_recover");
1930 if (error) 1972 if (error)
@@ -1932,9 +1974,9 @@ xlog_recover_do_reg_buffer(
1932 } 1974 }
1933 1975
1934 memcpy(xfs_buf_offset(bp, 1976 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1977 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1978 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1979 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1980 next:
1939 i++; 1981 i++;
1940 bit += nbits; 1982 bit += nbits;
@@ -2083,6 +2125,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2125{
2084 uint type; 2126 uint type;
2085 2127
2128 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2129
2086 /* 2130 /*
2087 * Filesystems are required to send in quota flags at mount time. 2131 * Filesystems are required to send in quota flags at mount time.
2088 */ 2132 */
@@ -2091,11 +2135,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2135 }
2092 2136
2093 type = 0; 2137 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2138 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2139 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2140 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2141 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2142 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2143 type |= XFS_DQ_GROUP;
2100 /* 2144 /*
2101 * This type of quotas was turned off, so ignore this buffer 2145 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2147,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2147 if (log->l_quotaoffs_flag & type)
2104 return; 2148 return;
2105 2149
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2150 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2151}
2108 2152
2109/* 2153/*
@@ -2116,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2160 * here which overlaps that may be stale.
2117 * 2161 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2162 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2163 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2164 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2165 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2166 * file data before we crash we don't end up replaying old, freed
@@ -2135,7 +2179,7 @@ xlog_recover_do_buffer_trans(
2135 xlog_recover_item_t *item, 2179 xlog_recover_item_t *item,
2136 int pass) 2180 int pass)
2137{ 2181{
2138 xfs_buf_log_format_t *buf_f; 2182 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2139 xfs_mount_t *mp; 2183 xfs_mount_t *mp;
2140 xfs_buf_t *bp; 2184 xfs_buf_t *bp;
2141 int error; 2185 int error;
@@ -2145,12 +2189,10 @@ xlog_recover_do_buffer_trans(
2145 ushort flags; 2189 ushort flags;
2146 uint buf_flags; 2190 uint buf_flags;
2147 2191
2148 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2149
2150 if (pass == XLOG_RECOVER_PASS1) { 2192 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2193 /*
2152 * In this pass we're only looking for buf items 2194 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2195 * with the XFS_BLF_CANCEL bit set.
2154 */ 2196 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2197 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2198 return 0;
@@ -2164,9 +2206,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2206 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2207 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2208 if (cancel) {
2209 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2210 return 0;
2168 } 2211 }
2169 } 2212 }
2213 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2214 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2215 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2216 blkno = buf_f->blf_blkno;
@@ -2185,7 +2229,7 @@ xlog_recover_do_buffer_trans(
2185 2229
2186 mp = log->l_mp; 2230 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2231 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2232 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2233 buf_flags |= XBF_MAPPED;
2190 2234
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2235 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2242,13 @@ xlog_recover_do_buffer_trans(
2198 } 2242 }
2199 2243
2200 error = 0; 2244 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2245 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2246 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2247 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2248 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2249 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2250 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2251 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2252 }
2209 if (error) 2253 if (error)
2210 return XFS_ERROR(error); 2254 return XFS_ERROR(error);
@@ -2265,10 +2309,9 @@ xlog_recover_do_inode_trans(
2265 } 2309 }
2266 2310
2267 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2311 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2268 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; 2312 in_f = item->ri_buf[0].i_addr;
2269 } else { 2313 } else {
2270 in_f = (xfs_inode_log_format_t *)kmem_alloc( 2314 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2271 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2272 need_free = 1; 2315 need_free = 1;
2273 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2316 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2274 if (error) 2317 if (error)
@@ -2284,8 +2327,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2327 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2328 in_f->ilf_len, 0)) {
2286 error = 0; 2329 error = 0;
2330 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2331 goto error;
2288 } 2332 }
2333 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2334
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2335 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2336 XBF_LOCK);
@@ -2314,7 +2359,7 @@ xlog_recover_do_inode_trans(
2314 error = EFSCORRUPTED; 2359 error = EFSCORRUPTED;
2315 goto error; 2360 goto error;
2316 } 2361 }
2317 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); 2362 dicp = item->ri_buf[1].i_addr;
2318 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2363 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2319 xfs_buf_relse(bp); 2364 xfs_buf_relse(bp);
2320 xfs_fs_cmn_err(CE_ALERT, mp, 2365 xfs_fs_cmn_err(CE_ALERT, mp,
@@ -2337,6 +2382,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2382 /* do nothing */
2338 } else { 2383 } else {
2339 xfs_buf_relse(bp); 2384 xfs_buf_relse(bp);
2385 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2386 error = 0;
2341 goto error; 2387 goto error;
2342 } 2388 }
@@ -2404,7 +2450,7 @@ xlog_recover_do_inode_trans(
2404 } 2450 }
2405 2451
2406 /* The core is in in-core format */ 2452 /* The core is in in-core format */
2407 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); 2453 xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2408 2454
2409 /* the rest is in on-disk format */ 2455 /* the rest is in on-disk format */
2410 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2456 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
@@ -2521,7 +2567,7 @@ xlog_recover_do_quotaoff_trans(
2521 return (0); 2567 return (0);
2522 } 2568 }
2523 2569
2524 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2570 qoff_f = item->ri_buf[0].i_addr;
2525 ASSERT(qoff_f); 2571 ASSERT(qoff_f);
2526 2572
2527 /* 2573 /*
@@ -2565,9 +2611,8 @@ xlog_recover_do_dquot_trans(
2565 if (mp->m_qflags == 0) 2611 if (mp->m_qflags == 0)
2566 return (0); 2612 return (0);
2567 2613
2568 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2614 recddq = item->ri_buf[1].i_addr;
2569 2615 if (recddq == NULL) {
2570 if (item->ri_buf[1].i_addr == NULL) {
2571 cmn_err(CE_ALERT, 2616 cmn_err(CE_ALERT,
2572 "XFS: NULL dquot in %s.", __func__); 2617 "XFS: NULL dquot in %s.", __func__);
2573 return XFS_ERROR(EIO); 2618 return XFS_ERROR(EIO);
@@ -2597,7 +2642,7 @@ xlog_recover_do_dquot_trans(
2597 * The other possibility, of course, is that the quota subsystem was 2642 * The other possibility, of course, is that the quota subsystem was
2598 * removed since the last mount - ENOSYS. 2643 * removed since the last mount - ENOSYS.
2599 */ 2644 */
2600 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2645 dq_f = item->ri_buf[0].i_addr;
2601 ASSERT(dq_f); 2646 ASSERT(dq_f);
2602 if ((error = xfs_qm_dqcheck(recddq, 2647 if ((error = xfs_qm_dqcheck(recddq,
2603 dq_f->qlf_id, 2648 dq_f->qlf_id,
@@ -2664,7 +2709,7 @@ xlog_recover_do_efi_trans(
2664 return 0; 2709 return 0;
2665 } 2710 }
2666 2711
2667 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2712 efi_formatp = item->ri_buf[0].i_addr;
2668 2713
2669 mp = log->l_mp; 2714 mp = log->l_mp;
2670 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2715 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
@@ -2710,7 +2755,7 @@ xlog_recover_do_efd_trans(
2710 return; 2755 return;
2711 } 2756 }
2712 2757
2713 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2758 efd_formatp = item->ri_buf[0].i_addr;
2714 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2759 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2715 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2760 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2716 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2761 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
@@ -2758,11 +2803,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2803 int error = 0;
2759 xlog_recover_item_t *item; 2804 xlog_recover_item_t *item;
2760 2805
2761 error = xlog_recover_reorder_trans(trans); 2806 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2807 if (error)
2763 return error; 2808 return error;
2764 2809
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2810 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2811 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2812 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2813 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2814 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2965,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2965 error = xlog_recover_unmount_trans(trans);
2920 break; 2966 break;
2921 case XLOG_WAS_CONT_TRANS: 2967 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2968 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2969 trans, dp,
2970 be32_to_cpu(ohead->oh_len));
2924 break; 2971 break;
2925 case XLOG_START_TRANS: 2972 case XLOG_START_TRANS:
2926 xlog_warn( 2973 xlog_warn(
@@ -2930,7 +2977,7 @@ xlog_recover_process_data(
2930 break; 2977 break;
2931 case 0: 2978 case 0:
2932 case XLOG_CONTINUE_TRANS: 2979 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2980 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2981 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2982 break;
2936 default: 2983 default:
@@ -3139,7 +3186,7 @@ xlog_recover_process_one_iunlink(
3139 int error; 3186 int error;
3140 3187
3141 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3188 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3142 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3189 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3143 if (error) 3190 if (error)
3144 goto fail; 3191 goto fail;
3145 3192
@@ -3331,42 +3378,6 @@ xlog_pack_data(
3331 } 3378 }
3332} 3379}
3333 3380
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3381STATIC void
3371xlog_unpack_data( 3382xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3383 xlog_rec_header_t *rhead,
@@ -3390,8 +3401,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3401 dp += BBSIZE;
3391 } 3402 }
3392 } 3403 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3404}
3396 3405
3397STATIC int 3406STATIC int
@@ -3490,7 +3499,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3499 hblks = 1;
3491 } 3500 }
3492 } else { 3501 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3502 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3503 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3504 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3505 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3955,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3955 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3956 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3957 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3958 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3959 __uint64_t freeblks;
3955 __uint64_t itotal; 3960 __uint64_t itotal;
@@ -3984,30 +3989,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 3989 xfs_buf_relse(agibp);
3985 } 3990 }
3986 } 3991 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 3992}
4013#endif /* DEBUG */ 3993#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..aeb9d72ebf6e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -25,13 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 30#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 31#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 32#include "xfs_dinode.h"
36#include "xfs_inode.h" 33#include "xfs_inode.h"
37#include "xfs_btree.h" 34#include "xfs_btree.h"
@@ -268,10 +265,10 @@ xfs_sb_validate_fsb_count(
268 265
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 266#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 267 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 268 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 269#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 270 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 271 return EFBIG;
275#endif 272#endif
276 return 0; 273 return 0;
277} 274}
@@ -393,7 +390,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 390 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 391 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 392 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 393 return XFS_ERROR(EFBIG);
397 } 394 }
398 395
399 if (unlikely(sbp->sb_inprogress)) { 396 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +410,6 @@ xfs_mount_validate_sb(
413 return 0; 410 return 0;
414} 411}
415 412
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 413int
428xfs_initialize_perag( 414xfs_initialize_perag(
429 xfs_mount_t *mp, 415 xfs_mount_t *mp,
@@ -436,13 +422,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 422 xfs_agino_t agino;
437 xfs_ino_t ino; 423 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 424 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 425 int error = -ENOMEM;
441 426
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 427 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 428 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 429 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +437,18 @@ xfs_initialize_perag(
456 } 437 }
457 if (!first_initialised) 438 if (!first_initialised)
458 first_initialised = index; 439 first_initialised = index;
440
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 441 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 442 if (!pag)
461 goto out_unwind; 443 goto out_unwind;
444 pag->pag_agno = index;
445 pag->pag_mount = mp;
446 rwlock_init(&pag->pag_ici_lock);
447 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
448
462 if (radix_tree_preload(GFP_NOFS)) 449 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 450 goto out_unwind;
451
464 spin_lock(&mp->m_perag_lock); 452 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 453 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 454 BUG();
@@ -469,25 +457,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 457 error = -EEXIST;
470 goto out_unwind; 458 goto out_unwind;
471 } 459 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 460 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 461 radix_tree_preload_end();
476 } 462 }
477 463
478 /* Clear the mount flag if no inode can overflow 32 bits 464 /*
479 * on this filesystem, or if specifically requested.. 465 * If we mount with the inode64 option, or no inode overflows
466 * the legacy 32-bit address space clear the inode32 option.
480 */ 467 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 468 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
469 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
470
471 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 472 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 473 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 474 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 475
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 476 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 477 /*
490 * meet the max inode percentage. 478 * Calculate how much should be reserved for inodes to meet
479 * the max inode percentage.
491 */ 480 */
492 if (mp->m_maxicount) { 481 if (mp->m_maxicount) {
493 __uint64_t icount; 482 __uint64_t icount;
@@ -500,30 +489,28 @@ xfs_initialize_perag(
500 } else { 489 } else {
501 max_metadata = agcount; 490 max_metadata = agcount;
502 } 491 }
492
503 for (index = 0; index < agcount; index++) { 493 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 494 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 495 if (ino > XFS_MAXINUMBER_32) {
506 index++; 496 index++;
507 break; 497 break;
508 } 498 }
509 499
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 500 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 501 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 502 if (index < max_metadata)
514 pag->pagf_metadata = 1; 503 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 504 xfs_perag_put(pag);
517 } 505 }
518 } else { 506 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 507 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 508 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 509 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 510 xfs_perag_put(pag);
525 } 511 }
526 } 512 }
513
527 if (maxagi) 514 if (maxagi)
528 *maxagi = index; 515 *maxagi = index;
529 return 0; 516 return 0;
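The restructuring above folds the old xfs_initialize_perag_icache() into the main loop and, more importantly, finishes initialising each pag (pag_agno, pag_mount, the inode-cache lock and radix tree) before it is inserted into m_perag_tree, so a concurrent lookup can never observe a half-constructed structure. The insertion itself follows the canonical radix-tree preload pattern; a generic sketch:

    /* sketch: preload may sleep, so do it before taking the spinlock */
    if (radix_tree_preload(GFP_NOFS))
        return -ENOMEM;
    spin_lock(&mp->m_perag_lock);
    error = radix_tree_insert(&mp->m_perag_tree, index, pag);
    spin_unlock(&mp->m_perag_lock);
    radix_tree_preload_end();       /* re-enables preemption */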
@@ -1009,7 +996,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 996 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 997 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 998 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 999 return XFS_ERROR(EFBIG);
1013 } 1000 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1001 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1002 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1006,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1006 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1007 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1008 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1009 error = XFS_ERROR(EFBIG);
1023 return error; 1010 return error;
1024 } 1011 }
1025 1012
@@ -1027,7 +1014,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1014 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1015 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1016 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1017 return XFS_ERROR(EFBIG);
1031 } 1018 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1019 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1020 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1024,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1024 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1025 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1026 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1027 error = XFS_ERROR(EFBIG);
1041 return error; 1028 return error;
1042 } 1029 }
1043 } 1030 }
@@ -1254,7 +1241,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1241 * Allocate and initialize the per-ag data.
1255 */ 1242 */
1256 spin_lock_init(&mp->m_perag_lock); 1243 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1244 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1245 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1246 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1247 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
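The GFP_NOFS to GFP_ATOMIC change for m_perag_tree matters because the gfp mask passed to INIT_RADIX_TREE is what the tree uses for node allocations at insertion time, and insertions into this tree happen under the m_perag_lock spinlock (see xfs_initialize_perag above), where sleeping is not allowed. In practice radix_tree_preload() keeps nodes pre-allocated, so GFP_ATOMIC is only the non-sleeping fallback:

    INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);  /* inserts occur under a spinlock */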
@@ -1310,7 +1297,7 @@ xfs_mountfs(
1310 * Get and sanity-check the root inode. 1297 * Get and sanity-check the root inode.
1311 * Save the pointer to it in the mount structure. 1298 * Save the pointer to it in the mount structure.
1312 */ 1299 */
1313 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); 1300 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1314 if (error) { 1301 if (error) {
1315 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1302 cmn_err(CE_WARN, "XFS: failed to read root inode");
1316 goto out_log_dealloc; 1303 goto out_log_dealloc;
@@ -1405,13 +1392,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1392 xfs_qm_mount_quotas(mp);
1406 } 1393 }
1407 1394
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1395 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1396 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1397 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..622da2179a57 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,65 +66,6 @@ struct xfs_nameops;
66struct xfs_ail; 66struct xfs_ail;
67struct xfs_quotainfo; 67struct xfs_quotainfo;
68 68
69
70/*
71 * Prototypes and functions for the Data Migration subsystem.
72 */
73
74typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
75 xfs_off_t, size_t, int, int *);
76typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
77typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
78typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
79 struct xfs_inode *, dm_right_t,
80 struct xfs_inode *, dm_right_t,
81 const unsigned char *, const unsigned char *,
82 mode_t, int, int);
83typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
84 char *, char *);
85typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
86 dm_right_t, mode_t, int, int);
87
88typedef struct xfs_dmops {
89 xfs_send_data_t xfs_send_data;
90 xfs_send_mmap_t xfs_send_mmap;
91 xfs_send_destroy_t xfs_send_destroy;
92 xfs_send_namesp_t xfs_send_namesp;
93 xfs_send_mount_t xfs_send_mount;
94 xfs_send_unmount_t xfs_send_unmount;
95} xfs_dmops_t;
96
97#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \
98 (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED)
99
100#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
101 (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
102#define XFS_SEND_MMAP(mp, vma,fl) \
103 (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
104#define XFS_SEND_DESTROY(mp, ip,right) \
105 (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
106#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
107 (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
108#define XFS_SEND_MOUNT(mp,right,path,name) \
109 (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
110#define XFS_SEND_PREUNMOUNT(mp) \
111do { \
112 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
113 (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \
114 (mp)->m_rootip, DM_RIGHT_NULL, \
115 (mp)->m_rootip, DM_RIGHT_NULL, \
116 NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
117 } \
118} while (0)
119#define XFS_SEND_UNMOUNT(mp) \
120do { \
121 if (mp->m_flags & XFS_MOUNT_DMAPI) { \
122 (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \
123 DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \
124 } \
125} while (0)
126
127
128#ifdef HAVE_PERCPU_SB 69#ifdef HAVE_PERCPU_SB
129 70
130/* 71/*
@@ -241,8 +182,6 @@ typedef struct xfs_mount {
241 uint m_chsize; /* size of next field */ 182 uint m_chsize; /* size of next field */
242 struct xfs_chash *m_chash; /* fs private inode per-cluster 183 struct xfs_chash *m_chash; /* fs private inode per-cluster
243 * hash table */ 184 * hash table */
244 struct xfs_dmops *m_dm_ops; /* vector of DMI ops */
245 struct xfs_qmops *m_qm_ops; /* vector of XQM ops */
246 atomic_t m_active_trans; /* number trans frozen */ 185 atomic_t m_active_trans; /* number trans frozen */
247#ifdef HAVE_PERCPU_SB 186#ifdef HAVE_PERCPU_SB
248 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 187 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -259,7 +198,7 @@ typedef struct xfs_mount {
259 wait_queue_head_t m_wait_single_sync_task; 198 wait_queue_head_t m_wait_single_sync_task;
260 __int64_t m_update_flags; /* sb flags we need to update 199 __int64_t m_update_flags; /* sb flags we need to update
261 on the next remount,rw */ 200 on the next remount,rw */
262 struct list_head m_mplist; /* inode shrinker mount list */ 201 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
263} xfs_mount_t; 202} xfs_mount_t;
264 203
265/* 204/*
@@ -268,7 +207,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 207#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 208 must be synchronous except
270 for space allocations */ 209 for space allocations */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 210#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 211#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 212#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
274 operations, typically for 213 operations, typically for
@@ -281,8 +220,6 @@ typedef struct xfs_mount {
281#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 220#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
282#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 221#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
283#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 222#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
284#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
285 /* osyncisdsync is now default*/
286#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above 223#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
287 * 32 bits in size */ 224 * 32 bits in size */
 288#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */ 225#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* user wants 32bit inodes */
@@ -439,11 +376,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
439 376
440extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 377extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
441 378
442extern int xfs_dmops_get(struct xfs_mount *);
443extern void xfs_dmops_put(struct xfs_mount *);
444
445extern struct xfs_dmops xfs_dmcore_xfs;
446
447#endif /* __KERNEL__ */ 379#endif /* __KERNEL__ */
448 380
449extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 381extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index fc1cda23b817..8fca957200df 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -24,12 +24,9 @@
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h" 26#include "xfs_dir2.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 30#include "xfs_dinode.h"
34#include "xfs_inode.h" 31#include "xfs_inode.h"
35#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -116,20 +113,7 @@ xfs_rename(
116 int spaceres; 113 int spaceres;
117 int num_inodes; 114 int num_inodes;
118 115
119 xfs_itrace_entry(src_dp); 116 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
120 xfs_itrace_entry(target_dp);
121
122 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
123 DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
124 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
125 src_dp, DM_RIGHT_NULL,
126 target_dp, DM_RIGHT_NULL,
127 src_name->name, target_name->name,
128 0, 0, 0);
129 if (error)
130 return error;
131 }
132 /* Return through std_return after this point. */
133 117
134 new_parent = (src_dp != target_dp); 118 new_parent = (src_dp != target_dp);
135 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); 119 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
@@ -184,26 +168,14 @@ xfs_rename(
184 /* 168 /*
185 * Join all the inodes to the transaction. From this point on, 169 * Join all the inodes to the transaction. From this point on,
186 * we can rely on either trans_commit or trans_cancel to unlock 170 * we can rely on either trans_commit or trans_cancel to unlock
187 * them. Note that we need to add a vnode reference to the 171 * them.
188 * directories since trans_commit & trans_cancel will decrement
189 * them when they unlock the inodes. Also, we need to be careful
190 * not to add an inode to the transaction more than once.
191 */ 172 */
192 IHOLD(src_dp); 173 xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
193 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 174 if (new_parent)
194 175 xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
195 if (new_parent) { 176 xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
196 IHOLD(target_dp); 177 if (target_ip)
197 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 178 xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
198 }
199
200 IHOLD(src_ip);
201 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
202
203 if (target_ip) {
204 IHOLD(target_ip);
205 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
206 }
207 179
208 /* 180 /*
209 * If we are using project inheritance, we only allow renames 181 * If we are using project inheritance, we only allow renames
@@ -369,26 +341,13 @@ xfs_rename(
369 * trans_commit will unlock src_ip, target_ip & decrement 341 * trans_commit will unlock src_ip, target_ip & decrement
370 * the vnode references. 342 * the vnode references.
371 */ 343 */
372 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 344 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
373
374 /* Fall through to std_return with error = 0 or errno from
375 * xfs_trans_commit */
376std_return:
377 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
378 DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
379 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
380 src_dp, DM_RIGHT_NULL,
381 target_dp, DM_RIGHT_NULL,
382 src_name->name, target_name->name,
383 0, error, 0);
384 }
385 return error;
386 345
387 abort_return: 346 abort_return:
388 cancel_flags |= XFS_TRANS_ABORT; 347 cancel_flags |= XFS_TRANS_ABORT;
389 /* FALLTHROUGH */
390 error_return: 348 error_return:
391 xfs_bmap_cancel(&free_list); 349 xfs_bmap_cancel(&free_list);
392 xfs_trans_cancel(tp, cancel_flags); 350 xfs_trans_cancel(tp, cancel_flags);
393 goto std_return; 351 std_return:
352 return error;
394} 353}
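xfs_trans_ijoin_ref() collapses the old IHOLD() + xfs_trans_ijoin() pair: it joins the locked inode to the transaction and takes the reference that trans_commit/trans_cancel will later drop, so the caller no longer balances vnode references by hand and the tail of xfs_rename() shrinks to a single std_return. A hedged sketch of the resulting caller pattern (helper names as used elsewhere in XFS):

    xfs_lock_two_inodes(src_dp, src_ip, XFS_ILOCK_EXCL);
    xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);  /* trans now owns ref + lock */
    xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
    ...
    return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);  /* unlocks, drops refs */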
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..891260fea11e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,17 +25,10 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 30#include "xfs_dinode.h"
36#include "xfs_inode.h" 31#include "xfs_inode.h"
37#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 32#include "xfs_alloc.h"
40#include "xfs_bmap.h" 33#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
@@ -129,7 +122,7 @@ xfs_growfs_rt_alloc(
129 cancelflags |= XFS_TRANS_ABORT; 122 cancelflags |= XFS_TRANS_ABORT;
130 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, 123 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
131 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, 124 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
132 resblks, &map, &nmap, &flist, NULL); 125 resblks, &map, &nmap, &flist);
133 if (!error && nmap < 1) 126 if (!error && nmap < 1)
134 error = XFS_ERROR(ENOSPC); 127 error = XFS_ERROR(ENOSPC);
135 if (error) 128 if (error)
@@ -2247,7 +2240,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2241 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2242 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2243 return XFS_ERROR(EFBIG);
2251 } 2244 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2245 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2246 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2249,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2249 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2250 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2251 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2252 return XFS_ERROR(EFBIG);
2260 return error; 2253 return error;
2261 } 2254 }
2262 xfs_buf_relse(bp); 2255 xfs_buf_relse(bp);
@@ -2277,12 +2270,12 @@ xfs_rtmount_inodes(
2277 sbp = &mp->m_sb; 2270 sbp = &mp->m_sb;
2278 if (sbp->sb_rbmino == NULLFSINO) 2271 if (sbp->sb_rbmino == NULLFSINO)
2279 return 0; 2272 return 0;
2280 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); 2273 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
2281 if (error) 2274 if (error)
2282 return error; 2275 return error;
2283 ASSERT(mp->m_rbmip != NULL); 2276 ASSERT(mp->m_rbmip != NULL);
2284 ASSERT(sbp->sb_rsumino != NULLFSINO); 2277 ASSERT(sbp->sb_rsumino != NULLFSINO);
2285 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); 2278 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
2286 if (error) { 2279 if (error) {
2287 IRELE(mp->m_rbmip); 2280 IRELE(mp->m_rbmip);
2288 return error; 2281 return error;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
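Turning xfs_rtmount_init() into a static inline buys argument type checking and an explicit warning instead of a silent ENOSYS, at no cost when CONFIG_XFS_RT is off. Note that the remaining xfs_rtmount_inodes(m) macro takes a parameter named m but expands to ((mp)->...), so it only compiles because every caller happens to pass a variable called mp; the same inline treatment would fix that too. A sketch, not part of this patch:

    static inline int
    xfs_rtmount_inodes(
        xfs_mount_t     *mp)
    {
        if (mp->m_sb.sb_rblocks == 0)
            return 0;
        cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
        return ENOSYS;
    }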
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index e336742a58a4..56861d5daaef 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,27 +24,12 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 29#include "xfs_dinode.h"
36#include "xfs_inode.h" 30#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_itable.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_attr.h"
43#include "xfs_bmap.h"
44#include "xfs_error.h" 31#include "xfs_error.h"
45#include "xfs_buf_item.h"
46#include "xfs_rw.h" 32#include "xfs_rw.h"
47#include "xfs_trace.h"
48 33
49/* 34/*
50 * Force a shutdown of the filesystem instantly while keeping 35 * Force a shutdown of the filesystem instantly while keeping
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -24,16 +25,12 @@
24#include "xfs_trans.h" 25#include "xfs_trans.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
31#include "xfs_da_btree.h" 30#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 34#include "xfs_dinode.h"
38#include "xfs_inode.h" 35#include "xfs_inode.h"
39#include "xfs_btree.h" 36#include "xfs_btree.h"
@@ -44,148 +41,494 @@
44#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 43#include "xfs_inode_item.h"
47 44#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 45
57kmem_zone_t *xfs_trans_zone; 46kmem_zone_t *xfs_trans_zone;
47kmem_zone_t *xfs_log_item_desc_zone;
58 48
59 49
60/* 50/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 51 * Various log reservation values.
62 * due to register overflow from temporaries in the calculations. 52 *
53 * These are based on the size of the file system block because that is what
54 * most transactions manipulate. Each adds in an additional 128 bytes per
55 * item logged to try to account for the overhead of the transaction mechanism.
56 *
57 * Note: Most of the reservations underestimate the number of allocation
58 * groups into which they could free extents in the xfs_bmap_finish() call.
59 * This is because the number in the worst case is quite high and quite
60 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
61 * extents in only a single AG at a time. This will require changes to the
62 * EFI code as well, however, so that the EFI for the extents not freed is
63 * logged again in each transaction. See SGI PV #261917.
64 *
65 * Reservation functions here avoid a huge stack in xfs_trans_init due to
66 * register overflow from temporaries in the calculations.
63 */ 67 */
64 68
69
70/*
71 * In a write transaction we can allocate a maximum of 2
72 * extents. This gives:
73 * the inode getting the new extents: inode size
74 * the inode's bmap btree: max depth * block size
75 * the agfs of the ags from which the extents are allocated: 2 * sector
76 * the superblock free block counter: sector size
77 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
78 * And the bmap_finish transaction can free bmap blocks in a join:
79 * the agfs of the ags containing the blocks: 2 * sector size
80 * the agfls of the ags containing the blocks: 2 * sector size
81 * the super block free block counter: sector size
82 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
83 */
65STATIC uint 84STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 85xfs_calc_write_reservation(
86 struct xfs_mount *mp)
67{ 87{
68 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 88 return XFS_DQUOT_LOGRES(mp) +
89 MAX((mp->m_sb.sb_inodesize +
90 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
91 2 * mp->m_sb.sb_sectsize +
92 mp->m_sb.sb_sectsize +
93 XFS_ALLOCFREE_LOG_RES(mp, 2) +
94 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
95 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
96 (2 * mp->m_sb.sb_sectsize +
97 2 * mp->m_sb.sb_sectsize +
98 mp->m_sb.sb_sectsize +
99 XFS_ALLOCFREE_LOG_RES(mp, 2) +
100 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
69} 101}
70 102
103/*
104 * In truncating a file we free up to two extents at once. We can modify:
105 * the inode being truncated: inode size
106 * the inode's bmap btree: (max depth + 1) * block size
107 * And the bmap_finish transaction can free the blocks and bmap blocks:
108 * the agf for each of the ags: 4 * sector size
109 * the agfl for each of the ags: 4 * sector size
110 * the super block to reflect the freed blocks: sector size
111 * worst case split in allocation btrees per extent assuming 4 extents:
112 * 4 exts * 2 trees * (2 * max depth - 1) * block size
113 * the inode btree: max depth * blocksize
114 * the allocation btrees: 2 trees * (max depth - 1) * block size
115 */
71STATIC uint 116STATIC uint
72xfs_calc_itruncate_reservation(xfs_mount_t *mp) 117xfs_calc_itruncate_reservation(
118 struct xfs_mount *mp)
73{ 119{
74 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 120 return XFS_DQUOT_LOGRES(mp) +
121 MAX((mp->m_sb.sb_inodesize +
122 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
123 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
124 (4 * mp->m_sb.sb_sectsize +
125 4 * mp->m_sb.sb_sectsize +
126 mp->m_sb.sb_sectsize +
127 XFS_ALLOCFREE_LOG_RES(mp, 4) +
128 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
129 128 * 5 +
130 XFS_ALLOCFREE_LOG_RES(mp, 1) +
131 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
132 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
75} 133}
76 134
135/*
 136 * In renaming files we can modify:
137 * the four inodes involved: 4 * inode size
138 * the two directory btrees: 2 * (max depth + v2) * dir block size
139 * the two directory bmap btrees: 2 * max depth * block size
140 * And the bmap_finish transaction can free dir and bmap blocks (two sets
141 * of bmap blocks) giving:
142 * the agf for the ags in which the blocks live: 3 * sector size
143 * the agfl for the ags in which the blocks live: 3 * sector size
144 * the superblock for the free block count: sector size
145 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
146 */
77STATIC uint 147STATIC uint
78xfs_calc_rename_reservation(xfs_mount_t *mp) 148xfs_calc_rename_reservation(
149 struct xfs_mount *mp)
79{ 150{
80 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 151 return XFS_DQUOT_LOGRES(mp) +
152 MAX((4 * mp->m_sb.sb_inodesize +
153 2 * XFS_DIROP_LOG_RES(mp) +
154 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
155 (3 * mp->m_sb.sb_sectsize +
156 3 * mp->m_sb.sb_sectsize +
157 mp->m_sb.sb_sectsize +
158 XFS_ALLOCFREE_LOG_RES(mp, 3) +
159 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
81} 160}
82 161
162/*
163 * For creating a link to an inode:
164 * the parent directory inode: inode size
165 * the linked inode: inode size
166 * the directory btree could split: (max depth + v2) * dir block size
167 * the directory bmap btree could join or split: (max depth + v2) * blocksize
168 * And the bmap_finish transaction can free some bmap blocks giving:
169 * the agf for the ag in which the blocks live: sector size
170 * the agfl for the ag in which the blocks live: sector size
171 * the superblock for the free block count: sector size
172 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
173 */
83STATIC uint 174STATIC uint
84xfs_calc_link_reservation(xfs_mount_t *mp) 175xfs_calc_link_reservation(
176 struct xfs_mount *mp)
85{ 177{
86 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 178 return XFS_DQUOT_LOGRES(mp) +
179 MAX((mp->m_sb.sb_inodesize +
180 mp->m_sb.sb_inodesize +
181 XFS_DIROP_LOG_RES(mp) +
182 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
183 (mp->m_sb.sb_sectsize +
184 mp->m_sb.sb_sectsize +
185 mp->m_sb.sb_sectsize +
186 XFS_ALLOCFREE_LOG_RES(mp, 1) +
187 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
87} 188}
88 189
190/*
191 * For removing a directory entry we can modify:
192 * the parent directory inode: inode size
193 * the removed inode: inode size
194 * the directory btree could join: (max depth + v2) * dir block size
195 * the directory bmap btree could join or split: (max depth + v2) * blocksize
196 * And the bmap_finish transaction can free the dir and bmap blocks giving:
197 * the agf for the ag in which the blocks live: 2 * sector size
198 * the agfl for the ag in which the blocks live: 2 * sector size
199 * the superblock for the free block count: sector size
200 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
201 */
89STATIC uint 202STATIC uint
90xfs_calc_remove_reservation(xfs_mount_t *mp) 203xfs_calc_remove_reservation(
204 struct xfs_mount *mp)
91{ 205{
92 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 206 return XFS_DQUOT_LOGRES(mp) +
207 MAX((mp->m_sb.sb_inodesize +
208 mp->m_sb.sb_inodesize +
209 XFS_DIROP_LOG_RES(mp) +
210 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
211 (2 * mp->m_sb.sb_sectsize +
212 2 * mp->m_sb.sb_sectsize +
213 mp->m_sb.sb_sectsize +
214 XFS_ALLOCFREE_LOG_RES(mp, 2) +
215 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
93} 216}
94 217
218/*
219 * For symlink we can modify:
220 * the parent directory inode: inode size
221 * the new inode: inode size
222 * the inode btree entry: 1 block
223 * the directory btree: (max depth + v2) * dir block size
224 * the directory inode's bmap btree: (max depth + v2) * block size
225 * the blocks for the symlink: 1 kB
226 * Or in the first xact we allocate some inodes giving:
227 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
228 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
229 * the inode btree: max depth * blocksize
230 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
231 */
95STATIC uint 232STATIC uint
96xfs_calc_symlink_reservation(xfs_mount_t *mp) 233xfs_calc_symlink_reservation(
234 struct xfs_mount *mp)
97{ 235{
98 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 236 return XFS_DQUOT_LOGRES(mp) +
237 MAX((mp->m_sb.sb_inodesize +
238 mp->m_sb.sb_inodesize +
239 XFS_FSB_TO_B(mp, 1) +
240 XFS_DIROP_LOG_RES(mp) +
241 1024 +
242 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
243 (2 * mp->m_sb.sb_sectsize +
244 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
245 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
246 XFS_ALLOCFREE_LOG_RES(mp, 1) +
247 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
248 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
99} 249}
100 250
251/*
252 * For create we can modify:
253 * the parent directory inode: inode size
254 * the new inode: inode size
255 * the inode btree entry: block size
256 * the superblock for the nlink flag: sector size
257 * the directory btree: (max depth + v2) * dir block size
258 * the directory inode's bmap btree: (max depth + v2) * block size
259 * Or in the first xact we allocate some inodes giving:
260 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
261 * the superblock for the nlink flag: sector size
262 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
263 * the inode btree: max depth * blocksize
264 * the allocation btrees: 2 trees * (max depth - 1) * block size
265 */
101STATIC uint 266STATIC uint
102xfs_calc_create_reservation(xfs_mount_t *mp) 267xfs_calc_create_reservation(
268 struct xfs_mount *mp)
103{ 269{
104 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 270 return XFS_DQUOT_LOGRES(mp) +
271 MAX((mp->m_sb.sb_inodesize +
272 mp->m_sb.sb_inodesize +
273 mp->m_sb.sb_sectsize +
274 XFS_FSB_TO_B(mp, 1) +
275 XFS_DIROP_LOG_RES(mp) +
276 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
277 (3 * mp->m_sb.sb_sectsize +
278 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
279 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
280 XFS_ALLOCFREE_LOG_RES(mp, 1) +
281 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
282 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
105} 283}
106 284
285/*
286 * Making a new directory is the same as creating a new file.
287 */
107STATIC uint 288STATIC uint
108xfs_calc_mkdir_reservation(xfs_mount_t *mp) 289xfs_calc_mkdir_reservation(
290 struct xfs_mount *mp)
109{ 291{
110 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 292 return xfs_calc_create_reservation(mp);
111} 293}
112 294
295/*
296 * In freeing an inode we can modify:
297 * the inode being freed: inode size
298 * the super block free inode counter: sector size
299 * the agi hash list and counters: sector size
300 * the inode btree entry: block size
301 * the on disk inode before ours in the agi hash list: inode cluster size
302 * the inode btree: max depth * blocksize
303 * the allocation btrees: 2 trees * (max depth - 1) * block size
304 */
113STATIC uint 305STATIC uint
114xfs_calc_ifree_reservation(xfs_mount_t *mp) 306xfs_calc_ifree_reservation(
307 struct xfs_mount *mp)
115{ 308{
116 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 309 return XFS_DQUOT_LOGRES(mp) +
310 mp->m_sb.sb_inodesize +
311 mp->m_sb.sb_sectsize +
312 mp->m_sb.sb_sectsize +
313 XFS_FSB_TO_B(mp, 1) +
314 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
315 XFS_INODE_CLUSTER_SIZE(mp)) +
316 128 * 5 +
317 XFS_ALLOCFREE_LOG_RES(mp, 1) +
318 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
319 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
117} 320}
118 321
322/*
323 * When only changing the inode we log the inode and possibly the superblock
324 * We also add a bit of slop for the transaction stuff.
325 */
119STATIC uint 326STATIC uint
120xfs_calc_ichange_reservation(xfs_mount_t *mp) 327xfs_calc_ichange_reservation(
328 struct xfs_mount *mp)
121{ 329{
122 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 330 return XFS_DQUOT_LOGRES(mp) +
331 mp->m_sb.sb_inodesize +
332 mp->m_sb.sb_sectsize +
333 512;
334
123} 335}
124 336
337/*
338 * Growing the data section of the filesystem.
339 * superblock
340 * agi and agf
341 * allocation btrees
342 */
125STATIC uint 343STATIC uint
126xfs_calc_growdata_reservation(xfs_mount_t *mp) 344xfs_calc_growdata_reservation(
345 struct xfs_mount *mp)
127{ 346{
128 return XFS_CALC_GROWDATA_LOG_RES(mp); 347 return mp->m_sb.sb_sectsize * 3 +
348 XFS_ALLOCFREE_LOG_RES(mp, 1) +
349 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
129} 350}
130 351
352/*
353 * Growing the rt section of the filesystem.
354 * In the first set of transactions (ALLOC) we allocate space to the
355 * bitmap or summary files.
356 * superblock: sector size
357 * agf of the ag from which the extent is allocated: sector size
358 * bmap btree for bitmap/summary inode: max depth * blocksize
359 * bitmap/summary inode: inode size
360 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
361 */
131STATIC uint 362STATIC uint
132xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 363xfs_calc_growrtalloc_reservation(
364 struct xfs_mount *mp)
133{ 365{
134 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 366 return 2 * mp->m_sb.sb_sectsize +
367 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
368 mp->m_sb.sb_inodesize +
369 XFS_ALLOCFREE_LOG_RES(mp, 1) +
370 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
371 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
135} 372}
136 373
374/*
375 * Growing the rt section of the filesystem.
376 * In the second set of transactions (ZERO) we zero the new metadata blocks.
377 * one bitmap/summary block: blocksize
378 */
137STATIC uint 379STATIC uint
138xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 380xfs_calc_growrtzero_reservation(
381 struct xfs_mount *mp)
139{ 382{
140 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 383 return mp->m_sb.sb_blocksize + 128;
141} 384}
142 385
386/*
387 * Growing the rt section of the filesystem.
388 * In the third set of transactions (FREE) we update metadata without
389 * allocating any new blocks.
390 * superblock: sector size
391 * bitmap inode: inode size
392 * summary inode: inode size
393 * one bitmap block: blocksize
394 * summary blocks: new summary size
395 */
143STATIC uint 396STATIC uint
144xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 397xfs_calc_growrtfree_reservation(
398 struct xfs_mount *mp)
145{ 399{
146 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 400 return mp->m_sb.sb_sectsize +
401 2 * mp->m_sb.sb_inodesize +
402 mp->m_sb.sb_blocksize +
403 mp->m_rsumsize +
404 128 * 5;
147} 405}
148 406
407/*
408 * Logging the inode modification timestamp on a synchronous write.
409 * inode
410 */
149STATIC uint 411STATIC uint
150xfs_calc_swrite_reservation(xfs_mount_t *mp) 412xfs_calc_swrite_reservation(
413 struct xfs_mount *mp)
151{ 414{
152 return XFS_CALC_SWRITE_LOG_RES(mp); 415 return mp->m_sb.sb_inodesize + 128;
153} 416}
154 417
418/*
419 * Logging the inode mode bits when writing a setuid/setgid file
420 * inode
421 */
155STATIC uint 422STATIC uint
156xfs_calc_writeid_reservation(xfs_mount_t *mp) 423xfs_calc_writeid_reservation(xfs_mount_t *mp)
157{ 424{
158 return XFS_CALC_WRITEID_LOG_RES(mp); 425 return mp->m_sb.sb_inodesize + 128;
159} 426}
160 427
428/*
429 * Converting the inode from non-attributed to attributed.
430 * the inode being converted: inode size
431 * agf block and superblock (for block allocation)
432 * the new block (directory sized)
433 * bmap blocks for the new directory block
434 * allocation btrees
435 */
161STATIC uint 436STATIC uint
162xfs_calc_addafork_reservation(xfs_mount_t *mp) 437xfs_calc_addafork_reservation(
438 struct xfs_mount *mp)
163{ 439{
164 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 440 return XFS_DQUOT_LOGRES(mp) +
441 mp->m_sb.sb_inodesize +
442 mp->m_sb.sb_sectsize * 2 +
443 mp->m_dirblksize +
444 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
445 XFS_ALLOCFREE_LOG_RES(mp, 1) +
446 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
447 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
165} 448}
166 449
450/*
451 * Removing the attribute fork of a file
452 * the inode being truncated: inode size
453 * the inode's bmap btree: max depth * block size
454 * And the bmap_finish transaction can free the blocks and bmap blocks:
455 * the agf for each of the ags: 4 * sector size
456 * the agfl for each of the ags: 4 * sector size
457 * the super block to reflect the freed blocks: sector size
458 * worst case split in allocation btrees per extent assuming 4 extents:
459 * 4 exts * 2 trees * (2 * max depth - 1) * block size
460 */
167STATIC uint 461STATIC uint
168xfs_calc_attrinval_reservation(xfs_mount_t *mp) 462xfs_calc_attrinval_reservation(
463 struct xfs_mount *mp)
169{ 464{
170 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 465 return MAX((mp->m_sb.sb_inodesize +
466 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
467 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
468 (4 * mp->m_sb.sb_sectsize +
469 4 * mp->m_sb.sb_sectsize +
470 mp->m_sb.sb_sectsize +
471 XFS_ALLOCFREE_LOG_RES(mp, 4) +
472 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
171} 473}
172 474
475/*
476 * Setting an attribute.
477 * the inode getting the attribute
478 * the superblock for allocations
479 * the agfs extents are allocated from
480 * the attribute btree * max depth
481 * the inode allocation btree
482 * Since attribute transaction space is dependent on the size of the attribute,
483 * the calculation is done partially at mount time and partially at runtime.
484 */
173STATIC uint 485STATIC uint
174xfs_calc_attrset_reservation(xfs_mount_t *mp) 486xfs_calc_attrset_reservation(
487 struct xfs_mount *mp)
175{ 488{
176 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 489 return XFS_DQUOT_LOGRES(mp) +
490 mp->m_sb.sb_inodesize +
491 mp->m_sb.sb_sectsize +
492 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
493 128 * (2 + XFS_DA_NODE_MAXDEPTH);
177} 494}
178 495
496/*
497 * Removing an attribute.
498 * the inode: inode size
499 * the attribute btree could join: max depth * block size
500 * the inode bmap btree could join or split: max depth * block size
501 * And the bmap_finish transaction can free the attr blocks freed giving:
502 * the agf for the ag in which the blocks live: 2 * sector size
503 * the agfl for the ag in which the blocks live: 2 * sector size
504 * the superblock for the free block count: sector size
505 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
506 */
179STATIC uint 507STATIC uint
180xfs_calc_attrrm_reservation(xfs_mount_t *mp) 508xfs_calc_attrrm_reservation(
509 struct xfs_mount *mp)
181{ 510{
182 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 511 return XFS_DQUOT_LOGRES(mp) +
512 MAX((mp->m_sb.sb_inodesize +
513 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
514 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
515 128 * (1 + XFS_DA_NODE_MAXDEPTH +
516 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
517 (2 * mp->m_sb.sb_sectsize +
518 2 * mp->m_sb.sb_sectsize +
519 mp->m_sb.sb_sectsize +
520 XFS_ALLOCFREE_LOG_RES(mp, 2) +
521 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
183} 522}
184 523
524/*
525 * Clearing a bad agino number in an agi hash bucket.
526 */
185STATIC uint 527STATIC uint
186xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 528xfs_calc_clear_agi_bucket_reservation(
529 struct xfs_mount *mp)
187{ 530{
188 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 531 return mp->m_sb.sb_sectsize + 128;
189} 532}
190 533
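With the XFS_CALC_*_LOG_RES macros gone, each reservation is spelled out directly, most as MAX(first-transaction cost, bmap_finish free-path cost), plus 128 bytes of per-item overhead (512 for the ichange slop). The formulas are easy to sanity-check by hand; a worked sketch for xfs_calc_ichange_reservation with a hypothetical geometry of 256-byte inodes and 512-byte sectors, writing D for XFS_DQUOT_LOGRES(mp):

    reservation = D + 256 + 512 + 512 = D + 1280 bytes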
191/* 534/*
@@ -194,11 +537,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
194 */ 537 */
195void 538void
196xfs_trans_init( 539xfs_trans_init(
197 xfs_mount_t *mp) 540 struct xfs_mount *mp)
198{ 541{
199 xfs_trans_reservations_t *resp; 542 struct xfs_trans_reservations *resp = &mp->m_reservations;
200 543
201 resp = &(mp->m_reservations);
202 resp->tr_write = xfs_calc_write_reservation(mp); 544 resp->tr_write = xfs_calc_write_reservation(mp);
203 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 545 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
204 resp->tr_rename = xfs_calc_rename_reservation(mp); 546 resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -253,14 +595,30 @@ _xfs_trans_alloc(
253 tp->t_magic = XFS_TRANS_MAGIC; 595 tp->t_magic = XFS_TRANS_MAGIC;
254 tp->t_type = type; 596 tp->t_type = type;
255 tp->t_mountp = mp; 597 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 598 INIT_LIST_HEAD(&tp->t_items);
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS; 599 INIT_LIST_HEAD(&tp->t_busy);
258 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy));
260 return tp; 600 return tp;
261} 601}
262 602
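t_items and t_busy change from fixed-size log item chunk arrays (XFS_LIC_NUM_SLOTS / XFS_LBC_NUM_SLOTS) to plain list heads; descriptors are now allocated per item in xfs_trans_add_item() below rather than carved out of embedded chunks. Walking a transaction's dirty items becomes the usual list idiom:

    struct xfs_log_item_desc *lidp;

    list_for_each_entry(lidp, &tp->t_items, lid_trans) {
        if (!(lidp->lid_flags & XFS_LID_DIRTY))
            continue;
        /* operate on lidp->lid_item */
    }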
263/* 603/*
604 * Free the transaction structure. If there is more clean up
605 * to do when the structure is freed, add it here.
606 */
607STATIC void
608xfs_trans_free(
609 struct xfs_trans *tp)
610{
611 struct xfs_busy_extent *busyp, *n;
612
613 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
614 xfs_alloc_busy_clear(tp->t_mountp, busyp);
615
616 atomic_dec(&tp->t_mountp->m_active_trans);
617 xfs_trans_free_dqinfo(tp);
618 kmem_zone_free(xfs_trans_zone, tp);
619}
620
621/*
264 * This is called to create a new transaction which will share the 622 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 623 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 624 * unused block and rt extent reservations are also inherited. This
@@ -282,10 +640,8 @@ xfs_trans_dup(
282 ntp->t_magic = XFS_TRANS_MAGIC; 640 ntp->t_magic = XFS_TRANS_MAGIC;
283 ntp->t_type = tp->t_type; 641 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 642 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 643 INIT_LIST_HEAD(&ntp->t_items);
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS; 644 INIT_LIST_HEAD(&ntp->t_busy);
287 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy));
289 645
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 646 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 647 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +777,6 @@ undo_blocks:
421 return error; 777 return error;
422} 778}
423 779
424
425/* 780/*
426 * Record the indicated change to the given field for application 781 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 782 * to the file system's superblock when the transaction commits.
@@ -650,7 +1005,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 1005 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 1006 * still need to update the incore superblock with the changes.
652 */ 1007 */
653STATIC void 1008void
654xfs_trans_unreserve_and_mod_sb( 1009xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 1010 xfs_trans_t *tp)
656{ 1011{
@@ -764,94 +1119,340 @@ xfs_trans_unreserve_and_mod_sb(
764 } 1119 }
765} 1120}
766 1121
767
768/* 1122/*
769 * xfs_trans_commit 1123 * Add the given log item to the transaction's list of log items.
770 * 1124 *
771 * Commit the given transaction to the log a/synchronously. 1125 * The log item will now point to its new descriptor with its li_desc field.
1126 */
1127void
1128xfs_trans_add_item(
1129 struct xfs_trans *tp,
1130 struct xfs_log_item *lip)
1131{
1132 struct xfs_log_item_desc *lidp;
1133
 1134 ASSERT(lip->li_mountp == tp->t_mountp);
 1135 ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
1136
1137 lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
1138
1139 lidp->lid_item = lip;
1140 lidp->lid_flags = 0;
1141 lidp->lid_size = 0;
1142 list_add_tail(&lidp->lid_trans, &tp->t_items);
1143
1144 lip->li_desc = lidp;
1145}
1146
1147STATIC void
1148xfs_trans_free_item_desc(
1149 struct xfs_log_item_desc *lidp)
1150{
1151 list_del_init(&lidp->lid_trans);
1152 kmem_zone_free(xfs_log_item_desc_zone, lidp);
1153}
1154
1155/*
1156 * Unlink and free the given descriptor.
1157 */
1158void
1159xfs_trans_del_item(
1160 struct xfs_log_item *lip)
1161{
1162 xfs_trans_free_item_desc(lip->li_desc);
1163 lip->li_desc = NULL;
1164}
1165
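xfs_trans_add_item() and xfs_trans_del_item() are the new allocation/free pair for descriptors; KM_NOFS on the zalloc keeps the allocator from recursing into the filesystem while a transaction is in progress. An assumed usage sketch, taking the inode log item as the example:

    xfs_trans_add_item(tp, &iip->ili_item);   /* join: allocates lidp, sets li_desc */
    ...
    xfs_trans_del_item(&iip->ili_item);       /* unlink + free, clears li_desc */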
1166/*
1167 * Unlock all of the items of a transaction and free all the descriptors
1168 * of that transaction.
1169 */
1170void
1171xfs_trans_free_items(
1172 struct xfs_trans *tp,
1173 xfs_lsn_t commit_lsn,
1174 int flags)
1175{
1176 struct xfs_log_item_desc *lidp, *next;
1177
1178 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1179 struct xfs_log_item *lip = lidp->lid_item;
1180
1181 lip->li_desc = NULL;
1182
1183 if (commit_lsn != NULLCOMMITLSN)
1184 IOP_COMMITTING(lip, commit_lsn);
1185 if (flags & XFS_TRANS_ABORT)
1186 lip->li_flags |= XFS_LI_ABORTED;
1187 IOP_UNLOCK(lip);
1188
1189 xfs_trans_free_item_desc(lidp);
1190 }
1191}
1192
1193/*
1194 * Unlock the items associated with a transaction.
772 * 1195 *
773 * XFS disk error handling mechanism is not based on a typical 1196 * Items which were not logged should be freed. Those which were logged must
774 * transaction abort mechanism. Logically after the filesystem 1197 * still be tracked so they can be unpinned when the transaction commits.
775 * gets marked 'SHUTDOWN', we can't let any new transactions
776 * be durable - ie. committed to disk - because some metadata might
777 * be inconsistent. In such cases, this returns an error, and the
778 * caller may assume that all locked objects joined to the transaction
779 * have already been unlocked as if the commit had succeeded.
780 * Do not reference the transaction structure after this call.
781 */ 1198 */
782 /*ARGSUSED*/ 1199STATIC void
783int 1200xfs_trans_unlock_items(
784_xfs_trans_commit( 1201 struct xfs_trans *tp,
785 xfs_trans_t *tp, 1202 xfs_lsn_t commit_lsn)
786 uint flags,
787 int *log_flushed)
788{ 1203{
789 xfs_log_iovec_t *log_vector; 1204 struct xfs_log_item_desc *lidp, *next;
790 int nvec; 1205
791 xfs_mount_t *mp; 1206 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
792 xfs_lsn_t commit_lsn; 1207 struct xfs_log_item *lip = lidp->lid_item;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 1208
802 commit_lsn = -1; 1209 lip->li_desc = NULL;
1210
1211 if (commit_lsn != NULLCOMMITLSN)
1212 IOP_COMMITTING(lip, commit_lsn);
1213 IOP_UNLOCK(lip);
1214
1215 /*
1216 * Free the descriptor if the item is not dirty
1217 * within this transaction.
1218 */
1219 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1220 xfs_trans_free_item_desc(lidp);
1221 }
1222}
1223
1224/*
1225 * Total up the number of log iovecs needed to commit this
1226 * transaction. The transaction itself needs one for the
1227 * transaction header. Ask each dirty item in turn how many
1228 * it needs to get the total.
1229 */
1230static uint
1231xfs_trans_count_vecs(
1232 struct xfs_trans *tp)
1233{
1234 int nvecs;
1235 struct xfs_log_item_desc *lidp;
1236
1237 nvecs = 1;
1238
 1239 /* In the non-debug case we need to bail out if we
 1240 * didn't find any log items here: return zero and let trans_commit
 1241 * deal with it.
1242 */
1243 if (list_empty(&tp->t_items)) {
1244 ASSERT(0);
1245 return 0;
1246 }
1247
1248 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1249 /*
1250 * Skip items which aren't dirty in this transaction.
1251 */
1252 if (!(lidp->lid_flags & XFS_LID_DIRTY))
1253 continue;
1254 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1255 nvecs += lidp->lid_size;
1256 }
1257
1258 return nvecs;
1259}
1260
1261/*
1262 * Fill in the vector with pointers to data to be logged
1263 * by this transaction. The transaction header takes
1264 * the first vector, and then each dirty item takes the
1265 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1266 *
1267 * As each item fills in the entries it needs, also pin the item
1268 * so that it cannot be flushed out until the log write completes.
1269 */
1270static void
1271xfs_trans_fill_vecs(
1272 struct xfs_trans *tp,
1273 struct xfs_log_iovec *log_vector)
1274{
1275 struct xfs_log_item_desc *lidp;
1276 struct xfs_log_iovec *vecp;
1277 uint nitems;
 
 	/*
-	 * Determine whether this commit is releasing a permanent
-	 * log reservation or not.
+	 * Skip over the entry for the transaction header, we'll
+	 * fill that in at the end.
 	 */
-	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
-		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-		log_flags = XFS_LOG_REL_PERM_RESERV;
-	} else {
-		log_flags = 0;
+	vecp = log_vector + 1;
+
+	nitems = 0;
+	ASSERT(!list_empty(&tp->t_items));
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		/*
+		 * The item may be marked dirty but not log anything. This can
+		 * be used to get called when a transaction is committed.
+		 */
+		if (lidp->lid_size)
+			nitems++;
+		IOP_FORMAT(lidp->lid_item, vecp);
+		vecp += lidp->lid_size;
+		IOP_PIN(lidp->lid_item);
 	}
-	mp = tp->t_mountp;
 
 	/*
-	 * If there is nothing to be logged by the transaction,
-	 * then unlock all of the items associated with the
-	 * transaction and free the transaction structure.
-	 * Also make sure to return any reserved blocks to
-	 * the free pool.
+	 * Now that we've counted the number of items in this transaction, fill
+	 * in the transaction header. Note that the transaction header does not
+	 * have a log item.
+	 */
+	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_header.th_type = tp->t_type;
+	tp->t_header.th_num_items = nitems;
+	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+	log_vector->i_len = sizeof(xfs_trans_header_t);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
+}
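
Note: taken together, the two functions above produce a vector array whose layout is worth spelling out. A sketch derived from the code above, not text from the patch:

	/*
	 * log_vector[0]      transaction header (XLOG_REG_TYPE_TRANSHDR)
	 * log_vector[1..n]   each dirty item's lid_size iovecs, in t_items order
	 *
	 * n + 1 == nvecs as returned by xfs_trans_count_vecs(), which starts
	 * its count at 1 to reserve slot 0 for the header.
	 */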
1315
1316/*
1317 * The committed item processing consists of calling the committed routine of
1318 * each logged item, updating the item's position in the AIL if necessary, and
1319 * unpinning each item. If the committed routine returns -1, then do nothing
1320 * further with the item because it may have been freed.
1321 *
1322 * Since items are unlocked when they are copied to the incore log, it is
1323 * possible for two transactions to be completing and manipulating the same
1324 * item simultaneously. The AIL lock will protect the lsn field of each item.
1325 * The value of this field can never go backwards.
1326 *
1327 * We unpin the items after repositioning them in the AIL, because otherwise
1328 * they could be immediately flushed and we'd have to race with the flusher
1329 * trying to pull the item from the AIL as we add it.
1330 */
1331void
1332xfs_trans_item_committed(
1333 struct xfs_log_item *lip,
1334 xfs_lsn_t commit_lsn,
1335 int aborted)
1336{
1337 xfs_lsn_t item_lsn;
1338 struct xfs_ail *ailp;
1339
1340 if (aborted)
1341 lip->li_flags |= XFS_LI_ABORTED;
1342 item_lsn = IOP_COMMITTED(lip, commit_lsn);
1343
1344 /* If the committed routine returns -1, item has been freed. */
1345 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
1346 return;
1347
1348 /*
1349 * If the returned lsn is greater than what it contained before, update
1350 * the location of the item in the AIL. If it is not, then do nothing.
1351 * Items can never move backwards in the AIL.
1352 *
1353 * While the new lsn should usually be greater, it is possible that a
1354 * later transaction completing simultaneously with an earlier one
1355 * using the same item could complete first with a higher lsn. This
1356 * would cause the earlier transaction to fail the test below.
 	 */
-shut_us_down:
-	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
-	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
-		xfs_trans_unreserve_and_mod_sb(tp);
+	ailp = lip->li_ailp;
+	spin_lock(&ailp->xa_lock);
+	if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
 		/*
-		 * It is indeed possible for the transaction to be
-		 * not dirty but the dqinfo portion to be. All that
-		 * means is that we have some (non-persistent) quota
-		 * reservations that need to be unreserved.
+		 * This will set the item's lsn to item_lsn and update the
+		 * position of the item in the AIL.
+		 *
+		 * xfs_trans_ail_update() drops the AIL lock.
 		 */
-		xfs_trans_unreserve_and_mod_dquots(tp);
-		if (tp->t_ticket) {
-			commit_lsn = xfs_log_done(mp, tp->t_ticket,
-							NULL, log_flags);
-			if (commit_lsn == -1 && !shutdown)
-				shutdown = XFS_ERROR(EIO);
-		}
-		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
-		xfs_trans_free_busy(tp);
-		xfs_trans_free(tp);
-		XFS_STATS_INC(xs_trans_empty);
-		return (shutdown);
+		xfs_trans_ail_update(ailp, lip, item_lsn);
+	} else {
+		spin_unlock(&ailp->xa_lock);
 	}
-	ASSERT(tp->t_ticket != NULL);
 
 	/*
-	 * If we need to update the superblock, then do it now.
+	 * Now that we've repositioned the item in the AIL, unpin it so it can
+	 * be flushed. Pass information about buffer stale state down from the
+	 * log item flags, if anyone else stales the buffer we do not want to
+	 * pay any attention to it.
 	 */
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
-		xfs_trans_apply_sb_deltas(tp);
-	xfs_trans_apply_dquot_deltas(tp);
+	IOP_UNPIN(lip, 0);
+}
+
1381/*
1382 * This is typically called by the LM when a transaction has been fully
1383 * committed to disk. It needs to unpin the items which have
1384 * been logged by the transaction and update their positions
1385 * in the AIL if necessary.
1386 *
1387 * This also gets called when the transactions didn't get written out
1388 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1389 */
1390STATIC void
1391xfs_trans_committed(
1392 struct xfs_trans *tp,
1393 int abortflag)
1394{
1395 struct xfs_log_item_desc *lidp, *next;
1396
1397 /* Call the transaction's completion callback if there is one. */
1398 if (tp->t_callback != NULL)
1399 tp->t_callback(tp, tp->t_callarg);
1400
1401 list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
1402 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
1403 xfs_trans_free_item_desc(lidp);
1404 }
1405
1406 xfs_trans_free(tp);
1407}
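
Note: the t_callback/t_callarg pair invoked above is installed by the committing caller before the transaction goes to disk. A hypothetical sketch of such a completion hook — the function name and the completion variable are illustrative only, derived from the fields shown above:

	STATIC void
	xfs_foo_commit_done(
		struct xfs_trans	*tp,
		void			*arg)
	{
		/* called by the log manager once tp is fully on disk */
		complete((struct completion *)arg);
	}

		/* caller side, before committing: */
		tp->t_callback = xfs_foo_commit_done;
		tp->t_callarg = &done;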
1408
1409/*
1410 * Called from the trans_commit code when we notice that
1411 * the filesystem is in the middle of a forced shutdown.
1412 */
1413STATIC void
1414xfs_trans_uncommit(
1415 struct xfs_trans *tp,
1416 uint flags)
1417{
1418 struct xfs_log_item_desc *lidp;
1419
1420 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
1421 /*
1422 * Unpin all but those that aren't dirty.
1423 */
1424 if (lidp->lid_flags & XFS_LID_DIRTY)
1425 IOP_UNPIN(lidp->lid_item, 1);
1426 }
1427
1428 xfs_trans_unreserve_and_mod_sb(tp);
1429 xfs_trans_unreserve_and_mod_dquots(tp);
1430
1431 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1432 xfs_trans_free(tp);
1433}
1434
1435/*
1436 * Format the transaction direct to the iclog. This isolates the physical
1437 * transaction commit operation from the logical operation and hence allows
1438 * other methods to be introduced without affecting the existing commit path.
1439 */
1440static int
1441xfs_trans_commit_iclog(
1442 struct xfs_mount *mp,
1443 struct xfs_trans *tp,
1444 xfs_lsn_t *commit_lsn,
1445 int flags)
1446{
1447 int shutdown;
1448 int error;
1449 int log_flags = 0;
1450 struct xlog_in_core *commit_iclog;
1451#define XFS_TRANS_LOGVEC_COUNT 16
1452 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1453 struct xfs_log_iovec *log_vector;
1454 uint nvec;
1455
 
 	/*
 	 * Ask each log item how many log_vector entries it will
@@ -861,8 +1462,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		goto shut_us_down;
+		return ENOMEM;	/* triggers a shutdown! */
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
 	} else {
@@ -877,6 +1477,9 @@ shut_us_down:
 	 */
 	xfs_trans_fill_vecs(tp, log_vector);
 
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
 	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
 
 	/*
@@ -884,18 +1487,19 @@ shut_us_down:
 	 * at any time after this call.  However, all the items associated
 	 * with the transaction are still locked and pinned in memory.
 	 */
-	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
-	tp->t_commit_lsn = commit_lsn;
-	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
-	}
 
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
-	if (error || commit_lsn == -1) {
+	if (error || *commit_lsn == -1) {
 		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
@@ -909,8 +1513,6 @@ shut_us_down:
 	 */
 	xfs_trans_unreserve_and_mod_sb(tp);
 
-	sync = tp->t_flags & XFS_TRANS_SYNC;
-
 	/*
 	 * Tell the LM to call the transaction completion routine
 	 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1555,7 @@ shut_us_down:
 	 * the commit lsn of this transaction for dependency tracking
 	 * purposes.
 	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
+	xfs_trans_unlock_items(tp, *commit_lsn);
 
 	/*
 	 * If we detected a log error earlier, finish committing
@@ -973,156 +1575,195 @@ shut_us_down:
 	 * and the items are released we can finally allow the iclog to
 	 * go to disk.
 	 */
-	error = xfs_log_release_iclog(mp, commit_iclog);
-
-	/*
-	 * If the transaction needs to be synchronous, then force the
-	 * log out now and wait for it.
-	 */
-	if (sync) {
-		if (!error) {
-			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, log_flushed);
-		}
-		XFS_STATS_INC(xs_trans_sync);
-	} else {
-		XFS_STATS_INC(xs_trans_async);
-	}
-
-	return (error);
+	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
-
 /*
- * Total up the number of log iovecs needed to commit this
- * transaction.  The transaction itself needs one for the
- * transaction header.  Ask each dirty item in turn how many
- * it needs to get the total.
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
  */
-STATIC uint
-xfs_trans_count_vecs(
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	int			nvecs;
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_item_desc *lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
 
-	nvecs = 1;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
-	/* In the non-debug case we need to start bailing out if we
-	 * didn't find a log_item here, return zero and let trans_commit
-	 * deal with it.
-	 */
-	if (lidp == NULL)
-		return 0;
+	/* Bail out if we didn't find a log item.  */
+	if (list_empty(&tp->t_items)) {
+		ASSERT(0);
+		return NULL;
+	}
 
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		}
+
+		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		nvecs += lidp->lid_size;
-		lidp = xfs_trans_next_item(tp, lidp);
+		if (!lidp->lid_size)
+			continue;
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The allocated iovec region lies beyond the log vector. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
 	}
 
-	return nvecs;
+	return ret_lv;
 }
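
Note: the function above returns the head of a singly linked chain of xfs_log_vec structures, one per dirty item, each with its iovec array allocated in the same slab object. A sketch of how a consumer might walk it — the CIL formatting code itself is outside this hunk; this loop simply mirrors what xfs_trans_fill_vecs() does for the old path:

	struct xfs_log_vec	*lv;

	for (lv = log_vector; lv != NULL; lv = lv->lv_next) {
		/* lv_niovecs iovecs live contiguously at lv_iovecp */
		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
		IOP_PIN(lv->lv_item);
	}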
 
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
-	xfs_trans_t	*tp,
-	uint		flags)
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*log_vector;
+	int			error;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
-		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
-	}
+	/*
+	 * Get each log item to allocate a vector structure for
+	 * the log item to pass to the log write code.  The
+	 * CIL commit code will format the vector and save it away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
 
-	xfs_trans_unreserve_and_mod_sb(tp);
-	xfs_trans_unreserve_and_mod_dquots(tp);
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
+	return 0;
 }
 
 /*
- * Fill in the vector with pointers to data to be logged
- * by this transaction.  The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ * xfs_trans_commit
  *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
  */
-STATIC void
-xfs_trans_fill_vecs(
-	xfs_trans_t		*tp,
-	xfs_log_iovec_t		*log_vector)
+int
+_xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_iovec_t		*vecp;
-	uint			nitems;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Skip over the entry for the transaction header, we'll
-	 * fill that in at the end.
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
 	 */
-	vecp = log_vector + 1;		/* pointer arithmetic */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
 
-	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
-			continue;
-		}
-		/*
-		 * The item may be marked dirty but not log anything.
-		 * This can be used to get called when a transaction
-		 * is committed.
-		 */
-		if (lidp->lid_size) {
-			nitems++;
-		}
-		IOP_FORMAT(lidp->lid_item, vecp);
-		vecp += lidp->lid_size;	/* pointer arithmetic */
-		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
-	}
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+	if (error == ENOMEM) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		if (!error) {
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
+		}
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
+	}
+
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
 
 	/*
-	 * Now that we've counted the number of items in this
-	 * transaction, fill in the transaction header.
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be.  All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
 	 */
-	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_header.th_type = tp->t_type;
-	tp->t_header.th_num_items = nitems;
-	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
-	log_vector->i_len = sizeof(xfs_trans_header_t);
-	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = XFS_ERROR(EIO);
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
 
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
 
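Note: the comment block above implies a strict caller pattern — commit consumes the transaction on both success and failure, so xfs_trans_cancel() is only valid before commit, and nothing may touch tp afterwards. A sketch under the assumption that xfs_trans_commit(tp, flags) remains the usual wrapper calling _xfs_trans_commit() with a NULL log_flushed:

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT);
	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);	/* nothing joined yet */
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);	/* trans presumably owns the lock ref now */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	error = xfs_trans_commit(tp, 0);
	/* success or failure, tp and its joined items are gone */
	return error;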
 /*
  * Unlock all of the transaction's items and free the transaction.
@@ -1138,12 +1779,6 @@ xfs_trans_cancel(
 	int			flags)
 {
 	int			log_flags;
-#ifdef DEBUG
-	xfs_log_item_chunk_t	*licp;
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_item_t		*lip;
-	int			i;
-#endif
 	xfs_mount_t		*mp = tp->t_mountp;
 
 	/*
@@ -1162,21 +1797,11 @@ xfs_trans_cancel(
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
-	if (!(flags & XFS_TRANS_ABORT)) {
-		licp = &(tp->t_items);
-		while (licp != NULL) {
-			lidp = licp->lic_descs;
-			for (i = 0; i < licp->lic_unused; i++, lidp++) {
-				if (xfs_lic_isfree(licp, i)) {
-					continue;
-				}
-
-				lip = lidp->lid_item;
-				if (!XFS_FORCED_SHUTDOWN(mp))
-					ASSERT(!(lip->li_type == XFS_LI_EFD));
-			}
-			licp = licp->lic_next;
-		}
-	}
+	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item_desc *lidp;
+
+		list_for_each_entry(lidp, &tp->t_items, lid_trans)
+			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+	}
 #endif
 	xfs_trans_unreserve_and_mod_sb(tp);
@@ -1195,25 +1820,10 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
1203
1204/*
1205 * Free the transaction structure. If there is more clean up
1206 * to do when the structure is freed, add it here.
1207 */
1208STATIC void
1209xfs_trans_free(
1210 xfs_trans_t *tp)
1211{
1212 atomic_dec(&tp->t_mountp->m_active_trans);
1213 xfs_trans_free_dqinfo(tp);
1214 kmem_zone_free(xfs_trans_zone, tp);
1215}
1216
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
@@ -1279,178 +1889,6 @@ xfs_trans_roll(
 	if (error)
 		return error;
 
-	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(trans, dp);
+	xfs_trans_ijoin(trans, dp);
 	return 0;
 }
1286
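
Note: the two-argument xfs_trans_ijoin() above is what lets xfs_trans_roll() re-join the same inode to each successor transaction. A hypothetical loop shaped like the callers of xfs_trans_roll() (names and termination condition illustrative only):

	/* keep ip locked and attached across a chain of transactions */
	while (!done) {
		/* ...log at most one reservation's worth of changes to tp... */

		error = xfs_trans_roll(&tp, ip);
		if (error)
			return error;	/* old trans already committed or cancelled */
	}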
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..c13c0f97b494 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
 
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }
+
 /*
  * Transaction types.  Used to distinguish types of buffers.
  */
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_FREE		39
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_TYPE_MAX		41
+#define XFS_TRANS_CHECKPOINT		42
+#define XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
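
Note: both the new XFS_LI_TYPE_DESC list and this XFS_TRANS_TYPES list expand to brace-enclosed value/name pairs, which is the shape __print_symbolic() expects in tracepoint definitions. A sketch of the presumed consumer, a tracepoint's print format:

	TP_printk("type %s", __print_symbolic(__entry->type, XFS_TRANS_TYPES))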
@@ -150,106 +161,14 @@ typedef struct xfs_trans_header {
  * the amount of space needed to log the item it describes
  * once we get to commit processing (see xfs_trans_commit()).
  */
-typedef struct xfs_log_item_desc {
+struct xfs_log_item_desc {
 	struct xfs_log_item	*lid_item;
 	ushort			lid_size;
 	unsigned char		lid_flags;
-	unsigned char		lid_index;
-} xfs_log_item_desc_t;
+	struct list_head	lid_trans;
+};
 
 #define XFS_LID_DIRTY		0x1
161#define XFS_LID_PINNED 0x2
162#define XFS_LID_BUF_STALE 0x8
163
164/*
165 * This structure is used to maintain a chunk list of log_item_desc
166 * structures. The free field is a bitmask indicating which descriptors
167 * in this chunk's array are free. The unused field is the first value
168 * not used since this chunk was allocated.
169 */
170#define XFS_LIC_NUM_SLOTS 15
171typedef struct xfs_log_item_chunk {
172 struct xfs_log_item_chunk *lic_next;
173 ushort lic_free;
174 ushort lic_unused;
175 xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
176} xfs_log_item_chunk_t;
177
178#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
179#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
180
181
182/*
183 * Initialize the given chunk. Set the chunk's free descriptor mask
184 * to indicate that all descriptors are free. The caller gets to set
185 * lic_unused to the right value (0 matches all free). The
186 * lic_descs.lid_index values are set up as each desc is allocated.
187 */
188static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
189{
190 cp->lic_free = XFS_LIC_FREEMASK;
191}
192
193static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
194{
195 cp->lic_descs[slot].lid_index = (unsigned char)(slot);
196}
197
198static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
199{
200 return cp->lic_free & XFS_LIC_FREEMASK;
201}
202
203static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
204{
205 cp->lic_free = XFS_LIC_FREEMASK;
206}
207
208static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
209{
210 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
211}
212
213static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
214{
215 return (cp->lic_free & (1 << slot));
216}
217
218static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
219{
220 cp->lic_free &= ~(1 << slot);
221}
222
223static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
224{
225 cp->lic_free |= 1 << slot;
226}
227
228static inline xfs_log_item_desc_t *
229xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
230{
231 return &(cp->lic_descs[slot]);
232}
233
234static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
235{
236 return (uint)dp->lid_index;
237}
238
239/*
240 * Calculate the address of a chunk given a descriptor pointer:
241 * dp - dp->lid_index give the address of the start of the lic_descs array.
242 * From this we subtract the offset of the lic_descs field in a chunk.
243 * All of this yields the address of the chunk, which is
244 * cast to a chunk pointer.
245 */
246static inline xfs_log_item_chunk_t *
247xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
248{
249 return (xfs_log_item_chunk_t*) \
250 (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \
251 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
252}
 
 #define	XFS_TRANS_MAGIC		0x5452414E	/* 'TRAN' */
 /*
@@ -265,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Values for call flags parameter.
  */
-#define	XFS_TRANS_NOSLEEP		0x1
-#define	XFS_TRANS_WAIT			0x2
 #define	XFS_TRANS_RELEASE_LOG_RES	0x4
 #define	XFS_TRANS_ABORT			0x8
 
@@ -290,24 +207,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 
 
 /*
- * Various log reservation values.
- * These are based on the size of the file system block
- * because that is what most transactions manipulate.
- * Each adds in an additional 128 bytes per item logged to
- * try to account for the overhead of the transaction mechanism.
- *
- * Note:
- * Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish()
- * call.  This is because the number in the worst case is quite high
- * and quite unusual.  In order to fix this we need to change
- * xfs_bmap_finish() to free extents in only a single AG at a time.
- * This will require changes to the EFI code as well, however, so that
- * the EFI for the extents not freed is logged again in each transaction.
- * See bug 261917.
- */
-
-/*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
  * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -331,429 +230,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
 
334/*
335 * In a write transaction we can allocate a maximum of 2
336 * extents. This gives:
337 * the inode getting the new extents: inode size
338 * the inode's bmap btree: max depth * block size
339 * the agfs of the ags from which the extents are allocated: 2 * sector
340 * the superblock free block counter: sector size
341 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
342 * And the bmap_finish transaction can free bmap blocks in a join:
343 * the agfs of the ags containing the blocks: 2 * sector size
344 * the agfls of the ags containing the blocks: 2 * sector size
345 * the super block free block counter: sector size
346 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
347 */
348#define XFS_CALC_WRITE_LOG_RES(mp) \
349 (MAX( \
350 ((mp)->m_sb.sb_inodesize + \
351 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
352 (2 * (mp)->m_sb.sb_sectsize) + \
353 (mp)->m_sb.sb_sectsize + \
354 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
355 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
356 ((2 * (mp)->m_sb.sb_sectsize) + \
357 (2 * (mp)->m_sb.sb_sectsize) + \
358 (mp)->m_sb.sb_sectsize + \
359 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
360 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
 
 #define	XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
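
Note: to put a number on the deleted XFS_CALC_WRITE_LOG_RES() formula above, a worked example under assumed geometry — 4096-byte blocks, 512-byte sectors, 256-byte inodes, a data-fork bmap depth of 5 and a 3-level allocation btree, so XFS_ALLOCFREE_LOG_RES(mp, 2) = 2 exts * 2 trees * (2*3 - 1) * 4096 = 81920 and XFS_ALLOCFREE_LOG_COUNT(mp, 2) = 20:

	arm 1: 256 + 5*4096 + 2*512 + 512 + 81920 + 128*(4 + 5 + 20) = 107904
	arm 2: 2*512 + 2*512 + 512 + 81920 + 128*(5 + 20)            =  87680

so tr_write would be seeded with MAX(arm 1, arm 2), roughly 105 KiB, for this hypothetical filesystem.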
363
364/*
365 * In truncating a file we free up to two extents at once. We can modify:
366 * the inode being truncated: inode size
367 * the inode's bmap btree: (max depth + 1) * block size
368 * And the bmap_finish transaction can free the blocks and bmap blocks:
369 * the agf for each of the ags: 4 * sector size
370 * the agfl for each of the ags: 4 * sector size
371 * the super block to reflect the freed blocks: sector size
372 * worst case split in allocation btrees per extent assuming 4 extents:
373 * 4 exts * 2 trees * (2 * max depth - 1) * block size
374 * the inode btree: max depth * blocksize
375 * the allocation btrees: 2 trees * (max depth - 1) * block size
376 */
377#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
378 (MAX( \
379 ((mp)->m_sb.sb_inodesize + \
380 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
381 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
382 ((4 * (mp)->m_sb.sb_sectsize) + \
383 (4 * (mp)->m_sb.sb_sectsize) + \
384 (mp)->m_sb.sb_sectsize + \
385 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
386 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
387 (128 * 5) + \
388 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
389 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
390 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
391
 #define	XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
393
394/*
395 * In renaming a files we can modify:
396 * the four inodes involved: 4 * inode size
397 * the two directory btrees: 2 * (max depth + v2) * dir block size
398 * the two directory bmap btrees: 2 * max depth * block size
399 * And the bmap_finish transaction can free dir and bmap blocks (two sets
400 * of bmap blocks) giving:
401 * the agf for the ags in which the blocks live: 3 * sector size
402 * the agfl for the ags in which the blocks live: 3 * sector size
403 * the superblock for the free block count: sector size
404 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
405 */
406#define XFS_CALC_RENAME_LOG_RES(mp) \
407 (MAX( \
408 ((4 * (mp)->m_sb.sb_inodesize) + \
409 (2 * XFS_DIROP_LOG_RES(mp)) + \
410 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
411 ((3 * (mp)->m_sb.sb_sectsize) + \
412 (3 * (mp)->m_sb.sb_sectsize) + \
413 (mp)->m_sb.sb_sectsize + \
414 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
415 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
416
 #define	XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
418
419/*
420 * For creating a link to an inode:
421 * the parent directory inode: inode size
422 * the linked inode: inode size
423 * the directory btree could split: (max depth + v2) * dir block size
424 * the directory bmap btree could join or split: (max depth + v2) * blocksize
425 * And the bmap_finish transaction can free some bmap blocks giving:
426 * the agf for the ag in which the blocks live: sector size
427 * the agfl for the ag in which the blocks live: sector size
428 * the superblock for the free block count: sector size
429 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
430 */
431#define XFS_CALC_LINK_LOG_RES(mp) \
432 (MAX( \
433 ((mp)->m_sb.sb_inodesize + \
434 (mp)->m_sb.sb_inodesize + \
435 XFS_DIROP_LOG_RES(mp) + \
436 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
437 ((mp)->m_sb.sb_sectsize + \
438 (mp)->m_sb.sb_sectsize + \
439 (mp)->m_sb.sb_sectsize + \
440 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
441 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
442
 #define	XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
444
445/*
446 * For removing a directory entry we can modify:
447 * the parent directory inode: inode size
448 * the removed inode: inode size
449 * the directory btree could join: (max depth + v2) * dir block size
450 * the directory bmap btree could join or split: (max depth + v2) * blocksize
451 * And the bmap_finish transaction can free the dir and bmap blocks giving:
452 * the agf for the ag in which the blocks live: 2 * sector size
453 * the agfl for the ag in which the blocks live: 2 * sector size
454 * the superblock for the free block count: sector size
455 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
456 */
457#define XFS_CALC_REMOVE_LOG_RES(mp) \
458 (MAX( \
459 ((mp)->m_sb.sb_inodesize + \
460 (mp)->m_sb.sb_inodesize + \
461 XFS_DIROP_LOG_RES(mp) + \
462 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
463 ((2 * (mp)->m_sb.sb_sectsize) + \
464 (2 * (mp)->m_sb.sb_sectsize) + \
465 (mp)->m_sb.sb_sectsize + \
466 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
467 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
468
 #define	XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
470
471/*
472 * For symlink we can modify:
473 * the parent directory inode: inode size
474 * the new inode: inode size
475 * the inode btree entry: 1 block
476 * the directory btree: (max depth + v2) * dir block size
477 * the directory inode's bmap btree: (max depth + v2) * block size
478 * the blocks for the symlink: 1 kB
479 * Or in the first xact we allocate some inodes giving:
480 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
481 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
482 * the inode btree: max depth * blocksize
483 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
484 */
485#define XFS_CALC_SYMLINK_LOG_RES(mp) \
486 (MAX( \
487 ((mp)->m_sb.sb_inodesize + \
488 (mp)->m_sb.sb_inodesize + \
489 XFS_FSB_TO_B(mp, 1) + \
490 XFS_DIROP_LOG_RES(mp) + \
491 1024 + \
492 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
493 (2 * (mp)->m_sb.sb_sectsize + \
494 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
495 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
496 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
497 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
498 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
499
 #define	XFS_SYMLINK_LOG_RES(mp)	((mp)->m_reservations.tr_symlink)
501
502/*
503 * For create we can modify:
504 * the parent directory inode: inode size
505 * the new inode: inode size
506 * the inode btree entry: block size
507 * the superblock for the nlink flag: sector size
508 * the directory btree: (max depth + v2) * dir block size
509 * the directory inode's bmap btree: (max depth + v2) * block size
510 * Or in the first xact we allocate some inodes giving:
511 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
512 * the superblock for the nlink flag: sector size
513 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
514 * the inode btree: max depth * blocksize
515 * the allocation btrees: 2 trees * (max depth - 1) * block size
516 */
517#define XFS_CALC_CREATE_LOG_RES(mp) \
518 (MAX( \
519 ((mp)->m_sb.sb_inodesize + \
520 (mp)->m_sb.sb_inodesize + \
521 (mp)->m_sb.sb_sectsize + \
522 XFS_FSB_TO_B(mp, 1) + \
523 XFS_DIROP_LOG_RES(mp) + \
524 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
525 (3 * (mp)->m_sb.sb_sectsize + \
526 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
527 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
528 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
529 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
530 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
531
 #define	XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
533
534/*
535 * Making a new directory is the same as creating a new file.
536 */
537#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
538
 #define	XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
540
541/*
542 * In freeing an inode we can modify:
543 * the inode being freed: inode size
544 * the super block free inode counter: sector size
545 * the agi hash list and counters: sector size
546 * the inode btree entry: block size
547 * the on disk inode before ours in the agi hash list: inode cluster size
548 * the inode btree: max depth * blocksize
549 * the allocation btrees: 2 trees * (max depth - 1) * block size
550 */
551#define XFS_CALC_IFREE_LOG_RES(mp) \
552 ((mp)->m_sb.sb_inodesize + \
553 (mp)->m_sb.sb_sectsize + \
554 (mp)->m_sb.sb_sectsize + \
555 XFS_FSB_TO_B((mp), 1) + \
556 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
557 (128 * 5) + \
558 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
559 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
560 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
561
562
 #define	XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
564
565/*
566 * When only changing the inode we log the inode and possibly the superblock
567 * We also add a bit of slop for the transaction stuff.
568 */
569#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
570 (mp)->m_sb.sb_sectsize + 512)
571
 #define	XFS_ICHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_ichange)
573
574/*
575 * Growing the data section of the filesystem.
576 * superblock
577 * agi and agf
578 * allocation btrees
579 */
580#define XFS_CALC_GROWDATA_LOG_RES(mp) \
581 ((mp)->m_sb.sb_sectsize * 3 + \
582 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
583 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
584
 #define	XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
586
587/*
588 * Growing the rt section of the filesystem.
589 * In the first set of transactions (ALLOC) we allocate space to the
590 * bitmap or summary files.
591 * superblock: sector size
592 * agf of the ag from which the extent is allocated: sector size
593 * bmap btree for bitmap/summary inode: max depth * blocksize
594 * bitmap/summary inode: inode size
595 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
596 */
597#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
598 (2 * (mp)->m_sb.sb_sectsize + \
599 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
600 (mp)->m_sb.sb_inodesize + \
601 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
602 (128 * \
603 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
604 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
605
 #define	XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
607
608/*
609 * Growing the rt section of the filesystem.
610 * In the second set of transactions (ZERO) we zero the new metadata blocks.
611 * one bitmap/summary block: blocksize
612 */
613#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
614 ((mp)->m_sb.sb_blocksize + 128)
615
 #define	XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
617
618/*
619 * Growing the rt section of the filesystem.
620 * In the third set of transactions (FREE) we update metadata without
621 * allocating any new blocks.
622 * superblock: sector size
623 * bitmap inode: inode size
624 * summary inode: inode size
625 * one bitmap block: blocksize
626 * summary blocks: new summary size
627 */
628#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
629 ((mp)->m_sb.sb_sectsize + \
630 2 * (mp)->m_sb.sb_inodesize + \
631 (mp)->m_sb.sb_blocksize + \
632 (mp)->m_rsumsize + \
633 (128 * 5))
634
 #define	XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
636
637/*
638 * Logging the inode modification timestamp on a synchronous write.
639 * inode
640 */
641#define XFS_CALC_SWRITE_LOG_RES(mp) \
642 ((mp)->m_sb.sb_inodesize + 128)
643
 #define	XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
645
646/* 249/*
647 * Logging the inode timestamps on an fsync -- same as SWRITE 250 * Logging the inode timestamps on an fsync -- same as SWRITE
648 * as long as SWRITE logs the entire inode core 251 * as long as SWRITE logs the entire inode core
649 */ 252 */
650#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 253#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
651
652/*
653 * Logging the inode mode bits when writing a setuid/setgid file
654 * inode
655 */
656#define XFS_CALC_WRITEID_LOG_RES(mp) \
657 ((mp)->m_sb.sb_inodesize + 128)
658
 #define XFS_WRITEID_LOG_RES(mp)		((mp)->m_reservations.tr_swrite)
660
661/*
662 * Converting the inode from non-attributed to attributed.
663 * the inode being converted: inode size
664 * agf block and superblock (for block allocation)
665 * the new block (directory sized)
666 * bmap blocks for the new directory block
667 * allocation btrees
668 */
669#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
670 ((mp)->m_sb.sb_inodesize + \
671 (mp)->m_sb.sb_sectsize * 2 + \
672 (mp)->m_dirblksize + \
673 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
674 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
675 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
676 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
677
 #define XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
679
680/*
681 * Removing the attribute fork of a file
682 * the inode being truncated: inode size
683 * the inode's bmap btree: max depth * block size
684 * And the bmap_finish transaction can free the blocks and bmap blocks:
685 * the agf for each of the ags: 4 * sector size
686 * the agfl for each of the ags: 4 * sector size
687 * the super block to reflect the freed blocks: sector size
688 * worst case split in allocation btrees per extent assuming 4 extents:
689 * 4 exts * 2 trees * (2 * max depth - 1) * block size
690 */
691#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
692 (MAX( \
693 ((mp)->m_sb.sb_inodesize + \
694 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
695 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
696 ((4 * (mp)->m_sb.sb_sectsize) + \
697 (4 * (mp)->m_sb.sb_sectsize) + \
698 (mp)->m_sb.sb_sectsize + \
699 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
700 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
701
 #define XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
703
704/*
705 * Setting an attribute.
706 * the inode getting the attribute
707 * the superblock for allocations
708 * the agfs extents are allocated from
709 * the attribute btree * max depth
710 * the inode allocation btree
711 * Since attribute transaction space is dependent on the size of the attribute,
712 * the calculation is done partially at mount time and partially at runtime.
713 */
714#define XFS_CALC_ATTRSET_LOG_RES(mp) \
715 ((mp)->m_sb.sb_inodesize + \
716 (mp)->m_sb.sb_sectsize + \
717 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
718 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
719
 #define XFS_ATTRSET_LOG_RES(mp, ext)	\
 	((mp)->m_reservations.tr_attrset + \
 	 (ext * (mp)->m_sb.sb_sectsize) + \
 	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
 	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
725
726/*
727 * Removing an attribute.
728 * the inode: inode size
729 * the attribute btree could join: max depth * block size
730 * the inode bmap btree could join or split: max depth * block size
731 * And the bmap_finish transaction can free the attr blocks freed giving:
732 * the agf for the ag in which the blocks live: 2 * sector size
733 * the agfl for the ag in which the blocks live: 2 * sector size
734 * the superblock for the free block count: sector size
735 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
736 */
737#define XFS_CALC_ATTRRM_LOG_RES(mp) \
738 (MAX( \
739 ((mp)->m_sb.sb_inodesize + \
740 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
741 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
742 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
743 ((2 * (mp)->m_sb.sb_sectsize) + \
744 (2 * (mp)->m_sb.sb_sectsize) + \
745 (mp)->m_sb.sb_sectsize + \
746 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
747 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
748
 #define XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
750
751/*
752 * Clearing a bad agino number in an agi hash bucket.
753 */
754#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
755 ((mp)->m_sb.sb_sectsize + 128)
756
 #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
 
 
@@ -805,6 +311,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -820,6 +327,11 @@ typedef struct xfs_log_item {
 							/* buffer item iodone */
 							/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
@@ -833,8 +345,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int);
-	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
@@ -846,8 +357,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_UNPIN(ip, remove)	(*(ip)->li_ops->iop_unpin)(ip, remove)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
 #define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
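
Note: with iop_unpin_remove gone, an item type's ops vector under the new API would be wired roughly as below — the "foo" item is hypothetical again, and only members visible in this patch are shown:

	static struct xfs_item_ops xfs_foo_item_ops = {
		.iop_size	= xfs_foo_item_size,
		.iop_format	= xfs_foo_item_format,
		.iop_pin	= xfs_foo_item_pin,
		.iop_unpin	= xfs_foo_item_unpin,	/* void (*)(xfs_log_item_t *, int remove) */
		.iop_trylock	= xfs_foo_item_trylock,
		.iop_unlock	= xfs_foo_item_unlock,
		.iop_committed	= xfs_foo_item_committed,
		/* remaining members (e.g. the iop_committing hook behind
		 * IOP_COMMITTING used earlier in this patch) elided */
	};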
@@ -864,34 +374,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF	3
 
 /*
867 * This structure is used to maintain a list of block ranges that have been
868 * freed in the transaction. The ranges are listed in the perag[] busy list
869 * between when they're freed and the transaction is committed to disk.
870 */
871
872typedef struct xfs_log_busy_slot {
873 xfs_agnumber_t lbc_ag;
874 ushort lbc_idx; /* index in perag.busy[] */
875} xfs_log_busy_slot_t;
876
877#define XFS_LBC_NUM_SLOTS 31
878typedef struct xfs_log_busy_chunk {
879 struct xfs_log_busy_chunk *lbc_next;
880 uint lbc_free; /* free slots bitmask */
881 ushort lbc_unused; /* first unused */
882 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
883} xfs_log_busy_chunk_t;
884
885#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
886#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
887
888#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
889#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
890#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
891#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
892#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
893
894/*
895 * This is the type of function which can be given to xfs_trans_callback() 377 * This is the type of function which can be given to xfs_trans_callback()
896 * to be called upon the transaction's commit to disk. 378 * to be called upon the transaction's commit to disk.
897 */ 379 */
@@ -939,11 +421,9 @@ typedef struct xfs_trans {
939 int64_t t_rblocks_delta;/* superblock rblocks change */ 421 int64_t t_rblocks_delta;/* superblock rblocks change */
940 int64_t t_rextents_delta;/* superblocks rextents chg */ 422 int64_t t_rextents_delta;/* superblocks rextents chg */
941 int64_t t_rextslog_delta;/* superblocks rextslog chg */ 423 int64_t t_rextslog_delta;/* superblocks rextslog chg */
942 unsigned int t_items_free; /* log item descs free */ 424 struct list_head t_items; /* log item descriptors */
943 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
944 xfs_trans_header_t t_header; /* header for in-log trans */ 425 xfs_trans_header_t t_header; /* header for in-log trans */
945 unsigned int t_busy_free; /* busy descs free */ 426 struct list_head t_busy; /* list of busy extents */
946 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
947 unsigned long t_pflags; /* saved process flags state */ 427 unsigned long t_pflags; /* saved process flags state */
948} xfs_trans_t; 428} xfs_trans_t;
949 429
@@ -993,8 +473,8 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
993void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 473void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
994int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, 474int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
995 xfs_ino_t , uint, uint, struct xfs_inode **); 475 xfs_ino_t , uint, uint, struct xfs_inode **);
996void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); 476void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
997void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); 477void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
998void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); 478void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
999void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); 479void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
1000struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); 480struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
@@ -1017,11 +497,9 @@ int _xfs_trans_commit(xfs_trans_t *,
1017void xfs_trans_cancel(xfs_trans_t *, int); 497void xfs_trans_cancel(xfs_trans_t *, int);
1018int xfs_trans_ail_init(struct xfs_mount *); 498int xfs_trans_ail_init(struct xfs_mount *);
1019void xfs_trans_ail_destroy(struct xfs_mount *); 499void xfs_trans_ail_destroy(struct xfs_mount *);
1020xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1021 xfs_agnumber_t ag,
1022 xfs_extlen_t idx);
1023 500
1024extern kmem_zone_t *xfs_trans_zone; 501extern kmem_zone_t *xfs_trans_zone;
502extern kmem_zone_t *xfs_log_item_desc_zone;
1025 503
1026#endif /* __KERNEL__ */ 504#endif /* __KERNEL__ */
1027 505
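The xfs_trans.h hunks above replace two fixed-size, bitmask-managed chunk structures (the log item descriptor chunks and the xfs_log_busy_chunk_t busy-extent chunks) with plain kernel list heads: t_items and t_busy become struct list_head. A minimal userspace sketch of that idiom follows; the struct list_head helpers, item_desc, and trans below are illustrative stand-ins (the real code uses <linux/list.h> and struct xfs_log_item_desc), not the kernel's definitions.

/* Minimal sketch of the list-based item tracking that replaces the old
 * slot-array chunks.  All names here are hypothetical stand-ins. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

static void list_del(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
	n->prev = n->next = n;
}

/* Stand-in for struct xfs_log_item_desc: one node per joined item,
 * linked from the transaction instead of claimed from a bitmap chunk. */
struct item_desc {
	struct list_head lid_trans;	/* links into tp->t_items */
	unsigned int lid_flags;
	int id;
};

struct trans {
	struct list_head t_items;	/* was xfs_log_item_chunk_t */
	struct list_head t_busy;	/* was xfs_log_busy_chunk_t */
};

int main(void)
{
	struct trans tp;
	struct item_desc a = { .id = 1 }, b = { .id = 2 };
	struct list_head *pos;

	list_init(&tp.t_items);
	list_init(&tp.t_busy);
	list_add_tail(&a.lid_trans, &tp.t_items);	/* like xfs_trans_add_item() */
	list_add_tail(&b.lid_trans, &tp.t_items);

	/* walk, as list_for_each_entry(lidp, &tp->t_items, lid_trans) does */
	for (pos = tp.t_items.next; pos != &tp.t_items; pos = pos->next) {
		struct item_desc *d = (struct item_desc *)
			((char *)pos - offsetof(struct item_desc, lid_trans));
		printf("item %d flags %x\n", d->id, d->lid_flags);
	}

	list_del(&a.lid_trans);		/* like xfs_trans_del_item() */
	return 0;
}

With an unbounded list there is no free-slot accounting to maintain, which is why t_items_free and t_busy_free drop out of xfs_trans above.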
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e799824f7245..dc9069568ff7 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -24,7 +24,6 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dmapi.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..90af025e6839 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,10 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
@@ -40,11 +36,32 @@
40#include "xfs_rw.h" 36#include "xfs_rw.h"
41#include "xfs_trace.h" 37#include "xfs_trace.h"
42 38
39/*
40 * Check to see if a buffer matching the given parameters is already
41 * a part of the given transaction.
42 */
43STATIC struct xfs_buf *
44xfs_trans_buf_item_match(
45 struct xfs_trans *tp,
46 struct xfs_buftarg *target,
47 xfs_daddr_t blkno,
48 int len)
49{
50 struct xfs_log_item_desc *lidp;
51 struct xfs_buf_log_item *blip;
52
53 len = BBTOB(len);
54 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
55 blip = (struct xfs_buf_log_item *)lidp->lid_item;
56 if (blip->bli_item.li_type == XFS_LI_BUF &&
57 XFS_BUF_TARGET(blip->bli_buf) == target &&
58 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
59 XFS_BUF_COUNT(blip->bli_buf) == len)
60 return blip->bli_buf;
61 }
43 62
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 63 return NULL;
45 xfs_daddr_t, int); 64}
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int);
48 65
49/* 66/*
50 * Add the locked buffer to the transaction. 67 * Add the locked buffer to the transaction.
@@ -74,7 +91,7 @@ _xfs_trans_bjoin(
74 xfs_buf_item_init(bp, tp->t_mountp); 91 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 92 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 93 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 94 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 95 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur) 96 if (reset_recur)
80 bip->bli_recur = 0; 97 bip->bli_recur = 0;
@@ -87,7 +104,7 @@ _xfs_trans_bjoin(
87 /* 104 /*
88 * Get a log_item_desc to point at the new item. 105 * Get a log_item_desc to point at the new item.
89 */ 106 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); 107 xfs_trans_add_item(tp, &bip->bli_item);
91 108
92 /* 109 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match() 110 * Initialize b_fsprivate2 so we can find it with incore_match()
@@ -112,14 +129,6 @@ xfs_trans_bjoin(
112 * within the transaction, just increment its lock recursion count 129 * within the transaction, just increment its lock recursion count
113 * and return a pointer to it. 130 * and return a pointer to it.
114 * 131 *
115 * Use the fast path function xfs_trans_buf_item_match() or the buffer
116 * cache routine incore_match() to find the buffer
117 * if it is already owned by this transaction.
118 *
119 * If we don't already own the buffer, use get_buf() to get it.
120 * If it doesn't yet have an associated xfs_buf_log_item structure,
121 * then allocate one and add the item to this transaction.
122 *
123 * If the transaction pointer is NULL, make this just a normal 132 * If the transaction pointer is NULL, make this just a normal
124 * get_buf() call. 133 * get_buf() call.
125 */ 134 */
@@ -149,11 +158,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
149 * have it locked. In this case we just increment the lock 158 * have it locked. In this case we just increment the lock
150 * recursion count and return the buffer to the caller. 159 * recursion count and return the buffer to the caller.
151 */ 160 */
152 if (tp->t_items.lic_next == NULL) { 161 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
153 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
154 } else {
155 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
156 }
157 if (bp != NULL) { 162 if (bp != NULL) {
158 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 163 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
159 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) 164 if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +264,6 @@ int xfs_error_mod = 33;
259 * within the transaction and already read in, just increment its 264 * within the transaction and already read in, just increment its
260 * lock recursion count and return a pointer to it. 265 * lock recursion count and return a pointer to it.
261 * 266 *
262 * Use the fast path function xfs_trans_buf_item_match() or the buffer
263 * cache routine incore_match() to find the buffer
264 * if it is already owned by this transaction.
265 *
266 * If we don't already own the buffer, use read_buf() to get it.
267 * If it doesn't yet have an associated xfs_buf_log_item structure,
268 * then allocate one and add the item to this transaction.
269 *
270 * If the transaction pointer is NULL, make this just a normal 267 * If the transaction pointer is NULL, make this just a normal
271 * read_buf() call. 268 * read_buf() call.
272 */ 269 */
@@ -328,11 +325,7 @@ xfs_trans_read_buf(
328 * If the buffer is not yet read in, then we read it in, increment 325 * If the buffer is not yet read in, then we read it in, increment
329 * the lock recursion count, and return it to the caller. 326 * the lock recursion count, and return it to the caller.
330 */ 327 */
331 if (tp->t_items.lic_next == NULL) { 328 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
332 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
333 } else {
334 bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
335 }
336 if (bp != NULL) { 329 if (bp != NULL) {
337 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 330 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
338 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 331 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -467,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
467{ 460{
468 xfs_buf_log_item_t *bip; 461 xfs_buf_log_item_t *bip;
469 xfs_log_item_t *lip; 462 xfs_log_item_t *lip;
470 xfs_log_item_desc_t *lidp;
471 463
472 /* 464 /*
473 * Default to a normal brelse() call if the tp is NULL. 465 * Default to a normal brelse() call if the tp is NULL.
@@ -495,16 +487,9 @@ xfs_trans_brelse(xfs_trans_t *tp,
495 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 487 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
496 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 488 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 489 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 490 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
499 ASSERT(atomic_read(&bip->bli_refcount) > 0); 491 ASSERT(atomic_read(&bip->bli_refcount) > 0);
500 492
501 /*
502 * Find the item descriptor pointing to this buffer's
503 * log item. It must be there.
504 */
505 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
506 ASSERT(lidp != NULL);
507
508 trace_xfs_trans_brelse(bip); 493 trace_xfs_trans_brelse(bip);
509 494
510 /* 495 /*
@@ -520,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
520 * If the buffer is dirty within this transaction, we can't 505 * If the buffer is dirty within this transaction, we can't
521 * release it until we commit. 506 * release it until we commit.
522 */ 507 */
523 if (lidp->lid_flags & XFS_LID_DIRTY) 508 if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
524 return; 509 return;
525 510
526 /* 511 /*
@@ -537,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
537 /* 522 /*
538 * Free up the log item descriptor tracking the released item. 523 * Free up the log item descriptor tracking the released item.
539 */ 524 */
540 xfs_trans_free_item(tp, lidp); 525 xfs_trans_del_item(&bip->bli_item);
541 526
542 /* 527 /*
543 * Clear the hold flag in the buf log item if it is set. 528 * Clear the hold flag in the buf log item if it is set.
@@ -603,7 +588,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
603 588
604 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 589 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
605 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 590 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
606 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 591 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
607 ASSERT(atomic_read(&bip->bli_refcount) > 0); 592 ASSERT(atomic_read(&bip->bli_refcount) > 0);
608 bip->bli_flags |= XFS_BLI_HOLD; 593 bip->bli_flags |= XFS_BLI_HOLD;
609 trace_xfs_trans_bhold(bip); 594 trace_xfs_trans_bhold(bip);
@@ -625,7 +610,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
625 610
626 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 611 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
627 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 612 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
628 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 613 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
629 ASSERT(atomic_read(&bip->bli_refcount) > 0); 614 ASSERT(atomic_read(&bip->bli_refcount) > 0);
630 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 615 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
631 bip->bli_flags &= ~XFS_BLI_HOLD; 616 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -649,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
649 uint last) 634 uint last)
650{ 635{
651 xfs_buf_log_item_t *bip; 636 xfs_buf_log_item_t *bip;
652 xfs_log_item_desc_t *lidp;
653 637
654 ASSERT(XFS_BUF_ISBUSY(bp)); 638 ASSERT(XFS_BUF_ISBUSY(bp));
655 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 639 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -674,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
674 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 658 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
675 ASSERT(atomic_read(&bip->bli_refcount) > 0); 659 ASSERT(atomic_read(&bip->bli_refcount) > 0);
676 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 660 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
677 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; 661 bip->bli_item.li_cb = xfs_buf_iodone;
678 662
679 trace_xfs_trans_log_buf(bip); 663 trace_xfs_trans_log_buf(bip);
680 664
@@ -688,15 +672,11 @@ xfs_trans_log_buf(xfs_trans_t *tp,
688 bip->bli_flags &= ~XFS_BLI_STALE; 672 bip->bli_flags &= ~XFS_BLI_STALE;
689 ASSERT(XFS_BUF_ISSTALE(bp)); 673 ASSERT(XFS_BUF_ISSTALE(bp));
690 XFS_BUF_UNSTALE(bp); 674 XFS_BUF_UNSTALE(bp);
691 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 675 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
692 } 676 }
693 677
694 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
695 ASSERT(lidp != NULL);
696
697 tp->t_flags |= XFS_TRANS_DIRTY; 678 tp->t_flags |= XFS_TRANS_DIRTY;
698 lidp->lid_flags |= XFS_LID_DIRTY; 679 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
699 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
700 bip->bli_flags |= XFS_BLI_LOGGED; 680 bip->bli_flags |= XFS_BLI_LOGGED;
701 xfs_buf_item_log(bip, first, last); 681 xfs_buf_item_log(bip, first, last);
702} 682}
@@ -725,7 +705,6 @@ xfs_trans_binval(
725 xfs_trans_t *tp, 705 xfs_trans_t *tp,
726 xfs_buf_t *bp) 706 xfs_buf_t *bp)
727{ 707{
728 xfs_log_item_desc_t *lidp;
729 xfs_buf_log_item_t *bip; 708 xfs_buf_log_item_t *bip;
730 709
731 ASSERT(XFS_BUF_ISBUSY(bp)); 710 ASSERT(XFS_BUF_ISBUSY(bp));
@@ -733,8 +712,6 @@ xfs_trans_binval(
733 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 712 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
734 713
735 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 714 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
736 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
737 ASSERT(lidp != NULL);
738 ASSERT(atomic_read(&bip->bli_refcount) > 0); 715 ASSERT(atomic_read(&bip->bli_refcount) > 0);
739 716
740 trace_xfs_trans_binval(bip); 717 trace_xfs_trans_binval(bip);
@@ -747,9 +724,9 @@ xfs_trans_binval(
747 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 724 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
748 ASSERT(XFS_BUF_ISSTALE(bp)); 725 ASSERT(XFS_BUF_ISSTALE(bp));
749 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 726 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
750 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 727 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
751 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 728 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
752 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 729 ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
753 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 730 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
754 return; 731 return;
755 } 732 }
@@ -759,7 +736,7 @@ xfs_trans_binval(
759 * in the buf log item. The STALE flag will be used in 736 * in the buf log item. The STALE flag will be used in
760 * xfs_buf_item_unpin() to determine if it should clean up 737 * xfs_buf_item_unpin() to determine if it should clean up
761 * when the last reference to the buf item is given up. 738 * when the last reference to the buf item is given up.
762 * We set the XFS_BLI_CANCEL flag in the buf log format structure 739 * We set the XFS_BLF_CANCEL flag in the buf log format structure
763 * and log the buf item. This will be used at recovery time 740 * and log the buf item. This will be used at recovery time
764 * to determine that copies of the buffer in the log before 741 * to determine that copies of the buffer in the log before
765 * this should not be replayed. 742 * this should not be replayed.
@@ -777,26 +754,26 @@ xfs_trans_binval(
777 XFS_BUF_UNDELAYWRITE(bp); 754 XFS_BUF_UNDELAYWRITE(bp);
778 XFS_BUF_STALE(bp); 755 XFS_BUF_STALE(bp);
779 bip->bli_flags |= XFS_BLI_STALE; 756 bip->bli_flags |= XFS_BLI_STALE;
780 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 757 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
781 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 758 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
782 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 759 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
783 memset((char *)(bip->bli_format.blf_data_map), 0, 760 memset((char *)(bip->bli_format.blf_data_map), 0,
784 (bip->bli_format.blf_map_size * sizeof(uint))); 761 (bip->bli_format.blf_map_size * sizeof(uint)));
785 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; 762 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
786 tp->t_flags |= XFS_TRANS_DIRTY; 763 tp->t_flags |= XFS_TRANS_DIRTY;
787} 764}
788 765
789/* 766/*
790 * This call is used to indicate that the buffer contains on-disk 767 * This call is used to indicate that the buffer contains on-disk inodes which
791 * inodes which must be handled specially during recovery. They 768 * must be handled specially during recovery. They require special handling
792 * require special handling because only the di_next_unlinked from 769 * because only the di_next_unlinked from the inodes in the buffer should be
793 * the inodes in the buffer should be recovered. The rest of the 770 * recovered. The rest of the data in the buffer is logged via the inodes
794 * data in the buffer is logged via the inodes themselves. 771 * themselves.
795 * 772 *
796 * format structure so that we'll know what to 773 * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
797 * format structure so that we'll know what to do at recovery time. 774 * transferred to the buffer's log format structure so that we'll know what to
775 * do at recovery time.
798 */ 776 */
799/* ARGSUSED */
800void 777void
801xfs_trans_inode_buf( 778xfs_trans_inode_buf(
802 xfs_trans_t *tp, 779 xfs_trans_t *tp,
@@ -811,7 +788,7 @@ xfs_trans_inode_buf(
811 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 788 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
812 ASSERT(atomic_read(&bip->bli_refcount) > 0); 789 ASSERT(atomic_read(&bip->bli_refcount) > 0);
813 790
814 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 791 bip->bli_flags |= XFS_BLI_INODE_BUF;
815} 792}
816 793
817/* 794/*
@@ -838,12 +815,9 @@ xfs_trans_stale_inode_buf(
838 ASSERT(atomic_read(&bip->bli_refcount) > 0); 815 ASSERT(atomic_read(&bip->bli_refcount) > 0);
839 816
840 bip->bli_flags |= XFS_BLI_STALE_INODE; 817 bip->bli_flags |= XFS_BLI_STALE_INODE;
841 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) 818 bip->bli_item.li_cb = xfs_buf_iodone;
842 xfs_buf_iodone;
843} 819}
844 820
845
846
847/* 821/*
848 * Mark the buffer as being one which contains newly allocated 822 * Mark the buffer as being one which contains newly allocated
849 * inodes. We need to make sure that even if this buffer is 823 * inodes. We need to make sure that even if this buffer is
@@ -893,120 +867,12 @@ xfs_trans_dquot_buf(
893 ASSERT(XFS_BUF_ISBUSY(bp)); 867 ASSERT(XFS_BUF_ISBUSY(bp));
894 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 868 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
895 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 869 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
896 ASSERT(type == XFS_BLI_UDQUOT_BUF || 870 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
897 type == XFS_BLI_PDQUOT_BUF || 871 type == XFS_BLF_PDQUOT_BUF ||
898 type == XFS_BLI_GDQUOT_BUF); 872 type == XFS_BLF_GDQUOT_BUF);
899 873
900 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 874 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
901 ASSERT(atomic_read(&bip->bli_refcount) > 0); 875 ASSERT(atomic_read(&bip->bli_refcount) > 0);
902 876
903 bip->bli_format.blf_flags |= type; 877 bip->bli_format.blf_flags |= type;
904} 878}
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
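The xfs_trans_buf.c diff above collapses the two lookup paths — xfs_trans_buf_item_match() for the embedded chunk and xfs_trans_buf_item_match_all() for the full chunk list — into a single linear scan of the transaction's item list. A compact sketch of that unified lookup, with hypothetical stand-in types (the real code iterates tp->t_items with list_for_each_entry() and uses the XFS_BUF_* accessors):

#include <stdio.h>

#define LI_BUF 0x123c	/* stand-in for XFS_LI_BUF */

struct buf {
	void *target;	/* stand-in for the xfs_buftarg pointer */
	long blkno;
	int count;	/* length in bytes */
};

struct buf_log_item {
	int li_type;
	struct buf *bli_buf;
};

/* One pass over all joined items; a match must agree on device,
 * starting block, and byte length. */
static struct buf *
buf_item_match(struct buf_log_item **items, int nitems,
	       void *target, long blkno, int len_bb)
{
	int len = len_bb << 9;	/* BBTOB(): 512-byte basic blocks to bytes */
	int i;

	for (i = 0; i < nitems; i++) {
		struct buf_log_item *blip = items[i];

		if (blip->li_type == LI_BUF &&
		    blip->bli_buf->target == target &&
		    blip->bli_buf->blkno == blkno &&
		    blip->bli_buf->count == len)
			return blip->bli_buf;
	}
	return NULL;
}

int main(void)
{
	int dev;
	struct buf b = { &dev, 64, 4096 };
	struct buf_log_item bli = { LI_BUF, &b };
	struct buf_log_item *items[] = { &bli };

	printf("hit? %d\n", buf_item_match(items, 1, &dev, 64, 8) == &b);
	return 0;
}

The old fast-path/thorough-path split only existed because items lived in chunks; with a single list there is nothing to special-case.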
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 27cce2a9c7e9..f783d5e9fa70 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -23,7 +23,6 @@
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dmapi.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
29#include "xfs_extfree_item.h" 28#include "xfs_extfree_item.h"
@@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp,
49 /* 48 /*
50 * Get a log_item_desc to point at the new item. 49 * Get a log_item_desc to point at the new item.
51 */ 50 */
52 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); 51 xfs_trans_add_item(tp, &efip->efi_item);
53 52 return efip;
54 return (efip);
55} 53}
56 54
57/* 55/*
@@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
65 xfs_fsblock_t start_block, 63 xfs_fsblock_t start_block,
66 xfs_extlen_t ext_len) 64 xfs_extlen_t ext_len)
67{ 65{
68 xfs_log_item_desc_t *lidp;
69 uint next_extent; 66 uint next_extent;
70 xfs_extent_t *extp; 67 xfs_extent_t *extp;
71 68
72 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
73 ASSERT(lidp != NULL);
74
75 tp->t_flags |= XFS_TRANS_DIRTY; 69 tp->t_flags |= XFS_TRANS_DIRTY;
76 lidp->lid_flags |= XFS_LID_DIRTY; 70 efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
77 71
78 next_extent = efip->efi_next_extent; 72 next_extent = efip->efi_next_extent;
79 ASSERT(next_extent < efip->efi_format.efi_nextents); 73 ASSERT(next_extent < efip->efi_format.efi_nextents);
@@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp,
106 /* 100 /*
107 * Get a log_item_desc to point at the new item. 101 * Get a log_item_desc to point at the new item.
108 */ 102 */
109 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); 103 xfs_trans_add_item(tp, &efdp->efd_item);
110 104 return efdp;
111 return (efdp);
112} 105}
113 106
114/* 107/*
@@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
122 xfs_fsblock_t start_block, 115 xfs_fsblock_t start_block,
123 xfs_extlen_t ext_len) 116 xfs_extlen_t ext_len)
124{ 117{
125 xfs_log_item_desc_t *lidp;
126 uint next_extent; 118 uint next_extent;
127 xfs_extent_t *extp; 119 xfs_extent_t *extp;
128 120
129 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
130 ASSERT(lidp != NULL);
131
132 tp->t_flags |= XFS_TRANS_DIRTY; 121 tp->t_flags |= XFS_TRANS_DIRTY;
133 lidp->lid_flags |= XFS_LID_DIRTY; 122 efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
134 123
135 next_extent = efdp->efd_next_extent; 124 next_extent = efdp->efd_next_extent;
136 ASSERT(next_extent < efdp->efd_format.efd_nextents); 125 ASSERT(next_extent < efdp->efd_format.efd_nextents);
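Both EFI and EFD logging above lose their xfs_trans_find_item() call: since a joined log item always caches its descriptor in li_desc, marking it dirty is now two direct flag ORs. A standalone sketch of the pattern, with stand-in types and flag values:

#include <stdio.h>

#define TRANS_DIRTY 0x01	/* stand-in for XFS_TRANS_DIRTY */
#define LID_DIRTY   0x02	/* stand-in for XFS_LID_DIRTY */

struct item_desc { unsigned int lid_flags; };
struct log_item  { struct item_desc *li_desc; };
struct trans     { unsigned int t_flags; };

/* li_desc is valid for as long as the item is joined, so no lookup
 * (and no ASSERT that the lookup succeeded) is needed. */
static void mark_item_dirty(struct trans *tp, struct log_item *lip)
{
	tp->t_flags |= TRANS_DIRTY;		/* transaction has changes */
	lip->li_desc->lid_flags |= LID_DIRTY;	/* this item must be logged */
}

int main(void)
{
	struct item_desc d = { 0 };
	struct log_item efi = { &d };
	struct trans tp = { 0 };

	mark_item_dirty(&tp, &efi);
	printf("tp %x lid %x\n", tp.t_flags, d.lid_flags);
	return 0;
}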
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 785ff101da0a..cdc53a1050c5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,20 +24,16 @@
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 27#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 28#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 29#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 30#include "xfs_ialloc_btree.h"
33#include "xfs_dir2_sf.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 31#include "xfs_dinode.h"
36#include "xfs_inode.h" 32#include "xfs_inode.h"
37#include "xfs_btree.h" 33#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_trans_priv.h" 34#include "xfs_trans_priv.h"
40#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
36#include "xfs_trace.h"
41 37
42#ifdef XFS_TRANS_DEBUG 38#ifdef XFS_TRANS_DEBUG
43STATIC void 39STATIC void
@@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug(
47#define xfs_trans_inode_broot_debug(ip) 43#define xfs_trans_inode_broot_debug(ip)
48#endif 44#endif
49 45
50
51/* 46/*
52 * Get an inode and join it to the transaction. 47 * Get an inode and join it to the transaction.
53 */ 48 */
@@ -62,78 +57,66 @@ xfs_trans_iget(
62{ 57{
63 int error; 58 int error;
64 59
65 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); 60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
66 if (!error && tp) 61 if (!error && tp) {
67 xfs_trans_ijoin(tp, *ipp, lock_flags); 62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
68 return error; 65 return error;
69} 66}
70 67
71/* 68/*
72 * Add the locked inode to the transaction. 69 * Add a locked inode to the transaction.
73 * The inode must be locked, and it cannot be associated with any 70 *
74 * transaction. The caller must specify the locks already held 71 * The inode must be locked, and it cannot be associated with any transaction.
75 * on the inode.
76 */ 72 */
77void 73void
78xfs_trans_ijoin( 74xfs_trans_ijoin(
79 xfs_trans_t *tp, 75 struct xfs_trans *tp,
80 xfs_inode_t *ip, 76 struct xfs_inode *ip)
81 uint lock_flags)
82{ 77{
83 xfs_inode_log_item_t *iip; 78 xfs_inode_log_item_t *iip;
84 79
85 ASSERT(ip->i_transp == NULL); 80 ASSERT(ip->i_transp == NULL);
86 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 81 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
87 ASSERT(lock_flags & XFS_ILOCK_EXCL);
88 if (ip->i_itemp == NULL) 82 if (ip->i_itemp == NULL)
89 xfs_inode_item_init(ip, ip->i_mount); 83 xfs_inode_item_init(ip, ip->i_mount);
90 iip = ip->i_itemp; 84 iip = ip->i_itemp;
91 ASSERT(iip->ili_flags == 0); 85 ASSERT(iip->ili_lock_flags == 0);
92 86
93 /* 87 /*
94 * Get a log_item_desc to point at the new item. 88 * Get a log_item_desc to point at the new item.
95 */ 89 */
96 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); 90 xfs_trans_add_item(tp, &iip->ili_item);
97 91
98 xfs_trans_inode_broot_debug(ip); 92 xfs_trans_inode_broot_debug(ip);
99 93
100 /* 94 /*
101 * If the IO lock is already held, mark that in the inode log item.
102 */
103 if (lock_flags & XFS_IOLOCK_EXCL) {
104 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
105 } else if (lock_flags & XFS_IOLOCK_SHARED) {
106 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
107 }
108
109 /*
110 * Initialize i_transp so we can find it with xfs_inode_incore() 95 * Initialize i_transp so we can find it with xfs_inode_incore()
111 * in xfs_trans_iget() above. 96 * in xfs_trans_iget() above.
112 */ 97 */
113 ip->i_transp = tp; 98 ip->i_transp = tp;
114} 99}
115 100
116
117
118/* 101/*
119 * Mark the inode as not needing to be unlocked when the inode item's 102 * Add a locked inode to the transaction.
120 * IOP_UNLOCK() routine is called. The inode must already be locked 103 *
121 * and associated with the given transaction. 104 *
105 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction.
122 */ 108 */
123/*ARGSUSED*/
124void 109void
125xfs_trans_ihold( 110xfs_trans_ijoin_ref(
126 xfs_trans_t *tp, 111 struct xfs_trans *tp,
127 xfs_inode_t *ip) 112 struct xfs_inode *ip,
113 uint lock_flags)
128{ 114{
129 ASSERT(ip->i_transp == tp); 115 xfs_trans_ijoin(tp, ip);
130 ASSERT(ip->i_itemp != NULL); 116 IHOLD(ip);
131 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 117 ip->i_itemp->ili_lock_flags = lock_flags;
132
133 ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
134} 118}
135 119
136
137/* 120/*
138 * This is called to mark the fields indicated in fieldmask as needing 121 * This is called to mark the fields indicated in fieldmask as needing
139 * to be logged when the transaction is committed. The inode must 122 * to be logged when the transaction is committed. The inode must
@@ -149,17 +132,12 @@ xfs_trans_log_inode(
149 xfs_inode_t *ip, 132 xfs_inode_t *ip,
150 uint flags) 133 uint flags)
151{ 134{
152 xfs_log_item_desc_t *lidp;
153
154 ASSERT(ip->i_transp == tp); 135 ASSERT(ip->i_transp == tp);
155 ASSERT(ip->i_itemp != NULL); 136 ASSERT(ip->i_itemp != NULL);
156 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 137 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
157 138
158 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
159 ASSERT(lidp != NULL);
160
161 tp->t_flags |= XFS_TRANS_DIRTY; 139 tp->t_flags |= XFS_TRANS_DIRTY;
162 lidp->lid_flags |= XFS_LID_DIRTY; 140 ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
163 141
164 /* 142 /*
165 * Always OR in the bits from the ili_last_fields field. 143 * Always OR in the bits from the ili_last_fields field.
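The xfs_trans_inode.c diff above splits inode joining into two calls: xfs_trans_ijoin() simply attaches the locked inode, while xfs_trans_ijoin_ref() also takes a reference and records which locks the commit path must drop, replacing both the old lock_flags argument and xfs_trans_ihold(). A hedged stand-in sketch (IHOLD() and the ili_lock_flags bookkeeping are modeled, not reproduced):

#include <assert.h>
#include <stdio.h>

#define ILOCK_EXCL 0x1	/* stand-in for XFS_ILOCK_EXCL */

struct inode_log_item { unsigned int ili_lock_flags; };
struct trans { int unused; };
struct inode {
	struct trans *i_transp;
	struct inode_log_item item;
	int refcount;
};

static void trans_ijoin(struct trans *tp, struct inode *ip)
{
	assert(ip->i_transp == NULL);	/* not already in a transaction */
	ip->i_transp = tp;
}

static void trans_ijoin_ref(struct trans *tp, struct inode *ip,
			    unsigned int lock_flags)
{
	trans_ijoin(tp, ip);
	ip->refcount++;				/* IHOLD() in the real code */
	ip->item.ili_lock_flags = lock_flags;	/* drop these at commit */
}

int main(void)
{
	struct trans tp;
	struct inode ip = { 0 };

	trans_ijoin_ref(&tp, &ip, ILOCK_EXCL);
	printf("refs %d lock flags %x\n", ip.refcount, ip.item.ili_lock_flags);
	return 0;
}

Callers that keep managing the inode lifetime themselves use plain xfs_trans_ijoin(), as the rewritten xfs_trans_iget() above does.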
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
deleted file mode 100644
index eb3fc57f9eef..000000000000
--- a/fs/xfs/xfs_trans_item.c
+++ /dev/null
@@ -1,549 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
33
34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
35 int, int, xfs_lsn_t);
36
37/*
38 * This is called to add the given log item to the transaction's
39 * list of log items. It must find a free log item descriptor
40 * or allocate a new one and add the item to that descriptor.
41 * The function returns a pointer to item descriptor used to point
42 * to the new item. The log item will now point to its new descriptor
43 * with its li_desc field.
44 */
45xfs_log_item_desc_t *
46xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
47{
48 xfs_log_item_desc_t *lidp;
49 xfs_log_item_chunk_t *licp;
50 int i=0;
51
52 /*
53 * If there are no free descriptors, allocate a new chunk
54 * of them and put it at the front of the chunk list.
55 */
56 if (tp->t_items_free == 0) {
57 licp = (xfs_log_item_chunk_t*)
58 kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
59 ASSERT(licp != NULL);
60 /*
61 * Initialize the chunk, and then
62 * claim the first slot in the newly allocated chunk.
63 */
64 xfs_lic_init(licp);
65 xfs_lic_claim(licp, 0);
66 licp->lic_unused = 1;
67 xfs_lic_init_slot(licp, 0);
68 lidp = xfs_lic_slot(licp, 0);
69
70 /*
71 * Link in the new chunk and update the free count.
72 */
73 licp->lic_next = tp->t_items.lic_next;
74 tp->t_items.lic_next = licp;
75 tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
76
77 /*
78 * Initialize the descriptor and the generic portion
79 * of the log item.
80 *
81 * Point the new slot at this item and return it.
82 * Also point the log item at its currently active
83 * descriptor and set the item's mount pointer.
84 */
85 lidp->lid_item = lip;
86 lidp->lid_flags = 0;
87 lidp->lid_size = 0;
88 lip->li_desc = lidp;
89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
91 return lidp;
92 }
93
94 /*
95 * Find the free descriptor. It is somewhere in the chunklist
96 * of descriptors.
97 */
98 licp = &tp->t_items;
99 while (licp != NULL) {
100 if (xfs_lic_vacancy(licp)) {
101 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
102 i = licp->lic_unused;
103 ASSERT(xfs_lic_isfree(licp, i));
104 break;
105 }
106 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
107 if (xfs_lic_isfree(licp, i))
108 break;
109 }
110 ASSERT(i <= XFS_LIC_MAX_SLOT);
111 break;
112 }
113 licp = licp->lic_next;
114 }
115 ASSERT(licp != NULL);
116 /*
117 * If we find a free descriptor, claim it,
118 * initialize it, and return it.
119 */
120 xfs_lic_claim(licp, i);
121 if (licp->lic_unused <= i) {
122 licp->lic_unused = i + 1;
123 xfs_lic_init_slot(licp, i);
124 }
125 lidp = xfs_lic_slot(licp, i);
126 tp->t_items_free--;
127 lidp->lid_item = lip;
128 lidp->lid_flags = 0;
129 lidp->lid_size = 0;
130 lip->li_desc = lidp;
131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
133 return lidp;
134}
135
136/*
137 * Free the given descriptor.
138 *
139 * This requires setting the bit in the chunk's free mask corresponding
140 * to the given slot.
141 */
142void
143xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
144{
145 uint slot;
146 xfs_log_item_chunk_t *licp;
147 xfs_log_item_chunk_t **licpp;
148
149 slot = xfs_lic_desc_to_slot(lidp);
150 licp = xfs_lic_desc_to_chunk(lidp);
151 xfs_lic_relse(licp, slot);
152 lidp->lid_item->li_desc = NULL;
153 tp->t_items_free++;
154
155 /*
156 * If there are no more used items in the chunk and this is not
157 * the chunk embedded in the transaction structure, then free
158 * the chunk. First pull it from the chunk list and then
159 * free it back to the heap. We didn't bother with a doubly
160 * linked list here because the lists should be very short
161 * and this is not a performance path. It's better to save
162 * the memory of the extra pointer.
163 *
164 * Also decrement the transaction structure's count of free items
165 * by the number in a chunk since we are freeing an empty chunk.
166 */
167 if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
168 licpp = &(tp->t_items.lic_next);
169 while (*licpp != licp) {
170 ASSERT(*licpp != NULL);
171 licpp = &((*licpp)->lic_next);
172 }
173 *licpp = licp->lic_next;
174 kmem_free(licp);
175 tp->t_items_free -= XFS_LIC_NUM_SLOTS;
176 }
177}
178
179/*
180 * This is called to find the descriptor corresponding to the given
181 * log item. It returns a pointer to the descriptor.
182 * The log item MUST have a corresponding descriptor in the given
183 * transaction. This routine does not return NULL, it panics.
184 *
185 * The descriptor pointer is kept in the log item's li_desc field.
186 * Just return it.
187 */
188/*ARGSUSED*/
189xfs_log_item_desc_t *
190xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
191{
192 ASSERT(lip->li_desc != NULL);
193
194 return lip->li_desc;
195}
196
197
198/*
199 * Return a pointer to the first descriptor in the chunk list.
200 * This does not return NULL if there are none, it panics.
201 *
202 * The first descriptor must be in either the first or second chunk.
203 * This is because the only chunk allowed to be empty is the first.
204 * All others are freed when they become empty.
205 *
206 * At some point this and xfs_trans_next_item() should be optimized
207 * to quickly look at the mask to determine if there is anything to
208 * look at.
209 */
210xfs_log_item_desc_t *
211xfs_trans_first_item(xfs_trans_t *tp)
212{
213 xfs_log_item_chunk_t *licp;
214 int i;
215
216 licp = &tp->t_items;
217 /*
218 * If it's not in the first chunk, skip to the second.
219 */
220 if (xfs_lic_are_all_free(licp)) {
221 licp = licp->lic_next;
222 }
223
224 /*
225 * Return the first non-free descriptor in the chunk.
226 */
227 ASSERT(!xfs_lic_are_all_free(licp));
228 for (i = 0; i < licp->lic_unused; i++) {
229 if (xfs_lic_isfree(licp, i)) {
230 continue;
231 }
232
233 return xfs_lic_slot(licp, i);
234 }
235 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
236 return NULL;
237}
238
239
240/*
241 * Given a descriptor, return the next descriptor in the chunk list.
242 * This returns NULL if there are no more used descriptors in the list.
243 *
244 * We do this by first locating the chunk in which the descriptor resides,
245 * and then scanning forward in the chunk and the list for the next
246 * used descriptor.
247 */
248/*ARGSUSED*/
249xfs_log_item_desc_t *
250xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
251{
252 xfs_log_item_chunk_t *licp;
253 int i;
254
255 licp = xfs_lic_desc_to_chunk(lidp);
256
257 /*
258 * First search the rest of the chunk. The for loop keeps us
259 * from referencing things beyond the end of the chunk.
260 */
261 for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
262 if (xfs_lic_isfree(licp, i)) {
263 continue;
264 }
265
266 return xfs_lic_slot(licp, i);
267 }
268
269 /*
270 * Now search the next chunk. It must be there, because the
271 * next chunk would have been freed if it were empty.
272 * If there is no next chunk, return NULL.
273 */
274 if (licp->lic_next == NULL) {
275 return NULL;
276 }
277
278 licp = licp->lic_next;
279 ASSERT(!xfs_lic_are_all_free(licp));
280 for (i = 0; i < licp->lic_unused; i++) {
281 if (xfs_lic_isfree(licp, i)) {
282 continue;
283 }
284
285 return xfs_lic_slot(licp, i);
286 }
287 ASSERT(0);
288 /* NOTREACHED */
289 return NULL; /* keep gcc quiet */
290}
291
292/*
293 * This is called to unlock all of the items of a transaction and to free
294 * all the descriptors of that transaction.
295 *
296 * It walks the list of descriptors and unlocks each item. It frees
297 * each chunk except that embedded in the transaction as it goes along.
298 */
299void
300xfs_trans_free_items(
301 xfs_trans_t *tp,
302 int flags)
303{
304 xfs_log_item_chunk_t *licp;
305 xfs_log_item_chunk_t *next_licp;
306 int abort;
307
308 abort = flags & XFS_TRANS_ABORT;
309 licp = &tp->t_items;
310 /*
311 * Special case the embedded chunk so we don't free it below.
312 */
313 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
315 xfs_lic_all_free(licp);
316 licp->lic_unused = 0;
317 }
318 licp = licp->lic_next;
319
320 /*
321 * Unlock each item in each chunk and free the chunks.
322 */
323 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
326 next_licp = licp->lic_next;
327 kmem_free(licp);
328 licp = next_licp;
329 }
330
331 /*
332 * Reset the transaction structure's free item count.
333 */
334 tp->t_items_free = XFS_LIC_NUM_SLOTS;
335 tp->t_items.lic_next = NULL;
336}
337
338
339
340/*
341 * This is called to unlock the items associated with a transaction.
342 * Items which were not logged should be freed.
343 * Those which were logged must still be tracked so they can be unpinned
344 * when the transaction commits.
345 */
346void
347xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
348{
349 xfs_log_item_chunk_t *licp;
350 xfs_log_item_chunk_t *next_licp;
351 xfs_log_item_chunk_t **licpp;
352 int freed;
353
354 freed = 0;
355 licp = &tp->t_items;
356
357 /*
358 * Special case the embedded chunk so we don't free.
359 */
360 if (!xfs_lic_are_all_free(licp)) {
361 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
362 }
363 licpp = &(tp->t_items.lic_next);
364 licp = licp->lic_next;
365
366 /*
367 * Unlock each item in each chunk, free non-dirty descriptors,
368 * and free empty chunks.
369 */
370 while (licp != NULL) {
371 ASSERT(!xfs_lic_are_all_free(licp));
372 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
373 next_licp = licp->lic_next;
374 if (xfs_lic_are_all_free(licp)) {
375 *licpp = next_licp;
376 kmem_free(licp);
377 freed -= XFS_LIC_NUM_SLOTS;
378 } else {
379 licpp = &(licp->lic_next);
380 }
381 ASSERT(*licpp == next_licp);
382 licp = next_licp;
383 }
384
385 /*
386 * Fix the free descriptor count in the transaction.
387 */
388 tp->t_items_free += freed;
389}
390
391/*
392 * Unlock each item pointed to by a descriptor in the given chunk.
393 * Stamp the commit lsn into each item if necessary.
394 * Free descriptors pointing to items which are not dirty if freeing_chunk
395 * is zero. If freeing_chunk is non-zero, then we need to unlock all
396 * items in the chunk.
397 *
398 * Return the number of descriptors freed.
399 */
400STATIC int
401xfs_trans_unlock_chunk(
402 xfs_log_item_chunk_t *licp,
403 int freeing_chunk,
404 int abort,
405 xfs_lsn_t commit_lsn)
406{
407 xfs_log_item_desc_t *lidp;
408 xfs_log_item_t *lip;
409 int i;
410 int freed;
411
412 freed = 0;
413 lidp = licp->lic_descs;
414 for (i = 0; i < licp->lic_unused; i++, lidp++) {
415 if (xfs_lic_isfree(licp, i)) {
416 continue;
417 }
418 lip = lidp->lid_item;
419 lip->li_desc = NULL;
420
421 if (commit_lsn != NULLCOMMITLSN)
422 IOP_COMMITTING(lip, commit_lsn);
423 if (abort)
424 lip->li_flags |= XFS_LI_ABORTED;
425 IOP_UNLOCK(lip);
426
427 /*
428 * Free the descriptor if the item is not dirty
429 * within this transaction and the caller is not
430 * going to just free the entire thing regardless.
431 */
432 if (!(freeing_chunk) &&
433 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
434 xfs_lic_relse(licp, i);
435 freed++;
436 }
437 }
438
439 return freed;
440}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
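The file deleted above implemented the chunk-and-bitmask slot scheme that the list heads replace: each chunk holds a fixed number of slots, a free-bit mask tracks occupancy, claiming slot i clears bit i, and a chunk is releasable once its mask equals the full free mask again. A simplified standalone sketch of that retired mechanism (31 slots per chunk, as XFS_LBC_NUM_SLOTS had):

#include <stdio.h>

#define NUM_SLOTS 31
#define FREEMASK  ((1u << NUM_SLOTS) - 1)

struct chunk { unsigned int free; };	/* bit set == slot is free */

static void chunk_init(struct chunk *c)    { c->free = FREEMASK; }
static void claim(struct chunk *c, int i)   { c->free &= ~(1u << i); }
static void release(struct chunk *c, int i) { c->free |= 1u << i; }
static int  is_free(struct chunk *c, int i) { return (c->free >> i) & 1; }
static int  all_free(struct chunk *c)       { return c->free == FREEMASK; }

int main(void)
{
	struct chunk c;

	chunk_init(&c);			/* XFS_LBC_INIT(cp) */
	claim(&c, 0);			/* XFS_LBC_CLAIM(cp, 0) */
	printf("slot 0 free? %d\n", is_free(&c, 0));
	release(&c, 0);
	printf("chunk empty? %d\n", all_free(&c));
	return 0;
}

All of the descriptor walking above (first_item/next_item, per-chunk unlock) existed only to navigate these bitmaps; with a list_head per transaction the whole file becomes redundant.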
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -23,25 +23,13 @@ struct xfs_log_item_desc;
23struct xfs_mount; 23struct xfs_mount;
24struct xfs_trans; 24struct xfs_trans;
25 25
26/* 26void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
27 * From xfs_trans_item.c 27void xfs_trans_del_item(struct xfs_log_item *);
28 */ 28void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
29struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, 29 int flags);
30 struct xfs_log_item *); 30void xfs_trans_item_committed(struct xfs_log_item *lip,
31void xfs_trans_free_item(struct xfs_trans *, 31 xfs_lsn_t commit_lsn, int aborted);
32 struct xfs_log_item_desc *); 32void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
33struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
34 struct xfs_log_item *);
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int);
39void xfs_trans_unlock_items(struct xfs_trans *,
40 xfs_lsn_t);
41void xfs_trans_free_busy(xfs_trans_t *tp);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
43 xfs_agnumber_t ag,
44 xfs_extlen_t idx);
45 33
46/* 34/*
47 * AIL traversal cursor. 35 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types:
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 4d88616bde91..b7d5769d2df0 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -25,18 +25,14 @@
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h" 27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
30#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
31#include "xfs_dir2_sf.h"
32#include "xfs_attr_sf.h"
33#include "xfs_dinode.h" 30#include "xfs_dinode.h"
34#include "xfs_inode.h" 31#include "xfs_inode.h"
35#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
36#include "xfs_bmap.h" 33#include "xfs_bmap.h"
37#include "xfs_error.h" 34#include "xfs_error.h"
38#include "xfs_quota.h" 35#include "xfs_quota.h"
39#include "xfs_rw.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_utils.h" 37#include "xfs_utils.h"
42 38
@@ -324,86 +320,3 @@ xfs_bumplink(
324 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 320 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
325 return 0; 321 return 0;
326} 322}
327
328/*
329 * Try to truncate the given file to 0 length. Currently called
330 * only out of xfs_remove when it has to truncate a file to free
331 * up space for the remove to proceed.
332 */
333int
334xfs_truncate_file(
335 xfs_mount_t *mp,
336 xfs_inode_t *ip)
337{
338 xfs_trans_t *tp;
339 int error;
340
341#ifdef QUOTADEBUG
342 /*
343 * This is called to truncate the quotainodes too.
344 */
345 if (XFS_IS_UQUOTA_ON(mp)) {
346 if (ip->i_ino != mp->m_sb.sb_uquotino)
347 ASSERT(ip->i_udquot);
348 }
349 if (XFS_IS_OQUOTA_ON(mp)) {
350 if (ip->i_ino != mp->m_sb.sb_gquotino)
351 ASSERT(ip->i_gdquot);
352 }
353#endif
354 /*
355 * Make the call to xfs_itruncate_start before starting the
356 * transaction, because we cannot make the call while we're
357 * in a transaction.
358 */
359 xfs_ilock(ip, XFS_IOLOCK_EXCL);
360 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
361 if (error) {
362 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
363 return error;
364 }
365
366 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
367 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
368 XFS_TRANS_PERM_LOG_RES,
369 XFS_ITRUNCATE_LOG_COUNT))) {
370 xfs_trans_cancel(tp, 0);
371 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
372 return error;
373 }
374
375 /*
376 * Follow the normal truncate locking protocol. Since we
377 * hold the inode in the transaction, we know that its number
378 * of references will stay constant.
379 */
380 xfs_ilock(ip, XFS_ILOCK_EXCL);
381 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
382 xfs_trans_ihold(tp, ip);
383 /*
384 * Signal a sync xaction. The only case where that isn't
385 * the case is if we're truncating an already unlinked file
386 * on a wsync fs. In that case, we know the blocks can't
387 * reappear in the file because the links to file are
388 * permanently toast. Currently, we're always going to
389 * want a sync transaction because this code is being
390 * called from places where nlink is guaranteed to be 1
391 * but I'm leaving the tests in to protect against future
392 * changes -- rcc.
393 */
394 error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
395 XFS_DATA_FORK,
396 ((ip->i_d.di_nlink != 0 ||
397 !(mp->m_flags & XFS_MOUNT_WSYNC))
398 ? 1 : 0));
399 if (error) {
400 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
401 XFS_TRANS_ABORT);
402 } else {
403 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
404 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
405 }
406 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
407
408 return error;
409}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index ef321225d269..f55b9678264f 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_UTILS_H__ 18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
22extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
23 xfs_dev_t, cred_t *, prid_t, int, 22 xfs_dev_t, cred_t *, prid_t, int,
24 xfs_inode_t **, int *); 23 xfs_inode_t **, int *);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -26,19 +26,14 @@
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h" 28#include "xfs_dir2.h"
29#include "xfs_dmapi.h"
30#include "xfs_mount.h" 29#include "xfs_mount.h"
31#include "xfs_da_btree.h" 30#include "xfs_da_btree.h"
32#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
35#include "xfs_dir2_sf.h"
36#include "xfs_attr_sf.h"
37#include "xfs_dinode.h" 33#include "xfs_dinode.h"
38#include "xfs_inode.h" 34#include "xfs_inode.h"
39#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
40#include "xfs_itable.h" 36#include "xfs_itable.h"
41#include "xfs_btree.h"
42#include "xfs_ialloc.h" 37#include "xfs_ialloc.h"
43#include "xfs_alloc.h" 38#include "xfs_alloc.h"
44#include "xfs_bmap.h" 39#include "xfs_bmap.h"
@@ -73,7 +68,7 @@ xfs_setattr(
73 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 68 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
74 int need_iolock = 1; 69 int need_iolock = 1;
75 70
76 xfs_itrace_entry(ip); 71 trace_xfs_setattr(ip);
77 72
78 if (mp->m_flags & XFS_MOUNT_RDONLY) 73 if (mp->m_flags & XFS_MOUNT_RDONLY)
79 return XFS_ERROR(EROFS); 74 return XFS_ERROR(EROFS);
@@ -143,16 +138,6 @@ xfs_setattr(
143 goto error_return; 138 goto error_return;
144 } 139 }
145 } else { 140 } else {
146 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
147 !(flags & XFS_ATTR_DMI)) {
148 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
149 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
150 iattr->ia_size, 0, dmflags, NULL);
151 if (code) {
152 lock_flags = 0;
153 goto error_return;
154 }
155 }
156 if (need_iolock) 141 if (need_iolock)
157 lock_flags |= XFS_IOLOCK_EXCL; 142 lock_flags |= XFS_IOLOCK_EXCL;
158 } 143 }
@@ -236,8 +221,11 @@ xfs_setattr(
 			 * transaction to modify the i_size.
 			 */
 			code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+			if (code)
+				goto error_return;
 		}
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		lock_flags &= ~XFS_ILOCK_EXCL;
 
 		/*
 		 * We are going to log the inode size change in this
@@ -251,40 +239,38 @@ xfs_setattr(
 		 * really care about here and prevents waiting for other data
 		 * not within the range we care about here.
 		 */
-		if (!code &&
-		    ip->i_size != ip->i_d.di_size &&
+		if (ip->i_size != ip->i_d.di_size &&
 		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
 					ip->i_d.di_size, iattr->ia_size,
 					XBF_ASYNC, FI_NONE);
+			if (code)
+				goto error_return;
 		}
 
 		/* wait for all I/O to complete */
 		xfs_ioend_wait(ip);
 
-		if (!code)
-			code = xfs_itruncate_data(ip, iattr->ia_size);
-		if (code) {
-			ASSERT(tp == NULL);
-			lock_flags &= ~XFS_ILOCK_EXCL;
-			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+		code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+					    xfs_get_blocks);
+		if (code)
 			goto error_return;
-		}
+
 		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-		if ((code = xfs_trans_reserve(tp, 0,
-					      XFS_ITRUNCATE_LOG_RES(mp), 0,
-					      XFS_TRANS_PERM_LOG_RES,
-					      XFS_ITRUNCATE_LOG_COUNT))) {
-			xfs_trans_cancel(tp, 0);
-			if (need_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-			return code;
-		}
+		code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+					 XFS_TRANS_PERM_LOG_RES,
+					 XFS_ITRUNCATE_LOG_COUNT);
+		if (code)
+			goto error_return;
+
+		truncate_setsize(inode, iattr->ia_size);
+
 		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+		lock_flags |= XFS_ILOCK_EXCL;
+
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Only change the c/mtime if we are changing the size
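
Two generic VFS helpers replace the XFS-private xfs_itruncate_data() in the hunk above: block_truncate_page() zeroes the partial block at the new EOF through the filesystem's xfs_get_blocks callback, and truncate_setsize() updates i_size and trims the page cache. Note the leading minus sign: block_truncate_page() returns a negative errno, while this file works with positive XFS error codes. Condensed from the added lines (locking and the error_return label are as in the hunk):

	code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
				    xfs_get_blocks);	/* zero the partial EOF block */
	if (code)
		goto error_return;
	/* ... transaction allocation and reservation as above ... */
	truncate_setsize(inode, iattr->ia_size);	/* new i_size, page cache trimmed */
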
@@ -334,8 +320,7 @@ xfs_setattr(
 			xfs_iflags_set(ip, XFS_ITRUNCATED);
 		}
 	} else if (tp) {
-		xfs_trans_ijoin(tp, ip, lock_flags);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
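
The two-line pattern replaced here recurs through the rest of this file: the three-argument xfs_trans_ijoin(tp, ip, lock_flags) followed by xfs_trans_ihold(tp, ip) collapses into a single two-argument xfs_trans_ijoin(tp, ip). The call-site migration, side by side:

	/* before: join the locked inode, then hold it across the commit */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	/* after: one call; the transaction keeps the inode until commit/cancel */
	xfs_trans_ijoin(tp, ip);
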
@@ -470,17 +455,10 @@ xfs_setattr(
 		return XFS_ERROR(code);
 	}
 
-	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-	    !(flags & XFS_ATTR_DMI)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, NULL, NULL,
-					0, 0, AT_DELAY_FLAG(flags));
-	}
 	return 0;
 
  abort_return:
 	commit_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
  error_return:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
@@ -516,7 +494,7 @@ xfs_readlink_bmap(
 	int error = 0;
 
 	error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
-			mval, &nmaps, NULL, NULL);
+			mval, &nmaps, NULL);
 	if (error)
 		goto out;
 
@@ -557,7 +535,7 @@ xfs_readlink(
 	int pathlen;
 	int error = 0;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_readlink(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -613,14 +591,14 @@ xfs_free_eofblocks(
 	 */
 	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
 	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-	map_len = last_fsb - end_fsb;
-	if (map_len <= 0)
+	if (last_fsb <= end_fsb)
 		return 0;
+	map_len = last_fsb - end_fsb;
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			NULL, 0, &imap, &nimaps, NULL, NULL);
+			NULL, 0, &imap, &nimaps, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
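
Every xfs_bmapi()/xfs_bunmapi() call in this file also loses one trailing argument, the extent-delta out-parameter that these call sites always passed as NULL (dropped upstream as unused delta tracking). The change is mechanical, for example in the lookup above:

	/* before: trailing delta argument, always NULL here */
	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	/* after: the delta parameter is gone from the prototype */
	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
			  NULL, 0, &imap, &nimaps, NULL);
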
@@ -675,10 +653,7 @@ xfs_free_eofblocks(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip,
-				XFS_IOLOCK_EXCL |
-				XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		error = xfs_itruncate_finish(&tp, ip,
 					     ip->i_size,
@@ -750,8 +725,7 @@ xfs_inactive_symlink_rmt(
 	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	size = (int)ip->i_d.di_size;
 	ip->i_d.di_size = 0;
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Find the block(s) so we can inval and unmap them.
@@ -761,7 +735,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list, NULL)))
+			&free_list)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -776,7 +750,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, NULL, &done)))
+			&first_block, &free_list, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -795,8 +769,7 @@ xfs_inactive_symlink_rmt(
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
 	 */
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	/*
 	 * Get a new, empty transaction to return to our caller.
@@ -929,8 +902,7 @@ xfs_inactive_attrs(
 		goto error_cancel;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
 	ASSERT(ip->i_d.di_anextents == 0);
@@ -1035,8 +1007,6 @@ xfs_inactive(
 	int error;
 	int truncate;
 
-	xfs_itrace_entry(ip);
-
 	/*
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
@@ -1060,9 +1030,6 @@ xfs_inactive(
 
 	mp = ip->i_mount;
 
-	if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
-		XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
-
 	error = 0;
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1120,8 +1087,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * normally, we have to run xfs_itruncate_finish sync.
@@ -1154,8 +1120,7 @@ xfs_inactive(
 			return VN_INACTIVE_CACHE;
 		}
 
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	} else {
 		error = xfs_trans_reserve(tp, 0,
 					  XFS_IFREE_LOG_RES(mp),
@@ -1168,8 +1133,7 @@ xfs_inactive(
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 	}
 
 	/*
@@ -1257,7 +1221,7 @@ xfs_lookup(
 	int error;
 	uint lock_mode;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_lookup(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
@@ -1269,7 +1233,7 @@ xfs_lookup(
 	if (error)
 		goto out;
 
-	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
+	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 	if (error)
 		goto out_free_name;
 
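
xfs_iget() likewise drops its final parameter, apparently the unused on-disk block hint that this call site passed as 0:

	/* before */
	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
	/* after */
	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
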
@@ -1309,21 +1273,11 @@ xfs_create(
 	uint log_res;
 	uint log_count;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_create(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-				dp, DM_RIGHT_NULL, NULL,
-				DM_RIGHT_NULL, name->name, NULL,
-				mode, 0, 0);
-
-		if (error)
-			return error;
-	}
-
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
 	else
@@ -1427,8 +1381,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1487,16 +1440,7 @@ xfs_create(
 	xfs_qm_dqrele(gdqp);
 
 	*ipp = ip;
-
-	/* Fallthrough to std_return with error = 0 */
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
-			ip, DM_RIGHT_NULL, name->name, NULL, mode,
-			error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
@@ -1510,8 +1454,8 @@ xfs_create(
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 
  out_abort_rele:
 	/*
@@ -1726,20 +1670,11 @@ xfs_remove(
 	uint resblks;
 	uint log_count;
 
-	xfs_itrace_entry(dp);
-	xfs_itrace_entry(ip);
+	trace_xfs_remove(dp, name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, name->name, NULL,
-					ip->i_d.di_mode, 0, 0);
-		if (error)
-			return error;
-	}
-
 	error = xfs_qm_dqattach(dp, 0);
 	if (error)
 		goto std_return;
@@ -1782,15 +1717,8 @@ xfs_remove(
 
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	/*
-	 * At this point, we've gotten both the directory and the entry
-	 * inodes locked.
-	 */
-	IHOLD(ip);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we're removing a directory perform some additional validation.
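
Where the old code took an extra reference with IHOLD() before joining an inode (because, per the comment deleted in xfs_link() below, commit and cancel unlock the joined inodes and drop the associated reference), the new code uses xfs_trans_ijoin_ref(), which joins the locked inode and takes that reference in one call:

	/* before: explicit reference, then join */
	IHOLD(ip);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/* after: join with an internally held reference;
	 * commit/cancel release both the lock and the reference */
	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
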
@@ -1877,21 +1805,15 @@ xfs_remove(
 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL, name->name, NULL,
-				ip->i_d.di_mode, error, 0);
-	}
-
-	return error;
+	return 0;
 
  out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -1909,25 +1831,13 @@ xfs_link(
 	int committed;
 	int resblks;
 
-	xfs_itrace_entry(tdp);
-	xfs_itrace_entry(sip);
+	trace_xfs_link(tdp, target_name);
 
 	ASSERT(!S_ISDIR(sip->i_d.di_mode));
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
-					tdp, DM_RIGHT_NULL,
-					sip, DM_RIGHT_NULL,
-					target_name->name, NULL, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	error = xfs_qm_dqattach(sip, 0);
 	if (error)
 		goto std_return;
@@ -1953,15 +1863,8 @@ xfs_link(
 
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
-	/*
-	 * Increment vnode ref counts since xfs_trans_commit &
-	 * xfs_trans_cancel will both unlock the inodes and
-	 * decrement the associated ref counts.
-	 */
-	IHOLD(sip);
-	IHOLD(tdp);
-	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If the source has too many links, we can't make any more to it.
@@ -2014,27 +1917,14 @@ xfs_link(
 		goto abort_return;
 	}
 
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error)
-		goto std_return;
-
-	/* Fall through to std_return with error = 0. */
-std_return:
-	if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
-				tdp, DM_RIGHT_NULL,
-				sip, DM_RIGHT_NULL,
-				target_name->name, NULL, 0, error, 0);
-	}
-	return error;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
  abort_return:
 	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
  error_return:
 	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2074,7 +1964,7 @@ xfs_symlink(
 	ip = NULL;
 	tp = NULL;
 
-	xfs_itrace_entry(dp);
+	trace_xfs_symlink(dp, link_name);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2086,17 +1976,6 @@ xfs_symlink(
 	if (pathlen >= MAXPATHLEN)      /* total string too long */
 		return XFS_ERROR(ENAMETOOLONG);
 
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
-					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name,
-					(unsigned char *)target_path, 0, 0, 0);
-		if (error)
-			return error;
-	}
-
-	/* Return through std_return after this point. */
-
 	udqp = gdqp = NULL;
 	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 		prid = dp->i_d.di_projid;
@@ -2180,8 +2059,7 @@ xfs_symlink(
 	 * transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
 	/*
@@ -2215,7 +2093,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error1;
 		}
@@ -2278,21 +2156,8 @@ xfs_symlink(
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
-	/* Fall through to std_return with error = 0 or errno from
-	 * xfs_trans_commit */
-std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
-				dp, DM_RIGHT_NULL,
-				error ? NULL : ip,
-				DM_RIGHT_NULL, link_name->name,
-				(unsigned char *)target_path,
-				0, error, 0);
-	}
-
-	if (!error)
-		*ipp = ip;
-	return error;
+	*ipp = ip;
+	return 0;
 
  error2:
 	IRELE(ip);
@@ -2306,8 +2171,8 @@ std_return:
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-	goto std_return;
+ std_return:
+	return error;
 }
 
 int
@@ -2333,13 +2198,12 @@ xfs_set_dmattrs(
 		return error;
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 
 	ip->i_d.di_dmevmask = evmask;
 	ip->i_d.di_dmstate = state;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	IHOLD(ip);
 	error = xfs_trans_commit(tp, 0);
 
 	return error;
@@ -2390,7 +2254,7 @@ xfs_alloc_file_space(
 	int committed;
 	int error;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_alloc_file_space(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2412,25 +2276,9 @@ xfs_alloc_file_space(
 	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
-	/* Generate a DMAPI event if needed. */
-	if (alloc_type != 0 && offset < ip->i_size &&
-	    (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		xfs_off_t end_dmi_offset;
-
-		end_dmi_offset = offset+len;
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
-				      end_dmi_offset - offset, 0, NULL);
-		if (error)
-			return error;
-	}
-
 	/*
 	 * Allocate file space until done or until there is an error
 	 */
-retry:
 	while (allocatesize_fsb && !error) {
 		xfs_fileoff_t s, e;
 
@@ -2451,15 +2299,22 @@ retry:
 			e = allocatesize_fsb;
 		}
 
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
 		if (unlikely(rt)) {
-			resrtextents = qblocks = (uint)(e - s);
+			resrtextents = qblocks = resblks;
 			resrtextents /= mp->m_sb.sb_rextsize;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 			quota_flag = XFS_QMOPT_RES_RTBLKS;
 		} else {
 			resrtextents = 0;
-			resblks = qblocks = \
-				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
 			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
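
The block comment added above explains the clamp: the transaction reservation carries a 32-bit block count, while e - s is a 64-bit xfs_fileoff_t, so each loop iteration caps its request at nimaps extents of at most MAXEXTLEN blocks apiece (MAXEXTLEN being the 21-bit on-disk extent-length limit, i.e. 2^21 - 1 = 2097151 blocks). A worked reading of the added line, assuming a single map (nimaps == 1):

	/*
	 * resblks = min(e - s, MAXEXTLEN * 1) <= 2097151,
	 * which always fits in the 32-bit reservation used below.
	 */
	resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
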
@@ -2488,8 +2343,7 @@ retry:
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
@@ -2498,7 +2352,7 @@ retry:
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list, NULL);
+				  &free_list);
 		if (error) {
 			goto error0;
 		}
@@ -2527,17 +2381,6 @@ retry:
 		startoffset_fsb += allocated_fsb;
 		allocatesize_fsb -= allocated_fsb;
 	}
-dmapi_enospc_check:
-	if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
-				ip, DM_RIGHT_NULL,
-				ip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
-		if (error == 0)
-			goto retry;	/* Maybe DMAPI app. has made space */
-		/* else fall through with error from XFS_SEND_DATA */
-	}
 
 	return error;
 
@@ -2548,7 +2391,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
 error1:	/* Just cancel transaction */
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto dmapi_enospc_check;
+	return error;
 }
 
 /*
@@ -2598,7 +2441,7 @@ xfs_zero_remaining_bytes(
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
-			NULL, 0, &imap, &nimap, NULL, NULL);
+			NULL, 0, &imap, &nimap, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -2661,7 +2504,6 @@ xfs_free_file_space(
 {
 	int committed;
 	int done;
-	xfs_off_t end_dmi_offset;
 	xfs_fileoff_t endoffset_fsb;
 	int error;
 	xfs_fsblock_t firstfsb;
@@ -2680,7 +2522,7 @@ xfs_free_file_space(
 
 	mp = ip->i_mount;
 
-	xfs_itrace_entry(ip);
+	trace_xfs_free_file_space(ip);
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
@@ -2691,19 +2533,7 @@ xfs_free_file_space(
 		return error;
 	rt = XFS_IS_REALTIME_INODE(ip);
 	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-	end_dmi_offset = offset + len;
-	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
-
-	if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
-	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
-		if (end_dmi_offset > ip->i_size)
-			end_dmi_offset = ip->i_size;
-		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
-				offset, end_dmi_offset - offset,
-				AT_DELAY_FLAG(attr_flags), NULL);
-		if (error)
-			return error;
-	}
+	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
 	if (attr_flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
@@ -2731,7 +2561,7 @@ xfs_free_file_space(
 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, startoffset_fsb,
-				1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+				1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2746,7 +2576,7 @@ xfs_free_file_space(
 		}
 		nimap = 1;
 		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
-				1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
+				1, 0, NULL, 0, &imap, &nimap, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -2814,8 +2644,7 @@ xfs_free_file_space(
 		if (error)
 			goto error1;
 
-		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(tp, ip);
+		xfs_trans_ijoin(tp, ip);
 
 		/*
 		 * issue the bunmapi() call to free the blocks
@@ -2823,7 +2652,7 @@ xfs_free_file_space(
 		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, NULL, &done);
+				  0, 2, &firstfsb, &free_list, &done);
 		if (error) {
 			goto error0;
 		}
@@ -2883,8 +2712,6 @@ xfs_change_file_space(
 	xfs_trans_t *tp;
 	struct iattr iattr;
 
-	xfs_itrace_entry(ip);
-
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
 
@@ -2985,8 +2812,7 @@ xfs_change_file_space(
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
+	xfs_trans_ijoin(tp, ip);
 
 	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;